annotate mirgene_functions.py @ 3:106c4aea4650 draft

Uploaded
author glogobyte
date Fri, 16 Oct 2020 18:54:12 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
3
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
1 import itertools
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
2 import time
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
3 import sys
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
4 import os
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
5 import urllib.request
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
6 import gzip
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
7 from multiprocessing import Process, Queue, Lock, Pool, Manager, Value
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
8 import subprocess
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
9 import argparse
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
10 from collections import OrderedDict
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
11 from matplotlib.backends.backend_pdf import PdfPages
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
12 import pandas as pd
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
13 from math import pi
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
14 import numpy as np
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
15 import matplotlib.pyplot as plt
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
16 from matplotlib.ticker import PercentFormatter
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
17 import seaborn as sns
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
18 import scipy.stats as stats
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
19 from plotnine import *
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
20 import math
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
21 import re
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
22 import matplotlib.ticker as mtick
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
23 import copy
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
24
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
25
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
26
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
27
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
28 """---------------------- Simple Functions -----------------------"""
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
29
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
30 # Read a file and return it as a list
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
def read(path, flag):
    """Read a text file and return its content as a list of lines.

    Parameters
    ----------
    path : str
        Path of the file to read.
    flag : int
        0 -> every line keeps its line terminator (``readlines``).
        1 -> line terminators are stripped (``read().splitlines()``).

    Returns
    -------
    list of str
        The lines of the file, or ``None`` when ``flag`` is neither 0 nor 1
        (the file is not opened in that case; kept for backward
        compatibility).
    """
    if flag not in (0, 1):
        return None

    # The ``with`` statement closes the file, no explicit close() is needed.
    with open(path) as fp:
        if flag == 0:
            return fp.readlines()
        return fp.read().splitlines()
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
43
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
44 # Write a list to a txt file
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
def write(path, list):
    """Write the inner items of every record to a text file.

    For each element ``x`` of ``list`` the slice ``x[1:-1]`` (first and last
    item dropped) is joined with tabs and written to ``path``. Nothing is
    written between two records: no separator and no newline.

    Parameters
    ----------
    path : str
        Destination file; it is created or truncated.
    list : iterable
        Records to write. Each record must be sliceable and its inner items
        must be strings. (The parameter name shadows the builtin but is kept
        because it is part of the public signature.)
    """
    # The ``with`` statement closes the file, no explicit close() is needed.
    with open(path, 'w') as fp:
        for x in list:
            fp.write("\t".join(x[1:-1]))
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
50
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
51 """---------------------- RNA-seq Functions ----------------------"""
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
52
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
53 # Detect the longest common substring sequence between two mirnas
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
def longestSubstring(str1, str2):
    """Return the longest contiguous block shared by ``str1`` and ``str2``.

    When several blocks have the maximal length, the one starting earliest in
    ``str1`` is returned (``difflib`` behaviour).

    Parameters
    ----------
    str1, str2 : str
        Sequences to compare.

    Returns
    -------
    str or None
        The common substring, taken from ``str1``. When the two strings have
        nothing in common a message is printed and ``None`` is returned.
    """
    from difflib import SequenceMatcher

    # autojunk must be disabled: with the default heuristic, as soon as str2
    # holds 200+ characters every character covering more than 1% of it is
    # ignored, which is the case for all letters of a nucleotide alphabet and
    # makes real matches disappear.
    seqMatch = SequenceMatcher(None, str1, str2, autojunk=False)

    # the result looks like Match(a=0, b=0, size=5)
    match = seqMatch.find_longest_match(0, len(str1), 0, len(str2))

    if match.size != 0:
        return str1[match.a: match.a + match.size]

    print('No longest common sub-string found')
    return None
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
70
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
71
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
72
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
73 ########################################################################################################################################################
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
def collapse_sam(path):
    """Collapse a SAM file to one record per (reference, sequence) pair.

    Lines whose first field contains "@" are treated as header lines and are
    returned unchanged (split on tabs) at the beginning of the result. The
    remaining records are grouped by reference name (field 2) and read
    sequence (field 9), in order of first appearance. For every group the
    last record of the group is kept and modified in place:

    * field 0 becomes "<group index>-<number of records in the group>"
      (group index starts at 1);
    * field 10 is replaced by a fixed placeholder string.

    Parameters
    ----------
    path : str
        Path of the SAM file.

    Returns
    -------
    list of list of str
        Header records followed by the collapsed records.
    """
    ini_sam = read(path, 0)

    intro_sam = []
    main_sam = []
    for line in ini_sam:
        fields = line.rstrip("\n").split("\t")
        if "@" in fields[0]:
            intro_sam.append(fields)
        else:
            main_sam.append(fields)

    # One pass instead of rescanning the whole file for every unique pair:
    # key -> [number of records, last record seen]
    groups = OrderedDict()
    for row in main_sam:
        key = (row[2], row[9])
        if key in groups:
            entry = groups[key]
            entry[0] += 1
            entry[1] = row
        else:
            groups[key] = [1, row]

    new_main_sam = []
    for incr_num, (count, last_row) in enumerate(groups.values(), start=1):
        last_row[10] = "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
        last_row[0] = str(incr_num) + "-" + str(count)
        new_main_sam.append(last_row)

    return intro_sam + new_main_sam
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
102
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
103 #################################################################################################################################################################
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
104
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
def sam(mature_mirnas,path,name,con,l,samples,data,names,unmap_seq,samples_mirna_names,deseq,LHE_names,umi,ini_sample,unmap_counts):
    """Process one collapsed SAM file and publish its templated isomiRs.

    Records with FLAG "0" and a read length of 18 to 26 are kept. A record is
    matched to ``mature_mirnas[i]`` when its reference name (field 2) equals
    that entry; ``mature_mirnas[i + 1]`` is then used as the reference
    sequence. The longest block shared by reference and read is located and
    the start/end offsets of the read relative to the reference are appended
    to the reference name as "_<start>_<end>" (non-zero offsets are signed).
    Records where reference and read both have leftover bases on the same
    side of the shared block are discarded.

    Read counts are parsed from field 0, which must look like
    "<id>-<count>" (the format written by ``collapse_sam``).

    Parameters
    ----------
    mature_mirnas : list of str
        Flat list in which a reference name is followed by its sequence.
    path : str
        SAM file to read.
    name : str
        Sample name, appended to ``names`` and stored in ``data``.
    con : str
        Group tag; results are published only for "c" or "t"
        (presumably control / treated - TODO confirm with the caller).
    l : lock
        Object with ``acquire()`` / ``release()`` guarding the shared
        containers below.
    samples, data, names, samples_mirna_names, deseq, LHE_names, ini_sample :
        Shared list-like containers that receive the results.
    unmap_seq, unmap_counts :
        Shared counters exposing ``.value``; incremented with the number of
        FLAG "4" records and with their summed read counts.
    umi :
        Not used by this function.

    Returns
    -------
    None
    """
    # read the sam file
    ini_sam = read(path, 0)
    new_main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]]
    unique_seq = [x for x in new_main_sam if x[1] == '0' and 18 <= len(x[9]) <= 26]

    # Calculate the shifted positions for every isomir and add them to the name of it
    sorted_uni_arms = []

    for i in range(len(mature_mirnas)):
        tmp_count_reads = 0   # total number of reads of this reference
        tmp_count_seq = 0     # total number of sequences of this reference

        for j, row in enumerate(unique_seq):
            if mature_mirnas[i] != row[2]:
                continue

            temp_mature = mature_mirnas[i + 1]
            off_part = longestSubstring(temp_mature, row[9])

            mat_parts = temp_mature.split(off_part)
            mat_diff = [len(mat_parts[0]), len(mat_parts[1])]

            read_parts = row[9].split(off_part)
            unique_diff = [len(read_parts[0]), len(read_parts[1])]

            # Problem with hsa-miR-8485: leftovers on the same side of the
            # shared block in both reference and read -> mark for removal.
            if (mat_diff[1] != 0 and unique_diff[1] != 0) or (mat_diff[0] != 0 and unique_diff[0] != 0):
                unique_seq[j] = 1
                continue

            pre_pos = mat_diff[0] - unique_diff[0]
            post_pos = unique_diff[1] - mat_diff[1]
            tmp_count_reads += int(row[0].split("-")[1])
            tmp_count_seq += 1

            # Only shifted reads are renamed; a zero offset is written unsigned.
            if pre_pos != 0 or post_pos != 0:
                if pre_pos == 0:
                    row[2] = row[2] + "_" + str(pre_pos) + "_" + '{:+d}'.format(post_pos)
                elif post_pos == 0:
                    row[2] = row[2] + "_" + '{:+d}'.format(pre_pos) + "_" + str(post_pos)
                else:
                    row[2] = row[2] + "_" + '{:+d}'.format(pre_pos) + "_" + '{:+d}'.format(post_pos)

        # Drop the records marked with 1 above
        unique_seq = [row for row in unique_seq if row != 1]

        if tmp_count_reads != 0 and tmp_count_seq != 0:
            sorted_uni_arms.append([mature_mirnas[i], tmp_count_seq, tmp_count_reads])

    # Store name of arms, number of sequences and number of reads
    sorted_uni_arms = sorted(sorted_uni_arms, key=lambda x: x[1], reverse=True)

    # Recount per arm on the final names (first two "_"-separated tokens)
    for arm in sorted_uni_arms:
        counts = 0
        seqs = 0
        for x in unique_seq:
            tokens = x[2].split("_")
            if arm[0] == tokens[0] + "_" + tokens[1]:
                counts += int(x[0].split("-")[1])
                seqs += 1
        arm[1] = seqs
        arm[2] = counts

    l.acquire()
    try:
        # try/finally: the lock must be released even if publishing fails,
        # otherwise the other workers would block forever.
        if con in ("c", "t"):
            LHE = [z[2] for z in unique_seq]
            for y in unique_seq:
                samples_mirna_names.append([y[2], y[9]])
            deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in unique_seq])
            LHE_names.extend(LHE)
            unmap_seq.value += sum([1 for x in new_main_sam if x[1] == '4'])
            unmap_counts.value += sum([int(x[0].split("-")[1]) for x in new_main_sam if x[1] == '4'])
            names.append(name)
            samples.append(unique_seq)
            data.append([con, name, unique_seq, sorted_uni_arms])
            ini_sample.append(new_main_sam)
    finally:
        l.release()
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
206
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
207
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
208 ######################################################################################################################################
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
209 """
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
210 Read a SAM file produced by Bowtie and do the following:
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
211
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
212 1) Remove reverse stranded mapped reads
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
213 2) Remove unmapped reads
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
214 3) Remove all sequences with fewer than 11 reads
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
215 4) Sort the arms by number of sequences, in decreasing order
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
216 5) Sort the sequences of every arm with the most reads in decreasing rate
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
217 6) Calculate total number of sequences of every arm
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
218 7) Calculate total number of reads of sequences of every arm.
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
219 8) Store all the information in a txt file
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
220
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
221 """
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
222
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
def non_sam(mature_mirnas,path,name,con,l,data,names,n_deseq,n_samples_mirna_names,n_LHE_names):
    """Search the unmapped reads of a SAM file for reference sequences with extra 3' bases.

    Records with FLAG "4" and a read length of 18 to 26 are examined. The
    entries at odd indexes of ``mature_mirnas`` are used as reference
    sequences (stripped, "U" replaced by "T"); the entry just before each one
    is used as its name. A read is reported when it contains the reference
    sequence exactly once and starts with it. The reported copy of the record
    gets a new field 2:

        "<name>__0_<+n>_<extra bases>"

    where ``n`` is the number of bases following the reference in the read.
    Duplicate reported records are removed, keeping first occurrences.

    Read counts are parsed from field 0, which must look like "<id>-<count>".

    Parameters
    ----------
    mature_mirnas : list of str
        Flat list in which a reference name is followed by its sequence.
    path : str
        SAM file to read.
    name : str
        Sample name, appended to ``names`` and stored in ``data``.
    con : str
        Group tag; results are published only for "c" or "t"
        (presumably control / treated - TODO confirm with the caller).
    l : lock
        Object with ``acquire()`` / ``release()`` guarding the shared
        containers below.
    data, names, n_deseq, n_samples_mirna_names, n_LHE_names :
        Shared list-like containers that receive the results.

    Returns
    -------
    None
    """
    ini_sam = read(path, 0)
    new_main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]]
    unique_seq = [x for x in new_main_sam if x[1] == '4' and 18 <= len(x[9]) <= 26]
    uni_seq = []

    # Calculate the shifted positions for every isomir and add them to the name of it
    sorted_uni_arms = []
    for i in range(1, len(mature_mirnas), 2):
        tmp_count_reads = 0   # total number of reads of this reference
        tmp_count_seq = 0     # total number of sequences of this reference

        # Does not depend on the read, so it is computed once per reference
        temp_mature = mature_mirnas[i].strip().replace("U", "T")

        for row in unique_seq:
            if temp_mature not in row[9]:
                continue

            off_part = longestSubstring(temp_mature, row[9])

            mat_parts = temp_mature.split(off_part)
            mat_diff = [len(mat_parts[0]), len(mat_parts[1])]

            read_parts = row[9].split(off_part)
            # More than two parts: the shared block occurs several times in
            # the read, the record is ignored.
            if len(read_parts) > 2:
                continue
            unique_diff = [len(read_parts[0]), len(read_parts[1])]

            pre_pos = mat_diff[0] - unique_diff[0]
            post_pos = unique_diff[1] - mat_diff[1]

            # NOTE(review): the original tested ``pre_pos == 0`` twice in a
            # row; both tests are assumed to belong to this branch - confirm
            # against the original indentation.
            if pre_pos == 0:
                tmp_count_reads += int(row[0].split("-")[1])
                tmp_count_seq += 1

                t_name = copy.deepcopy(row)
                t_name[2] = mature_mirnas[i - 1] + "__" + str(pre_pos) + "_" + '{:+d}'.format(post_pos) + "_" + str(row[9][len(off_part):])
                uni_seq.append(t_name)

        if tmp_count_reads != 0 and tmp_count_seq != 0:
            sorted_uni_arms.append([mature_mirnas[i - 1], tmp_count_seq, tmp_count_reads])

    sorted_uni_arms = sorted(sorted_uni_arms, key=lambda x: x[1], reverse=True)
    # Order-preserving removal of duplicated records
    unique_seq = list(map(list, OrderedDict.fromkeys(map(tuple, uni_seq))))

    l.acquire()
    try:
        # try/finally: the lock must be released even if publishing fails,
        # otherwise the other workers would block forever.
        if con in ("c", "t"):
            kept = [x for x in unique_seq if x[2] != "*"]
            LHE = [x[2] for x in kept]
            for x in kept:
                n_samples_mirna_names.append([x[2], x[9]])
            n_deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in kept])
            n_LHE_names.extend(LHE)
            names.append(name)
            data.append([con, name, unique_seq, sorted_uni_arms])
    finally:
        l.release()
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
297
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
298 #################################################################################################################################################################################################################
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
299
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
def deseq2_temp(samples_mirna_names,deseq,con,l):
    """Build a count table with one row per name and one count column per sample.

    Every sample list in ``deseq`` is padded in place with a "0" count for
    each name of ``samples_mirna_names`` it does not contain, then sorted by
    name. The table rows are ``[name, sequence, count_1, ..., count_n]``,
    taken from the first sample and completed with the counts found under the
    same name in every sample.

    Parameters
    ----------
    samples_mirna_names : list of [name, sequence]
        All names seen across samples; sorted in place by name.
    deseq : list of list of [name, count, sequence]
        One list per sample; modified in place (padded and sorted). Must hold
        at least one sample.
    con : str
        "c" sends the table to ``q1``, "t" sends it to ``q2``; any other
        value sends nothing.
    l : lock
        Object with ``acquire()`` / ``release()`` guarding the queues.

    Returns
    -------
    None

    Notes
    -----
    NOTE(review): ``q1`` and ``q2`` are not parameters and must exist as
    module-level names when this function runs - confirm where they are
    defined.
    """
    # The original key ``lambda x:[0]`` was a constant, so nothing was sorted.
    samples_mirna_names.sort(key=lambda row: row[0])

    # Pad every sample with the names it is missing (count "0")
    for sample in deseq:
        present = {row[0] for row in sample}
        for entry in samples_mirna_names:
            if entry[0] not in present:
                sample.append([entry[0], "0", entry[1]])
                present.add(entry[0])

    for sample in deseq:
        sample.sort(key=lambda row: row[0])

    deseq_final = [[row[0], row[2]] for row in deseq[0]]

    # name -> counts, per sample, so that each row is completed by lookup
    # instead of rescanning every sample for every row.
    counts_per_sample = []
    for sample in deseq:
        lookup = {}
        for row in sample:
            lookup.setdefault(row[0], []).append(row[1])
        counts_per_sample.append(lookup)

    for final_row in deseq_final:
        for lookup in counts_per_sample:
            final_row.extend(lookup.get(final_row[0], ()))

    l.acquire()
    try:
        # try/finally: the lock must be released even if the queue fails
        if con == "c":
            q1.put(deseq_final)

        if con == "t":
            q2.put(deseq_final)
    finally:
        l.release()
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
326
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
327
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
328 ##################################################################################################################################################################################################################
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
329
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
330 def main_temp(LH2E, LH2E_names, LH8E, LH8E_names,flag,names_con,names_tre,filter_LH8E,filter_LH2E,raw_LH8E,raw_LH2E,per,count):
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
331
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
332 LH8E_add_names = [x for x in LH2E_names if x not in LH8E_names]
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
333 LH2E_add_names = [x for x in LH8E_names if x not in LH2E_names]
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
334
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
335 LH8E_add_names.sort()
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
336 LH2E_add_names.sort()
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
337 LH8E_add_names = list(LH8E_add_names for LH8E_add_names,_ in itertools.groupby(LH8E_add_names))
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
338 LH2E_add_names = list(LH2E_add_names for LH2E_add_names,_ in itertools.groupby(LH2E_add_names))
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
339
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
340 LH2E.sort()
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
341 LH8E.sort()
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
342 LH2E = list(LH2E for LH2E,_ in itertools.groupby(LH2E))
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
343 LH8E = list(LH8E for LH8E,_ in itertools.groupby(LH8E))
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
344
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
345 print("LHE_names")
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
346 print([len(LH8E_add_names),len(LH2E_add_names)])
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
347 print([len(LH8E),len(LH2E)])
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
348
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
349 zeros=["0"]*(len(LH8E[0])-2)
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
350 [LH8E_add_names[i].extend(zeros) for i,_ in enumerate(LH8E_add_names)]
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
351 LH8E=LH8E+LH8E_add_names
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
352
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
353 zeros=["0"]*(len(LH2E[0])-2)
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
354 [LH2E_add_names[i].extend(zeros) for i,_ in enumerate(LH2E_add_names)]
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
355 LH2E=LH2E+LH2E_add_names
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
356
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
357 dupes=[]
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
358 final_LH2E =[]
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
359
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
360 for num,_ in enumerate(LH2E):
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
361
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
362 if LH2E[num][1] not in final_LH2E and LH2E[num][0] not in final_LH2E:
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
363 final_LH2E.append(LH2E[num][1])
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
364 final_LH2E.append(LH2E[num][0])
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
365 else:
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
366 dupes.append(LH2E[num][1])
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
367
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
368 dupes=list(set(dupes))
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
369
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
370 dupes=[[x] for x in dupes]
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
371
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
372 for x in LH2E:
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
373 for y in dupes:
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
374 if x[1]==y[0]:
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
375 fl=0
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
376 if len(y)==1:
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
377 y.append(x[0])
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
378 else:
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
379 for i in range(1,len(y)):
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
380 if y[i].split("_")[0]+"_"+y[i].split("_")[1]==x[0].split("_")[0]+"_"+x[0].split("_")[1]:
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
381 fl=1
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
382 if len(x[0])<len(y[i]):
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
383 del y[i]
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
384 y.append(x[0])
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
385 break
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
386
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
387 if fl==0:
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
388 y.append((x[0]))
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
389
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
390 for y in dupes:
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
391 if len(y)>2:
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
392 for i in range(len(y)-1,1,-1):
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
393 y[1]=y[1]+"/"+y[i]
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
394 del y[i]
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
395
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
396
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
397 for x in LH2E:
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
398 for y in dupes:
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
399 if x[1]==y[0]:
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
400 x[0]=y[1]
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
401
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
402 for x in LH8E:
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
403 for y in dupes:
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
404 if x[1]==y[0]:
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
405 x[0]=y[1]
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
406
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
407
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
408
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
409 LH2E.sort()
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
410 LH2E=list(LH2E for LH2E,_ in itertools.groupby(LH2E))
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
411
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
412 LH8E.sort()
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
413 LH8E=list(LH8E for LH8E,_ in itertools.groupby(LH8E))
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
414
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
415 if int(per)!=-1:
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
416 percent=int(per)/100
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
417 print(percent)
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
418 print(count)
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
419
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
420 c_col_filter=round(percent*(len(LH2E[1])-2))
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
421 t_col_filter=round(percent*(len(LH8E[1])-2))
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
422
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
423 for i, _ in enumerate(LH2E):
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
424 c_cols=0
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
425 t_cols=0
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
426
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
427 c_cols=sum([1 for j in range(len(LH2E[i])-2) if int(LH2E[i][j+2])>=int(count)])
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
428 t_cols=sum([1 for j in range(len(LH8E[i])-2) if int(LH8E[i][j+2])>=int(count)])
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
429
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
430 if c_cols>=c_col_filter or t_cols>=t_col_filter:
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
431 filter_LH8E.append(LH8E[i])
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
432 filter_LH2E.append(LH2E[i])
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
433
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
434 raw_LH2E.extend(LH2E)
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
435 raw_LH8E.extend(LH8E)
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
436
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
437 ##################################################################################################################################################################################################################
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
438
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
def _write_count_table(path, sample_names, rows):
    """Write one tab-separated count table to *path*.

    The header line is ``Name<TAB>Sequence`` followed by one column title per
    entry of *sample_names*.  Each entry of *rows* (a list of strings) is then
    written on its own line.  No newline is written after the last row.

    The file is opened in a ``with`` block so the handle is closed even when
    a write fails part-way through.
    """
    with open(path, 'w') as fp:
        fp.write("Name\t")
        fp.write("Sequence")
        for sample in sample_names:
            fp.write("\t" + sample)
        for row in rows:
            fp.write("\n%s" % "\t".join(row))


def write_main(raw_LH2E, raw_LH8E, fil_LH2E, fil_LH8E, names_con, names_tre, flag, per, n1, n2):
    """Write the raw (and, when filtering is enabled, filtered) count tables.

    All files are created inside the ``Counts/`` directory, which must already
    exist relative to the current working directory.

    Parameters
    ----------
    raw_LH2E, fil_LH2E : list of list of str
        Raw / filtered rows written under the *n1* label with the
        ``names_con`` column titles.
    raw_LH8E, fil_LH8E : list of list of str
        Raw / filtered rows written under the *n2* label with the
        ``names_tre`` column titles.
    names_con, names_tre : list of str
        Column titles appended after ``Name`` and ``Sequence``.
    flag : int
        ``1`` writes the "Templated" tables, ``2`` the "Non-Templated" ones;
        any other value writes nothing.
    per : int or str
        Filter setting; the "Filtered" tables are written only when
        ``int(per) != -1``.
    n1, n2 : str
        Group labels embedded in the file names.

    Raises
    ------
    ValueError
        If *flag* is 1 or 2 and *per* cannot be converted by ``int()``.
    OSError
        If a file cannot be created (for example ``Counts/`` is missing).
    """
    if flag == 1:
        kind = 'Templated'
    elif flag == 2:
        kind = 'Non-Templated'
    else:
        # Unknown flags are silently ignored (and *per* is not even parsed).
        return

    # The n2 table is written before the n1 table, and filtered before raw,
    # so the overwrite order is preserved if the labels coincide.
    if int(per) != -1:
        _write_count_table('Counts/Filtered ' + n2 + ' ' + kind + ' Counts', names_tre, fil_LH8E)
        _write_count_table('Counts/Filtered ' + n1 + ' ' + kind + ' Counts', names_con, fil_LH2E)

    _write_count_table('Counts/Raw ' + n2 + ' ' + kind + ' Counts', names_tre, raw_LH8E)
    _write_count_table('Counts/Raw ' + n1 + ' ' + kind + ' Counts', names_con, raw_LH2E)
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
528
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
529 ####################################################################################################################################################################################################################
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
530
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
def ssamples(names,samp,folder,pro):
    """Write one two-column text file per sample column of *samp*.

    Columns 0 and 1 of every row are treated as identifier columns; each
    further column ``i`` is written to ``folder + names[i-2] + '.txt'`` as a
    header line ``miRNA id<TAB><sample name>`` followed by one
    ``row[0]<TAB>row[i]`` line per row.

    Parameters
    ----------
    names : list of str
        Sample names, one per count column (``names[0]`` labels column 2).
    samp : list of list of str
        Count table; the width of the first row decides how many files are
        written.
    folder : str
        Prefix concatenated directly in front of the file name, so it must
        already end with a path separator.
    pro
        Unused; kept so existing callers keep working.

    An empty *samp* writes nothing (previously it raised ``IndexError``).
    """
    if not samp:
        return

    for col in range(2, len(samp[0])):
        sample_name = names[col - 2]
        # ``with`` guarantees the handle is closed even if a row is malformed.
        with open(folder + sample_name + '.txt', 'w') as fp:
            fp.write("miRNA id" + "\t" + sample_name + "\n")
            for row in samp:
                fp.write("%s" % "\t".join([row[0], row[col]]) + "\n")
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
541
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
542 ####################################################################################################################################################################################################################
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
543
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
def _db_write_group(fp, arm, rows):
    """Write one banner-delimited group of isomiR rows to the open file *fp*.

    *arm* is ``[name, sequence count, total reads]``; *rows* are the
    ``unique_seq`` entries belonging to it.  Rows are listed by descending
    read count, taken from the part of ``row[0]`` after the first ``-``.
    """
    rule = "*********************************************************************************************************"
    ordered = sorted(rows, key=lambda row: int(row[0].split('-')[1]), reverse=True)
    fp.write(rule + "\n")
    fp.write("%-8s\t%-22s\t%-25s\t%-30s\t%s\n" % ("|",str(arm[0]),"Sequence count = "+str(arm[1]),"Total reads = "+str(arm[2]),"|"))
    fp.write(rule + "\n\n")
    for row in ordered:
        fp.write("%-8s\t%-40s\t%s\n" % (row[0].split("-")[1], row[2], row[9]))
    fp.write("\n" + "\n")


def DB_write(con,name,unique_seq,sorted_uni_arms,f):
    """Write a per-arm text report of the sequences in *unique_seq*.

    Parameters
    ----------
    con : str
        ``"c"`` or ``"t"``; together with *f* selects the output directory
        (``split1/``, ``split2/`` for ``f == 1``; ``split3/``, ``split4/``
        for ``f == 2``).  The directory must already exist.
    name : str
        File name created inside the selected directory.
    unique_seq : list of list
        Rows where index 0 looks like ``"<id>-<reads>"``, index 2 is the
        isomiR name and index 9 the sequence.
    sorted_uni_arms : list of list
        ``[arm name, sequence count, total reads]`` entries; one group is
        written per entry, in the given order.
    f : int
        ``1``: a row belongs to an arm when the arm name is a *substring* of
        the first two ``_``-separated fields of the row name; a banner is
        written even for arms without rows.
        ``2``: a row belongs to an arm when the arm name equals the part of
        the row name before the first ``__``; arms without rows are skipped.
        Any other value does nothing.

    Raises
    ------
    ValueError
        If *f* is 1 or 2 and *con* is neither ``"c"`` nor ``"t"``
        (previously this surfaced as an ``UnboundLocalError``).
    """
    if f == 1:
        folder_c, folder_t = 'split1/', 'split2/'
    elif f == 2:
        folder_c, folder_t = 'split3/', 'split4/'
    else:
        return

    if con == "c":
        folder = folder_c
    elif con == "t":
        folder = folder_t
    else:
        raise ValueError("DB_write: con must be 'c' or 't', got %r" % (con,))

    # Write a txt file with all the information
    with open(folder + name, 'w') as fp:
        fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence"))

        for arm in sorted_uni_arms:
            if f == 1:
                # Substring test against "<field0>_<field1>" of the row name.
                rows = [
                    row for row in unique_seq
                    if arm[0] in (row[2].split("_")[0] + "_" + row[2].split("_")[1])
                ]
            else:
                rows = [row for row in unique_seq if arm[0] == row[2].split("__")[0]]
                if not rows:
                    continue
            _db_write_group(fp, arm, rows)
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
594
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
595
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
596 ##########################################################################################################################
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
597
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
def new_mat_seq(pre_unique_seq,mat_mirnas,l):
    """Collect sequences that out-count a three-field entry they contain.

    For every row ``x`` whose name (index 2) has exactly three
    ``_``-separated fields, every row ``y`` is selected when ``x``'s name is
    a substring of ``y``'s name and ``y`` has strictly more reads (the integer
    after the first ``-`` in index 0).  A selected row gets ``">"`` prefixed
    to its name **in place** and is recorded once.  Finally the name and the
    sequence (index 9) of each recorded row are appended to *mat_mirnas*
    while holding the lock *l*.

    Parameters
    ----------
    pre_unique_seq : list of list
        Rows to scan; the name field of selected rows is mutated.
    mat_mirnas : list
        Output list, extended with alternating name / sequence entries.
    l
        Any lock object exposing ``acquire()`` and ``release()``.
    """
    unique_iso = []
    for x in pre_unique_seq:
        if len(x[2].split("_")) != 3:
            continue
        for y in pre_unique_seq:
            if x[2] in y[2] and int(x[0].split("-")[1]) < int(y[0].split("-")[1]):
                # ``y[2] in picked`` is list membership on an already
                # recorded row: it is true when some recorded row has a
                # field equal to y's current name, i.e. y was taken already.
                if not any(y[2] in picked for picked in unique_iso):
                    y[2] = ">" + y[2]
                    unique_iso.append(y)

    l.acquire()
    try:
        for row in unique_iso:
            mat_mirnas.append(row[2])
            mat_mirnas.append(row[9])
    finally:
        # Always release, otherwise one malformed row would leave the lock
        # held and block every other worker.
        l.release()
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
613
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
614 #########################################################################################################################
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
615
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
def _merge_arm_key(name):
    """Return the first two ``_``-separated fields of *name* joined by ``_``."""
    fields = name.split("_")
    return fields[0] + "_" + fields[1]


def merging_names(LH2E_copy,new):
    """Give rows that share a sequence one merged ``a/b/...`` name.

    Each row is ``[name, sequence, ...]``.  A sequence is treated as
    duplicated when a row's sequence *or* name was already seen on an earlier
    row.  For every duplicated sequence the distinct names are collected (two
    names with the same first two ``_``-separated fields count as one, the
    shorter spelling winning) and joined with ``/``.  All rows carrying that
    sequence are renamed, the table is sorted, identical consecutive rows are
    collapsed, and the result is appended to *new*.

    Side effects: *LH2E_copy* is sorted in place and its rows are renamed in
    place; only the de-duplicated list goes to *new*.

    Parameters
    ----------
    LH2E_copy : list of list of str
        Table to normalise (mutated).
    new : list
        Output list; extended with the de-duplicated rows.

    Raises
    ------
    IndexError
        If two names of one duplicated sequence have to be compared and one
        of them contains no ``_``.
    """
    # Pass 1: find duplicated sequences.  Names and sequences share a single
    # "seen" pool; a set gives O(1) lookups instead of scanning a list.
    seen = set()
    names_by_seq = {}
    for row in LH2E_copy:
        if row[1] not in seen and row[0] not in seen:
            seen.add(row[1])
            seen.add(row[0])
        else:
            names_by_seq[row[1]] = []

    # Pass 2: collect the distinct names of every duplicated sequence.
    for row in LH2E_copy:
        names = names_by_seq.get(row[1])
        if names is None:
            continue
        candidate = row[0]
        for pos, kept in enumerate(names):
            if _merge_arm_key(kept) == _merge_arm_key(candidate):
                # Same arm already recorded: keep the shorter spelling,
                # moving it to the end of the list as the original did.
                if len(candidate) < len(kept):
                    del names[pos]
                    names.append(candidate)
                break
        else:
            names.append(candidate)

    # Merge order is first name, then the remaining names last-to-first.
    merged = {
        seq: "/".join(names[:1] + names[:0:-1])
        for seq, names in names_by_seq.items()
    }

    # Pass 3: rename every row that carries a duplicated sequence.
    for row in LH2E_copy:
        if row[1] in merged:
            row[0] = merged[row[1]]

    LH2E_copy.sort()
    # A real list (not a generator) so *new* may be a multiprocessing proxy.
    new.extend([row for row, _ in itertools.groupby(LH2E_copy)])
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
667
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
668 ######################################################################################################################################################
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
669
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
def ssamples1(tem_names,tem_samp,non_names,non_samp,folder,pro):
    """Write one text file per sample from two count tables.

    For every count column ``i`` (index 2 onwards) of *tem_samp* a file
    ``folder + tem_names[i-2] + '.txt'`` is created containing a header line
    ``miRNA id<TAB><sample name>``, then ``row[0]<TAB>row[i]`` for every row
    of *tem_samp*.  After that, for every position ``j`` where
    ``non_names[j]`` equals the sample name, ``row[0]<TAB>row[j+2]`` is
    written for every row of *non_samp*.

    Parameters
    ----------
    tem_names, non_names : list of str
        Sample names of the count columns of the respective table.
    tem_samp, non_samp : list of list of str
        Count tables whose first two columns are identifiers.
    folder : str
        Prefix concatenated directly in front of the file name, so it must
        already end with a path separator.
    pro
        Unused; kept so existing callers keep working.

    An empty *tem_samp* writes nothing (previously it raised ``IndexError``).
    """
    if not tem_samp:
        return

    for col in range(2, len(tem_samp[0])):
        sample_name = tem_names[col - 2]
        # ``with`` guarantees the handle is closed even if a row is malformed.
        with open(folder + sample_name + '.txt', 'w') as fp:
            fp.write("miRNA id" + "\t" + sample_name + "\n")

            for row in tem_samp:
                fp.write("%s" % "\t".join([row[0], row[col]]) + "\n")

            # Append the matching column(s) of the second table.
            for j, other_name in enumerate(non_names):
                if other_name == sample_name:
                    for row in non_samp:
                        fp.write("%s" % "\t".join([row[0], row[j + 2]]) + "\n")
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
685
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
686 #################################################################################################################################################################################################################
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
687
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
def _fetch_fasta_lines(url):
    """Download *url* and return its lines as a flat ``[name, seq, ...]`` list.

    The body is decoded as UTF-8 and split on ``"\\n"``; every ``>`` is
    removed, the empty element produced by a trailing newline is dropped, and
    ``U`` is replaced by ``T`` on every second line (the sequence lines,
    assuming the usual header/sequence alternation).
    """
    # Close the HTTP response deterministically instead of leaking it.
    with urllib.request.urlopen(url) as response:
        text = response.read().decode('utf-8')

    lines = [line.replace(">", "") for line in text.split("\n")]
    # Drop only a genuinely empty trailing element, so that a body without a
    # final newline does not lose its last sequence.
    if lines and lines[-1] == "":
        del lines[-1]

    # Converting per download keeps the second file aligned even if the
    # first one has an odd number of lines.
    for i in range(1, len(lines), 2):
        lines[i] = lines[i].replace("U", "T")
    return lines


def download_matures(matures,org_name):
    """Fetch mature and star sequences for *org_name* from MirGeneDB.

    Two FASTA documents are downloaded from
    ``http://mirgenedb.org/fasta/<org_name>?mat=1`` and ``...?star=1``.
    Their records are appended to *matures* as a flat list alternating name
    (without ``>``) and sequence (``U`` replaced by ``T``), mature records
    first.

    Parameters
    ----------
    matures : list
        Output list, extended in place.  Nothing is appended unless both
        downloads succeed.
    org_name : str
        Organism identifier inserted verbatim into the URL.

    Raises
    ------
    urllib.error.URLError
        If either download fails.
    UnicodeDecodeError
        If a response body is not valid UTF-8.
    """
    base_url = 'http://mirgenedb.org/fasta/' + org_name

    mature_mir = _fetch_fasta_lines(base_url + '?mat=1')
    star_mir = _fetch_fasta_lines(base_url + '?star=1')

    mature_mir.extend(star_mir)
    matures.extend(mature_mir)
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
713
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
714 ###################################################################################################################
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
715
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
def non_template_ref(sc,st,all_isoforms):
    """Collect the names and sequences of all isoforms seen in any sample.

    *sc* and *st* are iterables of per-sample row lists.  A row contributes
    when its name (index 2) has more than two ``_``-separated fields and is
    not yet present in *all_isoforms*; its name and its sequence (index 9) are
    then appended, in that order.  *sc* is processed before *st*.

    Because names and sequences live in the same output list, a name that
    equals a previously stored sequence is also treated as already present.

    Parameters
    ----------
    sc, st : iterable of list of list
        Per-sample row lists; both are copied with ``list()`` up front.
    all_isoforms : list of str
        Output list, extended in place; existing entries are honoured.
    """
    groups = (list(sc), list(st))

    # Mirror of *all_isoforms* for O(1) membership tests.
    # NOTE(review): assumes nothing else appends to all_isoforms while this
    # function runs - confirm if the list is shared between processes.
    seen = set(all_isoforms)

    for samples in groups:
        for sample_rows in samples:
            for row in sample_rows:
                name = row[2]
                if name in seen or len(name.split("_")) <= 2:
                    continue
                all_isoforms.append(name)
                seen.add(name)
                sequence = row[9]
                all_isoforms.append(sequence)
                seen.add(sequence)
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
732
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
733 ################################################################################################################################################################################################
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
734
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
def deseqe2(sample,mir_names,l,new_d,sample_name,sample_order):
    """Pad a sample with zero-count rows for missing names and publish it.

    For every entry in ``mir_names`` whose first field does not yet occur as
    the first field of a row in ``sample``, the row
    ``[entry[0], "0", entry[1]]`` is appended to ``sample``.  ``sample`` is
    then sorted in place by first field, consecutive identical rows are
    collapsed, and the resulting list is appended to ``new_d`` while
    ``sample_name`` is appended to ``sample_order``.

    Parameters:
        sample: mutable list of indexable rows; it is extended and sorted
            in place.
        mir_names: iterable of indexable entries; ``entry[0]`` is matched
            against ``row[0]`` and ``entry[1]`` is copied into padding rows.
            (assumes first fields are hashable, e.g. strings -- confirm)
        l: lock-like object supporting the ``with`` statement; held while
            both output containers are updated so the two appends stay paired.
        new_d: container with ``append`` that receives the de-duplicated rows.
        sample_name: value appended to ``sample_order``.
        sample_order: container with ``append`` that receives ``sample_name``.

    Returns:
        None.
    """
    # Remember which names are present so each lookup is O(1) instead of a
    # rescan of the (growing) sample list for every entry in mir_names.
    present = {row[0] for row in sample}
    for entry in mir_names:
        name = entry[0]
        if name not in present:
            sample.append([name, "0", entry[1]])
            present.add(name)

    sample.sort(key=lambda row: row[0])
    # groupby only merges *adjacent* equal rows, so this removes consecutive
    # exact duplicates after the sort.
    unique_rows = [row for row, _ in itertools.groupby(sample)]

    # ``with`` guarantees the lock is released even if an append raises;
    # a bare acquire()/release() pair would leave it held forever.
    with l:
        new_d.append(unique_rows)
        sample_order.append(sample_name)
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
753
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
754 ###############################################################################################################################################################################################
106c4aea4650 Uploaded
glogobyte
parents:
diff changeset
755