Mercurial > repos > glogobyte > isoread
comparison mirgene_functions.py @ 3:106c4aea4650 draft
Uploaded
| author | glogobyte |
|---|---|
| date | Fri, 16 Oct 2020 18:54:12 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 2:f0c4c70ceb00 | 3:106c4aea4650 |
|---|---|
| 1 import itertools | |
| 2 import time | |
| 3 import sys | |
| 4 import os | |
| 5 import urllib.request | |
| 6 import gzip | |
| 7 from multiprocessing import Process, Queue, Lock, Pool, Manager, Value | |
| 8 import subprocess | |
| 9 import argparse | |
| 10 from collections import OrderedDict | |
| 11 from matplotlib.backends.backend_pdf import PdfPages | |
| 12 import pandas as pd | |
| 13 from math import pi | |
| 14 import numpy as np | |
| 15 import matplotlib.pyplot as plt | |
| 16 from matplotlib.ticker import PercentFormatter | |
| 17 import seaborn as sns | |
| 18 import scipy.stats as stats | |
| 19 from plotnine import * | |
| 20 import math | |
| 21 import re | |
| 22 import matplotlib.ticker as mtick | |
| 23 import copy | |
| 24 | |
| 25 | |
| 26 | |
| 27 | |
| 28 """---------------------- Simple Functions -----------------------""" | |
| 29 | |
| 30 # Read a file and return it as a list | |
| 31 def read(path, flag): | |
| 32 if flag == 0: | |
| 33 with open(path) as fp: | |
| 34 file=fp.readlines() | |
| 35 fp.close() | |
| 36 return file | |
| 37 | |
| 38 if flag == 1: | |
| 39 with open(path) as fp: | |
| 40 file = fp.read().splitlines() | |
| 41 fp.close() | |
| 42 return file | |
| 43 | |
| 44 # Write a list to a txt file | |
| 45 def write(path, list): | |
| 46 with open(path,'w') as fp: | |
| 47 for x in list: | |
| 48 fp.write(str("\t".join(x[1:-1]))) | |
| 49 fp.close() | |
| 50 | |
| 51 """---------------------- RNA-seq Functions ----------------------""" | |
| 52 | |
| 53 # Detect the longest common substring sequence between two mirnas | |
| 54 def longestSubstring(str1, str2): | |
| 55 | |
| 56 from difflib import SequenceMatcher | |
| 57 # initialize SequenceMatcher object with | |
| 58 # input string | |
| 59 seqMatch = SequenceMatcher(None, str1, str2) | |
| 60 | |
| 61 # find match of longest sub-string | |
| 62 # output will be like Match(a=0, b=0, size=5) | |
| 63 match = seqMatch.find_longest_match(0, len(str1), 0, len(str2)) | |
| 64 | |
| 65 # print longest substring | |
| 66 if (match.size != 0): | |
| 67 return str1[match.a: match.a + match.size] | |
| 68 else: | |
| 69 print('No longest common sub-string found') | |
| 70 | |
| 71 | |
| 72 | |
| 73 ######################################################################################################################################################## | |
| 74 def collapse_sam(path): | |
| 75 | |
| 76 ini_sam=read(path,0) | |
| 77 main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]] | |
| 78 intro_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" in x.split("\t")[0]] | |
| 79 | |
| 80 uni_seq = [] | |
| 81 for x in main_sam: | |
| 82 | |
| 83 if [x[2], x[9]] not in uni_seq: | |
| 84 uni_seq.append([x[2], x[9]]) | |
| 85 | |
| 86 new_main_sam=[] | |
| 87 incr_num=0 | |
| 88 for i in range(len(uni_seq)): | |
| 89 count=0 | |
| 90 incr_num+=1 | |
| 91 for y in main_sam: | |
| 92 if uni_seq[i][1]==y[9] and uni_seq[i][0]==y[2]: | |
| 93 count+=1 | |
| 94 temp=y | |
| 95 temp[10]="~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" | |
| 96 temp[0]=str(incr_num)+"-"+str(count) | |
| 97 new_main_sam.append(temp) | |
| 98 | |
| 99 new_sam=intro_sam+new_main_sam | |
| 100 | |
| 101 return new_sam | |
| 102 | |
| 103 ################################################################################################################################################################# | |
| 104 | |
| 105 def sam(mature_mirnas,path,name,con,l,samples,data,names,unmap_seq,samples_mirna_names,deseq,LHE_names,umi,ini_sample,unmap_counts): | |
| 106 | |
| 107 # read the sam file | |
| 108 | |
| 109 ini_sam=read(path,0) | |
| 110 new_main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]] | |
| 111 unique_seq = [x for x in new_main_sam if x[1] == '0' and len(x[9])>=18 and len(x[9])<=26] | |
| 112 | |
| 113 # Calculate the shifted positions for every isomir and add them to the name of it | |
| 114 sorted_uni_arms = [] | |
| 115 | |
| 116 for i in range(len(mature_mirnas)): | |
| 117 tmp_count_reads = 0 # calculate the total number of reads | |
| 118 tmp_count_seq = 0 # calculate the total number of sequences | |
| 119 for j in range(len(unique_seq)): | |
| 120 | |
| 121 if mature_mirnas[i] == unique_seq[j][2]: | |
| 122 | |
| 123 temp_mature = mature_mirnas[i+1] | |
| 124 off_part = longestSubstring(temp_mature, unique_seq[j][9]) | |
| 125 | |
| 126 mat_diff = temp_mature.split(off_part) | |
| 127 mat_diff = [len(mat_diff[0]), len(mat_diff[1])] | |
| 128 | |
| 129 unique_diff = unique_seq[j][9].split(off_part) | |
| 130 unique_diff = [len(unique_diff[0]), len(unique_diff[1])] | |
| 131 | |
| 132 # Problem with hsa-miR-8485 | |
| 133 if mat_diff[1]!=0 and unique_diff[1]!=0: | |
| 134 unique_seq[j]=1 | |
| 135 pre_pos = 0 | |
| 136 post_pos = 0 | |
| 137 | |
| 138 elif mat_diff[0]!=0 and unique_diff[0]!=0: | |
| 139 unique_seq[j]=1 | |
| 140 pre_pos = 0 | |
| 141 post_pos = 0 | |
| 142 | |
| 143 else: | |
| 144 pre_pos = mat_diff[0]-unique_diff[0] | |
| 145 post_pos = unique_diff[1]-mat_diff[1] | |
| 146 tmp_count_reads = tmp_count_reads + int(unique_seq[j][0].split("-")[1]) | |
| 147 tmp_count_seq = tmp_count_seq+1 | |
| 148 | |
| 149 | |
| 150 if pre_pos != 0 or post_pos != 0: | |
| 151 if pre_pos == 0: | |
| 152 unique_seq[j][2] = unique_seq[j][2] + "_" +str(pre_pos) + "_" + '{:+d}'.format(post_pos) | |
| 153 elif post_pos == 0: | |
| 154 unique_seq[j][2] = unique_seq[j][2] + "_" + '{:+d}'.format(pre_pos) + "_" + str(post_pos) | |
| 155 else: | |
| 156 unique_seq[j][2] = unique_seq[j][2]+"_"+'{:+d}'.format(pre_pos)+"_"+'{:+d}'.format(post_pos) | |
| 157 | |
| 158 for x in range(unique_seq.count(1)): | |
| 159 unique_seq.remove(1) | |
| 160 if tmp_count_reads != 0 and tmp_count_seq != 0: | |
| 161 sorted_uni_arms.append([mature_mirnas[i], tmp_count_seq, tmp_count_reads]) | |
| 162 | |
| 163 # Store name of arms, number of sequences and number of reads | |
| 164 sorted_uni_arms = sorted(sorted_uni_arms, key=lambda x: x[1], reverse=True) | |
| 165 | |
| 166 for y in sorted_uni_arms: | |
| 167 counts=0 | |
| 168 seqs=0 | |
| 169 for x in unique_seq: | |
| 170 if y[0]==x[2].split("_")[0]+"_"+x[2].split("_")[1]: | |
| 171 counts+=int(x[0].split("-")[1]) | |
| 172 seqs+=1 | |
| 173 | |
| 174 y[1]=seqs | |
| 175 y[2]=counts | |
| 176 | |
| 177 LHE=[] | |
| 178 | |
| 179 l.acquire() | |
| 180 if con=="c": | |
| 181 LHE.extend(z[2] for z in unique_seq) | |
| 182 for y in unique_seq: | |
| 183 samples_mirna_names.append([y[2],y[9]]) | |
| 184 deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in unique_seq]) | |
| 185 LHE_names.extend(LHE) | |
| 186 unmap_seq.value += sum([1 for x in new_main_sam if x[1] == '4']) | |
| 187 unmap_counts.value += sum([int(x[0].split("-")[1]) for x in new_main_sam if x[1] == '4']) | |
| 188 names.append(name) | |
| 189 samples.append(unique_seq) | |
| 190 data.append([con,name,unique_seq,sorted_uni_arms]) | |
| 191 ini_sample.append(new_main_sam) | |
| 192 | |
| 193 if con=="t": | |
| 194 LHE.extend(z[2] for z in unique_seq) | |
| 195 for y in unique_seq: | |
| 196 samples_mirna_names.append([y[2],y[9]]) | |
| 197 deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in unique_seq]) | |
| 198 LHE_names.extend(LHE) | |
| 199 unmap_seq.value += sum([1 for x in new_main_sam if x[1] == '4']) | |
| 200 unmap_counts.value += sum([int(x[0].split("-")[1]) for x in new_main_sam if x[1] == '4']) | |
| 201 names.append(name) | |
| 202 samples.append(unique_seq) | |
| 203 data.append([con,name,unique_seq,sorted_uni_arms]) | |
| 204 ini_sample.append(new_main_sam) | |
| 205 l.release() | |
| 206 | |
| 207 | |
| 208 ###################################################################################################################################### | |
| 209 """ | |
| 210 Read a sam file from Bowtie and do the followings: | |
| 211 | |
| 212 1) Remove reverse stranded mapped reads | |
| 213 2) Remove unmapped reads | |
| 214 3) Remove all sequences with reads less than 11 reads | |
| 215 4) Sort the arms with the most sequences in decreading rate | |
| 216 5) Sort the sequences of every arm with the most reads in decreasing rate | |
| 217 6) Calculate total number of sequences of every arm | |
| 218 7) Calculate total number of reads of sequences of every arm. | |
| 219 8) Store all the informations in a txt file | |
| 220 | |
| 221 """ | |
| 222 | |
| 223 def non_sam(mature_mirnas,path,name,con,l,data,names,n_deseq,n_samples_mirna_names,n_LHE_names): | |
| 224 | |
| 225 | |
| 226 ini_sam=read(path,0) | |
| 227 new_main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]] | |
| 228 unique_seq=[] | |
| 229 unique_seq = [x for x in new_main_sam if x[1] == '4' and len(x[9])>=18 and len(x[9])<=26] | |
| 230 uni_seq=[] | |
| 231 | |
| 232 # Calculate the shifted positions for every isomir and add them to the name of it | |
| 233 sorted_uni_arms = [] | |
| 234 for i in range(1,len(mature_mirnas),2): | |
| 235 tmp_count_reads = 0 # calculate the total number of reads | |
| 236 tmp_count_seq = 0 # calculate the total number of sequences | |
| 237 | |
| 238 for j in range(len(unique_seq)): | |
| 239 | |
| 240 temp_mature = mature_mirnas[i].strip().replace("U", "T") | |
| 241 | |
| 242 if temp_mature in unique_seq[j][9]: | |
| 243 | |
| 244 off_part = longestSubstring(temp_mature, unique_seq[j][9]) | |
| 245 | |
| 246 mat_diff = temp_mature.split(off_part) | |
| 247 mat_diff = [len(mat_diff[0]), len(mat_diff[1])] | |
| 248 | |
| 249 unique_diff = unique_seq[j][9].split(off_part) | |
| 250 if len(unique_diff)<=2: | |
| 251 unique_diff = [len(unique_diff[0]), len(unique_diff[1])] | |
| 252 | |
| 253 pre_pos = mat_diff[0]-unique_diff[0] | |
| 254 post_pos = unique_diff[1]-mat_diff[1] | |
| 255 | |
| 256 lengthofmir = len(off_part) + post_pos | |
| 257 if pre_pos == 0: | |
| 258 tmp_count_reads = tmp_count_reads + int(unique_seq[j][0].split("-")[1]) | |
| 259 tmp_count_seq = tmp_count_seq + 1 | |
| 260 | |
| 261 if pre_pos == 0: | |
| 262 t_name=copy.deepcopy(unique_seq[j]) | |
| 263 t_name[2]=mature_mirnas[i - 1] + "__" + str(pre_pos) + "_" + '{:+d}'.format(post_pos) + "_" + str(unique_seq[j][9][len(off_part):]) | |
| 264 uni_seq.append(t_name) | |
| 265 | |
| 266 if tmp_count_reads != 0 and tmp_count_seq != 0: | |
| 267 sorted_uni_arms.append([mature_mirnas[i-1], tmp_count_seq, tmp_count_reads]) | |
| 268 | |
| 269 | |
| 270 sorted_uni_arms = sorted(sorted_uni_arms, key=lambda x: x[1], reverse=True) | |
| 271 unique_seq = list(map(list, OrderedDict.fromkeys(map(tuple,uni_seq)))) | |
| 272 | |
| 273 LHE=[] | |
| 274 | |
| 275 l.acquire() | |
| 276 if con=="c": | |
| 277 LHE.extend(x[2] for x in unique_seq if x[2]!="*") | |
| 278 for x in unique_seq: | |
| 279 if x[2]!="*": | |
| 280 n_samples_mirna_names.append([x[2],x[9]]) | |
| 281 n_deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in unique_seq if x[2]!="*"]) | |
| 282 n_LHE_names.extend(LHE) | |
| 283 names.append(name) | |
| 284 data.append([con,name,unique_seq,sorted_uni_arms]) | |
| 285 | |
| 286 | |
| 287 if con=="t": | |
| 288 LHE.extend(x[2] for x in unique_seq if x[2]!="*") | |
| 289 for x in unique_seq: | |
| 290 if x[2]!="*": | |
| 291 n_samples_mirna_names.append([x[2],x[9]]) | |
| 292 n_deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in unique_seq if x[2]!="*"]) | |
| 293 n_LHE_names.extend(LHE) | |
| 294 names.append(name) | |
| 295 data.append([con,name,unique_seq,sorted_uni_arms]) | |
| 296 l.release() | |
| 297 | |
| 298 ################################################################################################################################################################################################################# | |
| 299 | |
| 300 def deseq2_temp(samples_mirna_names,deseq,con,l): | |
| 301 | |
| 302 samples_mirna_names.sort(key=lambda x:[0]) | |
| 303 | |
| 304 for i in range(len(deseq)): | |
| 305 for y in samples_mirna_names: | |
| 306 flag = 0 | |
| 307 for x in deseq[i]: | |
| 308 if y[0] == x[0]: | |
| 309 flag = 1 | |
| 310 break | |
| 311 | |
| 312 if flag == 0: | |
| 313 deseq[i].append([y[0], "0", y[1]]) | |
| 314 | |
| 315 [deseq[i].sort(key=lambda x: x[0]) for i, _ in enumerate(deseq)] | |
| 316 deseq_final = [[x[0],x[2]] for x in deseq[0]] | |
| 317 [deseq_final[z].append(deseq[i][j][1]) for z,_ in enumerate(deseq_final) for i, _ in enumerate(deseq) for j,_ in enumerate(deseq[i]) if deseq_final[z][0] == deseq[i][j][0]] | |
| 318 | |
| 319 l.acquire() | |
| 320 if con=="c": | |
| 321 q1.put(deseq_final) | |
| 322 | |
| 323 if con=="t": | |
| 324 q2.put(deseq_final) | |
| 325 l.release() | |
| 326 | |
| 327 | |
| 328 ################################################################################################################################################################################################################## | |
| 329 | |
| 330 def main_temp(LH2E, LH2E_names, LH8E, LH8E_names,flag,names_con,names_tre,filter_LH8E,filter_LH2E,raw_LH8E,raw_LH2E,per,count): | |
| 331 | |
| 332 LH8E_add_names = [x for x in LH2E_names if x not in LH8E_names] | |
| 333 LH2E_add_names = [x for x in LH8E_names if x not in LH2E_names] | |
| 334 | |
| 335 LH8E_add_names.sort() | |
| 336 LH2E_add_names.sort() | |
| 337 LH8E_add_names = list(LH8E_add_names for LH8E_add_names,_ in itertools.groupby(LH8E_add_names)) | |
| 338 LH2E_add_names = list(LH2E_add_names for LH2E_add_names,_ in itertools.groupby(LH2E_add_names)) | |
| 339 | |
| 340 LH2E.sort() | |
| 341 LH8E.sort() | |
| 342 LH2E = list(LH2E for LH2E,_ in itertools.groupby(LH2E)) | |
| 343 LH8E = list(LH8E for LH8E,_ in itertools.groupby(LH8E)) | |
| 344 | |
| 345 print("LHE_names") | |
| 346 print([len(LH8E_add_names),len(LH2E_add_names)]) | |
| 347 print([len(LH8E),len(LH2E)]) | |
| 348 | |
| 349 zeros=["0"]*(len(LH8E[0])-2) | |
| 350 [LH8E_add_names[i].extend(zeros) for i,_ in enumerate(LH8E_add_names)] | |
| 351 LH8E=LH8E+LH8E_add_names | |
| 352 | |
| 353 zeros=["0"]*(len(LH2E[0])-2) | |
| 354 [LH2E_add_names[i].extend(zeros) for i,_ in enumerate(LH2E_add_names)] | |
| 355 LH2E=LH2E+LH2E_add_names | |
| 356 | |
| 357 dupes=[] | |
| 358 final_LH2E =[] | |
| 359 | |
| 360 for num,_ in enumerate(LH2E): | |
| 361 | |
| 362 if LH2E[num][1] not in final_LH2E and LH2E[num][0] not in final_LH2E: | |
| 363 final_LH2E.append(LH2E[num][1]) | |
| 364 final_LH2E.append(LH2E[num][0]) | |
| 365 else: | |
| 366 dupes.append(LH2E[num][1]) | |
| 367 | |
| 368 dupes=list(set(dupes)) | |
| 369 | |
| 370 dupes=[[x] for x in dupes] | |
| 371 | |
| 372 for x in LH2E: | |
| 373 for y in dupes: | |
| 374 if x[1]==y[0]: | |
| 375 fl=0 | |
| 376 if len(y)==1: | |
| 377 y.append(x[0]) | |
| 378 else: | |
| 379 for i in range(1,len(y)): | |
| 380 if y[i].split("_")[0]+"_"+y[i].split("_")[1]==x[0].split("_")[0]+"_"+x[0].split("_")[1]: | |
| 381 fl=1 | |
| 382 if len(x[0])<len(y[i]): | |
| 383 del y[i] | |
| 384 y.append(x[0]) | |
| 385 break | |
| 386 | |
| 387 if fl==0: | |
| 388 y.append((x[0])) | |
| 389 | |
| 390 for y in dupes: | |
| 391 if len(y)>2: | |
| 392 for i in range(len(y)-1,1,-1): | |
| 393 y[1]=y[1]+"/"+y[i] | |
| 394 del y[i] | |
| 395 | |
| 396 | |
| 397 for x in LH2E: | |
| 398 for y in dupes: | |
| 399 if x[1]==y[0]: | |
| 400 x[0]=y[1] | |
| 401 | |
| 402 for x in LH8E: | |
| 403 for y in dupes: | |
| 404 if x[1]==y[0]: | |
| 405 x[0]=y[1] | |
| 406 | |
| 407 | |
| 408 | |
| 409 LH2E.sort() | |
| 410 LH2E=list(LH2E for LH2E,_ in itertools.groupby(LH2E)) | |
| 411 | |
| 412 LH8E.sort() | |
| 413 LH8E=list(LH8E for LH8E,_ in itertools.groupby(LH8E)) | |
| 414 | |
| 415 if int(per)!=-1: | |
| 416 percent=int(per)/100 | |
| 417 print(percent) | |
| 418 print(count) | |
| 419 | |
| 420 c_col_filter=round(percent*(len(LH2E[1])-2)) | |
| 421 t_col_filter=round(percent*(len(LH8E[1])-2)) | |
| 422 | |
| 423 for i, _ in enumerate(LH2E): | |
| 424 c_cols=0 | |
| 425 t_cols=0 | |
| 426 | |
| 427 c_cols=sum([1 for j in range(len(LH2E[i])-2) if int(LH2E[i][j+2])>=int(count)]) | |
| 428 t_cols=sum([1 for j in range(len(LH8E[i])-2) if int(LH8E[i][j+2])>=int(count)]) | |
| 429 | |
| 430 if c_cols>=c_col_filter or t_cols>=t_col_filter: | |
| 431 filter_LH8E.append(LH8E[i]) | |
| 432 filter_LH2E.append(LH2E[i]) | |
| 433 | |
| 434 raw_LH2E.extend(LH2E) | |
| 435 raw_LH8E.extend(LH8E) | |
| 436 | |
| 437 ################################################################################################################################################################################################################## | |
| 438 | |
| 439 def write_main(raw_LH2E, raw_LH8E, fil_LH2E, fil_LH8E, names_con, names_tre, flag, per, n1, n2): | |
| 440 | |
| 441 if flag == 1 and int(per)!=-1: | |
| 442 fp = open('Counts/Filtered '+n2 +' Templated Counts', 'w') | |
| 443 fp.write("Name\t") | |
| 444 fp.write("Sequence") | |
| 445 for y in names_tre: | |
| 446 fp.write("\t"+y) | |
| 447 | |
| 448 for x in fil_LH8E: | |
| 449 fp.write("\n%s" % "\t".join(x)) | |
| 450 fp.close() | |
| 451 | |
| 452 fp = open('Counts/Filtered '+n1+' Templated Counts', 'w') | |
| 453 fp.write("Name\t") | |
| 454 fp.write("Sequence") | |
| 455 for y in names_con: | |
| 456 fp.write("\t"+y) | |
| 457 | |
| 458 for x in fil_LH2E: | |
| 459 fp.write("\n%s" % "\t".join(x)) | |
| 460 fp.close() | |
| 461 | |
| 462 | |
| 463 if flag == 2 and int(per)!=-1: | |
| 464 fp = open('Counts/Filtered '+n2+' Non-Templated Counts', 'w') | |
| 465 fp.write("Name\t") | |
| 466 fp.write("Sequence") | |
| 467 for y in names_tre: | |
| 468 fp.write("\t"+y) | |
| 469 | |
| 470 | |
| 471 for x in fil_LH8E: | |
| 472 fp.write("\n%s" % "\t".join(x)) | |
| 473 fp.close() | |
| 474 | |
| 475 fp = open('Counts/Filtered '+n1+' Non-Templated Counts', 'w') | |
| 476 fp.write("Name\t") | |
| 477 fp.write("Sequence") | |
| 478 for y in names_con: | |
| 479 fp.write("\t"+y) | |
| 480 | |
| 481 for x in fil_LH2E: | |
| 482 fp.write("\n%s" % "\t".join(x)) | |
| 483 fp.close() | |
| 484 | |
| 485 | |
| 486 if flag == 1: | |
| 487 fp = open('Counts/Raw '+n2+' Templated Counts', 'w') | |
| 488 fp.write("Name\t") | |
| 489 fp.write("Sequence") | |
| 490 for y in names_tre: | |
| 491 fp.write("\t"+y) | |
| 492 | |
| 493 for x in raw_LH8E: | |
| 494 fp.write("\n%s" % "\t".join(x)) | |
| 495 fp.close() | |
| 496 | |
| 497 fp = open('Counts/Raw '+n1+' Templated Counts', 'w') | |
| 498 fp.write("Name\t") | |
| 499 fp.write("Sequence") | |
| 500 for y in names_con: | |
| 501 fp.write("\t"+y) | |
| 502 | |
| 503 for x in raw_LH2E: | |
| 504 fp.write("\n%s" % "\t".join(x)) | |
| 505 fp.close() | |
| 506 | |
| 507 if flag == 2: | |
| 508 fp = open('Counts/Raw '+n2+' Non-Templated Counts', 'w') | |
| 509 fp.write("Name\t") | |
| 510 fp.write("Sequence") | |
| 511 for y in names_tre: | |
| 512 fp.write("\t"+y) | |
| 513 | |
| 514 | |
| 515 for x in raw_LH8E: | |
| 516 fp.write("\n%s" % "\t".join(x)) | |
| 517 fp.close() | |
| 518 | |
| 519 fp = open('Counts/Raw '+n1+' Non-Templated Counts', 'w') | |
| 520 fp.write("Name\t") | |
| 521 fp.write("Sequence") | |
| 522 for y in names_con: | |
| 523 fp.write("\t"+y) | |
| 524 | |
| 525 for x in raw_LH2E: | |
| 526 fp.write("\n%s" % "\t".join(x)) | |
| 527 fp.close() | |
| 528 | |
| 529 #################################################################################################################################################################################################################### | |
| 530 | |
| 531 def ssamples(names,samp,folder,pro): | |
| 532 | |
| 533 for i in range(2,len(samp[0])): | |
| 534 | |
| 535 fp = open(folder+names[i-2]+'.txt','w') | |
| 536 fp.write("miRNA id"+"\t"+names[i-2]+"\n") | |
| 537 | |
| 538 for x in samp: | |
| 539 fp.write("%s" % "\t".join([x[0],x[i]])+"\n") | |
| 540 fp.close() | |
| 541 | |
| 542 #################################################################################################################################################################################################################### | |
| 543 | |
| 544 def DB_write(con,name,unique_seq,sorted_uni_arms,f): | |
| 545 | |
| 546 if f==1: | |
| 547 # Write a txt file with all the information | |
| 548 if con=="c": | |
| 549 fp = open('split1/'+name, 'w') | |
| 550 | |
| 551 fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence")) | |
| 552 if con=="t": | |
| 553 fp = open('split2/'+name, 'w') | |
| 554 fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence")) | |
| 555 | |
| 556 for i in range(len(sorted_uni_arms)): | |
| 557 temp = [] | |
| 558 for j in range(len(unique_seq)): | |
| 559 | |
| 560 if sorted_uni_arms[i][0] in (unique_seq[j][2].split("_")[0]+"_"+unique_seq[j][2].split("_")[1]): | |
| 561 | |
| 562 temp.append(unique_seq[j]) | |
| 563 | |
| 564 temp = sorted(temp, key=lambda x: int(x[0].split('-')[1]), reverse=True) | |
| 565 fp.write("*********************************************************************************************************\n") | |
| 566 fp.write("%-8s\t%-22s\t%-25s\t%-30s\t%s\n" % ("|",str(sorted_uni_arms[i][0]),"Sequence count = "+str(sorted_uni_arms[i][1]),"Total reads = "+str(sorted_uni_arms[i][2]),"|")) | |
| 567 fp.write("*********************************************************************************************************\n\n") | |
| 568 [fp.write("%-8s\t%-40s\t%s\n" % (x[0].split("-")[1], x[2],x[9])) for x in temp] | |
| 569 fp.write("\n" + "\n") | |
| 570 fp.close() | |
| 571 | |
| 572 if f==2: | |
| 573 | |
| 574 if con=="c": | |
| 575 fp = open('split3/'+name, 'w') | |
| 576 fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence")) | |
| 577 if con=="t": | |
| 578 fp = open('split4/'+name, 'w') | |
| 579 fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence")) | |
| 580 | |
| 581 for i in range(len(sorted_uni_arms)): | |
| 582 temp = [] | |
| 583 for j in range(len(unique_seq)): | |
| 584 if sorted_uni_arms[i][0]==unique_seq[j][2].split("__")[0]: | |
| 585 temp.append(unique_seq[j]) | |
| 586 if temp!=[]: | |
| 587 temp = sorted(temp, key=lambda x: int(x[0].split('-')[1]), reverse=True) | |
| 588 fp.write("*********************************************************************************************************\n") | |
| 589 fp.write("%-8s\t%-22s\t%-25s\t%-30s\t%s\n" % ("|",str(sorted_uni_arms[i][0]),"Sequence count = "+str(sorted_uni_arms[i][1]),"Total reads = "+str(sorted_uni_arms[i][2]),"|")) | |
| 590 fp.write("*********************************************************************************************************\n\n") | |
| 591 [fp.write("%-8s\t%-40s\t%s\n" % (x[0].split("-")[1], x[2],x[9])) for x in temp] | |
| 592 fp.write("\n" + "\n") | |
| 593 fp.close() | |
| 594 | |
| 595 | |
| 596 ########################################################################################################################## | |
| 597 | |
| 598 def new_mat_seq(pre_unique_seq,mat_mirnas,l): | |
| 599 | |
| 600 unique_iso = [] | |
| 601 for x in pre_unique_seq: | |
| 602 if len(x[2].split("_"))==3: | |
| 603 for y in pre_unique_seq: | |
| 604 if x[2] in y[2] and int(x[0].split("-")[1])<int(y[0].split("-")[1]): | |
| 605 if any(y[2] in lst2 for lst2 in unique_iso)==False: | |
| 606 y[2]=">"+y[2] | |
| 607 unique_iso.append(y) | |
| 608 l.acquire() | |
| 609 for x in unique_iso: | |
| 610 mat_mirnas.append(x[2]) | |
| 611 mat_mirnas.append(x[9]) | |
| 612 l.release() | |
| 613 | |
| 614 ######################################################################################################################### | |
| 615 | |
| 616 def merging_names(LH2E_copy,new): | |
| 617 | |
| 618 dupes=[] | |
| 619 final_LH2E =[] | |
| 620 | |
| 621 for num in range(len(LH2E_copy)): | |
| 622 | |
| 623 if LH2E_copy[num][1] not in final_LH2E and LH2E_copy[num][0] not in final_LH2E: | |
| 624 final_LH2E.append(LH2E_copy[num][1]) | |
| 625 final_LH2E.append(LH2E_copy[num][0]) | |
| 626 else: | |
| 627 dupes.append(LH2E_copy[num][1]) | |
| 628 | |
| 629 dupes=list(set(dupes)) | |
| 630 | |
| 631 for i in range(len(dupes)): | |
| 632 dupes[i]=[dupes[i]] | |
| 633 | |
| 634 for x in LH2E_copy: | |
| 635 for y in dupes: | |
| 636 if x[1]==y[0]: | |
| 637 fl=0 | |
| 638 if len(y)==1: | |
| 639 y.append(x[0]) | |
| 640 else: | |
| 641 for i in range(1,len(y)): | |
| 642 if y[i].split("_")[0]+"_"+y[i].split("_")[1]==x[0].split("_")[0]+"_"+x[0].split("_")[1]: | |
| 643 fl=1 | |
| 644 if len(x[0])<len(y[i]): | |
| 645 del y[i] | |
| 646 y.append(x[0]) | |
| 647 break | |
| 648 | |
| 649 if fl==0: | |
| 650 y.append((x[0])) | |
| 651 | |
| 652 for y in dupes: | |
| 653 if len(y)>2: | |
| 654 for i in range(len(y)-1,1,-1): | |
| 655 y[1]=y[1]+"/"+y[i] | |
| 656 del y[i] | |
| 657 | |
| 658 | |
| 659 for x in LH2E_copy: | |
| 660 for y in dupes: | |
| 661 if x[1]==y[0]: | |
| 662 x[0]=y[1] | |
| 663 | |
| 664 LH2E_copy.sort() | |
| 665 LH2E_copy=list(LH2E_copy for LH2E_copy,_ in itertools.groupby(LH2E_copy)) | |
| 666 new.extend(LH2E_copy) | |
| 667 | |
| 668 ###################################################################################################################################################### | |
| 669 | |
| 670 def ssamples1(tem_names,tem_samp,non_names,non_samp,folder,pro): | |
| 671 | |
| 672 for i in range(2,len(tem_samp[0])): | |
| 673 | |
| 674 fp = open(folder+tem_names[i-2]+'.txt','w') | |
| 675 fp.write("miRNA id"+"\t"+tem_names[i-2]+"\n") | |
| 676 | |
| 677 for x in tem_samp: | |
| 678 fp.write("%s" % "\t".join([x[0],x[i]])+"\n") | |
| 679 | |
| 680 for j in range(len(non_names)): | |
| 681 if non_names[j]==tem_names[i-2]: | |
| 682 for x in non_samp: | |
| 683 fp.write("%s" % "\t".join([x[0],x[j+2]])+"\n") | |
| 684 fp.close() | |
| 685 | |
| 686 ################################################################################################################################################################################################################# | |
| 687 | |
| 688 def download_matures(matures,org_name): | |
| 689 | |
| 690 mature_mir=[] | |
| 691 | |
| 692 mat_url = 'http://mirgenedb.org/fasta/'+org_name+'?mat=1' | |
| 693 star_url = 'http://mirgenedb.org/fasta/'+org_name+'?star=1' | |
| 694 | |
| 695 data = urllib.request.urlopen(mat_url).read() | |
| 696 file_mirna = data.decode('utf-8') | |
| 697 mature_mir = file_mirna.split("\n") | |
| 698 mature_mir = [x.replace(">","") for x in mature_mir] | |
| 699 del mature_mir[-1] | |
| 700 | |
| 701 data = urllib.request.urlopen(star_url).read() | |
| 702 file_mirna = data.decode('utf-8') | |
| 703 star_mir = file_mirna.split("\n") | |
| 704 star_mir = [x.replace(">","") for x in star_mir] | |
| 705 del star_mir[-1] | |
| 706 | |
| 707 mature_mir.extend(star_mir) | |
| 708 | |
| 709 for i in range(1,len(mature_mir),2): | |
| 710 mature_mir[i]=mature_mir[i].replace("U","T") | |
| 711 | |
| 712 matures.extend(mature_mir) | |
| 713 | |
| 714 ################################################################################################################### | |
| 715 | |
| 716 def non_template_ref(sc,st,all_isoforms): | |
| 717 | |
| 718 pre_uni_seq_con = list(sc) | |
| 719 pre_uni_seq_tre = list(st) | |
| 720 | |
| 721 for x in pre_uni_seq_con: | |
| 722 for y in x: | |
| 723 if y[2] not in all_isoforms and len(y[2].split("_"))>2: | |
| 724 all_isoforms.append(y[2]) | |
| 725 all_isoforms.append(y[9]) | |
| 726 | |
| 727 for x in pre_uni_seq_tre: | |
| 728 for y in x: | |
| 729 if y[2] not in all_isoforms and len(y[2].split("_"))>2: | |
| 730 all_isoforms.append(y[2]) | |
| 731 all_isoforms.append(y[9]) | |
| 732 | |
| 733 ################################################################################################################################################################################################ | |
| 734 | |
| 735 def deseqe2(sample,mir_names,l,new_d,sample_name,sample_order): | |
| 736 | |
| 737 for y in mir_names: | |
| 738 flag=0 | |
| 739 for x in sample: | |
| 740 if y[0]==x[0]: | |
| 741 flag=1 | |
| 742 break | |
| 743 if flag==0: | |
| 744 sample.append([y[0],"0",y[1]]) | |
| 745 | |
| 746 sample.sort(key=lambda x: x[0]) | |
| 747 sample=list(sample for sample,_ in itertools.groupby(sample)) | |
| 748 | |
| 749 l.acquire() | |
| 750 new_d.append(sample) | |
| 751 sample_order.append(sample_name) | |
| 752 l.release() | |
| 753 | |
| 754 ############################################################################################################################################################################################### | |
| 755 |
