| 17 | 1 import itertools | 
|  | 2 import time | 
|  | 3 import sys | 
|  | 4 import os | 
|  | 5 import urllib.request | 
|  | 6 import gzip | 
|  | 7 from multiprocessing import Process, Queue, Lock, Pool, Manager, Value | 
|  | 8 import subprocess | 
|  | 9 import argparse | 
|  | 10 from collections import OrderedDict | 
|  | 11 from matplotlib.backends.backend_pdf import PdfPages | 
|  | 12 import pandas as pd | 
|  | 13 from math import pi | 
|  | 14 import numpy as np | 
|  | 15 import matplotlib.pyplot as plt | 
|  | 16 from matplotlib.ticker import PercentFormatter | 
|  | 17 import seaborn as sns | 
|  | 18 import scipy.stats as stats | 
|  | 19 from plotnine import * | 
|  | 20 import math | 
|  | 21 import re | 
|  | 22 import matplotlib.ticker as mtick | 
|  | 23 import copy | 
|  | 24 | 
|  | 25 | 
|  | 26 | 
|  | 27 | 
|  | 28 """---------------------- Simple Functions -----------------------""" | 
|  | 29 | 
|  | 30 # Read a file and return it as a list | 
|  | 31 def read(path, flag): | 
|  | 32     if flag == 0: | 
|  | 33         with open(path) as fp: | 
|  | 34             file=fp.readlines() | 
|  | 35         fp.close() | 
|  | 36         return file | 
|  | 37 | 
|  | 38     if flag == 1: | 
|  | 39         with open(path) as fp: | 
|  | 40             file = fp.read().splitlines() | 
|  | 41         fp.close() | 
|  | 42         return file | 
|  | 43 | 
|  | 44 # Write a list to a txt file | 
|  | 45 def write(path, list): | 
|  | 46     with open(path,'w') as fp: | 
|  | 47         for x in list: | 
|  | 48             fp.write(str("\t".join(x[1:-1]))) | 
|  | 49     fp.close() | 
|  | 50 | 
|  | 51 """---------------------- RNA-seq Functions ----------------------""" | 
|  | 52 | 
|  | 53 # Detect the longest common substring sequence between two mirnas | 
|  | 54 def longestSubstring(str1, str2): | 
|  | 55 | 
|  | 56     from difflib import SequenceMatcher | 
|  | 57     # initialize SequenceMatcher object with | 
|  | 58     # input string | 
|  | 59     seqMatch = SequenceMatcher(None, str1, str2) | 
|  | 60 | 
|  | 61     # find match of longest sub-string | 
|  | 62     # output will be like Match(a=0, b=0, size=5) | 
|  | 63     match = seqMatch.find_longest_match(0, len(str1), 0, len(str2)) | 
|  | 64 | 
|  | 65     # print longest substring | 
|  | 66     if (match.size != 0): | 
|  | 67         return str1[match.a: match.a + match.size] | 
|  | 68     else: | 
|  | 69         print('No longest common sub-string found') | 
|  | 70 | 
|  | 71 | 
|  | 72 | 
|  | 73 ######################################################################################################################################################## | 
|  | 74 def collapse_sam(path): | 
|  | 75 | 
|  | 76     ini_sam=read(path,0) | 
|  | 77     main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]] | 
|  | 78     intro_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" in  x.split("\t")[0]] | 
|  | 79 | 
|  | 80     uni_seq = [] | 
|  | 81     for x in main_sam: | 
|  | 82 | 
|  | 83         if [x[2], x[9]] not in uni_seq: | 
|  | 84             uni_seq.append([x[2], x[9]]) | 
|  | 85 | 
|  | 86     new_main_sam=[] | 
|  | 87     incr_num=0 | 
|  | 88     for i in range(len(uni_seq)): | 
|  | 89         count=0 | 
|  | 90         incr_num+=1 | 
|  | 91         for y in main_sam: | 
|  | 92             if uni_seq[i][1]==y[9] and uni_seq[i][0]==y[2]: | 
|  | 93                count+=1 | 
|  | 94                temp=y | 
|  | 95         temp[10]="~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" | 
|  | 96         temp[0]=str(incr_num)+"-"+str(count) | 
|  | 97         new_main_sam.append(temp) | 
|  | 98 | 
|  | 99     new_sam=intro_sam+new_main_sam | 
|  | 100 | 
|  | 101     return new_sam | 
|  | 102 | 
|  | 103 ################################################################################################################################################################# | 
|  | 104 | 
|  | 105 def sam(mature_mirnas,path,name,con,l,samples,data,names,unmap_seq,samples_mirna_names,deseq,LHE_names,umi,ini_sample,unmap_counts): | 
|  | 106 | 
|  | 107     # read the sam file | 
|  | 108 | 
|  | 109     ini_sam=read(path,0) | 
|  | 110     new_main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]] | 
|  | 111     unique_seq = [x for x in new_main_sam if x[1] == '0' and len(x[9])>=18 and len(x[9])<=26] | 
|  | 112 | 
|  | 113     # Calculate the shifted positions for every isomir and add them to the name of it | 
|  | 114     sorted_uni_arms = [] | 
|  | 115 | 
|  | 116     for i in range(len(mature_mirnas)): | 
|  | 117         tmp_count_reads = 0   # calculate the total number of reads | 
|  | 118         tmp_count_seq = 0     # calculate the total number of sequences | 
|  | 119         for j in range(len(unique_seq)): | 
|  | 120 | 
|  | 121          if mature_mirnas[i] == unique_seq[j][2]: | 
|  | 122 | 
|  | 123                 temp_mature = mature_mirnas[i+1] | 
|  | 124                 off_part = longestSubstring(temp_mature, unique_seq[j][9]) | 
|  | 125 | 
|  | 126                 mat_diff = temp_mature.split(off_part) | 
|  | 127                 mat_diff = [len(mat_diff[0]), len(mat_diff[1])] | 
|  | 128 | 
|  | 129                 unique_diff = unique_seq[j][9].split(off_part) | 
|  | 130                 unique_diff = [len(unique_diff[0]), len(unique_diff[1])] | 
|  | 131 | 
|  | 132                 # Problem with hsa-miR-8485 | 
|  | 133                 if mat_diff[1]!=0 and unique_diff[1]!=0: | 
|  | 134                     unique_seq[j]=1 | 
|  | 135                     pre_pos = 0 | 
|  | 136                     post_pos = 0 | 
|  | 137 | 
|  | 138                 elif mat_diff[0]!=0 and unique_diff[0]!=0: | 
|  | 139                     unique_seq[j]=1 | 
|  | 140                     pre_pos = 0 | 
|  | 141                     post_pos = 0 | 
|  | 142 | 
|  | 143                 else: | 
|  | 144                    pre_pos = mat_diff[0]-unique_diff[0] | 
|  | 145                    post_pos = unique_diff[1]-mat_diff[1] | 
|  | 146                    tmp_count_reads = tmp_count_reads + int(unique_seq[j][0].split("-")[1]) | 
|  | 147                    tmp_count_seq = tmp_count_seq+1 | 
|  | 148 | 
|  | 149 | 
|  | 150                 if pre_pos != 0 or post_pos != 0: | 
|  | 151                     if pre_pos == 0: | 
|  | 152                         unique_seq[j][2] = unique_seq[j][2] + "_" +str(pre_pos) + "_" + '{:+d}'.format(post_pos) | 
|  | 153                     elif post_pos == 0: | 
|  | 154                         unique_seq[j][2] = unique_seq[j][2] + "_" + '{:+d}'.format(pre_pos) + "_" + str(post_pos) | 
|  | 155                     else: | 
|  | 156                         unique_seq[j][2] = unique_seq[j][2]+"_"+'{:+d}'.format(pre_pos)+"_"+'{:+d}'.format(post_pos) | 
|  | 157 | 
|  | 158         for x in range(unique_seq.count(1)): | 
|  | 159            unique_seq.remove(1) | 
|  | 160         if tmp_count_reads != 0 and tmp_count_seq != 0: | 
|  | 161            sorted_uni_arms.append([mature_mirnas[i], tmp_count_seq, tmp_count_reads]) | 
|  | 162 | 
|  | 163     # Store name of arms, number of sequences and number of reads | 
|  | 164     sorted_uni_arms = sorted(sorted_uni_arms, key=lambda x: x[1], reverse=True) | 
|  | 165 | 
|  | 166     for y in sorted_uni_arms: | 
|  | 167        counts=0 | 
|  | 168        seqs=0 | 
|  | 169        for x in unique_seq: | 
|  | 170            if y[0]==x[2].split("_")[0]+"_"+x[2].split("_")[1]: | 
|  | 171               counts+=int(x[0].split("-")[1]) | 
|  | 172               seqs+=1 | 
|  | 173 | 
|  | 174        y[1]=seqs | 
|  | 175        y[2]=counts | 
|  | 176 | 
|  | 177     LHE=[] | 
|  | 178 | 
|  | 179     l.acquire() | 
|  | 180     if con=="c": | 
|  | 181        LHE.extend(z[2] for z in unique_seq) | 
|  | 182        for y in unique_seq: | 
|  | 183            samples_mirna_names.append([y[2],y[9]]) | 
|  | 184        deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in unique_seq]) | 
|  | 185        LHE_names.extend(LHE) | 
|  | 186        unmap_seq.value += sum([1 for x in new_main_sam if x[1] == '4']) | 
|  | 187        unmap_counts.value += sum([int(x[0].split("-")[1]) for x in new_main_sam if x[1] == '4']) | 
|  | 188        names.append(name) | 
|  | 189        samples.append(unique_seq) | 
|  | 190        data.append([con,name,unique_seq,sorted_uni_arms]) | 
|  | 191        ini_sample.append(new_main_sam) | 
|  | 192 | 
|  | 193     if con=="t": | 
|  | 194        LHE.extend(z[2] for z in unique_seq) | 
|  | 195        for y in unique_seq: | 
|  | 196            samples_mirna_names.append([y[2],y[9]]) | 
|  | 197        deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in unique_seq]) | 
|  | 198        LHE_names.extend(LHE) | 
|  | 199        unmap_seq.value += sum([1 for x in new_main_sam if x[1] == '4']) | 
|  | 200        unmap_counts.value += sum([int(x[0].split("-")[1]) for x in new_main_sam if x[1] == '4']) | 
|  | 201        names.append(name) | 
|  | 202        samples.append(unique_seq) | 
|  | 203        data.append([con,name,unique_seq,sorted_uni_arms]) | 
|  | 204        ini_sample.append(new_main_sam) | 
|  | 205     l.release() | 
|  | 206 | 
|  | 207 | 
|  | 208 ###################################################################################################################################### | 
|  | 209 """ | 
|  | 210 Read a sam file from Bowtie and do the followings: | 
|  | 211 | 
|  | 212 1) Remove reverse stranded mapped reads | 
|  | 213 2) Remove unmapped reads | 
|  | 214 3) Remove all sequences with reads less than 11 reads | 
|  | 215 4) Sort the arms with the most sequences in decreading rate | 
|  | 216 5) Sort the sequences of every arm with the most reads in decreasing rate | 
|  | 217 6) Calculate total number of sequences of every arm | 
|  | 218 7) Calculate total number of reads of sequences of every arm. | 
|  | 219 8) Store all the informations in a txt file | 
|  | 220 | 
|  | 221 """ | 
|  | 222 | 
|  | 223 def non_sam(mature_mirnas,path,name,con,l,data,names,n_deseq,n_samples_mirna_names,n_LHE_names): | 
|  | 224 | 
|  | 225 | 
|  | 226     ini_sam=read(path,0) | 
|  | 227     new_main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]] | 
|  | 228     unique_seq=[] | 
|  | 229     unique_seq = [x for x in new_main_sam if x[1] == '4' and len(x[9])>=18 and len(x[9])<=26] | 
|  | 230     uni_seq=[] | 
|  | 231 | 
|  | 232     # Calculate the shifted positions for every isomir and add them to the name of it | 
|  | 233     sorted_uni_arms = [] | 
|  | 234     for i in range(1,len(mature_mirnas),2): | 
|  | 235         tmp_count_reads = 0   # calculate the total number of reads | 
|  | 236         tmp_count_seq = 0     # calculate the total number of sequences | 
|  | 237 | 
|  | 238         for j in range(len(unique_seq)): | 
|  | 239 | 
|  | 240             temp_mature = mature_mirnas[i].strip().replace("U", "T") | 
|  | 241 | 
|  | 242             if temp_mature in unique_seq[j][9]: | 
|  | 243 | 
|  | 244                 off_part = longestSubstring(temp_mature, unique_seq[j][9]) | 
|  | 245 | 
|  | 246                 mat_diff = temp_mature.split(off_part) | 
|  | 247                 mat_diff = [len(mat_diff[0]), len(mat_diff[1])] | 
|  | 248 | 
|  | 249                 unique_diff = unique_seq[j][9].split(off_part) | 
|  | 250                 if len(unique_diff)<=2: | 
|  | 251                    unique_diff = [len(unique_diff[0]), len(unique_diff[1])] | 
|  | 252 | 
|  | 253                    pre_pos = mat_diff[0]-unique_diff[0] | 
|  | 254                    post_pos = unique_diff[1]-mat_diff[1] | 
|  | 255 | 
|  | 256                    lengthofmir = len(off_part) + post_pos | 
|  | 257                    if pre_pos == 0: | 
|  | 258                       tmp_count_reads = tmp_count_reads + int(unique_seq[j][0].split("-")[1]) | 
|  | 259                       tmp_count_seq = tmp_count_seq + 1 | 
|  | 260 | 
|  | 261                       if pre_pos == 0: | 
|  | 262                           t_name=copy.deepcopy(unique_seq[j]) | 
|  | 263                           t_name[2]=mature_mirnas[i - 1] + "__" + str(pre_pos) + "_" + '{:+d}'.format(post_pos) + "_" + str(unique_seq[j][9][len(off_part):]) | 
|  | 264                           uni_seq.append(t_name) | 
|  | 265 | 
|  | 266         if tmp_count_reads != 0 and tmp_count_seq != 0: | 
|  | 267             sorted_uni_arms.append([mature_mirnas[i-1], tmp_count_seq, tmp_count_reads]) | 
|  | 268 | 
|  | 269 | 
|  | 270     sorted_uni_arms = sorted(sorted_uni_arms, key=lambda x: x[1], reverse=True) | 
|  | 271     unique_seq = list(map(list, OrderedDict.fromkeys(map(tuple,uni_seq)))) | 
|  | 272 | 
|  | 273     LHE=[] | 
|  | 274 | 
|  | 275     l.acquire() | 
|  | 276     if con=="c": | 
|  | 277        LHE.extend(x[2] for x in unique_seq if x[2]!="*") | 
|  | 278        for x in unique_seq: | 
|  | 279            if x[2]!="*": | 
|  | 280               n_samples_mirna_names.append([x[2],x[9]]) | 
|  | 281        n_deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in unique_seq if x[2]!="*"]) | 
|  | 282        n_LHE_names.extend(LHE) | 
|  | 283        names.append(name) | 
|  | 284        data.append([con,name,unique_seq,sorted_uni_arms]) | 
|  | 285 | 
|  | 286 | 
|  | 287     if con=="t": | 
|  | 288        LHE.extend(x[2] for x in unique_seq if x[2]!="*") | 
|  | 289        for x in unique_seq: | 
|  | 290            if x[2]!="*": | 
|  | 291               n_samples_mirna_names.append([x[2],x[9]]) | 
|  | 292        n_deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in unique_seq if x[2]!="*"]) | 
|  | 293        n_LHE_names.extend(LHE) | 
|  | 294        names.append(name) | 
|  | 295        data.append([con,name,unique_seq,sorted_uni_arms]) | 
|  | 296     l.release() | 
|  | 297 | 
|  | 298 ################################################################################################################################################################################################################# | 
|  | 299 | 
|  | 300 def deseq2_temp(samples_mirna_names,deseq,con,l): | 
|  | 301 | 
|  | 302     samples_mirna_names.sort(key=lambda x:[0]) | 
|  | 303 | 
|  | 304     for i in range(len(deseq)): | 
|  | 305         for y in samples_mirna_names: | 
|  | 306             flag = 0 | 
|  | 307             for x in deseq[i]: | 
|  | 308                 if y[0] == x[0]: | 
|  | 309                     flag = 1 | 
|  | 310                     break | 
|  | 311 | 
|  | 312             if flag == 0: | 
|  | 313                 deseq[i].append([y[0], "0", y[1]]) | 
|  | 314 | 
|  | 315     [deseq[i].sort(key=lambda x: x[0]) for i, _ in enumerate(deseq)] | 
|  | 316     deseq_final = [[x[0],x[2]] for x in deseq[0]] | 
|  | 317     [deseq_final[z].append(deseq[i][j][1]) for z,_ in enumerate(deseq_final) for i, _ in enumerate(deseq) for j,_ in enumerate(deseq[i]) if deseq_final[z][0] == deseq[i][j][0]] | 
|  | 318 | 
|  | 319     l.acquire() | 
|  | 320     if con=="c": | 
|  | 321        q1.put(deseq_final) | 
|  | 322 | 
|  | 323     if con=="t": | 
|  | 324        q2.put(deseq_final) | 
|  | 325     l.release() | 
|  | 326 | 
|  | 327 | 
|  | 328 ################################################################################################################################################################################################################## | 
|  | 329 | 
|  | 330 def main_temp(LH2E, LH2E_names, LH8E, LH8E_names,flag,names_con,names_tre,filter_LH8E,filter_LH2E,raw_LH8E,raw_LH2E,per,count): | 
|  | 331 | 
|  | 332  LH8E_add_names = [x for x in LH2E_names if x not in LH8E_names] | 
|  | 333  LH2E_add_names = [x for x in LH8E_names if x not in LH2E_names] | 
|  | 334 | 
|  | 335  LH8E_add_names.sort() | 
|  | 336  LH2E_add_names.sort() | 
|  | 337  LH8E_add_names = list(LH8E_add_names for LH8E_add_names,_ in itertools.groupby(LH8E_add_names)) | 
|  | 338  LH2E_add_names = list(LH2E_add_names for LH2E_add_names,_ in itertools.groupby(LH2E_add_names)) | 
|  | 339 | 
|  | 340  LH2E.sort() | 
|  | 341  LH8E.sort() | 
|  | 342  LH2E = list(LH2E for LH2E,_ in itertools.groupby(LH2E)) | 
|  | 343  LH8E = list(LH8E for LH8E,_ in itertools.groupby(LH8E)) | 
|  | 344 | 
|  | 345  zeros=["0"]*(len(LH8E[0])-2) | 
|  | 346  [LH8E_add_names[i].extend(zeros) for i,_ in enumerate(LH8E_add_names)] | 
|  | 347  LH8E=LH8E+LH8E_add_names | 
|  | 348 | 
|  | 349  zeros=["0"]*(len(LH2E[0])-2) | 
|  | 350  [LH2E_add_names[i].extend(zeros) for i,_ in enumerate(LH2E_add_names)] | 
|  | 351  LH2E=LH2E+LH2E_add_names | 
|  | 352 | 
|  | 353  dupes=[] | 
|  | 354  final_LH2E =[] | 
|  | 355 | 
|  | 356  for num,_ in enumerate(LH2E): | 
|  | 357 | 
|  | 358     if LH2E[num][1] not in final_LH2E and LH2E[num][0] not in final_LH2E: | 
|  | 359         final_LH2E.append(LH2E[num][1]) | 
|  | 360         final_LH2E.append(LH2E[num][0]) | 
|  | 361     else: | 
|  | 362         dupes.append(LH2E[num][1]) | 
|  | 363 | 
|  | 364  dupes=list(set(dupes)) | 
|  | 365 | 
|  | 366  dupes=[[x] for x in dupes] | 
|  | 367 | 
|  | 368  for x in LH2E: | 
|  | 369      for y in dupes: | 
|  | 370          if x[1]==y[0]: | 
|  | 371              fl=0 | 
|  | 372              if len(y)==1: | 
|  | 373                  y.append(x[0]) | 
|  | 374              else: | 
|  | 375                  for i in range(1,len(y)): | 
|  | 376                      if y[i].split("_")[0]+"_"+y[i].split("_")[1]==x[0].split("_")[0]+"_"+x[0].split("_")[1]: | 
|  | 377                         fl=1 | 
|  | 378                         if len(x[0])<len(y[i]): | 
|  | 379                            del y[i] | 
|  | 380                            y.append(x[0]) | 
|  | 381                            break | 
|  | 382 | 
|  | 383                  if fl==0: | 
|  | 384                     y.append((x[0])) | 
|  | 385 | 
|  | 386  for y in dupes: | 
|  | 387     if len(y)>2: | 
|  | 388         for i in range(len(y)-1,1,-1): | 
|  | 389             y[1]=y[1]+"/"+y[i] | 
|  | 390             del y[i] | 
|  | 391 | 
|  | 392 | 
|  | 393  for x in LH2E: | 
|  | 394      for y in dupes: | 
|  | 395          if x[1]==y[0]: | 
|  | 396             x[0]=y[1] | 
|  | 397 | 
|  | 398  for x in LH8E: | 
|  | 399     for y in dupes: | 
|  | 400         if x[1]==y[0]: | 
|  | 401            x[0]=y[1] | 
|  | 402 | 
|  | 403 | 
|  | 404 | 
|  | 405  LH2E.sort() | 
|  | 406  LH2E=list(LH2E for LH2E,_ in itertools.groupby(LH2E)) | 
|  | 407 | 
|  | 408  LH8E.sort() | 
|  | 409  LH8E=list(LH8E for LH8E,_ in itertools.groupby(LH8E)) | 
|  | 410 | 
|  | 411  if int(per)!=-1: | 
|  | 412     percent=int(per)/100 | 
|  | 413 | 
|  | 414     c_col_filter=round(percent*(len(LH2E[1])-2)) | 
|  | 415     t_col_filter=round(percent*(len(LH8E[1])-2)) | 
|  | 416 | 
|  | 417     for i, _ in enumerate(LH2E): | 
|  | 418         c_cols=0 | 
|  | 419         t_cols=0 | 
|  | 420 | 
|  | 421         c_cols=sum([1 for j in range(len(LH2E[i])-2) if int(LH2E[i][j+2])>=int(count)]) | 
|  | 422         t_cols=sum([1 for j in range(len(LH8E[i])-2) if int(LH8E[i][j+2])>=int(count)]) | 
|  | 423 | 
|  | 424         if c_cols>=c_col_filter or t_cols>=t_col_filter: | 
|  | 425            filter_LH8E.append(LH8E[i]) | 
|  | 426            filter_LH2E.append(LH2E[i]) | 
|  | 427 | 
|  | 428  raw_LH2E.extend(LH2E) | 
|  | 429  raw_LH8E.extend(LH8E) | 
|  | 430 | 
|  | 431 ################################################################################################################################################################################################################## | 
|  | 432 | 
|  | 433 def write_main(raw_LH2E, raw_LH8E, fil_LH2E, fil_LH8E, names_con, names_tre, flag, per, n1, n2): | 
|  | 434 | 
|  | 435  if flag == 1 and int(per)!=-1: | 
|  | 436     fp = open('Counts/Filtered '+n2 +' Templated Counts', 'w') | 
|  | 437     fp.write("Name\t") | 
|  | 438     fp.write("Sequence") | 
|  | 439     for y in names_tre: | 
|  | 440        fp.write("\t"+y) | 
|  | 441 | 
|  | 442     for x in fil_LH8E: | 
|  | 443         fp.write("\n%s" % "\t".join(x)) | 
|  | 444     fp.close() | 
|  | 445 | 
|  | 446     fp = open('Counts/Filtered '+n1+' Templated Counts', 'w') | 
|  | 447     fp.write("Name\t") | 
|  | 448     fp.write("Sequence") | 
|  | 449     for y in names_con: | 
|  | 450        fp.write("\t"+y) | 
|  | 451 | 
|  | 452     for x in fil_LH2E: | 
|  | 453         fp.write("\n%s" % "\t".join(x)) | 
|  | 454     fp.close() | 
|  | 455 | 
|  | 456 | 
|  | 457  if flag == 2 and int(per)!=-1: | 
|  | 458     fp = open('Counts/Filtered '+n2+' Non-Templated Counts', 'w') | 
|  | 459     fp.write("Name\t") | 
|  | 460     fp.write("Sequence") | 
|  | 461     for y in names_tre: | 
|  | 462        fp.write("\t"+y) | 
|  | 463 | 
|  | 464 | 
|  | 465     for x in fil_LH8E: | 
|  | 466         fp.write("\n%s" % "\t".join(x)) | 
|  | 467     fp.close() | 
|  | 468 | 
|  | 469     fp = open('Counts/Filtered '+n1+' Non-Templated Counts', 'w') | 
|  | 470     fp.write("Name\t") | 
|  | 471     fp.write("Sequence") | 
|  | 472     for y in names_con: | 
|  | 473        fp.write("\t"+y) | 
|  | 474 | 
|  | 475     for x in fil_LH2E: | 
|  | 476         fp.write("\n%s" % "\t".join(x)) | 
|  | 477     fp.close() | 
|  | 478 | 
|  | 479 | 
|  | 480  if flag == 1: | 
|  | 481     fp = open('Counts/Raw '+n2+' Templated Counts', 'w') | 
|  | 482     fp.write("Name\t") | 
|  | 483     fp.write("Sequence") | 
|  | 484     for y in names_tre: | 
|  | 485        fp.write("\t"+y) | 
|  | 486 | 
|  | 487     for x in raw_LH8E: | 
|  | 488         fp.write("\n%s" % "\t".join(x)) | 
|  | 489     fp.close() | 
|  | 490 | 
|  | 491     fp = open('Counts/Raw '+n1+' Templated Counts', 'w') | 
|  | 492     fp.write("Name\t") | 
|  | 493     fp.write("Sequence") | 
|  | 494     for y in names_con: | 
|  | 495        fp.write("\t"+y) | 
|  | 496 | 
|  | 497     for x in raw_LH2E: | 
|  | 498         fp.write("\n%s" % "\t".join(x)) | 
|  | 499     fp.close() | 
|  | 500 | 
|  | 501  if flag == 2: | 
|  | 502     fp = open('Counts/Raw '+n2+' Non-Templated Counts', 'w') | 
|  | 503     fp.write("Name\t") | 
|  | 504     fp.write("Sequence") | 
|  | 505     for y in names_tre: | 
|  | 506        fp.write("\t"+y) | 
|  | 507 | 
|  | 508 | 
|  | 509     for x in raw_LH8E: | 
|  | 510         fp.write("\n%s" % "\t".join(x)) | 
|  | 511     fp.close() | 
|  | 512 | 
|  | 513     fp = open('Counts/Raw '+n1+' Non-Templated Counts', 'w') | 
|  | 514     fp.write("Name\t") | 
|  | 515     fp.write("Sequence") | 
|  | 516     for y in names_con: | 
|  | 517        fp.write("\t"+y) | 
|  | 518 | 
|  | 519     for x in raw_LH2E: | 
|  | 520         fp.write("\n%s" % "\t".join(x)) | 
|  | 521     fp.close() | 
|  | 522 | 
|  | 523 #################################################################################################################################################################################################################### | 
|  | 524 | 
|  | 525 def ssamples(names,samp,folder,pro): | 
|  | 526 | 
|  | 527     for i in range(2,len(samp[0])): | 
|  | 528 | 
|  | 529        fp = open(folder+names[i-2]+'.txt','w') | 
|  | 530        fp.write("miRNA id"+"\t"+names[i-2]+"\n") | 
|  | 531 | 
|  | 532        for x in samp: | 
|  | 533            fp.write("%s" % "\t".join([x[0],x[i]])+"\n") | 
|  | 534        fp.close() | 
|  | 535 | 
|  | 536 #################################################################################################################################################################################################################### | 
|  | 537 | 
|  | 538 def DB_write(con,name,unique_seq,sorted_uni_arms,f): | 
|  | 539 | 
|  | 540  if f==1: | 
|  | 541     # Write a txt file with all the information | 
|  | 542     if con=="c": | 
|  | 543        fp = open('split1/'+name, 'w') | 
|  | 544 | 
|  | 545        fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence")) | 
|  | 546     if con=="t": | 
|  | 547        fp = open('split2/'+name, 'w') | 
|  | 548        fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence")) | 
|  | 549 | 
|  | 550     for i in range(len(sorted_uni_arms)): | 
|  | 551         temp = [] | 
|  | 552         for j in range(len(unique_seq)): | 
|  | 553 | 
|  | 554             if sorted_uni_arms[i][0] in (unique_seq[j][2].split("_")[0]+"_"+unique_seq[j][2].split("_")[1]): | 
|  | 555 | 
|  | 556                 temp.append(unique_seq[j]) | 
|  | 557 | 
|  | 558         temp = sorted(temp, key=lambda x: int(x[0].split('-')[1]), reverse=True) | 
|  | 559         fp.write("*********************************************************************************************************\n") | 
|  | 560         fp.write("%-8s\t%-22s\t%-25s\t%-30s\t%s\n" % ("|",str(sorted_uni_arms[i][0]),"Sequence count = "+str(sorted_uni_arms[i][1]),"Total reads = "+str(sorted_uni_arms[i][2]),"|")) | 
|  | 561         fp.write("*********************************************************************************************************\n\n") | 
|  | 562         [fp.write("%-8s\t%-40s\t%s\n" % (x[0].split("-")[1], x[2],x[9])) for x in temp] | 
|  | 563         fp.write("\n" + "\n") | 
|  | 564     fp.close() | 
|  | 565 | 
|  | 566  if f==2: | 
|  | 567 | 
|  | 568     if con=="c": | 
|  | 569        fp = open('split3/'+name, 'w') | 
|  | 570        fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence")) | 
|  | 571     if con=="t": | 
|  | 572        fp = open('split4/'+name, 'w') | 
|  | 573        fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence")) | 
|  | 574 | 
|  | 575     for i in range(len(sorted_uni_arms)): | 
|  | 576         temp = [] | 
|  | 577         for j in range(len(unique_seq)): | 
|  | 578                if sorted_uni_arms[i][0]==unique_seq[j][2].split("__")[0]: | 
|  | 579                   temp.append(unique_seq[j]) | 
|  | 580         if temp!=[]: | 
|  | 581            temp = sorted(temp, key=lambda x: int(x[0].split('-')[1]), reverse=True) | 
|  | 582            fp.write("*********************************************************************************************************\n") | 
|  | 583            fp.write("%-8s\t%-22s\t%-25s\t%-30s\t%s\n" % ("|",str(sorted_uni_arms[i][0]),"Sequence count = "+str(sorted_uni_arms[i][1]),"Total reads = "+str(sorted_uni_arms[i][2]),"|")) | 
|  | 584            fp.write("*********************************************************************************************************\n\n") | 
|  | 585            [fp.write("%-8s\t%-40s\t%s\n" % (x[0].split("-")[1], x[2],x[9])) for x in temp] | 
|  | 586            fp.write("\n" + "\n") | 
|  | 587     fp.close() | 
|  | 588 | 
|  | 589 | 
|  | 590 ########################################################################################################################## | 
|  | 591 | 
|  | 592 def new_mat_seq(pre_unique_seq,mat_mirnas,l): | 
|  | 593 | 
|  | 594     unique_iso = [] | 
|  | 595     for x in pre_unique_seq: | 
|  | 596        if len(x[2].split("_"))==3: | 
|  | 597           for y in pre_unique_seq: | 
|  | 598               if x[2] in y[2] and int(x[0].split("-")[1])<int(y[0].split("-")[1]): | 
|  | 599                  if any(y[2] in lst2 for lst2 in unique_iso)==False: | 
|  | 600                     y[2]=">"+y[2] | 
|  | 601                     unique_iso.append(y) | 
|  | 602     l.acquire() | 
|  | 603     for x in unique_iso: | 
|  | 604         mat_mirnas.append(x[2]) | 
|  | 605         mat_mirnas.append(x[9]) | 
|  | 606     l.release() | 
|  | 607 | 
|  | 608 ######################################################################################################################### | 
|  | 609 | 
|  | 610 def merging_names(LH2E_copy,new): | 
|  | 611 | 
|  | 612     dupes=[] | 
|  | 613     final_LH2E =[] | 
|  | 614 | 
|  | 615     for num in range(len(LH2E_copy)): | 
|  | 616 | 
|  | 617         if LH2E_copy[num][1] not in final_LH2E and LH2E_copy[num][0] not in final_LH2E: | 
|  | 618            final_LH2E.append(LH2E_copy[num][1]) | 
|  | 619            final_LH2E.append(LH2E_copy[num][0]) | 
|  | 620         else: | 
|  | 621            dupes.append(LH2E_copy[num][1]) | 
|  | 622 | 
|  | 623     dupes=list(set(dupes)) | 
|  | 624 | 
|  | 625     for i in range(len(dupes)): | 
|  | 626         dupes[i]=[dupes[i]] | 
|  | 627 | 
|  | 628     for x in LH2E_copy: | 
|  | 629         for y in dupes: | 
|  | 630             if x[1]==y[0]: | 
|  | 631                fl=0 | 
|  | 632                if len(y)==1: | 
|  | 633                   y.append(x[0]) | 
|  | 634                else: | 
|  | 635                   for i in range(1,len(y)): | 
|  | 636                       if y[i].split("_")[0]+"_"+y[i].split("_")[1]==x[0].split("_")[0]+"_"+x[0].split("_")[1]: | 
|  | 637                          fl=1 | 
|  | 638                          if len(x[0])<len(y[i]): | 
|  | 639                             del y[i] | 
|  | 640                             y.append(x[0]) | 
|  | 641                             break | 
|  | 642 | 
|  | 643                   if fl==0: | 
|  | 644                      y.append((x[0])) | 
|  | 645 | 
|  | 646     for y in dupes: | 
|  | 647         if len(y)>2: | 
|  | 648            for i in range(len(y)-1,1,-1): | 
|  | 649                y[1]=y[1]+"/"+y[i] | 
|  | 650                del y[i] | 
|  | 651 | 
|  | 652 | 
|  | 653     for x in LH2E_copy: | 
|  | 654         for y in dupes: | 
|  | 655             if x[1]==y[0]: | 
|  | 656                x[0]=y[1] | 
|  | 657 | 
|  | 658     LH2E_copy.sort() | 
|  | 659     LH2E_copy=list(LH2E_copy for LH2E_copy,_ in itertools.groupby(LH2E_copy)) | 
|  | 660     new.extend(LH2E_copy) | 
|  | 661 | 
|  | 662 ###################################################################################################################################################### | 
|  | 663 | 
|  | 664 def ssamples1(tem_names,tem_samp,non_names,non_samp,folder,pro): | 
|  | 665 | 
|  | 666     for i in range(2,len(tem_samp[0])): | 
|  | 667 | 
|  | 668        fp = open(folder+tem_names[i-2]+'.txt','w') | 
|  | 669        fp.write("miRNA id"+"\t"+tem_names[i-2]+"\n") | 
|  | 670 | 
|  | 671        for x in tem_samp: | 
|  | 672            fp.write("%s" % "\t".join([x[0],x[i]])+"\n") | 
|  | 673 | 
|  | 674        for j in range(len(non_names)): | 
|  | 675            if non_names[j]==tem_names[i-2]: | 
|  | 676               for x in non_samp: | 
|  | 677                   fp.write("%s" % "\t".join([x[0],x[j+2]])+"\n") | 
|  | 678        fp.close() | 
|  | 679 | 
|  | 680 ################################################################################################################################################################################################################# | 
|  | 681 | 
|  | 682 def download_matures(matures,org_name): | 
|  | 683 | 
|  | 684     mature_mir=[] | 
|  | 685 | 
|  | 686     mat_url = 'http://mirgenedb.org/fasta/'+org_name+'?mat=1' | 
|  | 687     star_url = 'http://mirgenedb.org/fasta/'+org_name+'?star=1' | 
|  | 688 | 
|  | 689     data = urllib.request.urlopen(mat_url).read() | 
|  | 690     file_mirna = data.decode('utf-8') | 
|  | 691     mature_mir = file_mirna.split("\n") | 
|  | 692     mature_mir = [x.replace(">","") for x in mature_mir] | 
|  | 693     del mature_mir[-1] | 
|  | 694 | 
|  | 695     data = urllib.request.urlopen(star_url).read() | 
|  | 696     file_mirna = data.decode('utf-8') | 
|  | 697     star_mir = file_mirna.split("\n") | 
|  | 698     star_mir = [x.replace(">","") for x in star_mir] | 
|  | 699     del star_mir[-1] | 
|  | 700 | 
|  | 701     mature_mir.extend(star_mir) | 
|  | 702 | 
|  | 703     for i in range(1,len(mature_mir),2): | 
|  | 704         mature_mir[i]=mature_mir[i].replace("U","T") | 
|  | 705 | 
|  | 706     matures.extend(mature_mir) | 
|  | 707 | 
|  | 708 ################################################################################################################### | 
|  | 709 | 
|  | 710 def non_template_ref(sc,st,all_isoforms): | 
|  | 711 | 
|  | 712   pre_uni_seq_con = list(sc) | 
|  | 713   pre_uni_seq_tre = list(st) | 
|  | 714 | 
|  | 715   for x in pre_uni_seq_con: | 
|  | 716       for y in x: | 
|  | 717           if y[2] not in all_isoforms and len(y[2].split("_"))>2: | 
|  | 718              all_isoforms.append(y[2]) | 
|  | 719              all_isoforms.append(y[9]) | 
|  | 720 | 
|  | 721   for x in pre_uni_seq_tre: | 
|  | 722       for y in x: | 
|  | 723           if y[2] not in all_isoforms and len(y[2].split("_"))>2: | 
|  | 724              all_isoforms.append(y[2]) | 
|  | 725              all_isoforms.append(y[9]) | 
|  | 726 | 
|  | 727 ################################################################################################################################################################################################ | 
|  | 728 | 
|  | 729 def deseqe2(sample,mir_names,l,new_d,sample_name,sample_order): | 
|  | 730 | 
|  | 731     for y in mir_names: | 
|  | 732         flag=0 | 
|  | 733         for x in sample: | 
|  | 734             if y[0]==x[0]: | 
|  | 735                flag=1 | 
|  | 736                break | 
|  | 737         if flag==0: | 
|  | 738            sample.append([y[0],"0",y[1]]) | 
|  | 739 | 
|  | 740     sample.sort(key=lambda x: x[0]) | 
|  | 741     sample=list(sample for sample,_ in itertools.groupby(sample)) | 
|  | 742 | 
|  | 743     l.acquire() | 
|  | 744     new_d.append(sample) | 
|  | 745     sample_order.append(sample_name) | 
|  | 746     l.release() | 
|  | 747 | 
|  | 748 ############################################################################################################################################################################################### | 
|  | 749 |