view mirbase_functions.py @ 1:561b0abcae87 draft

Uploaded
author glogobyte
date Fri, 16 Oct 2020 12:15:50 +0000
parents 258aaaa465f3
children
line wrap: on
line source

import itertools
import time
import sys
import os
import urllib.request
import gzip
from multiprocessing import Process, Queue, Lock, Pool, Manager, Value
import subprocess
import argparse
from collections import OrderedDict
from matplotlib.backends.backend_pdf import PdfPages
import pandas as pd
from math import pi
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import PercentFormatter
import seaborn as sns
import scipy.stats as stats
from plotnine import *
import math
import re
import matplotlib.ticker as mtick
import copy


"""---------------------- Simple Functions -----------------------"""

# Read a file and return it as a list
def read(path, flag):
    """Read a text file and return its lines as a list.

    Args:
        path: Path of the file to read.
        flag: 0 keeps the line terminators (``readlines``); 1 strips them
            (``read().splitlines()``).

    Returns:
        The list of lines, or None when ``flag`` is neither 0 nor 1 (the
        file is not opened in that case).
    """
    if flag == 0:
        keep_newlines = True
    elif flag == 1:
        keep_newlines = False
    else:
        return None

    with open(path) as handle:
        if keep_newlines:
            return handle.readlines()
        return handle.read().splitlines()

# Write a list to a txt file
def write(path, list):
    """Write the inner fields of every row to ``path``.

    For each row the first and last elements are dropped and the remaining
    ones are joined with tabs. No separator or newline is written between
    rows.

    Args:
        path: Destination file (overwritten).
        list: Iterable of sequences of strings.
    """
    with open(path, 'w') as out:
        out.writelines("\t".join(row[1:-1]) for row in list)

"""---------------------- RNA-seq Functions ----------------------"""

# Detect the longest common substring sequence between two mirnas
def longestSubstring(str1, str2):
    """Return the longest contiguous block shared by ``str1`` and ``str2``.

    The match is located with ``difflib.SequenceMatcher.find_longest_match``
    over the full length of both strings, and the matching slice of
    ``str1`` is returned.

    Returns:
        The shared substring, or None (after printing a message) when the
        two strings have nothing in common.
    """
    from difflib import SequenceMatcher

    hit = SequenceMatcher(None, str1, str2).find_longest_match(
        0, len(str1), 0, len(str2)
    )

    if hit.size == 0:
        print('No longest common sub-string found')
        return None

    start = hit.a
    return str1[start:start + hit.size]



########################################################################################################################################################

def collapse_sam(path):
    """Collapse identical alignments of a SAM file into one record each.

    Records are grouped by (reference name, read sequence), i.e. columns 2
    and 9. For every group, in order of first appearance, the LAST record of
    the group is kept; its column 0 is rewritten to ``"<n>-<count>"`` (n is
    the 1-based group number, count the number of records in the group) and
    its column 10 is overwritten with a fixed placeholder string.

    Lines whose first tab-separated field contains "@" are treated as header
    lines and are returned unchanged in front of the collapsed records.

    Args:
        path: Path of the SAM file.

    Returns:
        A list of field lists: header lines first, then collapsed records.
    """
    ini_sam = read(path, 0)

    intro_sam = []
    main_sam = []
    for line in ini_sam:
        fields = line.rstrip("\n").split("\t")
        if "@" in line.split("\t")[0]:
            intro_sam.append(fields)
        else:
            main_sam.append(fields)

    # Single pass instead of the previous quadratic scans: the dict keeps
    # first-appearance order, a running count and the last record seen.
    groups = {}
    for record in main_sam:
        key = (record[2], record[9])
        entry = groups.get(key)
        if entry is None:
            groups[key] = [1, record]
        else:
            entry[0] += 1
            entry[1] = record

    new_main_sam = []
    for incr_num, (count, record) in enumerate(groups.values(), start=1):
        record[10] = "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
        record[0] = str(incr_num) + "-" + str(count)
        new_main_sam.append(record)

    return intro_sam + new_main_sam

#################################################################################################################################################################################################################

def duplicate_chroms_isoforms(List):
    """Merge records of one sequence that differ only by chromosome label.

    Each row is indexed like a SAM record: column 0 is the read id, column 2
    the reference name and column 9 the sequence. Reference names are split
    on "_" and are expected to contain a ``chr<N>(<strand>)`` part.

    Rows sharing the same sequence, read id and name prefix (text before the
    first "_") but carrying different names are considered the same miRNA
    found at several loci. Their names are replaced by a merged name of the
    form ``<prefix>_chr<A>(<s>)_chr<B>(<s>)...``.

    Args:
        List: List of rows. It is modified in place (renamed and sorted).

    Returns:
        A pair ``(new_list, new_double_List)``, both sorted and de-duplicated:
        ``new_list`` comes from ``List``, where only rows whose name has
        exactly three "_"-separated parts are renamed; ``new_double_List``
        comes from a row-wise copy in which rows with more than three parts
        are renamed as well, keeping their last two parts as a suffix.
    """
    # Unique [sequence, read id, name] triples, in order of appearance.
    dupes = []

    for num in range(len(List)):

        if [List[num][9], List[num][0], List[num][2]] not in dupes:
            dupes.append([List[num][9], List[num][0], List[num][2]])

    # Attach to every triple the alternative names found for the same
    # sequence / read id / name prefix.
    for x in List:
        for y in dupes:
            if x[9] == y[0] and x[0] == y[1] and x[2].split("_")[0] == y[2].split("_")[0] and x[2] != y[2]:
                y.append(x[2])

    double_List = [x[:] for x in List]

    # For every triple build the sorted, de-duplicated chromosome labels:
    # numeric chromosomes become signed ints (sign taken from the strand),
    # anything else keeps the first four characters after "chr".
    chr_order = []
    for x in dupes:
        temp = []
        for i in range(2, len(x)):
            if x[i].split("chr")[1].split("(")[0].isdigit():
                temp.append(int(x[i].split("chr")[1].split("(")[1][0] + x[i].split("chr")[1].split("(")[0]))
            else:
                temp.append(x[i].split("chr")[1][0:4])

        # When an X/Y label is present everything is turned into a string so
        # that the list can be sorted.
        for z in temp:
            if 'X(-)' == z or 'Y(-)' == z or 'X(+)' == z or 'Y(+)' == z:
                temp = [str(j) for j in temp]
        temp = list(set(temp))
        temp.sort()
        chr_order.append(temp)

    # Build [sequence, merged name, read id] for every triple.
    final_dupes = []
    for i in range(len(dupes)):
        final_dupes.append([dupes[i][0], dupes[i][2].split("_")[0], dupes[i][1]])
        for x in chr_order[i]:
            # Raw string: "\d" in a normal string is an invalid escape.
            result = re.match(r"[-+]?\d+$", str(x))
            if len(chr_order[i]) == len(set(chr_order[i])):
                if result is not None:

                    if int(x) < 0:
                        final_dupes[i][1] = final_dupes[i][1] + "_chr" + str(abs(int(x))) + "(-)"
                    else:
                        final_dupes[i][1] = final_dupes[i][1] + "_chr" + str(abs(int(x))) + "(+)"
                else:
                    final_dupes[i][1] = final_dupes[i][1] + "_chr" + str(x)
            else:
                # NOTE(review): chr_order entries are de-duplicated above, so
                # this branch looks unreachable; kept unchanged.
                if result is not None:
                    if int(x) < 0:
                        final_dupes[i][1] = final_dupes[i][1] + dupes[i][2].split("_")[1] + "_chr" + str(abs(int(x))) + "(-)"
                    else:
                        final_dupes[i][1] = final_dupes[i][1] + dupes[i][2].split("_")[1] + "_chr" + str(abs(int(x))) + "(+)"
                else:
                    final_dupes[i][1] = final_dupes[i][1] + dupes[i][2].split("_")[1] + "_chr" + str(x)

    final_dupes.sort()
    final_dupes = list(final_dupes for final_dupes, _ in itertools.groupby(final_dupes))

    # Apply the merged names.
    for i in range(len(double_List)):
        for x in final_dupes:

            # More than three name parts: keep the last two parts as suffix.
            if double_List[i][9] == x[0] and double_List[i][0] == x[2] and len(double_List[i][2].split("_")) > 3 and double_List[i][2].split("_")[0] == x[1].split("_")[0]:
                gg = str("_" + double_List[i][2].split("_")[-2] + "_" + double_List[i][2].split("_")[-1])
                double_List[i][2] = x[1] + gg

            # Exactly three name parts: rename in both lists.
            if double_List[i][9] == x[0] and double_List[i][0] == x[2] and len(double_List[i][2].split("_")) == 3 and double_List[i][2].split("_")[0] == x[1].split("_")[0]:
                double_List[i][2] = x[1]
                List[i][2] = x[1]

    List.sort()
    new_list = list(List for List, _ in itertools.groupby(List))

    double_List.sort()
    new_double_List = list(double_List for double_List, _ in itertools.groupby(double_List))

    return new_list, new_double_List


#############################################################################################################################################################################################################

def sam(mature_mirnas,path,name,con,l,samples,data,names,unmap_seq,samples_mirna_names,deseq,LHE_names,umi,ini_sample,unmap_counts):
    """Annotate the mapped reads of one collapsed SAM file and publish them.

    Records are expected to carry "<id>-<count>" in column 0 (the count is
    read with ``split("-")[1]``). Only records whose FLAG column equals '0'
    and whose sequence is 18-26 nt long are analysed.

    For every record whose reference-name prefix matches a mature miRNA
    name, the read is compared with the mature sequence (U replaced by T)
    through their longest common substring. When the read starts or ends at
    a different position than the mature sequence, the offsets are appended
    to the record name as ``_<5' offset>_<3' offset>``. Records for which
    both sequences have unmatched bases on the same side are discarded.

    Args:
        mature_mirnas: Alternating FASTA header / sequence lines.
        path: Path of the SAM file.
        name: Sample name.
        con: "c" or "t"; any other value publishes nothing.
        l: Lock object providing ``acquire()`` / ``release()``.
        samples, data, names, samples_mirna_names, deseq, LHE_names,
        ini_sample: Shared list-like collectors that are appended to.
        unmap_seq, unmap_counts: Shared counters exposing ``.value``; they
            receive the number of FLAG '4' records and the sum of their
            counts.
        umi: Not used by this function.
    """

    # read the sam file
    ini_sam=read(path,0)
    new_main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]]
    unique_seq = [x for x in new_main_sam if x[1] == '0' and len(x[9])>=18 and len(x[9])<=26]

    sorted_uni_arms = []

    for i in range(len(mature_mirnas)):
        tmp_count_reads = 0   # calculate the total number of reads
        tmp_count_seq = 0     # calculate the total number of sequences
        for j in range(len(unique_seq)):

            # Names whose prefix contains "{" lose their last four characters
            # before being compared with the mature miRNA name.
            if "{" in unique_seq[j][2].split("_")[0]:
                official=unique_seq[j][2].split("_")[0][:-4]
            else:
                official=unique_seq[j][2].split("_")[0]

            # Header lines are compared without their first character.
            if mature_mirnas[i].split(" ")[0][1:] == official:

                temp_mature = mature_mirnas[i+1].strip().replace("U", "T")
                off_part = longestSubstring(temp_mature, unique_seq[j][9])

                # Lengths of the parts left and right of the shared block.
                mat_diff = temp_mature.split(off_part)
                mat_diff = [len(mat_diff[0]), len(mat_diff[1])]

                unique_diff = unique_seq[j][9].split(off_part)
                unique_diff = [len(unique_diff[0]), len(unique_diff[1])]

                # Problem with hsa-miR-8485
                # Both sequences have unmatched bases on the same side: the
                # record is replaced by the marker 1 and dropped below.
                if mat_diff[1]!=0 and unique_diff[1]!=0:
                    unique_seq[j]=1
                    pre_pos = 0
                    post_pos = 0

                elif mat_diff[0]!=0 and unique_diff[0]!=0:
                    unique_seq[j]=1
                    pre_pos = 0
                    post_pos = 0

                else:
                    pre_pos = mat_diff[0]-unique_diff[0]
                    post_pos = unique_diff[1]-mat_diff[1]
                    tmp_count_reads = tmp_count_reads + int(unique_seq[j][0].split("-")[1])
                    tmp_count_seq = tmp_count_seq+1

                # Append the offsets to the name; a zero offset is written
                # without sign.
                if pre_pos != 0 or post_pos != 0:
                    if pre_pos == 0:
                        unique_seq[j][2] = unique_seq[j][2] + "_" +str(pre_pos) + "_" + '{:+d}'.format(post_pos)
                    elif post_pos == 0:
                        unique_seq[j][2] = unique_seq[j][2] + "_" + '{:+d}'.format(pre_pos) + "_" + str(post_pos)
                    else:
                        unique_seq[j][2] = unique_seq[j][2]+"_"+'{:+d}'.format(pre_pos)+"_"+'{:+d}'.format(post_pos)

        # Remove the records marked as discarded for this miRNA.
        for x in range(unique_seq.count(1)):
            unique_seq.remove(1)
        if tmp_count_reads != 0 and tmp_count_seq != 0:
            sorted_uni_arms.append([mature_mirnas[i].split(" ")[0][1:], tmp_count_seq, tmp_count_reads])
    sorted_uni_arms = sorted(sorted_uni_arms, key=lambda x: x[1], reverse=True)
    dedup_unique_seq,double_fil_uni_seq=duplicate_chroms_isoforms(unique_seq)

    # Recompute the per-arm totals on the de-duplicated records.
    for y in sorted_uni_arms:
        counts=0
        seqs=0
        for x in double_fil_uni_seq:
            if y[0]==x[2].split("_")[0]:
                counts+=int(x[0].split("-")[1])
                seqs+=1

        y[1]=seqs
        y[2]=counts

    # Publish the results. try/finally guarantees the lock is released even
    # if one of the shared objects raises.
    l.acquire()
    try:
        if con=="c" or con=="t":
            LHE=[z[2] for z in double_fil_uni_seq]
            for y in double_fil_uni_seq:
                samples_mirna_names.append([y[2],y[9]])
            deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in double_fil_uni_seq])
            LHE_names.extend(LHE)
            unmap_seq.value += sum([1 for x in new_main_sam if x[1] == '4'])
            unmap_counts.value += sum([int(x[0].split("-")[1]) for x in new_main_sam if x[1] == '4'])
            names.append(name)
            samples.append(dedup_unique_seq)
            data.append([con,name,double_fil_uni_seq,sorted_uni_arms])
            ini_sample.append(new_main_sam)
    finally:
        l.release()


######################################################################################################################################

"""

Read a SAM file from Bowtie and do the following:

1) Remove reverse-stranded mapped reads
2) Remove unmapped reads
3) Remove all sequences with fewer than 11 reads
4) Sort the arms by number of sequences, in decreasing order
5) Sort the sequences of every arm by number of reads, in decreasing order
6) Calculate the total number of sequences of every arm
7) Calculate the total number of reads of the sequences of every arm
8) Store all the information in a txt file

"""

def non_sam(mature_mirnas,path,name,con,l,data,names,n_deseq,n_samples_mirna_names,n_LHE_names):
    """Find reads that extend a mature miRNA at the 3' end and publish them.

    Only records whose FLAG column equals '4' and whose sequence is 18-26 nt
    long are analysed. A read is kept when it contains a mature sequence
    (U replaced by T) and starts exactly where the mature sequence starts.
    A copy of the record is stored under the name
    ``<miRNA>__0_<+n>_<tail>``, where n is the number of extra bases after
    the shared block and tail is the read with its first
    ``len(shared block)`` characters removed.

    Args:
        mature_mirnas: Alternating FASTA header / sequence lines; sequences
            are read from the odd indices, names from the preceding line.
        path: Path of the SAM file.
        name: Sample name.
        con: "c" or "t"; any other value publishes nothing.
        l: Lock object providing ``acquire()`` / ``release()``.
        data, names, n_deseq, n_samples_mirna_names, n_LHE_names: Shared
            list-like collectors that are appended to.
    """

    ini_sam=read(path,0)
    new_main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]]
    unique_seq = [x for x in new_main_sam if x[1] == '4' and len(x[9])>=18 and len(x[9])<=26]

    uni_seq=[]
    # Calculate the shifted positions for every isomir and add them to the name of it
    sorted_uni_arms = []
    for i in range(1,len(mature_mirnas),2):
        tmp_count_reads = 0   # calculate the total number of reads
        tmp_count_seq = 0     # calculate the total number of sequences

        # Loop-invariant: the mature sequence in DNA alphabet.
        temp_mature = mature_mirnas[i].strip().replace("U", "T")

        for j in range(len(unique_seq)):

            if temp_mature in unique_seq[j][9]:

                off_part = longestSubstring(temp_mature, unique_seq[j][9])

                mat_diff = temp_mature.split(off_part)
                mat_diff = [len(mat_diff[0]), len(mat_diff[1])]

                unique_diff = unique_seq[j][9].split(off_part)
                # Reads containing the shared block more than once are skipped.
                if len(unique_diff)<=2:
                    unique_diff = [len(unique_diff[0]), len(unique_diff[1])]

                    pre_pos = mat_diff[0]-unique_diff[0]
                    post_pos = unique_diff[1]-mat_diff[1]

                    # Only reads with an unchanged 5' end are kept.
                    if pre_pos == 0:
                        tmp_count_reads = tmp_count_reads + int(unique_seq[j][0].split("-")[1])
                        tmp_count_seq = tmp_count_seq + 1

                        t_name=unique_seq[j].copy()
                        t_name[2]=mature_mirnas[i - 1].split(" ")[0][1:] + "__" + str(pre_pos) + "_" + '{:+d}'.format(post_pos) + "_" + str(unique_seq[j][9][len(off_part):])
                        uni_seq.append(t_name)

        if tmp_count_reads != 0 and tmp_count_seq != 0:
            sorted_uni_arms.append([mature_mirnas[i-1].split(" ")[0][1:], tmp_count_seq, tmp_count_reads])

    sorted_uni_arms = sorted(sorted_uni_arms, key=lambda x: x[1], reverse=True)
    # Drop exact duplicate records while keeping their order.
    unique_seq = list(map(list, OrderedDict.fromkeys(map(tuple,uni_seq))))

    # Publish the results. try/finally guarantees the lock is released even
    # if one of the shared objects raises.
    l.acquire()
    try:
        if con=="c" or con=="t":
            LHE=[x[2] for x in unique_seq if x[2]!="*"]
            for x in unique_seq:
                if x[2]!="*":
                    n_samples_mirna_names.append([x[2],x[9]])
            n_deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in unique_seq if x[2]!="*"])
            n_LHE_names.extend(LHE)
            names.append(name)
            data.append([con,name,unique_seq,sorted_uni_arms])
    finally:
        l.release()

#####################################################################################################################################################################################################################
def deseq2_temp(samples_mirna_names,deseq,con,l):
    """Align all samples on a common set of names and build one count table.

    Every element of ``deseq`` is a list of ``[name, count, sequence]``
    entries for one sample. Names listed in ``samples_mirna_names``
    (``[name, sequence]`` pairs) that a sample lacks are added to it with
    count "0"; each sample is then sorted by name. Both changes are made in
    place.

    The resulting table has one row ``[name, sequence, count, count, ...]``
    per entry of the first sample, with the counts of all samples appended
    in sample order. It is put on the module-level queue ``q1`` when
    ``con == "c"`` and on ``q2`` when ``con == "t"``.

    Args:
        samples_mirna_names: List of ``[name, sequence]`` pairs.
        deseq: List of per-sample entry lists (modified in place).
        con: "c" or "t"; any other value puts nothing on a queue.
        l: Lock object providing ``acquire()`` / ``release()``.
    """
    # NOTE(review): the key is the constant list [0], so this stable sort
    # never reorders anything; ``x[0]`` was probably intended. Kept as is
    # because the caller's list order would otherwise change.
    samples_mirna_names.sort(key=lambda x:[0])

    # Add the missing names with a zero count (set lookup instead of
    # rescanning the sample for every name).
    for sample in deseq:
        present = {entry[0] for entry in sample}
        for y in samples_mirna_names:
            if y[0] not in present:
                sample.append([y[0], "0", y[1]])
                present.add(y[0])

    for sample in deseq:
        sample.sort(key=lambda x: x[0])

    deseq_final = [[x[0],x[2]] for x in deseq[0]]

    # Index the counts of every sample by name, keeping entry order.
    counts_per_sample = []
    for sample in deseq:
        by_name = {}
        for entry in sample:
            by_name.setdefault(entry[0], []).append(entry[1])
        counts_per_sample.append(by_name)

    for row in deseq_final:
        for by_name in counts_per_sample:
            row.extend(by_name.get(row[0], ()))

    # NOTE(review): q1 and q2 are not defined in the visible part of this
    # module; they have to exist as module globals when this runs.
    # try/finally guarantees the lock is released if the put fails.
    l.acquire()
    try:
        if con=="c":
            q1.put(deseq_final)

        if con=="t":
            q2.put(deseq_final)
    finally:
        l.release()


####################################################################################################################################################################################################################

def main_temp(LH2E, LH2E_names, LH8E, LH8E_names,flag,names_con,names_tre,filter_LH8E,filter_LH2E,raw_LH8E,raw_LH2E,per,count):
 """Harmonise the two count tables and optionally filter them.

 LH2E and LH8E are lists of rows; columns 0 and 1 are used as name and
 sequence here and the remaining columns are read as per-sample counts
 (strings convertible with int()). LH2E_names / LH8E_names are lists of
 entries that get padded with "0" columns and appended as new rows.

 Steps:
   1. Entries present in only one of the two name lists are added to the
      other table with "0" in every count column.
   2. Names of rows sharing one sequence (detected on LH2E) are merged
      into a single "/"-joined name, applied to both tables.
   3. When per != -1, rows are kept if at least round(per/100 * number
      of count columns) columns reach `count` in either table; kept rows
      are appended to filter_LH2E / filter_LH8E.
   4. All rows are appended to raw_LH2E / raw_LH8E.

 Results are delivered through the filter_* and raw_* arguments; nothing
 is returned. LH2E and LH8E are sorted in place, and the entries taken
 from the name lists are extended in place. flag, names_con and names_tre
 are not used in this function.
 """

 # Entries known to one group only (note the crossed naming: entries
 # missing from LH8E_names come from LH2E_names and vice versa).
 LH8E_add_names = [x for x in LH2E_names if x not in LH8E_names]
 LH2E_add_names = [x for x in LH8E_names if x not in LH2E_names]

 # Sort + groupby removes duplicate entries.
 LH8E_add_names.sort()
 LH2E_add_names.sort()
 LH8E_add_names = list(LH8E_add_names for LH8E_add_names,_ in itertools.groupby(LH8E_add_names))
 LH2E_add_names = list(LH2E_add_names for LH2E_add_names,_ in itertools.groupby(LH2E_add_names))

 # Same de-duplication for the tables; the sort mutates the caller's lists.
 LH2E.sort()
 LH8E.sort()
 LH2E = list(LH2E for LH2E,_ in itertools.groupby(LH2E))
 LH8E = list(LH8E for LH8E,_ in itertools.groupby(LH8E))

 print("LHE_names")
 print([len(LH8E_add_names),len(LH2E_add_names)])
 print([len(LH8E),len(LH2E)])

 # Pad the added entries with one "0" per count column (row length minus
 # the two leading columns) and append them to the table.
 # NOTE(review): LH8E[0] / LH2E[0] raise IndexError on an empty table.
 zeros=["0"]*(len(LH8E[0])-2)
 [LH8E_add_names[i].extend(zeros) for i,_ in enumerate(LH8E_add_names)]
 LH8E=LH8E+LH8E_add_names

 zeros=["0"]*(len(LH2E[0])-2)
 [LH2E_add_names[i].extend(zeros) for i,_ in enumerate(LH2E_add_names)]
 LH2E=LH2E+LH2E_add_names

 # final_LH2E is a flat list of every sequence and name seen so far; a row
 # whose sequence or name was already seen marks its sequence as duplicated.
 dupes=[]
 final_LH2E =[]

 for num,_ in enumerate(LH2E):

    if LH2E[num][1] not in final_LH2E and LH2E[num][0] not in final_LH2E:
        final_LH2E.append(LH2E[num][1])
        final_LH2E.append(LH2E[num][0])
    else:
        dupes.append(LH2E[num][1])


 dupes=list(set(dupes))

 # Each duplicated sequence becomes [sequence, name, name, ...].
 dupes=[[x] for x in dupes]

 for x in LH2E:
     for y in dupes:
         if x[1]==y[0]:
             fl=0
             if len(y)==1:
                 y.append(x[0])
             else:
                 # A name with the same prefix (text before the first "_")
                 # is stored only once; the shorter name wins.
                 for i in range(1,len(y)):
                     if y[i].split("_")[0]==x[0].split("_")[0]:
                        fl=1
                        if len(x[0])<len(y[i]):
                           del y[i]
                           y.append(x[0])
                           break

                 if fl==0:
                    y.append((x[0]))

 # Collapse the collected names into y[1], joined with "/", taking the
 # extra names from the last one backwards.
 for y in dupes:
    if len(y)>2:
        for i in range(len(y)-1,1,-1):
            y[1]=y[1]+"/"+y[i]
            del y[i]

 # Rename the rows of both tables that carry a duplicated sequence.
 for x in LH2E:
     for y in dupes:
         if x[1]==y[0]:
            x[0]=y[1]

 for x in LH8E:
    for y in dupes:
        if x[1]==y[0]:
           x[0]=y[1]


 # Renaming can create identical rows: sort and de-duplicate again.
 LH2E.sort()
 LH2E=list(LH2E for LH2E,_ in itertools.groupby(LH2E))

 LH8E.sort()
 LH8E=list(LH8E for LH8E,_ in itertools.groupby(LH8E))

 # Optional filter; per == -1 disables it.
 if int(per)!=-1:
    percent=int(per)/100
    print(percent)
    print(count)

    # Minimum number of count columns that must reach `count`.
    # NOTE(review): the column count is taken from row index 1, which
    # raises IndexError when a table has a single row.
    c_col_filter=round(percent*(len(LH2E[1])-2))
    t_col_filter=round(percent*(len(LH8E[1])-2))

    # NOTE(review): LH8E is indexed with positions of LH2E, which assumes
    # both tables have the same length and row order - TODO confirm.
    for i, _ in enumerate(LH2E):
        c_cols=0
        t_cols=0

        c_cols=sum([1 for j in range(len(LH2E[i])-2) if int(LH2E[i][j+2])>=int(count)])
        t_cols=sum([1 for j in range(len(LH8E[i])-2) if int(LH8E[i][j+2])>=int(count)])

        # A row is kept when either table reaches its threshold.
        if c_cols>=c_col_filter or t_cols>=t_col_filter:
           filter_LH8E.append(LH8E[i])
           filter_LH2E.append(LH2E[i])

 # The unfiltered tables are always handed back.
 raw_LH2E.extend(LH2E)
 raw_LH8E.extend(LH8E)

##################################################################################################################################################################################################################

def _write_counts_table(path, sample_names, rows):
    """Write a tab-separated count table: header line, then one line per row."""
    with open(path, 'w') as fp:
        fp.write("\t".join(["Name", "Sequence"] + list(sample_names)))
        for row in rows:
            fp.write("\n%s" % "\t".join(row))


def write_main(raw_LH2E, raw_LH8E, fil_LH2E, fil_LH8E, names_con, names_tre, flag, per, n1, n2):
    """Write the raw (and optionally filtered) count tables into ``Counts/``.

    Files are named ``Counts/<Raw|Filtered> <group> <kind> Counts``. The
    ``*_LH2E`` tables are written with ``names_con`` as sample columns under
    the group name ``n1``; the ``*_LH8E`` tables use ``names_tre`` and ``n2``.
    The files have no trailing newline.

    Args:
        raw_LH2E, raw_LH8E: Unfiltered tables (lists of rows of strings).
        fil_LH2E, fil_LH8E: Filtered tables, written only when per != -1.
        names_con, names_tre: Sample names used as column headers.
        flag: 1 writes "Templated" files, 2 writes "Non-Templated" files;
            any other value writes nothing.
        per: Filter percentage; -1 means no filtered files.
        n1, n2: Group names used in the file names.
    """
    if flag == 1:
        kind = 'Templated'
    elif flag == 2:
        kind = 'Non-Templated'
    else:
        return

    # Files are closed by the helper even when a write fails.
    if int(per) != -1:
        _write_counts_table('Counts/Filtered ' + n2 + ' ' + kind + ' Counts', names_tre, fil_LH8E)
        _write_counts_table('Counts/Filtered ' + n1 + ' ' + kind + ' Counts', names_con, fil_LH2E)

    _write_counts_table('Counts/Raw ' + n2 + ' ' + kind + ' Counts', names_tre, raw_LH8E)
    _write_counts_table('Counts/Raw ' + n1 + ' ' + kind + ' Counts', names_con, raw_LH2E)


#########################################################################################################################################

def ssamples(names,samp,folder,pro):
    """Write one two-column file per sample column of a count table.

    For every column from index 2 on, the file ``folder + <sample> + '.txt'``
    is created with the header ``miRNA id<TAB><sample>`` followed by one
    ``<row[0]><TAB><row[column]>`` line per row.

    Args:
        names: Sample names; ``names[k]`` belongs to column ``k + 2``.
        samp: Table rows (lists of strings).
        folder: Path prefix, concatenated as is with the file name.
        pro: Not used by this function.
    """
    for col in range(2, len(samp[0])):
        sample_name = names[col - 2]

        # "with" closes the file even when a write fails.
        with open(folder + sample_name + '.txt', 'w') as fp:
            fp.write("miRNA id" + "\t" + sample_name + "\n")
            for row in samp:
                fp.write("\t".join([row[0], row[col]]) + "\n")

##################################################################################################################

def _db_write_section(fp, arm, rows):
    """Write one arm banner followed by its records, most reads first."""
    rule = "*********************************************************************************************************\n"
    rows = sorted(rows, key=lambda x: int(x[0].split('-')[1]), reverse=True)
    fp.write(rule)
    fp.write("%-8s\t%-22s\t%-25s\t%-30s\t%s\n" % ("|",str(arm[0]),"Sequence count = "+str(arm[1]),"Total reads = "+str(arm[2]),"|"))
    fp.write(rule + "\n")
    for x in rows:
        fp.write("%-8s\t%-40s\t%s\n" % (x[0].split("-")[1], x[2],x[9]))
    fp.write("\n" + "\n")


def DB_write(con,name,unique_seq,sorted_uni_arms,f):
    """Write a per-arm report of the records of one sample.

    The report starts with a column header and then contains, for every
    entry of ``sorted_uni_arms`` (``[arm name, sequence count, total reads]``),
    a banner and the matching records sorted by read count (taken from
    ``"<id>-<count>"`` in column 0), highest first.

    Output folder and matching rule depend on ``f``:
      * f == 1: ``split1/`` (con "c") or ``split2/`` (con "t"); a record
        matches when the arm name is contained in the part of its name
        before the first "_"; arms without records are still written.
      * f == 2: ``split3/`` (con "c") or ``split4/`` (con "t"); a record
        matches when the arm name equals the part of its name before the
        first "__"; arms without records are skipped.
      * any other value: nothing is written.

    Args:
        con: "c" or "t".
        name: Output file name inside the folder.
        unique_seq: Records (column 0 id-count, 2 name, 9 sequence).
        sorted_uni_arms: Arm summaries, written in the given order.
        f: Report type, 1 or 2.

    Raises:
        ValueError: If ``con`` is neither "c" nor "t" (previously this
            failed later with an unbound file handle).
    """
    if f == 1:
        control_dir, treated_dir = 'split1/', 'split2/'
    elif f == 2:
        control_dir, treated_dir = 'split3/', 'split4/'
    else:
        return

    if con == "c":
        target = control_dir + name
    elif con == "t":
        target = treated_dir + name
    else:
        raise ValueError("con must be 'c' or 't', got %r" % (con,))

    # "with" closes the file even when a write fails.
    with open(target, 'w') as fp:
        fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence"))

        for arm in sorted_uni_arms:
            if f == 1:
                rows = [rec for rec in unique_seq if arm[0] in rec[2].split("_")[0]]
            else:
                rows = [rec for rec in unique_seq if arm[0] == rec[2].split("__")[0]]
                if not rows:
                    continue
            _db_write_section(fp, arm, rows)


##########################################################################################################################

def new_mat_seq(pre_unique_seq,mat_mirnas,l):
    """Collect records that out-number the three-part record they derive from.

    For every record x whose name has exactly three "_"-separated parts,
    each record y whose name contains x's name and whose read count (from
    ``"<id>-<count>"`` in column 0) is strictly higher is selected once: its
    name is prefixed with ">" IN PLACE and the record is remembered.

    The new names and sequences (columns 2 and 9) of the selected records
    are then appended, alternating, to ``mat_mirnas`` while holding the lock.

    Args:
        pre_unique_seq: Records of one sample (modified in place).
        mat_mirnas: Shared list-like collector.
        l: Lock object providing ``acquire()`` / ``release()``.
    """
    unique_iso = []
    for x in pre_unique_seq:
        if len(x[2].split("_")) != 3:
            continue
        for y in pre_unique_seq:
            if x[2] in y[2] and int(x[0].split("-")[1]) < int(y[0].split("-")[1]):
                # Skip records whose current name already appears as a field
                # of a selected record.
                if not any(y[2] in lst2 for lst2 in unique_iso):
                    y[2] = ">" + y[2]
                    unique_iso.append(y)

    # try/finally guarantees the lock is released if an append fails.
    l.acquire()
    try:
        for x in unique_iso:
            mat_mirnas.append(x[2])
            mat_mirnas.append(x[9])
    finally:
        l.release()

#########################################################################################################################

def merging_names(LH2E_copy,new):
    """Give rows that share one sequence a single, merged name.

    Rows are read as ``[name, sequence, ...]``. A sequence counts as
    duplicated when a row is met whose sequence or name has been seen
    before. For each duplicated sequence the names of its rows are gathered
    (names with the same prefix before the first "_" are kept once, the
    shorter one replacing a longer one) and joined with "/". Every row with
    that sequence then receives the joined name.

    ``LH2E_copy`` is renamed and sorted in place; its de-duplicated rows are
    appended to ``new``.
    """
    # Flat record of every sequence and name met so far.
    seen = []
    repeated_seqs = []
    for row in LH2E_copy:
        if row[1] in seen or row[0] in seen:
            repeated_seqs.append(row[1])
        else:
            seen.append(row[1])
            seen.append(row[0])

    # One [sequence, name, name, ...] group per duplicated sequence.
    groups = [[seq] for seq in set(repeated_seqs)]

    for row in LH2E_copy:
        for group in groups:
            if row[1] != group[0]:
                continue
            if len(group) == 1:
                group.append(row[0])
                continue

            same_prefix_found = False
            for pos in range(1, len(group)):
                if group[pos].split("_")[0] == row[0].split("_")[0]:
                    same_prefix_found = True
                    if len(row[0]) < len(group[pos]):
                        # The shorter name takes the place of the longer one.
                        del group[pos]
                        group.append(row[0])
                        break

            if not same_prefix_found:
                group.append(row[0])

    # Fold the extra names into group[1], last name first.
    for group in groups:
        if len(group) > 2:
            group[1:] = ["/".join([group[1]] + group[:1:-1])]

    for row in LH2E_copy:
        for group in groups:
            if row[1] == group[0]:
                row[0] = group[1]

    LH2E_copy.sort()
    new.extend([row for row, _ in itertools.groupby(LH2E_copy)])


######################################################################################################################################################

def ssamples1(tem_names,tem_samp,non_names,non_samp,folder,pro):
    """Write one two-column file per sample, combining two count tables.

    For every column of ``tem_samp`` from index 2 on, the file
    ``folder + <sample> + '.txt'`` receives the header
    ``miRNA id<TAB><sample>``, one ``<row[0]><TAB><count>`` line per row of
    ``tem_samp`` and then, for every entry of ``non_names`` equal to the
    sample name, the corresponding column of ``non_samp``.

    Args:
        tem_names: Sample names of ``tem_samp`` (``tem_names[k]`` belongs to
            column ``k + 2``).
        tem_samp: First table (lists of strings).
        non_names: Sample names of ``non_samp``, same column convention.
        non_samp: Second table (lists of strings).
        folder: Path prefix, concatenated as is with the file name.
        pro: Not used by this function.
    """
    for col in range(2, len(tem_samp[0])):
        sample_name = tem_names[col - 2]

        # "with" closes the file even when a write fails.
        with open(folder + sample_name + '.txt', 'w') as fp:
            fp.write("miRNA id" + "\t" + sample_name + "\n")

            for row in tem_samp:
                fp.write("\t".join([row[0], row[col]]) + "\n")

            for j, non_name in enumerate(non_names):
                if non_name == sample_name:
                    for row in non_samp:
                        fp.write("\t".join([row[0], row[j + 2]]) + "\n")

###################################################################################################################################################################################################################

def download_matures(matures,org_name,url='ftp://mirbase.org/pub/mirbase/CURRENT/mature.fa.gz'):
    """Download a gzipped mature-miRNA FASTA file and keep one organism.

    The file is read as alternating header / sequence lines. Every header
    containing ``org_name`` is appended to ``matures`` together with the
    line that follows it.

    Args:
        matures: List-like collector receiving header and sequence lines.
        org_name: Text searched for in the header lines.
        url: Location of the gzipped FASTA file. Defaults to the current
            miRBase release; a pinned release (for example
            ``.../pub/mirbase/21/mature.fa.gz``) can be passed instead.
    """
    # The context manager closes the connection once the data is read.
    with urllib.request.urlopen(url) as response:
        data = response.read()

    file_mirna = gzip.decompress(data).decode('utf-8').split("\n")

    # Headers sit on even indices; the last element is skipped so that
    # index i + 1 always exists.
    for i in range(0, len(file_mirna) - 1, 2):
        if org_name in file_mirna[i]:
            matures.append(file_mirna[i])
            matures.append(file_mirna[i + 1])

###################################################################################################################################################################################################################
def non_template_ref(sc,st,all_isoforms):
    """Collect FASTA-style entries for the records whose name contains ")_".

    ``sc`` and ``st`` are iterables of samples, each sample being a list of
    records with the name in column 2 and the sequence in column 9. Samples
    of ``sc`` are processed first, then those of ``st``. For every record
    whose name contains ")_" and whose ">"-prefixed name is not yet in
    ``all_isoforms``, the prefixed name and the sequence are appended to
    ``all_isoforms``.
    """
    control_samples = list(sc)
    treated_samples = list(st)

    for sample in control_samples + treated_samples:
        for record in sample:
            header = ">" + record[2]
            if header in all_isoforms or ")_" not in record[2]:
                continue
            all_isoforms.append(header)
            all_isoforms.append(record[9])

################################################################################################################################################################################################

def deseqe2(sample,mir_names,l,new_d,sample_name,sample_order):
    """Complete one sample with zero counts and publish it.

    ``sample`` is a list of ``[name, count, sequence]`` entries. Every
    ``[name, sequence]`` pair of ``mir_names`` whose name is missing from
    the sample is added with count "0". The sample is then sorted by name
    (in place) and consecutive identical entries are collapsed. The result
    is appended to ``new_d`` and ``sample_name`` to ``sample_order`` while
    holding the lock.

    Args:
        sample: Entries of one sample (extended and sorted in place).
        mir_names: ``[name, sequence]`` pairs that must be present.
        l: Lock object providing ``acquire()`` / ``release()``.
        new_d: Shared list-like collector for the completed samples.
        sample_name: Name of this sample.
        sample_order: Shared list-like collector for the sample names.
    """
    # Set lookup instead of rescanning the sample for every name.
    present = {entry[0] for entry in sample}
    for y in mir_names:
        if y[0] not in present:
            sample.append([y[0], "0", y[1]])
            present.add(y[0])

    sample.sort(key=lambda x: x[0])
    sample = [entry for entry, _ in itertools.groupby(sample)]

    # try/finally guarantees the lock is released if an append fails.
    l.acquire()
    try:
        new_d.append(sample)
        sample_order.append(sample_name)
    finally:
        l.release()

###############################################################################################################################################################################################