view mirbase_ultra_v2.py @ 0:99e8a03f8802 draft

Uploaded
author glogobyte
date Fri, 16 Oct 2020 18:53:05 +0000
parents
children
line wrap: on
line source

from mirbase_functions import *
from mirbase_graphs import *
import itertools
import time
import sys
import os
import urllib.request
import gzip
from multiprocessing import Process, Queue, Lock, Pool, Manager, Value
import subprocess
import argparse
from collections import OrderedDict
from matplotlib.backends.backend_pdf import PdfPages
import pandas as pd
from math import pi
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import PercentFormatter
import seaborn as sns
import scipy.stats as stats
from plotnine import *
import math
import re
import matplotlib.ticker as mtick
import copy

subprocess.call(['mkdir','-p', 'split1','split2','split3','split4','split11','split12','Counts','Diff/temp_con','Diff/temp_tre','Diff/n_temp_con','Diff/n_temp_tre'])

parser = argparse.ArgumentParser()
parser.add_argument("-analysis", "--anal", help="choose type of analysis", action="store")
parser.add_argument("-con", "--control", help="input fastq file", nargs='+', default=[])
parser.add_argument("-tre", "--treated", help="input fastq file", nargs='+', default=[] )
parser.add_argument("-tool_dir", "--tool_directory", help="tool directory path", action="store")
parser.add_argument("-gen", "--org_name", help="tool directory path", action="store")
parser.add_argument("-program", "--pro", help="choose type of analysis", action="store")
parser.add_argument("-f", "--flag", help="choose the database", action="store")
parser.add_argument("-umis", "--umi", help="choose the database", action="store")
parser.add_argument("-percentage", "--per", help="choose the database", action="store")
parser.add_argument("-counts", "--count", help="choose the database", action="store")
parser.add_argument("-name1", "--n1", help="choose the database", action="store")
parser.add_argument("-name2", "--n2", help="choose the database", action="store")
args = parser.parse_args()

#########################################################################################################################################

def collapse_sam(path):

    ini_sam=read(path,0)
    main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]]
    intro_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" in  x.split("\t")[0]]

    uni_seq = []
    for x in main_sam:

        if [x[2], x[9]] not in uni_seq:
            uni_seq.append([x[2], x[9]])

    new_main_sam=[]
    incr_num=0
    for i in range(len(uni_seq)):
        count=0
        incr_num+=1
        for y in main_sam:
            if uni_seq[i][1]==y[9] and uni_seq[i][0]==y[2]:
               count+=1
               temp=y
        temp[10]="~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
        temp[0]=str(incr_num)+"-"+str(count)
        new_main_sam.append(temp)

    new_sam=intro_sam+new_main_sam

    return new_sam

#################################################################################################################################################################################################################

def duplicate_chroms_isoforms(List):

 dupes=[]

 for num in range(len(List)):

    if  [List[num][9],List[num][0],List[num][2]] not in dupes :
        dupes.append([List[num][9],List[num][0],List[num][2]])

 for x in List:
     for y in dupes:
         if x[9]==y[0] and x[0]==y[1] and x[2].split("_")[0]==y[2].split("_")[0] and x[2]!=y[2]:
            y.append(x[2])


 double_List = [x[:] for x in List]

 chr_order=[]
 for x in dupes:
     temp = []
     for i in range(2,len(x)):
         if x[i].split("chr")[1].split("(")[0].isdigit():
            temp.append(int(x[i].split("chr")[1].split("(")[1][0]+x[i].split("chr")[1].split("(")[0]))
         else:
            temp.append(x[i].split("chr")[1][0:4])

     for z in temp:
         if 'X(-)'==z or 'Y(-)'==z or 'X(+)'==z or 'Y(+)'==z:
             temp = [str(j) for j in temp]
     temp=list(set(temp))
     temp.sort()
     chr_order.append(temp)

 final_dupes=[]
 for i in range(len(dupes)):
     final_dupes.append([dupes[i][0],dupes[i][2].split("_")[0],dupes[i][1]])
     for x in chr_order[i]:
         result = re.match("[-+]?\d+$", str(x))
         if len(chr_order[i]) == len(set(chr_order[i])):
           if result is not None:

             if int(x)<0:
                final_dupes[i][1]=final_dupes[i][1]+"_chr"+str(abs(int(x)))+"(-)"
             else:
                final_dupes[i][1] = final_dupes[i][1] + "_chr" + str(abs(int(x)))+"(+)"
           else:
                final_dupes[i][1] = final_dupes[i][1] + "_chr" + str(x)
         else:
             if result is not None:
                 if int(x) < 0:
                     final_dupes[i][1] = final_dupes[i][1] +dupes[i][2].split("_")[1]+ "_chr" + str(abs(int(x))) + "(-)"
                 else:
                     final_dupes[i][1] = final_dupes[i][1] +dupes[i][2].split("_")[1]+ "_chr" + str(abs(int(x))) + "(+)"
             else:
                 final_dupes[i][1] = final_dupes[i][1] +dupes[i][2].split("_")[1]+ "_chr" + str(x)

 final_dupes.sort()
 final_dupes=list(final_dupes for final_dupes,_ in itertools.groupby(final_dupes))

 for i in range(len(double_List)):
     for x in final_dupes:

         if double_List[i][9] == x[0] and double_List[i][0] == x[2] and len(double_List[i][2].split("_")) >3 and double_List[i][2].split("_")[0]==x[1].split("_")[0]:
            gg=str("_"+double_List[i][2].split("_")[-2]+"_"+double_List[i][2].split("_")[-1])
            double_List[i][2] = x[1]+gg

         if double_List[i][9]==x[0] and double_List[i][0]== x[2] and len(double_List[i][2].split("_"))==3 and double_List[i][2].split("_")[0]==x[1].split("_")[0]:
            double_List[i][2]=x[1]
            List[i][2] = x[1]

 List.sort()
 new_list=list(List for List,_ in itertools.groupby(List))

 double_List.sort()
 new_double_List = list(double_List for double_List, _ in itertools.groupby(double_List))

 return new_list, new_double_List


#############################################################################################################################################################################################################

def sam(mature_mirnas,path,name,con,l,samples,data,names,unmap_seq,samples_mirna_names,deseq,LHE_names,umi,ini_sample,unmap_counts):

    # read the sam file
    ini_sam=read(path,0)
    new_main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]]
    unique_seq = [x for x in new_main_sam if x[1] == '0' and len(x[9])>=18 and len(x[9])<=26]

    sorted_uni_arms = []

    for i in range(len(mature_mirnas)):
        tmp_count_reads = 0   # calculate the total number of reads
        tmp_count_seq = 0     # calculate the total number of sequences
        for j in range(len(unique_seq)):

         if "{" in unique_seq[j][2].split("_")[0]:
             official=unique_seq[j][2].split("_")[0][:-4]
         else:
             official=unique_seq[j][2].split("_")[0]

         if mature_mirnas[i].split(" ")[0][1:] == official:

                temp_mature = mature_mirnas[i+1].strip().replace("U", "T")
                off_part = longestSubstring(temp_mature, unique_seq[j][9])

                mat_diff = temp_mature.split(off_part)
                mat_diff = [len(mat_diff[0]), len(mat_diff[1])]

                unique_diff = unique_seq[j][9].split(off_part)
                unique_diff = [len(unique_diff[0]), len(unique_diff[1])]

                # Problem with hsa-miR-8485
                if mat_diff[1]!=0 and unique_diff[1]!=0:
                    unique_seq[j]=1
                    pre_pos = 0
                    post_pos = 0

                elif mat_diff[0]!=0 and unique_diff[0]!=0:
                    unique_seq[j]=1
                    pre_pos = 0
                    post_pos = 0

                else:
                   pre_pos = mat_diff[0]-unique_diff[0]
                   post_pos = unique_diff[1]-mat_diff[1]
                   tmp_count_reads = tmp_count_reads + int(unique_seq[j][0].split("-")[1])
                   tmp_count_seq = tmp_count_seq+1

                if pre_pos != 0 or post_pos != 0:
                    if pre_pos == 0:
                        unique_seq[j][2] = unique_seq[j][2] + "_" +str(pre_pos) + "_" + '{:+d}'.format(post_pos)
                    elif post_pos == 0:
                        unique_seq[j][2] = unique_seq[j][2] + "_" + '{:+d}'.format(pre_pos) + "_" + str(post_pos)
                    else:
                        unique_seq[j][2] = unique_seq[j][2]+"_"+'{:+d}'.format(pre_pos)+"_"+'{:+d}'.format(post_pos)

        for x in range(unique_seq.count(1)):
           unique_seq.remove(1)
        if tmp_count_reads != 0 and tmp_count_seq != 0:
           sorted_uni_arms.append([mature_mirnas[i].split(" ")[0][1:], tmp_count_seq, tmp_count_reads])
    sorted_uni_arms = sorted(sorted_uni_arms, key=lambda x: x[1], reverse=True)
    dedup_unique_seq,double_fil_uni_seq=duplicate_chroms_isoforms(unique_seq)

    for y in sorted_uni_arms:
       counts=0
       seqs=0
       for x in double_fil_uni_seq:
           if y[0]==x[2].split("_")[0]:
              counts+=int(x[0].split("-")[1])
              seqs+=1

       y[1]=seqs
       y[2]=counts

    LHE=[]
    l.acquire()
    if con=="c":
       LHE.extend(z[2] for z in double_fil_uni_seq)
       for y in double_fil_uni_seq:
           samples_mirna_names.append([y[2],y[9]])
       deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in double_fil_uni_seq])
       LHE_names.extend(LHE)
       unmap_seq.value += sum([1 for x in new_main_sam if x[1] == '4'])
       unmap_counts.value += sum([int(x[0].split("-")[1]) for x in new_main_sam if x[1] == '4'])
       names.append(name)
       samples.append(dedup_unique_seq)
       data.append([con,name,double_fil_uni_seq,sorted_uni_arms])
       ini_sample.append(new_main_sam)

    if con=="t":
       LHE.extend(z[2] for z in double_fil_uni_seq)
       for y in double_fil_uni_seq:
           samples_mirna_names.append([y[2],y[9]])
       deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in double_fil_uni_seq])
       LHE_names.extend(LHE)
       unmap_seq.value += sum([1 for x in new_main_sam if x[1] == '4'])
       unmap_counts.value += sum([int(x[0].split("-")[1]) for x in new_main_sam if x[1] == '4'])
       names.append(name)
       samples.append(dedup_unique_seq)
       data.append([con,name,double_fil_uni_seq,sorted_uni_arms])
       ini_sample.append(new_main_sam)
    l.release()


######################################################################################################################################
"""

Read a sam file from Bowtie and do the followings:

1) Remove reverse stranded mapped reads
2) Remove unmapped reads 
3) Remove all sequences with reads less than 11 reads
4) Sort the arms with the most sequences in decreading rate
5) Sort the sequences of every arm with the most reads in decreasing rate
6) Calculate total number of sequences of every arm
7) Calculate total number of reads of sequences of every arm.
8) Store all the informations in a txt file 

"""

def non_sam(mature_mirnas,path,name,con,l,data,names,n_deseq,n_samples_mirna_names,n_LHE_names):

    ini_sam=read(path,0)
    new_main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]]
    unique_seq=[]
    unique_seq = [x for x in new_main_sam if x[1] == '4' and len(x[9])>=18 and len(x[9])<=26]

    uni_seq=[]
    # Calculate the shifted positions for every isomir and add them to the name of it
    sorted_uni_arms = []
    for i in range(1,len(mature_mirnas),2):
        tmp_count_reads = 0   # calculate the total number of reads
        tmp_count_seq = 0     # calculate the total number of sequences

        for j in range(len(unique_seq)):

            temp_mature = mature_mirnas[i].strip().replace("U", "T")

            if temp_mature in unique_seq[j][9]:

                off_part = longestSubstring(temp_mature, unique_seq[j][9])

                mat_diff = temp_mature.split(off_part)
                mat_diff = [len(mat_diff[0]), len(mat_diff[1])]

                unique_diff = unique_seq[j][9].split(off_part)
                if len(unique_diff)<=2:
                   unique_diff = [len(unique_diff[0]), len(unique_diff[1])]

                   pre_pos = mat_diff[0]-unique_diff[0]
                   post_pos = unique_diff[1]-mat_diff[1]

                   lengthofmir = len(off_part) + post_pos
                   if pre_pos == 0:
                      tmp_count_reads = tmp_count_reads + int(unique_seq[j][0].split("-")[1])
                      tmp_count_seq = tmp_count_seq + 1

                      if pre_pos == 0:

                          t_name=unique_seq[j].copy()
                          t_name[2]=mature_mirnas[i - 1].split(" ")[0][1:] + "__" + str(pre_pos) + "_" + '{:+d}'.format(post_pos) + "_" + str(unique_seq[j][9][len(off_part):])
                          uni_seq.append(t_name)


        if tmp_count_reads != 0 and tmp_count_seq != 0:
            sorted_uni_arms.append([mature_mirnas[i-1].split(" ")[0][1:], tmp_count_seq, tmp_count_reads])


    sorted_uni_arms = sorted(sorted_uni_arms, key=lambda x: x[1], reverse=True)
    unique_seq = list(map(list, OrderedDict.fromkeys(map(tuple,uni_seq))))

    LHE=[]

    l.acquire()
    if con=="c":
       LHE.extend(x[2] for x in unique_seq if x[2]!="*")
       for x in unique_seq:
           if x[2]!="*":
              n_samples_mirna_names.append([x[2],x[9]])
       n_deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in unique_seq if x[2]!="*"])
       n_LHE_names.extend(LHE)
       names.append(name)
       data.append([con,name,unique_seq,sorted_uni_arms])


    if con=="t":
       LHE.extend(x[2] for x in unique_seq if x[2]!="*")
       for x in unique_seq:
           if x[2]!="*":
              n_samples_mirna_names.append([x[2],x[9]])
       n_deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in unique_seq if x[2]!="*"])
       n_LHE_names.extend(LHE)
       names.append(name)
       data.append([con,name,unique_seq,sorted_uni_arms])
    l.release()

#####################################################################################################################################################################################################################
def deseq2_temp(samples_mirna_names,deseq,con,l):

    samples_mirna_names.sort(key=lambda x:[0])
    for i in range(len(deseq)):
        for y in samples_mirna_names:
            flag = 0
            for x in deseq[i]:
                if y[0] == x[0]:
                    flag = 1
                    break

            if flag == 0:
                deseq[i].append([y[0], "0", y[1]])

    [deseq[i].sort(key=lambda x: x[0]) for i, _ in enumerate(deseq)]
    deseq_final = [[x[0],x[2]] for x in deseq[0]]
    [deseq_final[z].append(deseq[i][j][1]) for z,_ in enumerate(deseq_final) for i, _ in enumerate(deseq) for j,_ in enumerate(deseq[i]) if deseq_final[z][0] == deseq[i][j][0]]

    l.acquire()
    if con=="c":
       q1.put(deseq_final)

    if con=="t":
       q2.put(deseq_final)
    l.release()


####################################################################################################################################################################################################################

def main_temp(LH2E, LH2E_names, LH8E, LH8E_names,flag,names_con,names_tre,filter_LH8E,filter_LH2E,raw_LH8E,raw_LH2E):

 LH8E_add_names = [x for x in LH2E_names if x not in LH8E_names]
 LH2E_add_names = [x for x in LH8E_names if x not in LH2E_names]

 LH8E_add_names.sort()
 LH2E_add_names.sort()
 LH8E_add_names = list(LH8E_add_names for LH8E_add_names,_ in itertools.groupby(LH8E_add_names))
 LH2E_add_names = list(LH2E_add_names for LH2E_add_names,_ in itertools.groupby(LH2E_add_names))

 LH2E.sort()
 LH8E.sort()
 LH2E = list(LH2E for LH2E,_ in itertools.groupby(LH2E))
 LH8E = list(LH8E for LH8E,_ in itertools.groupby(LH8E))

 print("LHE_names")
 print([len(LH8E_add_names),len(LH2E_add_names)])
 print([len(LH8E),len(LH2E)])

 zeros=["0"]*(len(LH8E[0])-2)
 [LH8E_add_names[i].extend(zeros) for i,_ in enumerate(LH8E_add_names)]
 LH8E=LH8E+LH8E_add_names

 zeros=["0"]*(len(LH2E[0])-2)
 [LH2E_add_names[i].extend(zeros) for i,_ in enumerate(LH2E_add_names)]
 LH2E=LH2E+LH2E_add_names

 dupes=[]
 final_LH2E =[]

 for num,_ in enumerate(LH2E):

    if LH2E[num][1] not in final_LH2E and LH2E[num][0] not in final_LH2E:
        final_LH2E.append(LH2E[num][1])
        final_LH2E.append(LH2E[num][0])
    else:
        dupes.append(LH2E[num][1])


 dupes=list(set(dupes))

 dupes=[[x] for x in dupes]

 for x in LH2E:
     for y in dupes:
         if x[1]==y[0]:
             fl=0
             if len(y)==1:
                 y.append(x[0])
             else:
                 for i in range(1,len(y)):
                     if y[i].split("_")[0]==x[0].split("_")[0]:
                        fl=1
                        if len(x[0])<len(y[i]):
                           del y[i]
                           y.append(x[0])
                           break

                 if fl==0:
                    y.append((x[0]))

 for y in dupes:
    if len(y)>2:
        for i in range(len(y)-1,1,-1):
            y[1]=y[1]+"/"+y[i]
            del y[i]

 for x in LH2E:
     for y in dupes:
         if x[1]==y[0]:
            x[0]=y[1]

 for x in LH8E:
    for y in dupes:
        if x[1]==y[0]:
           x[0]=y[1]


 LH2E.sort()
 LH2E=list(LH2E for LH2E,_ in itertools.groupby(LH2E))

 LH8E.sort()
 LH8E=list(LH8E for LH8E,_ in itertools.groupby(LH8E))

 LH8E_new=[]
 LH2E_new=[]

 if int(args.per)!=-1:
    percent=int(args.per)/100
    print(percent)
    print(args.count)

    c_col_filter=round(percent*(len(LH2E[1])-2))
    t_col_filter=round(percent*(len(LH8E[1])-2))

    for i, _ in enumerate(LH2E):
        c_cols=0
        t_cols=0

        c_cols=sum([1 for j in range(len(LH2E[i])-2) if int(LH2E[i][j+2])>=int(args.count)])
        t_cols=sum([1 for j in range(len(LH8E[i])-2) if int(LH8E[i][j+2])>=int(args.count)])

        if c_cols>=c_col_filter or t_cols>=t_col_filter:
           LH8E_new.append(LH8E[i])
           LH2E_new.append(LH2E[i])

 filter_LH2E.extend(LH2E_new)
 filter_LH8E.extend(LH8E_new)
 raw_LH2E.extend(LH2E)
 raw_LH8E.extend(LH8E)

##################################################################################################################################################################################################################

def write_main(raw_LH2E, raw_LH8E, fil_LH2E, fil_LH8E, names_con, names_tre, flag):

 if flag == 1 and int(args.per)!=-1:
    fp = open('Counts/Filtered '+args.n2 +' Templated Counts', 'w')
    fp.write("Name\t")
    fp.write("Sequence")
    for y in names_tre:
       fp.write("\t"+y)

    for x in fil_LH8E:
        fp.write("\n%s" % "\t".join(x))
    fp.close()

    fp = open('Counts/Filtered '+args.n1+' Templated Counts', 'w')
    fp.write("Name\t")
    fp.write("Sequence")
    for y in names_con:
       fp.write("\t"+y)

    for x in fil_LH2E:
        fp.write("\n%s" % "\t".join(x))
    fp.close()


 if flag == 2 and int(args.per)!=-1:
    fp = open('Counts/Filtered '+args.n2+' Non-Templated Counts', 'w')
    fp.write("Name\t")
    fp.write("Sequence")
    for y in names_tre:
       fp.write("\t"+y)


    for x in fil_LH8E:
        fp.write("\n%s" % "\t".join(x))
    fp.close()

    fp = open('Counts/Filtered '+args.n1+' Non-Templated Counts', 'w')
    fp.write("Name\t")
    fp.write("Sequence")
    for y in names_con:
       fp.write("\t"+y)

    for x in fil_LH2E:
        fp.write("\n%s" % "\t".join(x))
    fp.close()


 if flag == 1:
    fp = open('Counts/Raw '+args.n2+' Templated Counts', 'w')
    fp.write("Name\t")
    fp.write("Sequence")
    for y in names_tre:
       fp.write("\t"+y)

    for x in raw_LH8E:
        fp.write("\n%s" % "\t".join(x))
    fp.close()

    fp = open('Counts/Raw '+args.n1+' Templated Counts', 'w')
    fp.write("Name\t")
    fp.write("Sequence")
    for y in names_con:
       fp.write("\t"+y)

    for x in raw_LH2E:
        fp.write("\n%s" % "\t".join(x))
    fp.close()


 if flag == 2:
    fp = open('Counts/Raw '+args.n2+' Non-Templated Counts', 'w')
    fp.write("Name\t")
    fp.write("Sequence")
    for y in names_tre:
       fp.write("\t"+y)


    for x in raw_LH8E:
        fp.write("\n%s" % "\t".join(x))
    fp.close()

    fp = open('Counts/Raw '+args.n1+' Non-Templated Counts', 'w')
    fp.write("Name\t")
    fp.write("Sequence")
    for y in names_con:
       fp.write("\t"+y)

    for x in raw_LH2E:
        fp.write("\n%s" % "\t".join(x))
    fp.close()


#########################################################################################################################################

def ssamples(names,samp,folder,pro):

    for i in range(2,len(samp[0])):

       fp = open(folder+names[i-2]+'.txt','w')
       fp.write("miRNA id"+"\t"+names[i-2]+"\n")

       for x in samp:
           fp.write("%s" % "\t".join([x[0],x[i]])+"\n")
       fp.close()

##################################################################################################################

def DB_write(con,name,unique_seq,sorted_uni_arms,f):

 if f==1:
    # Write a txt file with all the information
    if con=="c":
       fp = open('split1/'+name, 'w')

       fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence"))
    if con=="t":
       fp = open('split2/'+name, 'w') 
       fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence"))


    for i in range(len(sorted_uni_arms)):
        temp = []
        for j in range(len(unique_seq)):

            if sorted_uni_arms[i][0] in unique_seq[j][2].split("_")[0]:

                temp.append(unique_seq[j])

        temp = sorted(temp, key=lambda x: int(x[0].split('-')[1]), reverse=True)
        fp.write("*********************************************************************************************************\n")
        fp.write("%-8s\t%-22s\t%-25s\t%-30s\t%s\n" % ("|",str(sorted_uni_arms[i][0]),"Sequence count = "+str(sorted_uni_arms[i][1]),"Total reads = "+str(sorted_uni_arms[i][2]),"|"))
        fp.write("*********************************************************************************************************\n\n")
        [fp.write("%-8s\t%-40s\t%s\n" % (x[0].split("-")[1], x[2],x[9])) for x in temp]
        fp.write("\n" + "\n")
    fp.close()

 if f==2:

    if con=="c":
       fp = open('split3/'+name, 'w')
       fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence"))
    if con=="t":
       fp = open('split4/'+name, 'w')
       fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence"))


    for i in range(len(sorted_uni_arms)):
        temp = []
        for j in range(len(unique_seq)):
               if sorted_uni_arms[i][0]==unique_seq[j][2].split("__")[0]:
                  temp.append(unique_seq[j])
        if temp!=[]:
           temp = sorted(temp, key=lambda x: int(x[0].split('-')[1]), reverse=True)
           fp.write("*********************************************************************************************************\n")
           fp.write("%-8s\t%-22s\t%-25s\t%-30s\t%s\n" % ("|",str(sorted_uni_arms[i][0]),"Sequence count = "+str(sorted_uni_arms[i][1]),"Total reads = "+str(sorted_uni_arms[i][2]),"|"))
           fp.write("*********************************************************************************************************\n\n")
           [fp.write("%-8s\t%-40s\t%s\n" % (x[0].split("-")[1], x[2],x[9])) for x in temp]
           fp.write("\n" + "\n")
    fp.close()


##########################################################################################################################

def new_mat_seq(pre_unique_seq,mat_mirnas,l):

    unique_iso = []
    for x in pre_unique_seq:
       if len(x[2].split("_"))==3:
          for y in pre_unique_seq:
              if x[2] in y[2] and int(x[0].split("-")[1])<int(y[0].split("-")[1]):
                 if any(y[2] in lst2 for lst2 in unique_iso)==False:
                    y[2]=">"+y[2]
                    unique_iso.append(y)
    l.acquire()
    for x in unique_iso:
        mat_mirnas.append(x[2])
        mat_mirnas.append(x[9])
    l.release()

#########################################################################################################################
def pie_non_temp(merge_LH2E,merge_non_LH2E,merge_LH8E,merge_non_LH8E,c_unmap,t_unmap,c_unmap_counts,t_unmap_counts):

    c_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_LH2E]
    t_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_LH8E]
    c_non_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_non_LH2E]
    t_non_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_non_LH8E]

    c_templ = 0
    c_tem_counts = 0
    c_mature = 0
    c_mat_counts = 0
    t_templ = 0
    t_tem_counts = 0
    t_mature = 0
    t_mat_counts = 0

    c_non = len(c_non_samples)
    c_non_counts = sum(x[2] for x in c_non_samples)
    t_non = len(t_non_samples)
    t_non_counts = sum(x[2] for x in t_non_samples)

    c_unmap = c_unmap - c_non
    t_unmap = c_unmap - t_non

    c_unmap_counts=c_unmap_counts - c_non_counts
    t_unmap_counts=t_unmap_counts - t_non_counts


    for x in c_samples:

       	if "/" not in x[0]:
            if "chr" in  x[0].split("_")[-1]:
               c_mature+=1
               c_mat_counts += x[2]
            else:
               c_templ+=1
               c_tem_counts += x[2]
        else:
           f=0
           for y in x[0].split("/"):
               if "chr" in y.split("_")[-1]:
                  c_mature+=1
                  c_mat_counts += x[2]
                  f=1
                  break
           if f==0:
              c_templ+=1
              c_tem_counts += x[2]

    for x in t_samples:

        if "/" not in x[0]:
            if "chr" in  x[0].split("_")[-1]:
               t_mature+=1
               t_mat_counts += x[2]
            else:
               t_templ+=1
               t_tem_counts += x[2]
        else:
           f=0
           for y in x[0].split("/"):
               if "chr" in y.split("_")[-1]:
                  t_mature+=1
                  t_mat_counts += x[2]
                  f=1
                  break
           if f==0:
              t_templ+=1
              t_tem_counts += x[2]

    fig = plt.figure(figsize=(7,5))
    labels = 'miRNA RefSeq','Template', 'Unmapped','Non-template'
    sizes = [c_mat_counts, c_tem_counts, c_unmap_counts,c_non_counts]
    colors = ['gold', 'yellowgreen', 'lightcoral', 'lightskyblue']
    ax1 = plt.subplot2grid((1,2),(0,0))
    patches, texts, autotexts=plt.pie(sizes, labels=labels, colors=colors, startangle=140,autopct='%1.1f%%',radius=0.8)
    [x.set_fontsize(8) for x in texts]
    plt.title('Control Group (reads)',fontsize=12)
    labels = 'miRNA RefSeq','Template', 'Unmapped','non-template'
    sizes = [t_mat_counts, t_tem_counts, t_unmap_counts, t_non_counts]
    colors = ['gold', 'yellowgreen', 'lightcoral', 'lightskyblue']
    ax2 = plt.subplot2grid((1,2),(0,1))
    patches, texts, autotexts=plt.pie(sizes, labels=labels, colors=colors, startangle=140,autopct='%1.1f%%',radius=0.8)
    [x.set_fontsize(8) for x in texts]
    plt.title('Treated Group (reads)', fontsize=12)
    plt.savefig('pie_non.png',dpi=300)

######################################################################################################################################################

def merging_names(LH2E_copy,new):

    dupes=[]
    final_LH2E =[]

    for num in range(len(LH2E_copy)):

        if LH2E_copy[num][1] not in final_LH2E and LH2E_copy[num][0] not in final_LH2E:
           final_LH2E.append(LH2E_copy[num][1])
           final_LH2E.append(LH2E_copy[num][0])
        else:
           dupes.append(LH2E_copy[num][1])

    dupes=list(set(dupes))

    for i in range(len(dupes)):
        dupes[i]=[dupes[i]]

    for x in LH2E_copy:
        for y in dupes:
            if x[1]==y[0]:
               fl=0
               if len(y)==1:
                  y.append(x[0])
               else:
                  for i in range(1,len(y)):
                      if y[i].split("_")[0]==x[0].split("_")[0]:
                         fl=1
                         if len(x[0])<len(y[i]):
                            del y[i]
                            y.append(x[0])
                            break

                  if fl==0:
                     y.append((x[0]))

    for y in dupes:
        if len(y)>2:
           for i in range(len(y)-1,1,-1):
               y[1]=y[1]+"/"+y[i]
               del y[i]


    for x in LH2E_copy:
        for y in dupes:
            if x[1]==y[0]:
               x[0]=y[1]

    LH2E_copy.sort()
    LH2E_copy=list(LH2E_copy for LH2E_copy,_ in itertools.groupby(LH2E_copy))

    new.extend(LH2E_copy)


######################################################################################################################################################
def pie_temp(merge_LH2E,c_unmap,c_unmap_counts,merge_LH8E,t_unmap,t_unmap_counts):

    c_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_LH2E]
    t_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_LH8E]

    c_templ = 0
    c_tem_counts = 0
    c_mature = 0
    c_mat_counts = 0
    t_templ = 0
    t_tem_counts = 0
    t_mature = 0
    t_mat_counts = 0

    for x in c_samples:

        if "/" not in x[0]:
            if "chr" in  x[0].split("_")[-1]:
               c_mature+=1
               c_mat_counts += x[2]
            else:
               c_templ+=1
               c_tem_counts += x[2]
        else:
           f=0
           for y in x[0].split("/"):
               if "chr" in y.split("_")[-1]:
                  c_mature+=1
                  c_mat_counts += x[2]
                  f=1
                  break
           if f==0:
              c_templ+=1
              c_tem_counts += x[2]

    for x in t_samples:

        if "/" not in x[0]:
            if "chr" in  x[0].split("_")[-1]:
               t_mature+=1
               t_mat_counts += x[2]
            else:
               t_templ+=1
               t_tem_counts += x[2]
        else:
           f=0
           for y in x[0].split("/"):
               if "chr" in y.split("_")[-1]:
                  t_mature+=1
                  t_mat_counts += x[2]
                  f=1
                  break
           if f==0:
              t_templ+=1
              t_tem_counts += x[2]


    fig = plt.figure()
    labels = 'miRNA RefSeq','Template', 'Unmapped'
    sizes = [c_mat_counts, c_tem_counts, c_unmap_counts]
    colors = ['gold', 'yellowgreen', 'lightskyblue']
    explode = (0.2, 0.05, 0.1)
    ax1 = plt.subplot2grid((1,2),(0,0))
    patches, texts, autotexts=plt.pie(sizes, labels=labels, colors=colors, startangle=140,autopct='%1.1f%%',radius=0.8)
    [x.set_fontsize(8) for x in texts]
    plt.title('Control group (reads)', fontsize=12)
    labels = 'miRNA RefSeq','Template', 'Unmapped'
    sizes = [t_mat_counts, t_tem_counts, t_unmap_counts]
    colors = ['gold', 'yellowgreen', 'lightskyblue']
    explode = (0.2, 0.05, 0.1)
    ax2 = plt.subplot2grid((1,2),(0,1))
    patches, texts, autotexts=plt.pie(sizes, labels=labels, colors=colors, startangle=140,autopct='%1.1f%%',radius=0.8)
    [x.set_fontsize(8) for x in texts]
    plt.title('Treated group (reads)',fontsize = 12)
    plt.savefig('pie_tem.png',dpi=300)

###################################################################################################################################################################################################################

def make_spider(merge_LH2E,merge_LH8E):

  c_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_LH2E]
  t_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_LH8E]

  c_5 = 0
  c_5_counts = 0
  c_3 = 0
  c_3_counts = 0
  c_both =0
  c_both_counts=0
  c_mature = 0
  c_mat_counts = 0
  c_exception=0
  c_exception_counts=0


  t_5 = 0
  t_5_counts = 0
  t_3 = 0
  t_3_counts = 0
  t_both = 0
  t_both_counts = 0
  t_mature = 0
  t_mat_counts = 0
  t_exception = 0
  t_exception_counts=0

  for x in c_samples:

      if "/" not in x[0]:
          if "chr" in  x[0].split("_")[-1]:
             c_mature+=1
             c_mat_counts += x[2]
          elif 0 == int(x[0].split("_")[-1]):
             c_5+=1
             c_5_counts += x[2]
          elif 0 == int(x[0].split("_")[-2]):
             c_3+=1
             c_3_counts += x[2]
          else:
             c_both+=1
             c_both_counts+=x[2]

      else:
         f=0
         for y in x[0].split("/"):
             if "chr" in y.split("_")[-1]:
                c_mature+=1
                c_mat_counts += x[2]
                f=1
                break
         if f==0:
            for y in x[0].split("/"):
                c_exception+=1
                c_exception_counts += x[2]


  for x in t_samples:

      if "/" not in x[0]:
          if "chr" in  x[0].split("_")[-1]:
             t_mature+=1
             t_mat_counts += x[2]
          elif 0 == int(x[0].split("_")[-1]):
             t_5+=1
             t_5_counts += x[2]
          elif 0 == int(x[0].split("_")[-2]):
             t_3+=1
             t_3_counts += x[2]
          else:
             t_both+=1
             t_both_counts+=x[2]

      else:
         f=0
         for y in x[0].split("/"):
             if "chr" in y.split("_")[-1]:
                t_mature+=1
                t_mat_counts += x[2]
                f=1
                break
         if f==0:
            for y in x[0].split("/"):
                t_exception+=1
                t_exception_counts += x[2]


  c_all = c_5+c_3+c_both+c_mature+c_exception
  c_all_counts = c_5_counts + c_3_counts + c_both_counts + c_mat_counts + c_exception_counts

  t_all = t_5+t_3+t_both+t_mature + t_exception
  t_all_counts = t_5_counts + t_3_counts + t_both_counts + t_mat_counts + t_exception_counts

  c_5 = round(c_5/c_all*100,2)
  c_3 = round(c_3/c_all*100,2)
  c_both = round(c_both/c_all*100,2)
  c_mature = round(c_mature/c_all*100,2)
  c_exception = round(c_exception/c_all*100,2)

  c_5_counts = round(c_5_counts/c_all_counts*100,2)
  c_3_counts = round(c_3_counts/c_all_counts*100,2)
  c_both_counts = round(c_both_counts/c_all_counts*100,2)
  c_mat_counts = round(c_mat_counts/c_all_counts*100,2)
  c_exception_counts = round(c_exception_counts/c_all_counts*100,2)

  t_5 = round(t_5/t_all*100,2)
  t_3 = round(t_3/t_all*100,2)
  t_both = round(t_both/t_all*100,2)
  t_mature = round(t_mature/t_all*100,2)
  t_exception = round(t_exception/t_all*100,2)

  t_5_counts = round(t_5_counts/t_all_counts*100,2)
  t_3_counts = round(t_3_counts/t_all_counts*100,2)
  t_both_counts = round(t_both_counts/t_all_counts*100,2)
  t_mat_counts = round(t_mat_counts/t_all_counts*100,2)
  t_exception_counts = round(t_exception_counts/t_all_counts*100,2)

  radar_max = max(c_5, c_3, c_both,c_mature,c_exception,t_5,t_3,t_both,t_mature,t_exception)
  radar_max_counts = max(c_5_counts,c_3_counts,c_both_counts,c_mat_counts,c_exception_counts,t_5_counts,t_3_counts,t_both_counts,t_mat_counts,t_exception_counts)

  df=pd.DataFrame({
  'group':['Controls','Treated'],
  """5' and 3' isomiRs""":[c_both,t_both],
  """3' isomiRs""":[c_3,t_3],
  'miRNA RefSeq':[c_mature,t_mature],
  """5' isomiRs""":[c_5,t_5],
  'Others*':[c_exception,t_exception]})

  df1=pd.DataFrame({
  'group':['Controls','Treated'],
  """5' and 3' isomiRs""":[c_both_counts,t_both_counts],
  """3' isomiRs""":[c_3_counts,t_3_counts],
  'miRNA RefSeq':[c_mat_counts,t_mat_counts],
  """5' isomiRs""":[c_5_counts,t_5_counts],
  'Others*':[c_exception_counts,t_exception_counts]})

  spider_last(df,radar_max,1)
  spider_last(df1,radar_max_counts,2)


def spider_last(df,radar_max,flag):
  # ------- PART 1: Create background
  fig = plt.figure()
  # number of variable
  categories=list(df)[1:]
  N = len(categories)

  # What will be the angle of each axis in the plot? (we divide the plot / number of variable)
  angles = [n / float(N) * 2 * pi for n in range(N)]
  angles += angles[:1]

  # Initialise the spider plot
  ax = plt.subplot(111, polar=True)

  # If you want the first axis to be on top:
  ax.set_theta_offset(pi/2)
  ax.set_theta_direction(-1)

  # Draw one axe per variable + add labels labels yet
  plt.xticks(angles[:-1], categories, fontsize=11)

  # Draw ylabels
  radar_max=round(radar_max+radar_max*0.1)
  mul=len(str(radar_max))-1
  maxi=int(math.ceil(radar_max / pow(10,mul))) * pow(10,mul)
  sep = round(maxi/4)
  plt.yticks([sep, 2*sep, 3*sep, 4*sep, 5*sep], [str(sep)+'%', str(2*sep)+'%', str(3*sep)+'%', str(4*sep)+'%', str(5*sep)+'%'], color="grey", size=10)
  plt.ylim(0, maxi)

  # ------- PART 2: Add plots

  # Plot each individual = each line of the data
  # I don't do a loop, because plotting more than 3 groups makes the chart unreadable

  # Ind1
  values=df.loc[0].drop('group').values.flatten().tolist()
  values += values[:1]
  ax.plot(angles, values,'-o', linewidth=1, linestyle='solid', label="Controls")
  ax.fill(angles, values, 'b', alpha=0.1)

  # Ind2
  values=df.loc[1].drop('group').values.flatten().tolist()
  values += values[:1]
  ax.plot(angles, values, '-o' ,linewidth=1, linestyle='solid', label="Treated")
  ax.fill(angles, values, 'r', alpha=0.1)

  # Add legend
  if flag==1:
     plt.legend(loc='upper right', bbox_to_anchor=(0.0, 0.1))
     plt.savefig('spider_non_red.png',dpi=300)
  else:
     plt.legend(loc='upper right', bbox_to_anchor=(0.0, 0.1))
     plt.savefig('spider_red.png',dpi=300)


#############################################################################################################################################################################################################
def hist_red(samples,flag):
    lengths=[]
    cat=[]
    total_reads=0
    seq=[]

    if flag == "c":
        title = "Length Distribution of Control group (Redudant reads)"
    if flag == "t":
        title = "Length Distribution of Treated group (Redudant reads)"

    for i in samples:
        for x in i:
            lengths.append(len(x[9]))
            if x[1]=="0":
               seq.append([x[9],x[0].split("-")[1],"Mapped"])
               cat.append("Mapped")
            if x[1] == "4":
               seq.append([x[9],x[0].split("-")[1],"Unmapped"])
               cat.append("Unmapped")

    uni_len=list(set(lengths))
    uni_len=[x for x in uni_len if x<=35]
    low=min(lengths)
    up=max(lengths)
    seq.sort()
    uni_seq=list(seq for seq,_ in itertools.groupby(seq))
    dim=up-low

    if dim>20:
       s=5
    else:
       s=8

    total_reads+=sum([int(x[1]) for x in uni_seq])

    map_reads=[]
    unmap_reads=[]
    length=[]
    for y in uni_len:
        map_temp=0
        unmap_temp=0
        for x in uni_seq:
            if len(x[0])==y and x[2]=="Mapped":
               map_temp+=int(x[1])
            if len(x[0])==y and x[2]=="Unmapped":
               unmap_temp+=int(x[1])
        if y<=35:
           length.append(y)
           map_reads.append(round(map_temp/total_reads*100,2))
           unmap_reads.append(round(unmap_temp/total_reads*100,2))

    ylim=max([sum(x) for x in zip(unmap_reads, map_reads)])
    ylim=ylim+ylim*20/100
    fig, ax = plt.subplots()
    width=0.8
    ax.bar(length, unmap_reads, width, label='Unmapped')
    h=ax.bar(length, map_reads, width, bottom=unmap_reads, label='Mapped')
    plt.xticks(np.arange(length[0], length[-1]+1, 1))
    plt.xlabel('Length (nt)',fontsize=14)
    plt.ylabel('Percentage',fontsize=14)
    plt.title(title,fontsize=14)
    ax.legend()
    plt.ylim([0, ylim])
    ax.grid(axis='y',linewidth=0.2)

    if flag=='c':
       plt.savefig('c_hist_red.png',dpi=300)

    if flag=='t':
       plt.savefig('t_hist_red.png',dpi=300)

#################################################################################################################


def logo_seq_red(merge, flag):

    if flag=="c":
       titlos="Control group (Redundant)"
       file_logo="c_logo.png"
       file_bar="c_bar.png"
    if flag=="t":
       titlos="Treated group (Redundant)"
       file_logo="t_logo.png"
       file_bar="t_bar.png"

    c_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge]

    A=[0]*5
    C=[0]*5
    G=[0]*5
    T=[0]*5
    total_reads=0

    for y in c_samples:
        if "/" in y[0]:
            length=[]
            for x in y[0].split("/"):
                length.append([len(x.split("_")[-1]),x.split("_")[-1],y[2]])

            best=min(length)
            total_reads+=best[2]
            for i in range(5):
                if i<len(best[1]):
                    if best[1][i] == "A":
                        A[i]+=best[2]
                    elif best[1][i] == "C":
                        C[i]+=best[2]
                    elif best[1][i] == "G":
                        G[i]+=best[2]
                    else:
                        T[i]+=best[2]
        else:
            total_reads+=y[2]
            for i in range(5):
                if i<len(y[0].split("_")[-1]):
                    if y[0].split("_")[-1][i] == "A":
                        A[i]+=(y[2])
                    elif y[0].split("_")[-1][i] == "C":
                        C[i]+=(y[2])
                    elif y[0].split("_")[-1][i] == "G":
                        G[i]+=(y[2])
                    else:
                        T[i]+=y[2]

    A[:] = [round(x*100,1) / total_reads for x in A]
    C[:] = [round(x*100,1) / total_reads for x in C]
    G[:] = [round(x*100,1) / total_reads for x in G]
    T[:] = [round(x*100,1) / total_reads for x in T]



    data = {'A':A,'C':C,'G':G,'T':T}
    df = pd.DataFrame(data, index=[1,2,3,4,5])
    h=df.plot.bar(color=tuple(["g", "b","gold","r"]) )
    h.grid(axis='y',linewidth=0.2)
    plt.xticks(rotation=0, ha="right")
    plt.ylabel("Counts (%)",fontsize=18)
    plt.xlabel("Positions (nt)",fontsize=18)
    plt.title(titlos,fontsize=20)
    plt.tight_layout()
    plt.savefig(file_bar, dpi=300)

    import logomaker as lm
    crp_logo = lm.Logo(df, font_name = 'DejaVu Sans')
    crp_logo.style_spines(visible=False)
    crp_logo.style_spines(spines=['left', 'bottom'], visible=True)
    crp_logo.style_xticks(rotation=0, fmt='%d', anchor=0)

    # style using Axes methods
    crp_logo.ax.set_title(titlos,fontsize=18)
    crp_logo.ax.set_ylabel("Counts (%)", fontsize=16,labelpad=5)
    crp_logo.ax.set_xlabel("Positions (nt)",fontsize=16, labelpad=5)
    crp_logo.ax.xaxis.set_ticks_position('none')
    crp_logo.ax.xaxis.set_tick_params(pad=-1)
    figure = plt.gcf()
    figure.set_size_inches(6, 4)
    crp_logo.fig.savefig(file_logo,dpi=300)

##########################################################################################################################################################################################################



def logo_seq_non_red(merge_LH2E):

    c_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_LH2E]

    A=[0]*5
    C=[0]*5
    G=[0]*5
    T=[0]*5

    for y in c_samples:
        if "/" in y[0]:
            length=[]
            for x in y[0].split("/"):
                length.append([len(x.split("_")[-1]),x.split("_")[-1],y[2]])

            best=min(length)
            for i in range(5):
                if i<len(best[1]):
                    if best[1][i] == "A":
                        A[i]+=1
                    elif best[1][i] == "C":
                        C[i]+=1
                    elif best[1][i] == "G":
                        G[i]+=1
                    else:
                        T[i]+=1
        else:
            for i in range(5):
                if i<len(y[0].split("_")[-1]):
                    if y[0].split("_")[-1][i] == "A":
                        A[i]+=1
                    elif y[0].split("_")[-1][i] == "C":
                        C[i]+=1
                    elif y[0].split("_")[-1][i] == "G":
                        G[i]+=1
                    else:
                        T[i]+=1

    data = {'A':A,'C':C,'G':G,'T':T}
    df = pd.DataFrame(data, index=[1,2,3,4,5])
    h=df.plot.bar(title="Non-templated nucleotides after templated sequence",color=tuple(["g", "b","gold","r"]))
    h.set_xlabel("Positions (nt)")
    h.set_ylabel("Unique sequences")
    plt.xticks(rotation=0, ha="right")
    plt.tight_layout()
    plt.savefig("bar2.png", dpi=300)


    import logomaker as lm
    crp_logo = lm.Logo(df, font_name = 'DejaVu Sans')

    # style using Logo methods
    crp_logo.style_spines(visible=False)
    crp_logo.style_spines(spines=['left', 'bottom'], visible=True)
    crp_logo.style_xticks(rotation=0, fmt='%d', anchor=0)

    # style using Axes methods
    crp_logo.ax.set_ylabel("Unique sequences", labelpad=5)
    crp_logo.ax.set_xlabel("Positions (nt)", labelpad=5)
    crp_logo.ax.xaxis.set_ticks_position('none')
    crp_logo.ax.xaxis.set_tick_params(pad=-1)
    crp_logo.ax.set_title("Non-redundant")
    figure = plt.gcf()
    crp_logo.fig.savefig('logo2.png', dpi=300)


###################################################################################################################################################################################################################

def ssamples1(tem_names,tem_samp,non_names,non_samp,folder,pro):

    for i in range(2,len(tem_samp[0])):

       fp = open(folder+tem_names[i-2]+'.txt','w')
       fp.write("miRNA id"+"\t"+tem_names[i-2]+"\n")

       for x in tem_samp:
           fp.write("%s" % "\t".join([x[0],x[i]])+"\n")

       for j in range(len(non_names)):
           if non_names[j]==tem_names[i-2]:
              for x in non_samp:
                  fp.write("%s" % "\t".join([x[0],x[j+2]])+"\n")
       fp.close()

###################################################################################################################################################################################################################

def download_matures(matures,org_name):

    #url = 'ftp://mirbase.org/pub/mirbase/21/mature.fa.gz'
    url = 'ftp://mirbase.org/pub/mirbase/CURRENT/mature.fa.gz'
    data = urllib.request.urlopen(url).read()
    file_mirna = gzip.decompress(data).decode('utf-8')
    file_mirna = file_mirna.split("\n")

    for i in range(0,len(file_mirna)-1,2):

        if org_name in file_mirna[i]:
           matures.append(file_mirna[i])
           matures.append(file_mirna[i+1])

###################################################################################################################################################################################################################
def non_template_ref(sc,st,all_isoforms):

  pre_uni_seq_con = list(sc)
  pre_uni_seq_tre = list(st)

  for x in pre_uni_seq_con:
      for y in x:
          if ">"+y[2] not in all_isoforms and ")_" in y[2] :
             all_isoforms.append(">"+y[2])
             all_isoforms.append(y[9])


  for x in pre_uni_seq_tre:
      for y in x:
          if ">"+y[2] not in all_isoforms and ")_" in y[2]:
             all_isoforms.append(">"+y[2])
             all_isoforms.append(y[9])

################################################################################################################################################################################################

def deseqe2(sample,mir_names,l,new_d,sample_name,sample_order):

    for y in mir_names:
        flag=0
        for x in sample:
            if y[0]==x[0]:
               flag=1
               break
        if flag==0:
           sample.append([y[0],"0",y[1]])

    sample.sort(key=lambda x: x[0])
    sample=list(sample for sample,_ in itertools.groupby(sample))

    l.acquire()
    new_d.append(sample)
    sample_order.append(sample_name)
    l.release()

###############################################################################################################################################################################################

if __name__ == '__main__':

 starttime = time.time()

 q1 = Queue()
 q2 = Queue()
 lock = Lock()
 manager = Manager()

 mature_mirnas=manager.list()
 ps_mature=Process(target=download_matures,args=(mature_mirnas,args.org_name))
 ps_mature.start()

 args.control[0]=args.control[0][1:]
 args.control[len(args.control)-1][:-1]
 control = [(args.control[i:i+2]) for i in range(0, len(args.control), 2)]

 args.treated[0]=args.treated[0][1:]
 args.treated[len(args.treated)-1][:-1]
 treated = [(args.treated[i:i+2]) for i in range(0, len(args.treated), 2)]


##############  Detection of templated isoforms ################

 radar = manager.list([0,0,0,0])
 samples = manager.list()
 data= manager.list()
 names_con=manager.list()
 samples_mirna_names=manager.list()
 deseq=manager.list()
 unmap_seq=manager.Value('i',0)
 unmap_counts=manager.Value('i',0)
 LH2E_names=manager.list()
 ini_c_samples = manager.list()


 radar1 = manager.list([0,0,0,0])
 samples1 = manager.list()
 data1 = manager.list()
 names_tre = manager.list()
 samples_mirna_names1=manager.list()
 deseq1=manager.list()
 unmap1_seq = manager.Value('i',0)
 unmap1_counts = manager.Value('i',0)
 LH8E_names=manager.list()
 ini_t_samples = manager.list()
 ps_mature.join()


 mature_mirnas=list(mature_mirnas)


 starttime1 = time.time()
 ps_sam = [Process(target=sam,args=(mature_mirnas,path[1][:-1],path[0].split(",")[0],"c",lock,samples,data,names_con,unmap_seq,samples_mirna_names,deseq,LH2E_names,"0",ini_c_samples,unmap_counts)) for path in control]
 ps_sam.extend([Process(target=sam,args=(mature_mirnas,path[1][:-1],path[0].split(",")[0],"t",lock,samples1,data1,names_tre,unmap1_seq,samples_mirna_names1,deseq1,LH8E_names,"0",ini_t_samples,unmap1_counts)) for path in treated])

 [p.start() for p in ps_sam]
 [p.join() for p in ps_sam]
 print('SAM took {} seconds'.format(time.time() - starttime1))

 ps_hist=[Process(target=hist_red,args=(ini_c_samples,'c'))]
 ps_hist.extend([Process(target=hist_red,args=(ini_t_samples,'t'))])
 [x.start() for x in ps_hist]

 starttime200=time.time()

 sc = list(samples)
 st = list(samples1)

 names_con=list(names_con)
 names_tre=list(names_tre)
 samples_mirna_names=list(samples_mirna_names)
 samples_mirna_names.sort()
 samples_mirna_names=list(samples_mirna_names for samples_mirna_names,_ in itertools.groupby(samples_mirna_names))

 samples_mirna_names1=list(samples_mirna_names1)
 samples_mirna_names1.sort()
 samples_mirna_names1=list(samples_mirna_names1 for samples_mirna_names1,_ in itertools.groupby(samples_mirna_names1))

 deseq=list(deseq)
 deseq1=list(deseq1)

 new_names_con=manager.list()
 new_names_tre=manager.list()
 new_deseq=manager.list()
 new_deseq1=manager.list()
 ps_deseq=[Process(target=deseqe2,args=(sampp,samples_mirna_names,lock,new_deseq,names_con[i],new_names_con)) for i,sampp in enumerate(deseq)]
 ps_deseq.extend([Process(target=deseqe2,args=(sampp,samples_mirna_names1,lock,new_deseq1,names_tre[i],new_names_tre)) for i,sampp in enumerate(deseq1)])

 [z.start() for z in ps_deseq]
 [z.join() for z in ps_deseq]
 new_deseq=list(new_deseq)
 new_deseq1=list(new_deseq1)

 LH2E=[[x[0],x[2]] for x in new_deseq[0]]
 [LH2E[i].append(y[i][1]) for i,_ in enumerate(LH2E) for y in new_deseq]

 LH8E=[[x[0],x[2]] for x in new_deseq1[0]]
 [LH8E[i].append(y[i][1]) for i,_ in enumerate(LH8E) for y in new_deseq1]

 print('Deseq took {} seconds'.format(time.time() - starttime200))

 merg_nam_LH2E=manager.list()
 merg_nam_LH8E=manager.list()

 LH2E_copy=copy.deepcopy(list(LH2E))
 LH8E_copy=copy.deepcopy(list(LH8E))

 fil_sort_tre=manager.list()
 fil_sort_con=manager.list()
 raw_sort_tre=manager.list()
 raw_sort_con=manager.list()

 ps_main = Process(target=main_temp,args=(list(LH2E), samples_mirna_names, list(LH8E), samples_mirna_names1,1,list(names_con),list(names_tre),fil_sort_tre,fil_sort_con,raw_sort_tre,raw_sort_con))
 ps_main.start()

 if args.anal=="2":
    all_iso = manager.list()
    ps_non_iso = Process(target=non_template_ref,args=(sc,st,all_iso))
    ps_non_iso.start()

 ps_merge = [Process(target=merging_names,args=(LH2E_copy,merg_nam_LH2E))]
 ps_merge.extend([Process(target=merging_names,args=(LH8E_copy,merg_nam_LH8E))])
 [x.start() for x in ps_merge]
 [x.join() for x in ps_merge]

 merg_nam_LH2E=list(merg_nam_LH2E)
 merg_nam_LH8E=list(merg_nam_LH8E)

 starttime2 = time.time()
 procs = [Process(target=DB_write,args=(x[0],x[1],x[2],x[3],1)) for x in data]
 procs.extend([Process(target=DB_write,args=(x[0],x[1],x[2],x[3],1)) for x in data1])
 procs.extend([Process(target=make_spider,args=(merg_nam_LH2E,merg_nam_LH8E))])
 if args.anal == "1":
    procs.extend([Process(target=pie_temp,args=(merg_nam_LH2E,unmap_seq.value,unmap_counts.value,merg_nam_LH8E,unmap1_seq.value,unmap1_counts.value))])

 [p.start() for p in procs]


 if args.anal=="1":
    [x.join() for x in ps_hist]
    [p.join() for p in procs]
    ps_pdf = Process(target=pdf_before_DE,args=(args.anal))
    ps_pdf.start()

 print('Graphs took {} seconds'.format(time.time() - starttime2))

 ps_main.join()

 fil_sort_con=list(fil_sort_con)
 fil_sort_tre=list(fil_sort_tre)
 if fil_sort_con==[]:
    fil_sort_con=raw_sort_con
    fil_sort_tre=raw_sort_tre

 raw_sort_con=list(raw_sort_con)
 raw_sort_tre=list(raw_sort_tre)
 names_con=list(new_names_con)
 names_tre=list(new_names_tre)

 ps_write = Process(target=write_main,args=(raw_sort_con, raw_sort_tre, fil_sort_con, fil_sort_tre, names_con,names_tre,1))
 ps_write.start()

 ps1_matrix = [Process(target=ssamples,args=(names_con,fil_sort_con,"Diff/temp_con/",0))]
 ps1_matrix.extend([Process(target=ssamples,args=(names_tre,fil_sort_tre,"Diff/temp_tre/",0))])
 [p.start() for p in ps1_matrix]

 if args.anal=="1":
    ps_pdf.join()
 if args.anal=="2":
    [p.join() for p in procs]
    [x.join() for x in ps_hist]

 ps_write.join()
 [p.join() for p in ps1_matrix]


############################## Detection of Both  #######################################

 starttime10 = time.time()

 if args.anal == "2":

  n_data= manager.list()
  n_names_con=manager.list()
  n_samples_mirna_names=manager.list()
  n_deseq=manager.list()
  n_LH2E_names=manager.list()

  n_data1 = manager.list()
  n_names_tre = manager.list()
  n_samples_mirna_names1=manager.list()
  n_deseq1=manager.list()
  n_LH8E_names=manager.list()


  new_mat_mirnas = list(mature_mirnas)
  ps_non_iso.join()

  all_iso=list(all_iso)
  new_mat_mirnas.extend(all_iso)

  starttime11=time.time()

  ps_sam = [Process(target=non_sam,args=(new_mat_mirnas,path[1][:-1],path[0].split(",")[0],"c",lock,n_data,n_names_con,n_deseq,n_samples_mirna_names,n_LH2E_names)) for path in control]
  ps_sam.extend([Process(target=non_sam,args=(new_mat_mirnas,path[1][:-1],path[0].split(",")[0],"t",lock,n_data1,n_names_tre,n_deseq1,n_samples_mirna_names1,n_LH8E_names)) for path in treated])

  [p.start() for p in ps_sam]
  [p.join() for p in ps_sam]

  print('Non-sam took {} seconds'.format(time.time() - starttime11))

  starttime12=time.time()

  n_names_con=list(n_names_con)
  n_names_tre=list(n_names_tre)
  n_samples_mirna_names=list(n_samples_mirna_names)
  n_samples_mirna_names.sort()
  n_samples_mirna_names=list(n_samples_mirna_names for n_samples_mirna_names,_ in itertools.groupby(n_samples_mirna_names))

  n_samples_mirna_names1=list(n_samples_mirna_names1)
  n_samples_mirna_names1.sort()
  n_samples_mirna_names1=list(n_samples_mirna_names1 for n_samples_mirna_names1,_ in itertools.groupby(n_samples_mirna_names1))

  n_deseq=list(n_deseq)
  n_deseq1=list(n_deseq1)

  new_n_names_con=manager.list()
  new_n_names_tre=manager.list()
  n_new_deseq=manager.list()
  n_new_deseq1=manager.list()
  ps_deseq=[Process(target=deseqe2,args=(sampp,n_samples_mirna_names,lock,n_new_deseq,n_names_con[i],new_n_names_con)) for i,sampp in enumerate(n_deseq)]
  ps_deseq.extend([Process(target=deseqe2,args=(sampp,n_samples_mirna_names1,lock,n_new_deseq1,n_names_tre[i],new_n_names_tre)) for i,sampp in enumerate(n_deseq1)])

  [x.start() for x in ps_deseq]
  [x.join() for x in ps_deseq]
  n_new_deseq=list(n_new_deseq)
  n_new_deseq1=list(n_new_deseq1)

  print([len(n_new_deseq[0]),len(n_new_deseq[1])])
  print([len(n_new_deseq1[0]),len(n_new_deseq1[1])])

  n_LH2E=[[x[0],x[2]] for x in n_new_deseq[0]]
  [n_LH2E[i].append(y[i][1]) for i,_ in enumerate(n_LH2E) for y in n_new_deseq]

  n_LH8E=[[x[0],x[2]] for x in n_new_deseq1[0]]
  [n_LH8E[i].append(y[i][1]) for i,_ in enumerate(n_LH8E) for y in n_new_deseq1]

  print('Non-deseq took {} seconds'.format(time.time() - starttime12))

  merg_nam_n_LH2E=manager.list()
  merg_nam_n_LH8E=manager.list()

  n_LH2E_copy=copy.deepcopy(list(n_LH2E))
  n_LH8E_copy=copy.deepcopy(list(n_LH8E))

  n_sort_tre=manager.list()
  n_sort_con=manager.list()

  n_fil_sort_con=manager.list()
  n_fil_sort_tre=manager.list()
  n_raw_sort_con=manager.list()
  n_raw_sort_tre=manager.list()

  ps_main = Process(target=main_temp,args=(list(n_LH2E), n_samples_mirna_names, list(n_LH8E), n_samples_mirna_names1,1,list(n_names_con),list(n_names_tre),n_fil_sort_tre,n_fil_sort_con,n_raw_sort_tre,n_raw_sort_con))
  ps_main.start()

  ps_merge = [Process(target=merging_names,args=(n_LH2E_copy,merg_nam_n_LH2E))]
  ps_merge.extend([Process(target=merging_names,args=(n_LH8E_copy,merg_nam_n_LH8E))])
  [p.start() for p in ps_merge]
  [p.join() for p in ps_merge]

  merg_nam_n_LH2E=list(merg_nam_n_LH2E)
  merg_nam_n_LH8E=list(merg_nam_n_LH8E)

  procs = [Process(target=DB_write,args=(x[0],x[1],x[2],x[3],2)) for x in n_data]
  procs.extend([Process(target=DB_write,args=(x[0],x[1],x[2],x[3],2)) for x in n_data1])
  procs.extend([Process(target=logo_seq_red,args=(merg_nam_n_LH2E,'c'))])
  procs.extend([Process(target=logo_seq_red,args=(merg_nam_n_LH8E,'t'))])
  procs.extend([Process(target=pie_non_temp,args=(merg_nam_LH2E,merg_nam_n_LH2E,merg_nam_LH8E,merg_nam_n_LH8E,unmap_seq.value,unmap1_seq.value,unmap_counts.value,unmap1_counts.value))])

  starttime13=time.time()
  [p.start() for p in procs]
  [p.join() for p in procs]

  print('Graphs took {} seconds'.format(time.time() - starttime13))

  procs1 = Process(target=pdf_before_DE,args=(args.anal))
  procs1.start()

  starttime14=time.time()
  ps_main.join()

  n_fil_sort_con=list(n_fil_sort_con)
  n_fil_sort_tre=list(n_fil_sort_tre)
  if n_fil_sort_con==[]:
     n_fil_sort_con=n_raw_sort_con
     n_fil_sort_tre=n_raw_sort_tre

  n_raw_sort_con=list(n_raw_sort_con)
  n_raw_sort_tre=list(n_raw_sort_tre)
  n_names_con=list(new_n_names_con)
  n_names_tre=list(new_n_names_tre)

  ps_write = Process(target=write_main,args=(n_raw_sort_con, n_raw_sort_tre,n_fil_sort_con, n_fil_sort_tre, n_names_con, n_names_tre,2))
  ps_write.start()

  ps1_matrix = [Process(target=ssamples1,args=(n_names_con,n_fil_sort_con,names_con,fil_sort_con,"Diff/n_temp_con/",0))]
  ps1_matrix.extend([Process(target=ssamples1,args=(n_names_tre,n_fil_sort_tre,names_tre,fil_sort_tre,"Diff/n_temp_tre/",0))])
  [p.start() for p in ps1_matrix]

  ps_write.join()
  [p.join() for p in ps1_matrix]
  procs1.join()
 print('That took {} seconds'.format(time.time() - starttime10))
 print('That took {} seconds'.format(time.time() - starttime))