Mercurial > repos > glogobyte > viztool
changeset 0:258aaaa465f3 draft
Uploaded
author | glogobyte |
---|---|
date | Fri, 16 Oct 2020 10:48:17 +0000 |
parents | |
children | 561b0abcae87 |
files | mirbase_functions.py |
diffstat | 1 files changed, 829 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
# Source: b/mirbase_functions.py, added Fri Oct 16 10:48:17 2020 +0000
# (re-formatted from the flattened changeset view).
import itertools
import time
import sys
import os
import urllib.request
import gzip
from multiprocessing import Process, Queue, Lock, Pool, Manager, Value
import subprocess
import argparse
from collections import OrderedDict
from matplotlib.backends.backend_pdf import PdfPages
import pandas as pd
from math import pi
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import PercentFormatter
import seaborn as sns
import scipy.stats as stats
from plotnine import *
import math
import re
import matplotlib.ticker as mtick
import copy


"""---------------------- Simple Functions -----------------------"""

# Read a file and return it as a list
def read(path, flag):
    """Return the lines of the text file at *path*.

    flag == 0: lines keep their trailing newline (``readlines``).
    flag == 1: lines are returned without line terminators (``splitlines``).
    Any other flag returns ``None``.
    """
    if flag == 0:
        with open(path) as fp:
            return fp.readlines()

    if flag == 1:
        with open(path) as fp:
            return fp.read().splitlines()

    return None

# Write a list to a txt file
def write(path, list):
    """Write every item of *list* to *path*.

    For each item ``x`` the slice ``x[1:-1]`` is joined with tabs and written
    as-is; no separator or newline is added between items.
    """
    with open(path, 'w') as fp:
        for x in list:
            fp.write(str("\t".join(x[1:-1])))

"""---------------------- RNA-seq Functions ----------------------"""

# Detect the longest common substring sequence between two mirnas
def longestSubstring(str1, str2):
    """Return the longest substring shared by *str1* and *str2*.

    When the two strings share nothing, a message is printed and ``None`` is
    returned.
    """
    from difflib import SequenceMatcher

    seqMatch = SequenceMatcher(None, str1, str2)

    # The match looks like Match(a=0, b=0, size=5); ``a`` indexes into str1.
    match = seqMatch.find_longest_match(0, len(str1), 0, len(str2))

    if match.size != 0:
        return str1[match.a: match.a + match.size]

    print('No longest common sub-string found')
    return None


###############################################################################

def collapse_sam(path):
    """Re-label the alignment rows of a SAM file by (reference, sequence).

    Header rows (first field contains "@") are returned first, unchanged.
    Alignment rows follow, grouped by their (field 2, field 9) pair in order of
    first appearance. Within group number G (1-based) the K-th row gets
    ``"G-K"`` as field 0 and a fixed filler string as field 10.

    Returns the list of rows, each row being a list of tab-split fields.
    """
    ini_sam = read(path, 0)

    intro_sam = []
    # (reference, sequence) -> rows, in order of first appearance. A dict
    # lookup replaces the original linear search through a list of pairs.
    groups = OrderedDict()
    for line in ini_sam:
        fields = line.rstrip("\n").split("\t")
        if "@" in fields[0]:
            intro_sam.append(fields)
        else:
            groups.setdefault((fields[2], fields[9]), []).append(fields)

    new_main_sam = []
    for incr_num, rows in enumerate(groups.values(), start=1):
        for count, row in enumerate(rows, start=1):
            row[10] = "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
            row[0] = str(incr_num) + "-" + str(count)
            new_main_sam.append(row)

    return intro_sam + new_main_sam

###############################################################################

# Signed or unsigned integer, as produced for numbered chromosomes below.
_SIGNED_INT = re.compile(r"[-+]?\d+$")


def _chrom_label(chrom):
    """Turn a chr_order entry into its ``_chrN(strand)`` name fragment."""
    if _SIGNED_INT.match(str(chrom)) is not None:
        strand = "(-)" if int(chrom) < 0 else "(+)"
        return "_chr" + str(abs(int(chrom))) + strand
    return "_chr" + str(chrom)


def duplicate_chroms_isoforms(List):
    """Merge the names of reads that map to the same miRNA on several loci.

    Rows are indexed like SAM rows: field 0 is the read id, field 2 the
    reference name and field 9 the sequence. Rows with the same sequence and
    read id whose reference names share the text before the first "_" are
    given one combined name listing every chromosome fragment.

    *List* itself is modified: matching rows are renamed and the list is
    sorted in place.

    Returns ``(new_list, new_double_List)``: the de-duplicated sorted *List*
    and a de-duplicated sorted copy in which names with more than three
    "_"-separated parts were renamed as well (keeping their last two parts).
    """
    # One entry per distinct [sequence, read id, reference name].
    dupes = []
    seen = set()
    for row in List:
        key = (row[9], row[0], row[2])
        if key not in seen:
            seen.add(key)
            dupes.append([row[9], row[0], row[2]])

    # Attach to every entry the other reference names of the same read.
    for x in List:
        for y in dupes:
            if x[9] == y[0] and x[0] == y[1] and x[2].split("_")[0] == y[2].split("_")[0] and x[2] != y[2]:
                y.append(x[2])

    double_List = [x[:] for x in List]

    chr_order = []
    for x in dupes:
        temp = []
        for i in range(2, len(x)):
            locus = x[i].split("chr")[1]
            if locus.split("(")[0].isdigit():
                # Signed chromosome number: the sign is the strand character.
                temp.append(int(locus.split("(")[1][0] + locus.split("(")[0]))
            else:
                temp.append(locus[0:4])

        # Numbers and X/Y labels cannot be sorted together, so compare them
        # all as strings once an X/Y label is present.
        if any(z in ('X(-)', 'Y(-)', 'X(+)', 'Y(+)') for z in temp):
            temp = [str(j) for j in temp]
        temp = list(set(temp))
        temp.sort()
        chr_order.append(temp)

    final_dupes = []
    for i in range(len(dupes)):
        merged_name = dupes[i][2].split("_")[0]
        # NOTE(review): chr_order entries were de-duplicated through set(),
        # so this test looks always true and the prefixed variant unreachable.
        all_distinct = len(chr_order[i]) == len(set(chr_order[i]))
        for x in chr_order[i]:
            if all_distinct:
                merged_name = merged_name + _chrom_label(x)
            else:
                merged_name = merged_name + dupes[i][2].split("_")[1] + _chrom_label(x)
        final_dupes.append([dupes[i][0], merged_name, dupes[i][1]])

    final_dupes.sort()
    final_dupes = [entry for entry, _ in itertools.groupby(final_dupes)]

    for i in range(len(double_List)):
        for x in final_dupes:

            if double_List[i][9] == x[0] and double_List[i][0] == x[2] and len(double_List[i][2].split("_")) > 3 and double_List[i][2].split("_")[0] == x[1].split("_")[0]:
                parts = double_List[i][2].split("_")
                double_List[i][2] = x[1] + "_" + parts[-2] + "_" + parts[-1]

            if double_List[i][9] == x[0] and double_List[i][0] == x[2] and len(double_List[i][2].split("_")) == 3 and double_List[i][2].split("_")[0] == x[1].split("_")[0]:
                double_List[i][2] = x[1]
                List[i][2] = x[1]

    List.sort()
    new_list = [row for row, _ in itertools.groupby(List)]

    double_List.sort()
    new_double_List = [row for row, _ in itertools.groupby(double_List)]

    return new_list, new_double_List


###############################################################################

def sam(mature_mirnas,path,name,con,l,samples,data,names,unmap_seq,samples_mirna_names,deseq,LHE_names,umi,ini_sample,unmap_counts):
    """Annotate the mapped reads of one sample and publish the results.

    Reads the SAM file at *path*, keeps rows with flag '0' and a sequence of
    18 to 26 characters, and compares each one with the mature sequence that
    follows its header in *mature_mirnas*. Reads that differ from the mature
    sequence only at their ends get ``_<5' shift>_<3' shift>`` appended to
    their reference name; reads that mismatch on a side are dropped.

    When *con* is "c" or "t" the results are appended to the shared
    containers while *l* is held:

    * samples_mirna_names: ``[name, sequence]`` per read
    * deseq: one list of ``[name, count, sequence]`` for the sample
    * LHE_names: every read name
    * unmap_seq.value / unmap_counts.value: number of rows with flag '4' and
      the sum of their counts
    * names, samples, data, ini_sample: the sample name, the de-duplicated
      reads, ``[con, name, reads, per-miRNA totals]`` and all alignment rows

    *umi* is not used here.
    """
    # read the sam file
    ini_sam = read(path, 0)
    new_main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]]
    unique_seq = [x for x in new_main_sam if x[1] == '0' and 18 <= len(x[9]) <= 26]

    sorted_uni_arms = []

    for i in range(len(mature_mirnas)):
        mirna_id = mature_mirnas[i].split(" ")[0][1:]
        tmp_count_reads = 0  # calculate the total number of reads
        tmp_count_seq = 0  # calculate the total number of sequences
        dropped = False
        for j in range(len(unique_seq)):

            ref_name = unique_seq[j][2].split("_")[0]
            # Names carrying a "{...}" tag lose their last four characters.
            official = ref_name[:-4] if "{" in ref_name else ref_name

            if mirna_id != official:
                continue

            temp_mature = mature_mirnas[i + 1].strip().replace("U", "T")
            off_part = longestSubstring(temp_mature, unique_seq[j][9])

            mat_diff = temp_mature.split(off_part)
            mat_diff = [len(mat_diff[0]), len(mat_diff[1])]

            unique_diff = unique_seq[j][9].split(off_part)
            unique_diff = [len(unique_diff[0]), len(unique_diff[1])]

            # Problem with hsa-miR-8485
            if (mat_diff[1] != 0 and unique_diff[1] != 0) or (mat_diff[0] != 0 and unique_diff[0] != 0):
                # Both sequences have extra bases on the same side: mark the
                # read for removal once this pass is over.
                unique_seq[j] = 1
                dropped = True
                continue

            pre_pos = mat_diff[0] - unique_diff[0]
            post_pos = unique_diff[1] - mat_diff[1]
            tmp_count_reads = tmp_count_reads + int(unique_seq[j][0].split("-")[1])
            tmp_count_seq = tmp_count_seq + 1

            if pre_pos != 0 or post_pos != 0:
                if pre_pos == 0:
                    suffix = str(pre_pos) + "_" + '{:+d}'.format(post_pos)
                elif post_pos == 0:
                    suffix = '{:+d}'.format(pre_pos) + "_" + str(post_pos)
                else:
                    suffix = '{:+d}'.format(pre_pos) + "_" + '{:+d}'.format(post_pos)
                unique_seq[j][2] = unique_seq[j][2] + "_" + suffix

        if dropped:
            unique_seq = [row for row in unique_seq if row != 1]
        if tmp_count_reads != 0 and tmp_count_seq != 0:
            sorted_uni_arms.append([mirna_id, tmp_count_seq, tmp_count_reads])

    sorted_uni_arms = sorted(sorted_uni_arms, key=lambda x: x[1], reverse=True)
    dedup_unique_seq, double_fil_uni_seq = duplicate_chroms_isoforms(unique_seq)

    # Recompute the per-miRNA totals from the merged read list.
    for y in sorted_uni_arms:
        counts = 0
        seqs = 0
        for x in double_fil_uni_seq:
            if y[0] == x[2].split("_")[0]:
                counts += int(x[0].split("-")[1])
                seqs += 1

        y[1] = seqs
        y[2] = counts

    l.acquire()
    try:
        # Release the lock even if publishing fails, so that other workers
        # sharing it are not blocked forever.
        if con == "c" or con == "t":
            for y in double_fil_uni_seq:
                samples_mirna_names.append([y[2], y[9]])
            deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in double_fil_uni_seq])
            LHE_names.extend([z[2] for z in double_fil_uni_seq])
            unmap_seq.value += sum([1 for x in new_main_sam if x[1] == '4'])
            unmap_counts.value += sum([int(x[0].split("-")[1]) for x in new_main_sam if x[1] == '4'])
            names.append(name)
            samples.append(dedup_unique_seq)
            data.append([con, name, double_fil_uni_seq, sorted_uni_arms])
            ini_sample.append(new_main_sam)
    finally:
        l.release()


###############################################################################

# Read a sam file from Bowtie and do the following:
#
# 1) Remove reverse stranded mapped reads
# 2) Remove unmapped reads
# 3) Remove all sequences with reads less than 11 reads
# 4) Sort the arms with the most sequences in decreasing rate
# 5) Sort the sequences of every arm with the most reads in decreasing rate
# 6) Calculate total number of sequences of every arm
# 7) Calculate total number of reads of sequences of every arm.
# 8) Store all the information in a txt file


def non_sam(mature_mirnas,path,name,con,l,data,names,n_deseq,n_samples_mirna_names,n_LHE_names):
    """Find non-templated additions among the unmapped reads of one sample.

    Reads the SAM file at *path* and keeps rows with flag '4' and a sequence
    of 18 to 26 characters. *mature_mirnas* is read as alternating header /
    sequence entries. A read that starts with a mature sequence is renamed
    ``<miRNA>__0_<+n>_<extra bases>`` where n is the number of extra bases
    after the mature sequence.

    When *con* is "c" or "t" the results are appended to the shared
    containers while *l* is held: ``[name, sequence]`` pairs to
    n_samples_mirna_names, one ``[name, count, sequence]`` list to n_deseq,
    the read names to n_LHE_names, *name* to names and
    ``[con, name, reads, per-miRNA totals]`` to data.
    """
    ini_sam = read(path, 0)
    new_main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]]
    unmapped = [x for x in new_main_sam if x[1] == '4' and 18 <= len(x[9]) <= 26]

    uni_seq = []
    # Calculate the shifted positions for every isomir and add them to the name of it
    sorted_uni_arms = []
    for i in range(1, len(mature_mirnas), 2):
        tmp_count_reads = 0  # calculate the total number of reads
        tmp_count_seq = 0  # calculate the total number of sequences
        temp_mature = mature_mirnas[i].strip().replace("U", "T")
        mirna_id = mature_mirnas[i - 1].split(" ")[0][1:]

        for row in unmapped:
            if temp_mature not in row[9]:
                continue

            off_part = longestSubstring(temp_mature, row[9])

            mat_diff = temp_mature.split(off_part)
            mat_diff = [len(mat_diff[0]), len(mat_diff[1])]

            unique_diff = row[9].split(off_part)
            # NOTE(review): the indentation of this part was lost in the
            # flattened source; reads in which the shared part occurs more
            # than once are skipped here, the only reading that cannot fail.
            if len(unique_diff) > 2:
                continue
            unique_diff = [len(unique_diff[0]), len(unique_diff[1])]

            pre_pos = mat_diff[0] - unique_diff[0]
            post_pos = unique_diff[1] - mat_diff[1]

            if pre_pos == 0:
                tmp_count_reads = tmp_count_reads + int(row[0].split("-")[1])
                tmp_count_seq = tmp_count_seq + 1

                t_name = row.copy()
                t_name[2] = mirna_id + "__" + str(pre_pos) + "_" + '{:+d}'.format(post_pos) + "_" + str(row[9][len(off_part):])
                uni_seq.append(t_name)

        if tmp_count_reads != 0 and tmp_count_seq != 0:
            sorted_uni_arms.append([mirna_id, tmp_count_seq, tmp_count_reads])

    sorted_uni_arms = sorted(sorted_uni_arms, key=lambda x: x[1], reverse=True)
    # Drop exact duplicate rows while keeping the first-seen order.
    unique_seq = list(map(list, OrderedDict.fromkeys(map(tuple, uni_seq))))

    l.acquire()
    try:
        # Release the lock even if publishing fails.
        if con == "c" or con == "t":
            named = [x for x in unique_seq if x[2] != "*"]
            for x in named:
                n_samples_mirna_names.append([x[2], x[9]])
            n_deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in named])
            n_LHE_names.extend([x[2] for x in named])
            names.append(name)
            data.append([con, name, unique_seq, sorted_uni_arms])
    finally:
        l.release()

###############################################################################

def _append_missing_mirnas(sample, mir_names):
    """Append a ``[name, "0", sequence]`` row for each name absent from *sample*."""
    present = {row[0] for row in sample}
    for entry in mir_names:
        if entry[0] not in present:
            sample.append([entry[0], "0", entry[1]])
            present.add(entry[0])


def deseq2_temp(samples_mirna_names,deseq,con,l,con_queue=None,tre_queue=None):
    """Build one count table from the per-sample lists in *deseq*.

    Every sample list is completed with a zero count for each name of
    *samples_mirna_names* it lacks and sorted by name (both in place). The
    table has one row per entry of the first sample: ``[name, sequence]``
    followed by the matching counts of every sample, in sample order.

    The table is put on *con_queue* when *con* is "c" and on *tre_queue*
    when *con* is "t", while *l* is held. When a queue argument is omitted
    the module-level ``q1`` / ``q2`` is used, as the function always did;
    those names are not defined in this file.
    """
    # The key used to be the constant ``[0]``, which left the list unsorted.
    samples_mirna_names.sort(key=lambda row: row[0])

    for sample in deseq:
        _append_missing_mirnas(sample, samples_mirna_names)
        sample.sort(key=lambda row: row[0])

    deseq_final = [[row[0], row[2]] for row in deseq[0]] if deseq else []

    # name -> counts, per sample, so each table row is filled by lookup.
    counts_per_sample = []
    for sample in deseq:
        counts = {}
        for row in sample:
            counts.setdefault(row[0], []).append(row[1])
        counts_per_sample.append(counts)

    for final_row in deseq_final:
        mirna = final_row[0]
        for counts in counts_per_sample:
            final_row.extend(counts.get(mirna, []))

    l.acquire()
    try:
        if con == "c":
            (con_queue if con_queue is not None else q1).put(deseq_final)

        if con == "t":
            (tre_queue if tre_queue is not None else q2).put(deseq_final)
    finally:
        l.release()


###############################################################################

def _merged_names_by_sequence(rows):
    """Return ``{sequence: merged name}`` for sequences that need one name.

    Rows are ``[name, sequence, ...]``. A sequence qualifies when one of its
    rows repeats a sequence or a name seen in an earlier row. Its names are
    collected one per prefix (the text before the first "_"), a strictly
    shorter name replacing a longer one with the same prefix, and joined
    with "/": the first name, then the remaining ones in reverse order.
    """
    seen = set()
    shared = {}
    for row in rows:
        if row[1] not in seen and row[0] not in seen:
            seen.add(row[1])
            seen.add(row[0])
        else:
            shared.setdefault(row[1], [])

    for row in rows:
        collected = shared.get(row[1])
        if collected is None:
            continue
        prefix = row[0].split("_")[0]
        for i, known in enumerate(collected):
            if known.split("_")[0] == prefix:
                if len(row[0]) < len(known):
                    del collected[i]
                    collected.append(row[0])
                break
        else:
            collected.append(row[0])

    return {seq: "/".join([found[0]] + found[:0:-1]) for seq, found in shared.items()}


def _apply_merged_names(rows, merged):
    """Rename, in place, every row whose sequence has a merged name."""
    for row in rows:
        if row[1] in merged:
            row[0] = merged[row[1]]


def main_temp(LH2E, LH2E_names, LH8E, LH8E_names,flag,names_con,names_tre,filter_LH8E,filter_LH2E,raw_LH8E,raw_LH2E,per,count):
    """Align the two count tables and fill the raw and filtered outputs.

    *LH2E* and *LH8E* hold ``[name, sequence, count, ...]`` rows and the
    ``*_names`` lists the ``[name, sequence]`` pairs of each table. Entries
    present in only one table are added to the other with "0" counts, names
    sharing a sequence are merged, and both tables are sorted and
    de-duplicated. Both input tables are sorted in place and their rows may
    be renamed.

    The resulting tables are appended to *raw_LH2E* / *raw_LH8E*. Unless
    ``int(per) == -1``, row i of both tables is also appended to
    *filter_LH2E* / *filter_LH8E* when, in either table, the number of
    counts >= *count* reaches *per* percent of the count columns.

    *flag*, *names_con* and *names_tre* are not used here.
    """
    known_tre = {tuple(x) for x in LH8E_names}
    known_con = {tuple(x) for x in LH2E_names}
    LH8E_add_names = sorted(x for x in LH2E_names if tuple(x) not in known_tre)
    LH2E_add_names = sorted(x for x in LH8E_names if tuple(x) not in known_con)
    LH8E_add_names = [x for x, _ in itertools.groupby(LH8E_add_names)]
    LH2E_add_names = [x for x, _ in itertools.groupby(LH2E_add_names)]

    LH2E.sort()
    LH8E.sort()
    LH2E = [row for row, _ in itertools.groupby(LH2E)]
    LH8E = [row for row, _ in itertools.groupby(LH8E)]

    print("LHE_names")
    print([len(LH8E_add_names),len(LH2E_add_names)])
    print([len(LH8E),len(LH2E)])

    # Pad with new rows; extending the name pairs themselves would modify
    # the caller's *_names lists.
    zeros = ["0"] * (len(LH8E[0]) - 2)
    LH8E = LH8E + [list(x) + zeros for x in LH8E_add_names]

    zeros = ["0"] * (len(LH2E[0]) - 2)
    LH2E = LH2E + [list(x) + zeros for x in LH2E_add_names]

    merged = _merged_names_by_sequence(LH2E)
    _apply_merged_names(LH2E, merged)
    _apply_merged_names(LH8E, merged)

    LH2E.sort()
    LH2E = [row for row, _ in itertools.groupby(LH2E)]

    LH8E.sort()
    LH8E = [row for row, _ in itertools.groupby(LH8E)]

    if int(per) != -1:
        percent = int(per) / 100
        print(percent)
        print(count)

        min_count = int(count)
        # Row 0 gives the column count; indexing row 1 failed on tables with
        # a single row.
        c_col_filter = round(percent * (len(LH2E[0]) - 2))
        t_col_filter = round(percent * (len(LH8E[0]) - 2))

        for i, con_row in enumerate(LH2E):
            tre_row = LH8E[i]
            c_cols = sum(1 for value in con_row[2:] if int(value) >= min_count)
            t_cols = sum(1 for value in tre_row[2:] if int(value) >= min_count)

            if c_cols >= c_col_filter or t_cols >= t_col_filter:
                filter_LH8E.append(tre_row)
                filter_LH2E.append(con_row)

    raw_LH2E.extend(LH2E)
    raw_LH8E.extend(LH8E)

###############################################################################

def _write_counts_table(path, sample_names, rows):
    """Write a tab-separated count table: a header line, then one line per row."""
    with open(path, 'w') as fp:
        fp.write("Name\t")
        fp.write("Sequence")
        for sample in sample_names:
            fp.write("\t" + sample)

        for row in rows:
            fp.write("\n%s" % "\t".join(row))


def write_main(raw_LH2E, raw_LH8E, fil_LH2E, fil_LH8E, names_con, names_tre, flag, per, n1, n2):
    """Write the count tables into the ``Counts/`` directory.

    *flag* 1 writes "Templated" files and *flag* 2 "Non-Templated" files;
    any other value writes nothing. The "Raw" tables are always written and
    the "Filtered" ones only when ``int(per) != -1``. Files labelled *n2*
    hold the ``*_LH8E`` rows under the *names_tre* columns, files labelled
    *n1* the ``*_LH2E`` rows under the *names_con* columns.
    """
    if flag == 1:
        kind = ' Templated Counts'
    elif flag == 2:
        kind = ' Non-Templated Counts'
    else:
        return

    if int(per) != -1:
        _write_counts_table('Counts/Filtered ' + n2 + kind, names_tre, fil_LH8E)
        _write_counts_table('Counts/Filtered ' + n1 + kind, names_con, fil_LH2E)

    _write_counts_table('Counts/Raw ' + n2 + kind, names_tre, raw_LH8E)
    _write_counts_table('Counts/Raw ' + n1 + kind, names_con, raw_LH2E)


###############################################################################

def ssamples(names,samp,folder,pro):
    """Write one ``<folder><sample>.txt`` two-column file per count column.

    *samp* rows are ``[name, sequence, count, ...]``; column ``i + 2`` goes
    to the file named after ``names[i]``. Nothing is written for an empty
    table. *pro* is not used.
    """
    if not samp:
        return

    for i in range(2, len(samp[0])):
        with open(folder + names[i - 2] + '.txt', 'w') as fp:
            fp.write("miRNA id" + "\t" + names[i - 2] + "\n")

            for x in samp:
                fp.write("\t".join([x[0], x[i]]) + "\n")

###############################################################################

_DB_FOLDERS = {(1, "c"): 'split1/', (1, "t"): 'split2/',
               (2, "c"): 'split3/', (2, "t"): 'split4/'}
_DB_RULE = "*********************************************************************************************************\n"


def DB_write(con,name,unique_seq,sorted_uni_arms,f):
    """Write the per-miRNA report of one sample.

    The file is ``split1/`` or ``split2/`` + *name* for templated reads
    (f == 1, con "c" or "t") and ``split3/`` or ``split4/`` + *name* for
    non-templated reads (f == 2). Each entry of *sorted_uni_arms*
    (``[miRNA, sequence count, read count]``) gets a banner followed by its
    reads, highest count first. Any other *f* writes nothing.

    Raises ValueError when *con* is neither "c" nor "t".
    """
    if f not in (1, 2):
        return

    folder = _DB_FOLDERS.get((f, con))
    if folder is None:
        raise ValueError("con must be 'c' or 't', got %r" % (con,))

    # Write a txt file with all the information
    with open(folder + name, 'w') as fp:
        fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence"))

        for arm in sorted_uni_arms:
            if f == 1:
                # NOTE(review): substring test here, but equality for f == 2.
                temp = [row for row in unique_seq if arm[0] in row[2].split("_")[0]]
            else:
                temp = [row for row in unique_seq if arm[0] == row[2].split("__")[0]]
                if not temp:
                    continue

            temp.sort(key=lambda x: int(x[0].split('-')[1]), reverse=True)
            fp.write(_DB_RULE)
            fp.write("%-8s\t%-22s\t%-25s\t%-30s\t%s\n" % ("|",str(arm[0]),"Sequence count = "+str(arm[1]),"Total reads = "+str(arm[2]),"|"))
            fp.write(_DB_RULE + "\n")
            for x in temp:
                fp.write("%-8s\t%-40s\t%s\n" % (x[0].split("-")[1], x[2], x[9]))
            fp.write("\n" + "\n")


###############################################################################

def new_mat_seq(pre_unique_seq,mat_mirnas,l):
    """Append to *mat_mirnas* the isoforms that out-count a three-part name.

    For every row whose name has exactly three "_"-separated parts, each row
    whose name contains that name and whose count is higher is selected once.
    Its name is prefixed with ">" in place, so *pre_unique_seq* rows are
    modified. The selected names and sequences are then appended to
    *mat_mirnas* as alternating entries while *l* is held.
    """
    unique_iso = []
    for x in pre_unique_seq:
        if len(x[2].split("_")) != 3:
            continue
        for y in pre_unique_seq:
            if x[2] in y[2] and int(x[0].split("-")[1]) < int(y[0].split("-")[1]):
                if not any(y[2] in chosen for chosen in unique_iso):
                    y[2] = ">" + y[2]
                    unique_iso.append(y)

    l.acquire()
    try:
        for x in unique_iso:
            mat_mirnas.append(x[2])
            mat_mirnas.append(x[9])
    finally:
        l.release()

###############################################################################

def merging_names(LH2E_copy,new):
    """Merge the names of rows sharing a sequence and extend *new* with them.

    Rows are ``[name, sequence, ...]``. Rows of *LH2E_copy* are renamed and
    the list is sorted in place; the de-duplicated rows are appended to *new*.
    """
    _apply_merged_names(LH2E_copy, _merged_names_by_sequence(LH2E_copy))

    LH2E_copy.sort()
    new.extend(row for row, _ in itertools.groupby(LH2E_copy))


###############################################################################

def ssamples1(tem_names,tem_samp,non_names,non_samp,folder,pro):
    """Write one ``<folder><sample>.txt`` file per templated count column.

    Each file lists the templated counts of the sample, followed by the
    non-templated counts of every column of *non_samp* whose name in
    *non_names* equals the sample name. Nothing is written when *tem_samp*
    is empty. *pro* is not used.
    """
    if not tem_samp:
        return

    for i in range(2, len(tem_samp[0])):
        sample = tem_names[i - 2]
        with open(folder + sample + '.txt', 'w') as fp:
            fp.write("miRNA id" + "\t" + sample + "\n")

            for x in tem_samp:
                fp.write("\t".join([x[0], x[i]]) + "\n")

            for j, other in enumerate(non_names):
                if other == sample:
                    for x in non_samp:
                        fp.write("\t".join([x[0], x[j + 2]]) + "\n")

###############################################################################

def download_matures(matures,org_name):
    """Download miRBase ``mature.fa.gz`` and keep the entries of one organism.

    Every even-indexed line containing *org_name*, and the line after it,
    are appended to *matures*.
    """
    #url = 'ftp://mirbase.org/pub/mirbase/21/mature.fa.gz'
    url = 'ftp://mirbase.org/pub/mirbase/CURRENT/mature.fa.gz'
    with urllib.request.urlopen(url) as response:
        data = response.read()
    file_mirna = gzip.decompress(data).decode('utf-8').split("\n")

    for i in range(0, len(file_mirna) - 1, 2):
        if org_name in file_mirna[i]:
            matures.append(file_mirna[i])
            matures.append(file_mirna[i + 1])

###############################################################################
def non_template_ref(sc,st,all_isoforms):
    """Add the isoforms found in the samples to the reference list.

    *sc* and *st* are lists of samples and each sample a list of rows. For
    every row whose name (field 2) contains ")_" and is not yet listed,
    ``">" + name`` and the sequence (field 9) are appended to *all_isoforms*.
    """
    pre_uni_seq_con = list(sc)
    pre_uni_seq_tre = list(st)

    # Mirror of all_isoforms, kept for constant-time membership tests.
    listed = set(all_isoforms)
    for sample in itertools.chain(pre_uni_seq_con, pre_uni_seq_tre):
        for y in sample:
            header = ">" + y[2]
            if header not in listed and ")_" in y[2]:
                all_isoforms.append(header)
                all_isoforms.append(y[9])
                listed.add(header)
                listed.add(y[9])

###############################################################################

def deseqe2(sample,mir_names,l,new_d,sample_name,sample_order):
    """Complete one sample with zero counts and publish it.

    A ``[name, "0", sequence]`` row is appended to *sample* (in place) for
    every ``[name, sequence]`` of *mir_names* it lacks, and *sample* is
    sorted by name. The de-duplicated list is appended to *new_d* and
    *sample_name* to *sample_order* while *l* is held.
    """
    _append_missing_mirnas(sample, mir_names)

    sample.sort(key=lambda x: x[0])
    sample = [row for row, _ in itertools.groupby(sample)]

    l.acquire()
    try:
        new_d.append(sample)
        sample_order.append(sample_name)
    finally:
        l.release()

###############################################################################