isoread: mirbase_functions.py comparison

comparison mirbase_functions.py @ 16:e19c832c5368 draft

Uploaded

author	glogobyte
date	Thu, 22 Oct 2020 07:47:58 +0000
parents
children

comparison

equal deleted inserted replaced

-:f06ec007c578
+:e19c832c5368
+import itertools
+import time
+import sys
+import os
+import urllib.request
+import gzip
+from multiprocessing import Process, Queue, Lock, Pool, Manager, Value
+import subprocess
+import argparse
+from collections import OrderedDict
+from matplotlib.backends.backend_pdf import PdfPages
+import pandas as pd
+from math import pi
+import numpy as np
+import matplotlib.pyplot as plt
+from matplotlib.ticker import PercentFormatter
+import seaborn as sns
+import scipy.stats as stats
+from plotnine import *
+import math
+import re
+import matplotlib.ticker as mtick
+import copy
+"""---------------------- Simple Functions -----------------------"""
# Read a file and return it as a list
def read(path, flag):
    """Read a text file and return its lines as a list of strings.

    Parameters
    ----------
    path : path of the file to read.
    flag : 0 keeps the line terminators (``readlines``);
           1 strips them (``read().splitlines()``).

    Raises
    ------
    ValueError
        If ``flag`` is neither 0 nor 1.  The previous implementation fell
        through and returned ``None`` in that case, which only failed later
        in the caller.
    """
    if flag not in (0, 1):
        raise ValueError("read(): flag must be 0 or 1, got %r" % (flag,))
    # The ``with`` block closes the file; no explicit close() is needed.
    with open(path) as fp:
        if flag == 0:
            return fp.readlines()
        return fp.read().splitlines()
# Write a list to a txt file
def write(path, list):
    """Write the records of ``list`` to the text file ``path``.

    For every record the first and the last item are dropped and the
    remaining items are joined with tabs.  Nothing (no newline, no
    separator) is written between two records.
    """
    joined_records = ("\t".join(record[1:-1]) for record in list)
    with open(path, 'w') as handle:
        handle.writelines(joined_records)
+"""---------------------- RNA-seq Functions ----------------------"""
# Detect the longest common substring sequence between two mirnas
def longestSubstring(str1, str2):
    """Return the longest substring shared by ``str1`` and ``str2``.

    The result is a slice of ``str1``.  When the two strings have nothing in
    common a message is printed and ``None`` is returned.

    ``autojunk`` is switched off: with the default heuristic, any character
    that is frequent in a second sequence of 200+ items is treated as junk,
    which makes long nucleotide strings (four-letter alphabet) report no
    match at all.
    """
    from difflib import SequenceMatcher

    matcher = SequenceMatcher(None, str1, str2, autojunk=False)
    # Result looks like Match(a=0, b=0, size=5).
    match = matcher.find_longest_match(0, len(str1), 0, len(str2))
    if match.size == 0:
        print('No longest common sub-string found')
        return None
    return str1[match.a:match.a + match.size]
+########################################################################################################################################################
def collapse_sam(path):
    """Collapse identical alignments of a SAM file into one row each.

    Lines whose first tab-separated field contains "@" are kept unchanged as
    header rows.  The remaining rows are grouped by (field 2, field 9), in
    order of first appearance.  For every group the last row of the group is
    kept, its field 10 is replaced by a fixed placeholder and its field 0 is
    replaced by "<group number>-<number of rows in the group>" (group numbers
    start at 1).

    Returns the header rows followed by the collapsed rows, each row being a
    list of strings.

    The grouping is done in a single pass with a dict instead of the former
    list-membership test plus full rescan per unique pair (quadratic).
    """
    header_rows = []
    groups = {}  # (field 2, field 9) -> [row count, last row seen]
    for line in read(path, 0):
        fields = line.rstrip("\n").split("\t")
        if "@" in fields[0]:
            header_rows.append(fields)
            continue
        key = (fields[2], fields[9])
        entry = groups.get(key)
        if entry is None:
            groups[key] = [1, fields]
        else:
            entry[0] += 1
            entry[1] = fields

    collapsed = []
    for number, (count, row) in enumerate(groups.values(), 1):
        row[10] = "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
        row[0] = str(number) + "-" + str(count)
        collapsed.append(row)
    return header_rows + collapsed
+#################################################################################################################################################################################################################
+def duplicate_chroms_isoforms(List):
# Give rows that carry the same read id (field 0) and the same sequence
# (field 9) on several references of one miRNA a single combined reference
# name (field 2).
#
# List: list of rows (lists); fields 0, 2 and 9 are read.  Field 2 of some
#       rows is overwritten and List itself is sorted in place.
# Returns (new_list, new_double_List): List and its working copy, each sorted
#       with consecutive duplicate rows removed.
# NOTE(review): indentation is missing in this listing, so the nesting
# described by the comments below is inferred from statement order - confirm.
+dupes=[]
# Collect every distinct [sequence, read id, reference name] triple.
+for num in range(len(List)):
+if  [List[num][9],List[num][0],List[num][2]] not in dupes :
+dupes.append([List[num][9],List[num][0],List[num][2]])
# Append to each triple the other reference names seen with the same
# sequence, read id and name part before the first "_".
+for x in List:
+for y in dupes:
+if x[9]==y[0] and x[0]==y[1] and x[2].split("_")[0]==y[2].split("_")[0] and x[2]!=y[2]:
+y.append(x[2])
# Row-by-row copy taken before any name is rewritten.
+double_List = [x[:] for x in List]
+chr_order=[]
# For every triple, extract the chromosome token of each of its names.
+for x in dupes:
+temp = []
+for i in range(2,len(x)):
# Numeric chromosome: store it as a signed int built from the first character
# inside the parentheses plus the digits (e.g. "-" + "9").
+if x[i].split("chr")[1].split("(")[0].isdigit():
+temp.append(int(x[i].split("chr")[1].split("(")[1][0]+x[i].split("chr")[1].split("(")[0]))
+else:
# Otherwise keep the first four characters after "chr" as text.
+temp.append(x[i].split("chr")[1][0:4])
# If an X/Y token is present, turn every token into a string
# (presumably so that the mixed list can be sorted - confirm).
+for z in temp:
+if 'X(-)'==z or 'Y(-)'==z or 'X(+)'==z or 'Y(+)'==z:
+temp = [str(j) for j in temp]
+temp=list(set(temp))
+temp.sort()
+chr_order.append(temp)
# Build [sequence, new name, read id]; the new name starts with the part of
# the reference name before the first "_" and gets one "_chr..." suffix per
# collected chromosome token.
+final_dupes=[]
+for i in range(len(dupes)):
+final_dupes.append([dupes[i][0],dupes[i][2].split("_")[0],dupes[i][1]])
+for x in chr_order[i]:
# NOTE(review): the pattern is not a raw string; "\d" is an invalid escape
# sequence in a normal string literal on recent Python versions.
+result = re.match("[-+]?\d+$", str(x))
# NOTE(review): chr_order[i] was produced by list(set(...)) above, so this
# comparison looks always true - confirm against the indented original.
+if len(chr_order[i]) == len(set(chr_order[i])):
+if result is not None:
# Signed integer token: a negative value is written with "(-)", others "(+)".
+if int(x)<0:
+final_dupes[i][1]=final_dupes[i][1]+"_chr"+str(abs(int(x)))+"(-)"
+else:
+final_dupes[i][1] = final_dupes[i][1] + "_chr" + str(abs(int(x)))+"(+)"
+else:
+final_dupes[i][1] = final_dupes[i][1] + "_chr" + str(x)
+else:
# Same as above, but the second "_" part of the original name is inserted
# before every "_chr" suffix.
+if result is not None:
+if int(x) < 0:
+final_dupes[i][1] = final_dupes[i][1] +dupes[i][2].split("_")[1]+ "_chr" + str(abs(int(x))) + "(-)"
+else:
+final_dupes[i][1] = final_dupes[i][1] +dupes[i][2].split("_")[1]+ "_chr" + str(abs(int(x))) + "(+)"
+else:
+final_dupes[i][1] = final_dupes[i][1] +dupes[i][2].split("_")[1]+ "_chr" + str(x)
# Sort and drop consecutive duplicates.
+final_dupes.sort()
+final_dupes=list(final_dupes for final_dupes,_ in itertools.groupby(final_dupes))
# Apply the combined names to the rows with matching sequence, read id and
# name prefix.
+for i in range(len(double_List)):
+for x in final_dupes:
# Names with more than three "_" parts keep their last two parts as suffix.
+if double_List[i][9] == x[0] and double_List[i][0] == x[2] and len(double_List[i][2].split("_")) >3 and double_List[i][2].split("_")[0]==x[1].split("_")[0]:
+gg=str("_"+double_List[i][2].split("_")[-2]+"_"+double_List[i][2].split("_")[-1])
+double_List[i][2] = x[1]+gg
# Names with exactly three "_" parts are replaced by the combined name; the
# input List is updated as well.
+if double_List[i][9]==x[0] and double_List[i][0]== x[2] and len(double_List[i][2].split("_"))==3 and double_List[i][2].split("_")[0]==x[1].split("_")[0]:
+double_List[i][2]=x[1]
+List[i][2] = x[1]
+List.sort()
+new_list=list(List for List,_ in itertools.groupby(List))
+double_List.sort()
+new_double_List = list(double_List for double_List, _ in itertools.groupby(double_List))
+return new_list, new_double_List
+#############################################################################################################################################################################################################
+def sam(mature_mirnas,path,name,con,l,samples,data,names,unmap_seq,samples_mirna_names,deseq,LHE_names,umi,ini_sample,unmap_counts):
# Process one SAM file: name every accepted aligned read after the mature
# miRNA it maps to plus its start/end offsets, count reads per miRNA, and
# append the results to the shared containers while holding lock l.
#
# mature_mirnas: flat list in which a header entry is followed by its
#       sequence (entry i is matched by name, entry i+1 is used as sequence).
# path: SAM file to read; name: sample name; con: "c" or "t" (any other value
#       records nothing).
# samples, data, names, samples_mirna_names, deseq, LHE_names, ini_sample:
#       containers that are appended to; unmap_seq and unmap_counts expose a
#       .value that is incremented.
# umi is not used in this function.
# NOTE(review): indentation is missing in this listing, so the nesting
# described by the comments below is inferred from statement order - confirm.
+# read the sam file
+ini_sam=read(path,0)
# Keep the rows whose first field does not contain "@".
+new_main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]]
# Accepted rows: flag field equal to '0' and sequence length from 18 to 26.
+unique_seq = [x for x in new_main_sam if x[1] == '0' and len(x[9])>=18 and len(x[9])<=26]
+sorted_uni_arms = []
+for i in range(len(mature_mirnas)):
+tmp_count_reads = 0   # calculate the total number of reads
+tmp_count_seq = 0     # calculate the total number of sequences
+for j in range(len(unique_seq)):
# The reference name up to the first "_" identifies the miRNA; when it
# contains "{" its last four characters are dropped first.
+if "{" in unique_seq[j][2].split("_")[0]:
+official=unique_seq[j][2].split("_")[0][:-4]
+else:
+official=unique_seq[j][2].split("_")[0]
# Header entry without its first character, up to the first space.
+if mature_mirnas[i].split(" ")[0][1:] == official:
+temp_mature = mature_mirnas[i+1].strip().replace("U", "T")
# Split both sequences around their longest common part and keep the lengths
# of what is left before and after it.
+off_part = longestSubstring(temp_mature, unique_seq[j][9])
+mat_diff = temp_mature.split(off_part)
+mat_diff = [len(mat_diff[0]), len(mat_diff[1])]
+unique_diff = unique_seq[j][9].split(off_part)
+unique_diff = [len(unique_diff[0]), len(unique_diff[1])]
+# Problem with hsa-miR-8485
# Both sequences have leftover bases on the same side: the row is replaced
# by the marker 1 so that it can be removed after the inner loop.
+if mat_diff[1]!=0 and unique_diff[1]!=0:
+unique_seq[j]=1
+pre_pos = 0
+post_pos = 0
+elif mat_diff[0]!=0 and unique_diff[0]!=0:
+unique_seq[j]=1
+pre_pos = 0
+post_pos = 0
+else:
+pre_pos = mat_diff[0]-unique_diff[0]
+post_pos = unique_diff[1]-mat_diff[1]
# Field 0 has the form "<number>-<read count>".
+tmp_count_reads = tmp_count_reads + int(unique_seq[j][0].split("-")[1])
+tmp_count_seq = tmp_count_seq+1
# Non-zero offsets are appended to the name as "_<pre>_<post>"; a non-zero
# value is written with an explicit sign.
+if pre_pos != 0 or post_pos != 0:
+if pre_pos == 0:
+unique_seq[j][2] = unique_seq[j][2] + "_" +str(pre_pos) + "_" + '{:+d}'.format(post_pos)
+elif post_pos == 0:
+unique_seq[j][2] = unique_seq[j][2] + "_" + '{:+d}'.format(pre_pos) + "_" + str(post_pos)
+else:
+unique_seq[j][2] = unique_seq[j][2]+"_"+'{:+d}'.format(pre_pos)+"_"+'{:+d}'.format(post_pos)
# Drop every row that was replaced by the marker 1.
+for x in range(unique_seq.count(1)):
+unique_seq.remove(1)
+if tmp_count_reads != 0 and tmp_count_seq != 0:
+sorted_uni_arms.append([mature_mirnas[i].split(" ")[0][1:], tmp_count_seq, tmp_count_reads])
# Order by number of sequences, largest first.
+sorted_uni_arms = sorted(sorted_uni_arms, key=lambda x: x[1], reverse=True)
+dedup_unique_seq,double_fil_uni_seq=duplicate_chroms_isoforms(unique_seq)
# Recompute the sequence and read totals from the renamed rows.
+for y in sorted_uni_arms:
+counts=0
+seqs=0
+for x in double_fil_uni_seq:
+if y[0]==x[2].split("_")[0]:
+counts+=int(x[0].split("-")[1])
+seqs+=1
+y[1]=seqs
+y[2]=counts
+LHE=[]
# NOTE(review): the lock is not released if one of the statements below
# raises.
+l.acquire()
# The "c" and "t" branches perform the same statements.
+if con=="c":
+LHE.extend(z[2] for z in double_fil_uni_seq)
+for y in double_fil_uni_seq:
+samples_mirna_names.append([y[2],y[9]])
+deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in double_fil_uni_seq])
+LHE_names.extend(LHE)
# Rows with flag field '4' are tallied as row count and as read count.
+unmap_seq.value += sum([1 for x in new_main_sam if x[1] == '4'])
+unmap_counts.value += sum([int(x[0].split("-")[1]) for x in new_main_sam if x[1] == '4'])
+names.append(name)
+samples.append(dedup_unique_seq)
+data.append([con,name,double_fil_uni_seq,sorted_uni_arms])
+ini_sample.append(new_main_sam)
+if con=="t":
+LHE.extend(z[2] for z in double_fil_uni_seq)
+for y in double_fil_uni_seq:
+samples_mirna_names.append([y[2],y[9]])
+deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in double_fil_uni_seq])
+LHE_names.extend(LHE)
+unmap_seq.value += sum([1 for x in new_main_sam if x[1] == '4'])
+unmap_counts.value += sum([int(x[0].split("-")[1]) for x in new_main_sam if x[1] == '4'])
+names.append(name)
+samples.append(dedup_unique_seq)
+data.append([con,name,double_fil_uni_seq,sorted_uni_arms])
+ini_sample.append(new_main_sam)
+l.release()
+######################################################################################################################################
+"""
+Read a sam file from Bowtie and do the following:
+1) Remove reverse stranded mapped reads
+2) Remove unmapped reads
+3) Remove all sequences with fewer than 11 reads
+4) Sort the arms with the most sequences in decreasing order
+5) Sort the sequences of every arm with the most reads in decreasing order
+6) Calculate total number of sequences of every arm
+7) Calculate total number of reads of sequences of every arm.
+8) Store all the information in a txt file
+"""
+def non_sam(mature_mirnas,path,name,con,l,data,names,n_deseq,n_samples_mirna_names,n_LHE_names):
# Process the rows of a SAM file whose flag field is '4': rows whose sequence
# contains a mature miRNA sequence are renamed after that miRNA and the
# results are appended to the shared containers while holding lock l.
#
# mature_mirnas: flat list read in pairs; odd indices are used as sequences
#       and the entry before each one as its header.
# path: SAM file to read; name: sample name; con: "c" or "t" (any other value
#       records nothing).
# data, names, n_deseq, n_samples_mirna_names, n_LHE_names: containers that
#       are appended to.
# NOTE(review): indentation is missing in this listing, so the nesting
# described by the comments below is inferred from statement order - confirm.
+ini_sam=read(path,0)
# Keep the rows whose first field does not contain "@".
+new_main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]]
+unique_seq=[]
# Candidate rows: flag field equal to '4' and sequence length from 18 to 26.
+unique_seq = [x for x in new_main_sam if x[1] == '4' and len(x[9])>=18 and len(x[9])<=26]
+uni_seq=[]
+# Calculate the shifted positions for every isomir and add them to the name of it
+sorted_uni_arms = []
+for i in range(1,len(mature_mirnas),2):
+tmp_count_reads = 0   # calculate the total number of reads
+tmp_count_seq = 0     # calculate the total number of sequences
+for j in range(len(unique_seq)):
+temp_mature = mature_mirnas[i].strip().replace("U", "T")
# Only reads that contain the whole mature sequence are considered.
+if temp_mature in unique_seq[j][9]:
+off_part = longestSubstring(temp_mature, unique_seq[j][9])
+mat_diff = temp_mature.split(off_part)
+mat_diff = [len(mat_diff[0]), len(mat_diff[1])]
+unique_diff = unique_seq[j][9].split(off_part)
# More than two pieces means the common part occurs more than once in the
# read; such reads are skipped.
+if len(unique_diff)<=2:
+unique_diff = [len(unique_diff[0]), len(unique_diff[1])]
+pre_pos = mat_diff[0]-unique_diff[0]
+post_pos = unique_diff[1]-mat_diff[1]
# lengthofmir is not read anywhere else in this function.
+lengthofmir = len(off_part) + post_pos
# Only reads with a zero start offset are counted and recorded.
+if pre_pos == 0:
# Field 0 has the form "<number>-<read count>".
+tmp_count_reads = tmp_count_reads + int(unique_seq[j][0].split("-")[1])
+tmp_count_seq = tmp_count_seq + 1
+if pre_pos == 0:
# Work on a copy so that the original row keeps its name.  The new name is
# "<miRNA>__<pre>_<signed post>_<read characters after the common part's
# length>".
+t_name=unique_seq[j].copy()
+t_name[2]=mature_mirnas[i - 1].split(" ")[0][1:] + "__" + str(pre_pos) + "_" + '{:+d}'.format(post_pos) + "_" + str(unique_seq[j][9][len(off_part):])
+uni_seq.append(t_name)
+if tmp_count_reads != 0 and tmp_count_seq != 0:
+sorted_uni_arms.append([mature_mirnas[i-1].split(" ")[0][1:], tmp_count_seq, tmp_count_reads])
# Order by number of sequences, largest first.
+sorted_uni_arms = sorted(sorted_uni_arms, key=lambda x: x[1], reverse=True)
# Remove duplicate rows while keeping first-seen order.
+unique_seq = list(map(list, OrderedDict.fromkeys(map(tuple,uni_seq))))
+LHE=[]
# NOTE(review): the lock is not released if one of the statements below
# raises.
+l.acquire()
# The "c" and "t" branches perform the same statements; rows named "*" are
# left out of the name and count lists.
+if con=="c":
+LHE.extend(x[2] for x in unique_seq if x[2]!="*")
+for x in unique_seq:
+if x[2]!="*":
+n_samples_mirna_names.append([x[2],x[9]])
+n_deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in unique_seq if x[2]!="*"])
+n_LHE_names.extend(LHE)
+names.append(name)
+data.append([con,name,unique_seq,sorted_uni_arms])
+if con=="t":
+LHE.extend(x[2] for x in unique_seq if x[2]!="*")
+for x in unique_seq:
+if x[2]!="*":
+n_samples_mirna_names.append([x[2],x[9]])
+n_deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in unique_seq if x[2]!="*"])
+n_LHE_names.extend(LHE)
+names.append(name)
+data.append([con,name,unique_seq,sorted_uni_arms])
+l.release()
+#####################################################################################################################################################################################################################
def deseq2_temp(samples_mirna_names,deseq,con,l):
    """Build one count table from the per-sample count lists and queue it.

    Parameters
    ----------
    samples_mirna_names : list of [name, sequence] pairs; sorted in place by
        name.
    deseq : list with one entry per sample, each a list of
        [name, count, sequence] rows.  Every sample is completed in place
        with a "0" row for each name it lacks and then sorted by name.
    con : "c" puts the table on the module-level queue ``q1``, "t" on
        ``q2``; any other value queues nothing.
    l : lock held while the table is queued.

    The table has one row per row of the first sample:
    [name, sequence, count in sample 0, count in sample 1, ...].

    Changes compared with the previous version:
    * the sort key was ``lambda x: [0]`` (a constant, so nothing was
      sorted); it is now the name, ``x[0]``;
    * membership tests and the final merge use a set / dict per sample
      instead of nested list scans;
    * an empty ``deseq`` yields an empty table instead of an IndexError;
    * the lock is released even when queuing fails.

    NOTE(review): ``q1`` and ``q2`` are not defined in the visible part of
    this file; they have to exist as module-level names when ``con`` is
    "c" or "t".
    """
    samples_mirna_names.sort(key=lambda x: x[0])

    for sample in deseq:
        present = {row[0] for row in sample}
        for entry in samples_mirna_names:
            if entry[0] not in present:
                sample.append([entry[0], "0", entry[1]])
                present.add(entry[0])
        sample.sort(key=lambda x: x[0])

    deseq_final = [[row[0], row[2]] for row in deseq[0]] if len(deseq) else []

    # name -> counts of that name, per sample, in row order
    counts_per_sample = []
    for sample in deseq:
        lookup = {}
        for row in sample:
            lookup.setdefault(row[0], []).append(row[1])
        counts_per_sample.append(lookup)

    for final_row in deseq_final:
        for lookup in counts_per_sample:
            final_row.extend(lookup.get(final_row[0], ()))

    l.acquire()
    try:
        if con == "c":
            q1.put(deseq_final)
        if con == "t":
            q2.put(deseq_final)
    finally:
        l.release()
+####################################################################################################################################################################################################################
+def main_temp(LH2E, LH2E_names, LH8E, LH8E_names,flag,names_con,names_tre,filter_LH8E,filter_LH2E,raw_LH8E,raw_LH2E,per,count):
# Bring two count tables (LH2E and LH8E) to the same set of rows, merge the
# names of rows that share a sequence, optionally filter the rows by count,
# and append the results to the output lists.
#
# LH2E, LH8E: lists of rows [name, sequence, count, count, ...]; sorted in
#       place, rows are renamed in place.
# LH2E_names, LH8E_names: rows holding the names/sequences of each table;
#       rows missing from the other table are extended in place with zeros.
# filter_LH8E, filter_LH2E, raw_LH8E, raw_LH2E: output lists.
# per: percentage of columns that must reach count; -1 disables the filter.
# flag, names_con and names_tre are not used in this function.
# NOTE(review): indentation is missing in this listing, so the nesting
# described by the comments below is inferred from statement order - confirm.
# Rows known to one table only.
+LH8E_add_names = [x for x in LH2E_names if x not in LH8E_names]
+LH2E_add_names = [x for x in LH8E_names if x not in LH2E_names]
# Sort and drop consecutive duplicates.
+LH8E_add_names.sort()
+LH2E_add_names.sort()
+LH8E_add_names = list(LH8E_add_names for LH8E_add_names,_ in itertools.groupby(LH8E_add_names))
+LH2E_add_names = list(LH2E_add_names for LH2E_add_names,_ in itertools.groupby(LH2E_add_names))
+LH2E.sort()
+LH8E.sort()
+LH2E = list(LH2E for LH2E,_ in itertools.groupby(LH2E))
+LH8E = list(LH8E for LH8E,_ in itertools.groupby(LH8E))
# Pad the missing rows with one "0" per column after the first two and add
# them to the table.
+zeros=["0"]*(len(LH8E[0])-2)
+[LH8E_add_names[i].extend(zeros) for i,_ in enumerate(LH8E_add_names)]
+LH8E=LH8E+LH8E_add_names
+zeros=["0"]*(len(LH2E[0])-2)
+[LH2E_add_names[i].extend(zeros) for i,_ in enumerate(LH2E_add_names)]
+LH2E=LH2E+LH2E_add_names
+dupes=[]
# final_LH2E remembers every sequence (field 1) and name (field 0) seen so
# far; a row whose name or sequence was seen before gets its sequence noted
# in dupes.
+final_LH2E =[]
+for num,_ in enumerate(LH2E):
+if LH2E[num][1] not in final_LH2E and LH2E[num][0] not in final_LH2E:
+final_LH2E.append(LH2E[num][1])
+final_LH2E.append(LH2E[num][0])
+else:
+dupes.append(LH2E[num][1])
+dupes=list(set(dupes))
# Each entry becomes [sequence, name, name, ...].
+dupes=[[x] for x in dupes]
+for x in LH2E:
+for y in dupes:
+if x[1]==y[0]:
+fl=0
+if len(y)==1:
+y.append(x[0])
+else:
# A name with the same part before the first "_" is already stored: the
# shorter of the two names is kept.
+for i in range(1,len(y)):
+if y[i].split("_")[0]==x[0].split("_")[0]:
+fl=1
+if len(x[0])<len(y[i]):
+del y[i]
+y.append(x[0])
+break
# No stored name shares the prefix: store this one as well.
+if fl==0:
+y.append((x[0]))
# Join all names of a sequence into one "a/b/..." name kept at index 1.
+for y in dupes:
+if len(y)>2:
+for i in range(len(y)-1,1,-1):
+y[1]=y[1]+"/"+y[i]
+del y[i]
# Give the merged name to every row with that sequence, in both tables.
+for x in LH2E:
+for y in dupes:
+if x[1]==y[0]:
+x[0]=y[1]
+for x in LH8E:
+for y in dupes:
+if x[1]==y[0]:
+x[0]=y[1]
+LH2E.sort()
+LH2E=list(LH2E for LH2E,_ in itertools.groupby(LH2E))
+LH8E.sort()
+LH8E=list(LH8E for LH8E,_ in itertools.groupby(LH8E))
+if int(per)!=-1:
+percent=int(per)/100
# Minimum number of columns that must reach the threshold, per table.
+c_col_filter=round(percent*(len(LH2E[1])-2))
+t_col_filter=round(percent*(len(LH8E[1])-2))
# NOTE(review): LH8E is indexed with LH2E's row index, which assumes both
# tables have the same rows in the same order - confirm.
+for i, _ in enumerate(LH2E):
+c_cols=0
+t_cols=0
+c_cols=sum([1 for j in range(len(LH2E[i])-2) if int(LH2E[i][j+2])>=int(count)])
+t_cols=sum([1 for j in range(len(LH8E[i])-2) if int(LH8E[i][j+2])>=int(count)])
# A row is kept when either table has enough columns at or above count.
+if c_cols>=c_col_filter or t_cols>=t_col_filter:
+filter_LH8E.append(LH8E[i])
+filter_LH2E.append(LH2E[i])
+raw_LH2E.extend(LH2E)
+raw_LH8E.extend(LH8E)
+##################################################################################################################################################################################################################
def _write_counts_table(path, sample_names, rows):
    """Write one tab-separated count table: a header line, then the rows."""
    with open(path, 'w') as fp:
        fp.write("Name\t")
        fp.write("Sequence")
        for sample in sample_names:
            fp.write("\t" + sample)
        for row in rows:
            fp.write("\n%s" % "\t".join(row))


def write_main(raw_LH2E, raw_LH8E, fil_LH2E, fil_LH8E, names_con, names_tre, flag, per, n1, n2):
    """Write the raw (and optionally the filtered) count tables to Counts/.

    Parameters
    ----------
    raw_LH2E, fil_LH2E : rows written to the ``n1`` files, with
        ``names_con`` as column names.
    raw_LH8E, fil_LH8E : rows written to the ``n2`` files, with
        ``names_tre`` as column names.
    flag : 1 writes the "Templated" files, 2 the "Non-Templated" files; any
        other value writes nothing.
    per : the "Filtered" files are written only when ``int(per) != -1``.
    n1, n2 : group names used in the file names.

    Each file is "Counts/<Filtered|Raw> <group> <Templated|Non-Templated>
    Counts".  The eight copy-pasted writers were folded into one helper that
    uses ``with``, so a file is closed even when writing a row fails.
    """
    if flag == 1:
        kind = 'Templated'
    elif flag == 2:
        kind = 'Non-Templated'
    else:
        return

    tables = []
    if int(per) != -1:
        tables.append(('Filtered', fil_LH8E, fil_LH2E))
    tables.append(('Raw', raw_LH8E, raw_LH2E))

    # Same order as before: filtered before raw, n2 file before n1 file.
    for prefix, rows_n2, rows_n1 in tables:
        _write_counts_table('Counts/' + prefix + ' ' + n2 + ' ' + kind + ' Counts', names_tre, rows_n2)
        _write_counts_table('Counts/' + prefix + ' ' + n1 + ' ' + kind + ' Counts', names_con, rows_n1)
+#########################################################################################################################################
def ssamples(names,samp,folder,pro):
    """Write one two-column text file per sample column of ``samp``.

    Parameters
    ----------
    names : sample names; ``names[k]`` belongs to column ``k + 2`` of the
        rows.
    samp : rows of the form [id, sequence, count, count, ...]; the number of
        columns is taken from the first row.
    folder : prefix put in front of every file name (``<name>.txt``).
    pro : not used.

    Each file holds a header line ("miRNA id", sample name) followed by one
    "<id>\\t<count>" line per row.  Files are opened with ``with`` so they
    are closed on error, and an empty ``samp`` writes nothing instead of
    raising IndexError.
    """
    if len(samp) == 0:
        return
    for col in range(2, len(samp[0])):
        sample_name = names[col - 2]
        with open(folder + sample_name + '.txt', 'w') as fp:
            fp.write("miRNA id" + "\t" + sample_name + "\n")
            for row in samp:
                fp.write("\t".join([row[0], row[col]]) + "\n")
+##################################################################################################################
def _db_write_arm(fp, arm, rows):
    """Write the banner of one arm followed by its rows, most reads first."""
    rule = "*********************************************************************************************************"
    ordered = sorted(rows, key=lambda x: int(x[0].split('-')[1]), reverse=True)
    fp.write(rule + "\n")
    fp.write("%-8s\t%-22s\t%-25s\t%-30s\t%s\n" % ("|",str(arm[0]),"Sequence count = "+str(arm[1]),"Total reads = "+str(arm[2]),"|"))
    fp.write(rule + "\n\n")
    for x in ordered:
        fp.write("%-8s\t%-40s\t%s\n" % (x[0].split("-")[1], x[2],x[9]))
    fp.write("\n" + "\n")


def DB_write(con,name,unique_seq,sorted_uni_arms,f):
    """Write a per-arm report of the sequences of one sample.

    Parameters
    ----------
    con : "c" or "t"; together with ``f`` it selects the output folder
        (f=1: split1/ or split2/, f=2: split3/ or split4/).
    name : file name inside that folder.
    unique_seq : rows; field 0 is "<number>-<reads>", field 2 the name and
        field 9 the sequence.
    sorted_uni_arms : [arm name, sequence count, total reads] entries, written
        in the given order.
    f : 1 or 2; any other value writes nothing.
        * f == 1: a row belongs to an arm when the arm name is contained in
          the row name up to its first "_"; every arm gets a section.
        * f == 2: a row belongs to an arm when the arm name equals the row
          name up to its first "__"; arms without rows are skipped.

    Raises
    ------
    ValueError
        If ``f`` is 1 or 2 and ``con`` is neither "c" nor "t" (this used to
        fail later with an unbound ``fp``).

    The file is written inside a ``with`` block so that it is closed even
    when formatting a row fails.
    NOTE(review): the original indentation of the f == 2 branch is not
    visible; the whole section, including the trailing blank lines, is
    assumed to be skipped for arms without rows - confirm.
    """
    if f not in (1, 2):
        return
    folders = {(1, "c"): 'split1/', (1, "t"): 'split2/',
               (2, "c"): 'split3/', (2, "t"): 'split4/'}
    if (f, con) not in folders:
        raise ValueError("DB_write(): con must be 'c' or 't', got %r" % (con,))

    with open(folders[(f, con)] + name, 'w') as fp:
        fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence"))
        for arm in sorted_uni_arms:
            if f == 1:
                rows = [row for row in unique_seq if arm[0] in row[2].split("_")[0]]
                _db_write_arm(fp, arm, rows)
            else:
                rows = [row for row in unique_seq if arm[0] == row[2].split("__")[0]]
                if rows:
                    _db_write_arm(fp, arm, rows)
+##########################################################################################################################
def new_mat_seq(pre_unique_seq,mat_mirnas,l):
    """Collect isoforms that have more reads than their reference entry.

    Parameters
    ----------
    pre_unique_seq : rows; field 0 is "<number>-<reads>", field 2 the name
        and field 9 the sequence.  Names of selected rows are prefixed with
        ">" in place.
    mat_mirnas : output list; for every selected row its (prefixed) name and
        its sequence are appended, in that order.
    l : lock held while ``mat_mirnas`` is extended.

    A row ``x`` whose name has exactly three "_"-separated parts acts as
    reference.  Every row ``y`` whose name contains ``x``'s name and whose
    read count is larger than ``x``'s is selected once.

    The lock is now released in a ``finally`` clause, so a failing append can
    no longer leave it held.
    """
    unique_iso = []
    for x in pre_unique_seq:
        if len(x[2].split("_")) != 3:
            continue
        x_reads = int(x[0].split("-")[1])
        for y in pre_unique_seq:
            if x[2] in y[2] and x_reads < int(y[0].split("-")[1]):
                # A row that was already selected carries the ">" prefix and
                # is therefore found as an element of its own stored row.
                if not any(y[2] in selected for selected in unique_iso):
                    y[2] = ">" + y[2]
                    unique_iso.append(y)

    l.acquire()
    try:
        for x in unique_iso:
            mat_mirnas.append(x[2])
            mat_mirnas.append(x[9])
    finally:
        l.release()
+#########################################################################################################################
+def merging_names(LH2E_copy,new):
# Merge the names of rows that share a sequence into one "a/b/..." name and
# append the sorted rows, without consecutive duplicates, to new.
#
# LH2E_copy: rows [name, sequence, ...]; names are rewritten in place and the
#       list is sorted in place.
# new: output list that receives the resulting rows.
# NOTE(review): indentation is missing in this listing, so the nesting
# described by the comments below is inferred from statement order - confirm.
+dupes=[]
# final_LH2E remembers every sequence (field 1) and name (field 0) seen so
# far; a row whose name or sequence was seen before gets its sequence noted
# in dupes.
+final_LH2E =[]
+for num in range(len(LH2E_copy)):
+if LH2E_copy[num][1] not in final_LH2E and LH2E_copy[num][0] not in final_LH2E:
+final_LH2E.append(LH2E_copy[num][1])
+final_LH2E.append(LH2E_copy[num][0])
+else:
+dupes.append(LH2E_copy[num][1])
+dupes=list(set(dupes))
# Each entry becomes [sequence, name, name, ...].
+for i in range(len(dupes)):
+dupes[i]=[dupes[i]]
+for x in LH2E_copy:
+for y in dupes:
+if x[1]==y[0]:
+fl=0
+if len(y)==1:
+y.append(x[0])
+else:
# A name with the same part before the first "_" is already stored: the
# shorter of the two names is kept.
+for i in range(1,len(y)):
+if y[i].split("_")[0]==x[0].split("_")[0]:
+fl=1
+if len(x[0])<len(y[i]):
+del y[i]
+y.append(x[0])
+break
# No stored name shares the prefix: store this one as well.
+if fl==0:
+y.append((x[0]))
# Join all names of a sequence into one "a/b/..." name kept at index 1.
+for y in dupes:
+if len(y)>2:
+for i in range(len(y)-1,1,-1):
+y[1]=y[1]+"/"+y[i]
+del y[i]
# Give the merged name to every row with that sequence.
+for x in LH2E_copy:
+for y in dupes:
+if x[1]==y[0]:
+x[0]=y[1]
+LH2E_copy.sort()
+LH2E_copy=list(LH2E_copy for LH2E_copy,_ in itertools.groupby(LH2E_copy))
+new.extend(LH2E_copy)
+######################################################################################################################################################
def ssamples1(tem_names,tem_samp,non_names,non_samp,folder,pro):
    """Write one text file per sample with two sets of counts.

    Parameters
    ----------
    tem_names : sample names; ``tem_names[k]`` belongs to column ``k + 2``
        of ``tem_samp``.
    tem_samp : rows [id, sequence, count, count, ...]; the number of columns
        is taken from the first row.
    non_names : sample names of ``non_samp``; ``non_names[j]`` belongs to
        column ``j + 2``.
    non_samp : rows with the same layout as ``tem_samp``.
    folder : prefix put in front of every file name (``<name>.txt``).
    pro : not used.

    Each file holds a header line, the "<id>\\t<count>" lines of the
    ``tem_samp`` column and, for every entry of ``non_names`` equal to the
    sample name, the lines of the matching ``non_samp`` column.  Files are
    opened with ``with`` so they are closed on error, and an empty
    ``tem_samp`` writes nothing instead of raising IndexError.
    """
    if len(tem_samp) == 0:
        return
    for col in range(2, len(tem_samp[0])):
        sample_name = tem_names[col - 2]
        with open(folder + sample_name + '.txt', 'w') as fp:
            fp.write("miRNA id" + "\t" + sample_name + "\n")
            for row in tem_samp:
                fp.write("\t".join([row[0], row[col]]) + "\n")
            for j, other_name in enumerate(non_names):
                if other_name == sample_name:
                    for row in non_samp:
                        fp.write("\t".join([row[0], row[j + 2]]) + "\n")
+###################################################################################################################################################################################################################
def download_matures(matures,org_name):
    """Download the miRBase mature sequences and keep those of one organism.

    Parameters
    ----------
    matures : output list; for every record whose header line contains
        ``org_name`` the header line and the line after it are appended.
    org_name : text looked for in the header lines.

    The gzip-compressed FASTA file is fetched from the CURRENT miRBase
    release (a fixed release would be e.g. .../mirbase/21/mature.fa.gz) and
    read as pairs of lines.  The response is now closed through ``with``
    instead of being left to the garbage collector.
    """
    url = 'ftp://mirbase.org/pub/mirbase/CURRENT/mature.fa.gz'
    with urllib.request.urlopen(url) as response:
        data = response.read()
    file_mirna = gzip.decompress(data).decode('utf-8').split("\n")
    # Even index: header line, odd index: its sequence.
    for i in range(0, len(file_mirna) - 1, 2):
        if org_name in file_mirna[i]:
            matures.append(file_mirna[i])
            matures.append(file_mirna[i + 1])
+###################################################################################################################################################################################################################
def non_template_ref(sc,st,all_isoforms):
    """Add the isoforms of both sample groups to ``all_isoforms``.

    ``sc`` and ``st`` are iterables of samples, each sample being a list of
    rows.  For every row whose name (field 2) contains ")_" and whose header
    (">" + name) is not yet in ``all_isoforms``, the header and the row's
    sequence (field 9) are appended, in that order.  ``sc`` is processed
    before ``st``.
    """
    # Both inputs are copied up front, before anything is appended.
    groups = (list(sc), list(st))
    for group in groups:
        for sample in group:
            for row in sample:
                header = ">" + row[2]
                if ")_" in row[2] and header not in all_isoforms:
                    all_isoforms.append(header)
                    all_isoforms.append(row[9])
+################################################################################################################################################################################################
def deseqe2(sample,mir_names,l,new_d,sample_name,sample_order):
    """Complete one sample with zero counts and register it.

    Parameters
    ----------
    sample : rows [name, count, sequence]; completed and sorted in place.
    mir_names : [name, sequence] pairs; every name that is missing from
        ``sample`` is added as [name, "0", sequence].
    l : lock held while the two output lists are extended.
    new_d : output list; receives the sorted sample without consecutive
        duplicate rows.
    sample_name : appended to ``sample_order`` together with the sample.
    sample_order : output list of sample names, parallel to ``new_d``.

    Names already present are tracked in a set instead of rescanning the
    sample for every name, and the lock is released in a ``finally`` clause.
    """
    present = {row[0] for row in sample}
    for entry in mir_names:
        if entry[0] not in present:
            sample.append([entry[0], "0", entry[1]])
            present.add(entry[0])

    sample.sort(key=lambda x: x[0])
    deduplicated = [row for row, _ in itertools.groupby(sample)]

    l.acquire()
    try:
        new_d.append(deduplicated)
        sample_order.append(sample_name)
    finally:
        l.release()
+###############################################################################################################################################################################################

Mercurial > repos > glogobyte > isoread

comparison mirbase_functions.py @ 16:e19c832c5368 draft