Mercurial > repos > glogobyte > viztool
view mirbase_functions.py @ 1:561b0abcae87 draft
Uploaded
author | glogobyte |
---|---|
date | Fri, 16 Oct 2020 12:15:50 +0000 |
parents | 258aaaa465f3 |
children |
line wrap: on
line source
import itertools
import time
import sys
import os
import urllib.request
import gzip
from multiprocessing import Process, Queue, Lock, Pool, Manager, Value
import subprocess
import argparse
from collections import OrderedDict
from matplotlib.backends.backend_pdf import PdfPages
import pandas as pd
from math import pi
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import PercentFormatter
import seaborn as sns
import scipy.stats as stats
from plotnine import *
import math
import re
import matplotlib.ticker as mtick
import copy


"""---------------------- Simple Functions -----------------------"""


def read(path, flag):
    """Read a text file and return its lines as a list.

    flag == 0 keeps the line terminators (``readlines``); flag == 1 drops
    them (``read().splitlines()``).  Any other flag returns None.
    """
    if flag == 0:
        with open(path) as fp:
            return fp.readlines()
    if flag == 1:
        with open(path) as fp:
            return fp.read().splitlines()
    return None


def write(path, list):
    """Write the items of *list* to the text file *path*.

    For every item ``x`` the slice ``x[1:-1]`` is joined with tabs and
    written; no separator or newline is emitted between items.
    """
    with open(path, 'w') as fp:
        for x in list:
            fp.write("\t".join(x[1:-1]))


"""---------------------- RNA-seq Functions ----------------------"""


def longestSubstring(str1, str2):
    """Return the longest common substring of *str1* and *str2*.

    When the two strings share nothing a message is printed and None is
    returned (callers pass that None straight to ``str.split``).
    """
    from difflib import SequenceMatcher

    seqMatch = SequenceMatcher(None, str1, str2)
    # Result looks like Match(a=0, b=0, size=5).
    match = seqMatch.find_longest_match(0, len(str1), 0, len(str2))
    if match.size != 0:
        return str1[match.a: match.a + match.size]
    print('No longest common sub-string found')
    return None


def collapse_sam(path):
    """Collapse identical (reference name, sequence) records of a SAM file.

    Header rows (first field contains "@") are kept unchanged.  For every
    distinct pair (column 3, column 10), in order of first appearance, the
    LAST matching record is kept; its column 11 is replaced by a tilde
    placeholder and its column 1 by "<running number>-<occurrence count>".
    Returns header rows followed by the collapsed rows, each a list of fields.
    """
    ini_sam = read(path, 0)
    intro_sam = []
    main_sam = []
    for line in ini_sam:
        fields = line.rstrip("\n").split("\t")
        if "@" in fields[0]:
            intro_sam.append(fields)
        else:
            main_sam.append(fields)

    # Ordered by first appearance of the key; value is the last record seen.
    last_record = OrderedDict()
    occurrences = {}
    for rec in main_sam:
        key = (rec[2], rec[9])
        occurrences[key] = occurrences.get(key, 0) + 1
        last_record[key] = rec

    new_main_sam = []
    for incr_num, (key, rec) in enumerate(last_record.items(), start=1):
        rec[10] = "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
        rec[0] = str(incr_num) + "-" + str(occurrences[key])
        new_main_sam.append(rec)

    return intro_sam + new_main_sam


def duplicate_chroms_isoforms(List):
    """Merge rows whose read maps to the same miRNA on several chromosomes.

    Rows are SAM-like field lists (index 0 = id, 2 = reference name,
    9 = sequence).  Rows sharing id, sequence and the part of the name
    before the first "_" get one combined name listing every chromosome.

    *List* is modified in place (renamed and sorted).  Returns
    ``(new_list, new_double_List)``: the de-duplicated rows of *List* and of
    an independent copy in which names with more than three "_"-separated
    parts keep their last two parts as a suffix.
    """
    dupes = []
    for row in List:
        key = [row[9], row[0], row[2]]
        if key not in dupes:
            dupes.append(key)

    # Append to each entry every alternative name of the same read/miRNA.
    for x in List:
        for y in dupes:
            if x[9] == y[0] and x[0] == y[1] and x[2].split("_")[0] == y[2].split("_")[0] and x[2] != y[2]:
                y.append(x[2])

    double_List = [x[:] for x in List]

    chr_order = []
    for x in dupes:
        temp = []
        for i in range(2, len(x)):
            after_chr = x[i].split("chr")[1]
            parts = after_chr.split("(")
            if parts[0].isdigit():
                # Signed chromosome number, e.g. "chr12(-)" -> -12.
                temp.append(int(parts[1][0] + parts[0]))
            else:
                temp.append(after_chr[0:4])
        # Ints and sex-chromosome strings cannot be sorted together.
        if any(z in ('X(-)', 'Y(-)', 'X(+)', 'Y(+)') for z in temp):
            temp = [str(j) for j in temp]
        chr_order.append(sorted(set(temp)))

    final_dupes = []
    for i in range(len(dupes)):
        final_dupes.append([dupes[i][0], dupes[i][2].split("_")[0], dupes[i][1]])
        for x in chr_order[i]:
            result = re.match(r"[-+]?\d+$", str(x))
            # chr_order[i] was built from a set, so this test holds and the
            # else branch is kept only for fidelity with the original.
            if len(chr_order[i]) == len(set(chr_order[i])):
                if result is not None:
                    if int(x) < 0:
                        final_dupes[i][1] = final_dupes[i][1] + "_chr" + str(abs(int(x))) + "(-)"
                    else:
                        final_dupes[i][1] = final_dupes[i][1] + "_chr" + str(abs(int(x))) + "(+)"
                else:
                    final_dupes[i][1] = final_dupes[i][1] + "_chr" + str(x)
            else:
                if result is not None:
                    if int(x) < 0:
                        final_dupes[i][1] = final_dupes[i][1] + dupes[i][2].split("_")[1] + "_chr" + str(abs(int(x))) + "(-)"
                    else:
                        final_dupes[i][1] = final_dupes[i][1] + dupes[i][2].split("_")[1] + "_chr" + str(abs(int(x))) + "(+)"
                else:
                    final_dupes[i][1] = final_dupes[i][1] + dupes[i][2].split("_")[1] + "_chr" + str(x)

    final_dupes.sort()
    final_dupes = [k for k, _ in itertools.groupby(final_dupes)]

    for i in range(len(double_List)):
        for x in final_dupes:
            if double_List[i][9] == x[0] and double_List[i][0] == x[2] and len(double_List[i][2].split("_")) > 3 and double_List[i][2].split("_")[0] == x[1].split("_")[0]:
                gg = str("_" + double_List[i][2].split("_")[-2] + "_" + double_List[i][2].split("_")[-1])
                double_List[i][2] = x[1] + gg
            if double_List[i][9] == x[0] and double_List[i][0] == x[2] and len(double_List[i][2].split("_")) == 3 and double_List[i][2].split("_")[0] == x[1].split("_")[0]:
                double_List[i][2] = x[1]
                # NOTE(review): nesting inferred -- the original indentation
                # was lost; confirm this rename belongs inside this branch.
                List[i][2] = x[1]

    List.sort()
    new_list = [k for k, _ in itertools.groupby(List)]
    double_List.sort()
    new_double_List = [k for k, _ in itertools.groupby(double_List)]
    return new_list, new_double_List


def sam(mature_mirnas,path,name,con,l,samples,data,names,unmap_seq,samples_mirna_names,deseq,LHE_names,umi,ini_sample,unmap_counts):
    """Process one collapsed SAM file of templated reads and publish results.

    Rows with column 2 equal to '0' and a sequence of 18-26 nt are kept.
    Each is compared with the mature sequence of the miRNA it is assigned to;
    reads that differ from it on the same side as the mature sequence does
    are dropped, the others get "_<5' shift>_<3' shift>" appended to their
    name when a shift is non-zero.  Results are appended, under the lock
    *l*, to the shared containers passed in (only when *con* is "c" or "t").

    *mature_mirnas* alternates header lines (">name ...") and sequences.
    *umi* is unused.  Returns None.
    """
    # read the sam file
    ini_sam = read(path, 0)
    new_main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]]
    unique_seq = [x for x in new_main_sam if x[1] == '0' and len(x[9]) >= 18 and len(x[9]) <= 26]

    sorted_uni_arms = []
    for i in range(len(mature_mirnas)):
        tmp_count_reads = 0   # total number of reads of this arm
        tmp_count_seq = 0     # total number of sequences of this arm
        arm_name = mature_mirnas[i].split(" ")[0][1:]
        for j in range(len(unique_seq)):
            ref_name = unique_seq[j][2].split("_")[0]
            if "{" in ref_name:
                official = ref_name[:-4]
            else:
                official = ref_name
            if arm_name != official:
                continue

            temp_mature = mature_mirnas[i+1].strip().replace("U", "T")
            off_part = longestSubstring(temp_mature, unique_seq[j][9])
            mat_diff = temp_mature.split(off_part)
            mat_diff = [len(mat_diff[0]), len(mat_diff[1])]
            unique_diff = unique_seq[j][9].split(off_part)
            unique_diff = [len(unique_diff[0]), len(unique_diff[1])]

            # Problem with hsa-miR-8485
            if (mat_diff[1] != 0 and unique_diff[1] != 0) or (mat_diff[0] != 0 and unique_diff[0] != 0):
                # Both sequences overhang on the same side: mark the read
                # for removal (the 1 is a sentinel purged after this pass).
                unique_seq[j] = 1
                pre_pos = 0
                post_pos = 0
            else:
                pre_pos = mat_diff[0] - unique_diff[0]
                post_pos = unique_diff[1] - mat_diff[1]
                tmp_count_reads = tmp_count_reads + int(unique_seq[j][0].split("-")[1])
                tmp_count_seq = tmp_count_seq + 1

            if pre_pos != 0 or post_pos != 0:
                if pre_pos == 0:
                    unique_seq[j][2] = unique_seq[j][2] + "_" + str(pre_pos) + "_" + '{:+d}'.format(post_pos)
                elif post_pos == 0:
                    unique_seq[j][2] = unique_seq[j][2] + "_" + '{:+d}'.format(pre_pos) + "_" + str(post_pos)
                else:
                    unique_seq[j][2] = unique_seq[j][2] + "_" + '{:+d}'.format(pre_pos) + "_" + '{:+d}'.format(post_pos)

        # Purge the reads flagged with the sentinel during this pass.
        unique_seq = [s for s in unique_seq if s != 1]

        if tmp_count_reads != 0 and tmp_count_seq != 0:
            sorted_uni_arms.append([arm_name, tmp_count_seq, tmp_count_reads])

    sorted_uni_arms = sorted(sorted_uni_arms, key=lambda x: x[1], reverse=True)
    dedup_unique_seq, double_fil_uni_seq = duplicate_chroms_isoforms(unique_seq)

    # Recompute the per-arm totals after the chromosome merge.
    for y in sorted_uni_arms:
        counts = 0
        seqs = 0
        for x in double_fil_uni_seq:
            if y[0] == x[2].split("_")[0]:
                counts += int(x[0].split("-")[1])
                seqs += 1
        y[1] = seqs
        y[2] = counts

    l.acquire()
    try:
        # The original had two byte-identical branches for "c" and "t".
        if con in ("c", "t"):
            for y in double_fil_uni_seq:
                samples_mirna_names.append([y[2], y[9]])
            deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in double_fil_uni_seq])
            LHE_names.extend([z[2] for z in double_fil_uni_seq])
            unmap_seq.value += sum([1 for x in new_main_sam if x[1] == '4'])
            unmap_counts.value += sum([int(x[0].split("-")[1]) for x in new_main_sam if x[1] == '4'])
            names.append(name)
            samples.append(dedup_unique_seq)
            data.append([con, name, double_fil_uni_seq, sorted_uni_arms])
            ini_sample.append(new_main_sam)
    finally:
        # Always release, otherwise a failure here blocks other workers.
        l.release()


"""
Read a sam file from Bowtie and do the following:
   1) Remove reverse stranded mapped reads
   2) Remove unmapped reads
   3) Remove all sequences with reads less than 11 reads
   4) Sort the arms with the most sequences in decreasing rate
   5) Sort the sequences of every arm with the most reads in decreasing rate
   6) Calculate total number of sequences of every arm
   7) Calculate total number of reads of sequences of every arm.
   8) Store all the information in a txt file
"""


def non_sam(mature_mirnas,path,name,con,l,data,names,n_deseq,n_samples_mirna_names,n_LHE_names):
    """Find non-templated 3' additions among the rows with column 2 == '4'.

    A read of 18-26 nt that STARTS with a mature sequence (U replaced by T)
    is reported as "<miRNA>__0_+<n>_<extra bases>", where the extra bases
    are what follows the mature sequence.  Results are appended, under the
    lock *l*, to the shared containers (only when *con* is "c" or "t").

    *mature_mirnas* alternates header lines and sequences.  Returns None.
    """
    ini_sam = read(path, 0)
    new_main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]]
    unique_seq = [x for x in new_main_sam if x[1] == '4' and len(x[9]) >= 18 and len(x[9]) <= 26]
    uni_seq = []

    # Calculate the shifted positions for every isomir and add them to its name
    sorted_uni_arms = []
    for i in range(1, len(mature_mirnas), 2):
        tmp_count_reads = 0   # total number of reads of this arm
        tmp_count_seq = 0     # total number of sequences of this arm
        temp_mature = mature_mirnas[i].strip().replace("U", "T")
        arm_name = mature_mirnas[i - 1].split(" ")[0][1:]
        for row in unique_seq:
            if temp_mature not in row[9]:
                continue
            off_part = longestSubstring(temp_mature, row[9])
            mat_diff = temp_mature.split(off_part)
            mat_diff = [len(mat_diff[0]), len(mat_diff[1])]
            unique_diff = row[9].split(off_part)
            # NOTE(review): nesting inferred -- everything below is assumed
            # to apply only when the mature sequence occurs once in the read.
            if len(unique_diff) > 2:
                continue
            pre_pos = mat_diff[0] - len(unique_diff[0])
            post_pos = len(unique_diff[1]) - mat_diff[1]
            if pre_pos == 0:
                tmp_count_reads = tmp_count_reads + int(row[0].split("-")[1])
                tmp_count_seq = tmp_count_seq + 1
                t_name = row.copy()
                t_name[2] = arm_name + "__" + str(pre_pos) + "_" + '{:+d}'.format(post_pos) + "_" + str(row[9][len(off_part):])
                uni_seq.append(t_name)

        if tmp_count_reads != 0 and tmp_count_seq != 0:
            sorted_uni_arms.append([arm_name, tmp_count_seq, tmp_count_reads])

    sorted_uni_arms = sorted(sorted_uni_arms, key=lambda x: x[1], reverse=True)
    # Drop exact duplicate rows while keeping first-seen order.
    unique_seq = list(map(list, OrderedDict.fromkeys(map(tuple, uni_seq))))

    l.acquire()
    try:
        # The original had two byte-identical branches for "c" and "t".
        if con in ("c", "t"):
            named = [x for x in unique_seq if x[2] != "*"]
            for x in named:
                n_samples_mirna_names.append([x[2], x[9]])
            n_deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in named])
            n_LHE_names.extend([x[2] for x in named])
            names.append(name)
            data.append([con, name, unique_seq, sorted_uni_arms])
    finally:
        l.release()


def deseq2_temp(samples_mirna_names,deseq,con,l):
    """Build one count table (name, sequence, count per sample) and queue it.

    Every sample in *deseq* (lists of [name, count, sequence]) is padded
    in place with a "0" entry for each name of *samples_mirna_names* it
    lacks, then sorted by name.  The table starts from the names of the
    first sample and gets one count appended for every matching entry.

    The table is put on ``q1`` (con == "c") or ``q2`` (con == "t"); neither
    name is defined in the visible part of this file, so they must exist as
    module globals at call time.
    """
    # Was ``key=lambda x:[0]`` (a constant key, i.e. no sorting at all).
    # Sorting is stable, so the table built below is unchanged; only the
    # in-place order of samples_mirna_names differs.
    samples_mirna_names.sort(key=lambda x: x[0])

    for sample in deseq:
        for y in samples_mirna_names:
            if not any(y[0] == x[0] for x in sample):
                sample.append([y[0], "0", y[1]])

    for sample in deseq:
        sample.sort(key=lambda x: x[0])

    deseq_final = [[x[0], x[2]] for x in deseq[0]]
    for row in deseq_final:
        for sample in deseq:
            for entry in sample:
                if row[0] == entry[0]:
                    row.append(entry[1])

    l.acquire()
    try:
        if con == "c":
            q1.put(deseq_final)
        if con == "t":
            q2.put(deseq_final)
    finally:
        l.release()


def main_temp(LH2E, LH2E_names, LH8E, LH8E_names,flag,names_con,names_tre,filter_LH8E,filter_LH2E,raw_LH8E,raw_LH2E,per,count):
    """Align the two count tables row by row and fill the output lists.

    LH2E / LH8E are tables of [name, sequence, count, ...] rows.  Names
    present in only one table are added to the other with "0" counts, names
    that share a sequence are merged into one "a/b" name, and both tables
    are sorted and de-duplicated.  The results are appended to *raw_LH2E* /
    *raw_LH8E*; when ``int(per) != -1`` the rows where at least
    ``per`` percent of the samples of either table reach *count* reads are
    also appended to *filter_LH2E* / *filter_LH8E*.

    Side effects: LH2E and LH8E are sorted in place and the entries of
    LH2E_names / LH8E_names that get added are extended with zero counts.
    *flag*, *names_con* and *names_tre* are unused.  Returns None.
    """
    LH8E_add_names = [x for x in LH2E_names if x not in LH8E_names]
    LH2E_add_names = [x for x in LH8E_names if x not in LH2E_names]

    LH8E_add_names.sort()
    LH2E_add_names.sort()
    LH8E_add_names = [k for k, _ in itertools.groupby(LH8E_add_names)]
    LH2E_add_names = [k for k, _ in itertools.groupby(LH2E_add_names)]

    LH2E.sort()
    LH8E.sort()
    LH2E = [k for k, _ in itertools.groupby(LH2E)]
    LH8E = [k for k, _ in itertools.groupby(LH8E)]

    print("LHE_names")
    print([len(LH8E_add_names), len(LH2E_add_names)])
    print([len(LH8E), len(LH2E)])

    # Pad the rows missing from a table with one "0" per sample column.
    zeros = ["0"] * (len(LH8E[0]) - 2)
    for row in LH8E_add_names:
        row.extend(zeros)
    LH8E = LH8E + LH8E_add_names

    zeros = ["0"] * (len(LH2E[0]) - 2)
    for row in LH2E_add_names:
        row.extend(zeros)
    LH2E = LH2E + LH2E_add_names

    # Sequences whose row repeats an already seen name or sequence.
    dupes = []
    final_LH2E = []
    for row in LH2E:
        if row[1] not in final_LH2E and row[0] not in final_LH2E:
            final_LH2E.append(row[1])
            final_LH2E.append(row[0])
        else:
            dupes.append(row[1])

    dupes = [[x] for x in set(dupes)]

    # Collect, per duplicated sequence, one name per miRNA (the shortest).
    for x in LH2E:
        for y in dupes:
            if x[1] == y[0]:
                fl = 0
                if len(y) == 1:
                    y.append(x[0])
                else:
                    for i in range(1, len(y)):
                        if y[i].split("_")[0] == x[0].split("_")[0]:
                            fl = 1
                            if len(x[0]) < len(y[i]):
                                del y[i]
                                y.append(x[0])
                                break
                    if fl == 0:
                        y.append((x[0]))

    # Fold all collected names into y[1], joined by "/".
    for y in dupes:
        if len(y) > 2:
            for i in range(len(y) - 1, 1, -1):
                y[1] = y[1] + "/" + y[i]
                del y[i]

    for table in (LH2E, LH8E):
        for x in table:
            for y in dupes:
                if x[1] == y[0]:
                    x[0] = y[1]

    LH2E.sort()
    LH2E = [k for k, _ in itertools.groupby(LH2E)]
    LH8E.sort()
    LH8E = [k for k, _ in itertools.groupby(LH8E)]

    if int(per) != -1:
        percent = int(per) / 100
        print(percent)
        print(count)

        # NOTE(review): the sample count is read from row 1, so a
        # single-row table raises IndexError -- row 0 may have been meant.
        c_col_filter = round(percent * (len(LH2E[1]) - 2))
        t_col_filter = round(percent * (len(LH8E[1]) - 2))

        # Assumes both tables have the same rows in the same order.
        for i, _ in enumerate(LH2E):
            c_cols = sum([1 for j in range(len(LH2E[i]) - 2) if int(LH2E[i][j+2]) >= int(count)])
            t_cols = sum([1 for j in range(len(LH8E[i]) - 2) if int(LH8E[i][j+2]) >= int(count)])
            if c_cols >= c_col_filter or t_cols >= t_col_filter:
                filter_LH8E.append(LH8E[i])
                filter_LH2E.append(LH2E[i])

    raw_LH2E.extend(LH2E)
    raw_LH8E.extend(LH8E)
###############################################################################


def _write_counts_table(path, sample_names, rows):
    """Write one tab-separated count table: header line, then one line per row."""
    with open(path, 'w') as fp:
        fp.write("Name\t")
        fp.write("Sequence")
        for sample in sample_names:
            fp.write("\t" + sample)
        for row in rows:
            fp.write("\n%s" % "\t".join(row))


def write_main(raw_LH2E, raw_LH8E, fil_LH2E, fil_LH8E, names_con, names_tre, flag, per, n1, n2):
    """Write the raw (and optionally filtered) count tables into 'Counts/'.

    flag == 1 writes the "Templated" tables, flag == 2 the "Non-Templated"
    ones; any other flag writes nothing.  The filtered tables are written
    only when ``int(per) != -1``.  For each kind the table of group *n2*
    (columns *names_tre*, rows ``*_LH8E``) is written before the table of
    group *n1* (columns *names_con*, rows ``*_LH2E``).

    Files are named 'Counts/<Filtered|Raw> <group> <kind> Counts'; the
    'Counts' directory must already exist.  Returns None.
    """
    if flag == 1:
        kind = 'Templated'
    elif flag == 2:
        kind = 'Non-Templated'
    else:
        return

    if int(per) != -1:
        _write_counts_table('Counts/Filtered ' + n2 + ' ' + kind + ' Counts', names_tre, fil_LH8E)
        _write_counts_table('Counts/Filtered ' + n1 + ' ' + kind + ' Counts', names_con, fil_LH2E)

    _write_counts_table('Counts/Raw ' + n2 + ' ' + kind + ' Counts', names_tre, raw_LH8E)
    _write_counts_table('Counts/Raw ' + n1 + ' ' + kind + ' Counts', names_con, raw_LH2E)
###############################################################################


def ssamples(names,samp,folder,pro):
    """Write one '<folder><sample name>.txt' file per sample column of *samp*.

    *samp* rows are [name, sequence, count, ...]; column ``i`` (from 2 on)
    belongs to ``names[i-2]``.  Each file holds a "miRNA id<TAB>sample"
    header followed by "name<TAB>count" lines.  *pro* is unused.
    """
    for i in range(2, len(samp[0])):
        with open(folder + names[i-2] + '.txt', 'w') as fp:
            fp.write("miRNA id" + "\t" + names[i-2] + "\n")
            for x in samp:
                fp.write("%s" % "\t".join([x[0], x[i]]) + "\n")


def DB_write(con,name,unique_seq,sorted_uni_arms,f):
    """Write a per-arm report of the sequences of one sample.

    The report goes to 'split1/' (f == 1, con == "c"), 'split2/' (f == 1,
    con == "t"), 'split3/' (f == 2, "c") or 'split4/' (f == 2, "t"), in a
    file called *name*; any other *f* does nothing.  For every arm of
    *sorted_uni_arms* ([name, sequence count, read count]) a banner is
    written, followed by the matching rows of *unique_seq* ordered by
    decreasing read count (the number after "-" in column 1).

    With f == 1 a row matches when the arm name is CONTAINED in the part of
    its name before the first "_"; with f == 2 the part before "__" must be
    EQUAL to the arm name and arms without rows are skipped.

    Raises ValueError for an unknown *con* (the original failed with an
    UnboundLocalError in that case).
    """
    if f == 1:
        folders = {"c": 'split1/', "t": 'split2/'}
    elif f == 2:
        folders = {"c": 'split3/', "t": 'split4/'}
    else:
        return
    if con not in folders:
        raise ValueError("con must be 'c' or 't', got %r" % (con,))

    rule = "*********************************************************************************************************\n"

    with open(folders[con] + name, 'w') as fp:
        fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads", "Name of isomir", "Sequence"))
        for arm in sorted_uni_arms:
            if f == 1:
                # NOTE(review): substring test kept from the original; it
                # also matches arms whose name is a prefix of another name.
                temp = [s for s in unique_seq if arm[0] in s[2].split("_")[0]]
            else:
                temp = [s for s in unique_seq if arm[0] == s[2].split("__")[0]]
                # NOTE(review): nesting inferred -- arms without rows are
                # assumed to produce no output at all.
                if not temp:
                    continue
            temp.sort(key=lambda x: int(x[0].split('-')[1]), reverse=True)
            fp.write(rule)
            fp.write("%-8s\t%-22s\t%-25s\t%-30s\t%s\n" % ("|", str(arm[0]), "Sequence count = " + str(arm[1]), "Total reads = " + str(arm[2]), "|"))
            fp.write(rule + "\n")
            for x in temp:
                fp.write("%-8s\t%-40s\t%s\n" % (x[0].split("-")[1], x[2], x[9]))
            fp.write("\n" + "\n")


def new_mat_seq(pre_unique_seq,mat_mirnas,l):
    """Collect the isoforms that have more reads than their reference entry.

    For every row whose name has exactly three "_"-separated parts, each row
    whose name contains that name and whose read count is strictly larger is
    selected once; its name is prefixed with ">" IN PLACE.  Under the lock
    *l* the selected names and sequences are appended, alternating, to
    *mat_mirnas*.
    """
    unique_iso = []
    for x in pre_unique_seq:
        if len(x[2].split("_")) != 3:
            continue
        base_reads = int(x[0].split("-")[1])
        for y in pre_unique_seq:
            if x[2] in y[2] and base_reads < int(y[0].split("-")[1]):
                # Membership is tested against the fields of the rows
                # already selected, as in the original.
                if not any(y[2] in lst2 for lst2 in unique_iso):
                    y[2] = ">" + y[2]
                    unique_iso.append(y)

    l.acquire()
    try:
        for x in unique_iso:
            mat_mirnas.append(x[2])
            mat_mirnas.append(x[9])
    finally:
        l.release()


def merging_names(LH2E_copy,new):
    """Give rows that share a sequence one merged "a/b" name.

    *LH2E_copy* rows are [name, sequence, ...].  A sequence is treated as
    duplicated when a row's name or sequence was already seen in an earlier
    row.  For each such sequence one name per miRNA (text before the first
    "_", shortest name wins) is collected and joined with "/"; every row
    with that sequence is renamed IN PLACE.  The rows are then sorted in
    place, consecutive duplicates dropped, and the result appended to *new*.
    """
    seen = set()
    dupe_seqs = set()
    for row in LH2E_copy:
        if row[1] not in seen and row[0] not in seen:
            seen.add(row[1])
            seen.add(row[0])
        else:
            dupe_seqs.add(row[1])

    # sequence -> [sequence, name, name, ...]
    dupes = {seq: [seq] for seq in dupe_seqs}

    for x in LH2E_copy:
        y = dupes.get(x[1])
        if y is None:
            continue
        if len(y) == 1:
            y.append(x[0])
            continue
        fl = 0
        for i in range(1, len(y)):
            if y[i].split("_")[0] == x[0].split("_")[0]:
                fl = 1
                if len(x[0]) < len(y[i]):
                    # Keep the shorter of two names of the same miRNA.
                    del y[i]
                    y.append(x[0])
                    break
        if fl == 0:
            y.append(x[0])

    # Fold the collected names into y[1], last collected first.
    for y in dupes.values():
        if len(y) > 2:
            for i in range(len(y) - 1, 1, -1):
                y[1] = y[1] + "/" + y[i]
                del y[i]

    for x in LH2E_copy:
        y = dupes.get(x[1])
        if y is not None:
            x[0] = y[1]

    LH2E_copy.sort()
    new.extend([k for k, _ in itertools.groupby(LH2E_copy)])


def ssamples1(tem_names,tem_samp,non_names,non_samp,folder,pro):
    """Write one '<folder><sample name>.txt' file per templated sample.

    Like ``ssamples`` for *tem_samp*; in addition, for every entry of
    *non_names* equal to the sample name, the matching column of *non_samp*
    is appended to the same file.  *pro* is unused.
    """
    for i in range(2, len(tem_samp[0])):
        sample = tem_names[i-2]
        with open(folder + sample + '.txt', 'w') as fp:
            fp.write("miRNA id" + "\t" + sample + "\n")
            for x in tem_samp:
                fp.write("%s" % "\t".join([x[0], x[i]]) + "\n")
            for j in range(len(non_names)):
                if non_names[j] == sample:
                    for x in non_samp:
                        fp.write("%s" % "\t".join([x[0], x[j+2]]) + "\n")


def download_matures(matures,org_name):
    """Download miRBase 'mature.fa.gz' and keep the records of one organism.

    The file is read as alternating header / sequence lines; every pair
    whose header contains *org_name* is appended (header, then sequence) to
    *matures*.  Requires network access.
    """
    # A fixed release can be used instead of CURRENT:
    # url = 'ftp://mirbase.org/pub/mirbase/21/mature.fa.gz'
    url = 'ftp://mirbase.org/pub/mirbase/CURRENT/mature.fa.gz'
    with urllib.request.urlopen(url) as response:
        data = response.read()
    file_mirna = gzip.decompress(data).decode('utf-8')
    file_mirna = file_mirna.split("\n")

    for i in range(0, len(file_mirna) - 1, 2):
        if org_name in file_mirna[i]:
            matures.append(file_mirna[i])
            matures.append(file_mirna[i+1])


def non_template_ref(sc,st,all_isoforms):
    """Append to *all_isoforms* every isoform found in the samples.

    *sc* and *st* are lists of samples, each a list of rows.  For every row
    whose name contains ")_" and whose ">name" entry is not yet present,
    ">name" and the sequence (column 10) are appended to *all_isoforms*.
    """
    # Snapshot both inputs before iterating, as the original did.
    samples = list(sc) + list(st)
    present = set(all_isoforms)
    for sample in samples:
        for y in sample:
            header = ">" + y[2]
            if header not in present and ")_" in y[2]:
                all_isoforms.append(header)
                all_isoforms.append(y[9])
                present.add(header)
                present.add(y[9])
###############################################################################


def deseqe2(sample,mir_names,l,new_d,sample_name,sample_order):
    """Pad one sample with zero counts and publish it under the lock *l*.

    *sample* holds [name, count, sequence] rows and *mir_names*
    [name, sequence] pairs.  Every name of *mir_names* missing from the
    sample is appended to it as [name, "0", sequence]; the sample is then
    sorted by name IN PLACE.  A copy without consecutive duplicate rows is
    appended to *new_d*, and *sample_name* to *sample_order*.
    """
    present = {x[0] for x in sample}
    for y in mir_names:
        if y[0] not in present:
            sample.append([y[0], "0", y[1]])
            present.add(y[0])

    sample.sort(key=lambda x: x[0])
    deduped = [row for row, _ in itertools.groupby(sample)]

    l.acquire()
    try:
        new_d.append(deduped)
        sample_order.append(sample_name)
    finally:
        # Release even if an append fails, so other workers are not blocked.
        l.release()


###############################################################################