Mercurial > repos > glogobyte > isoread
view mirbase_ultra_v2.py @ 0:99e8a03f8802 draft
Uploaded
author | glogobyte |
---|---|
date | Fri, 16 Oct 2020 18:53:05 +0000 |
parents | |
children |
line wrap: on
line source
from mirbase_functions import * from mirbase_graphs import * import itertools import time import sys import os import urllib.request import gzip from multiprocessing import Process, Queue, Lock, Pool, Manager, Value import subprocess import argparse from collections import OrderedDict from matplotlib.backends.backend_pdf import PdfPages import pandas as pd from math import pi import numpy as np import matplotlib.pyplot as plt from matplotlib.ticker import PercentFormatter import seaborn as sns import scipy.stats as stats from plotnine import * import math import re import matplotlib.ticker as mtick import copy subprocess.call(['mkdir','-p', 'split1','split2','split3','split4','split11','split12','Counts','Diff/temp_con','Diff/temp_tre','Diff/n_temp_con','Diff/n_temp_tre']) parser = argparse.ArgumentParser() parser.add_argument("-analysis", "--anal", help="choose type of analysis", action="store") parser.add_argument("-con", "--control", help="input fastq file", nargs='+', default=[]) parser.add_argument("-tre", "--treated", help="input fastq file", nargs='+', default=[] ) parser.add_argument("-tool_dir", "--tool_directory", help="tool directory path", action="store") parser.add_argument("-gen", "--org_name", help="tool directory path", action="store") parser.add_argument("-program", "--pro", help="choose type of analysis", action="store") parser.add_argument("-f", "--flag", help="choose the database", action="store") parser.add_argument("-umis", "--umi", help="choose the database", action="store") parser.add_argument("-percentage", "--per", help="choose the database", action="store") parser.add_argument("-counts", "--count", help="choose the database", action="store") parser.add_argument("-name1", "--n1", help="choose the database", action="store") parser.add_argument("-name2", "--n2", help="choose the database", action="store") args = parser.parse_args() ######################################################################################################################################### def collapse_sam(path): ini_sam=read(path,0) main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]] intro_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" in x.split("\t")[0]] uni_seq = [] for x in main_sam: if [x[2], x[9]] not in uni_seq: uni_seq.append([x[2], x[9]]) new_main_sam=[] incr_num=0 for i in range(len(uni_seq)): count=0 incr_num+=1 for y in main_sam: if uni_seq[i][1]==y[9] and uni_seq[i][0]==y[2]: count+=1 temp=y temp[10]="~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" temp[0]=str(incr_num)+"-"+str(count) new_main_sam.append(temp) new_sam=intro_sam+new_main_sam return new_sam ################################################################################################################################################################################################################# def duplicate_chroms_isoforms(List): dupes=[] for num in range(len(List)): if [List[num][9],List[num][0],List[num][2]] not in dupes : dupes.append([List[num][9],List[num][0],List[num][2]]) for x in List: for y in dupes: if x[9]==y[0] and x[0]==y[1] and x[2].split("_")[0]==y[2].split("_")[0] and x[2]!=y[2]: y.append(x[2]) double_List = [x[:] for x in List] chr_order=[] for x in dupes: temp = [] for i in range(2,len(x)): if x[i].split("chr")[1].split("(")[0].isdigit(): temp.append(int(x[i].split("chr")[1].split("(")[1][0]+x[i].split("chr")[1].split("(")[0])) else: temp.append(x[i].split("chr")[1][0:4]) for z in temp: if 'X(-)'==z or 'Y(-)'==z or 'X(+)'==z or 'Y(+)'==z: temp = [str(j) for j in temp] temp=list(set(temp)) temp.sort() chr_order.append(temp) final_dupes=[] for i in range(len(dupes)): final_dupes.append([dupes[i][0],dupes[i][2].split("_")[0],dupes[i][1]]) for x in chr_order[i]: result = re.match("[-+]?\d+$", str(x)) if len(chr_order[i]) == len(set(chr_order[i])): if result is not None: if int(x)<0: final_dupes[i][1]=final_dupes[i][1]+"_chr"+str(abs(int(x)))+"(-)" else: final_dupes[i][1] = final_dupes[i][1] + "_chr" + str(abs(int(x)))+"(+)" else: final_dupes[i][1] = final_dupes[i][1] + "_chr" + str(x) else: if result is not None: if int(x) < 0: final_dupes[i][1] = final_dupes[i][1] +dupes[i][2].split("_")[1]+ "_chr" + str(abs(int(x))) + "(-)" else: final_dupes[i][1] = final_dupes[i][1] +dupes[i][2].split("_")[1]+ "_chr" + str(abs(int(x))) + "(+)" else: final_dupes[i][1] = final_dupes[i][1] +dupes[i][2].split("_")[1]+ "_chr" + str(x) final_dupes.sort() final_dupes=list(final_dupes for final_dupes,_ in itertools.groupby(final_dupes)) for i in range(len(double_List)): for x in final_dupes: if double_List[i][9] == x[0] and double_List[i][0] == x[2] and len(double_List[i][2].split("_")) >3 and double_List[i][2].split("_")[0]==x[1].split("_")[0]: gg=str("_"+double_List[i][2].split("_")[-2]+"_"+double_List[i][2].split("_")[-1]) double_List[i][2] = x[1]+gg if double_List[i][9]==x[0] and double_List[i][0]== x[2] and len(double_List[i][2].split("_"))==3 and double_List[i][2].split("_")[0]==x[1].split("_")[0]: double_List[i][2]=x[1] List[i][2] = x[1] List.sort() new_list=list(List for List,_ in itertools.groupby(List)) double_List.sort() new_double_List = list(double_List for double_List, _ in itertools.groupby(double_List)) return new_list, new_double_List ############################################################################################################################################################################################################# def sam(mature_mirnas,path,name,con,l,samples,data,names,unmap_seq,samples_mirna_names,deseq,LHE_names,umi,ini_sample,unmap_counts): # read the sam file ini_sam=read(path,0) new_main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]] unique_seq = [x for x in new_main_sam if x[1] == '0' and len(x[9])>=18 and len(x[9])<=26] sorted_uni_arms = [] for i in range(len(mature_mirnas)): tmp_count_reads = 0 # calculate the total number of reads tmp_count_seq = 0 # calculate the total number of sequences for j in range(len(unique_seq)): if "{" in unique_seq[j][2].split("_")[0]: official=unique_seq[j][2].split("_")[0][:-4] else: official=unique_seq[j][2].split("_")[0] if mature_mirnas[i].split(" ")[0][1:] == official: temp_mature = mature_mirnas[i+1].strip().replace("U", "T") off_part = longestSubstring(temp_mature, unique_seq[j][9]) mat_diff = temp_mature.split(off_part) mat_diff = [len(mat_diff[0]), len(mat_diff[1])] unique_diff = unique_seq[j][9].split(off_part) unique_diff = [len(unique_diff[0]), len(unique_diff[1])] # Problem with hsa-miR-8485 if mat_diff[1]!=0 and unique_diff[1]!=0: unique_seq[j]=1 pre_pos = 0 post_pos = 0 elif mat_diff[0]!=0 and unique_diff[0]!=0: unique_seq[j]=1 pre_pos = 0 post_pos = 0 else: pre_pos = mat_diff[0]-unique_diff[0] post_pos = unique_diff[1]-mat_diff[1] tmp_count_reads = tmp_count_reads + int(unique_seq[j][0].split("-")[1]) tmp_count_seq = tmp_count_seq+1 if pre_pos != 0 or post_pos != 0: if pre_pos == 0: unique_seq[j][2] = unique_seq[j][2] + "_" +str(pre_pos) + "_" + '{:+d}'.format(post_pos) elif post_pos == 0: unique_seq[j][2] = unique_seq[j][2] + "_" + '{:+d}'.format(pre_pos) + "_" + str(post_pos) else: unique_seq[j][2] = unique_seq[j][2]+"_"+'{:+d}'.format(pre_pos)+"_"+'{:+d}'.format(post_pos) for x in range(unique_seq.count(1)): unique_seq.remove(1) if tmp_count_reads != 0 and tmp_count_seq != 0: sorted_uni_arms.append([mature_mirnas[i].split(" ")[0][1:], tmp_count_seq, tmp_count_reads]) sorted_uni_arms = sorted(sorted_uni_arms, key=lambda x: x[1], reverse=True) dedup_unique_seq,double_fil_uni_seq=duplicate_chroms_isoforms(unique_seq) for y in sorted_uni_arms: counts=0 seqs=0 for x in double_fil_uni_seq: if y[0]==x[2].split("_")[0]: counts+=int(x[0].split("-")[1]) seqs+=1 y[1]=seqs y[2]=counts LHE=[] l.acquire() if con=="c": LHE.extend(z[2] for z in double_fil_uni_seq) for y in double_fil_uni_seq: samples_mirna_names.append([y[2],y[9]]) deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in double_fil_uni_seq]) LHE_names.extend(LHE) unmap_seq.value += sum([1 for x in new_main_sam if x[1] == '4']) unmap_counts.value += sum([int(x[0].split("-")[1]) for x in new_main_sam if x[1] == '4']) names.append(name) samples.append(dedup_unique_seq) data.append([con,name,double_fil_uni_seq,sorted_uni_arms]) ini_sample.append(new_main_sam) if con=="t": LHE.extend(z[2] for z in double_fil_uni_seq) for y in double_fil_uni_seq: samples_mirna_names.append([y[2],y[9]]) deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in double_fil_uni_seq]) LHE_names.extend(LHE) unmap_seq.value += sum([1 for x in new_main_sam if x[1] == '4']) unmap_counts.value += sum([int(x[0].split("-")[1]) for x in new_main_sam if x[1] == '4']) names.append(name) samples.append(dedup_unique_seq) data.append([con,name,double_fil_uni_seq,sorted_uni_arms]) ini_sample.append(new_main_sam) l.release() ###################################################################################################################################### """ Read a sam file from Bowtie and do the followings: 1) Remove reverse stranded mapped reads 2) Remove unmapped reads 3) Remove all sequences with reads less than 11 reads 4) Sort the arms with the most sequences in decreading rate 5) Sort the sequences of every arm with the most reads in decreasing rate 6) Calculate total number of sequences of every arm 7) Calculate total number of reads of sequences of every arm. 8) Store all the informations in a txt file """ def non_sam(mature_mirnas,path,name,con,l,data,names,n_deseq,n_samples_mirna_names,n_LHE_names): ini_sam=read(path,0) new_main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]] unique_seq=[] unique_seq = [x for x in new_main_sam if x[1] == '4' and len(x[9])>=18 and len(x[9])<=26] uni_seq=[] # Calculate the shifted positions for every isomir and add them to the name of it sorted_uni_arms = [] for i in range(1,len(mature_mirnas),2): tmp_count_reads = 0 # calculate the total number of reads tmp_count_seq = 0 # calculate the total number of sequences for j in range(len(unique_seq)): temp_mature = mature_mirnas[i].strip().replace("U", "T") if temp_mature in unique_seq[j][9]: off_part = longestSubstring(temp_mature, unique_seq[j][9]) mat_diff = temp_mature.split(off_part) mat_diff = [len(mat_diff[0]), len(mat_diff[1])] unique_diff = unique_seq[j][9].split(off_part) if len(unique_diff)<=2: unique_diff = [len(unique_diff[0]), len(unique_diff[1])] pre_pos = mat_diff[0]-unique_diff[0] post_pos = unique_diff[1]-mat_diff[1] lengthofmir = len(off_part) + post_pos if pre_pos == 0: tmp_count_reads = tmp_count_reads + int(unique_seq[j][0].split("-")[1]) tmp_count_seq = tmp_count_seq + 1 if pre_pos == 0: t_name=unique_seq[j].copy() t_name[2]=mature_mirnas[i - 1].split(" ")[0][1:] + "__" + str(pre_pos) + "_" + '{:+d}'.format(post_pos) + "_" + str(unique_seq[j][9][len(off_part):]) uni_seq.append(t_name) if tmp_count_reads != 0 and tmp_count_seq != 0: sorted_uni_arms.append([mature_mirnas[i-1].split(" ")[0][1:], tmp_count_seq, tmp_count_reads]) sorted_uni_arms = sorted(sorted_uni_arms, key=lambda x: x[1], reverse=True) unique_seq = list(map(list, OrderedDict.fromkeys(map(tuple,uni_seq)))) LHE=[] l.acquire() if con=="c": LHE.extend(x[2] for x in unique_seq if x[2]!="*") for x in unique_seq: if x[2]!="*": n_samples_mirna_names.append([x[2],x[9]]) n_deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in unique_seq if x[2]!="*"]) n_LHE_names.extend(LHE) names.append(name) data.append([con,name,unique_seq,sorted_uni_arms]) if con=="t": LHE.extend(x[2] for x in unique_seq if x[2]!="*") for x in unique_seq: if x[2]!="*": n_samples_mirna_names.append([x[2],x[9]]) n_deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in unique_seq if x[2]!="*"]) n_LHE_names.extend(LHE) names.append(name) data.append([con,name,unique_seq,sorted_uni_arms]) l.release() ##################################################################################################################################################################################################################### def deseq2_temp(samples_mirna_names,deseq,con,l): samples_mirna_names.sort(key=lambda x:[0]) for i in range(len(deseq)): for y in samples_mirna_names: flag = 0 for x in deseq[i]: if y[0] == x[0]: flag = 1 break if flag == 0: deseq[i].append([y[0], "0", y[1]]) [deseq[i].sort(key=lambda x: x[0]) for i, _ in enumerate(deseq)] deseq_final = [[x[0],x[2]] for x in deseq[0]] [deseq_final[z].append(deseq[i][j][1]) for z,_ in enumerate(deseq_final) for i, _ in enumerate(deseq) for j,_ in enumerate(deseq[i]) if deseq_final[z][0] == deseq[i][j][0]] l.acquire() if con=="c": q1.put(deseq_final) if con=="t": q2.put(deseq_final) l.release() #################################################################################################################################################################################################################### def main_temp(LH2E, LH2E_names, LH8E, LH8E_names,flag,names_con,names_tre,filter_LH8E,filter_LH2E,raw_LH8E,raw_LH2E): LH8E_add_names = [x for x in LH2E_names if x not in LH8E_names] LH2E_add_names = [x for x in LH8E_names if x not in LH2E_names] LH8E_add_names.sort() LH2E_add_names.sort() LH8E_add_names = list(LH8E_add_names for LH8E_add_names,_ in itertools.groupby(LH8E_add_names)) LH2E_add_names = list(LH2E_add_names for LH2E_add_names,_ in itertools.groupby(LH2E_add_names)) LH2E.sort() LH8E.sort() LH2E = list(LH2E for LH2E,_ in itertools.groupby(LH2E)) LH8E = list(LH8E for LH8E,_ in itertools.groupby(LH8E)) print("LHE_names") print([len(LH8E_add_names),len(LH2E_add_names)]) print([len(LH8E),len(LH2E)]) zeros=["0"]*(len(LH8E[0])-2) [LH8E_add_names[i].extend(zeros) for i,_ in enumerate(LH8E_add_names)] LH8E=LH8E+LH8E_add_names zeros=["0"]*(len(LH2E[0])-2) [LH2E_add_names[i].extend(zeros) for i,_ in enumerate(LH2E_add_names)] LH2E=LH2E+LH2E_add_names dupes=[] final_LH2E =[] for num,_ in enumerate(LH2E): if LH2E[num][1] not in final_LH2E and LH2E[num][0] not in final_LH2E: final_LH2E.append(LH2E[num][1]) final_LH2E.append(LH2E[num][0]) else: dupes.append(LH2E[num][1]) dupes=list(set(dupes)) dupes=[[x] for x in dupes] for x in LH2E: for y in dupes: if x[1]==y[0]: fl=0 if len(y)==1: y.append(x[0]) else: for i in range(1,len(y)): if y[i].split("_")[0]==x[0].split("_")[0]: fl=1 if len(x[0])<len(y[i]): del y[i] y.append(x[0]) break if fl==0: y.append((x[0])) for y in dupes: if len(y)>2: for i in range(len(y)-1,1,-1): y[1]=y[1]+"/"+y[i] del y[i] for x in LH2E: for y in dupes: if x[1]==y[0]: x[0]=y[1] for x in LH8E: for y in dupes: if x[1]==y[0]: x[0]=y[1] LH2E.sort() LH2E=list(LH2E for LH2E,_ in itertools.groupby(LH2E)) LH8E.sort() LH8E=list(LH8E for LH8E,_ in itertools.groupby(LH8E)) LH8E_new=[] LH2E_new=[] if int(args.per)!=-1: percent=int(args.per)/100 print(percent) print(args.count) c_col_filter=round(percent*(len(LH2E[1])-2)) t_col_filter=round(percent*(len(LH8E[1])-2)) for i, _ in enumerate(LH2E): c_cols=0 t_cols=0 c_cols=sum([1 for j in range(len(LH2E[i])-2) if int(LH2E[i][j+2])>=int(args.count)]) t_cols=sum([1 for j in range(len(LH8E[i])-2) if int(LH8E[i][j+2])>=int(args.count)]) if c_cols>=c_col_filter or t_cols>=t_col_filter: LH8E_new.append(LH8E[i]) LH2E_new.append(LH2E[i]) filter_LH2E.extend(LH2E_new) filter_LH8E.extend(LH8E_new) raw_LH2E.extend(LH2E) raw_LH8E.extend(LH8E) ################################################################################################################################################################################################################## def write_main(raw_LH2E, raw_LH8E, fil_LH2E, fil_LH8E, names_con, names_tre, flag): if flag == 1 and int(args.per)!=-1: fp = open('Counts/Filtered '+args.n2 +' Templated Counts', 'w') fp.write("Name\t") fp.write("Sequence") for y in names_tre: fp.write("\t"+y) for x in fil_LH8E: fp.write("\n%s" % "\t".join(x)) fp.close() fp = open('Counts/Filtered '+args.n1+' Templated Counts', 'w') fp.write("Name\t") fp.write("Sequence") for y in names_con: fp.write("\t"+y) for x in fil_LH2E: fp.write("\n%s" % "\t".join(x)) fp.close() if flag == 2 and int(args.per)!=-1: fp = open('Counts/Filtered '+args.n2+' Non-Templated Counts', 'w') fp.write("Name\t") fp.write("Sequence") for y in names_tre: fp.write("\t"+y) for x in fil_LH8E: fp.write("\n%s" % "\t".join(x)) fp.close() fp = open('Counts/Filtered '+args.n1+' Non-Templated Counts', 'w') fp.write("Name\t") fp.write("Sequence") for y in names_con: fp.write("\t"+y) for x in fil_LH2E: fp.write("\n%s" % "\t".join(x)) fp.close() if flag == 1: fp = open('Counts/Raw '+args.n2+' Templated Counts', 'w') fp.write("Name\t") fp.write("Sequence") for y in names_tre: fp.write("\t"+y) for x in raw_LH8E: fp.write("\n%s" % "\t".join(x)) fp.close() fp = open('Counts/Raw '+args.n1+' Templated Counts', 'w') fp.write("Name\t") fp.write("Sequence") for y in names_con: fp.write("\t"+y) for x in raw_LH2E: fp.write("\n%s" % "\t".join(x)) fp.close() if flag == 2: fp = open('Counts/Raw '+args.n2+' Non-Templated Counts', 'w') fp.write("Name\t") fp.write("Sequence") for y in names_tre: fp.write("\t"+y) for x in raw_LH8E: fp.write("\n%s" % "\t".join(x)) fp.close() fp = open('Counts/Raw '+args.n1+' Non-Templated Counts', 'w') fp.write("Name\t") fp.write("Sequence") for y in names_con: fp.write("\t"+y) for x in raw_LH2E: fp.write("\n%s" % "\t".join(x)) fp.close() ######################################################################################################################################### def ssamples(names,samp,folder,pro): for i in range(2,len(samp[0])): fp = open(folder+names[i-2]+'.txt','w') fp.write("miRNA id"+"\t"+names[i-2]+"\n") for x in samp: fp.write("%s" % "\t".join([x[0],x[i]])+"\n") fp.close() ################################################################################################################## def DB_write(con,name,unique_seq,sorted_uni_arms,f): if f==1: # Write a txt file with all the information if con=="c": fp = open('split1/'+name, 'w') fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence")) if con=="t": fp = open('split2/'+name, 'w') fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence")) for i in range(len(sorted_uni_arms)): temp = [] for j in range(len(unique_seq)): if sorted_uni_arms[i][0] in unique_seq[j][2].split("_")[0]: temp.append(unique_seq[j]) temp = sorted(temp, key=lambda x: int(x[0].split('-')[1]), reverse=True) fp.write("*********************************************************************************************************\n") fp.write("%-8s\t%-22s\t%-25s\t%-30s\t%s\n" % ("|",str(sorted_uni_arms[i][0]),"Sequence count = "+str(sorted_uni_arms[i][1]),"Total reads = "+str(sorted_uni_arms[i][2]),"|")) fp.write("*********************************************************************************************************\n\n") [fp.write("%-8s\t%-40s\t%s\n" % (x[0].split("-")[1], x[2],x[9])) for x in temp] fp.write("\n" + "\n") fp.close() if f==2: if con=="c": fp = open('split3/'+name, 'w') fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence")) if con=="t": fp = open('split4/'+name, 'w') fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence")) for i in range(len(sorted_uni_arms)): temp = [] for j in range(len(unique_seq)): if sorted_uni_arms[i][0]==unique_seq[j][2].split("__")[0]: temp.append(unique_seq[j]) if temp!=[]: temp = sorted(temp, key=lambda x: int(x[0].split('-')[1]), reverse=True) fp.write("*********************************************************************************************************\n") fp.write("%-8s\t%-22s\t%-25s\t%-30s\t%s\n" % ("|",str(sorted_uni_arms[i][0]),"Sequence count = "+str(sorted_uni_arms[i][1]),"Total reads = "+str(sorted_uni_arms[i][2]),"|")) fp.write("*********************************************************************************************************\n\n") [fp.write("%-8s\t%-40s\t%s\n" % (x[0].split("-")[1], x[2],x[9])) for x in temp] fp.write("\n" + "\n") fp.close() ########################################################################################################################## def new_mat_seq(pre_unique_seq,mat_mirnas,l): unique_iso = [] for x in pre_unique_seq: if len(x[2].split("_"))==3: for y in pre_unique_seq: if x[2] in y[2] and int(x[0].split("-")[1])<int(y[0].split("-")[1]): if any(y[2] in lst2 for lst2 in unique_iso)==False: y[2]=">"+y[2] unique_iso.append(y) l.acquire() for x in unique_iso: mat_mirnas.append(x[2]) mat_mirnas.append(x[9]) l.release() ######################################################################################################################### def pie_non_temp(merge_LH2E,merge_non_LH2E,merge_LH8E,merge_non_LH8E,c_unmap,t_unmap,c_unmap_counts,t_unmap_counts): c_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_LH2E] t_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_LH8E] c_non_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_non_LH2E] t_non_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_non_LH8E] c_templ = 0 c_tem_counts = 0 c_mature = 0 c_mat_counts = 0 t_templ = 0 t_tem_counts = 0 t_mature = 0 t_mat_counts = 0 c_non = len(c_non_samples) c_non_counts = sum(x[2] for x in c_non_samples) t_non = len(t_non_samples) t_non_counts = sum(x[2] for x in t_non_samples) c_unmap = c_unmap - c_non t_unmap = c_unmap - t_non c_unmap_counts=c_unmap_counts - c_non_counts t_unmap_counts=t_unmap_counts - t_non_counts for x in c_samples: if "/" not in x[0]: if "chr" in x[0].split("_")[-1]: c_mature+=1 c_mat_counts += x[2] else: c_templ+=1 c_tem_counts += x[2] else: f=0 for y in x[0].split("/"): if "chr" in y.split("_")[-1]: c_mature+=1 c_mat_counts += x[2] f=1 break if f==0: c_templ+=1 c_tem_counts += x[2] for x in t_samples: if "/" not in x[0]: if "chr" in x[0].split("_")[-1]: t_mature+=1 t_mat_counts += x[2] else: t_templ+=1 t_tem_counts += x[2] else: f=0 for y in x[0].split("/"): if "chr" in y.split("_")[-1]: t_mature+=1 t_mat_counts += x[2] f=1 break if f==0: t_templ+=1 t_tem_counts += x[2] fig = plt.figure(figsize=(7,5)) labels = 'miRNA RefSeq','Template', 'Unmapped','Non-template' sizes = [c_mat_counts, c_tem_counts, c_unmap_counts,c_non_counts] colors = ['gold', 'yellowgreen', 'lightcoral', 'lightskyblue'] ax1 = plt.subplot2grid((1,2),(0,0)) patches, texts, autotexts=plt.pie(sizes, labels=labels, colors=colors, startangle=140,autopct='%1.1f%%',radius=0.8) [x.set_fontsize(8) for x in texts] plt.title('Control Group (reads)',fontsize=12) labels = 'miRNA RefSeq','Template', 'Unmapped','non-template' sizes = [t_mat_counts, t_tem_counts, t_unmap_counts, t_non_counts] colors = ['gold', 'yellowgreen', 'lightcoral', 'lightskyblue'] ax2 = plt.subplot2grid((1,2),(0,1)) patches, texts, autotexts=plt.pie(sizes, labels=labels, colors=colors, startangle=140,autopct='%1.1f%%',radius=0.8) [x.set_fontsize(8) for x in texts] plt.title('Treated Group (reads)', fontsize=12) plt.savefig('pie_non.png',dpi=300) ###################################################################################################################################################### def merging_names(LH2E_copy,new): dupes=[] final_LH2E =[] for num in range(len(LH2E_copy)): if LH2E_copy[num][1] not in final_LH2E and LH2E_copy[num][0] not in final_LH2E: final_LH2E.append(LH2E_copy[num][1]) final_LH2E.append(LH2E_copy[num][0]) else: dupes.append(LH2E_copy[num][1]) dupes=list(set(dupes)) for i in range(len(dupes)): dupes[i]=[dupes[i]] for x in LH2E_copy: for y in dupes: if x[1]==y[0]: fl=0 if len(y)==1: y.append(x[0]) else: for i in range(1,len(y)): if y[i].split("_")[0]==x[0].split("_")[0]: fl=1 if len(x[0])<len(y[i]): del y[i] y.append(x[0]) break if fl==0: y.append((x[0])) for y in dupes: if len(y)>2: for i in range(len(y)-1,1,-1): y[1]=y[1]+"/"+y[i] del y[i] for x in LH2E_copy: for y in dupes: if x[1]==y[0]: x[0]=y[1] LH2E_copy.sort() LH2E_copy=list(LH2E_copy for LH2E_copy,_ in itertools.groupby(LH2E_copy)) new.extend(LH2E_copy) ###################################################################################################################################################### def pie_temp(merge_LH2E,c_unmap,c_unmap_counts,merge_LH8E,t_unmap,t_unmap_counts): c_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_LH2E] t_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_LH8E] c_templ = 0 c_tem_counts = 0 c_mature = 0 c_mat_counts = 0 t_templ = 0 t_tem_counts = 0 t_mature = 0 t_mat_counts = 0 for x in c_samples: if "/" not in x[0]: if "chr" in x[0].split("_")[-1]: c_mature+=1 c_mat_counts += x[2] else: c_templ+=1 c_tem_counts += x[2] else: f=0 for y in x[0].split("/"): if "chr" in y.split("_")[-1]: c_mature+=1 c_mat_counts += x[2] f=1 break if f==0: c_templ+=1 c_tem_counts += x[2] for x in t_samples: if "/" not in x[0]: if "chr" in x[0].split("_")[-1]: t_mature+=1 t_mat_counts += x[2] else: t_templ+=1 t_tem_counts += x[2] else: f=0 for y in x[0].split("/"): if "chr" in y.split("_")[-1]: t_mature+=1 t_mat_counts += x[2] f=1 break if f==0: t_templ+=1 t_tem_counts += x[2] fig = plt.figure() labels = 'miRNA RefSeq','Template', 'Unmapped' sizes = [c_mat_counts, c_tem_counts, c_unmap_counts] colors = ['gold', 'yellowgreen', 'lightskyblue'] explode = (0.2, 0.05, 0.1) ax1 = plt.subplot2grid((1,2),(0,0)) patches, texts, autotexts=plt.pie(sizes, labels=labels, colors=colors, startangle=140,autopct='%1.1f%%',radius=0.8) [x.set_fontsize(8) for x in texts] plt.title('Control group (reads)', fontsize=12) labels = 'miRNA RefSeq','Template', 'Unmapped' sizes = [t_mat_counts, t_tem_counts, t_unmap_counts] colors = ['gold', 'yellowgreen', 'lightskyblue'] explode = (0.2, 0.05, 0.1) ax2 = plt.subplot2grid((1,2),(0,1)) patches, texts, autotexts=plt.pie(sizes, labels=labels, colors=colors, startangle=140,autopct='%1.1f%%',radius=0.8) [x.set_fontsize(8) for x in texts] plt.title('Treated group (reads)',fontsize = 12) plt.savefig('pie_tem.png',dpi=300) ################################################################################################################################################################################################################### def make_spider(merge_LH2E,merge_LH8E): c_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_LH2E] t_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_LH8E] c_5 = 0 c_5_counts = 0 c_3 = 0 c_3_counts = 0 c_both =0 c_both_counts=0 c_mature = 0 c_mat_counts = 0 c_exception=0 c_exception_counts=0 t_5 = 0 t_5_counts = 0 t_3 = 0 t_3_counts = 0 t_both = 0 t_both_counts = 0 t_mature = 0 t_mat_counts = 0 t_exception = 0 t_exception_counts=0 for x in c_samples: if "/" not in x[0]: if "chr" in x[0].split("_")[-1]: c_mature+=1 c_mat_counts += x[2] elif 0 == int(x[0].split("_")[-1]): c_5+=1 c_5_counts += x[2] elif 0 == int(x[0].split("_")[-2]): c_3+=1 c_3_counts += x[2] else: c_both+=1 c_both_counts+=x[2] else: f=0 for y in x[0].split("/"): if "chr" in y.split("_")[-1]: c_mature+=1 c_mat_counts += x[2] f=1 break if f==0: for y in x[0].split("/"): c_exception+=1 c_exception_counts += x[2] for x in t_samples: if "/" not in x[0]: if "chr" in x[0].split("_")[-1]: t_mature+=1 t_mat_counts += x[2] elif 0 == int(x[0].split("_")[-1]): t_5+=1 t_5_counts += x[2] elif 0 == int(x[0].split("_")[-2]): t_3+=1 t_3_counts += x[2] else: t_both+=1 t_both_counts+=x[2] else: f=0 for y in x[0].split("/"): if "chr" in y.split("_")[-1]: t_mature+=1 t_mat_counts += x[2] f=1 break if f==0: for y in x[0].split("/"): t_exception+=1 t_exception_counts += x[2] c_all = c_5+c_3+c_both+c_mature+c_exception c_all_counts = c_5_counts + c_3_counts + c_both_counts + c_mat_counts + c_exception_counts t_all = t_5+t_3+t_both+t_mature + t_exception t_all_counts = t_5_counts + t_3_counts + t_both_counts + t_mat_counts + t_exception_counts c_5 = round(c_5/c_all*100,2) c_3 = round(c_3/c_all*100,2) c_both = round(c_both/c_all*100,2) c_mature = round(c_mature/c_all*100,2) c_exception = round(c_exception/c_all*100,2) c_5_counts = round(c_5_counts/c_all_counts*100,2) c_3_counts = round(c_3_counts/c_all_counts*100,2) c_both_counts = round(c_both_counts/c_all_counts*100,2) c_mat_counts = round(c_mat_counts/c_all_counts*100,2) c_exception_counts = round(c_exception_counts/c_all_counts*100,2) t_5 = round(t_5/t_all*100,2) t_3 = round(t_3/t_all*100,2) t_both = round(t_both/t_all*100,2) t_mature = round(t_mature/t_all*100,2) t_exception = round(t_exception/t_all*100,2) t_5_counts = round(t_5_counts/t_all_counts*100,2) t_3_counts = round(t_3_counts/t_all_counts*100,2) t_both_counts = round(t_both_counts/t_all_counts*100,2) t_mat_counts = round(t_mat_counts/t_all_counts*100,2) t_exception_counts = round(t_exception_counts/t_all_counts*100,2) radar_max = max(c_5, c_3, c_both,c_mature,c_exception,t_5,t_3,t_both,t_mature,t_exception) radar_max_counts = max(c_5_counts,c_3_counts,c_both_counts,c_mat_counts,c_exception_counts,t_5_counts,t_3_counts,t_both_counts,t_mat_counts,t_exception_counts) df=pd.DataFrame({ 'group':['Controls','Treated'], """5' and 3' isomiRs""":[c_both,t_both], """3' isomiRs""":[c_3,t_3], 'miRNA RefSeq':[c_mature,t_mature], """5' isomiRs""":[c_5,t_5], 'Others*':[c_exception,t_exception]}) df1=pd.DataFrame({ 'group':['Controls','Treated'], """5' and 3' isomiRs""":[c_both_counts,t_both_counts], """3' isomiRs""":[c_3_counts,t_3_counts], 'miRNA RefSeq':[c_mat_counts,t_mat_counts], """5' isomiRs""":[c_5_counts,t_5_counts], 'Others*':[c_exception_counts,t_exception_counts]}) spider_last(df,radar_max,1) spider_last(df1,radar_max_counts,2) def spider_last(df,radar_max,flag): # ------- PART 1: Create background fig = plt.figure() # number of variable categories=list(df)[1:] N = len(categories) # What will be the angle of each axis in the plot? (we divide the plot / number of variable) angles = [n / float(N) * 2 * pi for n in range(N)] angles += angles[:1] # Initialise the spider plot ax = plt.subplot(111, polar=True) # If you want the first axis to be on top: ax.set_theta_offset(pi/2) ax.set_theta_direction(-1) # Draw one axe per variable + add labels labels yet plt.xticks(angles[:-1], categories, fontsize=11) # Draw ylabels radar_max=round(radar_max+radar_max*0.1) mul=len(str(radar_max))-1 maxi=int(math.ceil(radar_max / pow(10,mul))) * pow(10,mul) sep = round(maxi/4) plt.yticks([sep, 2*sep, 3*sep, 4*sep, 5*sep], [str(sep)+'%', str(2*sep)+'%', str(3*sep)+'%', str(4*sep)+'%', str(5*sep)+'%'], color="grey", size=10) plt.ylim(0, maxi) # ------- PART 2: Add plots # Plot each individual = each line of the data # I don't do a loop, because plotting more than 3 groups makes the chart unreadable # Ind1 values=df.loc[0].drop('group').values.flatten().tolist() values += values[:1] ax.plot(angles, values,'-o', linewidth=1, linestyle='solid', label="Controls") ax.fill(angles, values, 'b', alpha=0.1) # Ind2 values=df.loc[1].drop('group').values.flatten().tolist() values += values[:1] ax.plot(angles, values, '-o' ,linewidth=1, linestyle='solid', label="Treated") ax.fill(angles, values, 'r', alpha=0.1) # Add legend if flag==1: plt.legend(loc='upper right', bbox_to_anchor=(0.0, 0.1)) plt.savefig('spider_non_red.png',dpi=300) else: plt.legend(loc='upper right', bbox_to_anchor=(0.0, 0.1)) plt.savefig('spider_red.png',dpi=300) ############################################################################################################################################################################################################# def hist_red(samples,flag): lengths=[] cat=[] total_reads=0 seq=[] if flag == "c": title = "Length Distribution of Control group (Redudant reads)" if flag == "t": title = "Length Distribution of Treated group (Redudant reads)" for i in samples: for x in i: lengths.append(len(x[9])) if x[1]=="0": seq.append([x[9],x[0].split("-")[1],"Mapped"]) cat.append("Mapped") if x[1] == "4": seq.append([x[9],x[0].split("-")[1],"Unmapped"]) cat.append("Unmapped") uni_len=list(set(lengths)) uni_len=[x for x in uni_len if x<=35] low=min(lengths) up=max(lengths) seq.sort() uni_seq=list(seq for seq,_ in itertools.groupby(seq)) dim=up-low if dim>20: s=5 else: s=8 total_reads+=sum([int(x[1]) for x in uni_seq]) map_reads=[] unmap_reads=[] length=[] for y in uni_len: map_temp=0 unmap_temp=0 for x in uni_seq: if len(x[0])==y and x[2]=="Mapped": map_temp+=int(x[1]) if len(x[0])==y and x[2]=="Unmapped": unmap_temp+=int(x[1]) if y<=35: length.append(y) map_reads.append(round(map_temp/total_reads*100,2)) unmap_reads.append(round(unmap_temp/total_reads*100,2)) ylim=max([sum(x) for x in zip(unmap_reads, map_reads)]) ylim=ylim+ylim*20/100 fig, ax = plt.subplots() width=0.8 ax.bar(length, unmap_reads, width, label='Unmapped') h=ax.bar(length, map_reads, width, bottom=unmap_reads, label='Mapped') plt.xticks(np.arange(length[0], length[-1]+1, 1)) plt.xlabel('Length (nt)',fontsize=14) plt.ylabel('Percentage',fontsize=14) plt.title(title,fontsize=14) ax.legend() plt.ylim([0, ylim]) ax.grid(axis='y',linewidth=0.2) if flag=='c': plt.savefig('c_hist_red.png',dpi=300) if flag=='t': plt.savefig('t_hist_red.png',dpi=300) ################################################################################################################# def logo_seq_red(merge, flag): if flag=="c": titlos="Control group (Redundant)" file_logo="c_logo.png" file_bar="c_bar.png" if flag=="t": titlos="Treated group (Redundant)" file_logo="t_logo.png" file_bar="t_bar.png" c_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge] A=[0]*5 C=[0]*5 G=[0]*5 T=[0]*5 total_reads=0 for y in c_samples: if "/" in y[0]: length=[] for x in y[0].split("/"): length.append([len(x.split("_")[-1]),x.split("_")[-1],y[2]]) best=min(length) total_reads+=best[2] for i in range(5): if i<len(best[1]): if best[1][i] == "A": A[i]+=best[2] elif best[1][i] == "C": C[i]+=best[2] elif best[1][i] == "G": G[i]+=best[2] else: T[i]+=best[2] else: total_reads+=y[2] for i in range(5): if i<len(y[0].split("_")[-1]): if y[0].split("_")[-1][i] == "A": A[i]+=(y[2]) elif y[0].split("_")[-1][i] == "C": C[i]+=(y[2]) elif y[0].split("_")[-1][i] == "G": G[i]+=(y[2]) else: T[i]+=y[2] A[:] = [round(x*100,1) / total_reads for x in A] C[:] = [round(x*100,1) / total_reads for x in C] G[:] = [round(x*100,1) / total_reads for x in G] T[:] = [round(x*100,1) / total_reads for x in T] data = {'A':A,'C':C,'G':G,'T':T} df = pd.DataFrame(data, index=[1,2,3,4,5]) h=df.plot.bar(color=tuple(["g", "b","gold","r"]) ) h.grid(axis='y',linewidth=0.2) plt.xticks(rotation=0, ha="right") plt.ylabel("Counts (%)",fontsize=18) plt.xlabel("Positions (nt)",fontsize=18) plt.title(titlos,fontsize=20) plt.tight_layout() plt.savefig(file_bar, dpi=300) import logomaker as lm crp_logo = lm.Logo(df, font_name = 'DejaVu Sans') crp_logo.style_spines(visible=False) crp_logo.style_spines(spines=['left', 'bottom'], visible=True) crp_logo.style_xticks(rotation=0, fmt='%d', anchor=0) # style using Axes methods crp_logo.ax.set_title(titlos,fontsize=18) crp_logo.ax.set_ylabel("Counts (%)", fontsize=16,labelpad=5) crp_logo.ax.set_xlabel("Positions (nt)",fontsize=16, labelpad=5) crp_logo.ax.xaxis.set_ticks_position('none') crp_logo.ax.xaxis.set_tick_params(pad=-1) figure = plt.gcf() figure.set_size_inches(6, 4) crp_logo.fig.savefig(file_logo,dpi=300) ########################################################################################################################################################################################################## def logo_seq_non_red(merge_LH2E): c_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_LH2E] A=[0]*5 C=[0]*5 G=[0]*5 T=[0]*5 for y in c_samples: if "/" in y[0]: length=[] for x in y[0].split("/"): length.append([len(x.split("_")[-1]),x.split("_")[-1],y[2]]) best=min(length) for i in range(5): if i<len(best[1]): if best[1][i] == "A": A[i]+=1 elif best[1][i] == "C": C[i]+=1 elif best[1][i] == "G": G[i]+=1 else: T[i]+=1 else: for i in range(5): if i<len(y[0].split("_")[-1]): if y[0].split("_")[-1][i] == "A": A[i]+=1 elif y[0].split("_")[-1][i] == "C": C[i]+=1 elif y[0].split("_")[-1][i] == "G": G[i]+=1 else: T[i]+=1 data = {'A':A,'C':C,'G':G,'T':T} df = pd.DataFrame(data, index=[1,2,3,4,5]) h=df.plot.bar(title="Non-templated nucleotides after templated sequence",color=tuple(["g", "b","gold","r"])) h.set_xlabel("Positions (nt)") h.set_ylabel("Unique sequences") plt.xticks(rotation=0, ha="right") plt.tight_layout() plt.savefig("bar2.png", dpi=300) import logomaker as lm crp_logo = lm.Logo(df, font_name = 'DejaVu Sans') # style using Logo methods crp_logo.style_spines(visible=False) crp_logo.style_spines(spines=['left', 'bottom'], visible=True) crp_logo.style_xticks(rotation=0, fmt='%d', anchor=0) # style using Axes methods crp_logo.ax.set_ylabel("Unique sequences", labelpad=5) crp_logo.ax.set_xlabel("Positions (nt)", labelpad=5) crp_logo.ax.xaxis.set_ticks_position('none') crp_logo.ax.xaxis.set_tick_params(pad=-1) crp_logo.ax.set_title("Non-redundant") figure = plt.gcf() crp_logo.fig.savefig('logo2.png', dpi=300) ################################################################################################################################################################################################################### def ssamples1(tem_names,tem_samp,non_names,non_samp,folder,pro): for i in range(2,len(tem_samp[0])): fp = open(folder+tem_names[i-2]+'.txt','w') fp.write("miRNA id"+"\t"+tem_names[i-2]+"\n") for x in tem_samp: fp.write("%s" % "\t".join([x[0],x[i]])+"\n") for j in range(len(non_names)): if non_names[j]==tem_names[i-2]: for x in non_samp: fp.write("%s" % "\t".join([x[0],x[j+2]])+"\n") fp.close() ################################################################################################################################################################################################################### def download_matures(matures,org_name): #url = 'ftp://mirbase.org/pub/mirbase/21/mature.fa.gz' url = 'ftp://mirbase.org/pub/mirbase/CURRENT/mature.fa.gz' data = urllib.request.urlopen(url).read() file_mirna = gzip.decompress(data).decode('utf-8') file_mirna = file_mirna.split("\n") for i in range(0,len(file_mirna)-1,2): if org_name in file_mirna[i]: matures.append(file_mirna[i]) matures.append(file_mirna[i+1]) ################################################################################################################################################################################################################### def non_template_ref(sc,st,all_isoforms): pre_uni_seq_con = list(sc) pre_uni_seq_tre = list(st) for x in pre_uni_seq_con: for y in x: if ">"+y[2] not in all_isoforms and ")_" in y[2] : all_isoforms.append(">"+y[2]) all_isoforms.append(y[9]) for x in pre_uni_seq_tre: for y in x: if ">"+y[2] not in all_isoforms and ")_" in y[2]: all_isoforms.append(">"+y[2]) all_isoforms.append(y[9]) ################################################################################################################################################################################################ def deseqe2(sample,mir_names,l,new_d,sample_name,sample_order): for y in mir_names: flag=0 for x in sample: if y[0]==x[0]: flag=1 break if flag==0: sample.append([y[0],"0",y[1]]) sample.sort(key=lambda x: x[0]) sample=list(sample for sample,_ in itertools.groupby(sample)) l.acquire() new_d.append(sample) sample_order.append(sample_name) l.release() ############################################################################################################################################################################################### if __name__ == '__main__': starttime = time.time() q1 = Queue() q2 = Queue() lock = Lock() manager = Manager() mature_mirnas=manager.list() ps_mature=Process(target=download_matures,args=(mature_mirnas,args.org_name)) ps_mature.start() args.control[0]=args.control[0][1:] args.control[len(args.control)-1][:-1] control = [(args.control[i:i+2]) for i in range(0, len(args.control), 2)] args.treated[0]=args.treated[0][1:] args.treated[len(args.treated)-1][:-1] treated = [(args.treated[i:i+2]) for i in range(0, len(args.treated), 2)] ############## Detection of templated isoforms ################ radar = manager.list([0,0,0,0]) samples = manager.list() data= manager.list() names_con=manager.list() samples_mirna_names=manager.list() deseq=manager.list() unmap_seq=manager.Value('i',0) unmap_counts=manager.Value('i',0) LH2E_names=manager.list() ini_c_samples = manager.list() radar1 = manager.list([0,0,0,0]) samples1 = manager.list() data1 = manager.list() names_tre = manager.list() samples_mirna_names1=manager.list() deseq1=manager.list() unmap1_seq = manager.Value('i',0) unmap1_counts = manager.Value('i',0) LH8E_names=manager.list() ini_t_samples = manager.list() ps_mature.join() mature_mirnas=list(mature_mirnas) starttime1 = time.time() ps_sam = [Process(target=sam,args=(mature_mirnas,path[1][:-1],path[0].split(",")[0],"c",lock,samples,data,names_con,unmap_seq,samples_mirna_names,deseq,LH2E_names,"0",ini_c_samples,unmap_counts)) for path in control] ps_sam.extend([Process(target=sam,args=(mature_mirnas,path[1][:-1],path[0].split(",")[0],"t",lock,samples1,data1,names_tre,unmap1_seq,samples_mirna_names1,deseq1,LH8E_names,"0",ini_t_samples,unmap1_counts)) for path in treated]) [p.start() for p in ps_sam] [p.join() for p in ps_sam] print('SAM took {} seconds'.format(time.time() - starttime1)) ps_hist=[Process(target=hist_red,args=(ini_c_samples,'c'))] ps_hist.extend([Process(target=hist_red,args=(ini_t_samples,'t'))]) [x.start() for x in ps_hist] starttime200=time.time() sc = list(samples) st = list(samples1) names_con=list(names_con) names_tre=list(names_tre) samples_mirna_names=list(samples_mirna_names) samples_mirna_names.sort() samples_mirna_names=list(samples_mirna_names for samples_mirna_names,_ in itertools.groupby(samples_mirna_names)) samples_mirna_names1=list(samples_mirna_names1) samples_mirna_names1.sort() samples_mirna_names1=list(samples_mirna_names1 for samples_mirna_names1,_ in itertools.groupby(samples_mirna_names1)) deseq=list(deseq) deseq1=list(deseq1) new_names_con=manager.list() new_names_tre=manager.list() new_deseq=manager.list() new_deseq1=manager.list() ps_deseq=[Process(target=deseqe2,args=(sampp,samples_mirna_names,lock,new_deseq,names_con[i],new_names_con)) for i,sampp in enumerate(deseq)] ps_deseq.extend([Process(target=deseqe2,args=(sampp,samples_mirna_names1,lock,new_deseq1,names_tre[i],new_names_tre)) for i,sampp in enumerate(deseq1)]) [z.start() for z in ps_deseq] [z.join() for z in ps_deseq] new_deseq=list(new_deseq) new_deseq1=list(new_deseq1) LH2E=[[x[0],x[2]] for x in new_deseq[0]] [LH2E[i].append(y[i][1]) for i,_ in enumerate(LH2E) for y in new_deseq] LH8E=[[x[0],x[2]] for x in new_deseq1[0]] [LH8E[i].append(y[i][1]) for i,_ in enumerate(LH8E) for y in new_deseq1] print('Deseq took {} seconds'.format(time.time() - starttime200)) merg_nam_LH2E=manager.list() merg_nam_LH8E=manager.list() LH2E_copy=copy.deepcopy(list(LH2E)) LH8E_copy=copy.deepcopy(list(LH8E)) fil_sort_tre=manager.list() fil_sort_con=manager.list() raw_sort_tre=manager.list() raw_sort_con=manager.list() ps_main = Process(target=main_temp,args=(list(LH2E), samples_mirna_names, list(LH8E), samples_mirna_names1,1,list(names_con),list(names_tre),fil_sort_tre,fil_sort_con,raw_sort_tre,raw_sort_con)) ps_main.start() if args.anal=="2": all_iso = manager.list() ps_non_iso = Process(target=non_template_ref,args=(sc,st,all_iso)) ps_non_iso.start() ps_merge = [Process(target=merging_names,args=(LH2E_copy,merg_nam_LH2E))] ps_merge.extend([Process(target=merging_names,args=(LH8E_copy,merg_nam_LH8E))]) [x.start() for x in ps_merge] [x.join() for x in ps_merge] merg_nam_LH2E=list(merg_nam_LH2E) merg_nam_LH8E=list(merg_nam_LH8E) starttime2 = time.time() procs = [Process(target=DB_write,args=(x[0],x[1],x[2],x[3],1)) for x in data] procs.extend([Process(target=DB_write,args=(x[0],x[1],x[2],x[3],1)) for x in data1]) procs.extend([Process(target=make_spider,args=(merg_nam_LH2E,merg_nam_LH8E))]) if args.anal == "1": procs.extend([Process(target=pie_temp,args=(merg_nam_LH2E,unmap_seq.value,unmap_counts.value,merg_nam_LH8E,unmap1_seq.value,unmap1_counts.value))]) [p.start() for p in procs] if args.anal=="1": [x.join() for x in ps_hist] [p.join() for p in procs] ps_pdf = Process(target=pdf_before_DE,args=(args.anal)) ps_pdf.start() print('Graphs took {} seconds'.format(time.time() - starttime2)) ps_main.join() fil_sort_con=list(fil_sort_con) fil_sort_tre=list(fil_sort_tre) if fil_sort_con==[]: fil_sort_con=raw_sort_con fil_sort_tre=raw_sort_tre raw_sort_con=list(raw_sort_con) raw_sort_tre=list(raw_sort_tre) names_con=list(new_names_con) names_tre=list(new_names_tre) ps_write = Process(target=write_main,args=(raw_sort_con, raw_sort_tre, fil_sort_con, fil_sort_tre, names_con,names_tre,1)) ps_write.start() ps1_matrix = [Process(target=ssamples,args=(names_con,fil_sort_con,"Diff/temp_con/",0))] ps1_matrix.extend([Process(target=ssamples,args=(names_tre,fil_sort_tre,"Diff/temp_tre/",0))]) [p.start() for p in ps1_matrix] if args.anal=="1": ps_pdf.join() if args.anal=="2": [p.join() for p in procs] [x.join() for x in ps_hist] ps_write.join() [p.join() for p in ps1_matrix] ############################## Detection of Both ####################################### starttime10 = time.time() if args.anal == "2": n_data= manager.list() n_names_con=manager.list() n_samples_mirna_names=manager.list() n_deseq=manager.list() n_LH2E_names=manager.list() n_data1 = manager.list() n_names_tre = manager.list() n_samples_mirna_names1=manager.list() n_deseq1=manager.list() n_LH8E_names=manager.list() new_mat_mirnas = list(mature_mirnas) ps_non_iso.join() all_iso=list(all_iso) new_mat_mirnas.extend(all_iso) starttime11=time.time() ps_sam = [Process(target=non_sam,args=(new_mat_mirnas,path[1][:-1],path[0].split(",")[0],"c",lock,n_data,n_names_con,n_deseq,n_samples_mirna_names,n_LH2E_names)) for path in control] ps_sam.extend([Process(target=non_sam,args=(new_mat_mirnas,path[1][:-1],path[0].split(",")[0],"t",lock,n_data1,n_names_tre,n_deseq1,n_samples_mirna_names1,n_LH8E_names)) for path in treated]) [p.start() for p in ps_sam] [p.join() for p in ps_sam] print('Non-sam took {} seconds'.format(time.time() - starttime11)) starttime12=time.time() n_names_con=list(n_names_con) n_names_tre=list(n_names_tre) n_samples_mirna_names=list(n_samples_mirna_names) n_samples_mirna_names.sort() n_samples_mirna_names=list(n_samples_mirna_names for n_samples_mirna_names,_ in itertools.groupby(n_samples_mirna_names)) n_samples_mirna_names1=list(n_samples_mirna_names1) n_samples_mirna_names1.sort() n_samples_mirna_names1=list(n_samples_mirna_names1 for n_samples_mirna_names1,_ in itertools.groupby(n_samples_mirna_names1)) n_deseq=list(n_deseq) n_deseq1=list(n_deseq1) new_n_names_con=manager.list() new_n_names_tre=manager.list() n_new_deseq=manager.list() n_new_deseq1=manager.list() ps_deseq=[Process(target=deseqe2,args=(sampp,n_samples_mirna_names,lock,n_new_deseq,n_names_con[i],new_n_names_con)) for i,sampp in enumerate(n_deseq)] ps_deseq.extend([Process(target=deseqe2,args=(sampp,n_samples_mirna_names1,lock,n_new_deseq1,n_names_tre[i],new_n_names_tre)) for i,sampp in enumerate(n_deseq1)]) [x.start() for x in ps_deseq] [x.join() for x in ps_deseq] n_new_deseq=list(n_new_deseq) n_new_deseq1=list(n_new_deseq1) print([len(n_new_deseq[0]),len(n_new_deseq[1])]) print([len(n_new_deseq1[0]),len(n_new_deseq1[1])]) n_LH2E=[[x[0],x[2]] for x in n_new_deseq[0]] [n_LH2E[i].append(y[i][1]) for i,_ in enumerate(n_LH2E) for y in n_new_deseq] n_LH8E=[[x[0],x[2]] for x in n_new_deseq1[0]] [n_LH8E[i].append(y[i][1]) for i,_ in enumerate(n_LH8E) for y in n_new_deseq1] print('Non-deseq took {} seconds'.format(time.time() - starttime12)) merg_nam_n_LH2E=manager.list() merg_nam_n_LH8E=manager.list() n_LH2E_copy=copy.deepcopy(list(n_LH2E)) n_LH8E_copy=copy.deepcopy(list(n_LH8E)) n_sort_tre=manager.list() n_sort_con=manager.list() n_fil_sort_con=manager.list() n_fil_sort_tre=manager.list() n_raw_sort_con=manager.list() n_raw_sort_tre=manager.list() ps_main = Process(target=main_temp,args=(list(n_LH2E), n_samples_mirna_names, list(n_LH8E), n_samples_mirna_names1,1,list(n_names_con),list(n_names_tre),n_fil_sort_tre,n_fil_sort_con,n_raw_sort_tre,n_raw_sort_con)) ps_main.start() ps_merge = [Process(target=merging_names,args=(n_LH2E_copy,merg_nam_n_LH2E))] ps_merge.extend([Process(target=merging_names,args=(n_LH8E_copy,merg_nam_n_LH8E))]) [p.start() for p in ps_merge] [p.join() for p in ps_merge] merg_nam_n_LH2E=list(merg_nam_n_LH2E) merg_nam_n_LH8E=list(merg_nam_n_LH8E) procs = [Process(target=DB_write,args=(x[0],x[1],x[2],x[3],2)) for x in n_data] procs.extend([Process(target=DB_write,args=(x[0],x[1],x[2],x[3],2)) for x in n_data1]) procs.extend([Process(target=logo_seq_red,args=(merg_nam_n_LH2E,'c'))]) procs.extend([Process(target=logo_seq_red,args=(merg_nam_n_LH8E,'t'))]) procs.extend([Process(target=pie_non_temp,args=(merg_nam_LH2E,merg_nam_n_LH2E,merg_nam_LH8E,merg_nam_n_LH8E,unmap_seq.value,unmap1_seq.value,unmap_counts.value,unmap1_counts.value))]) starttime13=time.time() [p.start() for p in procs] [p.join() for p in procs] print('Graphs took {} seconds'.format(time.time() - starttime13)) procs1 = Process(target=pdf_before_DE,args=(args.anal)) procs1.start() starttime14=time.time() ps_main.join() n_fil_sort_con=list(n_fil_sort_con) n_fil_sort_tre=list(n_fil_sort_tre) if n_fil_sort_con==[]: n_fil_sort_con=n_raw_sort_con n_fil_sort_tre=n_raw_sort_tre n_raw_sort_con=list(n_raw_sort_con) n_raw_sort_tre=list(n_raw_sort_tre) n_names_con=list(new_n_names_con) n_names_tre=list(new_n_names_tre) ps_write = Process(target=write_main,args=(n_raw_sort_con, n_raw_sort_tre,n_fil_sort_con, n_fil_sort_tre, n_names_con, n_names_tre,2)) ps_write.start() ps1_matrix = [Process(target=ssamples1,args=(n_names_con,n_fil_sort_con,names_con,fil_sort_con,"Diff/n_temp_con/",0))] ps1_matrix.extend([Process(target=ssamples1,args=(n_names_tre,n_fil_sort_tre,names_tre,fil_sort_tre,"Diff/n_temp_tre/",0))]) [p.start() for p in ps1_matrix] ps_write.join() [p.join() for p in ps1_matrix] procs1.join() print('That took {} seconds'.format(time.time() - starttime10)) print('That took {} seconds'.format(time.time() - starttime))