Mercurial > repos > glogobyte > viztool
changeset 2:c607e8513b5b draft
Uploaded
author | glogobyte |
---|---|
date | Fri, 16 Oct 2020 12:16:37 +0000 |
parents | 561b0abcae87 |
children | 67f6938b3c42 |
files | mirbase_ultra_v2.py |
diffstat | 1 files changed, 1723 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mirbase_ultra_v2.py Fri Oct 16 12:16:37 2020 +0000 @@ -0,0 +1,1723 @@ +from mirbase_functions import * +from mirbase_graphs import * +import itertools +import time +import sys +import os +import urllib.request +import gzip +from multiprocessing import Process, Queue, Lock, Pool, Manager, Value +import subprocess +import argparse +from collections import OrderedDict +from matplotlib.backends.backend_pdf import PdfPages +import pandas as pd +from math import pi +import numpy as np +import matplotlib.pyplot as plt +from matplotlib.ticker import PercentFormatter +import seaborn as sns +import scipy.stats as stats +from plotnine import * +import math +import re +import matplotlib.ticker as mtick +import copy + +subprocess.call(['mkdir','-p', 'split1','split2','split3','split4','split11','split12','Counts','Diff/temp_con','Diff/temp_tre','Diff/n_temp_con','Diff/n_temp_tre']) + +parser = argparse.ArgumentParser() +parser.add_argument("-analysis", "--anal", help="choose type of analysis", action="store") +parser.add_argument("-con", "--control", help="input fastq file", nargs='+', default=[]) +parser.add_argument("-tre", "--treated", help="input fastq file", nargs='+', default=[] ) +parser.add_argument("-tool_dir", "--tool_directory", help="tool directory path", action="store") +parser.add_argument("-gen", "--org_name", help="tool directory path", action="store") +parser.add_argument("-program", "--pro", help="choose type of analysis", action="store") +parser.add_argument("-f", "--flag", help="choose the database", action="store") +parser.add_argument("-umis", "--umi", help="choose the database", action="store") +parser.add_argument("-percentage", "--per", help="choose the database", action="store") +parser.add_argument("-counts", "--count", help="choose the database", action="store") +parser.add_argument("-name1", "--n1", help="choose the database", action="store") +parser.add_argument("-name2", "--n2", help="choose the database", action="store") +args = parser.parse_args() + +######################################################################################################################################### + +def collapse_sam(path): + + ini_sam=read(path,0) + main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]] + intro_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" in x.split("\t")[0]] + + uni_seq = [] + for x in main_sam: + + if [x[2], x[9]] not in uni_seq: + uni_seq.append([x[2], x[9]]) + + new_main_sam=[] + incr_num=0 + for i in range(len(uni_seq)): + count=0 + incr_num+=1 + for y in main_sam: + if uni_seq[i][1]==y[9] and uni_seq[i][0]==y[2]: + count+=1 + temp=y + temp[10]="~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + temp[0]=str(incr_num)+"-"+str(count) + new_main_sam.append(temp) + + new_sam=intro_sam+new_main_sam + + return new_sam + +################################################################################################################################################################################################################# + +def duplicate_chroms_isoforms(List): + + dupes=[] + + for num in range(len(List)): + + if [List[num][9],List[num][0],List[num][2]] not in dupes : + dupes.append([List[num][9],List[num][0],List[num][2]]) + + for x in List: + for y in dupes: + if x[9]==y[0] and x[0]==y[1] and x[2].split("_")[0]==y[2].split("_")[0] and x[2]!=y[2]: + y.append(x[2]) + + + double_List = [x[:] for x in List] + + chr_order=[] + for x in dupes: + temp = [] + for i in range(2,len(x)): + if x[i].split("chr")[1].split("(")[0].isdigit(): + temp.append(int(x[i].split("chr")[1].split("(")[1][0]+x[i].split("chr")[1].split("(")[0])) + else: + temp.append(x[i].split("chr")[1][0:4]) + + for z in temp: + if 'X(-)'==z or 'Y(-)'==z or 'X(+)'==z or 'Y(+)'==z: + temp = [str(j) for j in temp] + temp=list(set(temp)) + temp.sort() + chr_order.append(temp) + + final_dupes=[] + for i in range(len(dupes)): + final_dupes.append([dupes[i][0],dupes[i][2].split("_")[0],dupes[i][1]]) + for x in chr_order[i]: + result = re.match("[-+]?\d+$", str(x)) + if len(chr_order[i]) == len(set(chr_order[i])): + if result is not None: + + if int(x)<0: + final_dupes[i][1]=final_dupes[i][1]+"_chr"+str(abs(int(x)))+"(-)" + else: + final_dupes[i][1] = final_dupes[i][1] + "_chr" + str(abs(int(x)))+"(+)" + else: + final_dupes[i][1] = final_dupes[i][1] + "_chr" + str(x) + else: + if result is not None: + if int(x) < 0: + final_dupes[i][1] = final_dupes[i][1] +dupes[i][2].split("_")[1]+ "_chr" + str(abs(int(x))) + "(-)" + else: + final_dupes[i][1] = final_dupes[i][1] +dupes[i][2].split("_")[1]+ "_chr" + str(abs(int(x))) + "(+)" + else: + final_dupes[i][1] = final_dupes[i][1] +dupes[i][2].split("_")[1]+ "_chr" + str(x) + + final_dupes.sort() + final_dupes=list(final_dupes for final_dupes,_ in itertools.groupby(final_dupes)) + + for i in range(len(double_List)): + for x in final_dupes: + + if double_List[i][9] == x[0] and double_List[i][0] == x[2] and len(double_List[i][2].split("_")) >3 and double_List[i][2].split("_")[0]==x[1].split("_")[0]: + gg=str("_"+double_List[i][2].split("_")[-2]+"_"+double_List[i][2].split("_")[-1]) + double_List[i][2] = x[1]+gg + + if double_List[i][9]==x[0] and double_List[i][0]== x[2] and len(double_List[i][2].split("_"))==3 and double_List[i][2].split("_")[0]==x[1].split("_")[0]: + double_List[i][2]=x[1] + List[i][2] = x[1] + + List.sort() + new_list=list(List for List,_ in itertools.groupby(List)) + + double_List.sort() + new_double_List = list(double_List for double_List, _ in itertools.groupby(double_List)) + + return new_list, new_double_List + + +############################################################################################################################################################################################################# + +def sam(mature_mirnas,path,name,con,l,samples,data,names,unmap_seq,samples_mirna_names,deseq,LHE_names,umi,ini_sample,unmap_counts): + + # read the sam file + ini_sam=read(path,0) + new_main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]] + unique_seq = [x for x in new_main_sam if x[1] == '0' and len(x[9])>=18 and len(x[9])<=26] + + sorted_uni_arms = [] + + for i in range(len(mature_mirnas)): + tmp_count_reads = 0 # calculate the total number of reads + tmp_count_seq = 0 # calculate the total number of sequences + for j in range(len(unique_seq)): + + if "{" in unique_seq[j][2].split("_")[0]: + official=unique_seq[j][2].split("_")[0][:-4] + else: + official=unique_seq[j][2].split("_")[0] + + if mature_mirnas[i].split(" ")[0][1:] == official: + + temp_mature = mature_mirnas[i+1].strip().replace("U", "T") + off_part = longestSubstring(temp_mature, unique_seq[j][9]) + + mat_diff = temp_mature.split(off_part) + mat_diff = [len(mat_diff[0]), len(mat_diff[1])] + + unique_diff = unique_seq[j][9].split(off_part) + unique_diff = [len(unique_diff[0]), len(unique_diff[1])] + + # Problem with hsa-miR-8485 + if mat_diff[1]!=0 and unique_diff[1]!=0: + unique_seq[j]=1 + pre_pos = 0 + post_pos = 0 + + elif mat_diff[0]!=0 and unique_diff[0]!=0: + unique_seq[j]=1 + pre_pos = 0 + post_pos = 0 + + else: + pre_pos = mat_diff[0]-unique_diff[0] + post_pos = unique_diff[1]-mat_diff[1] + tmp_count_reads = tmp_count_reads + int(unique_seq[j][0].split("-")[1]) + tmp_count_seq = tmp_count_seq+1 + + if pre_pos != 0 or post_pos != 0: + if pre_pos == 0: + unique_seq[j][2] = unique_seq[j][2] + "_" +str(pre_pos) + "_" + '{:+d}'.format(post_pos) + elif post_pos == 0: + unique_seq[j][2] = unique_seq[j][2] + "_" + '{:+d}'.format(pre_pos) + "_" + str(post_pos) + else: + unique_seq[j][2] = unique_seq[j][2]+"_"+'{:+d}'.format(pre_pos)+"_"+'{:+d}'.format(post_pos) + + for x in range(unique_seq.count(1)): + unique_seq.remove(1) + if tmp_count_reads != 0 and tmp_count_seq != 0: + sorted_uni_arms.append([mature_mirnas[i].split(" ")[0][1:], tmp_count_seq, tmp_count_reads]) + sorted_uni_arms = sorted(sorted_uni_arms, key=lambda x: x[1], reverse=True) + dedup_unique_seq,double_fil_uni_seq=duplicate_chroms_isoforms(unique_seq) + + for y in sorted_uni_arms: + counts=0 + seqs=0 + for x in double_fil_uni_seq: + if y[0]==x[2].split("_")[0]: + counts+=int(x[0].split("-")[1]) + seqs+=1 + + y[1]=seqs + y[2]=counts + + LHE=[] + l.acquire() + if con=="c": + LHE.extend(z[2] for z in double_fil_uni_seq) + for y in double_fil_uni_seq: + samples_mirna_names.append([y[2],y[9]]) + deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in double_fil_uni_seq]) + LHE_names.extend(LHE) + unmap_seq.value += sum([1 for x in new_main_sam if x[1] == '4']) + unmap_counts.value += sum([int(x[0].split("-")[1]) for x in new_main_sam if x[1] == '4']) + names.append(name) + samples.append(dedup_unique_seq) + data.append([con,name,double_fil_uni_seq,sorted_uni_arms]) + ini_sample.append(new_main_sam) + + if con=="t": + LHE.extend(z[2] for z in double_fil_uni_seq) + for y in double_fil_uni_seq: + samples_mirna_names.append([y[2],y[9]]) + deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in double_fil_uni_seq]) + LHE_names.extend(LHE) + unmap_seq.value += sum([1 for x in new_main_sam if x[1] == '4']) + unmap_counts.value += sum([int(x[0].split("-")[1]) for x in new_main_sam if x[1] == '4']) + names.append(name) + samples.append(dedup_unique_seq) + data.append([con,name,double_fil_uni_seq,sorted_uni_arms]) + ini_sample.append(new_main_sam) + l.release() + + +###################################################################################################################################### +""" + +Read a sam file from Bowtie and do the followings: + +1) Remove reverse stranded mapped reads +2) Remove unmapped reads +3) Remove all sequences with reads less than 11 reads +4) Sort the arms with the most sequences in decreading rate +5) Sort the sequences of every arm with the most reads in decreasing rate +6) Calculate total number of sequences of every arm +7) Calculate total number of reads of sequences of every arm. +8) Store all the informations in a txt file + +""" + +def non_sam(mature_mirnas,path,name,con,l,data,names,n_deseq,n_samples_mirna_names,n_LHE_names): + + ini_sam=read(path,0) + new_main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]] + unique_seq=[] + unique_seq = [x for x in new_main_sam if x[1] == '4' and len(x[9])>=18 and len(x[9])<=26] + + uni_seq=[] + # Calculate the shifted positions for every isomir and add them to the name of it + sorted_uni_arms = [] + for i in range(1,len(mature_mirnas),2): + tmp_count_reads = 0 # calculate the total number of reads + tmp_count_seq = 0 # calculate the total number of sequences + + for j in range(len(unique_seq)): + + temp_mature = mature_mirnas[i].strip().replace("U", "T") + + if temp_mature in unique_seq[j][9]: + + off_part = longestSubstring(temp_mature, unique_seq[j][9]) + + mat_diff = temp_mature.split(off_part) + mat_diff = [len(mat_diff[0]), len(mat_diff[1])] + + unique_diff = unique_seq[j][9].split(off_part) + if len(unique_diff)<=2: + unique_diff = [len(unique_diff[0]), len(unique_diff[1])] + + pre_pos = mat_diff[0]-unique_diff[0] + post_pos = unique_diff[1]-mat_diff[1] + + lengthofmir = len(off_part) + post_pos + if pre_pos == 0: + tmp_count_reads = tmp_count_reads + int(unique_seq[j][0].split("-")[1]) + tmp_count_seq = tmp_count_seq + 1 + + if pre_pos == 0: + + t_name=unique_seq[j].copy() + t_name[2]=mature_mirnas[i - 1].split(" ")[0][1:] + "__" + str(pre_pos) + "_" + '{:+d}'.format(post_pos) + "_" + str(unique_seq[j][9][len(off_part):]) + uni_seq.append(t_name) + + + if tmp_count_reads != 0 and tmp_count_seq != 0: + sorted_uni_arms.append([mature_mirnas[i-1].split(" ")[0][1:], tmp_count_seq, tmp_count_reads]) + + + sorted_uni_arms = sorted(sorted_uni_arms, key=lambda x: x[1], reverse=True) + unique_seq = list(map(list, OrderedDict.fromkeys(map(tuple,uni_seq)))) + + LHE=[] + + l.acquire() + if con=="c": + LHE.extend(x[2] for x in unique_seq if x[2]!="*") + for x in unique_seq: + if x[2]!="*": + n_samples_mirna_names.append([x[2],x[9]]) + n_deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in unique_seq if x[2]!="*"]) + n_LHE_names.extend(LHE) + names.append(name) + data.append([con,name,unique_seq,sorted_uni_arms]) + + + if con=="t": + LHE.extend(x[2] for x in unique_seq if x[2]!="*") + for x in unique_seq: + if x[2]!="*": + n_samples_mirna_names.append([x[2],x[9]]) + n_deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in unique_seq if x[2]!="*"]) + n_LHE_names.extend(LHE) + names.append(name) + data.append([con,name,unique_seq,sorted_uni_arms]) + l.release() + +##################################################################################################################################################################################################################### +def deseq2_temp(samples_mirna_names,deseq,con,l): + + samples_mirna_names.sort(key=lambda x:[0]) + for i in range(len(deseq)): + for y in samples_mirna_names: + flag = 0 + for x in deseq[i]: + if y[0] == x[0]: + flag = 1 + break + + if flag == 0: + deseq[i].append([y[0], "0", y[1]]) + + [deseq[i].sort(key=lambda x: x[0]) for i, _ in enumerate(deseq)] + deseq_final = [[x[0],x[2]] for x in deseq[0]] + [deseq_final[z].append(deseq[i][j][1]) for z,_ in enumerate(deseq_final) for i, _ in enumerate(deseq) for j,_ in enumerate(deseq[i]) if deseq_final[z][0] == deseq[i][j][0]] + + l.acquire() + if con=="c": + q1.put(deseq_final) + + if con=="t": + q2.put(deseq_final) + l.release() + + +#################################################################################################################################################################################################################### + +def main_temp(LH2E, LH2E_names, LH8E, LH8E_names,flag,names_con,names_tre,filter_LH8E,filter_LH2E,raw_LH8E,raw_LH2E): + + LH8E_add_names = [x for x in LH2E_names if x not in LH8E_names] + LH2E_add_names = [x for x in LH8E_names if x not in LH2E_names] + + LH8E_add_names.sort() + LH2E_add_names.sort() + LH8E_add_names = list(LH8E_add_names for LH8E_add_names,_ in itertools.groupby(LH8E_add_names)) + LH2E_add_names = list(LH2E_add_names for LH2E_add_names,_ in itertools.groupby(LH2E_add_names)) + + LH2E.sort() + LH8E.sort() + LH2E = list(LH2E for LH2E,_ in itertools.groupby(LH2E)) + LH8E = list(LH8E for LH8E,_ in itertools.groupby(LH8E)) + + print("LHE_names") + print([len(LH8E_add_names),len(LH2E_add_names)]) + print([len(LH8E),len(LH2E)]) + + zeros=["0"]*(len(LH8E[0])-2) + [LH8E_add_names[i].extend(zeros) for i,_ in enumerate(LH8E_add_names)] + LH8E=LH8E+LH8E_add_names + + zeros=["0"]*(len(LH2E[0])-2) + [LH2E_add_names[i].extend(zeros) for i,_ in enumerate(LH2E_add_names)] + LH2E=LH2E+LH2E_add_names + + dupes=[] + final_LH2E =[] + + for num,_ in enumerate(LH2E): + + if LH2E[num][1] not in final_LH2E and LH2E[num][0] not in final_LH2E: + final_LH2E.append(LH2E[num][1]) + final_LH2E.append(LH2E[num][0]) + else: + dupes.append(LH2E[num][1]) + + + dupes=list(set(dupes)) + + dupes=[[x] for x in dupes] + + for x in LH2E: + for y in dupes: + if x[1]==y[0]: + fl=0 + if len(y)==1: + y.append(x[0]) + else: + for i in range(1,len(y)): + if y[i].split("_")[0]==x[0].split("_")[0]: + fl=1 + if len(x[0])<len(y[i]): + del y[i] + y.append(x[0]) + break + + if fl==0: + y.append((x[0])) + + for y in dupes: + if len(y)>2: + for i in range(len(y)-1,1,-1): + y[1]=y[1]+"/"+y[i] + del y[i] + + for x in LH2E: + for y in dupes: + if x[1]==y[0]: + x[0]=y[1] + + for x in LH8E: + for y in dupes: + if x[1]==y[0]: + x[0]=y[1] + + + LH2E.sort() + LH2E=list(LH2E for LH2E,_ in itertools.groupby(LH2E)) + + LH8E.sort() + LH8E=list(LH8E for LH8E,_ in itertools.groupby(LH8E)) + + LH8E_new=[] + LH2E_new=[] + + if int(args.per)!=-1: + percent=int(args.per)/100 + print(percent) + print(args.count) + + c_col_filter=round(percent*(len(LH2E[1])-2)) + t_col_filter=round(percent*(len(LH8E[1])-2)) + + for i, _ in enumerate(LH2E): + c_cols=0 + t_cols=0 + + c_cols=sum([1 for j in range(len(LH2E[i])-2) if int(LH2E[i][j+2])>=int(args.count)]) + t_cols=sum([1 for j in range(len(LH8E[i])-2) if int(LH8E[i][j+2])>=int(args.count)]) + + if c_cols>=c_col_filter or t_cols>=t_col_filter: + LH8E_new.append(LH8E[i]) + LH2E_new.append(LH2E[i]) + + filter_LH2E.extend(LH2E_new) + filter_LH8E.extend(LH8E_new) + raw_LH2E.extend(LH2E) + raw_LH8E.extend(LH8E) + +################################################################################################################################################################################################################## + +def write_main(raw_LH2E, raw_LH8E, fil_LH2E, fil_LH8E, names_con, names_tre, flag): + + if flag == 1 and int(args.per)!=-1: + fp = open('Counts/Filtered '+args.n2 +' Templated Counts', 'w') + fp.write("Name\t") + fp.write("Sequence") + for y in names_tre: + fp.write("\t"+y) + + for x in fil_LH8E: + fp.write("\n%s" % "\t".join(x)) + fp.close() + + fp = open('Counts/Filtered '+args.n1+' Templated Counts', 'w') + fp.write("Name\t") + fp.write("Sequence") + for y in names_con: + fp.write("\t"+y) + + for x in fil_LH2E: + fp.write("\n%s" % "\t".join(x)) + fp.close() + + + if flag == 2 and int(args.per)!=-1: + fp = open('Counts/Filtered '+args.n2+' Non-Templated Counts', 'w') + fp.write("Name\t") + fp.write("Sequence") + for y in names_tre: + fp.write("\t"+y) + + + for x in fil_LH8E: + fp.write("\n%s" % "\t".join(x)) + fp.close() + + fp = open('Counts/Filtered '+args.n1+' Non-Templated Counts', 'w') + fp.write("Name\t") + fp.write("Sequence") + for y in names_con: + fp.write("\t"+y) + + for x in fil_LH2E: + fp.write("\n%s" % "\t".join(x)) + fp.close() + + + if flag == 1: + fp = open('Counts/Raw '+args.n2+' Templated Counts', 'w') + fp.write("Name\t") + fp.write("Sequence") + for y in names_tre: + fp.write("\t"+y) + + for x in raw_LH8E: + fp.write("\n%s" % "\t".join(x)) + fp.close() + + fp = open('Counts/Raw '+args.n1+' Templated Counts', 'w') + fp.write("Name\t") + fp.write("Sequence") + for y in names_con: + fp.write("\t"+y) + + for x in raw_LH2E: + fp.write("\n%s" % "\t".join(x)) + fp.close() + + + if flag == 2: + fp = open('Counts/Raw '+args.n2+' Non-Templated Counts', 'w') + fp.write("Name\t") + fp.write("Sequence") + for y in names_tre: + fp.write("\t"+y) + + + for x in raw_LH8E: + fp.write("\n%s" % "\t".join(x)) + fp.close() + + fp = open('Counts/Raw '+args.n1+' Non-Templated Counts', 'w') + fp.write("Name\t") + fp.write("Sequence") + for y in names_con: + fp.write("\t"+y) + + for x in raw_LH2E: + fp.write("\n%s" % "\t".join(x)) + fp.close() + + +######################################################################################################################################### + +def ssamples(names,samp,folder,pro): + + for i in range(2,len(samp[0])): + + fp = open(folder+names[i-2]+'.txt','w') + fp.write("miRNA id"+"\t"+names[i-2]+"\n") + + for x in samp: + fp.write("%s" % "\t".join([x[0],x[i]])+"\n") + fp.close() + +################################################################################################################## + +def DB_write(con,name,unique_seq,sorted_uni_arms,f): + + if f==1: + # Write a txt file with all the information + if con=="c": + fp = open('split1/'+name, 'w') + + fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence")) + if con=="t": + fp = open('split2/'+name, 'w') + fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence")) + + + for i in range(len(sorted_uni_arms)): + temp = [] + for j in range(len(unique_seq)): + + if sorted_uni_arms[i][0] in unique_seq[j][2].split("_")[0]: + + temp.append(unique_seq[j]) + + temp = sorted(temp, key=lambda x: int(x[0].split('-')[1]), reverse=True) + fp.write("*********************************************************************************************************\n") + fp.write("%-8s\t%-22s\t%-25s\t%-30s\t%s\n" % ("|",str(sorted_uni_arms[i][0]),"Sequence count = "+str(sorted_uni_arms[i][1]),"Total reads = "+str(sorted_uni_arms[i][2]),"|")) + fp.write("*********************************************************************************************************\n\n") + [fp.write("%-8s\t%-40s\t%s\n" % (x[0].split("-")[1], x[2],x[9])) for x in temp] + fp.write("\n" + "\n") + fp.close() + + if f==2: + + if con=="c": + fp = open('split3/'+name, 'w') + fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence")) + if con=="t": + fp = open('split4/'+name, 'w') + fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence")) + + + for i in range(len(sorted_uni_arms)): + temp = [] + for j in range(len(unique_seq)): + if sorted_uni_arms[i][0]==unique_seq[j][2].split("__")[0]: + temp.append(unique_seq[j]) + if temp!=[]: + temp = sorted(temp, key=lambda x: int(x[0].split('-')[1]), reverse=True) + fp.write("*********************************************************************************************************\n") + fp.write("%-8s\t%-22s\t%-25s\t%-30s\t%s\n" % ("|",str(sorted_uni_arms[i][0]),"Sequence count = "+str(sorted_uni_arms[i][1]),"Total reads = "+str(sorted_uni_arms[i][2]),"|")) + fp.write("*********************************************************************************************************\n\n") + [fp.write("%-8s\t%-40s\t%s\n" % (x[0].split("-")[1], x[2],x[9])) for x in temp] + fp.write("\n" + "\n") + fp.close() + + +########################################################################################################################## + +def new_mat_seq(pre_unique_seq,mat_mirnas,l): + + unique_iso = [] + for x in pre_unique_seq: + if len(x[2].split("_"))==3: + for y in pre_unique_seq: + if x[2] in y[2] and int(x[0].split("-")[1])<int(y[0].split("-")[1]): + if any(y[2] in lst2 for lst2 in unique_iso)==False: + y[2]=">"+y[2] + unique_iso.append(y) + l.acquire() + for x in unique_iso: + mat_mirnas.append(x[2]) + mat_mirnas.append(x[9]) + l.release() + +######################################################################################################################### +def pie_non_temp(merge_LH2E,merge_non_LH2E,merge_LH8E,merge_non_LH8E,c_unmap,t_unmap,c_unmap_counts,t_unmap_counts): + + c_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_LH2E] + t_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_LH8E] + c_non_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_non_LH2E] + t_non_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_non_LH8E] + + c_templ = 0 + c_tem_counts = 0 + c_mature = 0 + c_mat_counts = 0 + t_templ = 0 + t_tem_counts = 0 + t_mature = 0 + t_mat_counts = 0 + + c_non = len(c_non_samples) + c_non_counts = sum(x[2] for x in c_non_samples) + t_non = len(t_non_samples) + t_non_counts = sum(x[2] for x in t_non_samples) + + c_unmap = c_unmap - c_non + t_unmap = c_unmap - t_non + + c_unmap_counts=c_unmap_counts - c_non_counts + t_unmap_counts=t_unmap_counts - t_non_counts + + + for x in c_samples: + + if "/" not in x[0]: + if "chr" in x[0].split("_")[-1]: + c_mature+=1 + c_mat_counts += x[2] + else: + c_templ+=1 + c_tem_counts += x[2] + else: + f=0 + for y in x[0].split("/"): + if "chr" in y.split("_")[-1]: + c_mature+=1 + c_mat_counts += x[2] + f=1 + break + if f==0: + c_templ+=1 + c_tem_counts += x[2] + + for x in t_samples: + + if "/" not in x[0]: + if "chr" in x[0].split("_")[-1]: + t_mature+=1 + t_mat_counts += x[2] + else: + t_templ+=1 + t_tem_counts += x[2] + else: + f=0 + for y in x[0].split("/"): + if "chr" in y.split("_")[-1]: + t_mature+=1 + t_mat_counts += x[2] + f=1 + break + if f==0: + t_templ+=1 + t_tem_counts += x[2] + + fig = plt.figure(figsize=(7,5)) + labels = 'miRNA RefSeq','Template', 'Unmapped','Non-template' + sizes = [c_mat_counts, c_tem_counts, c_unmap_counts,c_non_counts] + colors = ['gold', 'yellowgreen', 'lightcoral', 'lightskyblue'] + ax1 = plt.subplot2grid((1,2),(0,0)) + patches, texts, autotexts=plt.pie(sizes, labels=labels, colors=colors, startangle=140,autopct='%1.1f%%',radius=0.8) + [x.set_fontsize(8) for x in texts] + plt.title('Control Group (reads)',fontsize=12) + labels = 'miRNA RefSeq','Template', 'Unmapped','non-template' + sizes = [t_mat_counts, t_tem_counts, t_unmap_counts, t_non_counts] + colors = ['gold', 'yellowgreen', 'lightcoral', 'lightskyblue'] + ax2 = plt.subplot2grid((1,2),(0,1)) + patches, texts, autotexts=plt.pie(sizes, labels=labels, colors=colors, startangle=140,autopct='%1.1f%%',radius=0.8) + [x.set_fontsize(8) for x in texts] + plt.title('Treated Group (reads)', fontsize=12) + plt.savefig('pie_non.png',dpi=300) + +###################################################################################################################################################### + +def merging_names(LH2E_copy,new): + + dupes=[] + final_LH2E =[] + + for num in range(len(LH2E_copy)): + + if LH2E_copy[num][1] not in final_LH2E and LH2E_copy[num][0] not in final_LH2E: + final_LH2E.append(LH2E_copy[num][1]) + final_LH2E.append(LH2E_copy[num][0]) + else: + dupes.append(LH2E_copy[num][1]) + + dupes=list(set(dupes)) + + for i in range(len(dupes)): + dupes[i]=[dupes[i]] + + for x in LH2E_copy: + for y in dupes: + if x[1]==y[0]: + fl=0 + if len(y)==1: + y.append(x[0]) + else: + for i in range(1,len(y)): + if y[i].split("_")[0]==x[0].split("_")[0]: + fl=1 + if len(x[0])<len(y[i]): + del y[i] + y.append(x[0]) + break + + if fl==0: + y.append((x[0])) + + for y in dupes: + if len(y)>2: + for i in range(len(y)-1,1,-1): + y[1]=y[1]+"/"+y[i] + del y[i] + + + for x in LH2E_copy: + for y in dupes: + if x[1]==y[0]: + x[0]=y[1] + + LH2E_copy.sort() + LH2E_copy=list(LH2E_copy for LH2E_copy,_ in itertools.groupby(LH2E_copy)) + + new.extend(LH2E_copy) + + +###################################################################################################################################################### +def pie_temp(merge_LH2E,c_unmap,c_unmap_counts,merge_LH8E,t_unmap,t_unmap_counts): + + c_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_LH2E] + t_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_LH8E] + + c_templ = 0 + c_tem_counts = 0 + c_mature = 0 + c_mat_counts = 0 + t_templ = 0 + t_tem_counts = 0 + t_mature = 0 + t_mat_counts = 0 + + for x in c_samples: + + if "/" not in x[0]: + if "chr" in x[0].split("_")[-1]: + c_mature+=1 + c_mat_counts += x[2] + else: + c_templ+=1 + c_tem_counts += x[2] + else: + f=0 + for y in x[0].split("/"): + if "chr" in y.split("_")[-1]: + c_mature+=1 + c_mat_counts += x[2] + f=1 + break + if f==0: + c_templ+=1 + c_tem_counts += x[2] + + for x in t_samples: + + if "/" not in x[0]: + if "chr" in x[0].split("_")[-1]: + t_mature+=1 + t_mat_counts += x[2] + else: + t_templ+=1 + t_tem_counts += x[2] + else: + f=0 + for y in x[0].split("/"): + if "chr" in y.split("_")[-1]: + t_mature+=1 + t_mat_counts += x[2] + f=1 + break + if f==0: + t_templ+=1 + t_tem_counts += x[2] + + + fig = plt.figure() + labels = 'miRNA RefSeq','Template', 'Unmapped' + sizes = [c_mat_counts, c_tem_counts, c_unmap_counts] + colors = ['gold', 'yellowgreen', 'lightskyblue'] + explode = (0.2, 0.05, 0.1) + ax1 = plt.subplot2grid((1,2),(0,0)) + patches, texts, autotexts=plt.pie(sizes, labels=labels, colors=colors, startangle=140,autopct='%1.1f%%',radius=0.8) + [x.set_fontsize(8) for x in texts] + plt.title('Control group (reads)', fontsize=12) + labels = 'miRNA RefSeq','Template', 'Unmapped' + sizes = [t_mat_counts, t_tem_counts, t_unmap_counts] + colors = ['gold', 'yellowgreen', 'lightskyblue'] + explode = (0.2, 0.05, 0.1) + ax2 = plt.subplot2grid((1,2),(0,1)) + patches, texts, autotexts=plt.pie(sizes, labels=labels, colors=colors, startangle=140,autopct='%1.1f%%',radius=0.8) + [x.set_fontsize(8) for x in texts] + plt.title('Treated group (reads)',fontsize = 12) + plt.savefig('pie_tem.png',dpi=300) + +################################################################################################################################################################################################################### + +def make_spider(merge_LH2E,merge_LH8E): + + c_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_LH2E] + t_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_LH8E] + + c_5 = 0 + c_5_counts = 0 + c_3 = 0 + c_3_counts = 0 + c_both =0 + c_both_counts=0 + c_mature = 0 + c_mat_counts = 0 + c_exception=0 + c_exception_counts=0 + + + t_5 = 0 + t_5_counts = 0 + t_3 = 0 + t_3_counts = 0 + t_both = 0 + t_both_counts = 0 + t_mature = 0 + t_mat_counts = 0 + t_exception = 0 + t_exception_counts=0 + + for x in c_samples: + + if "/" not in x[0]: + if "chr" in x[0].split("_")[-1]: + c_mature+=1 + c_mat_counts += x[2] + elif 0 == int(x[0].split("_")[-1]): + c_5+=1 + c_5_counts += x[2] + elif 0 == int(x[0].split("_")[-2]): + c_3+=1 + c_3_counts += x[2] + else: + c_both+=1 + c_both_counts+=x[2] + + else: + f=0 + for y in x[0].split("/"): + if "chr" in y.split("_")[-1]: + c_mature+=1 + c_mat_counts += x[2] + f=1 + break + if f==0: + for y in x[0].split("/"): + c_exception+=1 + c_exception_counts += x[2] + + + for x in t_samples: + + if "/" not in x[0]: + if "chr" in x[0].split("_")[-1]: + t_mature+=1 + t_mat_counts += x[2] + elif 0 == int(x[0].split("_")[-1]): + t_5+=1 + t_5_counts += x[2] + elif 0 == int(x[0].split("_")[-2]): + t_3+=1 + t_3_counts += x[2] + else: + t_both+=1 + t_both_counts+=x[2] + + else: + f=0 + for y in x[0].split("/"): + if "chr" in y.split("_")[-1]: + t_mature+=1 + t_mat_counts += x[2] + f=1 + break + if f==0: + for y in x[0].split("/"): + t_exception+=1 + t_exception_counts += x[2] + + + c_all = c_5+c_3+c_both+c_mature+c_exception + c_all_counts = c_5_counts + c_3_counts + c_both_counts + c_mat_counts + c_exception_counts + + t_all = t_5+t_3+t_both+t_mature + t_exception + t_all_counts = t_5_counts + t_3_counts + t_both_counts + t_mat_counts + t_exception_counts + + c_5 = round(c_5/c_all*100,2) + c_3 = round(c_3/c_all*100,2) + c_both = round(c_both/c_all*100,2) + c_mature = round(c_mature/c_all*100,2) + c_exception = round(c_exception/c_all*100,2) + + c_5_counts = round(c_5_counts/c_all_counts*100,2) + c_3_counts = round(c_3_counts/c_all_counts*100,2) + c_both_counts = round(c_both_counts/c_all_counts*100,2) + c_mat_counts = round(c_mat_counts/c_all_counts*100,2) + c_exception_counts = round(c_exception_counts/c_all_counts*100,2) + + t_5 = round(t_5/t_all*100,2) + t_3 = round(t_3/t_all*100,2) + t_both = round(t_both/t_all*100,2) + t_mature = round(t_mature/t_all*100,2) + t_exception = round(t_exception/t_all*100,2) + + t_5_counts = round(t_5_counts/t_all_counts*100,2) + t_3_counts = round(t_3_counts/t_all_counts*100,2) + t_both_counts = round(t_both_counts/t_all_counts*100,2) + t_mat_counts = round(t_mat_counts/t_all_counts*100,2) + t_exception_counts = round(t_exception_counts/t_all_counts*100,2) + + radar_max = max(c_5, c_3, c_both,c_mature,c_exception,t_5,t_3,t_both,t_mature,t_exception) + radar_max_counts = max(c_5_counts,c_3_counts,c_both_counts,c_mat_counts,c_exception_counts,t_5_counts,t_3_counts,t_both_counts,t_mat_counts,t_exception_counts) + + df=pd.DataFrame({ + 'group':['Controls','Treated'], + """5' and 3' isomiRs""":[c_both,t_both], + """3' isomiRs""":[c_3,t_3], + 'miRNA RefSeq':[c_mature,t_mature], + """5' isomiRs""":[c_5,t_5], + 'Others*':[c_exception,t_exception]}) + + df1=pd.DataFrame({ + 'group':['Controls','Treated'], + """5' and 3' isomiRs""":[c_both_counts,t_both_counts], + """3' isomiRs""":[c_3_counts,t_3_counts], + 'miRNA RefSeq':[c_mat_counts,t_mat_counts], + """5' isomiRs""":[c_5_counts,t_5_counts], + 'Others*':[c_exception_counts,t_exception_counts]}) + + spider_last(df,radar_max,1) + spider_last(df1,radar_max_counts,2) + + +def spider_last(df,radar_max,flag): + # ------- PART 1: Create background + fig = plt.figure() + # number of variable + categories=list(df)[1:] + N = len(categories) + + # What will be the angle of each axis in the plot? (we divide the plot / number of variable) + angles = [n / float(N) * 2 * pi for n in range(N)] + angles += angles[:1] + + # Initialise the spider plot + ax = plt.subplot(111, polar=True) + + # If you want the first axis to be on top: + ax.set_theta_offset(pi/2) + ax.set_theta_direction(-1) + + # Draw one axe per variable + add labels labels yet + plt.xticks(angles[:-1], categories, fontsize=11) + + # Draw ylabels + radar_max=round(radar_max+radar_max*0.1) + mul=len(str(radar_max))-1 + maxi=int(math.ceil(radar_max / pow(10,mul))) * pow(10,mul) + sep = round(maxi/4) + plt.yticks([sep, 2*sep, 3*sep, 4*sep, 5*sep], [str(sep)+'%', str(2*sep)+'%', str(3*sep)+'%', str(4*sep)+'%', str(5*sep)+'%'], color="grey", size=10) + plt.ylim(0, maxi) + + # ------- PART 2: Add plots + + # Plot each individual = each line of the data + # I don't do a loop, because plotting more than 3 groups makes the chart unreadable + + # Ind1 + values=df.loc[0].drop('group').values.flatten().tolist() + values += values[:1] + ax.plot(angles, values,'-o', linewidth=1, linestyle='solid', label="Controls") + ax.fill(angles, values, 'b', alpha=0.1) + + # Ind2 + values=df.loc[1].drop('group').values.flatten().tolist() + values += values[:1] + ax.plot(angles, values, '-o' ,linewidth=1, linestyle='solid', label="Treated") + ax.fill(angles, values, 'r', alpha=0.1) + + # Add legend + if flag==1: + plt.legend(loc='upper right', bbox_to_anchor=(0.0, 0.1)) + plt.savefig('spider_non_red.png',dpi=300) + else: + plt.legend(loc='upper right', bbox_to_anchor=(0.0, 0.1)) + plt.savefig('spider_red.png',dpi=300) + + +############################################################################################################################################################################################################# +def hist_red(samples,flag): + lengths=[] + cat=[] + total_reads=0 + seq=[] + + if flag == "c": + title = "Length Distribution of Control group (Redudant reads)" + if flag == "t": + title = "Length Distribution of Treated group (Redudant reads)" + + for i in samples: + for x in i: + lengths.append(len(x[9])) + if x[1]=="0": + seq.append([x[9],x[0].split("-")[1],"Mapped"]) + cat.append("Mapped") + if x[1] == "4": + seq.append([x[9],x[0].split("-")[1],"Unmapped"]) + cat.append("Unmapped") + + uni_len=list(set(lengths)) + uni_len=[x for x in uni_len if x<=35] + low=min(lengths) + up=max(lengths) + seq.sort() + uni_seq=list(seq for seq,_ in itertools.groupby(seq)) + dim=up-low + + if dim>20: + s=5 + else: + s=8 + + total_reads+=sum([int(x[1]) for x in uni_seq]) + + map_reads=[] + unmap_reads=[] + length=[] + for y in uni_len: + map_temp=0 + unmap_temp=0 + for x in uni_seq: + if len(x[0])==y and x[2]=="Mapped": + map_temp+=int(x[1]) + if len(x[0])==y and x[2]=="Unmapped": + unmap_temp+=int(x[1]) + if y<=35: + length.append(y) + map_reads.append(round(map_temp/total_reads*100,2)) + unmap_reads.append(round(unmap_temp/total_reads*100,2)) + + ylim=max([sum(x) for x in zip(unmap_reads, map_reads)]) + ylim=ylim+ylim*20/100 + fig, ax = plt.subplots() + width=0.8 + ax.bar(length, unmap_reads, width, label='Unmapped') + h=ax.bar(length, map_reads, width, bottom=unmap_reads, label='Mapped') + plt.xticks(np.arange(length[0], length[-1]+1, 1)) + plt.xlabel('Length (nt)',fontsize=14) + plt.ylabel('Percentage',fontsize=14) + plt.title(title,fontsize=14) + ax.legend() + plt.ylim([0, ylim]) + ax.grid(axis='y',linewidth=0.2) + + if flag=='c': + plt.savefig('c_hist_red.png',dpi=300) + + if flag=='t': + plt.savefig('t_hist_red.png',dpi=300) + +################################################################################################################# + + +def logo_seq_red(merge, flag): + + if flag=="c": + titlos="Control group (Redundant)" + file_logo="c_logo.png" + file_bar="c_bar.png" + if flag=="t": + titlos="Treated group (Redundant)" + file_logo="t_logo.png" + file_bar="t_bar.png" + + c_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge] + + A=[0]*5 + C=[0]*5 + G=[0]*5 + T=[0]*5 + total_reads=0 + + for y in c_samples: + if "/" in y[0]: + length=[] + for x in y[0].split("/"): + length.append([len(x.split("_")[-1]),x.split("_")[-1],y[2]]) + + best=min(length) + total_reads+=best[2] + for i in range(5): + if i<len(best[1]): + if best[1][i] == "A": + A[i]+=best[2] + elif best[1][i] == "C": + C[i]+=best[2] + elif best[1][i] == "G": + G[i]+=best[2] + else: + T[i]+=best[2] + else: + total_reads+=y[2] + for i in range(5): + if i<len(y[0].split("_")[-1]): + if y[0].split("_")[-1][i] == "A": + A[i]+=(y[2]) + elif y[0].split("_")[-1][i] == "C": + C[i]+=(y[2]) + elif y[0].split("_")[-1][i] == "G": + G[i]+=(y[2]) + else: + T[i]+=y[2] + + A[:] = [round(x*100,1) / total_reads for x in A] + C[:] = [round(x*100,1) / total_reads for x in C] + G[:] = [round(x*100,1) / total_reads for x in G] + T[:] = [round(x*100,1) / total_reads for x in T] + + + + data = {'A':A,'C':C,'G':G,'T':T} + df = pd.DataFrame(data, index=[1,2,3,4,5]) + h=df.plot.bar(color=tuple(["g", "b","gold","r"]) ) + h.grid(axis='y',linewidth=0.2) + plt.xticks(rotation=0, ha="right") + plt.ylabel("Counts (%)",fontsize=18) + plt.xlabel("Positions (nt)",fontsize=18) + plt.title(titlos,fontsize=20) + plt.tight_layout() + plt.savefig(file_bar, dpi=300) + + import logomaker as lm + crp_logo = lm.Logo(df, font_name = 'DejaVu Sans') + crp_logo.style_spines(visible=False) + crp_logo.style_spines(spines=['left', 'bottom'], visible=True) + crp_logo.style_xticks(rotation=0, fmt='%d', anchor=0) + + # style using Axes methods + crp_logo.ax.set_title(titlos,fontsize=18) + crp_logo.ax.set_ylabel("Counts (%)", fontsize=16,labelpad=5) + crp_logo.ax.set_xlabel("Positions (nt)",fontsize=16, labelpad=5) + crp_logo.ax.xaxis.set_ticks_position('none') + crp_logo.ax.xaxis.set_tick_params(pad=-1) + figure = plt.gcf() + figure.set_size_inches(6, 4) + crp_logo.fig.savefig(file_logo,dpi=300) + +########################################################################################################################################################################################################## + + + +def logo_seq_non_red(merge_LH2E): + + c_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_LH2E] + + A=[0]*5 + C=[0]*5 + G=[0]*5 + T=[0]*5 + + for y in c_samples: + if "/" in y[0]: + length=[] + for x in y[0].split("/"): + length.append([len(x.split("_")[-1]),x.split("_")[-1],y[2]]) + + best=min(length) + for i in range(5): + if i<len(best[1]): + if best[1][i] == "A": + A[i]+=1 + elif best[1][i] == "C": + C[i]+=1 + elif best[1][i] == "G": + G[i]+=1 + else: + T[i]+=1 + else: + for i in range(5): + if i<len(y[0].split("_")[-1]): + if y[0].split("_")[-1][i] == "A": + A[i]+=1 + elif y[0].split("_")[-1][i] == "C": + C[i]+=1 + elif y[0].split("_")[-1][i] == "G": + G[i]+=1 + else: + T[i]+=1 + + data = {'A':A,'C':C,'G':G,'T':T} + df = pd.DataFrame(data, index=[1,2,3,4,5]) + h=df.plot.bar(title="Non-templated nucleotides after templated sequence",color=tuple(["g", "b","gold","r"])) + h.set_xlabel("Positions (nt)") + h.set_ylabel("Unique sequences") + plt.xticks(rotation=0, ha="right") + plt.tight_layout() + plt.savefig("bar2.png", dpi=300) + + + import logomaker as lm + crp_logo = lm.Logo(df, font_name = 'DejaVu Sans') + + # style using Logo methods + crp_logo.style_spines(visible=False) + crp_logo.style_spines(spines=['left', 'bottom'], visible=True) + crp_logo.style_xticks(rotation=0, fmt='%d', anchor=0) + + # style using Axes methods + crp_logo.ax.set_ylabel("Unique sequences", labelpad=5) + crp_logo.ax.set_xlabel("Positions (nt)", labelpad=5) + crp_logo.ax.xaxis.set_ticks_position('none') + crp_logo.ax.xaxis.set_tick_params(pad=-1) + crp_logo.ax.set_title("Non-redundant") + figure = plt.gcf() + crp_logo.fig.savefig('logo2.png', dpi=300) + + +################################################################################################################################################################################################################### + +def ssamples1(tem_names,tem_samp,non_names,non_samp,folder,pro): + + for i in range(2,len(tem_samp[0])): + + fp = open(folder+tem_names[i-2]+'.txt','w') + fp.write("miRNA id"+"\t"+tem_names[i-2]+"\n") + + for x in tem_samp: + fp.write("%s" % "\t".join([x[0],x[i]])+"\n") + + for j in range(len(non_names)): + if non_names[j]==tem_names[i-2]: + for x in non_samp: + fp.write("%s" % "\t".join([x[0],x[j+2]])+"\n") + fp.close() + +################################################################################################################################################################################################################### + +def download_matures(matures,org_name): + + #url = 'ftp://mirbase.org/pub/mirbase/21/mature.fa.gz' + url = 'ftp://mirbase.org/pub/mirbase/CURRENT/mature.fa.gz' + data = urllib.request.urlopen(url).read() + file_mirna = gzip.decompress(data).decode('utf-8') + file_mirna = file_mirna.split("\n") + + for i in range(0,len(file_mirna)-1,2): + + if org_name in file_mirna[i]: + matures.append(file_mirna[i]) + matures.append(file_mirna[i+1]) + +################################################################################################################################################################################################################### +def non_template_ref(sc,st,all_isoforms): + + pre_uni_seq_con = list(sc) + pre_uni_seq_tre = list(st) + + for x in pre_uni_seq_con: + for y in x: + if ">"+y[2] not in all_isoforms and ")_" in y[2] : + all_isoforms.append(">"+y[2]) + all_isoforms.append(y[9]) + + + for x in pre_uni_seq_tre: + for y in x: + if ">"+y[2] not in all_isoforms and ")_" in y[2]: + all_isoforms.append(">"+y[2]) + all_isoforms.append(y[9]) + +################################################################################################################################################################################################ + +def deseqe2(sample,mir_names,l,new_d,sample_name,sample_order): + + for y in mir_names: + flag=0 + for x in sample: + if y[0]==x[0]: + flag=1 + break + if flag==0: + sample.append([y[0],"0",y[1]]) + + sample.sort(key=lambda x: x[0]) + sample=list(sample for sample,_ in itertools.groupby(sample)) + + l.acquire() + new_d.append(sample) + sample_order.append(sample_name) + l.release() + +############################################################################################################################################################################################### + +if __name__ == '__main__': + + starttime = time.time() + + q1 = Queue() + q2 = Queue() + lock = Lock() + manager = Manager() + + mature_mirnas=manager.list() + ps_mature=Process(target=download_matures,args=(mature_mirnas,args.org_name)) + ps_mature.start() + + args.control[0]=args.control[0][1:] + args.control[len(args.control)-1][:-1] + control = [(args.control[i:i+2]) for i in range(0, len(args.control), 2)] + + args.treated[0]=args.treated[0][1:] + args.treated[len(args.treated)-1][:-1] + treated = [(args.treated[i:i+2]) for i in range(0, len(args.treated), 2)] + + +############## Detection of templated isoforms ################ + + radar = manager.list([0,0,0,0]) + samples = manager.list() + data= manager.list() + names_con=manager.list() + samples_mirna_names=manager.list() + deseq=manager.list() + unmap_seq=manager.Value('i',0) + unmap_counts=manager.Value('i',0) + LH2E_names=manager.list() + ini_c_samples = manager.list() + + + radar1 = manager.list([0,0,0,0]) + samples1 = manager.list() + data1 = manager.list() + names_tre = manager.list() + samples_mirna_names1=manager.list() + deseq1=manager.list() + unmap1_seq = manager.Value('i',0) + unmap1_counts = manager.Value('i',0) + LH8E_names=manager.list() + ini_t_samples = manager.list() + ps_mature.join() + + + mature_mirnas=list(mature_mirnas) + + + starttime1 = time.time() + ps_sam = [Process(target=sam,args=(mature_mirnas,path[1][:-1],path[0].split(",")[0],"c",lock,samples,data,names_con,unmap_seq,samples_mirna_names,deseq,LH2E_names,"0",ini_c_samples,unmap_counts)) for path in control] + ps_sam.extend([Process(target=sam,args=(mature_mirnas,path[1][:-1],path[0].split(",")[0],"t",lock,samples1,data1,names_tre,unmap1_seq,samples_mirna_names1,deseq1,LH8E_names,"0",ini_t_samples,unmap1_counts)) for path in treated]) + + [p.start() for p in ps_sam] + [p.join() for p in ps_sam] + print('SAM took {} seconds'.format(time.time() - starttime1)) + + ps_hist=[Process(target=hist_red,args=(ini_c_samples,'c'))] + ps_hist.extend([Process(target=hist_red,args=(ini_t_samples,'t'))]) + [x.start() for x in ps_hist] + + starttime200=time.time() + + sc = list(samples) + st = list(samples1) + + names_con=list(names_con) + names_tre=list(names_tre) + samples_mirna_names=list(samples_mirna_names) + samples_mirna_names.sort() + samples_mirna_names=list(samples_mirna_names for samples_mirna_names,_ in itertools.groupby(samples_mirna_names)) + + samples_mirna_names1=list(samples_mirna_names1) + samples_mirna_names1.sort() + samples_mirna_names1=list(samples_mirna_names1 for samples_mirna_names1,_ in itertools.groupby(samples_mirna_names1)) + + deseq=list(deseq) + deseq1=list(deseq1) + + new_names_con=manager.list() + new_names_tre=manager.list() + new_deseq=manager.list() + new_deseq1=manager.list() + ps_deseq=[Process(target=deseqe2,args=(sampp,samples_mirna_names,lock,new_deseq,names_con[i],new_names_con)) for i,sampp in enumerate(deseq)] + ps_deseq.extend([Process(target=deseqe2,args=(sampp,samples_mirna_names1,lock,new_deseq1,names_tre[i],new_names_tre)) for i,sampp in enumerate(deseq1)]) + + [z.start() for z in ps_deseq] + [z.join() for z in ps_deseq] + new_deseq=list(new_deseq) + new_deseq1=list(new_deseq1) + + LH2E=[[x[0],x[2]] for x in new_deseq[0]] + [LH2E[i].append(y[i][1]) for i,_ in enumerate(LH2E) for y in new_deseq] + + LH8E=[[x[0],x[2]] for x in new_deseq1[0]] + [LH8E[i].append(y[i][1]) for i,_ in enumerate(LH8E) for y in new_deseq1] + + print('Deseq took {} seconds'.format(time.time() - starttime200)) + + merg_nam_LH2E=manager.list() + merg_nam_LH8E=manager.list() + + LH2E_copy=copy.deepcopy(list(LH2E)) + LH8E_copy=copy.deepcopy(list(LH8E)) + + fil_sort_tre=manager.list() + fil_sort_con=manager.list() + raw_sort_tre=manager.list() + raw_sort_con=manager.list() + + ps_main = Process(target=main_temp,args=(list(LH2E), samples_mirna_names, list(LH8E), samples_mirna_names1,1,list(names_con),list(names_tre),fil_sort_tre,fil_sort_con,raw_sort_tre,raw_sort_con)) + ps_main.start() + + if args.anal=="2": + all_iso = manager.list() + ps_non_iso = Process(target=non_template_ref,args=(sc,st,all_iso)) + ps_non_iso.start() + + ps_merge = [Process(target=merging_names,args=(LH2E_copy,merg_nam_LH2E))] + ps_merge.extend([Process(target=merging_names,args=(LH8E_copy,merg_nam_LH8E))]) + [x.start() for x in ps_merge] + [x.join() for x in ps_merge] + + merg_nam_LH2E=list(merg_nam_LH2E) + merg_nam_LH8E=list(merg_nam_LH8E) + + starttime2 = time.time() + procs = [Process(target=DB_write,args=(x[0],x[1],x[2],x[3],1)) for x in data] + procs.extend([Process(target=DB_write,args=(x[0],x[1],x[2],x[3],1)) for x in data1]) + procs.extend([Process(target=make_spider,args=(merg_nam_LH2E,merg_nam_LH8E))]) + if args.anal == "1": + procs.extend([Process(target=pie_temp,args=(merg_nam_LH2E,unmap_seq.value,unmap_counts.value,merg_nam_LH8E,unmap1_seq.value,unmap1_counts.value))]) + + [p.start() for p in procs] + + + if args.anal=="1": + [x.join() for x in ps_hist] + [p.join() for p in procs] + ps_pdf = Process(target=pdf_before_DE,args=(args.anal)) + ps_pdf.start() + + print('Graphs took {} seconds'.format(time.time() - starttime2)) + + ps_main.join() + + fil_sort_con=list(fil_sort_con) + fil_sort_tre=list(fil_sort_tre) + if fil_sort_con==[]: + fil_sort_con=raw_sort_con + fil_sort_tre=raw_sort_tre + + raw_sort_con=list(raw_sort_con) + raw_sort_tre=list(raw_sort_tre) + names_con=list(new_names_con) + names_tre=list(new_names_tre) + + ps_write = Process(target=write_main,args=(raw_sort_con, raw_sort_tre, fil_sort_con, fil_sort_tre, names_con,names_tre,1)) + ps_write.start() + + ps1_matrix = [Process(target=ssamples,args=(names_con,fil_sort_con,"Diff/temp_con/",0))] + ps1_matrix.extend([Process(target=ssamples,args=(names_tre,fil_sort_tre,"Diff/temp_tre/",0))]) + [p.start() for p in ps1_matrix] + + if args.anal=="1": + ps_pdf.join() + if args.anal=="2": + [p.join() for p in procs] + [x.join() for x in ps_hist] + + ps_write.join() + [p.join() for p in ps1_matrix] + + +############################## Detection of Both ####################################### + + starttime10 = time.time() + + if args.anal == "2": + + n_data= manager.list() + n_names_con=manager.list() + n_samples_mirna_names=manager.list() + n_deseq=manager.list() + n_LH2E_names=manager.list() + + n_data1 = manager.list() + n_names_tre = manager.list() + n_samples_mirna_names1=manager.list() + n_deseq1=manager.list() + n_LH8E_names=manager.list() + + + new_mat_mirnas = list(mature_mirnas) + ps_non_iso.join() + + all_iso=list(all_iso) + new_mat_mirnas.extend(all_iso) + + starttime11=time.time() + + ps_sam = [Process(target=non_sam,args=(new_mat_mirnas,path[1][:-1],path[0].split(",")[0],"c",lock,n_data,n_names_con,n_deseq,n_samples_mirna_names,n_LH2E_names)) for path in control] + ps_sam.extend([Process(target=non_sam,args=(new_mat_mirnas,path[1][:-1],path[0].split(",")[0],"t",lock,n_data1,n_names_tre,n_deseq1,n_samples_mirna_names1,n_LH8E_names)) for path in treated]) + + [p.start() for p in ps_sam] + [p.join() for p in ps_sam] + + print('Non-sam took {} seconds'.format(time.time() - starttime11)) + + starttime12=time.time() + + n_names_con=list(n_names_con) + n_names_tre=list(n_names_tre) + n_samples_mirna_names=list(n_samples_mirna_names) + n_samples_mirna_names.sort() + n_samples_mirna_names=list(n_samples_mirna_names for n_samples_mirna_names,_ in itertools.groupby(n_samples_mirna_names)) + + n_samples_mirna_names1=list(n_samples_mirna_names1) + n_samples_mirna_names1.sort() + n_samples_mirna_names1=list(n_samples_mirna_names1 for n_samples_mirna_names1,_ in itertools.groupby(n_samples_mirna_names1)) + + n_deseq=list(n_deseq) + n_deseq1=list(n_deseq1) + + new_n_names_con=manager.list() + new_n_names_tre=manager.list() + n_new_deseq=manager.list() + n_new_deseq1=manager.list() + ps_deseq=[Process(target=deseqe2,args=(sampp,n_samples_mirna_names,lock,n_new_deseq,n_names_con[i],new_n_names_con)) for i,sampp in enumerate(n_deseq)] + ps_deseq.extend([Process(target=deseqe2,args=(sampp,n_samples_mirna_names1,lock,n_new_deseq1,n_names_tre[i],new_n_names_tre)) for i,sampp in enumerate(n_deseq1)]) + + [x.start() for x in ps_deseq] + [x.join() for x in ps_deseq] + n_new_deseq=list(n_new_deseq) + n_new_deseq1=list(n_new_deseq1) + + print([len(n_new_deseq[0]),len(n_new_deseq[1])]) + print([len(n_new_deseq1[0]),len(n_new_deseq1[1])]) + + n_LH2E=[[x[0],x[2]] for x in n_new_deseq[0]] + [n_LH2E[i].append(y[i][1]) for i,_ in enumerate(n_LH2E) for y in n_new_deseq] + + n_LH8E=[[x[0],x[2]] for x in n_new_deseq1[0]] + [n_LH8E[i].append(y[i][1]) for i,_ in enumerate(n_LH8E) for y in n_new_deseq1] + + print('Non-deseq took {} seconds'.format(time.time() - starttime12)) + + merg_nam_n_LH2E=manager.list() + merg_nam_n_LH8E=manager.list() + + n_LH2E_copy=copy.deepcopy(list(n_LH2E)) + n_LH8E_copy=copy.deepcopy(list(n_LH8E)) + + n_sort_tre=manager.list() + n_sort_con=manager.list() + + n_fil_sort_con=manager.list() + n_fil_sort_tre=manager.list() + n_raw_sort_con=manager.list() + n_raw_sort_tre=manager.list() + + ps_main = Process(target=main_temp,args=(list(n_LH2E), n_samples_mirna_names, list(n_LH8E), n_samples_mirna_names1,1,list(n_names_con),list(n_names_tre),n_fil_sort_tre,n_fil_sort_con,n_raw_sort_tre,n_raw_sort_con)) + ps_main.start() + + ps_merge = [Process(target=merging_names,args=(n_LH2E_copy,merg_nam_n_LH2E))] + ps_merge.extend([Process(target=merging_names,args=(n_LH8E_copy,merg_nam_n_LH8E))]) + [p.start() for p in ps_merge] + [p.join() for p in ps_merge] + + merg_nam_n_LH2E=list(merg_nam_n_LH2E) + merg_nam_n_LH8E=list(merg_nam_n_LH8E) + + procs = [Process(target=DB_write,args=(x[0],x[1],x[2],x[3],2)) for x in n_data] + procs.extend([Process(target=DB_write,args=(x[0],x[1],x[2],x[3],2)) for x in n_data1]) + procs.extend([Process(target=logo_seq_red,args=(merg_nam_n_LH2E,'c'))]) + procs.extend([Process(target=logo_seq_red,args=(merg_nam_n_LH8E,'t'))]) + procs.extend([Process(target=pie_non_temp,args=(merg_nam_LH2E,merg_nam_n_LH2E,merg_nam_LH8E,merg_nam_n_LH8E,unmap_seq.value,unmap1_seq.value,unmap_counts.value,unmap1_counts.value))]) + + starttime13=time.time() + [p.start() for p in procs] + [p.join() for p in procs] + + print('Graphs took {} seconds'.format(time.time() - starttime13)) + + procs1 = Process(target=pdf_before_DE,args=(args.anal)) + procs1.start() + + starttime14=time.time() + ps_main.join() + + n_fil_sort_con=list(n_fil_sort_con) + n_fil_sort_tre=list(n_fil_sort_tre) + if n_fil_sort_con==[]: + n_fil_sort_con=n_raw_sort_con + n_fil_sort_tre=n_raw_sort_tre + + n_raw_sort_con=list(n_raw_sort_con) + n_raw_sort_tre=list(n_raw_sort_tre) + n_names_con=list(new_n_names_con) + n_names_tre=list(new_n_names_tre) + + ps_write = Process(target=write_main,args=(n_raw_sort_con, n_raw_sort_tre,n_fil_sort_con, n_fil_sort_tre, n_names_con, n_names_tre,2)) + ps_write.start() + + ps1_matrix = [Process(target=ssamples1,args=(n_names_con,n_fil_sort_con,names_con,fil_sort_con,"Diff/n_temp_con/",0))] + ps1_matrix.extend([Process(target=ssamples1,args=(n_names_tre,n_fil_sort_tre,names_tre,fil_sort_tre,"Diff/n_temp_tre/",0))]) + [p.start() for p in ps1_matrix] + + ps_write.join() + [p.join() for p in ps1_matrix] + procs1.join() + print('That took {} seconds'.format(time.time() - starttime10)) + print('That took {} seconds'.format(time.time() - starttime)) + + + + + + + +