Mercurial > repos > glogobyte > viztool
changeset 4:2b231cde4efd draft
Deleted selected files
author | glogobyte |
---|---|
date | Fri, 16 Oct 2020 17:45:21 +0000 |
parents | 67f6938b3c42 |
children | 89ab0ef1c931 |
files | mirbase_functions.py mirbase_graphs.py mirbase_ultra_v2.py toolExample.xml |
diffstat | 4 files changed, 0 insertions(+), 3560 deletions(-) [+] |
line wrap: on
line diff
--- a/mirbase_functions.py Fri Oct 16 12:17:22 2020 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,829 +0,0 @@ -import itertools -import time -import sys -import os -import urllib.request -import gzip -from multiprocessing import Process, Queue, Lock, Pool, Manager, Value -import subprocess -import argparse -from collections import OrderedDict -from matplotlib.backends.backend_pdf import PdfPages -import pandas as pd -from math import pi -import numpy as np -import matplotlib.pyplot as plt -from matplotlib.ticker import PercentFormatter -import seaborn as sns -import scipy.stats as stats -from plotnine import * -import math -import re -import matplotlib.ticker as mtick -import copy - - -"""---------------------- Simple Functions -----------------------""" - -# Read a file and return it as a list -def read(path, flag): - if flag == 0: - with open(path) as fp: - file=fp.readlines() - fp.close() - return file - - if flag == 1: - with open(path) as fp: - file = fp.read().splitlines() - fp.close() - return file - -# Write a list to a txt file -def write(path, list): - with open(path,'w') as fp: - for x in list: - fp.write(str("\t".join(x[1:-1]))) - fp.close() - -"""---------------------- RNA-seq Functions ----------------------""" - -# Detect the longest common substring sequence between two mirnas -def longestSubstring(str1, str2): - - from difflib import SequenceMatcher - # initialize SequenceMatcher object with - # input string - seqMatch = SequenceMatcher(None, str1, str2) - - # find match of longest sub-string - # output will be like Match(a=0, b=0, size=5) - match = seqMatch.find_longest_match(0, len(str1), 0, len(str2)) - - # print longest substring - if (match.size != 0): - return str1[match.a: match.a + match.size] - else: - print('No longest common sub-string found') - - - -######################################################################################################################################################## - -def collapse_sam(path): - - ini_sam=read(path,0) - main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]] - intro_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" in x.split("\t")[0]] - - uni_seq = [] - for x in main_sam: - - if [x[2], x[9]] not in uni_seq: - uni_seq.append([x[2], x[9]]) - - new_main_sam=[] - incr_num=0 - for i in range(len(uni_seq)): - count=0 - incr_num+=1 - for y in main_sam: - if uni_seq[i][1]==y[9] and uni_seq[i][0]==y[2]: - count+=1 - temp=y - temp[10]="~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" - temp[0]=str(incr_num)+"-"+str(count) - new_main_sam.append(temp) - - new_sam=intro_sam+new_main_sam - - return new_sam - -################################################################################################################################################################################################################# - -def duplicate_chroms_isoforms(List): - - dupes=[] - - for num in range(len(List)): - - if [List[num][9],List[num][0],List[num][2]] not in dupes : - dupes.append([List[num][9],List[num][0],List[num][2]]) - - for x in List: - for y in dupes: - if x[9]==y[0] and x[0]==y[1] and x[2].split("_")[0]==y[2].split("_")[0] and x[2]!=y[2]: - y.append(x[2]) - - - double_List = [x[:] for x in List] - - chr_order=[] - for x in dupes: - temp = [] - for i in range(2,len(x)): - if x[i].split("chr")[1].split("(")[0].isdigit(): - temp.append(int(x[i].split("chr")[1].split("(")[1][0]+x[i].split("chr")[1].split("(")[0])) - else: - temp.append(x[i].split("chr")[1][0:4]) - - for z in temp: - if 'X(-)'==z or 'Y(-)'==z or 'X(+)'==z or 'Y(+)'==z: - temp = [str(j) for j in temp] - temp=list(set(temp)) - temp.sort() - chr_order.append(temp) - - final_dupes=[] - for i in range(len(dupes)): - final_dupes.append([dupes[i][0],dupes[i][2].split("_")[0],dupes[i][1]]) - for x in chr_order[i]: - result = re.match("[-+]?\d+$", str(x)) - if len(chr_order[i]) == len(set(chr_order[i])): - if result is not None: - - if int(x)<0: - final_dupes[i][1]=final_dupes[i][1]+"_chr"+str(abs(int(x)))+"(-)" - else: - final_dupes[i][1] = final_dupes[i][1] + "_chr" + str(abs(int(x)))+"(+)" - else: - final_dupes[i][1] = final_dupes[i][1] + "_chr" + str(x) - else: - if result is not None: - if int(x) < 0: - final_dupes[i][1] = final_dupes[i][1] +dupes[i][2].split("_")[1]+ "_chr" + str(abs(int(x))) + "(-)" - else: - final_dupes[i][1] = final_dupes[i][1] +dupes[i][2].split("_")[1]+ "_chr" + str(abs(int(x))) + "(+)" - else: - final_dupes[i][1] = final_dupes[i][1] +dupes[i][2].split("_")[1]+ "_chr" + str(x) - - final_dupes.sort() - final_dupes=list(final_dupes for final_dupes,_ in itertools.groupby(final_dupes)) - - for i in range(len(double_List)): - for x in final_dupes: - - if double_List[i][9] == x[0] and double_List[i][0] == x[2] and len(double_List[i][2].split("_")) >3 and double_List[i][2].split("_")[0]==x[1].split("_")[0]: - gg=str("_"+double_List[i][2].split("_")[-2]+"_"+double_List[i][2].split("_")[-1]) - double_List[i][2] = x[1]+gg - - if double_List[i][9]==x[0] and double_List[i][0]== x[2] and len(double_List[i][2].split("_"))==3 and double_List[i][2].split("_")[0]==x[1].split("_")[0]: - double_List[i][2]=x[1] - List[i][2] = x[1] - - List.sort() - new_list=list(List for List,_ in itertools.groupby(List)) - - double_List.sort() - new_double_List = list(double_List for double_List, _ in itertools.groupby(double_List)) - - return new_list, new_double_List - - -############################################################################################################################################################################################################# - -def sam(mature_mirnas,path,name,con,l,samples,data,names,unmap_seq,samples_mirna_names,deseq,LHE_names,umi,ini_sample,unmap_counts): - - # read the sam file - ini_sam=read(path,0) - new_main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]] - unique_seq = [x for x in new_main_sam if x[1] == '0' and len(x[9])>=18 and len(x[9])<=26] - - sorted_uni_arms = [] - - for i in range(len(mature_mirnas)): - tmp_count_reads = 0 # calculate the total number of reads - tmp_count_seq = 0 # calculate the total number of sequences - for j in range(len(unique_seq)): - - if "{" in unique_seq[j][2].split("_")[0]: - official=unique_seq[j][2].split("_")[0][:-4] - else: - official=unique_seq[j][2].split("_")[0] - - if mature_mirnas[i].split(" ")[0][1:] == official: - - temp_mature = mature_mirnas[i+1].strip().replace("U", "T") - off_part = longestSubstring(temp_mature, unique_seq[j][9]) - - mat_diff = temp_mature.split(off_part) - mat_diff = [len(mat_diff[0]), len(mat_diff[1])] - - unique_diff = unique_seq[j][9].split(off_part) - unique_diff = [len(unique_diff[0]), len(unique_diff[1])] - - # Problem with hsa-miR-8485 - if mat_diff[1]!=0 and unique_diff[1]!=0: - unique_seq[j]=1 - pre_pos = 0 - post_pos = 0 - - elif mat_diff[0]!=0 and unique_diff[0]!=0: - unique_seq[j]=1 - pre_pos = 0 - post_pos = 0 - - else: - pre_pos = mat_diff[0]-unique_diff[0] - post_pos = unique_diff[1]-mat_diff[1] - tmp_count_reads = tmp_count_reads + int(unique_seq[j][0].split("-")[1]) - tmp_count_seq = tmp_count_seq+1 - - if pre_pos != 0 or post_pos != 0: - if pre_pos == 0: - unique_seq[j][2] = unique_seq[j][2] + "_" +str(pre_pos) + "_" + '{:+d}'.format(post_pos) - elif post_pos == 0: - unique_seq[j][2] = unique_seq[j][2] + "_" + '{:+d}'.format(pre_pos) + "_" + str(post_pos) - else: - unique_seq[j][2] = unique_seq[j][2]+"_"+'{:+d}'.format(pre_pos)+"_"+'{:+d}'.format(post_pos) - - for x in range(unique_seq.count(1)): - unique_seq.remove(1) - if tmp_count_reads != 0 and tmp_count_seq != 0: - sorted_uni_arms.append([mature_mirnas[i].split(" ")[0][1:], tmp_count_seq, tmp_count_reads]) - sorted_uni_arms = sorted(sorted_uni_arms, key=lambda x: x[1], reverse=True) - dedup_unique_seq,double_fil_uni_seq=duplicate_chroms_isoforms(unique_seq) - - for y in sorted_uni_arms: - counts=0 - seqs=0 - for x in double_fil_uni_seq: - if y[0]==x[2].split("_")[0]: - counts+=int(x[0].split("-")[1]) - seqs+=1 - - y[1]=seqs - y[2]=counts - - LHE=[] - l.acquire() - if con=="c": - LHE.extend(z[2] for z in double_fil_uni_seq) - for y in double_fil_uni_seq: - samples_mirna_names.append([y[2],y[9]]) - deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in double_fil_uni_seq]) - LHE_names.extend(LHE) - unmap_seq.value += sum([1 for x in new_main_sam if x[1] == '4']) - unmap_counts.value += sum([int(x[0].split("-")[1]) for x in new_main_sam if x[1] == '4']) - names.append(name) - samples.append(dedup_unique_seq) - data.append([con,name,double_fil_uni_seq,sorted_uni_arms]) - ini_sample.append(new_main_sam) - - if con=="t": - LHE.extend(z[2] for z in double_fil_uni_seq) - for y in double_fil_uni_seq: - samples_mirna_names.append([y[2],y[9]]) - deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in double_fil_uni_seq]) - LHE_names.extend(LHE) - unmap_seq.value += sum([1 for x in new_main_sam if x[1] == '4']) - unmap_counts.value += sum([int(x[0].split("-")[1]) for x in new_main_sam if x[1] == '4']) - names.append(name) - samples.append(dedup_unique_seq) - data.append([con,name,double_fil_uni_seq,sorted_uni_arms]) - ini_sample.append(new_main_sam) - l.release() - - -###################################################################################################################################### - -""" - -Read a sam file from Bowtie and do the followings: - -1) Remove reverse stranded mapped reads -2) Remove unmapped reads -3) Remove all sequences with reads less than 11 reads -4) Sort the arms with the most sequences in decreading rate -5) Sort the sequences of every arm with the most reads in decreasing rate -6) Calculate total number of sequences of every arm -7) Calculate total number of reads of sequences of every arm. -8) Store all the informations in a txt file - -""" - -def non_sam(mature_mirnas,path,name,con,l,data,names,n_deseq,n_samples_mirna_names,n_LHE_names): - - ini_sam=read(path,0) - new_main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]] - unique_seq=[] - unique_seq = [x for x in new_main_sam if x[1] == '4' and len(x[9])>=18 and len(x[9])<=26] - - uni_seq=[] - # Calculate the shifted positions for every isomir and add them to the name of it - sorted_uni_arms = [] - for i in range(1,len(mature_mirnas),2): - tmp_count_reads = 0 # calculate the total number of reads - tmp_count_seq = 0 # calculate the total number of sequences - - for j in range(len(unique_seq)): - - temp_mature = mature_mirnas[i].strip().replace("U", "T") - - if temp_mature in unique_seq[j][9]: - - off_part = longestSubstring(temp_mature, unique_seq[j][9]) - - mat_diff = temp_mature.split(off_part) - mat_diff = [len(mat_diff[0]), len(mat_diff[1])] - - unique_diff = unique_seq[j][9].split(off_part) - if len(unique_diff)<=2: - unique_diff = [len(unique_diff[0]), len(unique_diff[1])] - - pre_pos = mat_diff[0]-unique_diff[0] - post_pos = unique_diff[1]-mat_diff[1] - - lengthofmir = len(off_part) + post_pos - if pre_pos == 0: - tmp_count_reads = tmp_count_reads + int(unique_seq[j][0].split("-")[1]) - tmp_count_seq = tmp_count_seq + 1 - - if pre_pos == 0: - - t_name=unique_seq[j].copy() - t_name[2]=mature_mirnas[i - 1].split(" ")[0][1:] + "__" + str(pre_pos) + "_" + '{:+d}'.format(post_pos) + "_" + str(unique_seq[j][9][len(off_part):]) - uni_seq.append(t_name) - - - if tmp_count_reads != 0 and tmp_count_seq != 0: - sorted_uni_arms.append([mature_mirnas[i-1].split(" ")[0][1:], tmp_count_seq, tmp_count_reads]) - - - sorted_uni_arms = sorted(sorted_uni_arms, key=lambda x: x[1], reverse=True) - unique_seq = list(map(list, OrderedDict.fromkeys(map(tuple,uni_seq)))) - - LHE=[] - - l.acquire() - if con=="c": - LHE.extend(x[2] for x in unique_seq if x[2]!="*") - for x in unique_seq: - if x[2]!="*": - n_samples_mirna_names.append([x[2],x[9]]) - n_deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in unique_seq if x[2]!="*"]) - n_LHE_names.extend(LHE) - names.append(name) - data.append([con,name,unique_seq,sorted_uni_arms]) - - - if con=="t": - LHE.extend(x[2] for x in unique_seq if x[2]!="*") - for x in unique_seq: - if x[2]!="*": - n_samples_mirna_names.append([x[2],x[9]]) - n_deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in unique_seq if x[2]!="*"]) - n_LHE_names.extend(LHE) - names.append(name) - data.append([con,name,unique_seq,sorted_uni_arms]) - l.release() - -##################################################################################################################################################################################################################### -def deseq2_temp(samples_mirna_names,deseq,con,l): - - samples_mirna_names.sort(key=lambda x:[0]) - for i in range(len(deseq)): - for y in samples_mirna_names: - flag = 0 - for x in deseq[i]: - if y[0] == x[0]: - flag = 1 - break - - if flag == 0: - deseq[i].append([y[0], "0", y[1]]) - - [deseq[i].sort(key=lambda x: x[0]) for i, _ in enumerate(deseq)] - deseq_final = [[x[0],x[2]] for x in deseq[0]] - [deseq_final[z].append(deseq[i][j][1]) for z,_ in enumerate(deseq_final) for i, _ in enumerate(deseq) for j,_ in enumerate(deseq[i]) if deseq_final[z][0] == deseq[i][j][0]] - - l.acquire() - if con=="c": - q1.put(deseq_final) - - if con=="t": - q2.put(deseq_final) - l.release() - - -#################################################################################################################################################################################################################### - -def main_temp(LH2E, LH2E_names, LH8E, LH8E_names,flag,names_con,names_tre,filter_LH8E,filter_LH2E,raw_LH8E,raw_LH2E,per,count): - - LH8E_add_names = [x for x in LH2E_names if x not in LH8E_names] - LH2E_add_names = [x for x in LH8E_names if x not in LH2E_names] - - LH8E_add_names.sort() - LH2E_add_names.sort() - LH8E_add_names = list(LH8E_add_names for LH8E_add_names,_ in itertools.groupby(LH8E_add_names)) - LH2E_add_names = list(LH2E_add_names for LH2E_add_names,_ in itertools.groupby(LH2E_add_names)) - - LH2E.sort() - LH8E.sort() - LH2E = list(LH2E for LH2E,_ in itertools.groupby(LH2E)) - LH8E = list(LH8E for LH8E,_ in itertools.groupby(LH8E)) - - print("LHE_names") - print([len(LH8E_add_names),len(LH2E_add_names)]) - print([len(LH8E),len(LH2E)]) - - zeros=["0"]*(len(LH8E[0])-2) - [LH8E_add_names[i].extend(zeros) for i,_ in enumerate(LH8E_add_names)] - LH8E=LH8E+LH8E_add_names - - zeros=["0"]*(len(LH2E[0])-2) - [LH2E_add_names[i].extend(zeros) for i,_ in enumerate(LH2E_add_names)] - LH2E=LH2E+LH2E_add_names - - dupes=[] - final_LH2E =[] - - for num,_ in enumerate(LH2E): - - if LH2E[num][1] not in final_LH2E and LH2E[num][0] not in final_LH2E: - final_LH2E.append(LH2E[num][1]) - final_LH2E.append(LH2E[num][0]) - else: - dupes.append(LH2E[num][1]) - - - dupes=list(set(dupes)) - - dupes=[[x] for x in dupes] - - for x in LH2E: - for y in dupes: - if x[1]==y[0]: - fl=0 - if len(y)==1: - y.append(x[0]) - else: - for i in range(1,len(y)): - if y[i].split("_")[0]==x[0].split("_")[0]: - fl=1 - if len(x[0])<len(y[i]): - del y[i] - y.append(x[0]) - break - - if fl==0: - y.append((x[0])) - - for y in dupes: - if len(y)>2: - for i in range(len(y)-1,1,-1): - y[1]=y[1]+"/"+y[i] - del y[i] - - for x in LH2E: - for y in dupes: - if x[1]==y[0]: - x[0]=y[1] - - for x in LH8E: - for y in dupes: - if x[1]==y[0]: - x[0]=y[1] - - - LH2E.sort() - LH2E=list(LH2E for LH2E,_ in itertools.groupby(LH2E)) - - LH8E.sort() - LH8E=list(LH8E for LH8E,_ in itertools.groupby(LH8E)) - - if int(per)!=-1: - percent=int(per)/100 - print(percent) - print(count) - - c_col_filter=round(percent*(len(LH2E[1])-2)) - t_col_filter=round(percent*(len(LH8E[1])-2)) - - for i, _ in enumerate(LH2E): - c_cols=0 - t_cols=0 - - c_cols=sum([1 for j in range(len(LH2E[i])-2) if int(LH2E[i][j+2])>=int(count)]) - t_cols=sum([1 for j in range(len(LH8E[i])-2) if int(LH8E[i][j+2])>=int(count)]) - - if c_cols>=c_col_filter or t_cols>=t_col_filter: - filter_LH8E.append(LH8E[i]) - filter_LH2E.append(LH2E[i]) - - raw_LH2E.extend(LH2E) - raw_LH8E.extend(LH8E) - -################################################################################################################################################################################################################## - -def write_main(raw_LH2E, raw_LH8E, fil_LH2E, fil_LH8E, names_con, names_tre, flag, per, n1, n2): - - if flag == 1 and int(per)!=-1: - fp = open('Counts/Filtered '+n2 +' Templated Counts', 'w') - fp.write("Name\t") - fp.write("Sequence") - for y in names_tre: - fp.write("\t"+y) - - for x in fil_LH8E: - fp.write("\n%s" % "\t".join(x)) - fp.close() - - fp = open('Counts/Filtered '+n1+' Templated Counts', 'w') - fp.write("Name\t") - fp.write("Sequence") - for y in names_con: - fp.write("\t"+y) - - for x in fil_LH2E: - fp.write("\n%s" % "\t".join(x)) - fp.close() - - - if flag == 2 and int(per)!=-1: - fp = open('Counts/Filtered '+n2+' Non-Templated Counts', 'w') - fp.write("Name\t") - fp.write("Sequence") - for y in names_tre: - fp.write("\t"+y) - - - for x in fil_LH8E: - fp.write("\n%s" % "\t".join(x)) - fp.close() - - fp = open('Counts/Filtered '+n1+' Non-Templated Counts', 'w') - fp.write("Name\t") - fp.write("Sequence") - for y in names_con: - fp.write("\t"+y) - - for x in fil_LH2E: - fp.write("\n%s" % "\t".join(x)) - fp.close() - - - if flag == 1: - fp = open('Counts/Raw '+n2+' Templated Counts', 'w') - fp.write("Name\t") - fp.write("Sequence") - for y in names_tre: - fp.write("\t"+y) - - for x in raw_LH8E: - fp.write("\n%s" % "\t".join(x)) - fp.close() - - fp = open('Counts/Raw '+n1+' Templated Counts', 'w') - fp.write("Name\t") - fp.write("Sequence") - for y in names_con: - fp.write("\t"+y) - - for x in raw_LH2E: - fp.write("\n%s" % "\t".join(x)) - fp.close() - - - if flag == 2: - fp = open('Counts/Raw '+n2+' Non-Templated Counts', 'w') - fp.write("Name\t") - fp.write("Sequence") - for y in names_tre: - fp.write("\t"+y) - - - for x in raw_LH8E: - fp.write("\n%s" % "\t".join(x)) - fp.close() - - fp = open('Counts/Raw '+n1+' Non-Templated Counts', 'w') - fp.write("Name\t") - fp.write("Sequence") - for y in names_con: - fp.write("\t"+y) - - for x in raw_LH2E: - fp.write("\n%s" % "\t".join(x)) - fp.close() - - -######################################################################################################################################### - -def ssamples(names,samp,folder,pro): - - for i in range(2,len(samp[0])): - - fp = open(folder+names[i-2]+'.txt','w') - fp.write("miRNA id"+"\t"+names[i-2]+"\n") - - for x in samp: - fp.write("%s" % "\t".join([x[0],x[i]])+"\n") - fp.close() - -################################################################################################################## - -def DB_write(con,name,unique_seq,sorted_uni_arms,f): - - if f==1: - # Write a txt file with all the information - if con=="c": - fp = open('split1/'+name, 'w') - - fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence")) - if con=="t": - fp = open('split2/'+name, 'w') - fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence")) - - - for i in range(len(sorted_uni_arms)): - temp = [] - for j in range(len(unique_seq)): - - if sorted_uni_arms[i][0] in unique_seq[j][2].split("_")[0]: - - temp.append(unique_seq[j]) - - temp = sorted(temp, key=lambda x: int(x[0].split('-')[1]), reverse=True) - fp.write("*********************************************************************************************************\n") - fp.write("%-8s\t%-22s\t%-25s\t%-30s\t%s\n" % ("|",str(sorted_uni_arms[i][0]),"Sequence count = "+str(sorted_uni_arms[i][1]),"Total reads = "+str(sorted_uni_arms[i][2]),"|")) - fp.write("*********************************************************************************************************\n\n") - [fp.write("%-8s\t%-40s\t%s\n" % (x[0].split("-")[1], x[2],x[9])) for x in temp] - fp.write("\n" + "\n") - fp.close() - - if f==2: - - if con=="c": - fp = open('split3/'+name, 'w') - fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence")) - if con=="t": - fp = open('split4/'+name, 'w') - fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence")) - - - for i in range(len(sorted_uni_arms)): - temp = [] - for j in range(len(unique_seq)): - if sorted_uni_arms[i][0]==unique_seq[j][2].split("__")[0]: - temp.append(unique_seq[j]) - if temp!=[]: - temp = sorted(temp, key=lambda x: int(x[0].split('-')[1]), reverse=True) - fp.write("*********************************************************************************************************\n") - fp.write("%-8s\t%-22s\t%-25s\t%-30s\t%s\n" % ("|",str(sorted_uni_arms[i][0]),"Sequence count = "+str(sorted_uni_arms[i][1]),"Total reads = "+str(sorted_uni_arms[i][2]),"|")) - fp.write("*********************************************************************************************************\n\n") - [fp.write("%-8s\t%-40s\t%s\n" % (x[0].split("-")[1], x[2],x[9])) for x in temp] - fp.write("\n" + "\n") - fp.close() - - -########################################################################################################################## - -def new_mat_seq(pre_unique_seq,mat_mirnas,l): - - unique_iso = [] - for x in pre_unique_seq: - if len(x[2].split("_"))==3: - for y in pre_unique_seq: - if x[2] in y[2] and int(x[0].split("-")[1])<int(y[0].split("-")[1]): - if any(y[2] in lst2 for lst2 in unique_iso)==False: - y[2]=">"+y[2] - unique_iso.append(y) - l.acquire() - for x in unique_iso: - mat_mirnas.append(x[2]) - mat_mirnas.append(x[9]) - l.release() - -######################################################################################################################### - -def merging_names(LH2E_copy,new): - - dupes=[] - final_LH2E =[] - - for num in range(len(LH2E_copy)): - - if LH2E_copy[num][1] not in final_LH2E and LH2E_copy[num][0] not in final_LH2E: - final_LH2E.append(LH2E_copy[num][1]) - final_LH2E.append(LH2E_copy[num][0]) - else: - dupes.append(LH2E_copy[num][1]) - - dupes=list(set(dupes)) - - for i in range(len(dupes)): - dupes[i]=[dupes[i]] - - for x in LH2E_copy: - for y in dupes: - if x[1]==y[0]: - fl=0 - if len(y)==1: - y.append(x[0]) - else: - for i in range(1,len(y)): - if y[i].split("_")[0]==x[0].split("_")[0]: - fl=1 - if len(x[0])<len(y[i]): - del y[i] - y.append(x[0]) - break - - if fl==0: - y.append((x[0])) - - for y in dupes: - if len(y)>2: - for i in range(len(y)-1,1,-1): - y[1]=y[1]+"/"+y[i] - del y[i] - - - for x in LH2E_copy: - for y in dupes: - if x[1]==y[0]: - x[0]=y[1] - - LH2E_copy.sort() - LH2E_copy=list(LH2E_copy for LH2E_copy,_ in itertools.groupby(LH2E_copy)) - - new.extend(LH2E_copy) - - -###################################################################################################################################################### - -def ssamples1(tem_names,tem_samp,non_names,non_samp,folder,pro): - - for i in range(2,len(tem_samp[0])): - - fp = open(folder+tem_names[i-2]+'.txt','w') - fp.write("miRNA id"+"\t"+tem_names[i-2]+"\n") - - for x in tem_samp: - fp.write("%s" % "\t".join([x[0],x[i]])+"\n") - - for j in range(len(non_names)): - if non_names[j]==tem_names[i-2]: - for x in non_samp: - fp.write("%s" % "\t".join([x[0],x[j+2]])+"\n") - fp.close() - -################################################################################################################################################################################################################### - -def download_matures(matures,org_name): - - #url = 'ftp://mirbase.org/pub/mirbase/21/mature.fa.gz' - url = 'ftp://mirbase.org/pub/mirbase/CURRENT/mature.fa.gz' - data = urllib.request.urlopen(url).read() - file_mirna = gzip.decompress(data).decode('utf-8') - file_mirna = file_mirna.split("\n") - - for i in range(0,len(file_mirna)-1,2): - - if org_name in file_mirna[i]: - matures.append(file_mirna[i]) - matures.append(file_mirna[i+1]) - -################################################################################################################################################################################################################### -def non_template_ref(sc,st,all_isoforms): - - pre_uni_seq_con = list(sc) - pre_uni_seq_tre = list(st) - - for x in pre_uni_seq_con: - for y in x: - if ">"+y[2] not in all_isoforms and ")_" in y[2] : - all_isoforms.append(">"+y[2]) - all_isoforms.append(y[9]) - - - for x in pre_uni_seq_tre: - for y in x: - if ">"+y[2] not in all_isoforms and ")_" in y[2]: - all_isoforms.append(">"+y[2]) - all_isoforms.append(y[9]) - -################################################################################################################################################################################################ - -def deseqe2(sample,mir_names,l,new_d,sample_name,sample_order): - - for y in mir_names: - flag=0 - for x in sample: - if y[0]==x[0]: - flag=1 - break - if flag==0: - sample.append([y[0],"0",y[1]]) - - sample.sort(key=lambda x: x[0]) - sample=list(sample for sample,_ in itertools.groupby(sample)) - - l.acquire() - new_d.append(sample) - sample_order.append(sample_name) - l.release() - -############################################################################################################################################################################################### -
--- a/mirbase_graphs.py Fri Oct 16 12:17:22 2020 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,809 +0,0 @@ -import itertools -import time -import sys -import os -import urllib.request -import gzip -from multiprocessing import Process, Queue, Lock, Pool, Manager, Value -import subprocess -import argparse -from collections import OrderedDict -from matplotlib.backends.backend_pdf import PdfPages -import pandas as pd -from math import pi -import numpy as np -import matplotlib.pyplot as plt -from matplotlib.ticker import PercentFormatter -import seaborn as sns -import scipy.stats as stats -from plotnine import * -import math -import re -import matplotlib.ticker as mtick -import copy - - -################################################################################################################################################################# -def pie_non_temp(merge_LH2E,merge_non_LH2E,merge_LH8E,merge_non_LH8E,c_unmap,t_unmap,c_unmap_counts,t_unmap_counts): - - c_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_LH2E] - t_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_LH8E] - c_non_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_non_LH2E] - t_non_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_non_LH8E] - - c_templ = 0 - c_tem_counts = 0 - c_mature = 0 - c_mat_counts = 0 - t_templ = 0 - t_tem_counts = 0 - t_mature = 0 - t_mat_counts = 0 - - c_non = len(c_non_samples) - c_non_counts = sum(x[2] for x in c_non_samples) - t_non = len(t_non_samples) - t_non_counts = sum(x[2] for x in t_non_samples) - - c_unmap = c_unmap - c_non - t_unmap = c_unmap - t_non - - c_unmap_counts=c_unmap_counts - c_non_counts - t_unmap_counts=t_unmap_counts - t_non_counts - - - for x in c_samples: - - if "/" not in x[0]: - if "chr" in x[0].split("_")[-1]: - c_mature+=1 - c_mat_counts += x[2] - else: - c_templ+=1 - c_tem_counts += x[2] - else: - f=0 - for y in x[0].split("/"): - if "chr" in y.split("_")[-1]: - c_mature+=1 - c_mat_counts += x[2] - f=1 - break - if f==0: - c_templ+=1 - c_tem_counts += x[2] - - for x in t_samples: - - if "/" not in x[0]: - if "chr" in x[0].split("_")[-1]: - t_mature+=1 - t_mat_counts += x[2] - else: - t_templ+=1 - t_tem_counts += x[2] - else: - f=0 - for y in x[0].split("/"): - if "chr" in y.split("_")[-1]: - t_mature+=1 - t_mat_counts += x[2] - f=1 - break - if f==0: - t_templ+=1 - t_tem_counts += x[2] - - fig = plt.figure(figsize=(7,5)) - labels = 'miRNA RefSeq','Template', 'Unmapped','Non-template' - sizes = [c_mat_counts, c_tem_counts, c_unmap_counts,c_non_counts] - colors = ['gold', 'yellowgreen', 'lightcoral', 'lightskyblue'] - ax1 = plt.subplot2grid((1,2),(0,0)) - patches, texts, autotexts=plt.pie(sizes, labels=labels, colors=colors, startangle=140,autopct='%1.1f%%',radius=0.8) - [x.set_fontsize(8) for x in texts] - plt.title('Control Group (reads)',fontsize=12) - labels = 'miRNA RefSeq','Template', 'Unmapped','non-template' - sizes = [t_mat_counts, t_tem_counts, t_unmap_counts, t_non_counts] - colors = ['gold', 'yellowgreen', 'lightcoral', 'lightskyblue'] - ax2 = plt.subplot2grid((1,2),(0,1)) - patches, texts, autotexts=plt.pie(sizes, labels=labels, colors=colors, startangle=140,autopct='%1.1f%%',radius=0.8) - [x.set_fontsize(8) for x in texts] - plt.title('Treated Group (reads)', fontsize=12) - plt.savefig('pie_non.png',dpi=300) - -###################################################################################################################################################### - - -def pie_temp(merge_LH2E,c_unmap,c_unmap_counts,merge_LH8E,t_unmap,t_unmap_counts): - - c_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_LH2E] - t_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_LH8E] - - c_templ = 0 - c_tem_counts = 0 - c_mature = 0 - c_mat_counts = 0 - t_templ = 0 - t_tem_counts = 0 - t_mature = 0 - t_mat_counts = 0 - - for x in c_samples: - - if "/" not in x[0]: - if "chr" in x[0].split("_")[-1]: - c_mature+=1 - c_mat_counts += x[2] - else: - c_templ+=1 - c_tem_counts += x[2] - else: - f=0 - for y in x[0].split("/"): - if "chr" in y.split("_")[-1]: - c_mature+=1 - c_mat_counts += x[2] - f=1 - break - if f==0: - c_templ+=1 - c_tem_counts += x[2] - - for x in t_samples: - - if "/" not in x[0]: - if "chr" in x[0].split("_")[-1]: - t_mature+=1 - t_mat_counts += x[2] - else: - t_templ+=1 - t_tem_counts += x[2] - else: - f=0 - for y in x[0].split("/"): - if "chr" in y.split("_")[-1]: - t_mature+=1 - t_mat_counts += x[2] - f=1 - break - if f==0: - t_templ+=1 - t_tem_counts += x[2] - - - fig = plt.figure() - labels = 'miRNA RefSeq','Template', 'Unmapped' - sizes = [c_mat_counts, c_tem_counts, c_unmap_counts] - colors = ['gold', 'yellowgreen', 'lightskyblue'] - explode = (0.2, 0.05, 0.1) - ax1 = plt.subplot2grid((1,2),(0,0)) - patches, texts, autotexts=plt.pie(sizes, labels=labels, colors=colors, startangle=140,autopct='%1.1f%%',radius=0.8) - [x.set_fontsize(8) for x in texts] - plt.title('Control group (reads)', fontsize=12) - labels = 'miRNA RefSeq','Template', 'Unmapped' - sizes = [t_mat_counts, t_tem_counts, t_unmap_counts] - colors = ['gold', 'yellowgreen', 'lightskyblue'] - explode = (0.2, 0.05, 0.1) - ax2 = plt.subplot2grid((1,2),(0,1)) - patches, texts, autotexts=plt.pie(sizes, labels=labels, colors=colors, startangle=140,autopct='%1.1f%%',radius=0.8) - [x.set_fontsize(8) for x in texts] - plt.title('Treated group (reads)',fontsize = 12) - plt.savefig('pie_tem.png',dpi=300) - -################################################################################################################################################################################################################### - - -def make_spider(merge_LH2E,merge_LH8E): - - c_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_LH2E] - t_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_LH8E] - - c_5 = 0 - c_5_counts = 0 - c_3 = 0 - c_3_counts = 0 - c_both =0 - c_both_counts=0 - c_mature = 0 - c_mat_counts = 0 - c_exception=0 - c_exception_counts=0 - - - t_5 = 0 - t_5_counts = 0 - t_3 = 0 - t_3_counts = 0 - t_both = 0 - t_both_counts = 0 - t_mature = 0 - t_mat_counts = 0 - t_exception = 0 - t_exception_counts=0 - - for x in c_samples: - - if "/" not in x[0]: - if "chr" in x[0].split("_")[-1]: - c_mature+=1 - c_mat_counts += x[2] - elif 0 == int(x[0].split("_")[-1]): - c_5+=1 - c_5_counts += x[2] - elif 0 == int(x[0].split("_")[-2]): - c_3+=1 - c_3_counts += x[2] - else: - c_both+=1 - c_both_counts+=x[2] - - else: - f=0 - for y in x[0].split("/"): - if "chr" in y.split("_")[-1]: - c_mature+=1 - c_mat_counts += x[2] - f=1 - break - if f==0: - for y in x[0].split("/"): - c_exception+=1 - c_exception_counts += x[2] - - - for x in t_samples: - - if "/" not in x[0]: - if "chr" in x[0].split("_")[-1]: - t_mature+=1 - t_mat_counts += x[2] - elif 0 == int(x[0].split("_")[-1]): - t_5+=1 - t_5_counts += x[2] - elif 0 == int(x[0].split("_")[-2]): - t_3+=1 - t_3_counts += x[2] - else: - t_both+=1 - t_both_counts+=x[2] - - else: - f=0 - for y in x[0].split("/"): - if "chr" in y.split("_")[-1]: - t_mature+=1 - t_mat_counts += x[2] - f=1 - break - if f==0: - for y in x[0].split("/"): - t_exception+=1 - t_exception_counts += x[2] - - - c_all = c_5+c_3+c_both+c_mature+c_exception - c_all_counts = c_5_counts + c_3_counts + c_both_counts + c_mat_counts + c_exception_counts - - t_all = t_5+t_3+t_both+t_mature + t_exception - t_all_counts = t_5_counts + t_3_counts + t_both_counts + t_mat_counts + t_exception_counts - - c_5 = round(c_5/c_all*100,2) - c_3 = round(c_3/c_all*100,2) - c_both = round(c_both/c_all*100,2) - c_mature = round(c_mature/c_all*100,2) - c_exception = round(c_exception/c_all*100,2) - - c_5_counts = round(c_5_counts/c_all_counts*100,2) - c_3_counts = round(c_3_counts/c_all_counts*100,2) - c_both_counts = round(c_both_counts/c_all_counts*100,2) - c_mat_counts = round(c_mat_counts/c_all_counts*100,2) - c_exception_counts = round(c_exception_counts/c_all_counts*100,2) - - t_5 = round(t_5/t_all*100,2) - t_3 = round(t_3/t_all*100,2) - t_both = round(t_both/t_all*100,2) - t_mature = round(t_mature/t_all*100,2) - t_exception = round(t_exception/t_all*100,2) - - t_5_counts = round(t_5_counts/t_all_counts*100,2) - t_3_counts = round(t_3_counts/t_all_counts*100,2) - t_both_counts = round(t_both_counts/t_all_counts*100,2) - t_mat_counts = round(t_mat_counts/t_all_counts*100,2) - t_exception_counts = round(t_exception_counts/t_all_counts*100,2) - - radar_max = max(c_5, c_3, c_both,c_mature,c_exception,t_5,t_3,t_both,t_mature,t_exception) - radar_max_counts = max(c_5_counts,c_3_counts,c_both_counts,c_mat_counts,c_exception_counts,t_5_counts,t_3_counts,t_both_counts,t_mat_counts,t_exception_counts) - - df=pd.DataFrame({ - 'group':['Controls','Treated'], - """5' and 3' isomiRs""":[c_both,t_both], - """3' isomiRs""":[c_3,t_3], - 'miRNA RefSeq':[c_mature,t_mature], - """5' isomiRs""":[c_5,t_5], - 'Others*':[c_exception,t_exception]}) - - df1=pd.DataFrame({ - 'group':['Controls','Treated'], - """5' and 3' isomiRs""":[c_both_counts,t_both_counts], - """3' isomiRs""":[c_3_counts,t_3_counts], - 'miRNA RefSeq':[c_mat_counts,t_mat_counts], - """5' isomiRs""":[c_5_counts,t_5_counts], - 'Others*':[c_exception_counts,t_exception_counts]}) - - spider_last(df,radar_max,1) - spider_last(df1,radar_max_counts,2) - -##################################################################################################################################################### - -def spider_last(df,radar_max,flag): - # ------- PART 1: Create background - fig = plt.figure() - # number of variable - categories=list(df)[1:] - N = len(categories) - - # What will be the angle of each axis in the plot? (we divide the plot / number of variable) - angles = [n / float(N) * 2 * pi for n in range(N)] - angles += angles[:1] - - # Initialise the spider plot - ax = plt.subplot(111, polar=True) - - # If you want the first axis to be on top: - ax.set_theta_offset(pi/2) - ax.set_theta_direction(-1) - - # Draw one axe per variable + add labels labels yet - plt.xticks(angles[:-1], categories, fontsize=11) - - # Draw ylabels - radar_max=round(radar_max+radar_max*0.1) - mul=len(str(radar_max))-1 - maxi=int(math.ceil(radar_max / pow(10,mul))) * pow(10,mul) - sep = round(maxi/4) - plt.yticks([sep, 2*sep, 3*sep, 4*sep, 5*sep], [str(sep)+'%', str(2*sep)+'%', str(3*sep)+'%', str(4*sep)+'%', str(5*sep)+'%'], color="grey", size=10) - plt.ylim(0, maxi) - - # ------- PART 2: Add plots - - # Plot each individual = each line of the data - # I don't do a loop, because plotting more than 3 groups makes the chart unreadable - - # Ind1 - values=df.loc[0].drop('group').values.flatten().tolist() - values += values[:1] - ax.plot(angles, values,'-o', linewidth=1, linestyle='solid', label="Controls") - ax.fill(angles, values, 'b', alpha=0.1) - - # Ind2 - values=df.loc[1].drop('group').values.flatten().tolist() - values += values[:1] - ax.plot(angles, values, '-o' ,linewidth=1, linestyle='solid', label="Treated") - ax.fill(angles, values, 'r', alpha=0.1) - - # Add legend - if flag==1: - plt.legend(loc='upper right', bbox_to_anchor=(0.0, 0.1)) - plt.savefig('spider_non_red.png',dpi=300) - else: - plt.legend(loc='upper right', bbox_to_anchor=(0.0, 0.1)) - plt.savefig('spider_red.png',dpi=300) - - -############################################################################################################################################################################################################# - -def hist_red(samples,flag): - lengths=[] - cat=[] - total_reads=0 - seq=[] - - if flag == "c": - title = "Length Distribution of Control group (Redudant reads)" - if flag == "t": - title = "Length Distribution of Treated group (Redudant reads)" - - for i in samples: - for x in i: - lengths.append(len(x[9])) - if x[1]=="0": - seq.append([x[9],x[0].split("-")[1],"Mapped"]) - cat.append("Mapped") - if x[1] == "4": - seq.append([x[9],x[0].split("-")[1],"Unmapped"]) - cat.append("Unmapped") - - uni_len=list(set(lengths)) - uni_len=[x for x in uni_len if x<=35] - low=min(lengths) - up=max(lengths) - seq.sort() - uni_seq=list(seq for seq,_ in itertools.groupby(seq)) - dim=up-low - - if dim>20: - s=5 - else: - s=8 - - total_reads+=sum([int(x[1]) for x in uni_seq]) - - map_reads=[] - unmap_reads=[] - length=[] - for y in uni_len: - map_temp=0 - unmap_temp=0 - for x in uni_seq: - if len(x[0])==y and x[2]=="Mapped": - map_temp+=int(x[1]) - if len(x[0])==y and x[2]=="Unmapped": - unmap_temp+=int(x[1]) - if y<=35: - length.append(y) - map_reads.append(round(map_temp/total_reads*100,2)) - unmap_reads.append(round(unmap_temp/total_reads*100,2)) - - ylim=max([sum(x) for x in zip(unmap_reads, map_reads)]) - ylim=ylim+ylim*20/100 - fig, ax = plt.subplots() - width=0.8 - ax.bar(length, unmap_reads, width, label='Unmapped') - h=ax.bar(length, map_reads, width, bottom=unmap_reads, label='Mapped') - plt.xticks(np.arange(length[0], length[-1]+1, 1)) - plt.xlabel('Length (nt)',fontsize=14) - plt.ylabel('Percentage',fontsize=14) - plt.title(title,fontsize=14) - ax.legend() - plt.ylim([0, ylim]) - ax.grid(axis='y',linewidth=0.2) - - if flag=='c': - plt.savefig('c_hist_red.png',dpi=300) - - if flag=='t': - plt.savefig('t_hist_red.png',dpi=300) - -################################################################################################################# - -def logo_seq_red(merge, flag): - - if flag=="c": - titlos="Control group (Redundant)" - file_logo="c_logo.png" - file_bar="c_bar.png" - if flag=="t": - titlos="Treated group (Redundant)" - file_logo="t_logo.png" - file_bar="t_bar.png" - - c_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge] - - A=[0]*5 - C=[0]*5 - G=[0]*5 - T=[0]*5 - total_reads=0 - - for y in c_samples: - if "/" in y[0]: - length=[] - for x in y[0].split("/"): - length.append([len(x.split("_")[-1]),x.split("_")[-1],y[2]]) - - best=min(length) - total_reads+=best[2] - for i in range(5): - if i<len(best[1]): - if best[1][i] == "A": - A[i]+=best[2] - elif best[1][i] == "C": - C[i]+=best[2] - elif best[1][i] == "G": - G[i]+=best[2] - else: - T[i]+=best[2] - else: - total_reads+=y[2] - for i in range(5): - if i<len(y[0].split("_")[-1]): - if y[0].split("_")[-1][i] == "A": - A[i]+=(y[2]) - elif y[0].split("_")[-1][i] == "C": - C[i]+=(y[2]) - elif y[0].split("_")[-1][i] == "G": - G[i]+=(y[2]) - else: - T[i]+=y[2] - - A[:] = [round(x*100,1) / total_reads for x in A] - C[:] = [round(x*100,1) / total_reads for x in C] - G[:] = [round(x*100,1) / total_reads for x in G] - T[:] = [round(x*100,1) / total_reads for x in T] - - - - data = {'A':A,'C':C,'G':G,'T':T} - df = pd.DataFrame(data, index=[1,2,3,4,5]) - h=df.plot.bar(color=tuple(["g", "b","gold","r"]) ) - h.grid(axis='y',linewidth=0.2) - plt.xticks(rotation=0, ha="right") - plt.ylabel("Counts (%)",fontsize=18) - plt.xlabel("Positions (nt)",fontsize=18) - plt.title(titlos,fontsize=20) - plt.tight_layout() - plt.savefig(file_bar, dpi=300) - - import logomaker as lm - crp_logo = lm.Logo(df, font_name = 'DejaVu Sans') - crp_logo.style_spines(visible=False) - crp_logo.style_spines(spines=['left', 'bottom'], visible=True) - crp_logo.style_xticks(rotation=0, fmt='%d', anchor=0) - - # style using Axes methods - crp_logo.ax.set_title(titlos,fontsize=18) - crp_logo.ax.set_ylabel("Counts (%)", fontsize=16,labelpad=5) - crp_logo.ax.set_xlabel("Positions (nt)",fontsize=16, labelpad=5) - crp_logo.ax.xaxis.set_ticks_position('none') - crp_logo.ax.xaxis.set_tick_params(pad=-1) - figure = plt.gcf() - figure.set_size_inches(6, 4) - crp_logo.fig.savefig(file_logo,dpi=300) - -########################################################################################################################################################################################################## - -def logo_seq_non_red(merge_LH2E): - - c_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_LH2E] - - A=[0]*5 - C=[0]*5 - G=[0]*5 - T=[0]*5 - - for y in c_samples: - if "/" in y[0]: - length=[] - for x in y[0].split("/"): - length.append([len(x.split("_")[-1]),x.split("_")[-1],y[2]]) - - best=min(length) - for i in range(5): - if i<len(best[1]): - if best[1][i] == "A": - A[i]+=1 - elif best[1][i] == "C": - C[i]+=1 - elif best[1][i] == "G": - G[i]+=1 - else: - T[i]+=1 - else: - for i in range(5): - if i<len(y[0].split("_")[-1]): - if y[0].split("_")[-1][i] == "A": - A[i]+=1 - elif y[0].split("_")[-1][i] == "C": - C[i]+=1 - elif y[0].split("_")[-1][i] == "G": - G[i]+=1 - else: - T[i]+=1 - - data = {'A':A,'C':C,'G':G,'T':T} - df = pd.DataFrame(data, index=[1,2,3,4,5]) - h=df.plot.bar(title="Non-templated nucleotides after templated sequence",color=tuple(["g", "b","gold","r"])) - h.set_xlabel("Positions (nt)") - h.set_ylabel("Unique sequences") - plt.xticks(rotation=0, ha="right") - plt.tight_layout() - plt.savefig("bar2.png", dpi=300) - - - import logomaker as lm - crp_logo = lm.Logo(df, font_name = 'DejaVu Sans') - - # style using Logo methods - crp_logo.style_spines(visible=False) - crp_logo.style_spines(spines=['left', 'bottom'], visible=True) - crp_logo.style_xticks(rotation=0, fmt='%d', anchor=0) - - # style using Axes methods - crp_logo.ax.set_ylabel("Unique sequences", labelpad=5) - crp_logo.ax.set_xlabel("Positions (nt)", labelpad=5) - crp_logo.ax.xaxis.set_ticks_position('none') - crp_logo.ax.xaxis.set_tick_params(pad=-1) - crp_logo.ax.set_title("Non-redundant") - figure = plt.gcf() - crp_logo.fig.savefig('logo2.png', dpi=300) - - -################################################################################################################################################################ - -def pdf_before_DE(analysis): - - # Import FPDF class - from fpdf import FPDF, fpdf - - # Import glob module to find all the files matching a pattern - import glob - - # Image extensions - if analysis=="2": - image_extensions = ("c_hist_red.png","t_hist_red.png","pie_non.png","spider_red.png","spider_non_red.png","c_logo.png","t_logo.png","c_bar.png","t_bar.png") - else: - image_extensions = ("c_hist_red.png","t_hist_red.png","pie_tem.png","spider_red.png","spider_non_red.png") - # This list will hold the images file names - images = [] - - # Build the image list by merging the glob results (a list of files) - # for each extension. We are taking images from current folder. - for extension in image_extensions: - images.extend(glob.glob(extension)) - #sys.exit(images) - # Create instance of FPDF class - pdf = FPDF('P', 'in', 'A4') - # Add new page. Without this you cannot create the document. - pdf.add_page() - # Set font to Arial, 'B'old, 16 pts - pdf.set_font('Arial', 'B', 20.0) - - # Page header - pdf.cell(pdf.w-0.5, 0.5, 'IsomiR Profile Report',align='C') - pdf.ln(0.7) - pdf.set_font('Arial','', 16.0) - pdf.cell(pdf.w-0.5, 0.5, 'sRNA Length Distribution',align='C') - - # Smaller font for image captions - pdf.set_font('Arial', '', 11.0) - - # Image caption - pdf.ln(0.5) - - yh=FPDF.get_y(pdf) - pdf.image(images[0],x=0.3,w=4, h=3) - pdf.image(images[1],x=4,y=yh, w=4, h=3) - pdf.ln(0.3) - - # Image caption - pdf.cell(0.2) - pdf.cell(3.0, 0.0, " Mapped and unmapped reads to custom precussor arm reference DB (5p and 3p arms) in Control (left)") - pdf.ln(0.2) - pdf.cell(0.2) - pdf.cell(3.0, 0.0, " and Treated (right) groups") - - - pdf.ln(0.5) - h1=FPDF.get_y(pdf) - pdf.image(images[2],x=1, w=6.5, h=5) - h2=FPDF.get_y(pdf) - FPDF.set_y(pdf,h1+0.2) - pdf.set_font('Arial','', 14.0) - pdf.cell(pdf.w-0.5, 0.5, 'Template and non-template IsomiRs',align='C') - pdf.set_font('Arial', '', 11.0) - FPDF.set_y(pdf,h2) - FPDF.set_y(pdf,9.5) - # Image caption - pdf.cell(0.2) - if analysis=="2": - pdf.cell(3.0, 0.0, " Template, non-template, miRNA RefSeq and unmapped sequences as percentage of total sRNA reads") - pdf.ln(0.2) - pdf.cell(0.2) - pdf.cell(3.0, 0.0, " in Control (left) and treated (right) groups") - else: - pdf.cell(3.0, 0.0, " Template, miRNA RefSeq and unmapped sequences as percentage of total sRNA reads in") - pdf.ln(0.2) - pdf.cell(0.2) - pdf.cell(3.0, 0.0, " Control (left) and treated (right) groups") - - - - pdf.add_page() - pdf.set_font('Arial', 'B', 16.0) - pdf.cell(pdf.w-0.5, 0.5, "Reference form and isomiR among total miRNA reads",align='C') - pdf.ln(0.7) - pdf.set_font('Arial', 'B', 12.0) - pdf.cell(pdf.w-0.5, 0.5, "Template isomiR profile (redundant)",align='C') - pdf.ln(0.5) - pdf.image(images[3],x=1.5, w=5.5, h=4) - pdf.ln(0.6) - pdf.cell(pdf.w-0.5, 0.0, "Template isomiR profile (non-redundant)",align='C') - pdf.set_font('Arial', '', 12.0) - pdf.ln(0.2) - pdf.image(images[4],x=1.5, w=5.5, h=4) - pdf.ln(0.3) - pdf.set_font('Arial', '', 11.0) - pdf.cell(0.2) - pdf.cell(3.0, 0.0, " * IsomiRs potentialy initiated from multiple loci") - - - if analysis=="2": - pdf.add_page('L') - - pdf.set_font('Arial', 'B', 16.0) - pdf.cell(pdf.w-0.5, 0.5, "Non-template IsomiRs",align='C') - pdf.ln(0.5) - pdf.set_font('Arial', 'B', 12.0) - pdf.cell(pdf.w-0.5, 0.5, "3' Additions of reference of isomiR sequence",align='C') - pdf.ln(0.7) - - yh=FPDF.get_y(pdf) - pdf.image(images[5],x=1.5,w=3.65, h=2.65) - pdf.image(images[7],x=6.5,y=yh, w=3.65, h=2.65) - pdf.ln(0.5) - yh=FPDF.get_y(pdf) - pdf.image(images[6],x=1.5,w=3.65, h=2.65) - pdf.image(images[8],x=6.5,y=yh, w=3.65, h=2.65) - - pdf.close() - pdf.output('report1.pdf','F') - - - - -#############################################################################################################################################################3 - -def pdf_after_DE(analysis): - - # Import FPDF class - from fpdf import FPDF - - # Import glob module to find all the files matching a pattern - import glob - - # Image extensions - image_extensions = ("tem.png","non.png","a2.png") - - # This list will hold the images file names - images = [] - - # Build the image list by merging the glob results (a list of files) - # for each extension. We are taking images from current folder. - for extension in image_extensions: - images.extend(glob.glob(extension)) - - # Create instance of FPDF class - pdf = FPDF('P', 'in', 'letter') - # Add new page. Without this you cannot create the document. - pdf.add_page() - # Set font to Arial, 'B'old, 16 pts - pdf.set_font('Arial', 'B', 16.0) - - # Page header - pdf.cell(pdf.w-0.5, 0.5, 'Differential expression of miRNAs and Isoforms',align='C') - #pdf.ln(0.25) - - pdf.ln(0.7) - pdf.set_font('Arial','B', 12.0) - pdf.cell(pdf.w-0.5, 0.5, 'Top 50 most differentially expressed miRNA and template isoforms',align='C') - - - # Smaller font for image captions - pdf.set_font('Arial', '', 10.0) - - # Image caption - pdf.ln(0.4) - pdf.image(images[0],x=0.8, w=7, h=8) - pdf.ln(0.3) - - if analysis=="2": - - pdf.add_page() - pdf.ln(0.7) - pdf.set_font('Arial','B', 12.0) - pdf.cell(pdf.w-0.5, 0.5, 'Top 50 most differentially expressed non-template isomiRs',align='C') - pdf.ln(0.4) - pdf.image(images[1],x=0.5, w=7.5, h=6.5) - - pdf.add_page() - pdf.ln(0.5) - pdf.cell(pdf.w-0.5, 0.5, 'Top 50 most differentially expressed miRNAs and isomiRs grouped by arm',align='C') - pdf.ln(0.4) - pdf.image(images[2],x=0.8, w=7, h=8) - pdf.ln(0.3) - - - pdf.output('report2.pdf', 'F') - - -
--- a/mirbase_ultra_v2.py Fri Oct 16 12:17:22 2020 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,1723 +0,0 @@ -from mirbase_functions import * -from mirbase_graphs import * -import itertools -import time -import sys -import os -import urllib.request -import gzip -from multiprocessing import Process, Queue, Lock, Pool, Manager, Value -import subprocess -import argparse -from collections import OrderedDict -from matplotlib.backends.backend_pdf import PdfPages -import pandas as pd -from math import pi -import numpy as np -import matplotlib.pyplot as plt -from matplotlib.ticker import PercentFormatter -import seaborn as sns -import scipy.stats as stats -from plotnine import * -import math -import re -import matplotlib.ticker as mtick -import copy - -subprocess.call(['mkdir','-p', 'split1','split2','split3','split4','split11','split12','Counts','Diff/temp_con','Diff/temp_tre','Diff/n_temp_con','Diff/n_temp_tre']) - -parser = argparse.ArgumentParser() -parser.add_argument("-analysis", "--anal", help="choose type of analysis", action="store") -parser.add_argument("-con", "--control", help="input fastq file", nargs='+', default=[]) -parser.add_argument("-tre", "--treated", help="input fastq file", nargs='+', default=[] ) -parser.add_argument("-tool_dir", "--tool_directory", help="tool directory path", action="store") -parser.add_argument("-gen", "--org_name", help="tool directory path", action="store") -parser.add_argument("-program", "--pro", help="choose type of analysis", action="store") -parser.add_argument("-f", "--flag", help="choose the database", action="store") -parser.add_argument("-umis", "--umi", help="choose the database", action="store") -parser.add_argument("-percentage", "--per", help="choose the database", action="store") -parser.add_argument("-counts", "--count", help="choose the database", action="store") -parser.add_argument("-name1", "--n1", help="choose the database", action="store") -parser.add_argument("-name2", "--n2", help="choose the database", action="store") -args = parser.parse_args() - -######################################################################################################################################### - -def collapse_sam(path): - - ini_sam=read(path,0) - main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]] - intro_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" in x.split("\t")[0]] - - uni_seq = [] - for x in main_sam: - - if [x[2], x[9]] not in uni_seq: - uni_seq.append([x[2], x[9]]) - - new_main_sam=[] - incr_num=0 - for i in range(len(uni_seq)): - count=0 - incr_num+=1 - for y in main_sam: - if uni_seq[i][1]==y[9] and uni_seq[i][0]==y[2]: - count+=1 - temp=y - temp[10]="~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" - temp[0]=str(incr_num)+"-"+str(count) - new_main_sam.append(temp) - - new_sam=intro_sam+new_main_sam - - return new_sam - -################################################################################################################################################################################################################# - -def duplicate_chroms_isoforms(List): - - dupes=[] - - for num in range(len(List)): - - if [List[num][9],List[num][0],List[num][2]] not in dupes : - dupes.append([List[num][9],List[num][0],List[num][2]]) - - for x in List: - for y in dupes: - if x[9]==y[0] and x[0]==y[1] and x[2].split("_")[0]==y[2].split("_")[0] and x[2]!=y[2]: - y.append(x[2]) - - - double_List = [x[:] for x in List] - - chr_order=[] - for x in dupes: - temp = [] - for i in range(2,len(x)): - if x[i].split("chr")[1].split("(")[0].isdigit(): - temp.append(int(x[i].split("chr")[1].split("(")[1][0]+x[i].split("chr")[1].split("(")[0])) - else: - temp.append(x[i].split("chr")[1][0:4]) - - for z in temp: - if 'X(-)'==z or 'Y(-)'==z or 'X(+)'==z or 'Y(+)'==z: - temp = [str(j) for j in temp] - temp=list(set(temp)) - temp.sort() - chr_order.append(temp) - - final_dupes=[] - for i in range(len(dupes)): - final_dupes.append([dupes[i][0],dupes[i][2].split("_")[0],dupes[i][1]]) - for x in chr_order[i]: - result = re.match("[-+]?\d+$", str(x)) - if len(chr_order[i]) == len(set(chr_order[i])): - if result is not None: - - if int(x)<0: - final_dupes[i][1]=final_dupes[i][1]+"_chr"+str(abs(int(x)))+"(-)" - else: - final_dupes[i][1] = final_dupes[i][1] + "_chr" + str(abs(int(x)))+"(+)" - else: - final_dupes[i][1] = final_dupes[i][1] + "_chr" + str(x) - else: - if result is not None: - if int(x) < 0: - final_dupes[i][1] = final_dupes[i][1] +dupes[i][2].split("_")[1]+ "_chr" + str(abs(int(x))) + "(-)" - else: - final_dupes[i][1] = final_dupes[i][1] +dupes[i][2].split("_")[1]+ "_chr" + str(abs(int(x))) + "(+)" - else: - final_dupes[i][1] = final_dupes[i][1] +dupes[i][2].split("_")[1]+ "_chr" + str(x) - - final_dupes.sort() - final_dupes=list(final_dupes for final_dupes,_ in itertools.groupby(final_dupes)) - - for i in range(len(double_List)): - for x in final_dupes: - - if double_List[i][9] == x[0] and double_List[i][0] == x[2] and len(double_List[i][2].split("_")) >3 and double_List[i][2].split("_")[0]==x[1].split("_")[0]: - gg=str("_"+double_List[i][2].split("_")[-2]+"_"+double_List[i][2].split("_")[-1]) - double_List[i][2] = x[1]+gg - - if double_List[i][9]==x[0] and double_List[i][0]== x[2] and len(double_List[i][2].split("_"))==3 and double_List[i][2].split("_")[0]==x[1].split("_")[0]: - double_List[i][2]=x[1] - List[i][2] = x[1] - - List.sort() - new_list=list(List for List,_ in itertools.groupby(List)) - - double_List.sort() - new_double_List = list(double_List for double_List, _ in itertools.groupby(double_List)) - - return new_list, new_double_List - - -############################################################################################################################################################################################################# - -def sam(mature_mirnas,path,name,con,l,samples,data,names,unmap_seq,samples_mirna_names,deseq,LHE_names,umi,ini_sample,unmap_counts): - - # read the sam file - ini_sam=read(path,0) - new_main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]] - unique_seq = [x for x in new_main_sam if x[1] == '0' and len(x[9])>=18 and len(x[9])<=26] - - sorted_uni_arms = [] - - for i in range(len(mature_mirnas)): - tmp_count_reads = 0 # calculate the total number of reads - tmp_count_seq = 0 # calculate the total number of sequences - for j in range(len(unique_seq)): - - if "{" in unique_seq[j][2].split("_")[0]: - official=unique_seq[j][2].split("_")[0][:-4] - else: - official=unique_seq[j][2].split("_")[0] - - if mature_mirnas[i].split(" ")[0][1:] == official: - - temp_mature = mature_mirnas[i+1].strip().replace("U", "T") - off_part = longestSubstring(temp_mature, unique_seq[j][9]) - - mat_diff = temp_mature.split(off_part) - mat_diff = [len(mat_diff[0]), len(mat_diff[1])] - - unique_diff = unique_seq[j][9].split(off_part) - unique_diff = [len(unique_diff[0]), len(unique_diff[1])] - - # Problem with hsa-miR-8485 - if mat_diff[1]!=0 and unique_diff[1]!=0: - unique_seq[j]=1 - pre_pos = 0 - post_pos = 0 - - elif mat_diff[0]!=0 and unique_diff[0]!=0: - unique_seq[j]=1 - pre_pos = 0 - post_pos = 0 - - else: - pre_pos = mat_diff[0]-unique_diff[0] - post_pos = unique_diff[1]-mat_diff[1] - tmp_count_reads = tmp_count_reads + int(unique_seq[j][0].split("-")[1]) - tmp_count_seq = tmp_count_seq+1 - - if pre_pos != 0 or post_pos != 0: - if pre_pos == 0: - unique_seq[j][2] = unique_seq[j][2] + "_" +str(pre_pos) + "_" + '{:+d}'.format(post_pos) - elif post_pos == 0: - unique_seq[j][2] = unique_seq[j][2] + "_" + '{:+d}'.format(pre_pos) + "_" + str(post_pos) - else: - unique_seq[j][2] = unique_seq[j][2]+"_"+'{:+d}'.format(pre_pos)+"_"+'{:+d}'.format(post_pos) - - for x in range(unique_seq.count(1)): - unique_seq.remove(1) - if tmp_count_reads != 0 and tmp_count_seq != 0: - sorted_uni_arms.append([mature_mirnas[i].split(" ")[0][1:], tmp_count_seq, tmp_count_reads]) - sorted_uni_arms = sorted(sorted_uni_arms, key=lambda x: x[1], reverse=True) - dedup_unique_seq,double_fil_uni_seq=duplicate_chroms_isoforms(unique_seq) - - for y in sorted_uni_arms: - counts=0 - seqs=0 - for x in double_fil_uni_seq: - if y[0]==x[2].split("_")[0]: - counts+=int(x[0].split("-")[1]) - seqs+=1 - - y[1]=seqs - y[2]=counts - - LHE=[] - l.acquire() - if con=="c": - LHE.extend(z[2] for z in double_fil_uni_seq) - for y in double_fil_uni_seq: - samples_mirna_names.append([y[2],y[9]]) - deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in double_fil_uni_seq]) - LHE_names.extend(LHE) - unmap_seq.value += sum([1 for x in new_main_sam if x[1] == '4']) - unmap_counts.value += sum([int(x[0].split("-")[1]) for x in new_main_sam if x[1] == '4']) - names.append(name) - samples.append(dedup_unique_seq) - data.append([con,name,double_fil_uni_seq,sorted_uni_arms]) - ini_sample.append(new_main_sam) - - if con=="t": - LHE.extend(z[2] for z in double_fil_uni_seq) - for y in double_fil_uni_seq: - samples_mirna_names.append([y[2],y[9]]) - deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in double_fil_uni_seq]) - LHE_names.extend(LHE) - unmap_seq.value += sum([1 for x in new_main_sam if x[1] == '4']) - unmap_counts.value += sum([int(x[0].split("-")[1]) for x in new_main_sam if x[1] == '4']) - names.append(name) - samples.append(dedup_unique_seq) - data.append([con,name,double_fil_uni_seq,sorted_uni_arms]) - ini_sample.append(new_main_sam) - l.release() - - -###################################################################################################################################### -""" - -Read a sam file from Bowtie and do the followings: - -1) Remove reverse stranded mapped reads -2) Remove unmapped reads -3) Remove all sequences with reads less than 11 reads -4) Sort the arms with the most sequences in decreading rate -5) Sort the sequences of every arm with the most reads in decreasing rate -6) Calculate total number of sequences of every arm -7) Calculate total number of reads of sequences of every arm. -8) Store all the informations in a txt file - -""" - -def non_sam(mature_mirnas,path,name,con,l,data,names,n_deseq,n_samples_mirna_names,n_LHE_names): - - ini_sam=read(path,0) - new_main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]] - unique_seq=[] - unique_seq = [x for x in new_main_sam if x[1] == '4' and len(x[9])>=18 and len(x[9])<=26] - - uni_seq=[] - # Calculate the shifted positions for every isomir and add them to the name of it - sorted_uni_arms = [] - for i in range(1,len(mature_mirnas),2): - tmp_count_reads = 0 # calculate the total number of reads - tmp_count_seq = 0 # calculate the total number of sequences - - for j in range(len(unique_seq)): - - temp_mature = mature_mirnas[i].strip().replace("U", "T") - - if temp_mature in unique_seq[j][9]: - - off_part = longestSubstring(temp_mature, unique_seq[j][9]) - - mat_diff = temp_mature.split(off_part) - mat_diff = [len(mat_diff[0]), len(mat_diff[1])] - - unique_diff = unique_seq[j][9].split(off_part) - if len(unique_diff)<=2: - unique_diff = [len(unique_diff[0]), len(unique_diff[1])] - - pre_pos = mat_diff[0]-unique_diff[0] - post_pos = unique_diff[1]-mat_diff[1] - - lengthofmir = len(off_part) + post_pos - if pre_pos == 0: - tmp_count_reads = tmp_count_reads + int(unique_seq[j][0].split("-")[1]) - tmp_count_seq = tmp_count_seq + 1 - - if pre_pos == 0: - - t_name=unique_seq[j].copy() - t_name[2]=mature_mirnas[i - 1].split(" ")[0][1:] + "__" + str(pre_pos) + "_" + '{:+d}'.format(post_pos) + "_" + str(unique_seq[j][9][len(off_part):]) - uni_seq.append(t_name) - - - if tmp_count_reads != 0 and tmp_count_seq != 0: - sorted_uni_arms.append([mature_mirnas[i-1].split(" ")[0][1:], tmp_count_seq, tmp_count_reads]) - - - sorted_uni_arms = sorted(sorted_uni_arms, key=lambda x: x[1], reverse=True) - unique_seq = list(map(list, OrderedDict.fromkeys(map(tuple,uni_seq)))) - - LHE=[] - - l.acquire() - if con=="c": - LHE.extend(x[2] for x in unique_seq if x[2]!="*") - for x in unique_seq: - if x[2]!="*": - n_samples_mirna_names.append([x[2],x[9]]) - n_deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in unique_seq if x[2]!="*"]) - n_LHE_names.extend(LHE) - names.append(name) - data.append([con,name,unique_seq,sorted_uni_arms]) - - - if con=="t": - LHE.extend(x[2] for x in unique_seq if x[2]!="*") - for x in unique_seq: - if x[2]!="*": - n_samples_mirna_names.append([x[2],x[9]]) - n_deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in unique_seq if x[2]!="*"]) - n_LHE_names.extend(LHE) - names.append(name) - data.append([con,name,unique_seq,sorted_uni_arms]) - l.release() - -##################################################################################################################################################################################################################### -def deseq2_temp(samples_mirna_names,deseq,con,l): - - samples_mirna_names.sort(key=lambda x:[0]) - for i in range(len(deseq)): - for y in samples_mirna_names: - flag = 0 - for x in deseq[i]: - if y[0] == x[0]: - flag = 1 - break - - if flag == 0: - deseq[i].append([y[0], "0", y[1]]) - - [deseq[i].sort(key=lambda x: x[0]) for i, _ in enumerate(deseq)] - deseq_final = [[x[0],x[2]] for x in deseq[0]] - [deseq_final[z].append(deseq[i][j][1]) for z,_ in enumerate(deseq_final) for i, _ in enumerate(deseq) for j,_ in enumerate(deseq[i]) if deseq_final[z][0] == deseq[i][j][0]] - - l.acquire() - if con=="c": - q1.put(deseq_final) - - if con=="t": - q2.put(deseq_final) - l.release() - - -#################################################################################################################################################################################################################### - -def main_temp(LH2E, LH2E_names, LH8E, LH8E_names,flag,names_con,names_tre,filter_LH8E,filter_LH2E,raw_LH8E,raw_LH2E): - - LH8E_add_names = [x for x in LH2E_names if x not in LH8E_names] - LH2E_add_names = [x for x in LH8E_names if x not in LH2E_names] - - LH8E_add_names.sort() - LH2E_add_names.sort() - LH8E_add_names = list(LH8E_add_names for LH8E_add_names,_ in itertools.groupby(LH8E_add_names)) - LH2E_add_names = list(LH2E_add_names for LH2E_add_names,_ in itertools.groupby(LH2E_add_names)) - - LH2E.sort() - LH8E.sort() - LH2E = list(LH2E for LH2E,_ in itertools.groupby(LH2E)) - LH8E = list(LH8E for LH8E,_ in itertools.groupby(LH8E)) - - print("LHE_names") - print([len(LH8E_add_names),len(LH2E_add_names)]) - print([len(LH8E),len(LH2E)]) - - zeros=["0"]*(len(LH8E[0])-2) - [LH8E_add_names[i].extend(zeros) for i,_ in enumerate(LH8E_add_names)] - LH8E=LH8E+LH8E_add_names - - zeros=["0"]*(len(LH2E[0])-2) - [LH2E_add_names[i].extend(zeros) for i,_ in enumerate(LH2E_add_names)] - LH2E=LH2E+LH2E_add_names - - dupes=[] - final_LH2E =[] - - for num,_ in enumerate(LH2E): - - if LH2E[num][1] not in final_LH2E and LH2E[num][0] not in final_LH2E: - final_LH2E.append(LH2E[num][1]) - final_LH2E.append(LH2E[num][0]) - else: - dupes.append(LH2E[num][1]) - - - dupes=list(set(dupes)) - - dupes=[[x] for x in dupes] - - for x in LH2E: - for y in dupes: - if x[1]==y[0]: - fl=0 - if len(y)==1: - y.append(x[0]) - else: - for i in range(1,len(y)): - if y[i].split("_")[0]==x[0].split("_")[0]: - fl=1 - if len(x[0])<len(y[i]): - del y[i] - y.append(x[0]) - break - - if fl==0: - y.append((x[0])) - - for y in dupes: - if len(y)>2: - for i in range(len(y)-1,1,-1): - y[1]=y[1]+"/"+y[i] - del y[i] - - for x in LH2E: - for y in dupes: - if x[1]==y[0]: - x[0]=y[1] - - for x in LH8E: - for y in dupes: - if x[1]==y[0]: - x[0]=y[1] - - - LH2E.sort() - LH2E=list(LH2E for LH2E,_ in itertools.groupby(LH2E)) - - LH8E.sort() - LH8E=list(LH8E for LH8E,_ in itertools.groupby(LH8E)) - - LH8E_new=[] - LH2E_new=[] - - if int(args.per)!=-1: - percent=int(args.per)/100 - print(percent) - print(args.count) - - c_col_filter=round(percent*(len(LH2E[1])-2)) - t_col_filter=round(percent*(len(LH8E[1])-2)) - - for i, _ in enumerate(LH2E): - c_cols=0 - t_cols=0 - - c_cols=sum([1 for j in range(len(LH2E[i])-2) if int(LH2E[i][j+2])>=int(args.count)]) - t_cols=sum([1 for j in range(len(LH8E[i])-2) if int(LH8E[i][j+2])>=int(args.count)]) - - if c_cols>=c_col_filter or t_cols>=t_col_filter: - LH8E_new.append(LH8E[i]) - LH2E_new.append(LH2E[i]) - - filter_LH2E.extend(LH2E_new) - filter_LH8E.extend(LH8E_new) - raw_LH2E.extend(LH2E) - raw_LH8E.extend(LH8E) - -################################################################################################################################################################################################################## - -def write_main(raw_LH2E, raw_LH8E, fil_LH2E, fil_LH8E, names_con, names_tre, flag): - - if flag == 1 and int(args.per)!=-1: - fp = open('Counts/Filtered '+args.n2 +' Templated Counts', 'w') - fp.write("Name\t") - fp.write("Sequence") - for y in names_tre: - fp.write("\t"+y) - - for x in fil_LH8E: - fp.write("\n%s" % "\t".join(x)) - fp.close() - - fp = open('Counts/Filtered '+args.n1+' Templated Counts', 'w') - fp.write("Name\t") - fp.write("Sequence") - for y in names_con: - fp.write("\t"+y) - - for x in fil_LH2E: - fp.write("\n%s" % "\t".join(x)) - fp.close() - - - if flag == 2 and int(args.per)!=-1: - fp = open('Counts/Filtered '+args.n2+' Non-Templated Counts', 'w') - fp.write("Name\t") - fp.write("Sequence") - for y in names_tre: - fp.write("\t"+y) - - - for x in fil_LH8E: - fp.write("\n%s" % "\t".join(x)) - fp.close() - - fp = open('Counts/Filtered '+args.n1+' Non-Templated Counts', 'w') - fp.write("Name\t") - fp.write("Sequence") - for y in names_con: - fp.write("\t"+y) - - for x in fil_LH2E: - fp.write("\n%s" % "\t".join(x)) - fp.close() - - - if flag == 1: - fp = open('Counts/Raw '+args.n2+' Templated Counts', 'w') - fp.write("Name\t") - fp.write("Sequence") - for y in names_tre: - fp.write("\t"+y) - - for x in raw_LH8E: - fp.write("\n%s" % "\t".join(x)) - fp.close() - - fp = open('Counts/Raw '+args.n1+' Templated Counts', 'w') - fp.write("Name\t") - fp.write("Sequence") - for y in names_con: - fp.write("\t"+y) - - for x in raw_LH2E: - fp.write("\n%s" % "\t".join(x)) - fp.close() - - - if flag == 2: - fp = open('Counts/Raw '+args.n2+' Non-Templated Counts', 'w') - fp.write("Name\t") - fp.write("Sequence") - for y in names_tre: - fp.write("\t"+y) - - - for x in raw_LH8E: - fp.write("\n%s" % "\t".join(x)) - fp.close() - - fp = open('Counts/Raw '+args.n1+' Non-Templated Counts', 'w') - fp.write("Name\t") - fp.write("Sequence") - for y in names_con: - fp.write("\t"+y) - - for x in raw_LH2E: - fp.write("\n%s" % "\t".join(x)) - fp.close() - - -######################################################################################################################################### - -def ssamples(names,samp,folder,pro): - - for i in range(2,len(samp[0])): - - fp = open(folder+names[i-2]+'.txt','w') - fp.write("miRNA id"+"\t"+names[i-2]+"\n") - - for x in samp: - fp.write("%s" % "\t".join([x[0],x[i]])+"\n") - fp.close() - -################################################################################################################## - -def DB_write(con,name,unique_seq,sorted_uni_arms,f): - - if f==1: - # Write a txt file with all the information - if con=="c": - fp = open('split1/'+name, 'w') - - fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence")) - if con=="t": - fp = open('split2/'+name, 'w') - fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence")) - - - for i in range(len(sorted_uni_arms)): - temp = [] - for j in range(len(unique_seq)): - - if sorted_uni_arms[i][0] in unique_seq[j][2].split("_")[0]: - - temp.append(unique_seq[j]) - - temp = sorted(temp, key=lambda x: int(x[0].split('-')[1]), reverse=True) - fp.write("*********************************************************************************************************\n") - fp.write("%-8s\t%-22s\t%-25s\t%-30s\t%s\n" % ("|",str(sorted_uni_arms[i][0]),"Sequence count = "+str(sorted_uni_arms[i][1]),"Total reads = "+str(sorted_uni_arms[i][2]),"|")) - fp.write("*********************************************************************************************************\n\n") - [fp.write("%-8s\t%-40s\t%s\n" % (x[0].split("-")[1], x[2],x[9])) for x in temp] - fp.write("\n" + "\n") - fp.close() - - if f==2: - - if con=="c": - fp = open('split3/'+name, 'w') - fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence")) - if con=="t": - fp = open('split4/'+name, 'w') - fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence")) - - - for i in range(len(sorted_uni_arms)): - temp = [] - for j in range(len(unique_seq)): - if sorted_uni_arms[i][0]==unique_seq[j][2].split("__")[0]: - temp.append(unique_seq[j]) - if temp!=[]: - temp = sorted(temp, key=lambda x: int(x[0].split('-')[1]), reverse=True) - fp.write("*********************************************************************************************************\n") - fp.write("%-8s\t%-22s\t%-25s\t%-30s\t%s\n" % ("|",str(sorted_uni_arms[i][0]),"Sequence count = "+str(sorted_uni_arms[i][1]),"Total reads = "+str(sorted_uni_arms[i][2]),"|")) - fp.write("*********************************************************************************************************\n\n") - [fp.write("%-8s\t%-40s\t%s\n" % (x[0].split("-")[1], x[2],x[9])) for x in temp] - fp.write("\n" + "\n") - fp.close() - - -########################################################################################################################## - -def new_mat_seq(pre_unique_seq,mat_mirnas,l): - - unique_iso = [] - for x in pre_unique_seq: - if len(x[2].split("_"))==3: - for y in pre_unique_seq: - if x[2] in y[2] and int(x[0].split("-")[1])<int(y[0].split("-")[1]): - if any(y[2] in lst2 for lst2 in unique_iso)==False: - y[2]=">"+y[2] - unique_iso.append(y) - l.acquire() - for x in unique_iso: - mat_mirnas.append(x[2]) - mat_mirnas.append(x[9]) - l.release() - -######################################################################################################################### -def pie_non_temp(merge_LH2E,merge_non_LH2E,merge_LH8E,merge_non_LH8E,c_unmap,t_unmap,c_unmap_counts,t_unmap_counts): - - c_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_LH2E] - t_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_LH8E] - c_non_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_non_LH2E] - t_non_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_non_LH8E] - - c_templ = 0 - c_tem_counts = 0 - c_mature = 0 - c_mat_counts = 0 - t_templ = 0 - t_tem_counts = 0 - t_mature = 0 - t_mat_counts = 0 - - c_non = len(c_non_samples) - c_non_counts = sum(x[2] for x in c_non_samples) - t_non = len(t_non_samples) - t_non_counts = sum(x[2] for x in t_non_samples) - - c_unmap = c_unmap - c_non - t_unmap = c_unmap - t_non - - c_unmap_counts=c_unmap_counts - c_non_counts - t_unmap_counts=t_unmap_counts - t_non_counts - - - for x in c_samples: - - if "/" not in x[0]: - if "chr" in x[0].split("_")[-1]: - c_mature+=1 - c_mat_counts += x[2] - else: - c_templ+=1 - c_tem_counts += x[2] - else: - f=0 - for y in x[0].split("/"): - if "chr" in y.split("_")[-1]: - c_mature+=1 - c_mat_counts += x[2] - f=1 - break - if f==0: - c_templ+=1 - c_tem_counts += x[2] - - for x in t_samples: - - if "/" not in x[0]: - if "chr" in x[0].split("_")[-1]: - t_mature+=1 - t_mat_counts += x[2] - else: - t_templ+=1 - t_tem_counts += x[2] - else: - f=0 - for y in x[0].split("/"): - if "chr" in y.split("_")[-1]: - t_mature+=1 - t_mat_counts += x[2] - f=1 - break - if f==0: - t_templ+=1 - t_tem_counts += x[2] - - fig = plt.figure(figsize=(7,5)) - labels = 'miRNA RefSeq','Template', 'Unmapped','Non-template' - sizes = [c_mat_counts, c_tem_counts, c_unmap_counts,c_non_counts] - colors = ['gold', 'yellowgreen', 'lightcoral', 'lightskyblue'] - ax1 = plt.subplot2grid((1,2),(0,0)) - patches, texts, autotexts=plt.pie(sizes, labels=labels, colors=colors, startangle=140,autopct='%1.1f%%',radius=0.8) - [x.set_fontsize(8) for x in texts] - plt.title('Control Group (reads)',fontsize=12) - labels = 'miRNA RefSeq','Template', 'Unmapped','non-template' - sizes = [t_mat_counts, t_tem_counts, t_unmap_counts, t_non_counts] - colors = ['gold', 'yellowgreen', 'lightcoral', 'lightskyblue'] - ax2 = plt.subplot2grid((1,2),(0,1)) - patches, texts, autotexts=plt.pie(sizes, labels=labels, colors=colors, startangle=140,autopct='%1.1f%%',radius=0.8) - [x.set_fontsize(8) for x in texts] - plt.title('Treated Group (reads)', fontsize=12) - plt.savefig('pie_non.png',dpi=300) - -###################################################################################################################################################### - -def merging_names(LH2E_copy,new): - - dupes=[] - final_LH2E =[] - - for num in range(len(LH2E_copy)): - - if LH2E_copy[num][1] not in final_LH2E and LH2E_copy[num][0] not in final_LH2E: - final_LH2E.append(LH2E_copy[num][1]) - final_LH2E.append(LH2E_copy[num][0]) - else: - dupes.append(LH2E_copy[num][1]) - - dupes=list(set(dupes)) - - for i in range(len(dupes)): - dupes[i]=[dupes[i]] - - for x in LH2E_copy: - for y in dupes: - if x[1]==y[0]: - fl=0 - if len(y)==1: - y.append(x[0]) - else: - for i in range(1,len(y)): - if y[i].split("_")[0]==x[0].split("_")[0]: - fl=1 - if len(x[0])<len(y[i]): - del y[i] - y.append(x[0]) - break - - if fl==0: - y.append((x[0])) - - for y in dupes: - if len(y)>2: - for i in range(len(y)-1,1,-1): - y[1]=y[1]+"/"+y[i] - del y[i] - - - for x in LH2E_copy: - for y in dupes: - if x[1]==y[0]: - x[0]=y[1] - - LH2E_copy.sort() - LH2E_copy=list(LH2E_copy for LH2E_copy,_ in itertools.groupby(LH2E_copy)) - - new.extend(LH2E_copy) - - -###################################################################################################################################################### -def pie_temp(merge_LH2E,c_unmap,c_unmap_counts,merge_LH8E,t_unmap,t_unmap_counts): - - c_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_LH2E] - t_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_LH8E] - - c_templ = 0 - c_tem_counts = 0 - c_mature = 0 - c_mat_counts = 0 - t_templ = 0 - t_tem_counts = 0 - t_mature = 0 - t_mat_counts = 0 - - for x in c_samples: - - if "/" not in x[0]: - if "chr" in x[0].split("_")[-1]: - c_mature+=1 - c_mat_counts += x[2] - else: - c_templ+=1 - c_tem_counts += x[2] - else: - f=0 - for y in x[0].split("/"): - if "chr" in y.split("_")[-1]: - c_mature+=1 - c_mat_counts += x[2] - f=1 - break - if f==0: - c_templ+=1 - c_tem_counts += x[2] - - for x in t_samples: - - if "/" not in x[0]: - if "chr" in x[0].split("_")[-1]: - t_mature+=1 - t_mat_counts += x[2] - else: - t_templ+=1 - t_tem_counts += x[2] - else: - f=0 - for y in x[0].split("/"): - if "chr" in y.split("_")[-1]: - t_mature+=1 - t_mat_counts += x[2] - f=1 - break - if f==0: - t_templ+=1 - t_tem_counts += x[2] - - - fig = plt.figure() - labels = 'miRNA RefSeq','Template', 'Unmapped' - sizes = [c_mat_counts, c_tem_counts, c_unmap_counts] - colors = ['gold', 'yellowgreen', 'lightskyblue'] - explode = (0.2, 0.05, 0.1) - ax1 = plt.subplot2grid((1,2),(0,0)) - patches, texts, autotexts=plt.pie(sizes, labels=labels, colors=colors, startangle=140,autopct='%1.1f%%',radius=0.8) - [x.set_fontsize(8) for x in texts] - plt.title('Control group (reads)', fontsize=12) - labels = 'miRNA RefSeq','Template', 'Unmapped' - sizes = [t_mat_counts, t_tem_counts, t_unmap_counts] - colors = ['gold', 'yellowgreen', 'lightskyblue'] - explode = (0.2, 0.05, 0.1) - ax2 = plt.subplot2grid((1,2),(0,1)) - patches, texts, autotexts=plt.pie(sizes, labels=labels, colors=colors, startangle=140,autopct='%1.1f%%',radius=0.8) - [x.set_fontsize(8) for x in texts] - plt.title('Treated group (reads)',fontsize = 12) - plt.savefig('pie_tem.png',dpi=300) - -################################################################################################################################################################################################################### - -def make_spider(merge_LH2E,merge_LH8E): - - c_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_LH2E] - t_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_LH8E] - - c_5 = 0 - c_5_counts = 0 - c_3 = 0 - c_3_counts = 0 - c_both =0 - c_both_counts=0 - c_mature = 0 - c_mat_counts = 0 - c_exception=0 - c_exception_counts=0 - - - t_5 = 0 - t_5_counts = 0 - t_3 = 0 - t_3_counts = 0 - t_both = 0 - t_both_counts = 0 - t_mature = 0 - t_mat_counts = 0 - t_exception = 0 - t_exception_counts=0 - - for x in c_samples: - - if "/" not in x[0]: - if "chr" in x[0].split("_")[-1]: - c_mature+=1 - c_mat_counts += x[2] - elif 0 == int(x[0].split("_")[-1]): - c_5+=1 - c_5_counts += x[2] - elif 0 == int(x[0].split("_")[-2]): - c_3+=1 - c_3_counts += x[2] - else: - c_both+=1 - c_both_counts+=x[2] - - else: - f=0 - for y in x[0].split("/"): - if "chr" in y.split("_")[-1]: - c_mature+=1 - c_mat_counts += x[2] - f=1 - break - if f==0: - for y in x[0].split("/"): - c_exception+=1 - c_exception_counts += x[2] - - - for x in t_samples: - - if "/" not in x[0]: - if "chr" in x[0].split("_")[-1]: - t_mature+=1 - t_mat_counts += x[2] - elif 0 == int(x[0].split("_")[-1]): - t_5+=1 - t_5_counts += x[2] - elif 0 == int(x[0].split("_")[-2]): - t_3+=1 - t_3_counts += x[2] - else: - t_both+=1 - t_both_counts+=x[2] - - else: - f=0 - for y in x[0].split("/"): - if "chr" in y.split("_")[-1]: - t_mature+=1 - t_mat_counts += x[2] - f=1 - break - if f==0: - for y in x[0].split("/"): - t_exception+=1 - t_exception_counts += x[2] - - - c_all = c_5+c_3+c_both+c_mature+c_exception - c_all_counts = c_5_counts + c_3_counts + c_both_counts + c_mat_counts + c_exception_counts - - t_all = t_5+t_3+t_both+t_mature + t_exception - t_all_counts = t_5_counts + t_3_counts + t_both_counts + t_mat_counts + t_exception_counts - - c_5 = round(c_5/c_all*100,2) - c_3 = round(c_3/c_all*100,2) - c_both = round(c_both/c_all*100,2) - c_mature = round(c_mature/c_all*100,2) - c_exception = round(c_exception/c_all*100,2) - - c_5_counts = round(c_5_counts/c_all_counts*100,2) - c_3_counts = round(c_3_counts/c_all_counts*100,2) - c_both_counts = round(c_both_counts/c_all_counts*100,2) - c_mat_counts = round(c_mat_counts/c_all_counts*100,2) - c_exception_counts = round(c_exception_counts/c_all_counts*100,2) - - t_5 = round(t_5/t_all*100,2) - t_3 = round(t_3/t_all*100,2) - t_both = round(t_both/t_all*100,2) - t_mature = round(t_mature/t_all*100,2) - t_exception = round(t_exception/t_all*100,2) - - t_5_counts = round(t_5_counts/t_all_counts*100,2) - t_3_counts = round(t_3_counts/t_all_counts*100,2) - t_both_counts = round(t_both_counts/t_all_counts*100,2) - t_mat_counts = round(t_mat_counts/t_all_counts*100,2) - t_exception_counts = round(t_exception_counts/t_all_counts*100,2) - - radar_max = max(c_5, c_3, c_both,c_mature,c_exception,t_5,t_3,t_both,t_mature,t_exception) - radar_max_counts = max(c_5_counts,c_3_counts,c_both_counts,c_mat_counts,c_exception_counts,t_5_counts,t_3_counts,t_both_counts,t_mat_counts,t_exception_counts) - - df=pd.DataFrame({ - 'group':['Controls','Treated'], - """5' and 3' isomiRs""":[c_both,t_both], - """3' isomiRs""":[c_3,t_3], - 'miRNA RefSeq':[c_mature,t_mature], - """5' isomiRs""":[c_5,t_5], - 'Others*':[c_exception,t_exception]}) - - df1=pd.DataFrame({ - 'group':['Controls','Treated'], - """5' and 3' isomiRs""":[c_both_counts,t_both_counts], - """3' isomiRs""":[c_3_counts,t_3_counts], - 'miRNA RefSeq':[c_mat_counts,t_mat_counts], - """5' isomiRs""":[c_5_counts,t_5_counts], - 'Others*':[c_exception_counts,t_exception_counts]}) - - spider_last(df,radar_max,1) - spider_last(df1,radar_max_counts,2) - - -def spider_last(df,radar_max,flag): - # ------- PART 1: Create background - fig = plt.figure() - # number of variable - categories=list(df)[1:] - N = len(categories) - - # What will be the angle of each axis in the plot? (we divide the plot / number of variable) - angles = [n / float(N) * 2 * pi for n in range(N)] - angles += angles[:1] - - # Initialise the spider plot - ax = plt.subplot(111, polar=True) - - # If you want the first axis to be on top: - ax.set_theta_offset(pi/2) - ax.set_theta_direction(-1) - - # Draw one axe per variable + add labels labels yet - plt.xticks(angles[:-1], categories, fontsize=11) - - # Draw ylabels - radar_max=round(radar_max+radar_max*0.1) - mul=len(str(radar_max))-1 - maxi=int(math.ceil(radar_max / pow(10,mul))) * pow(10,mul) - sep = round(maxi/4) - plt.yticks([sep, 2*sep, 3*sep, 4*sep, 5*sep], [str(sep)+'%', str(2*sep)+'%', str(3*sep)+'%', str(4*sep)+'%', str(5*sep)+'%'], color="grey", size=10) - plt.ylim(0, maxi) - - # ------- PART 2: Add plots - - # Plot each individual = each line of the data - # I don't do a loop, because plotting more than 3 groups makes the chart unreadable - - # Ind1 - values=df.loc[0].drop('group').values.flatten().tolist() - values += values[:1] - ax.plot(angles, values,'-o', linewidth=1, linestyle='solid', label="Controls") - ax.fill(angles, values, 'b', alpha=0.1) - - # Ind2 - values=df.loc[1].drop('group').values.flatten().tolist() - values += values[:1] - ax.plot(angles, values, '-o' ,linewidth=1, linestyle='solid', label="Treated") - ax.fill(angles, values, 'r', alpha=0.1) - - # Add legend - if flag==1: - plt.legend(loc='upper right', bbox_to_anchor=(0.0, 0.1)) - plt.savefig('spider_non_red.png',dpi=300) - else: - plt.legend(loc='upper right', bbox_to_anchor=(0.0, 0.1)) - plt.savefig('spider_red.png',dpi=300) - - -############################################################################################################################################################################################################# -def hist_red(samples,flag): - lengths=[] - cat=[] - total_reads=0 - seq=[] - - if flag == "c": - title = "Length Distribution of Control group (Redudant reads)" - if flag == "t": - title = "Length Distribution of Treated group (Redudant reads)" - - for i in samples: - for x in i: - lengths.append(len(x[9])) - if x[1]=="0": - seq.append([x[9],x[0].split("-")[1],"Mapped"]) - cat.append("Mapped") - if x[1] == "4": - seq.append([x[9],x[0].split("-")[1],"Unmapped"]) - cat.append("Unmapped") - - uni_len=list(set(lengths)) - uni_len=[x for x in uni_len if x<=35] - low=min(lengths) - up=max(lengths) - seq.sort() - uni_seq=list(seq for seq,_ in itertools.groupby(seq)) - dim=up-low - - if dim>20: - s=5 - else: - s=8 - - total_reads+=sum([int(x[1]) for x in uni_seq]) - - map_reads=[] - unmap_reads=[] - length=[] - for y in uni_len: - map_temp=0 - unmap_temp=0 - for x in uni_seq: - if len(x[0])==y and x[2]=="Mapped": - map_temp+=int(x[1]) - if len(x[0])==y and x[2]=="Unmapped": - unmap_temp+=int(x[1]) - if y<=35: - length.append(y) - map_reads.append(round(map_temp/total_reads*100,2)) - unmap_reads.append(round(unmap_temp/total_reads*100,2)) - - ylim=max([sum(x) for x in zip(unmap_reads, map_reads)]) - ylim=ylim+ylim*20/100 - fig, ax = plt.subplots() - width=0.8 - ax.bar(length, unmap_reads, width, label='Unmapped') - h=ax.bar(length, map_reads, width, bottom=unmap_reads, label='Mapped') - plt.xticks(np.arange(length[0], length[-1]+1, 1)) - plt.xlabel('Length (nt)',fontsize=14) - plt.ylabel('Percentage',fontsize=14) - plt.title(title,fontsize=14) - ax.legend() - plt.ylim([0, ylim]) - ax.grid(axis='y',linewidth=0.2) - - if flag=='c': - plt.savefig('c_hist_red.png',dpi=300) - - if flag=='t': - plt.savefig('t_hist_red.png',dpi=300) - -################################################################################################################# - - -def logo_seq_red(merge, flag): - - if flag=="c": - titlos="Control group (Redundant)" - file_logo="c_logo.png" - file_bar="c_bar.png" - if flag=="t": - titlos="Treated group (Redundant)" - file_logo="t_logo.png" - file_bar="t_bar.png" - - c_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge] - - A=[0]*5 - C=[0]*5 - G=[0]*5 - T=[0]*5 - total_reads=0 - - for y in c_samples: - if "/" in y[0]: - length=[] - for x in y[0].split("/"): - length.append([len(x.split("_")[-1]),x.split("_")[-1],y[2]]) - - best=min(length) - total_reads+=best[2] - for i in range(5): - if i<len(best[1]): - if best[1][i] == "A": - A[i]+=best[2] - elif best[1][i] == "C": - C[i]+=best[2] - elif best[1][i] == "G": - G[i]+=best[2] - else: - T[i]+=best[2] - else: - total_reads+=y[2] - for i in range(5): - if i<len(y[0].split("_")[-1]): - if y[0].split("_")[-1][i] == "A": - A[i]+=(y[2]) - elif y[0].split("_")[-1][i] == "C": - C[i]+=(y[2]) - elif y[0].split("_")[-1][i] == "G": - G[i]+=(y[2]) - else: - T[i]+=y[2] - - A[:] = [round(x*100,1) / total_reads for x in A] - C[:] = [round(x*100,1) / total_reads for x in C] - G[:] = [round(x*100,1) / total_reads for x in G] - T[:] = [round(x*100,1) / total_reads for x in T] - - - - data = {'A':A,'C':C,'G':G,'T':T} - df = pd.DataFrame(data, index=[1,2,3,4,5]) - h=df.plot.bar(color=tuple(["g", "b","gold","r"]) ) - h.grid(axis='y',linewidth=0.2) - plt.xticks(rotation=0, ha="right") - plt.ylabel("Counts (%)",fontsize=18) - plt.xlabel("Positions (nt)",fontsize=18) - plt.title(titlos,fontsize=20) - plt.tight_layout() - plt.savefig(file_bar, dpi=300) - - import logomaker as lm - crp_logo = lm.Logo(df, font_name = 'DejaVu Sans') - crp_logo.style_spines(visible=False) - crp_logo.style_spines(spines=['left', 'bottom'], visible=True) - crp_logo.style_xticks(rotation=0, fmt='%d', anchor=0) - - # style using Axes methods - crp_logo.ax.set_title(titlos,fontsize=18) - crp_logo.ax.set_ylabel("Counts (%)", fontsize=16,labelpad=5) - crp_logo.ax.set_xlabel("Positions (nt)",fontsize=16, labelpad=5) - crp_logo.ax.xaxis.set_ticks_position('none') - crp_logo.ax.xaxis.set_tick_params(pad=-1) - figure = plt.gcf() - figure.set_size_inches(6, 4) - crp_logo.fig.savefig(file_logo,dpi=300) - -########################################################################################################################################################################################################## - - - -def logo_seq_non_red(merge_LH2E): - - c_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_LH2E] - - A=[0]*5 - C=[0]*5 - G=[0]*5 - T=[0]*5 - - for y in c_samples: - if "/" in y[0]: - length=[] - for x in y[0].split("/"): - length.append([len(x.split("_")[-1]),x.split("_")[-1],y[2]]) - - best=min(length) - for i in range(5): - if i<len(best[1]): - if best[1][i] == "A": - A[i]+=1 - elif best[1][i] == "C": - C[i]+=1 - elif best[1][i] == "G": - G[i]+=1 - else: - T[i]+=1 - else: - for i in range(5): - if i<len(y[0].split("_")[-1]): - if y[0].split("_")[-1][i] == "A": - A[i]+=1 - elif y[0].split("_")[-1][i] == "C": - C[i]+=1 - elif y[0].split("_")[-1][i] == "G": - G[i]+=1 - else: - T[i]+=1 - - data = {'A':A,'C':C,'G':G,'T':T} - df = pd.DataFrame(data, index=[1,2,3,4,5]) - h=df.plot.bar(title="Non-templated nucleotides after templated sequence",color=tuple(["g", "b","gold","r"])) - h.set_xlabel("Positions (nt)") - h.set_ylabel("Unique sequences") - plt.xticks(rotation=0, ha="right") - plt.tight_layout() - plt.savefig("bar2.png", dpi=300) - - - import logomaker as lm - crp_logo = lm.Logo(df, font_name = 'DejaVu Sans') - - # style using Logo methods - crp_logo.style_spines(visible=False) - crp_logo.style_spines(spines=['left', 'bottom'], visible=True) - crp_logo.style_xticks(rotation=0, fmt='%d', anchor=0) - - # style using Axes methods - crp_logo.ax.set_ylabel("Unique sequences", labelpad=5) - crp_logo.ax.set_xlabel("Positions (nt)", labelpad=5) - crp_logo.ax.xaxis.set_ticks_position('none') - crp_logo.ax.xaxis.set_tick_params(pad=-1) - crp_logo.ax.set_title("Non-redundant") - figure = plt.gcf() - crp_logo.fig.savefig('logo2.png', dpi=300) - - -################################################################################################################################################################################################################### - -def ssamples1(tem_names,tem_samp,non_names,non_samp,folder,pro): - - for i in range(2,len(tem_samp[0])): - - fp = open(folder+tem_names[i-2]+'.txt','w') - fp.write("miRNA id"+"\t"+tem_names[i-2]+"\n") - - for x in tem_samp: - fp.write("%s" % "\t".join([x[0],x[i]])+"\n") - - for j in range(len(non_names)): - if non_names[j]==tem_names[i-2]: - for x in non_samp: - fp.write("%s" % "\t".join([x[0],x[j+2]])+"\n") - fp.close() - -################################################################################################################################################################################################################### - -def download_matures(matures,org_name): - - #url = 'ftp://mirbase.org/pub/mirbase/21/mature.fa.gz' - url = 'ftp://mirbase.org/pub/mirbase/CURRENT/mature.fa.gz' - data = urllib.request.urlopen(url).read() - file_mirna = gzip.decompress(data).decode('utf-8') - file_mirna = file_mirna.split("\n") - - for i in range(0,len(file_mirna)-1,2): - - if org_name in file_mirna[i]: - matures.append(file_mirna[i]) - matures.append(file_mirna[i+1]) - -################################################################################################################################################################################################################### -def non_template_ref(sc,st,all_isoforms): - - pre_uni_seq_con = list(sc) - pre_uni_seq_tre = list(st) - - for x in pre_uni_seq_con: - for y in x: - if ">"+y[2] not in all_isoforms and ")_" in y[2] : - all_isoforms.append(">"+y[2]) - all_isoforms.append(y[9]) - - - for x in pre_uni_seq_tre: - for y in x: - if ">"+y[2] not in all_isoforms and ")_" in y[2]: - all_isoforms.append(">"+y[2]) - all_isoforms.append(y[9]) - -################################################################################################################################################################################################ - -def deseqe2(sample,mir_names,l,new_d,sample_name,sample_order): - - for y in mir_names: - flag=0 - for x in sample: - if y[0]==x[0]: - flag=1 - break - if flag==0: - sample.append([y[0],"0",y[1]]) - - sample.sort(key=lambda x: x[0]) - sample=list(sample for sample,_ in itertools.groupby(sample)) - - l.acquire() - new_d.append(sample) - sample_order.append(sample_name) - l.release() - -############################################################################################################################################################################################### - -if __name__ == '__main__': - - starttime = time.time() - - q1 = Queue() - q2 = Queue() - lock = Lock() - manager = Manager() - - mature_mirnas=manager.list() - ps_mature=Process(target=download_matures,args=(mature_mirnas,args.org_name)) - ps_mature.start() - - args.control[0]=args.control[0][1:] - args.control[len(args.control)-1][:-1] - control = [(args.control[i:i+2]) for i in range(0, len(args.control), 2)] - - args.treated[0]=args.treated[0][1:] - args.treated[len(args.treated)-1][:-1] - treated = [(args.treated[i:i+2]) for i in range(0, len(args.treated), 2)] - - -############## Detection of templated isoforms ################ - - radar = manager.list([0,0,0,0]) - samples = manager.list() - data= manager.list() - names_con=manager.list() - samples_mirna_names=manager.list() - deseq=manager.list() - unmap_seq=manager.Value('i',0) - unmap_counts=manager.Value('i',0) - LH2E_names=manager.list() - ini_c_samples = manager.list() - - - radar1 = manager.list([0,0,0,0]) - samples1 = manager.list() - data1 = manager.list() - names_tre = manager.list() - samples_mirna_names1=manager.list() - deseq1=manager.list() - unmap1_seq = manager.Value('i',0) - unmap1_counts = manager.Value('i',0) - LH8E_names=manager.list() - ini_t_samples = manager.list() - ps_mature.join() - - - mature_mirnas=list(mature_mirnas) - - - starttime1 = time.time() - ps_sam = [Process(target=sam,args=(mature_mirnas,path[1][:-1],path[0].split(",")[0],"c",lock,samples,data,names_con,unmap_seq,samples_mirna_names,deseq,LH2E_names,"0",ini_c_samples,unmap_counts)) for path in control] - ps_sam.extend([Process(target=sam,args=(mature_mirnas,path[1][:-1],path[0].split(",")[0],"t",lock,samples1,data1,names_tre,unmap1_seq,samples_mirna_names1,deseq1,LH8E_names,"0",ini_t_samples,unmap1_counts)) for path in treated]) - - [p.start() for p in ps_sam] - [p.join() for p in ps_sam] - print('SAM took {} seconds'.format(time.time() - starttime1)) - - ps_hist=[Process(target=hist_red,args=(ini_c_samples,'c'))] - ps_hist.extend([Process(target=hist_red,args=(ini_t_samples,'t'))]) - [x.start() for x in ps_hist] - - starttime200=time.time() - - sc = list(samples) - st = list(samples1) - - names_con=list(names_con) - names_tre=list(names_tre) - samples_mirna_names=list(samples_mirna_names) - samples_mirna_names.sort() - samples_mirna_names=list(samples_mirna_names for samples_mirna_names,_ in itertools.groupby(samples_mirna_names)) - - samples_mirna_names1=list(samples_mirna_names1) - samples_mirna_names1.sort() - samples_mirna_names1=list(samples_mirna_names1 for samples_mirna_names1,_ in itertools.groupby(samples_mirna_names1)) - - deseq=list(deseq) - deseq1=list(deseq1) - - new_names_con=manager.list() - new_names_tre=manager.list() - new_deseq=manager.list() - new_deseq1=manager.list() - ps_deseq=[Process(target=deseqe2,args=(sampp,samples_mirna_names,lock,new_deseq,names_con[i],new_names_con)) for i,sampp in enumerate(deseq)] - ps_deseq.extend([Process(target=deseqe2,args=(sampp,samples_mirna_names1,lock,new_deseq1,names_tre[i],new_names_tre)) for i,sampp in enumerate(deseq1)]) - - [z.start() for z in ps_deseq] - [z.join() for z in ps_deseq] - new_deseq=list(new_deseq) - new_deseq1=list(new_deseq1) - - LH2E=[[x[0],x[2]] for x in new_deseq[0]] - [LH2E[i].append(y[i][1]) for i,_ in enumerate(LH2E) for y in new_deseq] - - LH8E=[[x[0],x[2]] for x in new_deseq1[0]] - [LH8E[i].append(y[i][1]) for i,_ in enumerate(LH8E) for y in new_deseq1] - - print('Deseq took {} seconds'.format(time.time() - starttime200)) - - merg_nam_LH2E=manager.list() - merg_nam_LH8E=manager.list() - - LH2E_copy=copy.deepcopy(list(LH2E)) - LH8E_copy=copy.deepcopy(list(LH8E)) - - fil_sort_tre=manager.list() - fil_sort_con=manager.list() - raw_sort_tre=manager.list() - raw_sort_con=manager.list() - - ps_main = Process(target=main_temp,args=(list(LH2E), samples_mirna_names, list(LH8E), samples_mirna_names1,1,list(names_con),list(names_tre),fil_sort_tre,fil_sort_con,raw_sort_tre,raw_sort_con)) - ps_main.start() - - if args.anal=="2": - all_iso = manager.list() - ps_non_iso = Process(target=non_template_ref,args=(sc,st,all_iso)) - ps_non_iso.start() - - ps_merge = [Process(target=merging_names,args=(LH2E_copy,merg_nam_LH2E))] - ps_merge.extend([Process(target=merging_names,args=(LH8E_copy,merg_nam_LH8E))]) - [x.start() for x in ps_merge] - [x.join() for x in ps_merge] - - merg_nam_LH2E=list(merg_nam_LH2E) - merg_nam_LH8E=list(merg_nam_LH8E) - - starttime2 = time.time() - procs = [Process(target=DB_write,args=(x[0],x[1],x[2],x[3],1)) for x in data] - procs.extend([Process(target=DB_write,args=(x[0],x[1],x[2],x[3],1)) for x in data1]) - procs.extend([Process(target=make_spider,args=(merg_nam_LH2E,merg_nam_LH8E))]) - if args.anal == "1": - procs.extend([Process(target=pie_temp,args=(merg_nam_LH2E,unmap_seq.value,unmap_counts.value,merg_nam_LH8E,unmap1_seq.value,unmap1_counts.value))]) - - [p.start() for p in procs] - - - if args.anal=="1": - [x.join() for x in ps_hist] - [p.join() for p in procs] - ps_pdf = Process(target=pdf_before_DE,args=(args.anal)) - ps_pdf.start() - - print('Graphs took {} seconds'.format(time.time() - starttime2)) - - ps_main.join() - - fil_sort_con=list(fil_sort_con) - fil_sort_tre=list(fil_sort_tre) - if fil_sort_con==[]: - fil_sort_con=raw_sort_con - fil_sort_tre=raw_sort_tre - - raw_sort_con=list(raw_sort_con) - raw_sort_tre=list(raw_sort_tre) - names_con=list(new_names_con) - names_tre=list(new_names_tre) - - ps_write = Process(target=write_main,args=(raw_sort_con, raw_sort_tre, fil_sort_con, fil_sort_tre, names_con,names_tre,1)) - ps_write.start() - - ps1_matrix = [Process(target=ssamples,args=(names_con,fil_sort_con,"Diff/temp_con/",0))] - ps1_matrix.extend([Process(target=ssamples,args=(names_tre,fil_sort_tre,"Diff/temp_tre/",0))]) - [p.start() for p in ps1_matrix] - - if args.anal=="1": - ps_pdf.join() - if args.anal=="2": - [p.join() for p in procs] - [x.join() for x in ps_hist] - - ps_write.join() - [p.join() for p in ps1_matrix] - - -############################## Detection of Both ####################################### - - starttime10 = time.time() - - if args.anal == "2": - - n_data= manager.list() - n_names_con=manager.list() - n_samples_mirna_names=manager.list() - n_deseq=manager.list() - n_LH2E_names=manager.list() - - n_data1 = manager.list() - n_names_tre = manager.list() - n_samples_mirna_names1=manager.list() - n_deseq1=manager.list() - n_LH8E_names=manager.list() - - - new_mat_mirnas = list(mature_mirnas) - ps_non_iso.join() - - all_iso=list(all_iso) - new_mat_mirnas.extend(all_iso) - - starttime11=time.time() - - ps_sam = [Process(target=non_sam,args=(new_mat_mirnas,path[1][:-1],path[0].split(",")[0],"c",lock,n_data,n_names_con,n_deseq,n_samples_mirna_names,n_LH2E_names)) for path in control] - ps_sam.extend([Process(target=non_sam,args=(new_mat_mirnas,path[1][:-1],path[0].split(",")[0],"t",lock,n_data1,n_names_tre,n_deseq1,n_samples_mirna_names1,n_LH8E_names)) for path in treated]) - - [p.start() for p in ps_sam] - [p.join() for p in ps_sam] - - print('Non-sam took {} seconds'.format(time.time() - starttime11)) - - starttime12=time.time() - - n_names_con=list(n_names_con) - n_names_tre=list(n_names_tre) - n_samples_mirna_names=list(n_samples_mirna_names) - n_samples_mirna_names.sort() - n_samples_mirna_names=list(n_samples_mirna_names for n_samples_mirna_names,_ in itertools.groupby(n_samples_mirna_names)) - - n_samples_mirna_names1=list(n_samples_mirna_names1) - n_samples_mirna_names1.sort() - n_samples_mirna_names1=list(n_samples_mirna_names1 for n_samples_mirna_names1,_ in itertools.groupby(n_samples_mirna_names1)) - - n_deseq=list(n_deseq) - n_deseq1=list(n_deseq1) - - new_n_names_con=manager.list() - new_n_names_tre=manager.list() - n_new_deseq=manager.list() - n_new_deseq1=manager.list() - ps_deseq=[Process(target=deseqe2,args=(sampp,n_samples_mirna_names,lock,n_new_deseq,n_names_con[i],new_n_names_con)) for i,sampp in enumerate(n_deseq)] - ps_deseq.extend([Process(target=deseqe2,args=(sampp,n_samples_mirna_names1,lock,n_new_deseq1,n_names_tre[i],new_n_names_tre)) for i,sampp in enumerate(n_deseq1)]) - - [x.start() for x in ps_deseq] - [x.join() for x in ps_deseq] - n_new_deseq=list(n_new_deseq) - n_new_deseq1=list(n_new_deseq1) - - print([len(n_new_deseq[0]),len(n_new_deseq[1])]) - print([len(n_new_deseq1[0]),len(n_new_deseq1[1])]) - - n_LH2E=[[x[0],x[2]] for x in n_new_deseq[0]] - [n_LH2E[i].append(y[i][1]) for i,_ in enumerate(n_LH2E) for y in n_new_deseq] - - n_LH8E=[[x[0],x[2]] for x in n_new_deseq1[0]] - [n_LH8E[i].append(y[i][1]) for i,_ in enumerate(n_LH8E) for y in n_new_deseq1] - - print('Non-deseq took {} seconds'.format(time.time() - starttime12)) - - merg_nam_n_LH2E=manager.list() - merg_nam_n_LH8E=manager.list() - - n_LH2E_copy=copy.deepcopy(list(n_LH2E)) - n_LH8E_copy=copy.deepcopy(list(n_LH8E)) - - n_sort_tre=manager.list() - n_sort_con=manager.list() - - n_fil_sort_con=manager.list() - n_fil_sort_tre=manager.list() - n_raw_sort_con=manager.list() - n_raw_sort_tre=manager.list() - - ps_main = Process(target=main_temp,args=(list(n_LH2E), n_samples_mirna_names, list(n_LH8E), n_samples_mirna_names1,1,list(n_names_con),list(n_names_tre),n_fil_sort_tre,n_fil_sort_con,n_raw_sort_tre,n_raw_sort_con)) - ps_main.start() - - ps_merge = [Process(target=merging_names,args=(n_LH2E_copy,merg_nam_n_LH2E))] - ps_merge.extend([Process(target=merging_names,args=(n_LH8E_copy,merg_nam_n_LH8E))]) - [p.start() for p in ps_merge] - [p.join() for p in ps_merge] - - merg_nam_n_LH2E=list(merg_nam_n_LH2E) - merg_nam_n_LH8E=list(merg_nam_n_LH8E) - - procs = [Process(target=DB_write,args=(x[0],x[1],x[2],x[3],2)) for x in n_data] - procs.extend([Process(target=DB_write,args=(x[0],x[1],x[2],x[3],2)) for x in n_data1]) - procs.extend([Process(target=logo_seq_red,args=(merg_nam_n_LH2E,'c'))]) - procs.extend([Process(target=logo_seq_red,args=(merg_nam_n_LH8E,'t'))]) - procs.extend([Process(target=pie_non_temp,args=(merg_nam_LH2E,merg_nam_n_LH2E,merg_nam_LH8E,merg_nam_n_LH8E,unmap_seq.value,unmap1_seq.value,unmap_counts.value,unmap1_counts.value))]) - - starttime13=time.time() - [p.start() for p in procs] - [p.join() for p in procs] - - print('Graphs took {} seconds'.format(time.time() - starttime13)) - - procs1 = Process(target=pdf_before_DE,args=(args.anal)) - procs1.start() - - starttime14=time.time() - ps_main.join() - - n_fil_sort_con=list(n_fil_sort_con) - n_fil_sort_tre=list(n_fil_sort_tre) - if n_fil_sort_con==[]: - n_fil_sort_con=n_raw_sort_con - n_fil_sort_tre=n_raw_sort_tre - - n_raw_sort_con=list(n_raw_sort_con) - n_raw_sort_tre=list(n_raw_sort_tre) - n_names_con=list(new_n_names_con) - n_names_tre=list(new_n_names_tre) - - ps_write = Process(target=write_main,args=(n_raw_sort_con, n_raw_sort_tre,n_fil_sort_con, n_fil_sort_tre, n_names_con, n_names_tre,2)) - ps_write.start() - - ps1_matrix = [Process(target=ssamples1,args=(n_names_con,n_fil_sort_con,names_con,fil_sort_con,"Diff/n_temp_con/",0))] - ps1_matrix.extend([Process(target=ssamples1,args=(n_names_tre,n_fil_sort_tre,names_tre,fil_sort_tre,"Diff/n_temp_tre/",0))]) - [p.start() for p in ps1_matrix] - - ps_write.join() - [p.join() for p in ps1_matrix] - procs1.join() - print('That took {} seconds'.format(time.time() - starttime10)) - print('That took {} seconds'.format(time.time() - starttime)) - - - - - - - -
--- a/toolExample.xml Fri Oct 16 12:17:22 2020 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,199 +0,0 @@ - -<tool id="fa_gc_content_1" name="IsoRead: miR and isomiR identification and classification" version="0.1.0"> - <description>for each sequence in a file</description> - <requirements> - <requirement type="package" version="1.7">fpdf</requirement> - <requirement type="package" version="0.8">logomaker</requirement> - <requirement type="package" version="0.6.0">plotnine</requirement> - <requirement type="package" version="3.7.4">python</requirement> - <requirement type="package" version="1.17.3">numpy</requirement> - <requirement type="package" version="3.1.2">matplotlib</requirement> - <requirement type="package" version="0.9.0">seaborn</requirement> - <requirement type="package" version="1.0.3">pandas</requirement> - </requirements> - <command> - #set controls=[] - #for $input in $control# - $controls.extend([str($input.element_identifier),str($input)]) - #end for# - #set treateds=[] - #for $input in $treated# - $treateds.extend([str($input.element_identifier),str($input)]) - #end for# - #if $mir_input.database == "1": - #if $f.fil == "1": - #set path=$mir_input.genome1.fields.path - python -W ignore $__tool_directory__/mirbase_ultra_v2.py -con $controls -tre $treateds -analysis $analysis -tool_dir $__tool_directory__ -gen "$path" -f "$mir_input.database" -umis $umis -percentage "-1" -counts "-1" -name1 "$fal1" -name2 "$fal2" - #end if - #if $f.fil == "2": - #set path=$mir_input.genome1.fields.path - python -W ignore $__tool_directory__/mirbase_ultra_v2.py -con $controls -tre $treateds -analysis $analysis -tool_dir $__tool_directory__ -gen "$path" -f "$mir_input.database" -umis $umis -percentage "$f.fil1" -counts "$f.fil2" -name1 "$fal1" -name2 "$fal2" - #end if - #else: - #if $f.fil == "1": - #set path=$mir_input.genome2.fields.value - python -W ignore $__tool_directory__/mirgene_ultra_v2.py -con $controls -tre $treateds -analysis $analysis -tool_dir $__tool_directory__ -gen "$path" -f "$mir_input.database" -umis $umis -percentage "-1" -counts "-1" -name1 "$fal1" -name2 "$fal2" - #end if - #if $f.fil == "2": - #set path=$mir_input.genome2.fields.value - python -W ignore $__tool_directory__/mirgene_ultra_v2.py -con $controls -tre $treateds -analysis $analysis -tool_dir $__tool_directory__ -gen "$path" -f "$mir_input.database" -umis $umis -percentage "$f.fil1" -counts "$f.fil2" -name1 "$fal1" -name2 "$fal2" - #end if - #end if - - <!-- - #if $mir_input.database == "1" and $f.filter == "2": - #set path=$mir_input.genome1.fields.path - python -W ignore $__tool_directory__/mirbase_ultra_v1.py -con $controls -tre $treateds -analysis $analysis -tool_dir $__tool_directory__ -gen "$path" -f "$mir_input.database" -umis $umis -percentage "$filters.fil1" -counts "$filter.fil2 -name1 $fal1 -name2 $fal2" - #else: - #set path=$mir_input.genome2.fields.value - python -W ignore $__tool_directory__/main_galaxy_mirgene.py -con $controls -tre $treateds -analysis $analysis -tool_dir $__tool_directory__ -gen "$path" -program $program -f "$mir_input.database" -percentage "$filters.fil1" -counts "$filters.fil2" -> - #end if - --> - </command> - <inputs> - <param name="analysis" type="select" label="Discover miR with templated or/and non-templated isomiRs" help="Choose the category of miRNAs for detection"> - <option value="1" selected="true">Detection of only templated miRNAs</option> - <option value="2">Detection of templated and non-templated miRNAs</option> - </param> - <!--param name="program" type="select" label="Choose differential expression/statistics module" help="Choose the category of miRNAs for detection"> - <option value="0" selected="true">Deseq2</option> - <option value="1">EdgeR</option> - <option value="2">Deseq2 and EdgeR</option> - </param--> - - <conditional name="mir_input"> - <param name="database" type="select" label="Choose Database of miRNAs organisms" help="Choose which database prefer to be used."> - <option value="1" selected="true">MirBase</option> - <option value="2">MirGene</option> - </param> - <when value="1"> - <param name="genome1" type="select" label="Reference miRNAs (organism)" help="If your genome coordinates of interest is not listed, contact the Galaxy team"> - <options from_data_table="n_spiecies" /> - </param> - </when> - <when value="2"> - <param name="genome2" type="select" label="Reference miRNAs (organism)" help="If your genome coordinates of interest is not listed, contact the Galaxy team"> - <options from_data_table="mirgene" /> - </param> - </when> - </conditional> - - - <param name="fal1" type="text" value="FactorLevel" label="Specify a factor level, typical values could be 'tumor', 'normal', 'treated' or 'control'"/> - <param name="control" format="sam" type="data" multiple="True" label="Select BAM files of the factor level samples" /> - <param name="fal2" type="text" value="FactorLevel" label="Specify a factor level, typical values could be 'tumor', 'normal', 'treated' or 'control'"/> - <param name="treated" format="sam" type="data" multiple="True" label="Select BAM files of the factor level samples" /> - - <conditional name="f"> - <param name="fil" type="select" label="Filter low counts" help="Treat genes with very low expression as unexpressed and filter out"> - <option value="1" selected="true">No</option> - <option value="2">Yes</option> - </param> - <when value="2"> - <param name="fil1" type="integer" value="0" label="Minimum percentage of the samples" help="Filter out all genes that do not meet the Minimum counts in at least this many samples of every category"/> - <param name="fil2" type="integer" value="0" label="Minimum counts" help="Filter out all genes that do not meet this minimum count"/> - </when> - <when value="1"> - </when> - </conditional> - - <param name="db" type="boolean" checked="true" truevalue="1" falsevalue="0" label="Output Database files" /> - <param name="cmatrix" type="boolean" checked="false" truevalue="1" falsevalue="0" label="Output Matrix files, one for each factor level" /> - <param name="c_files" type="boolean" checked="true" truevalue="1" falsevalue="0" label="Output Count tables, one for each sample" /> - <param name="umis" type="boolean" checked="false" truevalue="1" falsevalue="0" label="Collapsing sam files if samples include UMIs" /> - </inputs> - <outputs> - <collection name="list_output1" type="list" label="Database ${fal1} templated" > - <discover_datasets pattern="__name__" format="tabular" directory="split1" /> - <filter>db == 1 and (analysis == "1" or analysis == "2")</filter> - </collection> - <collection name="list_output2" type="list" label="Database ${fal2} templated" > - <discover_datasets pattern="__name__" format="tabular" directory="split2" /> - <filter>db == 1 and (analysis == "1" or analysis == "2")</filter> - </collection> - <collection name="list_output3" type="list" label="Database ${fal1} non-templated" > - <discover_datasets pattern="__name__" format="tabular" directory="split3" /> - <filter>db == 1 and analysis == "2"</filter> - </collection> - <collection name="list_output4" type="list" label="Database ${fal2} non-templated" > - <discover_datasets pattern="__name__" format="tabular" directory="split4" /> - <filter>db == 1 and analysis == "2"</filter> - </collection> - - <!-- - <data name="Results non templated control" format="tabular" label="${fal1} matrix counts of non templated results" from_work_dir="$__tool_directory__/Deseq_final_non_LH2E"> - <filter>cmatrix==1 and analysis == "2"</filter> - </data> - <data name="Results non templated treated" format="tabular" label="${fal2} matrix counts of non templated results" from_work_dir="$__tool_directory__/Deseq_final_non_LH8E"> - <filter>cmatrix==1 and analysis == "2"</filter> - </data> - <data name="Results templated control" format="tabular" label="${fal1} matrix counts of templated results" from_work_dir="$__tool_directory__/Deseq_final_temp_LH2E"> - <filter>cmatrix==1</filter> - </data> - <data name="Results templated treated" format="tabular" label="${fal2} matrix counts of templated results" from_work_dir="$__tool_directory__/Deseq_final_temp_LH8E"> - <filter>cmatrix==1</filter> - </data> - --> - <collection name="Counts" type="list" label="Count Matrices" > - <discover_datasets pattern="__name__" format="tabular" directory="Counts" /> - <filter>cmatrix==1</filter> - </collection> - - - <collection name="list_output9" type="list" label="Count files ${fal1} for Differential Expression" > - <discover_datasets pattern="__name__" format="tabular" directory="Diff/temp_con" /> - <filter>c_files==1 and (analysis == "1")</filter> - </collection> - <collection name="list_output10" type="list" label="Count files ${fal2} for Differential Expression" > - <discover_datasets pattern="__name__" format="tabular" directory="Diff/temp_tre" /> - <filter>c_files==1 and (analysis == "1")</filter> - </collection> - <collection name="list_output11" type="list" label="Count files ${fal1} for Differential Expression" > - <discover_datasets pattern="__name__" format="tabular" directory="Diff/n_temp_con" /> - <filter>c_files==1 and analysis == "2"</filter> - </collection> - <collection name="list_output12" type="list" label="Count files ${fal2} for Differential Expression" > - <discover_datasets pattern="__name__" format="tabular" directory="Diff/n_temp_tre" /> - <filter>c_files==1 and analysis == "2"</filter> - </collection> - - - <!-- - <collection name="list_output13" type="list" label="EdgeR count files ${fal1} templated" > - <discover_datasets pattern="__name__" format="tabular" directory="Edger/split5" /> - <filter>c_files==1 and (program=="1" or program=="2" ) and (analysis == "1")</filter> - </collection> - <collection name="list_output14" type="list" label="EdgeR count files ${fal2} templated" > - <discover_datasets pattern="__name__" format="tabular" directory="Edger/split6" /> - <filter>c_files==1 and (program=="1" or program=="2") and (analysis == "1")</filter> - </collection> - <collection name="list_output15" type="list" label="EdgeR count files ${fal1}" > - <discover_datasets pattern="__name__" format="tabular" directory="Edger/split7" /> - <filter>c_files==1 and (program=="1" or program=="2") and analysis == "2"</filter> - </collection> - <collection name="list_output16" type="list" label="EdgeR count files ${fal2}" > - <discover_datasets pattern="__name__" format="tabular" directory="Edger/split8" /> - <filter>c_files==1 and (program=="1" or program=="2") and analysis == "2"</filter> - </collection> - --> - - <!--data name="O" format="png" label="c_hist" from_work_dir="$__tool_direcory__/c_hist_red.png"/--> - <!--data name="O1" format="png" label="t_hist" from_work_dir="$__tool_direcory__/t_hist_red.png"/--> - <!--data name="OOOOOOOOO" format="png" label="c_non-hist" from_work_dir="$__tool_direcory__/c_hist_non_red.png"/--> - <!--data name="SSSSSSSSS" format="png" label="t_non-hist" from_work_dir="$__tool_direcory__/t_hist_non_red.png"/--> - <!--data name="OOOOOOOOO111" format="png" label="non-pie" from_work_dir="$__tool_direcory__/pie2.png"/--> - <!--data name="SSSSSSSSS1" format="png" label="pie" from_work_dir="$__tool_direcory__/pie1.png"/--> - <!--data name="OOOOO" format="png" label="spider1" from_work_dir="$__tool_direcory__/spider1.png"/--> - <!--data name="OOOOOO" format="png" label="spider2" from_work_dir="$__tool_direcory__/spider2.png"/--> - <!--data name="OOo" format="png" label="WWW" from_work_dir="$__tool_direcory__/bar2.png"/--> - <!--data name="OOOOOOOOO1" format="png" label="WWWWWWW" from_work_dir="$__tool_direcory__/bar1.png"/--> - <!--data name="OO" format="png" label="logo1" from_work_dir="$__tool_direcory__/logo1.png"/--> - <!--data name="O5" format="png" label="logo2" from_work_dir="$__tool_direcory__/logo2.png"/--> - <data name="Results non templated treated1" format="pdf" label="PDF" from_work_dir="$__tool_directory__/report1.pdf" /> - <!--data name="Results" format="png" label="results2" from_work_dir="$__tool_directory__/w.png"/--> - <data name="Results non templated treated3" format="tabular" label="results3" from_work_dir="$__tool_directory__/LH8E" /> - <!--data name="Results non templated treated4" format="tabular" label="results4" from_work_dir="$__tool_directory__/Deseq_final_temp_LH8E"/--> - </outputs> - <help> - </help> -</tool>