Mercurial > repos > glogobyte > isoread
changeset 24:d9b2a7df9d4b draft
Deleted selected files
| author | glogobyte | 
|---|---|
| date | Tue, 18 May 2021 05:48:30 +0000 | 
| parents | d2eea02053a0 | 
| children | 8c1c906b7ccd | 
| files | mirbase_functions.py mirbase_graphs.py mirbase_ultra_v2.py mirgene.loc.sample mirgene_functions.py mirgene_graphs.py mirgene_ultra_v2.py n_spiecies.loc.sample | 
| diffstat | 8 files changed, 0 insertions(+), 4296 deletions(-) [+] | 
line wrap: on
 line diff
--- a/mirbase_functions.py Wed Oct 28 08:13:30 2020 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,823 +0,0 @@ -import itertools -import time -import sys -import os -import urllib.request -import gzip -from multiprocessing import Process, Queue, Lock, Pool, Manager, Value -import subprocess -import argparse -from collections import OrderedDict -from matplotlib.backends.backend_pdf import PdfPages -import pandas as pd -from math import pi -import numpy as np -import matplotlib.pyplot as plt -from matplotlib.ticker import PercentFormatter -import seaborn as sns -import scipy.stats as stats -from plotnine import * -import math -import re -import matplotlib.ticker as mtick -import copy - - -"""---------------------- Simple Functions -----------------------""" - -# Read a file and return it as a list -def read(path, flag): - if flag == 0: - with open(path) as fp: - file=fp.readlines() - fp.close() - return file - - if flag == 1: - with open(path) as fp: - file = fp.read().splitlines() - fp.close() - return file - -# Write a list to a txt file -def write(path, list): - with open(path,'w') as fp: - for x in list: - fp.write(str("\t".join(x[1:-1]))) - fp.close() - -"""---------------------- RNA-seq Functions ----------------------""" - -# Detect the longest common substring sequence between two mirnas -def longestSubstring(str1, str2): - - from difflib import SequenceMatcher - # initialize SequenceMatcher object with - # input string - seqMatch = SequenceMatcher(None, str1, str2) - - # find match of longest sub-string - # output will be like Match(a=0, b=0, size=5) - match = seqMatch.find_longest_match(0, len(str1), 0, len(str2)) - - # print longest substring - if (match.size != 0): - return str1[match.a: match.a + match.size] - else: - print('No longest common sub-string found') - - - -######################################################################################################################################################## - -def collapse_sam(path): - - ini_sam=read(path,0) - main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]] - intro_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" in x.split("\t")[0]] - - uni_seq = [] - for x in main_sam: - - if [x[2], x[9]] not in uni_seq: - uni_seq.append([x[2], x[9]]) - - new_main_sam=[] - incr_num=0 - for i in range(len(uni_seq)): - count=0 - incr_num+=1 - for y in main_sam: - if uni_seq[i][1]==y[9] and uni_seq[i][0]==y[2]: - count+=1 - temp=y - temp[10]="~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" - temp[0]=str(incr_num)+"-"+str(count) - new_main_sam.append(temp) - - new_sam=intro_sam+new_main_sam - - return new_sam - -################################################################################################################################################################################################################# - -def duplicate_chroms_isoforms(List): - - dupes=[] - - for num in range(len(List)): - - if [List[num][9],List[num][0],List[num][2]] not in dupes : - dupes.append([List[num][9],List[num][0],List[num][2]]) - - for x in List: - for y in dupes: - if x[9]==y[0] and x[0]==y[1] and x[2].split("_")[0]==y[2].split("_")[0] and x[2]!=y[2]: - y.append(x[2]) - - - double_List = [x[:] for x in List] - - chr_order=[] - for x in dupes: - temp = [] - for i in range(2,len(x)): - if x[i].split("chr")[1].split("(")[0].isdigit(): - temp.append(int(x[i].split("chr")[1].split("(")[1][0]+x[i].split("chr")[1].split("(")[0])) - else: - temp.append(x[i].split("chr")[1][0:4]) - - for z in temp: - if 'X(-)'==z or 'Y(-)'==z or 'X(+)'==z or 'Y(+)'==z: - temp = [str(j) for j in temp] - temp=list(set(temp)) - temp.sort() - chr_order.append(temp) - - final_dupes=[] - for i in range(len(dupes)): - final_dupes.append([dupes[i][0],dupes[i][2].split("_")[0],dupes[i][1]]) - for x in chr_order[i]: - result = re.match("[-+]?\d+$", str(x)) - if len(chr_order[i]) == len(set(chr_order[i])): - if result is not None: - - if int(x)<0: - final_dupes[i][1]=final_dupes[i][1]+"_chr"+str(abs(int(x)))+"(-)" - else: - final_dupes[i][1] = final_dupes[i][1] + "_chr" + str(abs(int(x)))+"(+)" - else: - final_dupes[i][1] = final_dupes[i][1] + "_chr" + str(x) - else: - if result is not None: - if int(x) < 0: - final_dupes[i][1] = final_dupes[i][1] +dupes[i][2].split("_")[1]+ "_chr" + str(abs(int(x))) + "(-)" - else: - final_dupes[i][1] = final_dupes[i][1] +dupes[i][2].split("_")[1]+ "_chr" + str(abs(int(x))) + "(+)" - else: - final_dupes[i][1] = final_dupes[i][1] +dupes[i][2].split("_")[1]+ "_chr" + str(x) - - final_dupes.sort() - final_dupes=list(final_dupes for final_dupes,_ in itertools.groupby(final_dupes)) - - for i in range(len(double_List)): - for x in final_dupes: - - if double_List[i][9] == x[0] and double_List[i][0] == x[2] and len(double_List[i][2].split("_")) >3 and double_List[i][2].split("_")[0]==x[1].split("_")[0]: - gg=str("_"+double_List[i][2].split("_")[-2]+"_"+double_List[i][2].split("_")[-1]) - double_List[i][2] = x[1]+gg - - if double_List[i][9]==x[0] and double_List[i][0]== x[2] and len(double_List[i][2].split("_"))==3 and double_List[i][2].split("_")[0]==x[1].split("_")[0]: - double_List[i][2]=x[1] - List[i][2] = x[1] - - List.sort() - new_list=list(List for List,_ in itertools.groupby(List)) - - double_List.sort() - new_double_List = list(double_List for double_List, _ in itertools.groupby(double_List)) - - return new_list, new_double_List - - -############################################################################################################################################################################################################# - -def sam(mature_mirnas,path,name,con,l,samples,data,names,unmap_seq,samples_mirna_names,deseq,LHE_names,umi,ini_sample,unmap_counts): - - # read the sam file - ini_sam=read(path,0) - new_main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]] - unique_seq = [x for x in new_main_sam if x[1] == '0' and len(x[9])>=18 and len(x[9])<=26] - - sorted_uni_arms = [] - - for i in range(len(mature_mirnas)): - tmp_count_reads = 0 # calculate the total number of reads - tmp_count_seq = 0 # calculate the total number of sequences - for j in range(len(unique_seq)): - - if "{" in unique_seq[j][2].split("_")[0]: - official=unique_seq[j][2].split("_")[0][:-4] - else: - official=unique_seq[j][2].split("_")[0] - - if mature_mirnas[i].split(" ")[0][1:] == official: - - temp_mature = mature_mirnas[i+1].strip().replace("U", "T") - off_part = longestSubstring(temp_mature, unique_seq[j][9]) - - mat_diff = temp_mature.split(off_part) - mat_diff = [len(mat_diff[0]), len(mat_diff[1])] - - unique_diff = unique_seq[j][9].split(off_part) - unique_diff = [len(unique_diff[0]), len(unique_diff[1])] - - # Problem with hsa-miR-8485 - if mat_diff[1]!=0 and unique_diff[1]!=0: - unique_seq[j]=1 - pre_pos = 0 - post_pos = 0 - - elif mat_diff[0]!=0 and unique_diff[0]!=0: - unique_seq[j]=1 - pre_pos = 0 - post_pos = 0 - - else: - pre_pos = mat_diff[0]-unique_diff[0] - post_pos = unique_diff[1]-mat_diff[1] - tmp_count_reads = tmp_count_reads + int(unique_seq[j][0].split("-")[1]) - tmp_count_seq = tmp_count_seq+1 - - if pre_pos != 0 or post_pos != 0: - if pre_pos == 0: - unique_seq[j][2] = unique_seq[j][2] + "_" +str(pre_pos) + "_" + '{:+d}'.format(post_pos) - elif post_pos == 0: - unique_seq[j][2] = unique_seq[j][2] + "_" + '{:+d}'.format(pre_pos) + "_" + str(post_pos) - else: - unique_seq[j][2] = unique_seq[j][2]+"_"+'{:+d}'.format(pre_pos)+"_"+'{:+d}'.format(post_pos) - - for x in range(unique_seq.count(1)): - unique_seq.remove(1) - if tmp_count_reads != 0 and tmp_count_seq != 0: - sorted_uni_arms.append([mature_mirnas[i].split(" ")[0][1:], tmp_count_seq, tmp_count_reads]) - sorted_uni_arms = sorted(sorted_uni_arms, key=lambda x: x[1], reverse=True) - dedup_unique_seq,double_fil_uni_seq=duplicate_chroms_isoforms(unique_seq) - - for y in sorted_uni_arms: - counts=0 - seqs=0 - for x in double_fil_uni_seq: - if y[0]==x[2].split("_")[0]: - counts+=int(x[0].split("-")[1]) - seqs+=1 - - y[1]=seqs - y[2]=counts - - LHE=[] - l.acquire() - if con=="c": - LHE.extend(z[2] for z in double_fil_uni_seq) - for y in double_fil_uni_seq: - samples_mirna_names.append([y[2],y[9]]) - deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in double_fil_uni_seq]) - LHE_names.extend(LHE) - unmap_seq.value += sum([1 for x in new_main_sam if x[1] == '4']) - unmap_counts.value += sum([int(x[0].split("-")[1]) for x in new_main_sam if x[1] == '4']) - names.append(name) - samples.append(dedup_unique_seq) - data.append([con,name,double_fil_uni_seq,sorted_uni_arms]) - ini_sample.append(new_main_sam) - - if con=="t": - LHE.extend(z[2] for z in double_fil_uni_seq) - for y in double_fil_uni_seq: - samples_mirna_names.append([y[2],y[9]]) - deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in double_fil_uni_seq]) - LHE_names.extend(LHE) - unmap_seq.value += sum([1 for x in new_main_sam if x[1] == '4']) - unmap_counts.value += sum([int(x[0].split("-")[1]) for x in new_main_sam if x[1] == '4']) - names.append(name) - samples.append(dedup_unique_seq) - data.append([con,name,double_fil_uni_seq,sorted_uni_arms]) - ini_sample.append(new_main_sam) - l.release() - - -###################################################################################################################################### - -""" - -Read a sam file from Bowtie and do the followings: - -1) Remove reverse stranded mapped reads -2) Remove unmapped reads -3) Remove all sequences with reads less than 11 reads -4) Sort the arms with the most sequences in decreading rate -5) Sort the sequences of every arm with the most reads in decreasing rate -6) Calculate total number of sequences of every arm -7) Calculate total number of reads of sequences of every arm. -8) Store all the informations in a txt file - -""" - -def non_sam(mature_mirnas,path,name,con,l,data,names,n_deseq,n_samples_mirna_names,n_LHE_names): - - ini_sam=read(path,0) - new_main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]] - unique_seq=[] - unique_seq = [x for x in new_main_sam if x[1] == '4' and len(x[9])>=18 and len(x[9])<=26] - - uni_seq=[] - # Calculate the shifted positions for every isomir and add them to the name of it - sorted_uni_arms = [] - for i in range(1,len(mature_mirnas),2): - tmp_count_reads = 0 # calculate the total number of reads - tmp_count_seq = 0 # calculate the total number of sequences - - for j in range(len(unique_seq)): - - temp_mature = mature_mirnas[i].strip().replace("U", "T") - - if temp_mature in unique_seq[j][9]: - - off_part = longestSubstring(temp_mature, unique_seq[j][9]) - - mat_diff = temp_mature.split(off_part) - mat_diff = [len(mat_diff[0]), len(mat_diff[1])] - - unique_diff = unique_seq[j][9].split(off_part) - if len(unique_diff)<=2: - unique_diff = [len(unique_diff[0]), len(unique_diff[1])] - - pre_pos = mat_diff[0]-unique_diff[0] - post_pos = unique_diff[1]-mat_diff[1] - - lengthofmir = len(off_part) + post_pos - if pre_pos == 0: - tmp_count_reads = tmp_count_reads + int(unique_seq[j][0].split("-")[1]) - tmp_count_seq = tmp_count_seq + 1 - - if pre_pos == 0: - - t_name=unique_seq[j].copy() - t_name[2]=mature_mirnas[i - 1].split(" ")[0][1:] + "__" + str(pre_pos) + "_" + '{:+d}'.format(post_pos) + "_" + str(unique_seq[j][9][len(off_part):]) - uni_seq.append(t_name) - - - if tmp_count_reads != 0 and tmp_count_seq != 0: - sorted_uni_arms.append([mature_mirnas[i-1].split(" ")[0][1:], tmp_count_seq, tmp_count_reads]) - - - sorted_uni_arms = sorted(sorted_uni_arms, key=lambda x: x[1], reverse=True) - unique_seq = list(map(list, OrderedDict.fromkeys(map(tuple,uni_seq)))) - - LHE=[] - - l.acquire() - if con=="c": - LHE.extend(x[2] for x in unique_seq if x[2]!="*") - for x in unique_seq: - if x[2]!="*": - n_samples_mirna_names.append([x[2],x[9]]) - n_deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in unique_seq if x[2]!="*"]) - n_LHE_names.extend(LHE) - names.append(name) - data.append([con,name,unique_seq,sorted_uni_arms]) - - - if con=="t": - LHE.extend(x[2] for x in unique_seq if x[2]!="*") - for x in unique_seq: - if x[2]!="*": - n_samples_mirna_names.append([x[2],x[9]]) - n_deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in unique_seq if x[2]!="*"]) - n_LHE_names.extend(LHE) - names.append(name) - data.append([con,name,unique_seq,sorted_uni_arms]) - l.release() - -##################################################################################################################################################################################################################### -def deseq2_temp(samples_mirna_names,deseq,con,l): - - samples_mirna_names.sort(key=lambda x:[0]) - for i in range(len(deseq)): - for y in samples_mirna_names: - flag = 0 - for x in deseq[i]: - if y[0] == x[0]: - flag = 1 - break - - if flag == 0: - deseq[i].append([y[0], "0", y[1]]) - - [deseq[i].sort(key=lambda x: x[0]) for i, _ in enumerate(deseq)] - deseq_final = [[x[0],x[2]] for x in deseq[0]] - [deseq_final[z].append(deseq[i][j][1]) for z,_ in enumerate(deseq_final) for i, _ in enumerate(deseq) for j,_ in enumerate(deseq[i]) if deseq_final[z][0] == deseq[i][j][0]] - - l.acquire() - if con=="c": - q1.put(deseq_final) - - if con=="t": - q2.put(deseq_final) - l.release() - - -#################################################################################################################################################################################################################### - -def main_temp(LH2E, LH2E_names, LH8E, LH8E_names,flag,names_con,names_tre,filter_LH8E,filter_LH2E,raw_LH8E,raw_LH2E,per,count): - - LH8E_add_names = [x for x in LH2E_names if x not in LH8E_names] - LH2E_add_names = [x for x in LH8E_names if x not in LH2E_names] - - LH8E_add_names.sort() - LH2E_add_names.sort() - LH8E_add_names = list(LH8E_add_names for LH8E_add_names,_ in itertools.groupby(LH8E_add_names)) - LH2E_add_names = list(LH2E_add_names for LH2E_add_names,_ in itertools.groupby(LH2E_add_names)) - - LH2E.sort() - LH8E.sort() - LH2E = list(LH2E for LH2E,_ in itertools.groupby(LH2E)) - LH8E = list(LH8E for LH8E,_ in itertools.groupby(LH8E)) - - zeros=["0"]*(len(LH8E[0])-2) - [LH8E_add_names[i].extend(zeros) for i,_ in enumerate(LH8E_add_names)] - LH8E=LH8E+LH8E_add_names - - zeros=["0"]*(len(LH2E[0])-2) - [LH2E_add_names[i].extend(zeros) for i,_ in enumerate(LH2E_add_names)] - LH2E=LH2E+LH2E_add_names - - dupes=[] - final_LH2E =[] - - for num,_ in enumerate(LH2E): - - if LH2E[num][1] not in final_LH2E and LH2E[num][0] not in final_LH2E: - final_LH2E.append(LH2E[num][1]) - final_LH2E.append(LH2E[num][0]) - else: - dupes.append(LH2E[num][1]) - - - dupes=list(set(dupes)) - - dupes=[[x] for x in dupes] - - for x in LH2E: - for y in dupes: - if x[1]==y[0]: - fl=0 - if len(y)==1: - y.append(x[0]) - else: - for i in range(1,len(y)): - if y[i].split("_")[0]==x[0].split("_")[0]: - fl=1 - if len(x[0])<len(y[i]): - del y[i] - y.append(x[0]) - break - - if fl==0: - y.append((x[0])) - - for y in dupes: - if len(y)>2: - for i in range(len(y)-1,1,-1): - y[1]=y[1]+"/"+y[i] - del y[i] - - for x in LH2E: - for y in dupes: - if x[1]==y[0]: - x[0]=y[1] - - for x in LH8E: - for y in dupes: - if x[1]==y[0]: - x[0]=y[1] - - - LH2E.sort() - LH2E=list(LH2E for LH2E,_ in itertools.groupby(LH2E)) - - LH8E.sort() - LH8E=list(LH8E for LH8E,_ in itertools.groupby(LH8E)) - - if int(per)!=-1: - percent=int(per)/100 - - c_col_filter=round(percent*(len(LH2E[1])-2)) - t_col_filter=round(percent*(len(LH8E[1])-2)) - - for i, _ in enumerate(LH2E): - c_cols=0 - t_cols=0 - - c_cols=sum([1 for j in range(len(LH2E[i])-2) if int(LH2E[i][j+2])>=int(count)]) - t_cols=sum([1 for j in range(len(LH8E[i])-2) if int(LH8E[i][j+2])>=int(count)]) - - if c_cols>=c_col_filter or t_cols>=t_col_filter: - filter_LH8E.append(LH8E[i]) - filter_LH2E.append(LH2E[i]) - - raw_LH2E.extend(LH2E) - raw_LH8E.extend(LH8E) - -################################################################################################################################################################################################################## - -def write_main(raw_LH2E, raw_LH8E, fil_LH2E, fil_LH8E, names_con, names_tre, flag, per, n1, n2): - - if flag == 1 and int(per)!=-1: - fp = open('Counts/Filtered '+n2 +' Templated Counts', 'w') - fp.write("Name\t") - fp.write("Sequence") - for y in names_tre: - fp.write("\t"+y) - - for x in fil_LH8E: - fp.write("\n%s" % "\t".join(x)) - fp.close() - - fp = open('Counts/Filtered '+n1+' Templated Counts', 'w') - fp.write("Name\t") - fp.write("Sequence") - for y in names_con: - fp.write("\t"+y) - - for x in fil_LH2E: - fp.write("\n%s" % "\t".join(x)) - fp.close() - - - if flag == 2 and int(per)!=-1: - fp = open('Counts/Filtered '+n2+' Non-Templated Counts', 'w') - fp.write("Name\t") - fp.write("Sequence") - for y in names_tre: - fp.write("\t"+y) - - - for x in fil_LH8E: - fp.write("\n%s" % "\t".join(x)) - fp.close() - - fp = open('Counts/Filtered '+n1+' Non-Templated Counts', 'w') - fp.write("Name\t") - fp.write("Sequence") - for y in names_con: - fp.write("\t"+y) - - for x in fil_LH2E: - fp.write("\n%s" % "\t".join(x)) - fp.close() - - - if flag == 1: - fp = open('Counts/Raw '+n2+' Templated Counts', 'w') - fp.write("Name\t") - fp.write("Sequence") - for y in names_tre: - fp.write("\t"+y) - - for x in raw_LH8E: - fp.write("\n%s" % "\t".join(x)) - fp.close() - - fp = open('Counts/Raw '+n1+' Templated Counts', 'w') - fp.write("Name\t") - fp.write("Sequence") - for y in names_con: - fp.write("\t"+y) - - for x in raw_LH2E: - fp.write("\n%s" % "\t".join(x)) - fp.close() - - - if flag == 2: - fp = open('Counts/Raw '+n2+' Non-Templated Counts', 'w') - fp.write("Name\t") - fp.write("Sequence") - for y in names_tre: - fp.write("\t"+y) - - - for x in raw_LH8E: - fp.write("\n%s" % "\t".join(x)) - fp.close() - - fp = open('Counts/Raw '+n1+' Non-Templated Counts', 'w') - fp.write("Name\t") - fp.write("Sequence") - for y in names_con: - fp.write("\t"+y) - - for x in raw_LH2E: - fp.write("\n%s" % "\t".join(x)) - fp.close() - - -######################################################################################################################################### - -def ssamples(names,samp,folder,pro): - - for i in range(2,len(samp[0])): - - fp = open(folder+names[i-2]+'.txt','w') - fp.write("miRNA id"+"\t"+names[i-2]+"\n") - - for x in samp: - fp.write("%s" % "\t".join([x[0],x[i]])+"\n") - fp.close() - -################################################################################################################## - -def DB_write(con,name,unique_seq,sorted_uni_arms,f): - - if f==1: - # Write a txt file with all the information - if con=="c": - fp = open('split1/'+name, 'w') - - fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence")) - if con=="t": - fp = open('split2/'+name, 'w') - fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence")) - - - for i in range(len(sorted_uni_arms)): - temp = [] - for j in range(len(unique_seq)): - - if sorted_uni_arms[i][0] in unique_seq[j][2].split("_")[0]: - - temp.append(unique_seq[j]) - - temp = sorted(temp, key=lambda x: int(x[0].split('-')[1]), reverse=True) - fp.write("*********************************************************************************************************\n") - fp.write("%-8s\t%-22s\t%-25s\t%-30s\t%s\n" % ("|",str(sorted_uni_arms[i][0]),"Sequence count = "+str(sorted_uni_arms[i][1]),"Total reads = "+str(sorted_uni_arms[i][2]),"|")) - fp.write("*********************************************************************************************************\n\n") - [fp.write("%-8s\t%-40s\t%s\n" % (x[0].split("-")[1], x[2],x[9])) for x in temp] - fp.write("\n" + "\n") - fp.close() - - if f==2: - - if con=="c": - fp = open('split3/'+name, 'w') - fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence")) - if con=="t": - fp = open('split4/'+name, 'w') - fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence")) - - - for i in range(len(sorted_uni_arms)): - temp = [] - for j in range(len(unique_seq)): - if sorted_uni_arms[i][0]==unique_seq[j][2].split("__")[0]: - temp.append(unique_seq[j]) - if temp!=[]: - temp = sorted(temp, key=lambda x: int(x[0].split('-')[1]), reverse=True) - fp.write("*********************************************************************************************************\n") - fp.write("%-8s\t%-22s\t%-25s\t%-30s\t%s\n" % ("|",str(sorted_uni_arms[i][0]),"Sequence count = "+str(sorted_uni_arms[i][1]),"Total reads = "+str(sorted_uni_arms[i][2]),"|")) - fp.write("*********************************************************************************************************\n\n") - [fp.write("%-8s\t%-40s\t%s\n" % (x[0].split("-")[1], x[2],x[9])) for x in temp] - fp.write("\n" + "\n") - fp.close() - - -########################################################################################################################## - -def new_mat_seq(pre_unique_seq,mat_mirnas,l): - - unique_iso = [] - for x in pre_unique_seq: - if len(x[2].split("_"))==3: - for y in pre_unique_seq: - if x[2] in y[2] and int(x[0].split("-")[1])<int(y[0].split("-")[1]): - if any(y[2] in lst2 for lst2 in unique_iso)==False: - y[2]=">"+y[2] - unique_iso.append(y) - l.acquire() - for x in unique_iso: - mat_mirnas.append(x[2]) - mat_mirnas.append(x[9]) - l.release() - -######################################################################################################################### - -def merging_names(LH2E_copy,new): - - dupes=[] - final_LH2E =[] - - for num in range(len(LH2E_copy)): - - if LH2E_copy[num][1] not in final_LH2E and LH2E_copy[num][0] not in final_LH2E: - final_LH2E.append(LH2E_copy[num][1]) - final_LH2E.append(LH2E_copy[num][0]) - else: - dupes.append(LH2E_copy[num][1]) - - dupes=list(set(dupes)) - - for i in range(len(dupes)): - dupes[i]=[dupes[i]] - - for x in LH2E_copy: - for y in dupes: - if x[1]==y[0]: - fl=0 - if len(y)==1: - y.append(x[0]) - else: - for i in range(1,len(y)): - if y[i].split("_")[0]==x[0].split("_")[0]: - fl=1 - if len(x[0])<len(y[i]): - del y[i] - y.append(x[0]) - break - - if fl==0: - y.append((x[0])) - - for y in dupes: - if len(y)>2: - for i in range(len(y)-1,1,-1): - y[1]=y[1]+"/"+y[i] - del y[i] - - - for x in LH2E_copy: - for y in dupes: - if x[1]==y[0]: - x[0]=y[1] - - LH2E_copy.sort() - LH2E_copy=list(LH2E_copy for LH2E_copy,_ in itertools.groupby(LH2E_copy)) - - new.extend(LH2E_copy) - - -###################################################################################################################################################### - -def ssamples1(tem_names,tem_samp,non_names,non_samp,folder,pro): - - for i in range(2,len(tem_samp[0])): - - fp = open(folder+tem_names[i-2]+'.txt','w') - fp.write("miRNA id"+"\t"+tem_names[i-2]+"\n") - - for x in tem_samp: - fp.write("%s" % "\t".join([x[0],x[i]])+"\n") - - for j in range(len(non_names)): - if non_names[j]==tem_names[i-2]: - for x in non_samp: - fp.write("%s" % "\t".join([x[0],x[j+2]])+"\n") - fp.close() - -################################################################################################################################################################################################################### - -def download_matures(matures,org_name): - - #url = 'ftp://mirbase.org/pub/mirbase/21/mature.fa.gz' - url = 'ftp://mirbase.org/pub/mirbase/CURRENT/mature.fa.gz' - data = urllib.request.urlopen(url).read() - file_mirna = gzip.decompress(data).decode('utf-8') - file_mirna = file_mirna.split("\n") - - for i in range(0,len(file_mirna)-1,2): - - if org_name in file_mirna[i]: - matures.append(file_mirna[i]) - matures.append(file_mirna[i+1]) - -################################################################################################################################################################################################################### -def non_template_ref(sc,st,all_isoforms): - - pre_uni_seq_con = list(sc) - pre_uni_seq_tre = list(st) - - for x in pre_uni_seq_con: - for y in x: - if ">"+y[2] not in all_isoforms and ")_" in y[2] : - all_isoforms.append(">"+y[2]) - all_isoforms.append(y[9]) - - - for x in pre_uni_seq_tre: - for y in x: - if ">"+y[2] not in all_isoforms and ")_" in y[2]: - all_isoforms.append(">"+y[2]) - all_isoforms.append(y[9]) - -################################################################################################################################################################################################ - -def deseqe2(sample,mir_names,l,new_d,sample_name,sample_order): - - for y in mir_names: - flag=0 - for x in sample: - if y[0]==x[0]: - flag=1 - break - if flag==0: - sample.append([y[0],"0",y[1]]) - - sample.sort(key=lambda x: x[0]) - sample=list(sample for sample,_ in itertools.groupby(sample)) - - l.acquire() - new_d.append(sample) - sample_order.append(sample_name) - l.release() - -############################################################################################################################################################################################### -
--- a/mirbase_graphs.py Wed Oct 28 08:13:30 2020 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,809 +0,0 @@ -import itertools -import time -import sys -import os -import urllib.request -import gzip -from multiprocessing import Process, Queue, Lock, Pool, Manager, Value -import subprocess -import argparse -from collections import OrderedDict -from matplotlib.backends.backend_pdf import PdfPages -import pandas as pd -from math import pi -import numpy as np -import matplotlib.pyplot as plt -from matplotlib.ticker import PercentFormatter -import seaborn as sns -import scipy.stats as stats -from plotnine import * -import math -import re -import matplotlib.ticker as mtick -import copy - - -################################################################################################################################################################# -def pie_non_temp(merge_LH2E,merge_non_LH2E,merge_LH8E,merge_non_LH8E,c_unmap,t_unmap,c_unmap_counts,t_unmap_counts): - - c_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_LH2E] - t_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_LH8E] - c_non_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_non_LH2E] - t_non_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_non_LH8E] - - c_templ = 0 - c_tem_counts = 0 - c_mature = 0 - c_mat_counts = 0 - t_templ = 0 - t_tem_counts = 0 - t_mature = 0 - t_mat_counts = 0 - - c_non = len(c_non_samples) - c_non_counts = sum(x[2] for x in c_non_samples) - t_non = len(t_non_samples) - t_non_counts = sum(x[2] for x in t_non_samples) - - c_unmap = c_unmap - c_non - t_unmap = c_unmap - t_non - - c_unmap_counts=c_unmap_counts - c_non_counts - t_unmap_counts=t_unmap_counts - t_non_counts - - - for x in c_samples: - - if "/" not in x[0]: - if "chr" in x[0].split("_")[-1]: - c_mature+=1 - c_mat_counts += x[2] - else: - c_templ+=1 - c_tem_counts += x[2] - else: - f=0 - for y in x[0].split("/"): - if "chr" in y.split("_")[-1]: - c_mature+=1 - c_mat_counts += x[2] - f=1 - break - if f==0: - c_templ+=1 - c_tem_counts += x[2] - - for x in t_samples: - - if "/" not in x[0]: - if "chr" in x[0].split("_")[-1]: - t_mature+=1 - t_mat_counts += x[2] - else: - t_templ+=1 - t_tem_counts += x[2] - else: - f=0 - for y in x[0].split("/"): - if "chr" in y.split("_")[-1]: - t_mature+=1 - t_mat_counts += x[2] - f=1 - break - if f==0: - t_templ+=1 - t_tem_counts += x[2] - - fig = plt.figure(figsize=(7,5)) - labels = 'miRNA RefSeq','Template', 'Unmapped','Non-template' - sizes = [c_mat_counts, c_tem_counts, c_unmap_counts,c_non_counts] - colors = ['gold', 'yellowgreen', 'lightcoral', 'lightskyblue'] - ax1 = plt.subplot2grid((1,2),(0,0)) - patches, texts, autotexts=plt.pie(sizes, labels=labels, colors=colors, startangle=140,autopct='%1.1f%%',radius=0.8) - [x.set_fontsize(8) for x in texts] - plt.title('Control Group (reads)',fontsize=12) - labels = 'miRNA RefSeq','Template', 'Unmapped','non-template' - sizes = [t_mat_counts, t_tem_counts, t_unmap_counts, t_non_counts] - colors = ['gold', 'yellowgreen', 'lightcoral', 'lightskyblue'] - ax2 = plt.subplot2grid((1,2),(0,1)) - patches, texts, autotexts=plt.pie(sizes, labels=labels, colors=colors, startangle=140,autopct='%1.1f%%',radius=0.8) - [x.set_fontsize(8) for x in texts] - plt.title('Treated Group (reads)', fontsize=12) - plt.savefig('pie_non.png',dpi=300) - -###################################################################################################################################################### - - -def pie_temp(merge_LH2E,c_unmap,c_unmap_counts,merge_LH8E,t_unmap,t_unmap_counts): - - c_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_LH2E] - t_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_LH8E] - - c_templ = 0 - c_tem_counts = 0 - c_mature = 0 - c_mat_counts = 0 - t_templ = 0 - t_tem_counts = 0 - t_mature = 0 - t_mat_counts = 0 - - for x in c_samples: - - if "/" not in x[0]: - if "chr" in x[0].split("_")[-1]: - c_mature+=1 - c_mat_counts += x[2] - else: - c_templ+=1 - c_tem_counts += x[2] - else: - f=0 - for y in x[0].split("/"): - if "chr" in y.split("_")[-1]: - c_mature+=1 - c_mat_counts += x[2] - f=1 - break - if f==0: - c_templ+=1 - c_tem_counts += x[2] - - for x in t_samples: - - if "/" not in x[0]: - if "chr" in x[0].split("_")[-1]: - t_mature+=1 - t_mat_counts += x[2] - else: - t_templ+=1 - t_tem_counts += x[2] - else: - f=0 - for y in x[0].split("/"): - if "chr" in y.split("_")[-1]: - t_mature+=1 - t_mat_counts += x[2] - f=1 - break - if f==0: - t_templ+=1 - t_tem_counts += x[2] - - - fig = plt.figure() - labels = 'miRNA RefSeq','Template', 'Unmapped' - sizes = [c_mat_counts, c_tem_counts, c_unmap_counts] - colors = ['gold', 'yellowgreen', 'lightskyblue'] - explode = (0.2, 0.05, 0.1) - ax1 = plt.subplot2grid((1,2),(0,0)) - patches, texts, autotexts=plt.pie(sizes, labels=labels, colors=colors, startangle=140,autopct='%1.1f%%',radius=0.8) - [x.set_fontsize(8) for x in texts] - plt.title('Control group (reads)', fontsize=12) - labels = 'miRNA RefSeq','Template', 'Unmapped' - sizes = [t_mat_counts, t_tem_counts, t_unmap_counts] - colors = ['gold', 'yellowgreen', 'lightskyblue'] - explode = (0.2, 0.05, 0.1) - ax2 = plt.subplot2grid((1,2),(0,1)) - patches, texts, autotexts=plt.pie(sizes, labels=labels, colors=colors, startangle=140,autopct='%1.1f%%',radius=0.8) - [x.set_fontsize(8) for x in texts] - plt.title('Treated group (reads)',fontsize = 12) - plt.savefig('pie_tem.png',dpi=300) - -################################################################################################################################################################################################################### - - -def make_spider(merge_LH2E,merge_LH8E): - - c_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_LH2E] - t_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_LH8E] - - c_5 = 0 - c_5_counts = 0 - c_3 = 0 - c_3_counts = 0 - c_both =0 - c_both_counts=0 - c_mature = 0 - c_mat_counts = 0 - c_exception=0 - c_exception_counts=0 - - - t_5 = 0 - t_5_counts = 0 - t_3 = 0 - t_3_counts = 0 - t_both = 0 - t_both_counts = 0 - t_mature = 0 - t_mat_counts = 0 - t_exception = 0 - t_exception_counts=0 - - for x in c_samples: - - if "/" not in x[0]: - if "chr" in x[0].split("_")[-1]: - c_mature+=1 - c_mat_counts += x[2] - elif 0 == int(x[0].split("_")[-1]): - c_5+=1 - c_5_counts += x[2] - elif 0 == int(x[0].split("_")[-2]): - c_3+=1 - c_3_counts += x[2] - else: - c_both+=1 - c_both_counts+=x[2] - - else: - f=0 - for y in x[0].split("/"): - if "chr" in y.split("_")[-1]: - c_mature+=1 - c_mat_counts += x[2] - f=1 - break - if f==0: - for y in x[0].split("/"): - c_exception+=1 - c_exception_counts += x[2] - - - for x in t_samples: - - if "/" not in x[0]: - if "chr" in x[0].split("_")[-1]: - t_mature+=1 - t_mat_counts += x[2] - elif 0 == int(x[0].split("_")[-1]): - t_5+=1 - t_5_counts += x[2] - elif 0 == int(x[0].split("_")[-2]): - t_3+=1 - t_3_counts += x[2] - else: - t_both+=1 - t_both_counts+=x[2] - - else: - f=0 - for y in x[0].split("/"): - if "chr" in y.split("_")[-1]: - t_mature+=1 - t_mat_counts += x[2] - f=1 - break - if f==0: - for y in x[0].split("/"): - t_exception+=1 - t_exception_counts += x[2] - - - c_all = c_5+c_3+c_both+c_mature+c_exception - c_all_counts = c_5_counts + c_3_counts + c_both_counts + c_mat_counts + c_exception_counts - - t_all = t_5+t_3+t_both+t_mature + t_exception - t_all_counts = t_5_counts + t_3_counts + t_both_counts + t_mat_counts + t_exception_counts - - c_5 = round(c_5/c_all*100,2) - c_3 = round(c_3/c_all*100,2) - c_both = round(c_both/c_all*100,2) - c_mature = round(c_mature/c_all*100,2) - c_exception = round(c_exception/c_all*100,2) - - c_5_counts = round(c_5_counts/c_all_counts*100,2) - c_3_counts = round(c_3_counts/c_all_counts*100,2) - c_both_counts = round(c_both_counts/c_all_counts*100,2) - c_mat_counts = round(c_mat_counts/c_all_counts*100,2) - c_exception_counts = round(c_exception_counts/c_all_counts*100,2) - - t_5 = round(t_5/t_all*100,2) - t_3 = round(t_3/t_all*100,2) - t_both = round(t_both/t_all*100,2) - t_mature = round(t_mature/t_all*100,2) - t_exception = round(t_exception/t_all*100,2) - - t_5_counts = round(t_5_counts/t_all_counts*100,2) - t_3_counts = round(t_3_counts/t_all_counts*100,2) - t_both_counts = round(t_both_counts/t_all_counts*100,2) - t_mat_counts = round(t_mat_counts/t_all_counts*100,2) - t_exception_counts = round(t_exception_counts/t_all_counts*100,2) - - radar_max = max(c_5, c_3, c_both,c_mature,c_exception,t_5,t_3,t_both,t_mature,t_exception) - radar_max_counts = max(c_5_counts,c_3_counts,c_both_counts,c_mat_counts,c_exception_counts,t_5_counts,t_3_counts,t_both_counts,t_mat_counts,t_exception_counts) - - df=pd.DataFrame({ - 'group':['Controls','Treated'], - """5' and 3' isomiRs""":[c_both,t_both], - """3' isomiRs""":[c_3,t_3], - 'miRNA RefSeq':[c_mature,t_mature], - """5' isomiRs""":[c_5,t_5], - 'Others*':[c_exception,t_exception]}) - - df1=pd.DataFrame({ - 'group':['Controls','Treated'], - """5' and 3' isomiRs""":[c_both_counts,t_both_counts], - """3' isomiRs""":[c_3_counts,t_3_counts], - 'miRNA RefSeq':[c_mat_counts,t_mat_counts], - """5' isomiRs""":[c_5_counts,t_5_counts], - 'Others*':[c_exception_counts,t_exception_counts]}) - - spider_last(df,radar_max,1) - spider_last(df1,radar_max_counts,2) - -##################################################################################################################################################### - -def spider_last(df,radar_max,flag): - # ------- PART 1: Create background - fig = plt.figure() - # number of variable - categories=list(df)[1:] - N = len(categories) - - # What will be the angle of each axis in the plot? (we divide the plot / number of variable) - angles = [n / float(N) * 2 * pi for n in range(N)] - angles += angles[:1] - - # Initialise the spider plot - ax = plt.subplot(111, polar=True) - - # If you want the first axis to be on top: - ax.set_theta_offset(pi/2) - ax.set_theta_direction(-1) - - # Draw one axe per variable + add labels labels yet - plt.xticks(angles[:-1], categories, fontsize=11) - - # Draw ylabels - radar_max=round(radar_max+radar_max*0.1) - mul=len(str(radar_max))-1 - maxi=int(math.ceil(radar_max / pow(10,mul))) * pow(10,mul) - sep = round(maxi/4) - plt.yticks([sep, 2*sep, 3*sep, 4*sep, 5*sep], [str(sep)+'%', str(2*sep)+'%', str(3*sep)+'%', str(4*sep)+'%', str(5*sep)+'%'], color="grey", size=10) - plt.ylim(0, maxi) - - # ------- PART 2: Add plots - - # Plot each individual = each line of the data - # I don't do a loop, because plotting more than 3 groups makes the chart unreadable - - # Ind1 - values=df.loc[0].drop('group').values.flatten().tolist() - values += values[:1] - ax.plot(angles, values,'-o', linewidth=1, linestyle='solid', label="Controls") - ax.fill(angles, values, 'b', alpha=0.1) - - # Ind2 - values=df.loc[1].drop('group').values.flatten().tolist() - values += values[:1] - ax.plot(angles, values, '-o' ,linewidth=1, linestyle='solid', label="Treated") - ax.fill(angles, values, 'r', alpha=0.1) - - # Add legend - if flag==1: - plt.legend(loc='upper right', bbox_to_anchor=(0.0, 0.1)) - plt.savefig('spider_non_red.png',dpi=300) - else: - plt.legend(loc='upper right', bbox_to_anchor=(0.0, 0.1)) - plt.savefig('spider_red.png',dpi=300) - - -############################################################################################################################################################################################################# - -def hist_red(samples,flag): - lengths=[] - cat=[] - total_reads=0 - seq=[] - - if flag == "c": - title = "Length Distribution of Control group (Redudant reads)" - if flag == "t": - title = "Length Distribution of Treated group (Redudant reads)" - - for i in samples: - for x in i: - lengths.append(len(x[9])) - if x[1]=="0": - seq.append([x[9],x[0].split("-")[1],"Mapped"]) - cat.append("Mapped") - if x[1] == "4": - seq.append([x[9],x[0].split("-")[1],"Unmapped"]) - cat.append("Unmapped") - - uni_len=list(set(lengths)) - uni_len=[x for x in uni_len if x<=35] - low=min(lengths) - up=max(lengths) - seq.sort() - uni_seq=list(seq for seq,_ in itertools.groupby(seq)) - dim=up-low - - if dim>20: - s=5 - else: - s=8 - - total_reads+=sum([int(x[1]) for x in uni_seq]) - - map_reads=[] - unmap_reads=[] - length=[] - for y in uni_len: - map_temp=0 - unmap_temp=0 - for x in uni_seq: - if len(x[0])==y and x[2]=="Mapped": - map_temp+=int(x[1]) - if len(x[0])==y and x[2]=="Unmapped": - unmap_temp+=int(x[1]) - if y<=35: - length.append(y) - map_reads.append(round(map_temp/total_reads*100,2)) - unmap_reads.append(round(unmap_temp/total_reads*100,2)) - - ylim=max([sum(x) for x in zip(unmap_reads, map_reads)]) - ylim=ylim+ylim*20/100 - fig, ax = plt.subplots() - width=0.8 - ax.bar(length, unmap_reads, width, label='Unmapped') - h=ax.bar(length, map_reads, width, bottom=unmap_reads, label='Mapped') - plt.xticks(np.arange(length[0], length[-1]+1, 1)) - plt.xlabel('Length (nt)',fontsize=14) - plt.ylabel('Percentage',fontsize=14) - plt.title(title,fontsize=14) - ax.legend() - plt.ylim([0, ylim]) - ax.grid(axis='y',linewidth=0.2) - - if flag=='c': - plt.savefig('c_hist_red.png',dpi=300) - - if flag=='t': - plt.savefig('t_hist_red.png',dpi=300) - -################################################################################################################# - -def logo_seq_red(merge, flag): - - if flag=="c": - titlos="Control group (Redundant)" - file_logo="c_logo.png" - file_bar="c_bar.png" - if flag=="t": - titlos="Treated group (Redundant)" - file_logo="t_logo.png" - file_bar="t_bar.png" - - c_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge] - - A=[0]*5 - C=[0]*5 - G=[0]*5 - T=[0]*5 - total_reads=0 - - for y in c_samples: - if "/" in y[0]: - length=[] - for x in y[0].split("/"): - length.append([len(x.split("_")[-1]),x.split("_")[-1],y[2]]) - - best=min(length) - total_reads+=best[2] - for i in range(5): - if i<len(best[1]): - if best[1][i] == "A": - A[i]+=best[2] - elif best[1][i] == "C": - C[i]+=best[2] - elif best[1][i] == "G": - G[i]+=best[2] - else: - T[i]+=best[2] - else: - total_reads+=y[2] - for i in range(5): - if i<len(y[0].split("_")[-1]): - if y[0].split("_")[-1][i] == "A": - A[i]+=(y[2]) - elif y[0].split("_")[-1][i] == "C": - C[i]+=(y[2]) - elif y[0].split("_")[-1][i] == "G": - G[i]+=(y[2]) - else: - T[i]+=y[2] - - A[:] = [round(x*100,1) / total_reads for x in A] - C[:] = [round(x*100,1) / total_reads for x in C] - G[:] = [round(x*100,1) / total_reads for x in G] - T[:] = [round(x*100,1) / total_reads for x in T] - - - - data = {'A':A,'C':C,'G':G,'T':T} - df = pd.DataFrame(data, index=[1,2,3,4,5]) - h=df.plot.bar(color=tuple(["g", "b","gold","r"]) ) - h.grid(axis='y',linewidth=0.2) - plt.xticks(rotation=0, ha="right") - plt.ylabel("Counts (%)",fontsize=18) - plt.xlabel("Positions (nt)",fontsize=18) - plt.title(titlos,fontsize=20) - plt.tight_layout() - plt.savefig(file_bar, dpi=300) - - import logomaker as lm - crp_logo = lm.Logo(df, font_name = 'DejaVu Sans') - crp_logo.style_spines(visible=False) - crp_logo.style_spines(spines=['left', 'bottom'], visible=True) - crp_logo.style_xticks(rotation=0, fmt='%d', anchor=0) - - # style using Axes methods - crp_logo.ax.set_title(titlos,fontsize=18) - crp_logo.ax.set_ylabel("Counts (%)", fontsize=16,labelpad=5) - crp_logo.ax.set_xlabel("Positions (nt)",fontsize=16, labelpad=5) - crp_logo.ax.xaxis.set_ticks_position('none') - crp_logo.ax.xaxis.set_tick_params(pad=-1) - figure = plt.gcf() - figure.set_size_inches(6, 4) - crp_logo.fig.savefig(file_logo,dpi=300) - -########################################################################################################################################################################################################## - -def logo_seq_non_red(merge_LH2E): - - c_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_LH2E] - - A=[0]*5 - C=[0]*5 - G=[0]*5 - T=[0]*5 - - for y in c_samples: - if "/" in y[0]: - length=[] - for x in y[0].split("/"): - length.append([len(x.split("_")[-1]),x.split("_")[-1],y[2]]) - - best=min(length) - for i in range(5): - if i<len(best[1]): - if best[1][i] == "A": - A[i]+=1 - elif best[1][i] == "C": - C[i]+=1 - elif best[1][i] == "G": - G[i]+=1 - else: - T[i]+=1 - else: - for i in range(5): - if i<len(y[0].split("_")[-1]): - if y[0].split("_")[-1][i] == "A": - A[i]+=1 - elif y[0].split("_")[-1][i] == "C": - C[i]+=1 - elif y[0].split("_")[-1][i] == "G": - G[i]+=1 - else: - T[i]+=1 - - data = {'A':A,'C':C,'G':G,'T':T} - df = pd.DataFrame(data, index=[1,2,3,4,5]) - h=df.plot.bar(title="Non-templated nucleotides after templated sequence",color=tuple(["g", "b","gold","r"])) - h.set_xlabel("Positions (nt)") - h.set_ylabel("Unique sequences") - plt.xticks(rotation=0, ha="right") - plt.tight_layout() - plt.savefig("bar2.png", dpi=300) - - - import logomaker as lm - crp_logo = lm.Logo(df, font_name = 'DejaVu Sans') - - # style using Logo methods - crp_logo.style_spines(visible=False) - crp_logo.style_spines(spines=['left', 'bottom'], visible=True) - crp_logo.style_xticks(rotation=0, fmt='%d', anchor=0) - - # style using Axes methods - crp_logo.ax.set_ylabel("Unique sequences", labelpad=5) - crp_logo.ax.set_xlabel("Positions (nt)", labelpad=5) - crp_logo.ax.xaxis.set_ticks_position('none') - crp_logo.ax.xaxis.set_tick_params(pad=-1) - crp_logo.ax.set_title("Non-redundant") - figure = plt.gcf() - crp_logo.fig.savefig('logo2.png', dpi=300) - - -################################################################################################################################################################ - -def pdf_before_DE(analysis): - - # Import FPDF class - from fpdf import FPDF, fpdf - - # Import glob module to find all the files matching a pattern - import glob - - # Image extensions - if analysis=="2": - image_extensions = ("c_hist_red.png","t_hist_red.png","pie_non.png","spider_red.png","spider_non_red.png","c_logo.png","t_logo.png","c_bar.png","t_bar.png") - else: - image_extensions = ("c_hist_red.png","t_hist_red.png","pie_tem.png","spider_red.png","spider_non_red.png") - # This list will hold the images file names - images = [] - - # Build the image list by merging the glob results (a list of files) - # for each extension. We are taking images from current folder. - for extension in image_extensions: - images.extend(glob.glob(extension)) - #sys.exit(images) - # Create instance of FPDF class - pdf = FPDF('P', 'in', 'A4') - # Add new page. Without this you cannot create the document. - pdf.add_page() - # Set font to Arial, 'B'old, 16 pts - pdf.set_font('Arial', 'B', 20.0) - - # Page header - pdf.cell(pdf.w-0.5, 0.5, 'IsomiR Profile Report',align='C') - pdf.ln(0.7) - pdf.set_font('Arial','', 16.0) - pdf.cell(pdf.w-0.5, 0.5, 'sRNA Length Distribution',align='C') - - # Smaller font for image captions - pdf.set_font('Arial', '', 11.0) - - # Image caption - pdf.ln(0.5) - - yh=FPDF.get_y(pdf) - pdf.image(images[0],x=0.3,w=4, h=3) - pdf.image(images[1],x=4,y=yh, w=4, h=3) - pdf.ln(0.3) - - # Image caption - pdf.cell(0.2) - pdf.cell(3.0, 0.0, " Mapped and unmapped reads to custom precussor arm reference DB (5p and 3p arms) in Control (left)") - pdf.ln(0.2) - pdf.cell(0.2) - pdf.cell(3.0, 0.0, " and Treated (right) groups") - - - pdf.ln(0.5) - h1=FPDF.get_y(pdf) - pdf.image(images[2],x=1, w=6.5, h=5) - h2=FPDF.get_y(pdf) - FPDF.set_y(pdf,h1+0.2) - pdf.set_font('Arial','', 14.0) - pdf.cell(pdf.w-0.5, 0.5, 'Template and non-template IsomiRs',align='C') - pdf.set_font('Arial', '', 11.0) - FPDF.set_y(pdf,h2) - FPDF.set_y(pdf,9.5) - # Image caption - pdf.cell(0.2) - if analysis=="2": - pdf.cell(3.0, 0.0, " Template, non-template, miRNA RefSeq and unmapped sequences as percentage of total sRNA reads") - pdf.ln(0.2) - pdf.cell(0.2) - pdf.cell(3.0, 0.0, " in Control (left) and treated (right) groups") - else: - pdf.cell(3.0, 0.0, " Template, miRNA RefSeq and unmapped sequences as percentage of total sRNA reads in") - pdf.ln(0.2) - pdf.cell(0.2) - pdf.cell(3.0, 0.0, " Control (left) and treated (right) groups") - - - - pdf.add_page() - pdf.set_font('Arial', 'B', 16.0) - pdf.cell(pdf.w-0.5, 0.5, "Reference form and isomiR among total miRNA reads",align='C') - pdf.ln(0.7) - pdf.set_font('Arial', 'B', 12.0) - pdf.cell(pdf.w-0.5, 0.5, "Template isomiR profile (redundant)",align='C') - pdf.ln(0.5) - pdf.image(images[3],x=1.5, w=5.5, h=4) - pdf.ln(0.6) - pdf.cell(pdf.w-0.5, 0.0, "Template isomiR profile (non-redundant)",align='C') - pdf.set_font('Arial', '', 12.0) - pdf.ln(0.2) - pdf.image(images[4],x=1.5, w=5.5, h=4) - pdf.ln(0.3) - pdf.set_font('Arial', '', 11.0) - pdf.cell(0.2) - pdf.cell(3.0, 0.0, " * IsomiRs potentialy initiated from multiple loci") - - - if analysis=="2": - pdf.add_page('L') - - pdf.set_font('Arial', 'B', 16.0) - pdf.cell(pdf.w-0.5, 0.5, "Non-template IsomiRs",align='C') - pdf.ln(0.5) - pdf.set_font('Arial', 'B', 12.0) - pdf.cell(pdf.w-0.5, 0.5, "3' Additions of reference of isomiR sequence",align='C') - pdf.ln(0.7) - - yh=FPDF.get_y(pdf) - pdf.image(images[5],x=1.5,w=3.65, h=2.65) - pdf.image(images[7],x=6.5,y=yh, w=3.65, h=2.65) - pdf.ln(0.5) - yh=FPDF.get_y(pdf) - pdf.image(images[6],x=1.5,w=3.65, h=2.65) - pdf.image(images[8],x=6.5,y=yh, w=3.65, h=2.65) - - pdf.close() - pdf.output('report1.pdf','F') - - - - -#############################################################################################################################################################3 - -def pdf_after_DE(analysis): - - # Import FPDF class - from fpdf import FPDF - - # Import glob module to find all the files matching a pattern - import glob - - # Image extensions - image_extensions = ("tem.png","non.png","a2.png") - - # This list will hold the images file names - images = [] - - # Build the image list by merging the glob results (a list of files) - # for each extension. We are taking images from current folder. - for extension in image_extensions: - images.extend(glob.glob(extension)) - - # Create instance of FPDF class - pdf = FPDF('P', 'in', 'letter') - # Add new page. Without this you cannot create the document. - pdf.add_page() - # Set font to Arial, 'B'old, 16 pts - pdf.set_font('Arial', 'B', 16.0) - - # Page header - pdf.cell(pdf.w-0.5, 0.5, 'Differential expression of miRNAs and Isoforms',align='C') - #pdf.ln(0.25) - - pdf.ln(0.7) - pdf.set_font('Arial','B', 12.0) - pdf.cell(pdf.w-0.5, 0.5, 'Top 50 most differentially expressed miRNA and template isoforms',align='C') - - - # Smaller font for image captions - pdf.set_font('Arial', '', 10.0) - - # Image caption - pdf.ln(0.4) - pdf.image(images[0],x=0.8, w=7, h=8) - pdf.ln(0.3) - - if analysis=="2": - - pdf.add_page() - pdf.ln(0.7) - pdf.set_font('Arial','B', 12.0) - pdf.cell(pdf.w-0.5, 0.5, 'Top 50 most differentially expressed non-template isomiRs',align='C') - pdf.ln(0.4) - pdf.image(images[1],x=0.5, w=7.5, h=6.5) - - pdf.add_page() - pdf.ln(0.5) - pdf.cell(pdf.w-0.5, 0.5, 'Top 50 most differentially expressed miRNAs and isomiRs grouped by arm',align='C') - pdf.ln(0.4) - pdf.image(images[2],x=0.8, w=7, h=8) - pdf.ln(0.3) - - - pdf.output('report2.pdf', 'F') - - -
--- a/mirbase_ultra_v2.py Wed Oct 28 08:13:30 2020 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,367 +0,0 @@ -from mirbase_functions import * -from mirbase_graphs import * -import itertools -import time -import sys -import os -import urllib.request -import gzip -from multiprocessing import Process, Queue, Lock, Pool, Manager, Value -import subprocess -import argparse -from collections import OrderedDict -from matplotlib.backends.backend_pdf import PdfPages -import pandas as pd -from math import pi -import numpy as np -import matplotlib.pyplot as plt -from matplotlib.ticker import PercentFormatter -import seaborn as sns -import scipy.stats as stats -from plotnine import * -import math -import re -import matplotlib.ticker as mtick -import copy - -subprocess.call(['mkdir','-p', 'split1','split2','split3','split4','split11','split12','Counts','Diff/temp_con','Diff/temp_tre','Diff/n_temp_con','Diff/n_temp_tre']) - -parser = argparse.ArgumentParser() -parser.add_argument("-analysis", "--anal", help="choose type of analysis", action="store") -parser.add_argument("-con", "--control", help="input fastq file", nargs='+', default=[]) -parser.add_argument("-tre", "--treated", help="input fastq file", nargs='+', default=[] ) -parser.add_argument("-tool_dir", "--tool_directory", help="tool directory path", action="store") -parser.add_argument("-gen", "--org_name", help="tool directory path", action="store") -parser.add_argument("-program", "--pro", help="choose type of analysis", action="store") -parser.add_argument("-f", "--flag", help="choose the database", action="store") -parser.add_argument("-umis", "--umi", help="choose the database", action="store") -parser.add_argument("-percentage", "--per", help="choose the database", action="store") -parser.add_argument("-counts", "--count", help="choose the database", action="store") -parser.add_argument("-name1", "--n1", help="choose the database", action="store") -parser.add_argument("-name2", "--n2", help="choose the database", action="store") -args = parser.parse_args() - - -############################################################################################################################################################################################### - -if __name__ == '__main__': - - starttime = time.time() - - q1 = Queue() - q2 = Queue() - lock = Lock() - manager = Manager() - - mature_mirnas=manager.list() - ps_mature=Process(target=download_matures,args=(mature_mirnas,args.org_name)) - ps_mature.start() - - args.control[0]=args.control[0][1:] - args.control[len(args.control)-1][:-1] - control = [(args.control[i:i+2]) for i in range(0, len(args.control), 2)] - - args.treated[0]=args.treated[0][1:] - args.treated[len(args.treated)-1][:-1] - treated = [(args.treated[i:i+2]) for i in range(0, len(args.treated), 2)] - - -############## Detection of templated isoforms ################ - - radar = manager.list([0,0,0,0]) - samples = manager.list() - data= manager.list() - names_con=manager.list() - samples_mirna_names=manager.list() - deseq=manager.list() - unmap_seq=manager.Value('i',0) - unmap_counts=manager.Value('i',0) - LH2E_names=manager.list() - ini_c_samples = manager.list() - - - radar1 = manager.list([0,0,0,0]) - samples1 = manager.list() - data1 = manager.list() - names_tre = manager.list() - samples_mirna_names1=manager.list() - deseq1=manager.list() - unmap1_seq = manager.Value('i',0) - unmap1_counts = manager.Value('i',0) - LH8E_names=manager.list() - ini_t_samples = manager.list() - ps_mature.join() - - - mature_mirnas=list(mature_mirnas) - - - starttime1 = time.time() - ps_sam = [Process(target=sam,args=(mature_mirnas,path[1][:-1],path[0].split(",")[0],"c",lock,samples,data,names_con,unmap_seq,samples_mirna_names,deseq,LH2E_names,"0",ini_c_samples,unmap_counts)) for path in control] - ps_sam.extend([Process(target=sam,args=(mature_mirnas,path[1][:-1],path[0].split(",")[0],"t",lock,samples1,data1,names_tre,unmap1_seq,samples_mirna_names1,deseq1,LH8E_names,"0",ini_t_samples,unmap1_counts)) for path in treated]) - - [p.start() for p in ps_sam] - [p.join() for p in ps_sam] - print('SAM took {} seconds'.format(time.time() - starttime1)) - - ps_hist=[Process(target=hist_red,args=(ini_c_samples,'c'))] - ps_hist.extend([Process(target=hist_red,args=(ini_t_samples,'t'))]) - [x.start() for x in ps_hist] - - starttime200=time.time() - - sc = list(samples) - st = list(samples1) - - names_con=list(names_con) - names_tre=list(names_tre) - samples_mirna_names=list(samples_mirna_names) - samples_mirna_names.sort() - samples_mirna_names=list(samples_mirna_names for samples_mirna_names,_ in itertools.groupby(samples_mirna_names)) - - samples_mirna_names1=list(samples_mirna_names1) - samples_mirna_names1.sort() - samples_mirna_names1=list(samples_mirna_names1 for samples_mirna_names1,_ in itertools.groupby(samples_mirna_names1)) - - deseq=list(deseq) - deseq1=list(deseq1) - - new_names_con=manager.list() - new_names_tre=manager.list() - new_deseq=manager.list() - new_deseq1=manager.list() - ps_deseq=[Process(target=deseqe2,args=(sampp,samples_mirna_names,lock,new_deseq,names_con[i],new_names_con)) for i,sampp in enumerate(deseq)] - ps_deseq.extend([Process(target=deseqe2,args=(sampp,samples_mirna_names1,lock,new_deseq1,names_tre[i],new_names_tre)) for i,sampp in enumerate(deseq1)]) - - [z.start() for z in ps_deseq] - [z.join() for z in ps_deseq] - new_deseq=list(new_deseq) - new_deseq1=list(new_deseq1) - - LH2E=[[x[0],x[2]] for x in new_deseq[0]] - [LH2E[i].append(y[i][1]) for i,_ in enumerate(LH2E) for y in new_deseq] - - LH8E=[[x[0],x[2]] for x in new_deseq1[0]] - [LH8E[i].append(y[i][1]) for i,_ in enumerate(LH8E) for y in new_deseq1] - - print('Deseq took {} seconds'.format(time.time() - starttime200)) - - merg_nam_LH2E=manager.list() - merg_nam_LH8E=manager.list() - - LH2E_copy=copy.deepcopy(list(LH2E)) - LH8E_copy=copy.deepcopy(list(LH8E)) - - fil_sort_tre=manager.list() - fil_sort_con=manager.list() - raw_sort_tre=manager.list() - raw_sort_con=manager.list() - - ps_main = Process(target=main_temp,args=(list(LH2E), samples_mirna_names, list(LH8E), samples_mirna_names1,1,list(names_con),list(names_tre),fil_sort_tre,fil_sort_con,raw_sort_tre,raw_sort_con,args.per,args.count)) - ps_main.start() - - if args.anal=="2": - all_iso = manager.list() - ps_non_iso = Process(target=non_template_ref,args=(sc,st,all_iso)) - ps_non_iso.start() - - ps_merge = [Process(target=merging_names,args=(LH2E_copy,merg_nam_LH2E))] - ps_merge.extend([Process(target=merging_names,args=(LH8E_copy,merg_nam_LH8E))]) - [x.start() for x in ps_merge] - [x.join() for x in ps_merge] - - merg_nam_LH2E=list(merg_nam_LH2E) - merg_nam_LH8E=list(merg_nam_LH8E) - - starttime2 = time.time() - procs = [Process(target=DB_write,args=(x[0],x[1],x[2],x[3],1)) for x in data] - procs.extend([Process(target=DB_write,args=(x[0],x[1],x[2],x[3],1)) for x in data1]) - procs.extend([Process(target=make_spider,args=(merg_nam_LH2E,merg_nam_LH8E))]) - if args.anal == "1": - procs.extend([Process(target=pie_temp,args=(merg_nam_LH2E,unmap_seq.value,unmap_counts.value,merg_nam_LH8E,unmap1_seq.value,unmap1_counts.value))]) - - [p.start() for p in procs] - - - if args.anal=="1": - [x.join() for x in ps_hist] - [p.join() for p in procs] - ps_pdf = Process(target=pdf_before_DE,args=(args.anal)) - ps_pdf.start() - - print('Graphs took {} seconds'.format(time.time() - starttime2)) - - ps_main.join() - - fil_sort_con=list(fil_sort_con) - fil_sort_tre=list(fil_sort_tre) - if fil_sort_con==[]: - fil_sort_con=raw_sort_con - fil_sort_tre=raw_sort_tre - - raw_sort_con=list(raw_sort_con) - raw_sort_tre=list(raw_sort_tre) - names_con=list(new_names_con) - names_tre=list(new_names_tre) - - ps_write = Process(target=write_main,args=(raw_sort_con, raw_sort_tre, fil_sort_con, fil_sort_tre, names_con,names_tre,1,args.per,args.n1,args.n2)) - ps_write.start() - - ps1_matrix = [Process(target=ssamples,args=(names_con,fil_sort_con,"Diff/temp_con/",0))] - ps1_matrix.extend([Process(target=ssamples,args=(names_tre,fil_sort_tre,"Diff/temp_tre/",0))]) - [p.start() for p in ps1_matrix] - - if args.anal=="1": - ps_pdf.join() - if args.anal=="2": - [p.join() for p in procs] - [x.join() for x in ps_hist] - - ps_write.join() - [p.join() for p in ps1_matrix] - - -############################## Detection of Both ####################################### - - starttime10 = time.time() - - if args.anal == "2": - - n_data= manager.list() - n_names_con=manager.list() - n_samples_mirna_names=manager.list() - n_deseq=manager.list() - n_LH2E_names=manager.list() - - n_data1 = manager.list() - n_names_tre = manager.list() - n_samples_mirna_names1=manager.list() - n_deseq1=manager.list() - n_LH8E_names=manager.list() - - - new_mat_mirnas = list(mature_mirnas) - ps_non_iso.join() - - all_iso=list(all_iso) - new_mat_mirnas.extend(all_iso) - - starttime11=time.time() - - ps_sam = [Process(target=non_sam,args=(new_mat_mirnas,path[1][:-1],path[0].split(",")[0],"c",lock,n_data,n_names_con,n_deseq,n_samples_mirna_names,n_LH2E_names)) for path in control] - ps_sam.extend([Process(target=non_sam,args=(new_mat_mirnas,path[1][:-1],path[0].split(",")[0],"t",lock,n_data1,n_names_tre,n_deseq1,n_samples_mirna_names1,n_LH8E_names)) for path in treated]) - - [p.start() for p in ps_sam] - [p.join() for p in ps_sam] - - print('Non-sam took {} seconds'.format(time.time() - starttime11)) - - starttime12=time.time() - - n_names_con=list(n_names_con) - n_names_tre=list(n_names_tre) - n_samples_mirna_names=list(n_samples_mirna_names) - n_samples_mirna_names.sort() - n_samples_mirna_names=list(n_samples_mirna_names for n_samples_mirna_names,_ in itertools.groupby(n_samples_mirna_names)) - - n_samples_mirna_names1=list(n_samples_mirna_names1) - n_samples_mirna_names1.sort() - n_samples_mirna_names1=list(n_samples_mirna_names1 for n_samples_mirna_names1,_ in itertools.groupby(n_samples_mirna_names1)) - - n_deseq=list(n_deseq) - n_deseq1=list(n_deseq1) - - new_n_names_con=manager.list() - new_n_names_tre=manager.list() - n_new_deseq=manager.list() - n_new_deseq1=manager.list() - ps_deseq=[Process(target=deseqe2,args=(sampp,n_samples_mirna_names,lock,n_new_deseq,n_names_con[i],new_n_names_con)) for i,sampp in enumerate(n_deseq)] - ps_deseq.extend([Process(target=deseqe2,args=(sampp,n_samples_mirna_names1,lock,n_new_deseq1,n_names_tre[i],new_n_names_tre)) for i,sampp in enumerate(n_deseq1)]) - - [x.start() for x in ps_deseq] - [x.join() for x in ps_deseq] - n_new_deseq=list(n_new_deseq) - n_new_deseq1=list(n_new_deseq1) - - n_LH2E=[[x[0],x[2]] for x in n_new_deseq[0]] - [n_LH2E[i].append(y[i][1]) for i,_ in enumerate(n_LH2E) for y in n_new_deseq] - - n_LH8E=[[x[0],x[2]] for x in n_new_deseq1[0]] - [n_LH8E[i].append(y[i][1]) for i,_ in enumerate(n_LH8E) for y in n_new_deseq1] - - print('Non-deseq took {} seconds'.format(time.time() - starttime12)) - - merg_nam_n_LH2E=manager.list() - merg_nam_n_LH8E=manager.list() - - n_LH2E_copy=copy.deepcopy(list(n_LH2E)) - n_LH8E_copy=copy.deepcopy(list(n_LH8E)) - - n_sort_tre=manager.list() - n_sort_con=manager.list() - - n_fil_sort_con=manager.list() - n_fil_sort_tre=manager.list() - n_raw_sort_con=manager.list() - n_raw_sort_tre=manager.list() - - ps_main = Process(target=main_temp,args=(list(n_LH2E), n_samples_mirna_names, list(n_LH8E), n_samples_mirna_names1,1,list(n_names_con),list(n_names_tre),n_fil_sort_tre,n_fil_sort_con,n_raw_sort_tre,n_raw_sort_con,args.per,args.count)) - ps_main.start() - - ps_merge = [Process(target=merging_names,args=(n_LH2E_copy,merg_nam_n_LH2E))] - ps_merge.extend([Process(target=merging_names,args=(n_LH8E_copy,merg_nam_n_LH8E))]) - [p.start() for p in ps_merge] - [p.join() for p in ps_merge] - - merg_nam_n_LH2E=list(merg_nam_n_LH2E) - merg_nam_n_LH8E=list(merg_nam_n_LH8E) - - procs = [Process(target=DB_write,args=(x[0],x[1],x[2],x[3],2)) for x in n_data] - procs.extend([Process(target=DB_write,args=(x[0],x[1],x[2],x[3],2)) for x in n_data1]) - procs.extend([Process(target=logo_seq_red,args=(merg_nam_n_LH2E,'c'))]) - procs.extend([Process(target=logo_seq_red,args=(merg_nam_n_LH8E,'t'))]) - procs.extend([Process(target=pie_non_temp,args=(merg_nam_LH2E,merg_nam_n_LH2E,merg_nam_LH8E,merg_nam_n_LH8E,unmap_seq.value,unmap1_seq.value,unmap_counts.value,unmap1_counts.value))]) - - starttime13=time.time() - [p.start() for p in procs] - [p.join() for p in procs] - - print('Graphs took {} seconds'.format(time.time() - starttime13)) - - procs1 = Process(target=pdf_before_DE,args=(args.anal)) - procs1.start() - - starttime14=time.time() - ps_main.join() - - n_fil_sort_con=list(n_fil_sort_con) - n_fil_sort_tre=list(n_fil_sort_tre) - if n_fil_sort_con==[]: - n_fil_sort_con=n_raw_sort_con - n_fil_sort_tre=n_raw_sort_tre - - n_raw_sort_con=list(n_raw_sort_con) - n_raw_sort_tre=list(n_raw_sort_tre) - n_names_con=list(new_n_names_con) - n_names_tre=list(new_n_names_tre) - - ps_write = Process(target=write_main,args=(n_raw_sort_con, n_raw_sort_tre,n_fil_sort_con, n_fil_sort_tre, n_names_con, n_names_tre,2,args.per,args.n1,args.n2)) - ps_write.start() - - ps1_matrix = [Process(target=ssamples1,args=(n_names_con,n_fil_sort_con,names_con,fil_sort_con,"Diff/n_temp_con/",0))] - ps1_matrix.extend([Process(target=ssamples1,args=(n_names_tre,n_fil_sort_tre,names_tre,fil_sort_tre,"Diff/n_temp_tre/",0))]) - [p.start() for p in ps1_matrix] - - ps_write.join() - [p.join() for p in ps1_matrix] - procs1.join() - print('That took {} seconds'.format(time.time() - starttime10)) - print('That took {} seconds'.format(time.time() - starttime)) - - - - - - - -
--- a/mirgene.loc.sample Wed Oct 28 08:13:30 2020 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,36 +0,0 @@ -# bowtie2_indices.loc.sample -# This is a *.loc.sample file distributed with Galaxy that enables tools -# to use a directory of indexed data files. This one is for Bowtie2 and Tophat2. -# See the wiki: http://wiki.galaxyproject.org/Admin/NGS%20Local%20Setup -# First create these data files and save them in your own data directory structure. -# Then, create a bowtie_indices.loc file to use those indexes with tools. -# Copy this file, save it with the same name (minus the .sample), -# follow the format examples, and store the result in this directory. -# The file should include an one line entry for each index set. -# The path points to the "basename" for the set, not a specific file. -# It has four text columns seperated by TABS. -# -# <unique_build_id> <dbkey> <display_name> <file_base_path> -# -# So, for example, if you had hg18 indexes stored in: -# -# /depot/data2/galaxy/hg19/bowtie2/ -# -# containing hg19 genome and hg19.*.bt2 files, such as: -# -rw-rw-r-- 1 james james 914M Feb 10 18:56 hg19canon.fa -# -rw-rw-r-- 1 james james 914M Feb 10 18:56 hg19canon.1.bt2 -# -rw-rw-r-- 1 james james 683M Feb 10 18:56 hg19canon.2.bt2 -# -rw-rw-r-- 1 james james 3.3K Feb 10 16:54 hg19canon.3.bt2 -# -rw-rw-r-- 1 james james 683M Feb 10 16:54 hg19canon.4.bt2 -# -rw-rw-r-- 1 james james 914M Feb 10 20:45 hg19canon.rev.1.bt2 -# -rw-rw-r-- 1 james james 683M Feb 10 20:45 hg19canon.rev.2.bt2 -# -# then the bowtie2_indices.loc entry could look like this: -# -# -Hsa Human (Homo sapiens) -# -# -# -# -#
--- a/mirgene_functions.py Wed Oct 28 08:13:30 2020 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,749 +0,0 @@ -import itertools -import time -import sys -import os -import urllib.request -import gzip -from multiprocessing import Process, Queue, Lock, Pool, Manager, Value -import subprocess -import argparse -from collections import OrderedDict -from matplotlib.backends.backend_pdf import PdfPages -import pandas as pd -from math import pi -import numpy as np -import matplotlib.pyplot as plt -from matplotlib.ticker import PercentFormatter -import seaborn as sns -import scipy.stats as stats -from plotnine import * -import math -import re -import matplotlib.ticker as mtick -import copy - - - - -"""---------------------- Simple Functions -----------------------""" - -# Read a file and return it as a list -def read(path, flag): - if flag == 0: - with open(path) as fp: - file=fp.readlines() - fp.close() - return file - - if flag == 1: - with open(path) as fp: - file = fp.read().splitlines() - fp.close() - return file - -# Write a list to a txt file -def write(path, list): - with open(path,'w') as fp: - for x in list: - fp.write(str("\t".join(x[1:-1]))) - fp.close() - -"""---------------------- RNA-seq Functions ----------------------""" - -# Detect the longest common substring sequence between two mirnas -def longestSubstring(str1, str2): - - from difflib import SequenceMatcher - # initialize SequenceMatcher object with - # input string - seqMatch = SequenceMatcher(None, str1, str2) - - # find match of longest sub-string - # output will be like Match(a=0, b=0, size=5) - match = seqMatch.find_longest_match(0, len(str1), 0, len(str2)) - - # print longest substring - if (match.size != 0): - return str1[match.a: match.a + match.size] - else: - print('No longest common sub-string found') - - - -######################################################################################################################################################## -def collapse_sam(path): - - ini_sam=read(path,0) - main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]] - intro_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" in x.split("\t")[0]] - - uni_seq = [] - for x in main_sam: - - if [x[2], x[9]] not in uni_seq: - uni_seq.append([x[2], x[9]]) - - new_main_sam=[] - incr_num=0 - for i in range(len(uni_seq)): - count=0 - incr_num+=1 - for y in main_sam: - if uni_seq[i][1]==y[9] and uni_seq[i][0]==y[2]: - count+=1 - temp=y - temp[10]="~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" - temp[0]=str(incr_num)+"-"+str(count) - new_main_sam.append(temp) - - new_sam=intro_sam+new_main_sam - - return new_sam - -################################################################################################################################################################# - -def sam(mature_mirnas,path,name,con,l,samples,data,names,unmap_seq,samples_mirna_names,deseq,LHE_names,umi,ini_sample,unmap_counts): - - # read the sam file - - ini_sam=read(path,0) - new_main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]] - unique_seq = [x for x in new_main_sam if x[1] == '0' and len(x[9])>=18 and len(x[9])<=26] - - # Calculate the shifted positions for every isomir and add them to the name of it - sorted_uni_arms = [] - - for i in range(len(mature_mirnas)): - tmp_count_reads = 0 # calculate the total number of reads - tmp_count_seq = 0 # calculate the total number of sequences - for j in range(len(unique_seq)): - - if mature_mirnas[i] == unique_seq[j][2]: - - temp_mature = mature_mirnas[i+1] - off_part = longestSubstring(temp_mature, unique_seq[j][9]) - - mat_diff = temp_mature.split(off_part) - mat_diff = [len(mat_diff[0]), len(mat_diff[1])] - - unique_diff = unique_seq[j][9].split(off_part) - unique_diff = [len(unique_diff[0]), len(unique_diff[1])] - - # Problem with hsa-miR-8485 - if mat_diff[1]!=0 and unique_diff[1]!=0: - unique_seq[j]=1 - pre_pos = 0 - post_pos = 0 - - elif mat_diff[0]!=0 and unique_diff[0]!=0: - unique_seq[j]=1 - pre_pos = 0 - post_pos = 0 - - else: - pre_pos = mat_diff[0]-unique_diff[0] - post_pos = unique_diff[1]-mat_diff[1] - tmp_count_reads = tmp_count_reads + int(unique_seq[j][0].split("-")[1]) - tmp_count_seq = tmp_count_seq+1 - - - if pre_pos != 0 or post_pos != 0: - if pre_pos == 0: - unique_seq[j][2] = unique_seq[j][2] + "_" +str(pre_pos) + "_" + '{:+d}'.format(post_pos) - elif post_pos == 0: - unique_seq[j][2] = unique_seq[j][2] + "_" + '{:+d}'.format(pre_pos) + "_" + str(post_pos) - else: - unique_seq[j][2] = unique_seq[j][2]+"_"+'{:+d}'.format(pre_pos)+"_"+'{:+d}'.format(post_pos) - - for x in range(unique_seq.count(1)): - unique_seq.remove(1) - if tmp_count_reads != 0 and tmp_count_seq != 0: - sorted_uni_arms.append([mature_mirnas[i], tmp_count_seq, tmp_count_reads]) - - # Store name of arms, number of sequences and number of reads - sorted_uni_arms = sorted(sorted_uni_arms, key=lambda x: x[1], reverse=True) - - for y in sorted_uni_arms: - counts=0 - seqs=0 - for x in unique_seq: - if y[0]==x[2].split("_")[0]+"_"+x[2].split("_")[1]: - counts+=int(x[0].split("-")[1]) - seqs+=1 - - y[1]=seqs - y[2]=counts - - LHE=[] - - l.acquire() - if con=="c": - LHE.extend(z[2] for z in unique_seq) - for y in unique_seq: - samples_mirna_names.append([y[2],y[9]]) - deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in unique_seq]) - LHE_names.extend(LHE) - unmap_seq.value += sum([1 for x in new_main_sam if x[1] == '4']) - unmap_counts.value += sum([int(x[0].split("-")[1]) for x in new_main_sam if x[1] == '4']) - names.append(name) - samples.append(unique_seq) - data.append([con,name,unique_seq,sorted_uni_arms]) - ini_sample.append(new_main_sam) - - if con=="t": - LHE.extend(z[2] for z in unique_seq) - for y in unique_seq: - samples_mirna_names.append([y[2],y[9]]) - deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in unique_seq]) - LHE_names.extend(LHE) - unmap_seq.value += sum([1 for x in new_main_sam if x[1] == '4']) - unmap_counts.value += sum([int(x[0].split("-")[1]) for x in new_main_sam if x[1] == '4']) - names.append(name) - samples.append(unique_seq) - data.append([con,name,unique_seq,sorted_uni_arms]) - ini_sample.append(new_main_sam) - l.release() - - -###################################################################################################################################### -""" -Read a sam file from Bowtie and do the followings: - -1) Remove reverse stranded mapped reads -2) Remove unmapped reads -3) Remove all sequences with reads less than 11 reads -4) Sort the arms with the most sequences in decreading rate -5) Sort the sequences of every arm with the most reads in decreasing rate -6) Calculate total number of sequences of every arm -7) Calculate total number of reads of sequences of every arm. -8) Store all the informations in a txt file - -""" - -def non_sam(mature_mirnas,path,name,con,l,data,names,n_deseq,n_samples_mirna_names,n_LHE_names): - - - ini_sam=read(path,0) - new_main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]] - unique_seq=[] - unique_seq = [x for x in new_main_sam if x[1] == '4' and len(x[9])>=18 and len(x[9])<=26] - uni_seq=[] - - # Calculate the shifted positions for every isomir and add them to the name of it - sorted_uni_arms = [] - for i in range(1,len(mature_mirnas),2): - tmp_count_reads = 0 # calculate the total number of reads - tmp_count_seq = 0 # calculate the total number of sequences - - for j in range(len(unique_seq)): - - temp_mature = mature_mirnas[i].strip().replace("U", "T") - - if temp_mature in unique_seq[j][9]: - - off_part = longestSubstring(temp_mature, unique_seq[j][9]) - - mat_diff = temp_mature.split(off_part) - mat_diff = [len(mat_diff[0]), len(mat_diff[1])] - - unique_diff = unique_seq[j][9].split(off_part) - if len(unique_diff)<=2: - unique_diff = [len(unique_diff[0]), len(unique_diff[1])] - - pre_pos = mat_diff[0]-unique_diff[0] - post_pos = unique_diff[1]-mat_diff[1] - - lengthofmir = len(off_part) + post_pos - if pre_pos == 0: - tmp_count_reads = tmp_count_reads + int(unique_seq[j][0].split("-")[1]) - tmp_count_seq = tmp_count_seq + 1 - - if pre_pos == 0: - t_name=copy.deepcopy(unique_seq[j]) - t_name[2]=mature_mirnas[i - 1] + "__" + str(pre_pos) + "_" + '{:+d}'.format(post_pos) + "_" + str(unique_seq[j][9][len(off_part):]) - uni_seq.append(t_name) - - if tmp_count_reads != 0 and tmp_count_seq != 0: - sorted_uni_arms.append([mature_mirnas[i-1], tmp_count_seq, tmp_count_reads]) - - - sorted_uni_arms = sorted(sorted_uni_arms, key=lambda x: x[1], reverse=True) - unique_seq = list(map(list, OrderedDict.fromkeys(map(tuple,uni_seq)))) - - LHE=[] - - l.acquire() - if con=="c": - LHE.extend(x[2] for x in unique_seq if x[2]!="*") - for x in unique_seq: - if x[2]!="*": - n_samples_mirna_names.append([x[2],x[9]]) - n_deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in unique_seq if x[2]!="*"]) - n_LHE_names.extend(LHE) - names.append(name) - data.append([con,name,unique_seq,sorted_uni_arms]) - - - if con=="t": - LHE.extend(x[2] for x in unique_seq if x[2]!="*") - for x in unique_seq: - if x[2]!="*": - n_samples_mirna_names.append([x[2],x[9]]) - n_deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in unique_seq if x[2]!="*"]) - n_LHE_names.extend(LHE) - names.append(name) - data.append([con,name,unique_seq,sorted_uni_arms]) - l.release() - -################################################################################################################################################################################################################# - -def deseq2_temp(samples_mirna_names,deseq,con,l): - - samples_mirna_names.sort(key=lambda x:[0]) - - for i in range(len(deseq)): - for y in samples_mirna_names: - flag = 0 - for x in deseq[i]: - if y[0] == x[0]: - flag = 1 - break - - if flag == 0: - deseq[i].append([y[0], "0", y[1]]) - - [deseq[i].sort(key=lambda x: x[0]) for i, _ in enumerate(deseq)] - deseq_final = [[x[0],x[2]] for x in deseq[0]] - [deseq_final[z].append(deseq[i][j][1]) for z,_ in enumerate(deseq_final) for i, _ in enumerate(deseq) for j,_ in enumerate(deseq[i]) if deseq_final[z][0] == deseq[i][j][0]] - - l.acquire() - if con=="c": - q1.put(deseq_final) - - if con=="t": - q2.put(deseq_final) - l.release() - - -################################################################################################################################################################################################################## - -def main_temp(LH2E, LH2E_names, LH8E, LH8E_names,flag,names_con,names_tre,filter_LH8E,filter_LH2E,raw_LH8E,raw_LH2E,per,count): - - LH8E_add_names = [x for x in LH2E_names if x not in LH8E_names] - LH2E_add_names = [x for x in LH8E_names if x not in LH2E_names] - - LH8E_add_names.sort() - LH2E_add_names.sort() - LH8E_add_names = list(LH8E_add_names for LH8E_add_names,_ in itertools.groupby(LH8E_add_names)) - LH2E_add_names = list(LH2E_add_names for LH2E_add_names,_ in itertools.groupby(LH2E_add_names)) - - LH2E.sort() - LH8E.sort() - LH2E = list(LH2E for LH2E,_ in itertools.groupby(LH2E)) - LH8E = list(LH8E for LH8E,_ in itertools.groupby(LH8E)) - - zeros=["0"]*(len(LH8E[0])-2) - [LH8E_add_names[i].extend(zeros) for i,_ in enumerate(LH8E_add_names)] - LH8E=LH8E+LH8E_add_names - - zeros=["0"]*(len(LH2E[0])-2) - [LH2E_add_names[i].extend(zeros) for i,_ in enumerate(LH2E_add_names)] - LH2E=LH2E+LH2E_add_names - - dupes=[] - final_LH2E =[] - - for num,_ in enumerate(LH2E): - - if LH2E[num][1] not in final_LH2E and LH2E[num][0] not in final_LH2E: - final_LH2E.append(LH2E[num][1]) - final_LH2E.append(LH2E[num][0]) - else: - dupes.append(LH2E[num][1]) - - dupes=list(set(dupes)) - - dupes=[[x] for x in dupes] - - for x in LH2E: - for y in dupes: - if x[1]==y[0]: - fl=0 - if len(y)==1: - y.append(x[0]) - else: - for i in range(1,len(y)): - if y[i].split("_")[0]+"_"+y[i].split("_")[1]==x[0].split("_")[0]+"_"+x[0].split("_")[1]: - fl=1 - if len(x[0])<len(y[i]): - del y[i] - y.append(x[0]) - break - - if fl==0: - y.append((x[0])) - - for y in dupes: - if len(y)>2: - for i in range(len(y)-1,1,-1): - y[1]=y[1]+"/"+y[i] - del y[i] - - - for x in LH2E: - for y in dupes: - if x[1]==y[0]: - x[0]=y[1] - - for x in LH8E: - for y in dupes: - if x[1]==y[0]: - x[0]=y[1] - - - - LH2E.sort() - LH2E=list(LH2E for LH2E,_ in itertools.groupby(LH2E)) - - LH8E.sort() - LH8E=list(LH8E for LH8E,_ in itertools.groupby(LH8E)) - - if int(per)!=-1: - percent=int(per)/100 - - c_col_filter=round(percent*(len(LH2E[1])-2)) - t_col_filter=round(percent*(len(LH8E[1])-2)) - - for i, _ in enumerate(LH2E): - c_cols=0 - t_cols=0 - - c_cols=sum([1 for j in range(len(LH2E[i])-2) if int(LH2E[i][j+2])>=int(count)]) - t_cols=sum([1 for j in range(len(LH8E[i])-2) if int(LH8E[i][j+2])>=int(count)]) - - if c_cols>=c_col_filter or t_cols>=t_col_filter: - filter_LH8E.append(LH8E[i]) - filter_LH2E.append(LH2E[i]) - - raw_LH2E.extend(LH2E) - raw_LH8E.extend(LH8E) - -################################################################################################################################################################################################################## - -def write_main(raw_LH2E, raw_LH8E, fil_LH2E, fil_LH8E, names_con, names_tre, flag, per, n1, n2): - - if flag == 1 and int(per)!=-1: - fp = open('Counts/Filtered '+n2 +' Templated Counts', 'w') - fp.write("Name\t") - fp.write("Sequence") - for y in names_tre: - fp.write("\t"+y) - - for x in fil_LH8E: - fp.write("\n%s" % "\t".join(x)) - fp.close() - - fp = open('Counts/Filtered '+n1+' Templated Counts', 'w') - fp.write("Name\t") - fp.write("Sequence") - for y in names_con: - fp.write("\t"+y) - - for x in fil_LH2E: - fp.write("\n%s" % "\t".join(x)) - fp.close() - - - if flag == 2 and int(per)!=-1: - fp = open('Counts/Filtered '+n2+' Non-Templated Counts', 'w') - fp.write("Name\t") - fp.write("Sequence") - for y in names_tre: - fp.write("\t"+y) - - - for x in fil_LH8E: - fp.write("\n%s" % "\t".join(x)) - fp.close() - - fp = open('Counts/Filtered '+n1+' Non-Templated Counts', 'w') - fp.write("Name\t") - fp.write("Sequence") - for y in names_con: - fp.write("\t"+y) - - for x in fil_LH2E: - fp.write("\n%s" % "\t".join(x)) - fp.close() - - - if flag == 1: - fp = open('Counts/Raw '+n2+' Templated Counts', 'w') - fp.write("Name\t") - fp.write("Sequence") - for y in names_tre: - fp.write("\t"+y) - - for x in raw_LH8E: - fp.write("\n%s" % "\t".join(x)) - fp.close() - - fp = open('Counts/Raw '+n1+' Templated Counts', 'w') - fp.write("Name\t") - fp.write("Sequence") - for y in names_con: - fp.write("\t"+y) - - for x in raw_LH2E: - fp.write("\n%s" % "\t".join(x)) - fp.close() - - if flag == 2: - fp = open('Counts/Raw '+n2+' Non-Templated Counts', 'w') - fp.write("Name\t") - fp.write("Sequence") - for y in names_tre: - fp.write("\t"+y) - - - for x in raw_LH8E: - fp.write("\n%s" % "\t".join(x)) - fp.close() - - fp = open('Counts/Raw '+n1+' Non-Templated Counts', 'w') - fp.write("Name\t") - fp.write("Sequence") - for y in names_con: - fp.write("\t"+y) - - for x in raw_LH2E: - fp.write("\n%s" % "\t".join(x)) - fp.close() - -#################################################################################################################################################################################################################### - -def ssamples(names,samp,folder,pro): - - for i in range(2,len(samp[0])): - - fp = open(folder+names[i-2]+'.txt','w') - fp.write("miRNA id"+"\t"+names[i-2]+"\n") - - for x in samp: - fp.write("%s" % "\t".join([x[0],x[i]])+"\n") - fp.close() - -#################################################################################################################################################################################################################### - -def DB_write(con,name,unique_seq,sorted_uni_arms,f): - - if f==1: - # Write a txt file with all the information - if con=="c": - fp = open('split1/'+name, 'w') - - fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence")) - if con=="t": - fp = open('split2/'+name, 'w') - fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence")) - - for i in range(len(sorted_uni_arms)): - temp = [] - for j in range(len(unique_seq)): - - if sorted_uni_arms[i][0] in (unique_seq[j][2].split("_")[0]+"_"+unique_seq[j][2].split("_")[1]): - - temp.append(unique_seq[j]) - - temp = sorted(temp, key=lambda x: int(x[0].split('-')[1]), reverse=True) - fp.write("*********************************************************************************************************\n") - fp.write("%-8s\t%-22s\t%-25s\t%-30s\t%s\n" % ("|",str(sorted_uni_arms[i][0]),"Sequence count = "+str(sorted_uni_arms[i][1]),"Total reads = "+str(sorted_uni_arms[i][2]),"|")) - fp.write("*********************************************************************************************************\n\n") - [fp.write("%-8s\t%-40s\t%s\n" % (x[0].split("-")[1], x[2],x[9])) for x in temp] - fp.write("\n" + "\n") - fp.close() - - if f==2: - - if con=="c": - fp = open('split3/'+name, 'w') - fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence")) - if con=="t": - fp = open('split4/'+name, 'w') - fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence")) - - for i in range(len(sorted_uni_arms)): - temp = [] - for j in range(len(unique_seq)): - if sorted_uni_arms[i][0]==unique_seq[j][2].split("__")[0]: - temp.append(unique_seq[j]) - if temp!=[]: - temp = sorted(temp, key=lambda x: int(x[0].split('-')[1]), reverse=True) - fp.write("*********************************************************************************************************\n") - fp.write("%-8s\t%-22s\t%-25s\t%-30s\t%s\n" % ("|",str(sorted_uni_arms[i][0]),"Sequence count = "+str(sorted_uni_arms[i][1]),"Total reads = "+str(sorted_uni_arms[i][2]),"|")) - fp.write("*********************************************************************************************************\n\n") - [fp.write("%-8s\t%-40s\t%s\n" % (x[0].split("-")[1], x[2],x[9])) for x in temp] - fp.write("\n" + "\n") - fp.close() - - -########################################################################################################################## - -def new_mat_seq(pre_unique_seq,mat_mirnas,l): - - unique_iso = [] - for x in pre_unique_seq: - if len(x[2].split("_"))==3: - for y in pre_unique_seq: - if x[2] in y[2] and int(x[0].split("-")[1])<int(y[0].split("-")[1]): - if any(y[2] in lst2 for lst2 in unique_iso)==False: - y[2]=">"+y[2] - unique_iso.append(y) - l.acquire() - for x in unique_iso: - mat_mirnas.append(x[2]) - mat_mirnas.append(x[9]) - l.release() - -######################################################################################################################### - -def merging_names(LH2E_copy,new): - - dupes=[] - final_LH2E =[] - - for num in range(len(LH2E_copy)): - - if LH2E_copy[num][1] not in final_LH2E and LH2E_copy[num][0] not in final_LH2E: - final_LH2E.append(LH2E_copy[num][1]) - final_LH2E.append(LH2E_copy[num][0]) - else: - dupes.append(LH2E_copy[num][1]) - - dupes=list(set(dupes)) - - for i in range(len(dupes)): - dupes[i]=[dupes[i]] - - for x in LH2E_copy: - for y in dupes: - if x[1]==y[0]: - fl=0 - if len(y)==1: - y.append(x[0]) - else: - for i in range(1,len(y)): - if y[i].split("_")[0]+"_"+y[i].split("_")[1]==x[0].split("_")[0]+"_"+x[0].split("_")[1]: - fl=1 - if len(x[0])<len(y[i]): - del y[i] - y.append(x[0]) - break - - if fl==0: - y.append((x[0])) - - for y in dupes: - if len(y)>2: - for i in range(len(y)-1,1,-1): - y[1]=y[1]+"/"+y[i] - del y[i] - - - for x in LH2E_copy: - for y in dupes: - if x[1]==y[0]: - x[0]=y[1] - - LH2E_copy.sort() - LH2E_copy=list(LH2E_copy for LH2E_copy,_ in itertools.groupby(LH2E_copy)) - new.extend(LH2E_copy) - -###################################################################################################################################################### - -def ssamples1(tem_names,tem_samp,non_names,non_samp,folder,pro): - - for i in range(2,len(tem_samp[0])): - - fp = open(folder+tem_names[i-2]+'.txt','w') - fp.write("miRNA id"+"\t"+tem_names[i-2]+"\n") - - for x in tem_samp: - fp.write("%s" % "\t".join([x[0],x[i]])+"\n") - - for j in range(len(non_names)): - if non_names[j]==tem_names[i-2]: - for x in non_samp: - fp.write("%s" % "\t".join([x[0],x[j+2]])+"\n") - fp.close() - -################################################################################################################################################################################################################# - -def download_matures(matures,org_name): - - mature_mir=[] - - mat_url = 'http://mirgenedb.org/fasta/'+org_name+'?mat=1' - star_url = 'http://mirgenedb.org/fasta/'+org_name+'?star=1' - - data = urllib.request.urlopen(mat_url).read() - file_mirna = data.decode('utf-8') - mature_mir = file_mirna.split("\n") - mature_mir = [x.replace(">","") for x in mature_mir] - del mature_mir[-1] - - data = urllib.request.urlopen(star_url).read() - file_mirna = data.decode('utf-8') - star_mir = file_mirna.split("\n") - star_mir = [x.replace(">","") for x in star_mir] - del star_mir[-1] - - mature_mir.extend(star_mir) - - for i in range(1,len(mature_mir),2): - mature_mir[i]=mature_mir[i].replace("U","T") - - matures.extend(mature_mir) - -################################################################################################################### - -def non_template_ref(sc,st,all_isoforms): - - pre_uni_seq_con = list(sc) - pre_uni_seq_tre = list(st) - - for x in pre_uni_seq_con: - for y in x: - if y[2] not in all_isoforms and len(y[2].split("_"))>2: - all_isoforms.append(y[2]) - all_isoforms.append(y[9]) - - for x in pre_uni_seq_tre: - for y in x: - if y[2] not in all_isoforms and len(y[2].split("_"))>2: - all_isoforms.append(y[2]) - all_isoforms.append(y[9]) - -################################################################################################################################################################################################ - -def deseqe2(sample,mir_names,l,new_d,sample_name,sample_order): - - for y in mir_names: - flag=0 - for x in sample: - if y[0]==x[0]: - flag=1 - break - if flag==0: - sample.append([y[0],"0",y[1]]) - - sample.sort(key=lambda x: x[0]) - sample=list(sample for sample,_ in itertools.groupby(sample)) - - l.acquire() - new_d.append(sample) - sample_order.append(sample_name) - l.release() - -############################################################################################################################################################################################### -
--- a/mirgene_graphs.py Wed Oct 28 08:13:30 2020 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,847 +0,0 @@ -import itertools -import time -import sys -import os -import urllib.request -import gzip -from multiprocessing import Process, Queue, Lock, Pool, Manager, Value -import subprocess -import argparse -from collections import OrderedDict -from matplotlib.backends.backend_pdf import PdfPages -import pandas as pd -from math import pi -import numpy as np -import matplotlib.pyplot as plt -from matplotlib.ticker import PercentFormatter -import seaborn as sns -import scipy.stats as stats -from plotnine import * -import math -import re -import matplotlib.ticker as mtick -import copy - - -######################################################################################################################### -def pie_non_temp(merge_LH2E,merge_non_LH2E,merge_LH8E,merge_non_LH8E,c_unmap,t_unmap,c_unmap_counts,t_unmap_counts): - - c_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_LH2E] - t_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_LH8E] - c_non_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_non_LH2E] - t_non_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_non_LH8E] - - c_templ = 0 - c_tem_counts = 0 - c_mature = 0 - c_mat_counts = 0 - t_templ = 0 - t_tem_counts = 0 - t_mature = 0 - t_mat_counts = 0 - - c_non = len(c_non_samples) - c_non_counts = sum(x[2] for x in c_non_samples) - t_non = len(t_non_samples) - t_non_counts = sum(x[2] for x in t_non_samples) - - c_unmap = c_unmap - c_non - t_unmap = c_unmap - t_non - - c_unmap_counts=c_unmap_counts - c_non_counts - t_unmap_counts=t_unmap_counts - t_non_counts - - - for x in c_samples: - - if "/" not in x[0]: - if len(x[0].split("_"))==2: - c_mature+=1 - c_mat_counts += x[2] - else: - c_templ+=1 - c_tem_counts += x[2] - else: - f=0 - for y in x[0].split("/"): - if len(y.split("_"))==2: - c_mature+=1 - c_mat_counts += x[2] - f=1 - break - if f==0: - c_templ+=1 - c_tem_counts += x[2] - - for x in t_samples: - - if "/" not in x[0]: - if len(x[0].split("_"))==2: - t_mature+=1 - t_mat_counts += x[2] - else: - t_templ+=1 - t_tem_counts += x[2] - else: - f=0 - for y in x[0].split("/"): - if len(y.split("_"))==2: - t_mature+=1 - t_mat_counts += x[2] - f=1 - break - if f==0: - t_templ+=1 - t_tem_counts += x[2] - - fig = plt.figure(figsize=(7,5)) - - labels = 'miRNA RefSeq','Template', 'Unmapped','Non-template' - sizes = [c_mat_counts, c_tem_counts, c_unmap_counts,c_non_counts] - colors = ['gold', 'yellowgreen', 'lightcoral', 'lightskyblue'] - # Plot - ax1 = plt.subplot2grid((1,2),(0,0)) - patches, texts, autotexts=plt.pie(sizes, labels=labels, colors=colors, startangle=140,autopct='%1.1f%%',radius=0.8) - [x.set_fontsize(8) for x in texts] - plt.title('Control Group (reads)',fontsize=12) - - labels = 'miRNA RefSeq','Template', 'Unmapped','non-template' - sizes = [t_mat_counts, t_tem_counts, t_unmap_counts, t_non_counts] - colors = ['gold', 'yellowgreen', 'lightcoral', 'lightskyblue'] - # Plot - ax2 = plt.subplot2grid((1,2),(0,1)) - patches, texts, autotexts=plt.pie(sizes, labels=labels, colors=colors, startangle=140,autopct='%1.1f%%',radius=0.8) - [x.set_fontsize(8) for x in texts] - plt.title('Treated Group (reads)', fontsize=12) - plt.savefig('pie_non.png',dpi=300) - -#################################################################################################################################################################################################################### - -def pie_temp(merge_LH2E,c_unmap,c_unmap_counts,merge_LH8E,t_unmap,t_unmap_counts): - - c_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_LH2E] - t_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_LH8E] - - c_templ = 0 - c_tem_counts = 0 - c_mature = 0 - c_mat_counts = 0 - t_templ = 0 - t_tem_counts = 0 - t_mature = 0 - t_mat_counts = 0 - - for x in c_samples: - - if "/" not in x[0]: - if len(x[0].split("_"))==2: - c_mature+=1 - c_mat_counts += x[2] - else: - c_templ+=1 - c_tem_counts += x[2] - else: - f=0 - for y in x[0].split("/"): - if len(y.split("_"))==2: - c_mature+=1 - c_mat_counts += x[2] - f=1 - break - if f==0: - c_templ+=1 - c_tem_counts += x[2] - - for x in t_samples: - - if "/" not in x[0]: - if len(x[0].split("_"))==2: - t_mature+=1 - t_mat_counts += x[2] - else: - t_templ+=1 - t_tem_counts += x[2] - else: - f=0 - for y in x[0].split("/"): - if len(y.split("_"))==2: - t_mature+=1 - t_mat_counts += x[2] - f=1 - break - if f==0: - t_templ+=1 - t_tem_counts += x[2] - - - fig = plt.figure() - labels = 'miRNA RefSeq','Template', 'Unmapped' - sizes = [c_mat_counts, c_tem_counts, c_unmap_counts] - colors = ['gold', 'yellowgreen', 'lightskyblue'] - explode = (0.2, 0.05, 0.1) - # Plot - ax1 = plt.subplot2grid((1,2),(0,0)) - patches, texts, autotexts=plt.pie(sizes, labels=labels, colors=colors, startangle=140,autopct='%1.1f%%',radius=0.8) - [x.set_fontsize(8) for x in texts] - plt.title('Control group (reads)', fontsize=12) - labels = 'miRNA RefSeq','Template', 'Unmapped' - sizes = [t_mat_counts, t_tem_counts, t_unmap_counts] - colors = ['gold', 'yellowgreen', 'lightskyblue'] - explode = (0.2, 0.05, 0.1) - # Plot - ax2 = plt.subplot2grid((1,2),(0,1)) - patches, texts, autotexts=plt.pie(sizes, labels=labels, colors=colors, startangle=140,autopct='%1.1f%%',radius=0.8) - [x.set_fontsize(8) for x in texts] - plt.title('Treated group (reads)',fontsize = 12) - plt.savefig('pie_tem.png',dpi=300) - -################################################################################################################################################################################################################### - -def make_spider(merge_LH2E,merge_LH8E): - - c_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_LH2E] - t_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_LH8E] - - c_5 = 0 - c_5_counts = 0 - c_3 = 0 - c_3_counts = 0 - c_both =0 - c_both_counts=0 - c_mature = 0 - c_mat_counts = 0 - c_exception=0 - c_exception_counts=0 - - - t_5 = 0 - t_5_counts = 0 - t_3 = 0 - t_3_counts = 0 - t_both = 0 - t_both_counts = 0 - t_mature = 0 - t_mat_counts = 0 - t_exception = 0 - t_exception_counts=0 - - for x in c_samples: - - if "/" not in x[0]: - if len(x[0].split("_"))==2: - c_mature+=1 - c_mat_counts += x[2] - elif 0 == int(x[0].split("_")[3]): - c_5+=1 - c_5_counts += x[2] - elif 0 == int(x[0].split("_")[2]): - c_3+=1 - c_3_counts += x[2] - else: - c_both+=1 - c_both_counts+=x[2] - - else: - f=0 - for y in x[0].split("/"): - if len(y.split("_"))==2: - c_mature+=1 - c_mat_counts += x[2] - f=1 - break - if f==0: - part=x[0].split("/")[0].split("_")[-2]+"_"+x[0].split("/")[0].split("_")[-1] - num=0 - for y in x[0].split("/"): - if y.split("_")[-2]+"_"+y.split("_")[-1]==part: - num+=1 - if num==len(x[0].split("/")): - - if 0 == int(x[0].split("/")[0].split("_")[3]): - c_5+=1 - c_5_counts += x[2] - elif 0 == int(x[0].split("/")[0].split("_")[2]): - c_3+=1 - c_3_counts += x[2] - else: - c_both+=1 - c_both_counts+=x[2] - else: - c_exception+=1 - c_exception_counts += x[2] - - - for x in t_samples: - - if "/" not in x[0]: - if len(x[0].split("_"))==2: - t_mature+=1 - t_mat_counts += x[2] - elif 0 == int(x[0].split("_")[3]): - t_5+=1 - t_5_counts += x[2] - elif 0 == int(x[0].split("_")[2]): - t_3+=1 - t_3_counts += x[2] - else: - t_both+=1 - t_both_counts+=x[2] - - else: - f=0 - for y in x[0].split("/"): - if len(y.split("_"))==2: - t_mature+=1 - t_mat_counts += x[2] - f=1 - break - if f==0: - part=x[0].split("/")[0].split("_")[-2]+"_"+x[0].split("/")[0].split("_")[-1] - num=0 - for y in x[0].split("/"): - if y.split("_")[-2]+"_"+y.split("_")[-1]==part: - num+=1 - if num==len(x[0].split("/")): - - if 0 == int(x[0].split("/")[0].split("_")[3]): - t_5+=1 - t_5_counts += x[2] - elif 0 == int(x[0].split("/")[0].split("_")[2]): - t_3+=1 - t_3_counts += x[2] - else: - t_both+=1 - t_both_counts+=x[2] - else: - t_exception+=1 - t_exception_counts += x[2] - - - c_all = c_5+c_3+c_both+c_mature+c_exception - c_all_counts = c_5_counts + c_3_counts + c_both_counts + c_mat_counts + c_exception_counts - - t_all = t_5+t_3+t_both+t_mature + t_exception - t_all_counts = t_5_counts + t_3_counts + t_both_counts + t_mat_counts + t_exception_counts - - c_5 = round(c_5/c_all*100,2) - c_3 = round(c_3/c_all*100,2) - c_both = round(c_both/c_all*100,2) - c_mature = round(c_mature/c_all*100,2) - c_exception = round(c_exception/c_all*100,2) - - c_5_counts = round(c_5_counts/c_all_counts*100,2) - c_3_counts = round(c_3_counts/c_all_counts*100,2) - c_both_counts = round(c_both_counts/c_all_counts*100,2) - c_mat_counts = round(c_mat_counts/c_all_counts*100,2) - c_exception_counts = round(c_exception_counts/c_all_counts*100,2) - - t_5 = round(t_5/t_all*100,2) - t_3 = round(t_3/t_all*100,2) - t_both = round(t_both/t_all*100,2) - t_mature = round(t_mature/t_all*100,2) - t_exception = round(t_exception/t_all*100,2) - - t_5_counts = round(t_5_counts/t_all_counts*100,2) - t_3_counts = round(t_3_counts/t_all_counts*100,2) - t_both_counts = round(t_both_counts/t_all_counts*100,2) - t_mat_counts = round(t_mat_counts/t_all_counts*100,2) - t_exception_counts = round(t_exception_counts/t_all_counts*100,2) - - radar_max = max(c_5, c_3, c_both,c_mature,c_exception,t_5,t_3,t_both,t_mature,t_exception) - radar_max_counts = max(c_5_counts,c_3_counts,c_both_counts,c_mat_counts,c_exception_counts,t_5_counts,t_3_counts,t_both_counts,t_mat_counts,t_exception_counts) - - df=pd.DataFrame({ - 'group':['Controls','Treated'], - """5' and 3' isomiRs""":[c_both,t_both], - """3' isomiRs""":[c_3,t_3], - 'miRNA RefSeq':[c_mature,t_mature], - """5' isomiRs""":[c_5,t_5], - 'Others*':[c_exception,t_exception]}) - - df1=pd.DataFrame({ - 'group':['Controls','Treated'], - """5' and 3' isomiRs""":[c_both_counts,t_both_counts], - """3' isomiRs""":[c_3_counts,t_3_counts], - 'miRNA RefSeq':[c_mat_counts,t_mat_counts], - """5' isomiRs""":[c_5_counts,t_5_counts], - 'Others*':[c_exception_counts,t_exception_counts]}) - - spider_last(df,radar_max,1) - spider_last(df1,radar_max_counts,2) - - -############################################################################################################################################################################################################## - -def spider_last(df,radar_max,flag): - # ------- PART 1: Create background - fig = plt.figure() - # number of variable - categories=list(df)[1:] - N = len(categories) - - # What will be the angle of each axis in the plot? (we divide the plot / number of variable) - angles = [n / float(N) * 2 * pi for n in range(N)] - angles += angles[:1] - - # Initialise the spider plot - ax = plt.subplot(111, polar=True) - - # If you want the first axis to be on top: - ax.set_theta_offset(pi/2) - ax.set_theta_direction(-1) - - # Draw one axe per variable + add labels labels yet - plt.xticks(angles[:-1], categories, fontsize=11) - - # Draw ylabels - radar_max=round(radar_max+radar_max*0.1) - mul=len(str(radar_max))-1 - maxi=int(math.ceil(radar_max / pow(10,mul))) * pow(10,mul) - sep = round(maxi/4) - plt.yticks([sep, 2*sep, 3*sep, 4*sep, 5*sep], [str(sep)+'%', str(2*sep)+'%', str(3*sep)+'%', str(4*sep)+'%', str(5*sep)+'%'], color="grey", size=10) - plt.ylim(0, maxi) - - # ------- PART 2: Add plots - - # Plot each individual = each line of the data - # I don't do a loop, because plotting more than 3 groups makes the chart unreadable - - # Ind1 - values=df.loc[0].drop('group').values.flatten().tolist() - values += values[:1] - ax.plot(angles, values,'-o', linewidth=1, linestyle='solid', label="Controls") - ax.fill(angles, values, 'b', alpha=0.1) - - # Ind2 - values=df.loc[1].drop('group').values.flatten().tolist() - values += values[:1] - ax.plot(angles, values, '-o' ,linewidth=1, linestyle='solid', label="Treated") - ax.fill(angles, values, 'r', alpha=0.1) - - # Add legend - if flag==1: - plt.legend(loc='upper right', bbox_to_anchor=(0.0, 0.1)) - plt.savefig('spider_non_red.png',dpi=300) - else: - plt.legend(loc='upper right', bbox_to_anchor=(0.0, 0.1)) - plt.savefig('spider_red.png',dpi=300) - -############################################################################################################################################################################################################# - -def hist_red(samples,flag): - - lengths=[] - cat=[] - total_reads=0 - seq=[] - - if flag == "c": - title = "Length Distribution of Control group (Redudant reads)" - if flag == "t": - title = "Length Distribution of Treated group (Redudant reads)" - - for i in samples: - for x in i: - lengths.append(len(x[9])) - if x[1]=="0": - seq.append([x[9],x[0].split("-")[1],"Mapped"]) - cat.append("Mapped") - if x[1] == "4": - seq.append([x[9],x[0].split("-")[1],"Unmapped"]) - cat.append("Unmapped") - - uni_len=list(set(lengths)) - uni_len=[x for x in uni_len if x<=35] - low=min(lengths) - up=max(lengths) - seq.sort() - uni_seq=list(seq for seq,_ in itertools.groupby(seq)) - dim=up-low - - if dim>20: - s=5 - else: - s=8 - total_reads+=sum([int(x[1]) for x in uni_seq]) - - map_reads=[] - unmap_reads=[] - length=[] - for y in uni_len: - map_temp=0 - unmap_temp=0 - for x in uni_seq: - if len(x[0])==y and x[2]=="Mapped": - map_temp+=int(x[1]) - if len(x[0])==y and x[2]=="Unmapped": - unmap_temp+=int(x[1]) - if y<=35: - length.append(y) - map_reads.append(round(map_temp/total_reads*100,2)) - unmap_reads.append(round(unmap_temp/total_reads*100,2)) - - ylim=max([sum(x) for x in zip(unmap_reads, map_reads)]) - ylim=ylim+ylim*20/100 - fig, ax = plt.subplots() - width=0.8 - ax.bar(length, unmap_reads, width, label='Unmapped') - h=ax.bar(length, map_reads, width, bottom=unmap_reads, label='Mapped') - plt.xticks(np.arange(length[0], length[-1]+1, 1)) - plt.xlabel('Length (nt)',fontsize=14) - plt.ylabel('Percentage',fontsize=14) - plt.title(title,fontsize=14) - ax.legend() - plt.ylim([0, ylim]) - ax.grid(axis='y',linewidth=0.2) - - if flag=='c': - plt.savefig('c_hist_red.png',dpi=300) - - if flag=='t': - plt.savefig('t_hist_red.png',dpi=300) - -############################################################################################################################################################################################################################################## - -def logo_seq_red(merge, flag): - - if flag=="c": - titlos="Control group (Redundant)" - file_logo="c_logo.png" - file_bar="c_bar.png" - if flag=="t": - titlos="Treated group (Redundant)" - file_logo="t_logo.png" - file_bar="t_bar.png" - - c_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge] - - A=[0]*5 - C=[0]*5 - G=[0]*5 - T=[0]*5 - total_reads=0 - - for y in c_samples: - if "/" in y[0]: - length=[] - for x in y[0].split("/"): - length.append([len(x.split("_")[-1]),x.split("_")[-1],y[2]]) - - best=min(length) - total_reads+=best[2] - for i in range(5): - if i<len(best[1]): - if best[1][i] == "A": - A[i]+=best[2] - elif best[1][i] == "C": - C[i]+=best[2] - elif best[1][i] == "G": - G[i]+=best[2] - else: - T[i]+=best[2] - else: - total_reads+=y[2] - for i in range(5): - if i<len(y[0].split("_")[-1]): - if y[0].split("_")[-1][i] == "A": - A[i]+=(y[2]) - elif y[0].split("_")[-1][i] == "C": - C[i]+=(y[2]) - elif y[0].split("_")[-1][i] == "G": - G[i]+=(y[2]) - else: - T[i]+=y[2] - - A[:] = [round(x*100,1) / total_reads for x in A] - C[:] = [round(x*100,1) / total_reads for x in C] - G[:] = [round(x*100,1) / total_reads for x in G] - T[:] = [round(x*100,1) / total_reads for x in T] - - data = {'A':A,'C':C,'G':G,'T':T} - df = pd.DataFrame(data, index=[1,2,3,4,5]) - - h=df.plot.bar(color=tuple(["g", "b","gold","r"]) ) - h.grid(axis='y',linewidth=0.2) - plt.xticks(rotation=0, ha="right") - plt.ylabel("Counts (%)",fontsize=18) - plt.xlabel("Positions (nt)",fontsize=18) - plt.title(titlos,fontsize=20) - plt.tight_layout() - plt.savefig(file_bar, dpi=300) - - import logomaker as lm - crp_logo = lm.Logo(df, font_name = 'DejaVu Sans') - - crp_logo.style_spines(visible=False) - crp_logo.style_spines(spines=['left', 'bottom'], visible=True) - crp_logo.style_xticks(rotation=0, fmt='%d', anchor=0) - crp_logo.ax.set_title(titlos,fontsize=18) - crp_logo.ax.set_ylabel("Counts (%)", fontsize=16,labelpad=5) - crp_logo.ax.set_xlabel("Positions (nt)",fontsize=16, labelpad=5) - crp_logo.ax.xaxis.set_ticks_position('none') - crp_logo.ax.xaxis.set_tick_params(pad=-1) - figure = plt.gcf() - figure.set_size_inches(6, 4) - crp_logo.fig.savefig(file_logo,dpi=300) - -########################################################################################################################################################################################################## - -def logo_seq_non_red(merge_LH2E): - - c_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_LH2E] - - A=[0]*5 - C=[0]*5 - G=[0]*5 - T=[0]*5 - - for y in c_samples: - if "/" in y[0]: - length=[] - for x in y[0].split("/"): - length.append([len(x.split("_")[-1]),x.split("_")[-1],y[2]]) - - best=min(length) - for i in range(5): - if i<len(best[1]): - if best[1][i] == "A": - A[i]+=1 - elif best[1][i] == "C": - C[i]+=1 - elif best[1][i] == "G": - G[i]+=1 - else: - T[i]+=1 - else: - for i in range(5): - if i<len(y[0].split("_")[-1]): - if y[0].split("_")[-1][i] == "A": - A[i]+=1 - elif y[0].split("_")[-1][i] == "C": - C[i]+=1 - elif y[0].split("_")[-1][i] == "G": - G[i]+=1 - else: - T[i]+=1 - - data = {'A':A,'C':C,'G':G,'T':T} - df = pd.DataFrame(data, index=[1,2,3,4,5]) - h=df.plot.bar(title="Non-templated nucleotides after templated sequence",color=tuple(["g", "b","gold","r"])) - h.set_xlabel("Positions (nt)") - h.set_ylabel("Unique sequences") - plt.xticks(rotation=0, ha="right") - plt.tight_layout() - plt.savefig("bar2.png", dpi=300) - - - import logomaker as lm - crp_logo = lm.Logo(df, font_name = 'DejaVu Sans') - - crp_logo.style_spines(visible=False) - crp_logo.style_spines(spines=['left', 'bottom'], visible=True) - crp_logo.style_xticks(rotation=0, fmt='%d', anchor=0) - - crp_logo.ax.set_ylabel("Unique sequences", labelpad=5) - crp_logo.ax.set_xlabel("Positions (nt)", labelpad=5) - crp_logo.ax.xaxis.set_ticks_position('none') - crp_logo.ax.xaxis.set_tick_params(pad=-1) - crp_logo.ax.set_title("Non-redundant") - figure = plt.gcf() - crp_logo.fig.savefig('logo2.png', dpi=300) - - -################################################################################################################################################################################################################ - - - - - -################################################################################################################################################################ - -def pdf_before_DE(analysis): - - # Import FPDF class - from fpdf import FPDF, fpdf - - # Import glob module to find all the files matching a pattern - import glob - - # Image extensions - if analysis=="2": - image_extensions = ("c_hist_red.png","t_hist_red.png","pie_non.png","spider_red.png","spider_non_red.png","c_logo.png","t_logo.png","c_bar.png","t_bar.png") - else: - image_extensions = ("c_hist_red.png","t_hist_red.png","pie_tem.png","spider_red.png","spider_non_red.png") - # This list will hold the images file names - images = [] - - # Build the image list by merging the glob results (a list of files) - # for each extension. We are taking images from current folder. - for extension in image_extensions: - images.extend(glob.glob(extension)) - #sys.exit(images) - # Create instance of FPDF class - pdf = FPDF('P', 'in', 'A4') - # Add new page. Without this you cannot create the document. - pdf.add_page() - # Set font to Arial, 'B'old, 16 pts - pdf.set_font('Arial', 'B', 20.0) - - # Page header - pdf.cell(pdf.w-0.5, 0.5, 'IsomiR Profile Report',align='C') - pdf.ln(0.7) - pdf.set_font('Arial','', 16.0) - pdf.cell(pdf.w-0.5, 0.5, 'sRNA Length Distribution',align='C') - - # Smaller font for image captions - pdf.set_font('Arial', '', 11.0) - - # Image caption - pdf.ln(0.5) - - yh=FPDF.get_y(pdf) - pdf.image(images[0],x=0.3,w=4, h=3) - pdf.image(images[1],x=4,y=yh, w=4, h=3) - pdf.ln(0.3) - - # Image caption - pdf.cell(0.2) - pdf.cell(3.0, 0.0, " Mapped and unmapped reads to custom precussor arm reference DB (5p and 3p arms) in Control (left)") - pdf.ln(0.2) - pdf.cell(0.2) - pdf.cell(3.0, 0.0, " and Treated (right) groups") - - - pdf.ln(0.5) - h1=FPDF.get_y(pdf) - pdf.image(images[2],x=1, w=6.5, h=5) - h2=FPDF.get_y(pdf) - FPDF.set_y(pdf,h1+0.2) - pdf.set_font('Arial','', 14.0) - pdf.cell(pdf.w-0.5, 0.5, 'Template and non-template IsomiRs',align='C') - pdf.set_font('Arial', '', 11.0) - FPDF.set_y(pdf,h2) - FPDF.set_y(pdf,9.5) - # Image caption - pdf.cell(0.2) - if analysis=="2": - pdf.cell(3.0, 0.0, " Template, non-template, miRNA RefSeq and unmapped sequences as percentage of total sRNA reads") - pdf.ln(0.2) - pdf.cell(0.2) - pdf.cell(3.0, 0.0, " in Control (left) and treated (right) groups") - else: - pdf.cell(3.0, 0.0, " Template, miRNA RefSeq and unmapped sequences as percentage of total sRNA reads in") - pdf.ln(0.2) - pdf.cell(0.2) - pdf.cell(3.0, 0.0, " Control (left) and treated (right) groups") - - - - pdf.add_page() - pdf.set_font('Arial', 'B', 16.0) - pdf.cell(pdf.w-0.5, 0.5, "Reference form and isomiR among total miRNA reads",align='C') - pdf.ln(0.7) - pdf.set_font('Arial', 'B', 12.0) - pdf.cell(pdf.w-0.5, 0.5, "Template isomiR profile (redundant)",align='C') - pdf.ln(0.5) - pdf.image(images[3],x=1.5, w=5.5, h=4) - pdf.ln(0.6) - pdf.cell(pdf.w-0.5, 0.0, "Template isomiR profile (non-redundant)",align='C') - pdf.set_font('Arial', '', 12.0) - pdf.ln(0.2) - pdf.image(images[4],x=1.5, w=5.5, h=4) - pdf.ln(0.3) - pdf.set_font('Arial', '', 11.0) - pdf.cell(0.2) - pdf.cell(3.0, 0.0, " * IsomiRs potentialy initiated from multiple loci") - - - if analysis=="2": - pdf.add_page('L') - - pdf.set_font('Arial', 'B', 16.0) - pdf.cell(pdf.w-0.5, 0.5, "Non-template IsomiRs",align='C') - pdf.ln(0.5) - pdf.set_font('Arial', 'B', 12.0) - pdf.cell(pdf.w-0.5, 0.5, "3' Additions of reference of isomiR sequence",align='C') - pdf.ln(0.7) - - yh=FPDF.get_y(pdf) - pdf.image(images[5],x=1.5,w=3.65, h=2.65) - pdf.image(images[7],x=6.5,y=yh, w=3.65, h=2.65) - pdf.ln(0.5) - yh=FPDF.get_y(pdf) - pdf.image(images[6],x=1.5,w=3.65, h=2.65) - pdf.image(images[8],x=6.5,y=yh, w=3.65, h=2.65) - - pdf.close() - pdf.output('report1.pdf','F') - - - - -#############################################################################################################################################################3 - -def pdf_after_DE(analysis): - - # Import FPDF class - from fpdf import FPDF - - # Import glob module to find all the files matching a pattern - import glob - - # Image extensions - image_extensions = ("tem.png","non.png","a2.png") - - # This list will hold the images file names - images = [] - - # Build the image list by merging the glob results (a list of files) - # for each extension. We are taking images from current folder. - for extension in image_extensions: - images.extend(glob.glob(extension)) - - # Create instance of FPDF class - pdf = FPDF('P', 'in', 'letter') - # Add new page. Without this you cannot create the document. - pdf.add_page() - # Set font to Arial, 'B'old, 16 pts - pdf.set_font('Arial', 'B', 16.0) - - # Page header - pdf.cell(pdf.w-0.5, 0.5, 'Differential expression of miRNAs and Isoforms',align='C') - #pdf.ln(0.25) - - pdf.ln(0.7) - pdf.set_font('Arial','B', 12.0) - pdf.cell(pdf.w-0.5, 0.5, 'Top 50 most differentially expressed miRNA and template isoforms',align='C') - - - # Smaller font for image captions - pdf.set_font('Arial', '', 10.0) - - # Image caption - pdf.ln(0.4) - pdf.image(images[0],x=0.8, w=7, h=8) - pdf.ln(0.3) - - if analysis=="2": - - pdf.add_page() - pdf.ln(0.7) - pdf.set_font('Arial','B', 12.0) - pdf.cell(pdf.w-0.5, 0.5, 'Top 50 most differentially expressed non-template isomiRs',align='C') - pdf.ln(0.4) - pdf.image(images[1],x=0.5, w=7.5, h=6.5) - - pdf.add_page() - pdf.ln(0.5) - pdf.cell(pdf.w-0.5, 0.5, 'Top 50 most differentially expressed miRNAs and isomiRs grouped by arm',align='C') - pdf.ln(0.4) - pdf.image(images[2],x=0.8, w=7, h=8) - pdf.ln(0.3) - - - pdf.output('report2.pdf', 'F') - - -
--- a/mirgene_ultra_v2.py Wed Oct 28 08:13:30 2020 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,365 +0,0 @@ -from mirgene_functions import * -from mirgene_graphs import * -import itertools -import time -import sys -import os -import urllib.request -import gzip -from multiprocessing import Process, Queue, Lock, Pool, Manager, Value -import subprocess -import argparse -from collections import OrderedDict -from matplotlib.backends.backend_pdf import PdfPages -import pandas as pd -from math import pi -import numpy as np -import matplotlib.pyplot as plt -from matplotlib.ticker import PercentFormatter -import seaborn as sns -import scipy.stats as stats -from plotnine import * -import math -import re -import matplotlib.ticker as mtick -import copy - -subprocess.call(['mkdir','-p', 'split1','split2','split3','split4','split11','split12','Counts','Diff/temp_con','Diff/temp_tre','Diff/n_temp_con','Diff/n_temp_tre']) - -parser = argparse.ArgumentParser() -parser.add_argument("-analysis", "--anal", help="choose type of analysis", action="store") -parser.add_argument("-con", "--control", help="input fastq file", nargs='+', default=[]) -parser.add_argument("-tre", "--treated", help="input fastq file", nargs='+', default=[] ) -parser.add_argument("-tool_dir", "--tool_directory", help="tool directory path", action="store") -parser.add_argument("-gen", "--org_name", help="tool directory path", action="store") -parser.add_argument("-program", "--pro", help="choose type of analysis", action="store") -parser.add_argument("-f", "--flag", help="choose the database", action="store") -parser.add_argument("-umis", "--umi", help="choose the database", action="store") -parser.add_argument("-percentage", "--per", help="choose the database", action="store") -parser.add_argument("-counts", "--count", help="choose the database", action="store") -parser.add_argument("-name1", "--n1", help="choose the database", action="store") -parser.add_argument("-name2", "--n2", help="choose the database", action="store") -args = parser.parse_args() - -#########################################################################3############################################################################################################################################################################################### - -if __name__ == '__main__': - - starttime = time.time() - - q1 = Queue() - q2 = Queue() - lock = Lock() - manager = Manager() - - mature_mirnas=manager.list() - ps_mature=Process(target=download_matures,args=(mature_mirnas,args.org_name)) - ps_mature.start() - - args.control[0]=args.control[0][1:] - args.control[len(args.control)-1][:-1] - control = [(args.control[i:i+2]) for i in range(0, len(args.control), 2)] - - args.treated[0]=args.treated[0][1:] - args.treated[len(args.treated)-1][:-1] - treated = [(args.treated[i:i+2]) for i in range(0, len(args.treated), 2)] - - -############## Detection of templated isoforms ################ - - radar = manager.list([0,0,0,0]) - samples = manager.list() - data= manager.list() - names_con=manager.list() - samples_mirna_names=manager.list() - deseq=manager.list() - unmap_seq=manager.Value('i',0) - unmap_counts=manager.Value('i',0) - LH2E_names=manager.list() - ini_c_samples = manager.list() - - - radar1 = manager.list([0,0,0,0]) - samples1 = manager.list() - data1 = manager.list() - names_tre = manager.list() - samples_mirna_names1=manager.list() - deseq1=manager.list() - unmap1_seq = manager.Value('i',0) - unmap1_counts = manager.Value('i',0) - LH8E_names=manager.list() - ini_t_samples = manager.list() - ps_mature.join() - - - mature_mirnas=list(mature_mirnas) - - starttime1 = time.time() - ps_sam = [Process(target=sam,args=(mature_mirnas,path[1][:-1],path[0].split(",")[0],"c",lock,samples,data,names_con,unmap_seq,samples_mirna_names,deseq,LH2E_names,"0",ini_c_samples,unmap_counts)) for path in control] - ps_sam.extend([Process(target=sam,args=(mature_mirnas,path[1][:-1],path[0].split(",")[0],"t",lock,samples1,data1,names_tre,unmap1_seq,samples_mirna_names1,deseq1,LH8E_names,"0",ini_t_samples,unmap1_counts)) for path in treated]) - - [p.start() for p in ps_sam] - [p.join() for p in ps_sam] - print('SAM took {} seconds'.format(time.time() - starttime1)) - - ps_hist=[Process(target=hist_red,args=(ini_c_samples,'c'))] - ps_hist.extend([Process(target=hist_red,args=(ini_t_samples,'t'))]) - [x.start() for x in ps_hist] - - starttime200=time.time() - sc = list(samples) - st = list(samples1) - - names_con=list(names_con) - names_tre=list(names_tre) - samples_mirna_names=list(samples_mirna_names) - samples_mirna_names.sort() - samples_mirna_names=list(samples_mirna_names for samples_mirna_names,_ in itertools.groupby(samples_mirna_names)) - - samples_mirna_names1=list(samples_mirna_names1) - samples_mirna_names1.sort() - samples_mirna_names1=list(samples_mirna_names1 for samples_mirna_names1,_ in itertools.groupby(samples_mirna_names1)) - - deseq=list(deseq) - deseq1=list(deseq1) - - new_names_con=manager.list() - new_names_tre=manager.list() - new_deseq=manager.list() - new_deseq1=manager.list() - ps_deseq=[Process(target=deseqe2,args=(sampp,samples_mirna_names,lock,new_deseq,names_con[i],new_names_con)) for i,sampp in enumerate(deseq)] - ps_deseq.extend([Process(target=deseqe2,args=(sampp,samples_mirna_names1,lock,new_deseq1,names_tre[i],new_names_tre)) for i,sampp in enumerate(deseq1)]) - - [z.start() for z in ps_deseq] - [z.join() for z in ps_deseq] - new_deseq=list(new_deseq) - new_deseq1=list(new_deseq1) - - LH2E=[[x[0],x[2]] for x in new_deseq[0]] - [LH2E[i].append(y[i][1]) for i,_ in enumerate(LH2E) for y in new_deseq] - - LH8E=[[x[0],x[2]] for x in new_deseq1[0]] - [LH8E[i].append(y[i][1]) for i,_ in enumerate(LH8E) for y in new_deseq1] - - print('Deseq took {} seconds'.format(time.time() - starttime200)) - - merg_nam_LH2E=manager.list() - merg_nam_LH8E=manager.list() - - LH2E_copy=copy.deepcopy(list(LH2E)) - LH8E_copy=copy.deepcopy(list(LH8E)) - - fil_sort_tre=manager.list() - fil_sort_con=manager.list() - raw_sort_tre=manager.list() - raw_sort_con=manager.list() - - ps_main = Process(target=main_temp,args=(list(LH2E), samples_mirna_names, list(LH8E), samples_mirna_names1,1,list(names_con),list(names_tre),fil_sort_tre,fil_sort_con,raw_sort_tre,raw_sort_con,args.per,args.count)) - ps_main.start() - - if args.anal=="2": - all_iso = manager.list() - ps_non_iso = Process(target=non_template_ref,args=(sc,st,all_iso)) - ps_non_iso.start() - - ps_merge = [Process(target=merging_names,args=(LH2E_copy,merg_nam_LH2E))] - ps_merge.extend([Process(target=merging_names,args=(LH8E_copy,merg_nam_LH8E))]) - [x.start() for x in ps_merge] - [x.join() for x in ps_merge] - - merg_nam_LH2E=list(merg_nam_LH2E) - merg_nam_LH8E=list(merg_nam_LH8E) - - starttime2 = time.time() - procs = [Process(target=DB_write,args=(x[0],x[1],x[2],x[3],1)) for x in data] - procs.extend([Process(target=DB_write,args=(x[0],x[1],x[2],x[3],1)) for x in data1]) - procs.extend([Process(target=make_spider,args=(merg_nam_LH2E,merg_nam_LH8E))]) - if args.anal == "1": - procs.extend([Process(target=pie_temp,args=(merg_nam_LH2E,unmap_seq.value,unmap_counts.value,merg_nam_LH8E,unmap1_seq.value,unmap1_counts.value))]) - - [p.start() for p in procs] - - if args.anal=="1": - [x.join() for x in ps_hist] - [p.join() for p in procs] - ps_pdf = Process(target=pdf_before_DE,args=(args.anal)) - ps_pdf.start() - - print('Graphs took {} seconds'.format(time.time() - starttime2)) - - ps_main.join() - - fil_sort_con=list(fil_sort_con) - fil_sort_tre=list(fil_sort_tre) - if fil_sort_con==[]: - fil_sort_con=raw_sort_con - fil_sort_tre=raw_sort_tre - - raw_sort_con=list(raw_sort_con) - raw_sort_tre=list(raw_sort_tre) - names_con=list(new_names_con) - names_tre=list(new_names_tre) - - ps_write = Process(target=write_main,args=(raw_sort_con, raw_sort_tre, fil_sort_con, fil_sort_tre, names_con,names_tre,1,args.per,args.n1,args.n2)) - ps_write.start() - - ps1_matrix = [Process(target=ssamples,args=(names_con,fil_sort_con,"Diff/temp_con/",0))] - ps1_matrix.extend([Process(target=ssamples,args=(names_tre,fil_sort_tre,"Diff/temp_tre/",0))]) - [p.start() for p in ps1_matrix] - - if args.anal=="1": - ps_pdf.join() - if args.anal=="2": - [p.join() for p in procs] - [x.join() for x in ps_hist] - - ps_write.join() - [p.join() for p in ps1_matrix] - -############################## Detection of Both ####################################### - - starttime10 = time.time() - - if args.anal == "2": - - n_data= manager.list() - n_names_con=manager.list() - n_samples_mirna_names=manager.list() - n_deseq=manager.list() - n_LH2E_names=manager.list() - - n_data1 = manager.list() - n_names_tre = manager.list() - n_samples_mirna_names1=manager.list() - n_deseq1=manager.list() - n_LH8E_names=manager.list() - - new_mat_mirnas = list(mature_mirnas) - ps_non_iso.join() - - all_iso=list(all_iso) - new_mat_mirnas.extend(all_iso) - - starttime11=time.time() - - ps_sam = [Process(target=non_sam,args=(new_mat_mirnas,path[1][:-1],path[0].split(",")[0],"c",lock,n_data,n_names_con,n_deseq,n_samples_mirna_names,n_LH2E_names)) for path in control] - ps_sam.extend([Process(target=non_sam,args=(new_mat_mirnas,path[1][:-1],path[0].split(",")[0],"t",lock,n_data1,n_names_tre,n_deseq1,n_samples_mirna_names1,n_LH8E_names)) for path in treated]) - - [p.start() for p in ps_sam] - [p.join() for p in ps_sam] - - print('Non-sam took {} seconds'.format(time.time() - starttime11)) - - starttime12=time.time() - - n_names_con=list(n_names_con) - n_names_tre=list(n_names_tre) - n_samples_mirna_names=list(n_samples_mirna_names) - n_samples_mirna_names.sort() - n_samples_mirna_names=list(n_samples_mirna_names for n_samples_mirna_names,_ in itertools.groupby(n_samples_mirna_names)) - - - n_samples_mirna_names1=list(n_samples_mirna_names1) - n_samples_mirna_names1.sort() - n_samples_mirna_names1=list(n_samples_mirna_names1 for n_samples_mirna_names1,_ in itertools.groupby(n_samples_mirna_names1)) - - n_deseq=list(n_deseq) - n_deseq1=list(n_deseq1) - - new_n_names_con=manager.list() - new_n_names_tre=manager.list() - n_new_deseq=manager.list() - n_new_deseq1=manager.list() - ps_deseq=[Process(target=deseqe2,args=(sampp,n_samples_mirna_names,lock,n_new_deseq,n_names_con[i],new_n_names_con)) for i,sampp in enumerate(n_deseq)] - ps_deseq.extend([Process(target=deseqe2,args=(sampp,n_samples_mirna_names1,lock,n_new_deseq1,n_names_tre[i],new_n_names_tre)) for i,sampp in enumerate(n_deseq1)]) - - [x.start() for x in ps_deseq] - [x.join() for x in ps_deseq] - n_new_deseq=list(n_new_deseq) - n_new_deseq1=list(n_new_deseq1) - - n_LH2E=[[x[0],x[2]] for x in n_new_deseq[0]] - [n_LH2E[i].append(y[i][1]) for i,_ in enumerate(n_LH2E) for y in n_new_deseq] - - n_LH8E=[[x[0],x[2]] for x in n_new_deseq1[0]] - [n_LH8E[i].append(y[i][1]) for i,_ in enumerate(n_LH8E) for y in n_new_deseq1] - - print('Non-deseq took {} seconds'.format(time.time() - starttime12)) - - merg_nam_n_LH2E=manager.list() - merg_nam_n_LH8E=manager.list() - - n_LH2E_copy=copy.deepcopy(list(n_LH2E)) - n_LH8E_copy=copy.deepcopy(list(n_LH8E)) - - n_fil_sort_con=manager.list() - n_fil_sort_tre=manager.list() - n_raw_sort_con=manager.list() - n_raw_sort_tre=manager.list() - - ps_main = Process(target=main_temp,args=(list(n_LH2E), n_samples_mirna_names, list(n_LH8E), n_samples_mirna_names1,1,list(n_names_con),list(n_names_tre),n_fil_sort_tre,n_fil_sort_con,n_raw_sort_tre,n_raw_sort_con,args.per,args.count)) - ps_main.start() - - starttime14=time.time() - ps_merge = [Process(target=merging_names,args=(n_LH2E_copy,merg_nam_n_LH2E))] - ps_merge.extend([Process(target=merging_names,args=(n_LH8E_copy,merg_nam_n_LH8E))]) - [p.start() for p in ps_merge] - [p.join() for p in ps_merge] - - merg_nam_n_LH2E=list(merg_nam_n_LH2E) - merg_nam_n_LH8E=list(merg_nam_n_LH8E) - - print('Merging took {} seconds'.format(time.time() - starttime14)) - - procs = [Process(target=DB_write,args=(x[0],x[1],x[2],x[3],2)) for x in n_data] - procs.extend([Process(target=DB_write,args=(x[0],x[1],x[2],x[3],2)) for x in n_data1]) - procs.extend([Process(target=logo_seq_red,args=(merg_nam_n_LH2E,'c'))]) - procs.extend([Process(target=logo_seq_red,args=(merg_nam_n_LH8E,'t'))]) - procs.extend([Process(target=pie_non_temp,args=(merg_nam_LH2E,merg_nam_n_LH2E,merg_nam_LH8E,merg_nam_n_LH8E,unmap_seq.value,unmap1_seq.value,unmap_counts.value,unmap1_counts.value))]) - - - starttime13=time.time() - [p.start() for p in procs] - [p.join() for p in procs] - - print('Graphs took {} seconds'.format(time.time() - starttime13)) - - procs1 = Process(target=pdf_before_DE,args=(args.anal)) - procs1.start() - starttime16=time.time() - ps_main.join() - print('Main took {} seconds'.format(time.time() - starttime16)) - - starttime15=time.time() - n_fil_sort_con=list(n_fil_sort_con) - n_fil_sort_tre=list(n_fil_sort_tre) - if n_fil_sort_con==[]: - n_fil_sort_con=n_raw_sort_con - n_fil_sort_tre=n_raw_sort_tre - - n_raw_sort_con=list(n_raw_sort_con) - n_raw_sort_tre=list(n_raw_sort_tre) - n_names_con=list(new_n_names_con) - n_names_tre=list(new_n_names_tre) - - ps_write = Process(target=write_main,args=(n_raw_sort_con, n_raw_sort_tre,n_fil_sort_con, n_fil_sort_tre, n_names_con, n_names_tre,2,args.per,args.n1,args.n2)) - ps_write.start() - - ps1_matrix = [Process(target=ssamples1,args=(n_names_con,n_fil_sort_con,names_con,fil_sort_con,"Diff/n_temp_con/",0))] - ps1_matrix.extend([Process(target=ssamples1,args=(n_names_tre,n_fil_sort_tre,names_tre,fil_sort_tre,"Diff/n_temp_tre/",0))]) - [p.start() for p in ps1_matrix] - - ps_write.join() - [p.join() for p in ps1_matrix] - procs1.join() - print('Files took {} seconds'.format(time.time() - starttime15)) - print('That took {} seconds'.format(time.time() - starttime10)) - print('That took {} seconds'.format(time.time() - starttime)) - - - - - - - -
--- a/n_spiecies.loc.sample Wed Oct 28 08:13:30 2020 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,300 +0,0 @@ -# bowtie2_indices.loc.sample -# This is a *.loc.sample file distributed with Galaxy that enables tools -# to use a directory of indexed data files. This one is for Bowtie2 and Tophat2. -# See the wiki: http://wiki.galaxyproject.org/Admin/NGS%20Local%20Setup -# First create these data files and save them in your own data directory structure. -# Then, create a bowtie_indices.loc file to use those indexes with tools. -# Copy this file, save it with the same name (minus the .sample), -# follow the format examples, and store the result in this directory. -# The file should include an one line entry for each index set. -# The path points to the "basename" for the set, not a specific file. -# It has four text columns seperated by TABS. -# -# <unique_build_id> <dbkey> <display_name> <file_base_path> -# -# So, for example, if you had hg18 indexes stored in: -# -# /depot/data2/galaxy/hg19/bowtie2/ -# -# containing hg19 genome and hg19.*.bt2 files, such as: -# -rw-rw-r-- 1 james james 914M Feb 10 18:56 hg19canon.fa -# -rw-rw-r-- 1 james james 914M Feb 10 18:56 hg19canon.1.bt2 -# -rw-rw-r-- 1 james james 683M Feb 10 18:56 hg19canon.2.bt2 -# -rw-rw-r-- 1 james james 3.3K Feb 10 16:54 hg19canon.3.bt2 -# -rw-rw-r-- 1 james james 683M Feb 10 16:54 hg19canon.4.bt2 -# -rw-rw-r-- 1 james james 914M Feb 10 20:45 hg19canon.rev.1.bt2 -# -rw-rw-r-- 1 james james 683M Feb 10 20:45 hg19canon.rev.2.bt2 -# -# then the bowtie2_indices.loc entry could look like this: -# -Acacia auriculiformis Acacia auriculiformis Acacia auriculiformis -Acacia mangium Acacia mangium Acacia mangium -Acyrthosiphon pisum Acyrthosiphon pisum Acyrthosiphon pisum -Aedes aegypti Aedes aegypti Aedes aegypti -Aegilops tauschii Aegilops tauschii Aegilops tauschii -Alligator mississippiensis Alligator mississippiensis Alligator mississippiensis -Amborella trichopoda Amborella trichopoda Amborella trichopoda -Amphimedon queenslandica Amphimedon queenslandica Amphimedon queenslandica -Anas platyrhynchos Anas platyrhynchos Anas platyrhynchos -Anolis carolinensis Anolis carolinensis Anolis carolinensis -Anopheles gambiae Anopheles gambiae Anopheles gambiae -Apis mellifera Apis mellifera Apis mellifera -Aquilegia caerulea Aquilegia caerulea Aquilegia caerulea -Arabidopsis lyrata Arabidopsis lyrata Arabidopsis lyrata -Arabidopsis thaliana Arabidopsis thaliana Arabidopsis thaliana -Arachis hypogaea Arachis hypogaea Arachis hypogaea -Artibeus jamaicensis Artibeus jamaicensis Artibeus jamaicensis -Ascaris suum Ascaris suum Ascaris suum -Asparagus officinalis Asparagus officinalis Asparagus officinalis -Astatotilapia burtoni Astatotilapia burtoni Astatotilapia burtoni -Ateles geoffroyi Ateles geoffroyi Ateles geoffroyi -Avicennia marina Avicennia marina Avicennia marina -BK polyomavirus BK polyomavirus BK polyomavirus -Bactrocera dorsalis Bactrocera dorsalis Bactrocera dorsalis -Bandicoot papillomatosis Bandicoot papillomatosis Bandicoot papillomatosis -Biston betularia Biston betularia Biston betularia -Bombyx mori Bombyx mori Bombyx mori -Bos taurus Bos taurus Bos taurus -Bovine foamy Bovine foamy Bovine foamy -Bovine herpesvirus Bovine herpesvirus Bovine herpesvirus -Bovine leukemia Bovine leukemia Bovine leukemia -Brachypodium distachyon Brachypodium distachyon Brachypodium distachyon -Branchiostoma belcheri Branchiostoma belcheri Branchiostoma belcheri -Branchiostoma floridae Branchiostoma floridae Branchiostoma floridae -Brassica napus Brassica napus Brassica napus -Brassica oleracea Brassica oleracea Brassica oleracea -Brassica rapa Brassica rapa Brassica rapa -Brugia malayi Brugia malayi Brugia malayi -Bruguiera cylindrica Bruguiera cylindrica Bruguiera cylindrica -Bruguiera gymnorhiza Bruguiera gymnorhiza Bruguiera gymnorhiza -Caenorhabditis brenneri Caenorhabditis brenneri Caenorhabditis brenneri -Caenorhabditis briggsae Caenorhabditis briggsae Caenorhabditis briggsae -Caenorhabditis elegans Caenorhabditis elegans Caenorhabditis elegans -Caenorhabditis remanei Caenorhabditis remanei Caenorhabditis remanei -Callithrix jacchus Callithrix jacchus Callithrix jacchus -Camelina sativa Camelina sativa Camelina sativa -Canis familiaris Canis familiaris Canis familiaris -Capitella teleta Capitella teleta Capitella teleta -Capra hircus Capra hircus Capra hircus -Carica papaya Carica papaya Carica papaya -Cavia porcellus Cavia porcellus Cavia porcellus -Cerebratulus lacteus Cerebratulus lacteus Cerebratulus lacteus -Chlamydomonas reinhardtii Chlamydomonas reinhardtii Chlamydomonas reinhardtii -Chrysemys picta Chrysemys picta Chrysemys picta -Ciona intestinalis Ciona intestinalis Ciona intestinalis -Ciona savignyi Ciona savignyi Ciona savignyi -Citrus clementina Citrus clementina Citrus clementina -Citrus reticulata Citrus reticulata Citrus reticulata -Citrus sinensis Citrus sinensis Citrus sinensis -Citrus trifoliata Citrus trifoliata Citrus trifoliata -Columba livia Columba livia Columba livia -Cricetulus griseus Cricetulus griseus Cricetulus griseus -Cucumis melo Cucumis melo Cucumis melo -Cucumis sativus Cucumis sativus Cucumis sativus -Culex quinquefasciatus Culex quinquefasciatus Culex quinquefasciatus -Cunninghamia lanceolata Cunninghamia lanceolata Cunninghamia lanceolata -Cynara cardunculus Cynara cardunculus Cynara cardunculus -Cyprinus carpio Cyprinus carpio Cyprinus carpio -Danio rerio Danio rerio Danio rerio -Daphnia pulex Daphnia pulex Daphnia pulex -Dasypus novemcinctus Dasypus novemcinctus Dasypus novemcinctus -Daubentonia madagascariensis Daubentonia madagascariensis Daubentonia madagascariensis -Dictyostelium discoideum Dictyostelium discoideum Dictyostelium discoideum -Digitalis purpurea Digitalis purpurea Digitalis purpurea -Dinoponera quadriceps Dinoponera quadriceps Dinoponera quadriceps -Drosophila ananassae Drosophila ananassae Drosophila ananassae -Drosophila erecta Drosophila erecta Drosophila erecta -Drosophila grimshawi Drosophila grimshawi Drosophila grimshawi -Drosophila melanogaster Drosophila melanogaster Drosophila melanogaster -Drosophila mojavensis Drosophila mojavensis Drosophila mojavensis -Drosophila persimilis Drosophila persimilis Drosophila persimilis -Drosophila pseudoobscura Drosophila pseudoobscura Drosophila pseudoobscura -Drosophila sechellia Drosophila sechellia Drosophila sechellia -Drosophila simulans Drosophila simulans Drosophila simulans -Drosophila virilis Drosophila virilis Drosophila virilis -Drosophila willistoni Drosophila willistoni Drosophila willistoni -Drosophila yakuba Drosophila yakuba Drosophila yakuba -Duck enteritis Duck enteritis Duck enteritis -Echinococcus granulosus Echinococcus granulosus Echinococcus granulosus -Echinococcus multilocularis Echinococcus multilocularis Echinococcus multilocularis -Ectocarpus siliculosus Ectocarpus siliculosus Ectocarpus siliculosus -Elaeis guineensis Elaeis guineensis Elaeis guineensis -Electrophorus electricus Electrophorus electricus Electrophorus electricus -Epstein Barr Epstein Barr Epstein Barr -Eptesicus fuscus Eptesicus fuscus Eptesicus fuscus -Equus caballus Equus caballus Equus caballus -Eugenia uniflora Eugenia uniflora Eugenia uniflora -Fasciola hepatica Fasciola hepatica Fasciola hepatica -Festuca arundinacea Festuca arundinacea Festuca arundinacea -Fragaria vesca Fragaria vesca Fragaria vesca -Fugu rubripes Fugu rubripes Fugu rubripes -Gadus morhua Gadus morhua Gadus morhua -Gallus gallus Gallus gallus Gallus gallus -Glottidia pyramidata Glottidia pyramidata Glottidia pyramidata -Glycine max Glycine max Glycine max -Glycine soja Glycine soja Glycine soja -Gorilla gorilla Gorilla gorilla Gorilla gorilla -Gossypium arboreum Gossypium arboreum Gossypium arboreum -Gossypium herbaceum Gossypium herbaceum Gossypium herbaceum -Gossypium hirsutum Gossypium hirsutum Gossypium hirsutum -Gossypium raimondii Gossypium raimondii Gossypium raimondii -Gyrodactylus salaris Gyrodactylus salaris Gyrodactylus salaris -Haemonchus contortus Haemonchus contortus Haemonchus contortus -Haliotis rufescens Haliotis rufescens Haliotis rufescens -Helianthus annuus Helianthus annuus Helianthus annuus -Helianthus argophyllus Helianthus argophyllus Helianthus argophyllus -Helianthus ciliaris Helianthus ciliaris Helianthus ciliaris -Helianthus exilis Helianthus exilis Helianthus exilis -Helianthus paradoxus Helianthus paradoxus Helianthus paradoxus -Helianthus petiolaris Helianthus petiolaris Helianthus petiolaris -Helianthus tuberosus Helianthus tuberosus Helianthus tuberosus -Heliconius melpomene Heliconius melpomene Heliconius melpomene -Heligmosomoides polygyrus Heligmosomoides polygyrus Heligmosomoides polygyrus -Herpes B Herpes B Herpes B -Herpes Simplex Herpes Simplex Herpes Simplex -Herpesvirus of Herpesvirus of Herpesvirus of -Herpesvirus saimiri Herpesvirus saimiri Herpesvirus saimiri -Hevea brasiliensis Hevea brasiliensis Hevea brasiliensis -Hippoglossus hippoglossus Hippoglossus hippoglossus Hippoglossus hippoglossus -Homo sapiens Homo sapiens Homo sapiens -Hordeum vulgare Hordeum vulgare Hordeum vulgare -Human cytomegalovirus Human cytomegalovirus Human cytomegalovirus -Human herpesvirus Human herpesvirus Human herpesvirus -Human immunodeficiency Human immunodeficiency Human immunodeficiency -Hydra magnipapillata Hydra magnipapillata Hydra magnipapillata -Ictalurus punctatus Ictalurus punctatus Ictalurus punctatus -Infectious laryngotracheitis Infectious laryngotracheitis Infectious laryngotracheitis -Ixodes scapularis Ixodes scapularis Ixodes scapularis -JC polyomavirus JC polyomavirus JC polyomavirus -Kaposi sarcoma-associated Kaposi sarcoma-associated Kaposi sarcoma-associated -Lagothrix lagotricha Lagothrix lagotricha Lagothrix lagotricha -Lemur catta Lemur catta Lemur catta -Leucosolenia complicata Leucosolenia complicata Leucosolenia complicata -Linum usitatissimum Linum usitatissimum Linum usitatissimum -Locusta migratoria Locusta migratoria Locusta migratoria -Lottia gigantea Lottia gigantea Lottia gigantea -Lotus japonicus Lotus japonicus Lotus japonicus -Lytechinus variegatus Lytechinus variegatus Lytechinus variegatus -Macaca mulatta Macaca mulatta Macaca mulatta -Macaca nemestrina Macaca nemestrina Macaca nemestrina -Macropus eugenii Macropus eugenii Macropus eugenii -Malus domestica Malus domestica Malus domestica -Manduca sexta Manduca sexta Manduca sexta -Manihot esculenta Manihot esculenta Manihot esculenta -Mareks disease Mareks disease Mareks disease -Marsupenaeus japonicus Marsupenaeus japonicus Marsupenaeus japonicus -Medicago truncatula Medicago truncatula Medicago truncatula -Melibe leonina Melibe leonina Melibe leonina -Merkel cell Merkel cell Merkel cell -Mesocestoides corti Mesocestoides corti Mesocestoides corti -Metriaclima zebra Metriaclima zebra Metriaclima zebra -Microcebus murinus Microcebus murinus Microcebus murinus -Monodelphis domestica Monodelphis domestica Monodelphis domestica -Mouse cytomegalovirus Mouse cytomegalovirus Mouse cytomegalovirus -Mouse gammaherpesvirus Mouse gammaherpesvirus Mouse gammaherpesvirus -Mus musculus Mus musculus Mus musculus -Nasonia giraulti Nasonia giraulti Nasonia giraulti -Nasonia longicornis Nasonia longicornis Nasonia longicornis -Nasonia vitripennis Nasonia vitripennis Nasonia vitripennis -Nematostella vectensis Nematostella vectensis Nematostella vectensis -Neolamprologus brichardi Neolamprologus brichardi Neolamprologus brichardi -Nicotiana tabacum Nicotiana tabacum Nicotiana tabacum -Nomascus leucogenys Nomascus leucogenys Nomascus leucogenys -Oikopleura dioica Oikopleura dioica Oikopleura dioica -Ophiophagus hannah Ophiophagus hannah Ophiophagus hannah -Oreochromis niloticus Oreochromis niloticus Oreochromis niloticus -Ornithorhynchus anatinus Ornithorhynchus anatinus Ornithorhynchus anatinus -Oryctolagus cuniculus Oryctolagus cuniculus Oryctolagus cuniculus -Oryza sativa Oryza sativa Oryza sativa -Oryzias latipes Oryzias latipes Oryzias latipes -Otolemur garnettii Otolemur garnettii Otolemur garnettii -Ovis aries Ovis aries Ovis aries -Paeonia lactiflora Paeonia lactiflora Paeonia lactiflora -Pan paniscus Pan paniscus Pan paniscus -Pan troglodytes Pan troglodytes Pan troglodytes -Panagrellus redivivus Panagrellus redivivus Panagrellus redivivus -Panax ginseng Panax ginseng Panax ginseng -Papio hamadryas Papio hamadryas Papio hamadryas -Paralichthys olivaceus Paralichthys olivaceus Paralichthys olivaceus -Parasteatoda tepidariorum Parasteatoda tepidariorum Parasteatoda tepidariorum -Patiria miniata Patiria miniata Patiria miniata -Petromyzon marinus Petromyzon marinus Petromyzon marinus -Phaeodactylum tricornutum Phaeodactylum tricornutum Phaeodactylum tricornutum -Phaseolus vulgaris Phaseolus vulgaris Phaseolus vulgaris -Physcomitrella patens Physcomitrella patens Physcomitrella patens -Phytophthora infestans Phytophthora infestans Phytophthora infestans -Phytophthora ramorum Phytophthora ramorum Phytophthora ramorum -Phytophthora sojae Phytophthora sojae Phytophthora sojae -Picea abies Picea abies Picea abies -Pinus densata Pinus densata Pinus densata -Pinus taeda Pinus taeda Pinus taeda -Plutella xylostella Plutella xylostella Plutella xylostella -Polistes canadensis Polistes canadensis Polistes canadensis -Pongo pygmaeus Pongo pygmaeus Pongo pygmaeus -Populus euphratica Populus euphratica Populus euphratica -Populus trichocarpa Populus trichocarpa Populus trichocarpa -Pristionchus pacificus Pristionchus pacificus Pristionchus pacificus -Prunus persica Prunus persica Prunus persica -Pseudorabies virus Pseudorabies virus Pseudorabies virus -Pteropus alecto Pteropus alecto Pteropus alecto -Pundamilia nyererei Pundamilia nyererei Pundamilia nyererei -Pygathrix bieti Pygathrix bieti Pygathrix bieti -Python bivittatus Python bivittatus Python bivittatus -Raccoon polyomavirus Raccoon polyomavirus Raccoon polyomavirus -Rattus norvegicus Rattus norvegicus Rattus norvegicus -Rehmannia glutinosa Rehmannia glutinosa Rehmannia glutinosa -Rhesus lymphocryptovirus Rhesus lymphocryptovirus Rhesus lymphocryptovirus -Rhesus monkey Rhesus monkey Rhesus monkey -Rhipicephalus microplus Rhipicephalus microplus Rhipicephalus microplus -Ricinus communis Ricinus communis Ricinus communis -Saccharum officinarum Saccharum officinarum Saccharum officinarum -Saccharum sp. Saccharum sp. Saccharum sp. -Saccoglossus kowalevskii Saccoglossus kowalevskii Saccoglossus kowalevskii -Saguinus labiatus Saguinus labiatus Saguinus labiatus -Saimiri boliviensis Saimiri boliviensis Saimiri boliviensis -Salicornia europaea Salicornia europaea Salicornia europaea -Salmo salar Salmo salar Salmo salar -Salvia miltiorrhiza Salvia miltiorrhiza Salvia miltiorrhiza -Salvia sclarea Salvia sclarea Salvia sclarea -Sarcophilus harrisii Sarcophilus harrisii Sarcophilus harrisii -Schistosoma japonicum Schistosoma japonicum Schistosoma japonicum -Schistosoma mansoni Schistosoma mansoni Schistosoma mansoni -Schmidtea mediterranea Schmidtea mediterranea Schmidtea mediterranea -Selaginella moellendorffii Selaginella moellendorffii Selaginella moellendorffii -Simian foamy Simian foamy Simian foamy -Simian virus Simian virus Simian virus -Solanum lycopersicum Solanum lycopersicum Solanum lycopersicum -Solanum tuberosum Solanum tuberosum Solanum tuberosum -Sorghum bicolor Sorghum bicolor Sorghum bicolor -Spodoptera frugiperda Spodoptera frugiperda Spodoptera frugiperda -Strigamia maritima Strigamia maritima Strigamia maritima -Strongylocentrotus purpuratus Strongylocentrotus purpuratus Strongylocentrotus purpuratus -Strongyloides ratti Strongyloides ratti Strongyloides ratti -Sus scrofa Sus scrofa Sus scrofa -Sycon ciliatum Sycon ciliatum Sycon ciliatum -Symbiodinium microadriaticum Symbiodinium microadriaticum Symbiodinium microadriaticum -Symphalangus syndactylus Symphalangus syndactylus Symphalangus syndactylus -Taeniopygia guttata Taeniopygia guttata Taeniopygia guttata -Terebratulina retusa Terebratulina retusa Terebratulina retusa -Tetranychus urticae Tetranychus urticae Tetranychus urticae -Tetraodon nigroviridis Tetraodon nigroviridis Tetraodon nigroviridis -Theobroma cacao Theobroma cacao Theobroma cacao -Torque teno Torque teno Torque teno -Tribolium castaneum Tribolium castaneum Tribolium castaneum -Triops cancriformis Triops cancriformis Triops cancriformis -Triticum aestivum Triticum aestivum Triticum aestivum -Triticum turgidum Triticum turgidum Triticum turgidum -Tupaia chinensis Tupaia chinensis Tupaia chinensis -Vigna unguiculata Vigna unguiculata Vigna unguiculata -Vitis vinifera Vitis vinifera Vitis vinifera -Vriesea carinata Vriesea carinata Vriesea carinata -Xenopus laevis Xenopus laevis Xenopus laevis -Xenopus tropicalis Xenopus tropicalis Xenopus tropicalis -Xenoturbella bocki Xenoturbella bocki Xenoturbella bocki -Zea mays Zea mays Zea mays -# -#More examples: -# -#mm10 mm10 Mouse (mm10) /depot/data2/galaxy/mm10/bowtie2/mm10 -#dm3 dm3 D. melanogaster (dm3) /depot/data2/galaxy/mm10/bowtie2/dm3 -#
