# HG changeset patch # User glogobyte # Date 1621316910 0 # Node ID d9b2a7df9d4b621dac8f3c9e4e1ff1253cd1dcbb # Parent d2eea02053a071f29d2359afa13f958d932887bc Deleted selected files diff -r d2eea02053a0 -r d9b2a7df9d4b mirbase_functions.py --- a/mirbase_functions.py Wed Oct 28 08:13:30 2020 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,823 +0,0 @@ -import itertools -import time -import sys -import os -import urllib.request -import gzip -from multiprocessing import Process, Queue, Lock, Pool, Manager, Value -import subprocess -import argparse -from collections import OrderedDict -from matplotlib.backends.backend_pdf import PdfPages -import pandas as pd -from math import pi -import numpy as np -import matplotlib.pyplot as plt -from matplotlib.ticker import PercentFormatter -import seaborn as sns -import scipy.stats as stats -from plotnine import * -import math -import re -import matplotlib.ticker as mtick -import copy - - -"""---------------------- Simple Functions -----------------------""" - -# Read a file and return it as a list -def read(path, flag): - if flag == 0: - with open(path) as fp: - file=fp.readlines() - fp.close() - return file - - if flag == 1: - with open(path) as fp: - file = fp.read().splitlines() - fp.close() - return file - -# Write a list to a txt file -def write(path, list): - with open(path,'w') as fp: - for x in list: - fp.write(str("\t".join(x[1:-1]))) - fp.close() - -"""---------------------- RNA-seq Functions ----------------------""" - -# Detect the longest common substring sequence between two mirnas -def longestSubstring(str1, str2): - - from difflib import SequenceMatcher - # initialize SequenceMatcher object with - # input string - seqMatch = SequenceMatcher(None, str1, str2) - - # find match of longest sub-string - # output will be like Match(a=0, b=0, size=5) - match = seqMatch.find_longest_match(0, len(str1), 0, len(str2)) - - # print longest substring - if (match.size != 0): - return str1[match.a: match.a + match.size] - else: - print('No longest common sub-string found') - - - -######################################################################################################################################################## - -def collapse_sam(path): - - ini_sam=read(path,0) - main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]] - intro_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" in x.split("\t")[0]] - - uni_seq = [] - for x in main_sam: - - if [x[2], x[9]] not in uni_seq: - uni_seq.append([x[2], x[9]]) - - new_main_sam=[] - incr_num=0 - for i in range(len(uni_seq)): - count=0 - incr_num+=1 - for y in main_sam: - if uni_seq[i][1]==y[9] and uni_seq[i][0]==y[2]: - count+=1 - temp=y - temp[10]="~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" - temp[0]=str(incr_num)+"-"+str(count) - new_main_sam.append(temp) - - new_sam=intro_sam+new_main_sam - - return new_sam - -################################################################################################################################################################################################################# - -def duplicate_chroms_isoforms(List): - - dupes=[] - - for num in range(len(List)): - - if [List[num][9],List[num][0],List[num][2]] not in dupes : - dupes.append([List[num][9],List[num][0],List[num][2]]) - - for x in List: - for y in dupes: - if x[9]==y[0] and x[0]==y[1] and x[2].split("_")[0]==y[2].split("_")[0] and x[2]!=y[2]: - y.append(x[2]) - - - double_List = [x[:] for x in List] - - chr_order=[] - for x in dupes: - temp = [] - for i in range(2,len(x)): - if x[i].split("chr")[1].split("(")[0].isdigit(): - temp.append(int(x[i].split("chr")[1].split("(")[1][0]+x[i].split("chr")[1].split("(")[0])) - else: - temp.append(x[i].split("chr")[1][0:4]) - - for z in temp: - if 'X(-)'==z or 'Y(-)'==z or 'X(+)'==z or 'Y(+)'==z: - temp = [str(j) for j in temp] - temp=list(set(temp)) - temp.sort() - chr_order.append(temp) - - final_dupes=[] - for i in range(len(dupes)): - final_dupes.append([dupes[i][0],dupes[i][2].split("_")[0],dupes[i][1]]) - for x in chr_order[i]: - result = re.match("[-+]?\d+$", str(x)) - if len(chr_order[i]) == len(set(chr_order[i])): - if result is not None: - - if int(x)<0: - final_dupes[i][1]=final_dupes[i][1]+"_chr"+str(abs(int(x)))+"(-)" - else: - final_dupes[i][1] = final_dupes[i][1] + "_chr" + str(abs(int(x)))+"(+)" - else: - final_dupes[i][1] = final_dupes[i][1] + "_chr" + str(x) - else: - if result is not None: - if int(x) < 0: - final_dupes[i][1] = final_dupes[i][1] +dupes[i][2].split("_")[1]+ "_chr" + str(abs(int(x))) + "(-)" - else: - final_dupes[i][1] = final_dupes[i][1] +dupes[i][2].split("_")[1]+ "_chr" + str(abs(int(x))) + "(+)" - else: - final_dupes[i][1] = final_dupes[i][1] +dupes[i][2].split("_")[1]+ "_chr" + str(x) - - final_dupes.sort() - final_dupes=list(final_dupes for final_dupes,_ in itertools.groupby(final_dupes)) - - for i in range(len(double_List)): - for x in final_dupes: - - if double_List[i][9] == x[0] and double_List[i][0] == x[2] and len(double_List[i][2].split("_")) >3 and double_List[i][2].split("_")[0]==x[1].split("_")[0]: - gg=str("_"+double_List[i][2].split("_")[-2]+"_"+double_List[i][2].split("_")[-1]) - double_List[i][2] = x[1]+gg - - if double_List[i][9]==x[0] and double_List[i][0]== x[2] and len(double_List[i][2].split("_"))==3 and double_List[i][2].split("_")[0]==x[1].split("_")[0]: - double_List[i][2]=x[1] - List[i][2] = x[1] - - List.sort() - new_list=list(List for List,_ in itertools.groupby(List)) - - double_List.sort() - new_double_List = list(double_List for double_List, _ in itertools.groupby(double_List)) - - return new_list, new_double_List - - -############################################################################################################################################################################################################# - -def sam(mature_mirnas,path,name,con,l,samples,data,names,unmap_seq,samples_mirna_names,deseq,LHE_names,umi,ini_sample,unmap_counts): - - # read the sam file - ini_sam=read(path,0) - new_main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]] - unique_seq = [x for x in new_main_sam if x[1] == '0' and len(x[9])>=18 and len(x[9])<=26] - - sorted_uni_arms = [] - - for i in range(len(mature_mirnas)): - tmp_count_reads = 0 # calculate the total number of reads - tmp_count_seq = 0 # calculate the total number of sequences - for j in range(len(unique_seq)): - - if "{" in unique_seq[j][2].split("_")[0]: - official=unique_seq[j][2].split("_")[0][:-4] - else: - official=unique_seq[j][2].split("_")[0] - - if mature_mirnas[i].split(" ")[0][1:] == official: - - temp_mature = mature_mirnas[i+1].strip().replace("U", "T") - off_part = longestSubstring(temp_mature, unique_seq[j][9]) - - mat_diff = temp_mature.split(off_part) - mat_diff = [len(mat_diff[0]), len(mat_diff[1])] - - unique_diff = unique_seq[j][9].split(off_part) - unique_diff = [len(unique_diff[0]), len(unique_diff[1])] - - # Problem with hsa-miR-8485 - if mat_diff[1]!=0 and unique_diff[1]!=0: - unique_seq[j]=1 - pre_pos = 0 - post_pos = 0 - - elif mat_diff[0]!=0 and unique_diff[0]!=0: - unique_seq[j]=1 - pre_pos = 0 - post_pos = 0 - - else: - pre_pos = mat_diff[0]-unique_diff[0] - post_pos = unique_diff[1]-mat_diff[1] - tmp_count_reads = tmp_count_reads + int(unique_seq[j][0].split("-")[1]) - tmp_count_seq = tmp_count_seq+1 - - if pre_pos != 0 or post_pos != 0: - if pre_pos == 0: - unique_seq[j][2] = unique_seq[j][2] + "_" +str(pre_pos) + "_" + '{:+d}'.format(post_pos) - elif post_pos == 0: - unique_seq[j][2] = unique_seq[j][2] + "_" + '{:+d}'.format(pre_pos) + "_" + str(post_pos) - else: - unique_seq[j][2] = unique_seq[j][2]+"_"+'{:+d}'.format(pre_pos)+"_"+'{:+d}'.format(post_pos) - - for x in range(unique_seq.count(1)): - unique_seq.remove(1) - if tmp_count_reads != 0 and tmp_count_seq != 0: - sorted_uni_arms.append([mature_mirnas[i].split(" ")[0][1:], tmp_count_seq, tmp_count_reads]) - sorted_uni_arms = sorted(sorted_uni_arms, key=lambda x: x[1], reverse=True) - dedup_unique_seq,double_fil_uni_seq=duplicate_chroms_isoforms(unique_seq) - - for y in sorted_uni_arms: - counts=0 - seqs=0 - for x in double_fil_uni_seq: - if y[0]==x[2].split("_")[0]: - counts+=int(x[0].split("-")[1]) - seqs+=1 - - y[1]=seqs - y[2]=counts - - LHE=[] - l.acquire() - if con=="c": - LHE.extend(z[2] for z in double_fil_uni_seq) - for y in double_fil_uni_seq: - samples_mirna_names.append([y[2],y[9]]) - deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in double_fil_uni_seq]) - LHE_names.extend(LHE) - unmap_seq.value += sum([1 for x in new_main_sam if x[1] == '4']) - unmap_counts.value += sum([int(x[0].split("-")[1]) for x in new_main_sam if x[1] == '4']) - names.append(name) - samples.append(dedup_unique_seq) - data.append([con,name,double_fil_uni_seq,sorted_uni_arms]) - ini_sample.append(new_main_sam) - - if con=="t": - LHE.extend(z[2] for z in double_fil_uni_seq) - for y in double_fil_uni_seq: - samples_mirna_names.append([y[2],y[9]]) - deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in double_fil_uni_seq]) - LHE_names.extend(LHE) - unmap_seq.value += sum([1 for x in new_main_sam if x[1] == '4']) - unmap_counts.value += sum([int(x[0].split("-")[1]) for x in new_main_sam if x[1] == '4']) - names.append(name) - samples.append(dedup_unique_seq) - data.append([con,name,double_fil_uni_seq,sorted_uni_arms]) - ini_sample.append(new_main_sam) - l.release() - - -###################################################################################################################################### - -""" - -Read a sam file from Bowtie and do the followings: - -1) Remove reverse stranded mapped reads -2) Remove unmapped reads -3) Remove all sequences with reads less than 11 reads -4) Sort the arms with the most sequences in decreading rate -5) Sort the sequences of every arm with the most reads in decreasing rate -6) Calculate total number of sequences of every arm -7) Calculate total number of reads of sequences of every arm. -8) Store all the informations in a txt file - -""" - -def non_sam(mature_mirnas,path,name,con,l,data,names,n_deseq,n_samples_mirna_names,n_LHE_names): - - ini_sam=read(path,0) - new_main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]] - unique_seq=[] - unique_seq = [x for x in new_main_sam if x[1] == '4' and len(x[9])>=18 and len(x[9])<=26] - - uni_seq=[] - # Calculate the shifted positions for every isomir and add them to the name of it - sorted_uni_arms = [] - for i in range(1,len(mature_mirnas),2): - tmp_count_reads = 0 # calculate the total number of reads - tmp_count_seq = 0 # calculate the total number of sequences - - for j in range(len(unique_seq)): - - temp_mature = mature_mirnas[i].strip().replace("U", "T") - - if temp_mature in unique_seq[j][9]: - - off_part = longestSubstring(temp_mature, unique_seq[j][9]) - - mat_diff = temp_mature.split(off_part) - mat_diff = [len(mat_diff[0]), len(mat_diff[1])] - - unique_diff = unique_seq[j][9].split(off_part) - if len(unique_diff)<=2: - unique_diff = [len(unique_diff[0]), len(unique_diff[1])] - - pre_pos = mat_diff[0]-unique_diff[0] - post_pos = unique_diff[1]-mat_diff[1] - - lengthofmir = len(off_part) + post_pos - if pre_pos == 0: - tmp_count_reads = tmp_count_reads + int(unique_seq[j][0].split("-")[1]) - tmp_count_seq = tmp_count_seq + 1 - - if pre_pos == 0: - - t_name=unique_seq[j].copy() - t_name[2]=mature_mirnas[i - 1].split(" ")[0][1:] + "__" + str(pre_pos) + "_" + '{:+d}'.format(post_pos) + "_" + str(unique_seq[j][9][len(off_part):]) - uni_seq.append(t_name) - - - if tmp_count_reads != 0 and tmp_count_seq != 0: - sorted_uni_arms.append([mature_mirnas[i-1].split(" ")[0][1:], tmp_count_seq, tmp_count_reads]) - - - sorted_uni_arms = sorted(sorted_uni_arms, key=lambda x: x[1], reverse=True) - unique_seq = list(map(list, OrderedDict.fromkeys(map(tuple,uni_seq)))) - - LHE=[] - - l.acquire() - if con=="c": - LHE.extend(x[2] for x in unique_seq if x[2]!="*") - for x in unique_seq: - if x[2]!="*": - n_samples_mirna_names.append([x[2],x[9]]) - n_deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in unique_seq if x[2]!="*"]) - n_LHE_names.extend(LHE) - names.append(name) - data.append([con,name,unique_seq,sorted_uni_arms]) - - - if con=="t": - LHE.extend(x[2] for x in unique_seq if x[2]!="*") - for x in unique_seq: - if x[2]!="*": - n_samples_mirna_names.append([x[2],x[9]]) - n_deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in unique_seq if x[2]!="*"]) - n_LHE_names.extend(LHE) - names.append(name) - data.append([con,name,unique_seq,sorted_uni_arms]) - l.release() - -##################################################################################################################################################################################################################### -def deseq2_temp(samples_mirna_names,deseq,con,l): - - samples_mirna_names.sort(key=lambda x:[0]) - for i in range(len(deseq)): - for y in samples_mirna_names: - flag = 0 - for x in deseq[i]: - if y[0] == x[0]: - flag = 1 - break - - if flag == 0: - deseq[i].append([y[0], "0", y[1]]) - - [deseq[i].sort(key=lambda x: x[0]) for i, _ in enumerate(deseq)] - deseq_final = [[x[0],x[2]] for x in deseq[0]] - [deseq_final[z].append(deseq[i][j][1]) for z,_ in enumerate(deseq_final) for i, _ in enumerate(deseq) for j,_ in enumerate(deseq[i]) if deseq_final[z][0] == deseq[i][j][0]] - - l.acquire() - if con=="c": - q1.put(deseq_final) - - if con=="t": - q2.put(deseq_final) - l.release() - - -#################################################################################################################################################################################################################### - -def main_temp(LH2E, LH2E_names, LH8E, LH8E_names,flag,names_con,names_tre,filter_LH8E,filter_LH2E,raw_LH8E,raw_LH2E,per,count): - - LH8E_add_names = [x for x in LH2E_names if x not in LH8E_names] - LH2E_add_names = [x for x in LH8E_names if x not in LH2E_names] - - LH8E_add_names.sort() - LH2E_add_names.sort() - LH8E_add_names = list(LH8E_add_names for LH8E_add_names,_ in itertools.groupby(LH8E_add_names)) - LH2E_add_names = list(LH2E_add_names for LH2E_add_names,_ in itertools.groupby(LH2E_add_names)) - - LH2E.sort() - LH8E.sort() - LH2E = list(LH2E for LH2E,_ in itertools.groupby(LH2E)) - LH8E = list(LH8E for LH8E,_ in itertools.groupby(LH8E)) - - zeros=["0"]*(len(LH8E[0])-2) - [LH8E_add_names[i].extend(zeros) for i,_ in enumerate(LH8E_add_names)] - LH8E=LH8E+LH8E_add_names - - zeros=["0"]*(len(LH2E[0])-2) - [LH2E_add_names[i].extend(zeros) for i,_ in enumerate(LH2E_add_names)] - LH2E=LH2E+LH2E_add_names - - dupes=[] - final_LH2E =[] - - for num,_ in enumerate(LH2E): - - if LH2E[num][1] not in final_LH2E and LH2E[num][0] not in final_LH2E: - final_LH2E.append(LH2E[num][1]) - final_LH2E.append(LH2E[num][0]) - else: - dupes.append(LH2E[num][1]) - - - dupes=list(set(dupes)) - - dupes=[[x] for x in dupes] - - for x in LH2E: - for y in dupes: - if x[1]==y[0]: - fl=0 - if len(y)==1: - y.append(x[0]) - else: - for i in range(1,len(y)): - if y[i].split("_")[0]==x[0].split("_")[0]: - fl=1 - if len(x[0])2: - for i in range(len(y)-1,1,-1): - y[1]=y[1]+"/"+y[i] - del y[i] - - for x in LH2E: - for y in dupes: - if x[1]==y[0]: - x[0]=y[1] - - for x in LH8E: - for y in dupes: - if x[1]==y[0]: - x[0]=y[1] - - - LH2E.sort() - LH2E=list(LH2E for LH2E,_ in itertools.groupby(LH2E)) - - LH8E.sort() - LH8E=list(LH8E for LH8E,_ in itertools.groupby(LH8E)) - - if int(per)!=-1: - percent=int(per)/100 - - c_col_filter=round(percent*(len(LH2E[1])-2)) - t_col_filter=round(percent*(len(LH8E[1])-2)) - - for i, _ in enumerate(LH2E): - c_cols=0 - t_cols=0 - - c_cols=sum([1 for j in range(len(LH2E[i])-2) if int(LH2E[i][j+2])>=int(count)]) - t_cols=sum([1 for j in range(len(LH8E[i])-2) if int(LH8E[i][j+2])>=int(count)]) - - if c_cols>=c_col_filter or t_cols>=t_col_filter: - filter_LH8E.append(LH8E[i]) - filter_LH2E.append(LH2E[i]) - - raw_LH2E.extend(LH2E) - raw_LH8E.extend(LH8E) - -################################################################################################################################################################################################################## - -def write_main(raw_LH2E, raw_LH8E, fil_LH2E, fil_LH8E, names_con, names_tre, flag, per, n1, n2): - - if flag == 1 and int(per)!=-1: - fp = open('Counts/Filtered '+n2 +' Templated Counts', 'w') - fp.write("Name\t") - fp.write("Sequence") - for y in names_tre: - fp.write("\t"+y) - - for x in fil_LH8E: - fp.write("\n%s" % "\t".join(x)) - fp.close() - - fp = open('Counts/Filtered '+n1+' Templated Counts', 'w') - fp.write("Name\t") - fp.write("Sequence") - for y in names_con: - fp.write("\t"+y) - - for x in fil_LH2E: - fp.write("\n%s" % "\t".join(x)) - fp.close() - - - if flag == 2 and int(per)!=-1: - fp = open('Counts/Filtered '+n2+' Non-Templated Counts', 'w') - fp.write("Name\t") - fp.write("Sequence") - for y in names_tre: - fp.write("\t"+y) - - - for x in fil_LH8E: - fp.write("\n%s" % "\t".join(x)) - fp.close() - - fp = open('Counts/Filtered '+n1+' Non-Templated Counts', 'w') - fp.write("Name\t") - fp.write("Sequence") - for y in names_con: - fp.write("\t"+y) - - for x in fil_LH2E: - fp.write("\n%s" % "\t".join(x)) - fp.close() - - - if flag == 1: - fp = open('Counts/Raw '+n2+' Templated Counts', 'w') - fp.write("Name\t") - fp.write("Sequence") - for y in names_tre: - fp.write("\t"+y) - - for x in raw_LH8E: - fp.write("\n%s" % "\t".join(x)) - fp.close() - - fp = open('Counts/Raw '+n1+' Templated Counts', 'w') - fp.write("Name\t") - fp.write("Sequence") - for y in names_con: - fp.write("\t"+y) - - for x in raw_LH2E: - fp.write("\n%s" % "\t".join(x)) - fp.close() - - - if flag == 2: - fp = open('Counts/Raw '+n2+' Non-Templated Counts', 'w') - fp.write("Name\t") - fp.write("Sequence") - for y in names_tre: - fp.write("\t"+y) - - - for x in raw_LH8E: - fp.write("\n%s" % "\t".join(x)) - fp.close() - - fp = open('Counts/Raw '+n1+' Non-Templated Counts', 'w') - fp.write("Name\t") - fp.write("Sequence") - for y in names_con: - fp.write("\t"+y) - - for x in raw_LH2E: - fp.write("\n%s" % "\t".join(x)) - fp.close() - - -######################################################################################################################################### - -def ssamples(names,samp,folder,pro): - - for i in range(2,len(samp[0])): - - fp = open(folder+names[i-2]+'.txt','w') - fp.write("miRNA id"+"\t"+names[i-2]+"\n") - - for x in samp: - fp.write("%s" % "\t".join([x[0],x[i]])+"\n") - fp.close() - -################################################################################################################## - -def DB_write(con,name,unique_seq,sorted_uni_arms,f): - - if f==1: - # Write a txt file with all the information - if con=="c": - fp = open('split1/'+name, 'w') - - fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence")) - if con=="t": - fp = open('split2/'+name, 'w') - fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence")) - - - for i in range(len(sorted_uni_arms)): - temp = [] - for j in range(len(unique_seq)): - - if sorted_uni_arms[i][0] in unique_seq[j][2].split("_")[0]: - - temp.append(unique_seq[j]) - - temp = sorted(temp, key=lambda x: int(x[0].split('-')[1]), reverse=True) - fp.write("*********************************************************************************************************\n") - fp.write("%-8s\t%-22s\t%-25s\t%-30s\t%s\n" % ("|",str(sorted_uni_arms[i][0]),"Sequence count = "+str(sorted_uni_arms[i][1]),"Total reads = "+str(sorted_uni_arms[i][2]),"|")) - fp.write("*********************************************************************************************************\n\n") - [fp.write("%-8s\t%-40s\t%s\n" % (x[0].split("-")[1], x[2],x[9])) for x in temp] - fp.write("\n" + "\n") - fp.close() - - if f==2: - - if con=="c": - fp = open('split3/'+name, 'w') - fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence")) - if con=="t": - fp = open('split4/'+name, 'w') - fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence")) - - - for i in range(len(sorted_uni_arms)): - temp = [] - for j in range(len(unique_seq)): - if sorted_uni_arms[i][0]==unique_seq[j][2].split("__")[0]: - temp.append(unique_seq[j]) - if temp!=[]: - temp = sorted(temp, key=lambda x: int(x[0].split('-')[1]), reverse=True) - fp.write("*********************************************************************************************************\n") - fp.write("%-8s\t%-22s\t%-25s\t%-30s\t%s\n" % ("|",str(sorted_uni_arms[i][0]),"Sequence count = "+str(sorted_uni_arms[i][1]),"Total reads = "+str(sorted_uni_arms[i][2]),"|")) - fp.write("*********************************************************************************************************\n\n") - [fp.write("%-8s\t%-40s\t%s\n" % (x[0].split("-")[1], x[2],x[9])) for x in temp] - fp.write("\n" + "\n") - fp.close() - - -########################################################################################################################## - -def new_mat_seq(pre_unique_seq,mat_mirnas,l): - - unique_iso = [] - for x in pre_unique_seq: - if len(x[2].split("_"))==3: - for y in pre_unique_seq: - if x[2] in y[2] and int(x[0].split("-")[1])2: - for i in range(len(y)-1,1,-1): - y[1]=y[1]+"/"+y[i] - del y[i] - - - for x in LH2E_copy: - for y in dupes: - if x[1]==y[0]: - x[0]=y[1] - - LH2E_copy.sort() - LH2E_copy=list(LH2E_copy for LH2E_copy,_ in itertools.groupby(LH2E_copy)) - - new.extend(LH2E_copy) - - -###################################################################################################################################################### - -def ssamples1(tem_names,tem_samp,non_names,non_samp,folder,pro): - - for i in range(2,len(tem_samp[0])): - - fp = open(folder+tem_names[i-2]+'.txt','w') - fp.write("miRNA id"+"\t"+tem_names[i-2]+"\n") - - for x in tem_samp: - fp.write("%s" % "\t".join([x[0],x[i]])+"\n") - - for j in range(len(non_names)): - if non_names[j]==tem_names[i-2]: - for x in non_samp: - fp.write("%s" % "\t".join([x[0],x[j+2]])+"\n") - fp.close() - -################################################################################################################################################################################################################### - -def download_matures(matures,org_name): - - #url = 'ftp://mirbase.org/pub/mirbase/21/mature.fa.gz' - url = 'ftp://mirbase.org/pub/mirbase/CURRENT/mature.fa.gz' - data = urllib.request.urlopen(url).read() - file_mirna = gzip.decompress(data).decode('utf-8') - file_mirna = file_mirna.split("\n") - - for i in range(0,len(file_mirna)-1,2): - - if org_name in file_mirna[i]: - matures.append(file_mirna[i]) - matures.append(file_mirna[i+1]) - -################################################################################################################################################################################################################### -def non_template_ref(sc,st,all_isoforms): - - pre_uni_seq_con = list(sc) - pre_uni_seq_tre = list(st) - - for x in pre_uni_seq_con: - for y in x: - if ">"+y[2] not in all_isoforms and ")_" in y[2] : - all_isoforms.append(">"+y[2]) - all_isoforms.append(y[9]) - - - for x in pre_uni_seq_tre: - for y in x: - if ">"+y[2] not in all_isoforms and ")_" in y[2]: - all_isoforms.append(">"+y[2]) - all_isoforms.append(y[9]) - -################################################################################################################################################################################################ - -def deseqe2(sample,mir_names,l,new_d,sample_name,sample_order): - - for y in mir_names: - flag=0 - for x in sample: - if y[0]==x[0]: - flag=1 - break - if flag==0: - sample.append([y[0],"0",y[1]]) - - sample.sort(key=lambda x: x[0]) - sample=list(sample for sample,_ in itertools.groupby(sample)) - - l.acquire() - new_d.append(sample) - sample_order.append(sample_name) - l.release() - -############################################################################################################################################################################################### - diff -r d2eea02053a0 -r d9b2a7df9d4b mirbase_graphs.py --- a/mirbase_graphs.py Wed Oct 28 08:13:30 2020 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,809 +0,0 @@ -import itertools -import time -import sys -import os -import urllib.request -import gzip -from multiprocessing import Process, Queue, Lock, Pool, Manager, Value -import subprocess -import argparse -from collections import OrderedDict -from matplotlib.backends.backend_pdf import PdfPages -import pandas as pd -from math import pi -import numpy as np -import matplotlib.pyplot as plt -from matplotlib.ticker import PercentFormatter -import seaborn as sns -import scipy.stats as stats -from plotnine import * -import math -import re -import matplotlib.ticker as mtick -import copy - - -################################################################################################################################################################# -def pie_non_temp(merge_LH2E,merge_non_LH2E,merge_LH8E,merge_non_LH8E,c_unmap,t_unmap,c_unmap_counts,t_unmap_counts): - - c_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_LH2E] - t_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_LH8E] - c_non_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_non_LH2E] - t_non_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_non_LH8E] - - c_templ = 0 - c_tem_counts = 0 - c_mature = 0 - c_mat_counts = 0 - t_templ = 0 - t_tem_counts = 0 - t_mature = 0 - t_mat_counts = 0 - - c_non = len(c_non_samples) - c_non_counts = sum(x[2] for x in c_non_samples) - t_non = len(t_non_samples) - t_non_counts = sum(x[2] for x in t_non_samples) - - c_unmap = c_unmap - c_non - t_unmap = c_unmap - t_non - - c_unmap_counts=c_unmap_counts - c_non_counts - t_unmap_counts=t_unmap_counts - t_non_counts - - - for x in c_samples: - - if "/" not in x[0]: - if "chr" in x[0].split("_")[-1]: - c_mature+=1 - c_mat_counts += x[2] - else: - c_templ+=1 - c_tem_counts += x[2] - else: - f=0 - for y in x[0].split("/"): - if "chr" in y.split("_")[-1]: - c_mature+=1 - c_mat_counts += x[2] - f=1 - break - if f==0: - c_templ+=1 - c_tem_counts += x[2] - - for x in t_samples: - - if "/" not in x[0]: - if "chr" in x[0].split("_")[-1]: - t_mature+=1 - t_mat_counts += x[2] - else: - t_templ+=1 - t_tem_counts += x[2] - else: - f=0 - for y in x[0].split("/"): - if "chr" in y.split("_")[-1]: - t_mature+=1 - t_mat_counts += x[2] - f=1 - break - if f==0: - t_templ+=1 - t_tem_counts += x[2] - - fig = plt.figure(figsize=(7,5)) - labels = 'miRNA RefSeq','Template', 'Unmapped','Non-template' - sizes = [c_mat_counts, c_tem_counts, c_unmap_counts,c_non_counts] - colors = ['gold', 'yellowgreen', 'lightcoral', 'lightskyblue'] - ax1 = plt.subplot2grid((1,2),(0,0)) - patches, texts, autotexts=plt.pie(sizes, labels=labels, colors=colors, startangle=140,autopct='%1.1f%%',radius=0.8) - [x.set_fontsize(8) for x in texts] - plt.title('Control Group (reads)',fontsize=12) - labels = 'miRNA RefSeq','Template', 'Unmapped','non-template' - sizes = [t_mat_counts, t_tem_counts, t_unmap_counts, t_non_counts] - colors = ['gold', 'yellowgreen', 'lightcoral', 'lightskyblue'] - ax2 = plt.subplot2grid((1,2),(0,1)) - patches, texts, autotexts=plt.pie(sizes, labels=labels, colors=colors, startangle=140,autopct='%1.1f%%',radius=0.8) - [x.set_fontsize(8) for x in texts] - plt.title('Treated Group (reads)', fontsize=12) - plt.savefig('pie_non.png',dpi=300) - -###################################################################################################################################################### - - -def pie_temp(merge_LH2E,c_unmap,c_unmap_counts,merge_LH8E,t_unmap,t_unmap_counts): - - c_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_LH2E] - t_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_LH8E] - - c_templ = 0 - c_tem_counts = 0 - c_mature = 0 - c_mat_counts = 0 - t_templ = 0 - t_tem_counts = 0 - t_mature = 0 - t_mat_counts = 0 - - for x in c_samples: - - if "/" not in x[0]: - if "chr" in x[0].split("_")[-1]: - c_mature+=1 - c_mat_counts += x[2] - else: - c_templ+=1 - c_tem_counts += x[2] - else: - f=0 - for y in x[0].split("/"): - if "chr" in y.split("_")[-1]: - c_mature+=1 - c_mat_counts += x[2] - f=1 - break - if f==0: - c_templ+=1 - c_tem_counts += x[2] - - for x in t_samples: - - if "/" not in x[0]: - if "chr" in x[0].split("_")[-1]: - t_mature+=1 - t_mat_counts += x[2] - else: - t_templ+=1 - t_tem_counts += x[2] - else: - f=0 - for y in x[0].split("/"): - if "chr" in y.split("_")[-1]: - t_mature+=1 - t_mat_counts += x[2] - f=1 - break - if f==0: - t_templ+=1 - t_tem_counts += x[2] - - - fig = plt.figure() - labels = 'miRNA RefSeq','Template', 'Unmapped' - sizes = [c_mat_counts, c_tem_counts, c_unmap_counts] - colors = ['gold', 'yellowgreen', 'lightskyblue'] - explode = (0.2, 0.05, 0.1) - ax1 = plt.subplot2grid((1,2),(0,0)) - patches, texts, autotexts=plt.pie(sizes, labels=labels, colors=colors, startangle=140,autopct='%1.1f%%',radius=0.8) - [x.set_fontsize(8) for x in texts] - plt.title('Control group (reads)', fontsize=12) - labels = 'miRNA RefSeq','Template', 'Unmapped' - sizes = [t_mat_counts, t_tem_counts, t_unmap_counts] - colors = ['gold', 'yellowgreen', 'lightskyblue'] - explode = (0.2, 0.05, 0.1) - ax2 = plt.subplot2grid((1,2),(0,1)) - patches, texts, autotexts=plt.pie(sizes, labels=labels, colors=colors, startangle=140,autopct='%1.1f%%',radius=0.8) - [x.set_fontsize(8) for x in texts] - plt.title('Treated group (reads)',fontsize = 12) - plt.savefig('pie_tem.png',dpi=300) - -################################################################################################################################################################################################################### - - -def make_spider(merge_LH2E,merge_LH8E): - - c_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_LH2E] - t_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_LH8E] - - c_5 = 0 - c_5_counts = 0 - c_3 = 0 - c_3_counts = 0 - c_both =0 - c_both_counts=0 - c_mature = 0 - c_mat_counts = 0 - c_exception=0 - c_exception_counts=0 - - - t_5 = 0 - t_5_counts = 0 - t_3 = 0 - t_3_counts = 0 - t_both = 0 - t_both_counts = 0 - t_mature = 0 - t_mat_counts = 0 - t_exception = 0 - t_exception_counts=0 - - for x in c_samples: - - if "/" not in x[0]: - if "chr" in x[0].split("_")[-1]: - c_mature+=1 - c_mat_counts += x[2] - elif 0 == int(x[0].split("_")[-1]): - c_5+=1 - c_5_counts += x[2] - elif 0 == int(x[0].split("_")[-2]): - c_3+=1 - c_3_counts += x[2] - else: - c_both+=1 - c_both_counts+=x[2] - - else: - f=0 - for y in x[0].split("/"): - if "chr" in y.split("_")[-1]: - c_mature+=1 - c_mat_counts += x[2] - f=1 - break - if f==0: - for y in x[0].split("/"): - c_exception+=1 - c_exception_counts += x[2] - - - for x in t_samples: - - if "/" not in x[0]: - if "chr" in x[0].split("_")[-1]: - t_mature+=1 - t_mat_counts += x[2] - elif 0 == int(x[0].split("_")[-1]): - t_5+=1 - t_5_counts += x[2] - elif 0 == int(x[0].split("_")[-2]): - t_3+=1 - t_3_counts += x[2] - else: - t_both+=1 - t_both_counts+=x[2] - - else: - f=0 - for y in x[0].split("/"): - if "chr" in y.split("_")[-1]: - t_mature+=1 - t_mat_counts += x[2] - f=1 - break - if f==0: - for y in x[0].split("/"): - t_exception+=1 - t_exception_counts += x[2] - - - c_all = c_5+c_3+c_both+c_mature+c_exception - c_all_counts = c_5_counts + c_3_counts + c_both_counts + c_mat_counts + c_exception_counts - - t_all = t_5+t_3+t_both+t_mature + t_exception - t_all_counts = t_5_counts + t_3_counts + t_both_counts + t_mat_counts + t_exception_counts - - c_5 = round(c_5/c_all*100,2) - c_3 = round(c_3/c_all*100,2) - c_both = round(c_both/c_all*100,2) - c_mature = round(c_mature/c_all*100,2) - c_exception = round(c_exception/c_all*100,2) - - c_5_counts = round(c_5_counts/c_all_counts*100,2) - c_3_counts = round(c_3_counts/c_all_counts*100,2) - c_both_counts = round(c_both_counts/c_all_counts*100,2) - c_mat_counts = round(c_mat_counts/c_all_counts*100,2) - c_exception_counts = round(c_exception_counts/c_all_counts*100,2) - - t_5 = round(t_5/t_all*100,2) - t_3 = round(t_3/t_all*100,2) - t_both = round(t_both/t_all*100,2) - t_mature = round(t_mature/t_all*100,2) - t_exception = round(t_exception/t_all*100,2) - - t_5_counts = round(t_5_counts/t_all_counts*100,2) - t_3_counts = round(t_3_counts/t_all_counts*100,2) - t_both_counts = round(t_both_counts/t_all_counts*100,2) - t_mat_counts = round(t_mat_counts/t_all_counts*100,2) - t_exception_counts = round(t_exception_counts/t_all_counts*100,2) - - radar_max = max(c_5, c_3, c_both,c_mature,c_exception,t_5,t_3,t_both,t_mature,t_exception) - radar_max_counts = max(c_5_counts,c_3_counts,c_both_counts,c_mat_counts,c_exception_counts,t_5_counts,t_3_counts,t_both_counts,t_mat_counts,t_exception_counts) - - df=pd.DataFrame({ - 'group':['Controls','Treated'], - """5' and 3' isomiRs""":[c_both,t_both], - """3' isomiRs""":[c_3,t_3], - 'miRNA RefSeq':[c_mature,t_mature], - """5' isomiRs""":[c_5,t_5], - 'Others*':[c_exception,t_exception]}) - - df1=pd.DataFrame({ - 'group':['Controls','Treated'], - """5' and 3' isomiRs""":[c_both_counts,t_both_counts], - """3' isomiRs""":[c_3_counts,t_3_counts], - 'miRNA RefSeq':[c_mat_counts,t_mat_counts], - """5' isomiRs""":[c_5_counts,t_5_counts], - 'Others*':[c_exception_counts,t_exception_counts]}) - - spider_last(df,radar_max,1) - spider_last(df1,radar_max_counts,2) - -##################################################################################################################################################### - -def spider_last(df,radar_max,flag): - # ------- PART 1: Create background - fig = plt.figure() - # number of variable - categories=list(df)[1:] - N = len(categories) - - # What will be the angle of each axis in the plot? (we divide the plot / number of variable) - angles = [n / float(N) * 2 * pi for n in range(N)] - angles += angles[:1] - - # Initialise the spider plot - ax = plt.subplot(111, polar=True) - - # If you want the first axis to be on top: - ax.set_theta_offset(pi/2) - ax.set_theta_direction(-1) - - # Draw one axe per variable + add labels labels yet - plt.xticks(angles[:-1], categories, fontsize=11) - - # Draw ylabels - radar_max=round(radar_max+radar_max*0.1) - mul=len(str(radar_max))-1 - maxi=int(math.ceil(radar_max / pow(10,mul))) * pow(10,mul) - sep = round(maxi/4) - plt.yticks([sep, 2*sep, 3*sep, 4*sep, 5*sep], [str(sep)+'%', str(2*sep)+'%', str(3*sep)+'%', str(4*sep)+'%', str(5*sep)+'%'], color="grey", size=10) - plt.ylim(0, maxi) - - # ------- PART 2: Add plots - - # Plot each individual = each line of the data - # I don't do a loop, because plotting more than 3 groups makes the chart unreadable - - # Ind1 - values=df.loc[0].drop('group').values.flatten().tolist() - values += values[:1] - ax.plot(angles, values,'-o', linewidth=1, linestyle='solid', label="Controls") - ax.fill(angles, values, 'b', alpha=0.1) - - # Ind2 - values=df.loc[1].drop('group').values.flatten().tolist() - values += values[:1] - ax.plot(angles, values, '-o' ,linewidth=1, linestyle='solid', label="Treated") - ax.fill(angles, values, 'r', alpha=0.1) - - # Add legend - if flag==1: - plt.legend(loc='upper right', bbox_to_anchor=(0.0, 0.1)) - plt.savefig('spider_non_red.png',dpi=300) - else: - plt.legend(loc='upper right', bbox_to_anchor=(0.0, 0.1)) - plt.savefig('spider_red.png',dpi=300) - - -############################################################################################################################################################################################################# - -def hist_red(samples,flag): - lengths=[] - cat=[] - total_reads=0 - seq=[] - - if flag == "c": - title = "Length Distribution of Control group (Redudant reads)" - if flag == "t": - title = "Length Distribution of Treated group (Redudant reads)" - - for i in samples: - for x in i: - lengths.append(len(x[9])) - if x[1]=="0": - seq.append([x[9],x[0].split("-")[1],"Mapped"]) - cat.append("Mapped") - if x[1] == "4": - seq.append([x[9],x[0].split("-")[1],"Unmapped"]) - cat.append("Unmapped") - - uni_len=list(set(lengths)) - uni_len=[x for x in uni_len if x<=35] - low=min(lengths) - up=max(lengths) - seq.sort() - uni_seq=list(seq for seq,_ in itertools.groupby(seq)) - dim=up-low - - if dim>20: - s=5 - else: - s=8 - - total_reads+=sum([int(x[1]) for x in uni_seq]) - - map_reads=[] - unmap_reads=[] - length=[] - for y in uni_len: - map_temp=0 - unmap_temp=0 - for x in uni_seq: - if len(x[0])==y and x[2]=="Mapped": - map_temp+=int(x[1]) - if len(x[0])==y and x[2]=="Unmapped": - unmap_temp+=int(x[1]) - if y<=35: - length.append(y) - map_reads.append(round(map_temp/total_reads*100,2)) - unmap_reads.append(round(unmap_temp/total_reads*100,2)) - - ylim=max([sum(x) for x in zip(unmap_reads, map_reads)]) - ylim=ylim+ylim*20/100 - fig, ax = plt.subplots() - width=0.8 - ax.bar(length, unmap_reads, width, label='Unmapped') - h=ax.bar(length, map_reads, width, bottom=unmap_reads, label='Mapped') - plt.xticks(np.arange(length[0], length[-1]+1, 1)) - plt.xlabel('Length (nt)',fontsize=14) - plt.ylabel('Percentage',fontsize=14) - plt.title(title,fontsize=14) - ax.legend() - plt.ylim([0, ylim]) - ax.grid(axis='y',linewidth=0.2) - - if flag=='c': - plt.savefig('c_hist_red.png',dpi=300) - - if flag=='t': - plt.savefig('t_hist_red.png',dpi=300) - -################################################################################################################# - -def logo_seq_red(merge, flag): - - if flag=="c": - titlos="Control group (Redundant)" - file_logo="c_logo.png" - file_bar="c_bar.png" - if flag=="t": - titlos="Treated group (Redundant)" - file_logo="t_logo.png" - file_bar="t_bar.png" - - c_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge] - - A=[0]*5 - C=[0]*5 - G=[0]*5 - T=[0]*5 - total_reads=0 - - for y in c_samples: - if "/" in y[0]: - length=[] - for x in y[0].split("/"): - length.append([len(x.split("_")[-1]),x.split("_")[-1],y[2]]) - - best=min(length) - total_reads+=best[2] - for i in range(5): - if i -# -# So, for example, if you had hg18 indexes stored in: -# -# /depot/data2/galaxy/hg19/bowtie2/ -# -# containing hg19 genome and hg19.*.bt2 files, such as: -# -rw-rw-r-- 1 james james 914M Feb 10 18:56 hg19canon.fa -# -rw-rw-r-- 1 james james 914M Feb 10 18:56 hg19canon.1.bt2 -# -rw-rw-r-- 1 james james 683M Feb 10 18:56 hg19canon.2.bt2 -# -rw-rw-r-- 1 james james 3.3K Feb 10 16:54 hg19canon.3.bt2 -# -rw-rw-r-- 1 james james 683M Feb 10 16:54 hg19canon.4.bt2 -# -rw-rw-r-- 1 james james 914M Feb 10 20:45 hg19canon.rev.1.bt2 -# -rw-rw-r-- 1 james james 683M Feb 10 20:45 hg19canon.rev.2.bt2 -# -# then the bowtie2_indices.loc entry could look like this: -# -# -Hsa Human (Homo sapiens) -# -# -# -# -# diff -r d2eea02053a0 -r d9b2a7df9d4b mirgene_functions.py --- a/mirgene_functions.py Wed Oct 28 08:13:30 2020 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,749 +0,0 @@ -import itertools -import time -import sys -import os -import urllib.request -import gzip -from multiprocessing import Process, Queue, Lock, Pool, Manager, Value -import subprocess -import argparse -from collections import OrderedDict -from matplotlib.backends.backend_pdf import PdfPages -import pandas as pd -from math import pi -import numpy as np -import matplotlib.pyplot as plt -from matplotlib.ticker import PercentFormatter -import seaborn as sns -import scipy.stats as stats -from plotnine import * -import math -import re -import matplotlib.ticker as mtick -import copy - - - - -"""---------------------- Simple Functions -----------------------""" - -# Read a file and return it as a list -def read(path, flag): - if flag == 0: - with open(path) as fp: - file=fp.readlines() - fp.close() - return file - - if flag == 1: - with open(path) as fp: - file = fp.read().splitlines() - fp.close() - return file - -# Write a list to a txt file -def write(path, list): - with open(path,'w') as fp: - for x in list: - fp.write(str("\t".join(x[1:-1]))) - fp.close() - -"""---------------------- RNA-seq Functions ----------------------""" - -# Detect the longest common substring sequence between two mirnas -def longestSubstring(str1, str2): - - from difflib import SequenceMatcher - # initialize SequenceMatcher object with - # input string - seqMatch = SequenceMatcher(None, str1, str2) - - # find match of longest sub-string - # output will be like Match(a=0, b=0, size=5) - match = seqMatch.find_longest_match(0, len(str1), 0, len(str2)) - - # print longest substring - if (match.size != 0): - return str1[match.a: match.a + match.size] - else: - print('No longest common sub-string found') - - - -######################################################################################################################################################## -def collapse_sam(path): - - ini_sam=read(path,0) - main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]] - intro_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" in x.split("\t")[0]] - - uni_seq = [] - for x in main_sam: - - if [x[2], x[9]] not in uni_seq: - uni_seq.append([x[2], x[9]]) - - new_main_sam=[] - incr_num=0 - for i in range(len(uni_seq)): - count=0 - incr_num+=1 - for y in main_sam: - if uni_seq[i][1]==y[9] and uni_seq[i][0]==y[2]: - count+=1 - temp=y - temp[10]="~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" - temp[0]=str(incr_num)+"-"+str(count) - new_main_sam.append(temp) - - new_sam=intro_sam+new_main_sam - - return new_sam - -################################################################################################################################################################# - -def sam(mature_mirnas,path,name,con,l,samples,data,names,unmap_seq,samples_mirna_names,deseq,LHE_names,umi,ini_sample,unmap_counts): - - # read the sam file - - ini_sam=read(path,0) - new_main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]] - unique_seq = [x for x in new_main_sam if x[1] == '0' and len(x[9])>=18 and len(x[9])<=26] - - # Calculate the shifted positions for every isomir and add them to the name of it - sorted_uni_arms = [] - - for i in range(len(mature_mirnas)): - tmp_count_reads = 0 # calculate the total number of reads - tmp_count_seq = 0 # calculate the total number of sequences - for j in range(len(unique_seq)): - - if mature_mirnas[i] == unique_seq[j][2]: - - temp_mature = mature_mirnas[i+1] - off_part = longestSubstring(temp_mature, unique_seq[j][9]) - - mat_diff = temp_mature.split(off_part) - mat_diff = [len(mat_diff[0]), len(mat_diff[1])] - - unique_diff = unique_seq[j][9].split(off_part) - unique_diff = [len(unique_diff[0]), len(unique_diff[1])] - - # Problem with hsa-miR-8485 - if mat_diff[1]!=0 and unique_diff[1]!=0: - unique_seq[j]=1 - pre_pos = 0 - post_pos = 0 - - elif mat_diff[0]!=0 and unique_diff[0]!=0: - unique_seq[j]=1 - pre_pos = 0 - post_pos = 0 - - else: - pre_pos = mat_diff[0]-unique_diff[0] - post_pos = unique_diff[1]-mat_diff[1] - tmp_count_reads = tmp_count_reads + int(unique_seq[j][0].split("-")[1]) - tmp_count_seq = tmp_count_seq+1 - - - if pre_pos != 0 or post_pos != 0: - if pre_pos == 0: - unique_seq[j][2] = unique_seq[j][2] + "_" +str(pre_pos) + "_" + '{:+d}'.format(post_pos) - elif post_pos == 0: - unique_seq[j][2] = unique_seq[j][2] + "_" + '{:+d}'.format(pre_pos) + "_" + str(post_pos) - else: - unique_seq[j][2] = unique_seq[j][2]+"_"+'{:+d}'.format(pre_pos)+"_"+'{:+d}'.format(post_pos) - - for x in range(unique_seq.count(1)): - unique_seq.remove(1) - if tmp_count_reads != 0 and tmp_count_seq != 0: - sorted_uni_arms.append([mature_mirnas[i], tmp_count_seq, tmp_count_reads]) - - # Store name of arms, number of sequences and number of reads - sorted_uni_arms = sorted(sorted_uni_arms, key=lambda x: x[1], reverse=True) - - for y in sorted_uni_arms: - counts=0 - seqs=0 - for x in unique_seq: - if y[0]==x[2].split("_")[0]+"_"+x[2].split("_")[1]: - counts+=int(x[0].split("-")[1]) - seqs+=1 - - y[1]=seqs - y[2]=counts - - LHE=[] - - l.acquire() - if con=="c": - LHE.extend(z[2] for z in unique_seq) - for y in unique_seq: - samples_mirna_names.append([y[2],y[9]]) - deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in unique_seq]) - LHE_names.extend(LHE) - unmap_seq.value += sum([1 for x in new_main_sam if x[1] == '4']) - unmap_counts.value += sum([int(x[0].split("-")[1]) for x in new_main_sam if x[1] == '4']) - names.append(name) - samples.append(unique_seq) - data.append([con,name,unique_seq,sorted_uni_arms]) - ini_sample.append(new_main_sam) - - if con=="t": - LHE.extend(z[2] for z in unique_seq) - for y in unique_seq: - samples_mirna_names.append([y[2],y[9]]) - deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in unique_seq]) - LHE_names.extend(LHE) - unmap_seq.value += sum([1 for x in new_main_sam if x[1] == '4']) - unmap_counts.value += sum([int(x[0].split("-")[1]) for x in new_main_sam if x[1] == '4']) - names.append(name) - samples.append(unique_seq) - data.append([con,name,unique_seq,sorted_uni_arms]) - ini_sample.append(new_main_sam) - l.release() - - -###################################################################################################################################### -""" -Read a sam file from Bowtie and do the followings: - -1) Remove reverse stranded mapped reads -2) Remove unmapped reads -3) Remove all sequences with reads less than 11 reads -4) Sort the arms with the most sequences in decreading rate -5) Sort the sequences of every arm with the most reads in decreasing rate -6) Calculate total number of sequences of every arm -7) Calculate total number of reads of sequences of every arm. -8) Store all the informations in a txt file - -""" - -def non_sam(mature_mirnas,path,name,con,l,data,names,n_deseq,n_samples_mirna_names,n_LHE_names): - - - ini_sam=read(path,0) - new_main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]] - unique_seq=[] - unique_seq = [x for x in new_main_sam if x[1] == '4' and len(x[9])>=18 and len(x[9])<=26] - uni_seq=[] - - # Calculate the shifted positions for every isomir and add them to the name of it - sorted_uni_arms = [] - for i in range(1,len(mature_mirnas),2): - tmp_count_reads = 0 # calculate the total number of reads - tmp_count_seq = 0 # calculate the total number of sequences - - for j in range(len(unique_seq)): - - temp_mature = mature_mirnas[i].strip().replace("U", "T") - - if temp_mature in unique_seq[j][9]: - - off_part = longestSubstring(temp_mature, unique_seq[j][9]) - - mat_diff = temp_mature.split(off_part) - mat_diff = [len(mat_diff[0]), len(mat_diff[1])] - - unique_diff = unique_seq[j][9].split(off_part) - if len(unique_diff)<=2: - unique_diff = [len(unique_diff[0]), len(unique_diff[1])] - - pre_pos = mat_diff[0]-unique_diff[0] - post_pos = unique_diff[1]-mat_diff[1] - - lengthofmir = len(off_part) + post_pos - if pre_pos == 0: - tmp_count_reads = tmp_count_reads + int(unique_seq[j][0].split("-")[1]) - tmp_count_seq = tmp_count_seq + 1 - - if pre_pos == 0: - t_name=copy.deepcopy(unique_seq[j]) - t_name[2]=mature_mirnas[i - 1] + "__" + str(pre_pos) + "_" + '{:+d}'.format(post_pos) + "_" + str(unique_seq[j][9][len(off_part):]) - uni_seq.append(t_name) - - if tmp_count_reads != 0 and tmp_count_seq != 0: - sorted_uni_arms.append([mature_mirnas[i-1], tmp_count_seq, tmp_count_reads]) - - - sorted_uni_arms = sorted(sorted_uni_arms, key=lambda x: x[1], reverse=True) - unique_seq = list(map(list, OrderedDict.fromkeys(map(tuple,uni_seq)))) - - LHE=[] - - l.acquire() - if con=="c": - LHE.extend(x[2] for x in unique_seq if x[2]!="*") - for x in unique_seq: - if x[2]!="*": - n_samples_mirna_names.append([x[2],x[9]]) - n_deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in unique_seq if x[2]!="*"]) - n_LHE_names.extend(LHE) - names.append(name) - data.append([con,name,unique_seq,sorted_uni_arms]) - - - if con=="t": - LHE.extend(x[2] for x in unique_seq if x[2]!="*") - for x in unique_seq: - if x[2]!="*": - n_samples_mirna_names.append([x[2],x[9]]) - n_deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in unique_seq if x[2]!="*"]) - n_LHE_names.extend(LHE) - names.append(name) - data.append([con,name,unique_seq,sorted_uni_arms]) - l.release() - -################################################################################################################################################################################################################# - -def deseq2_temp(samples_mirna_names,deseq,con,l): - - samples_mirna_names.sort(key=lambda x:[0]) - - for i in range(len(deseq)): - for y in samples_mirna_names: - flag = 0 - for x in deseq[i]: - if y[0] == x[0]: - flag = 1 - break - - if flag == 0: - deseq[i].append([y[0], "0", y[1]]) - - [deseq[i].sort(key=lambda x: x[0]) for i, _ in enumerate(deseq)] - deseq_final = [[x[0],x[2]] for x in deseq[0]] - [deseq_final[z].append(deseq[i][j][1]) for z,_ in enumerate(deseq_final) for i, _ in enumerate(deseq) for j,_ in enumerate(deseq[i]) if deseq_final[z][0] == deseq[i][j][0]] - - l.acquire() - if con=="c": - q1.put(deseq_final) - - if con=="t": - q2.put(deseq_final) - l.release() - - -################################################################################################################################################################################################################## - -def main_temp(LH2E, LH2E_names, LH8E, LH8E_names,flag,names_con,names_tre,filter_LH8E,filter_LH2E,raw_LH8E,raw_LH2E,per,count): - - LH8E_add_names = [x for x in LH2E_names if x not in LH8E_names] - LH2E_add_names = [x for x in LH8E_names if x not in LH2E_names] - - LH8E_add_names.sort() - LH2E_add_names.sort() - LH8E_add_names = list(LH8E_add_names for LH8E_add_names,_ in itertools.groupby(LH8E_add_names)) - LH2E_add_names = list(LH2E_add_names for LH2E_add_names,_ in itertools.groupby(LH2E_add_names)) - - LH2E.sort() - LH8E.sort() - LH2E = list(LH2E for LH2E,_ in itertools.groupby(LH2E)) - LH8E = list(LH8E for LH8E,_ in itertools.groupby(LH8E)) - - zeros=["0"]*(len(LH8E[0])-2) - [LH8E_add_names[i].extend(zeros) for i,_ in enumerate(LH8E_add_names)] - LH8E=LH8E+LH8E_add_names - - zeros=["0"]*(len(LH2E[0])-2) - [LH2E_add_names[i].extend(zeros) for i,_ in enumerate(LH2E_add_names)] - LH2E=LH2E+LH2E_add_names - - dupes=[] - final_LH2E =[] - - for num,_ in enumerate(LH2E): - - if LH2E[num][1] not in final_LH2E and LH2E[num][0] not in final_LH2E: - final_LH2E.append(LH2E[num][1]) - final_LH2E.append(LH2E[num][0]) - else: - dupes.append(LH2E[num][1]) - - dupes=list(set(dupes)) - - dupes=[[x] for x in dupes] - - for x in LH2E: - for y in dupes: - if x[1]==y[0]: - fl=0 - if len(y)==1: - y.append(x[0]) - else: - for i in range(1,len(y)): - if y[i].split("_")[0]+"_"+y[i].split("_")[1]==x[0].split("_")[0]+"_"+x[0].split("_")[1]: - fl=1 - if len(x[0])2: - for i in range(len(y)-1,1,-1): - y[1]=y[1]+"/"+y[i] - del y[i] - - - for x in LH2E: - for y in dupes: - if x[1]==y[0]: - x[0]=y[1] - - for x in LH8E: - for y in dupes: - if x[1]==y[0]: - x[0]=y[1] - - - - LH2E.sort() - LH2E=list(LH2E for LH2E,_ in itertools.groupby(LH2E)) - - LH8E.sort() - LH8E=list(LH8E for LH8E,_ in itertools.groupby(LH8E)) - - if int(per)!=-1: - percent=int(per)/100 - - c_col_filter=round(percent*(len(LH2E[1])-2)) - t_col_filter=round(percent*(len(LH8E[1])-2)) - - for i, _ in enumerate(LH2E): - c_cols=0 - t_cols=0 - - c_cols=sum([1 for j in range(len(LH2E[i])-2) if int(LH2E[i][j+2])>=int(count)]) - t_cols=sum([1 for j in range(len(LH8E[i])-2) if int(LH8E[i][j+2])>=int(count)]) - - if c_cols>=c_col_filter or t_cols>=t_col_filter: - filter_LH8E.append(LH8E[i]) - filter_LH2E.append(LH2E[i]) - - raw_LH2E.extend(LH2E) - raw_LH8E.extend(LH8E) - -################################################################################################################################################################################################################## - -def write_main(raw_LH2E, raw_LH8E, fil_LH2E, fil_LH8E, names_con, names_tre, flag, per, n1, n2): - - if flag == 1 and int(per)!=-1: - fp = open('Counts/Filtered '+n2 +' Templated Counts', 'w') - fp.write("Name\t") - fp.write("Sequence") - for y in names_tre: - fp.write("\t"+y) - - for x in fil_LH8E: - fp.write("\n%s" % "\t".join(x)) - fp.close() - - fp = open('Counts/Filtered '+n1+' Templated Counts', 'w') - fp.write("Name\t") - fp.write("Sequence") - for y in names_con: - fp.write("\t"+y) - - for x in fil_LH2E: - fp.write("\n%s" % "\t".join(x)) - fp.close() - - - if flag == 2 and int(per)!=-1: - fp = open('Counts/Filtered '+n2+' Non-Templated Counts', 'w') - fp.write("Name\t") - fp.write("Sequence") - for y in names_tre: - fp.write("\t"+y) - - - for x in fil_LH8E: - fp.write("\n%s" % "\t".join(x)) - fp.close() - - fp = open('Counts/Filtered '+n1+' Non-Templated Counts', 'w') - fp.write("Name\t") - fp.write("Sequence") - for y in names_con: - fp.write("\t"+y) - - for x in fil_LH2E: - fp.write("\n%s" % "\t".join(x)) - fp.close() - - - if flag == 1: - fp = open('Counts/Raw '+n2+' Templated Counts', 'w') - fp.write("Name\t") - fp.write("Sequence") - for y in names_tre: - fp.write("\t"+y) - - for x in raw_LH8E: - fp.write("\n%s" % "\t".join(x)) - fp.close() - - fp = open('Counts/Raw '+n1+' Templated Counts', 'w') - fp.write("Name\t") - fp.write("Sequence") - for y in names_con: - fp.write("\t"+y) - - for x in raw_LH2E: - fp.write("\n%s" % "\t".join(x)) - fp.close() - - if flag == 2: - fp = open('Counts/Raw '+n2+' Non-Templated Counts', 'w') - fp.write("Name\t") - fp.write("Sequence") - for y in names_tre: - fp.write("\t"+y) - - - for x in raw_LH8E: - fp.write("\n%s" % "\t".join(x)) - fp.close() - - fp = open('Counts/Raw '+n1+' Non-Templated Counts', 'w') - fp.write("Name\t") - fp.write("Sequence") - for y in names_con: - fp.write("\t"+y) - - for x in raw_LH2E: - fp.write("\n%s" % "\t".join(x)) - fp.close() - -#################################################################################################################################################################################################################### - -def ssamples(names,samp,folder,pro): - - for i in range(2,len(samp[0])): - - fp = open(folder+names[i-2]+'.txt','w') - fp.write("miRNA id"+"\t"+names[i-2]+"\n") - - for x in samp: - fp.write("%s" % "\t".join([x[0],x[i]])+"\n") - fp.close() - -#################################################################################################################################################################################################################### - -def DB_write(con,name,unique_seq,sorted_uni_arms,f): - - if f==1: - # Write a txt file with all the information - if con=="c": - fp = open('split1/'+name, 'w') - - fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence")) - if con=="t": - fp = open('split2/'+name, 'w') - fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence")) - - for i in range(len(sorted_uni_arms)): - temp = [] - for j in range(len(unique_seq)): - - if sorted_uni_arms[i][0] in (unique_seq[j][2].split("_")[0]+"_"+unique_seq[j][2].split("_")[1]): - - temp.append(unique_seq[j]) - - temp = sorted(temp, key=lambda x: int(x[0].split('-')[1]), reverse=True) - fp.write("*********************************************************************************************************\n") - fp.write("%-8s\t%-22s\t%-25s\t%-30s\t%s\n" % ("|",str(sorted_uni_arms[i][0]),"Sequence count = "+str(sorted_uni_arms[i][1]),"Total reads = "+str(sorted_uni_arms[i][2]),"|")) - fp.write("*********************************************************************************************************\n\n") - [fp.write("%-8s\t%-40s\t%s\n" % (x[0].split("-")[1], x[2],x[9])) for x in temp] - fp.write("\n" + "\n") - fp.close() - - if f==2: - - if con=="c": - fp = open('split3/'+name, 'w') - fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence")) - if con=="t": - fp = open('split4/'+name, 'w') - fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence")) - - for i in range(len(sorted_uni_arms)): - temp = [] - for j in range(len(unique_seq)): - if sorted_uni_arms[i][0]==unique_seq[j][2].split("__")[0]: - temp.append(unique_seq[j]) - if temp!=[]: - temp = sorted(temp, key=lambda x: int(x[0].split('-')[1]), reverse=True) - fp.write("*********************************************************************************************************\n") - fp.write("%-8s\t%-22s\t%-25s\t%-30s\t%s\n" % ("|",str(sorted_uni_arms[i][0]),"Sequence count = "+str(sorted_uni_arms[i][1]),"Total reads = "+str(sorted_uni_arms[i][2]),"|")) - fp.write("*********************************************************************************************************\n\n") - [fp.write("%-8s\t%-40s\t%s\n" % (x[0].split("-")[1], x[2],x[9])) for x in temp] - fp.write("\n" + "\n") - fp.close() - - -########################################################################################################################## - -def new_mat_seq(pre_unique_seq,mat_mirnas,l): - - unique_iso = [] - for x in pre_unique_seq: - if len(x[2].split("_"))==3: - for y in pre_unique_seq: - if x[2] in y[2] and int(x[0].split("-")[1])2: - for i in range(len(y)-1,1,-1): - y[1]=y[1]+"/"+y[i] - del y[i] - - - for x in LH2E_copy: - for y in dupes: - if x[1]==y[0]: - x[0]=y[1] - - LH2E_copy.sort() - LH2E_copy=list(LH2E_copy for LH2E_copy,_ in itertools.groupby(LH2E_copy)) - new.extend(LH2E_copy) - -###################################################################################################################################################### - -def ssamples1(tem_names,tem_samp,non_names,non_samp,folder,pro): - - for i in range(2,len(tem_samp[0])): - - fp = open(folder+tem_names[i-2]+'.txt','w') - fp.write("miRNA id"+"\t"+tem_names[i-2]+"\n") - - for x in tem_samp: - fp.write("%s" % "\t".join([x[0],x[i]])+"\n") - - for j in range(len(non_names)): - if non_names[j]==tem_names[i-2]: - for x in non_samp: - fp.write("%s" % "\t".join([x[0],x[j+2]])+"\n") - fp.close() - -################################################################################################################################################################################################################# - -def download_matures(matures,org_name): - - mature_mir=[] - - mat_url = 'http://mirgenedb.org/fasta/'+org_name+'?mat=1' - star_url = 'http://mirgenedb.org/fasta/'+org_name+'?star=1' - - data = urllib.request.urlopen(mat_url).read() - file_mirna = data.decode('utf-8') - mature_mir = file_mirna.split("\n") - mature_mir = [x.replace(">","") for x in mature_mir] - del mature_mir[-1] - - data = urllib.request.urlopen(star_url).read() - file_mirna = data.decode('utf-8') - star_mir = file_mirna.split("\n") - star_mir = [x.replace(">","") for x in star_mir] - del star_mir[-1] - - mature_mir.extend(star_mir) - - for i in range(1,len(mature_mir),2): - mature_mir[i]=mature_mir[i].replace("U","T") - - matures.extend(mature_mir) - -################################################################################################################### - -def non_template_ref(sc,st,all_isoforms): - - pre_uni_seq_con = list(sc) - pre_uni_seq_tre = list(st) - - for x in pre_uni_seq_con: - for y in x: - if y[2] not in all_isoforms and len(y[2].split("_"))>2: - all_isoforms.append(y[2]) - all_isoforms.append(y[9]) - - for x in pre_uni_seq_tre: - for y in x: - if y[2] not in all_isoforms and len(y[2].split("_"))>2: - all_isoforms.append(y[2]) - all_isoforms.append(y[9]) - -################################################################################################################################################################################################ - -def deseqe2(sample,mir_names,l,new_d,sample_name,sample_order): - - for y in mir_names: - flag=0 - for x in sample: - if y[0]==x[0]: - flag=1 - break - if flag==0: - sample.append([y[0],"0",y[1]]) - - sample.sort(key=lambda x: x[0]) - sample=list(sample for sample,_ in itertools.groupby(sample)) - - l.acquire() - new_d.append(sample) - sample_order.append(sample_name) - l.release() - -############################################################################################################################################################################################### - diff -r d2eea02053a0 -r d9b2a7df9d4b mirgene_graphs.py --- a/mirgene_graphs.py Wed Oct 28 08:13:30 2020 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,847 +0,0 @@ -import itertools -import time -import sys -import os -import urllib.request -import gzip -from multiprocessing import Process, Queue, Lock, Pool, Manager, Value -import subprocess -import argparse -from collections import OrderedDict -from matplotlib.backends.backend_pdf import PdfPages -import pandas as pd -from math import pi -import numpy as np -import matplotlib.pyplot as plt -from matplotlib.ticker import PercentFormatter -import seaborn as sns -import scipy.stats as stats -from plotnine import * -import math -import re -import matplotlib.ticker as mtick -import copy - - -######################################################################################################################### -def pie_non_temp(merge_LH2E,merge_non_LH2E,merge_LH8E,merge_non_LH8E,c_unmap,t_unmap,c_unmap_counts,t_unmap_counts): - - c_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_LH2E] - t_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_LH8E] - c_non_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_non_LH2E] - t_non_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_non_LH8E] - - c_templ = 0 - c_tem_counts = 0 - c_mature = 0 - c_mat_counts = 0 - t_templ = 0 - t_tem_counts = 0 - t_mature = 0 - t_mat_counts = 0 - - c_non = len(c_non_samples) - c_non_counts = sum(x[2] for x in c_non_samples) - t_non = len(t_non_samples) - t_non_counts = sum(x[2] for x in t_non_samples) - - c_unmap = c_unmap - c_non - t_unmap = c_unmap - t_non - - c_unmap_counts=c_unmap_counts - c_non_counts - t_unmap_counts=t_unmap_counts - t_non_counts - - - for x in c_samples: - - if "/" not in x[0]: - if len(x[0].split("_"))==2: - c_mature+=1 - c_mat_counts += x[2] - else: - c_templ+=1 - c_tem_counts += x[2] - else: - f=0 - for y in x[0].split("/"): - if len(y.split("_"))==2: - c_mature+=1 - c_mat_counts += x[2] - f=1 - break - if f==0: - c_templ+=1 - c_tem_counts += x[2] - - for x in t_samples: - - if "/" not in x[0]: - if len(x[0].split("_"))==2: - t_mature+=1 - t_mat_counts += x[2] - else: - t_templ+=1 - t_tem_counts += x[2] - else: - f=0 - for y in x[0].split("/"): - if len(y.split("_"))==2: - t_mature+=1 - t_mat_counts += x[2] - f=1 - break - if f==0: - t_templ+=1 - t_tem_counts += x[2] - - fig = plt.figure(figsize=(7,5)) - - labels = 'miRNA RefSeq','Template', 'Unmapped','Non-template' - sizes = [c_mat_counts, c_tem_counts, c_unmap_counts,c_non_counts] - colors = ['gold', 'yellowgreen', 'lightcoral', 'lightskyblue'] - # Plot - ax1 = plt.subplot2grid((1,2),(0,0)) - patches, texts, autotexts=plt.pie(sizes, labels=labels, colors=colors, startangle=140,autopct='%1.1f%%',radius=0.8) - [x.set_fontsize(8) for x in texts] - plt.title('Control Group (reads)',fontsize=12) - - labels = 'miRNA RefSeq','Template', 'Unmapped','non-template' - sizes = [t_mat_counts, t_tem_counts, t_unmap_counts, t_non_counts] - colors = ['gold', 'yellowgreen', 'lightcoral', 'lightskyblue'] - # Plot - ax2 = plt.subplot2grid((1,2),(0,1)) - patches, texts, autotexts=plt.pie(sizes, labels=labels, colors=colors, startangle=140,autopct='%1.1f%%',radius=0.8) - [x.set_fontsize(8) for x in texts] - plt.title('Treated Group (reads)', fontsize=12) - plt.savefig('pie_non.png',dpi=300) - -#################################################################################################################################################################################################################### - -def pie_temp(merge_LH2E,c_unmap,c_unmap_counts,merge_LH8E,t_unmap,t_unmap_counts): - - c_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_LH2E] - t_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_LH8E] - - c_templ = 0 - c_tem_counts = 0 - c_mature = 0 - c_mat_counts = 0 - t_templ = 0 - t_tem_counts = 0 - t_mature = 0 - t_mat_counts = 0 - - for x in c_samples: - - if "/" not in x[0]: - if len(x[0].split("_"))==2: - c_mature+=1 - c_mat_counts += x[2] - else: - c_templ+=1 - c_tem_counts += x[2] - else: - f=0 - for y in x[0].split("/"): - if len(y.split("_"))==2: - c_mature+=1 - c_mat_counts += x[2] - f=1 - break - if f==0: - c_templ+=1 - c_tem_counts += x[2] - - for x in t_samples: - - if "/" not in x[0]: - if len(x[0].split("_"))==2: - t_mature+=1 - t_mat_counts += x[2] - else: - t_templ+=1 - t_tem_counts += x[2] - else: - f=0 - for y in x[0].split("/"): - if len(y.split("_"))==2: - t_mature+=1 - t_mat_counts += x[2] - f=1 - break - if f==0: - t_templ+=1 - t_tem_counts += x[2] - - - fig = plt.figure() - labels = 'miRNA RefSeq','Template', 'Unmapped' - sizes = [c_mat_counts, c_tem_counts, c_unmap_counts] - colors = ['gold', 'yellowgreen', 'lightskyblue'] - explode = (0.2, 0.05, 0.1) - # Plot - ax1 = plt.subplot2grid((1,2),(0,0)) - patches, texts, autotexts=plt.pie(sizes, labels=labels, colors=colors, startangle=140,autopct='%1.1f%%',radius=0.8) - [x.set_fontsize(8) for x in texts] - plt.title('Control group (reads)', fontsize=12) - labels = 'miRNA RefSeq','Template', 'Unmapped' - sizes = [t_mat_counts, t_tem_counts, t_unmap_counts] - colors = ['gold', 'yellowgreen', 'lightskyblue'] - explode = (0.2, 0.05, 0.1) - # Plot - ax2 = plt.subplot2grid((1,2),(0,1)) - patches, texts, autotexts=plt.pie(sizes, labels=labels, colors=colors, startangle=140,autopct='%1.1f%%',radius=0.8) - [x.set_fontsize(8) for x in texts] - plt.title('Treated group (reads)',fontsize = 12) - plt.savefig('pie_tem.png',dpi=300) - -################################################################################################################################################################################################################### - -def make_spider(merge_LH2E,merge_LH8E): - - c_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_LH2E] - t_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_LH8E] - - c_5 = 0 - c_5_counts = 0 - c_3 = 0 - c_3_counts = 0 - c_both =0 - c_both_counts=0 - c_mature = 0 - c_mat_counts = 0 - c_exception=0 - c_exception_counts=0 - - - t_5 = 0 - t_5_counts = 0 - t_3 = 0 - t_3_counts = 0 - t_both = 0 - t_both_counts = 0 - t_mature = 0 - t_mat_counts = 0 - t_exception = 0 - t_exception_counts=0 - - for x in c_samples: - - if "/" not in x[0]: - if len(x[0].split("_"))==2: - c_mature+=1 - c_mat_counts += x[2] - elif 0 == int(x[0].split("_")[3]): - c_5+=1 - c_5_counts += x[2] - elif 0 == int(x[0].split("_")[2]): - c_3+=1 - c_3_counts += x[2] - else: - c_both+=1 - c_both_counts+=x[2] - - else: - f=0 - for y in x[0].split("/"): - if len(y.split("_"))==2: - c_mature+=1 - c_mat_counts += x[2] - f=1 - break - if f==0: - part=x[0].split("/")[0].split("_")[-2]+"_"+x[0].split("/")[0].split("_")[-1] - num=0 - for y in x[0].split("/"): - if y.split("_")[-2]+"_"+y.split("_")[-1]==part: - num+=1 - if num==len(x[0].split("/")): - - if 0 == int(x[0].split("/")[0].split("_")[3]): - c_5+=1 - c_5_counts += x[2] - elif 0 == int(x[0].split("/")[0].split("_")[2]): - c_3+=1 - c_3_counts += x[2] - else: - c_both+=1 - c_both_counts+=x[2] - else: - c_exception+=1 - c_exception_counts += x[2] - - - for x in t_samples: - - if "/" not in x[0]: - if len(x[0].split("_"))==2: - t_mature+=1 - t_mat_counts += x[2] - elif 0 == int(x[0].split("_")[3]): - t_5+=1 - t_5_counts += x[2] - elif 0 == int(x[0].split("_")[2]): - t_3+=1 - t_3_counts += x[2] - else: - t_both+=1 - t_both_counts+=x[2] - - else: - f=0 - for y in x[0].split("/"): - if len(y.split("_"))==2: - t_mature+=1 - t_mat_counts += x[2] - f=1 - break - if f==0: - part=x[0].split("/")[0].split("_")[-2]+"_"+x[0].split("/")[0].split("_")[-1] - num=0 - for y in x[0].split("/"): - if y.split("_")[-2]+"_"+y.split("_")[-1]==part: - num+=1 - if num==len(x[0].split("/")): - - if 0 == int(x[0].split("/")[0].split("_")[3]): - t_5+=1 - t_5_counts += x[2] - elif 0 == int(x[0].split("/")[0].split("_")[2]): - t_3+=1 - t_3_counts += x[2] - else: - t_both+=1 - t_both_counts+=x[2] - else: - t_exception+=1 - t_exception_counts += x[2] - - - c_all = c_5+c_3+c_both+c_mature+c_exception - c_all_counts = c_5_counts + c_3_counts + c_both_counts + c_mat_counts + c_exception_counts - - t_all = t_5+t_3+t_both+t_mature + t_exception - t_all_counts = t_5_counts + t_3_counts + t_both_counts + t_mat_counts + t_exception_counts - - c_5 = round(c_5/c_all*100,2) - c_3 = round(c_3/c_all*100,2) - c_both = round(c_both/c_all*100,2) - c_mature = round(c_mature/c_all*100,2) - c_exception = round(c_exception/c_all*100,2) - - c_5_counts = round(c_5_counts/c_all_counts*100,2) - c_3_counts = round(c_3_counts/c_all_counts*100,2) - c_both_counts = round(c_both_counts/c_all_counts*100,2) - c_mat_counts = round(c_mat_counts/c_all_counts*100,2) - c_exception_counts = round(c_exception_counts/c_all_counts*100,2) - - t_5 = round(t_5/t_all*100,2) - t_3 = round(t_3/t_all*100,2) - t_both = round(t_both/t_all*100,2) - t_mature = round(t_mature/t_all*100,2) - t_exception = round(t_exception/t_all*100,2) - - t_5_counts = round(t_5_counts/t_all_counts*100,2) - t_3_counts = round(t_3_counts/t_all_counts*100,2) - t_both_counts = round(t_both_counts/t_all_counts*100,2) - t_mat_counts = round(t_mat_counts/t_all_counts*100,2) - t_exception_counts = round(t_exception_counts/t_all_counts*100,2) - - radar_max = max(c_5, c_3, c_both,c_mature,c_exception,t_5,t_3,t_both,t_mature,t_exception) - radar_max_counts = max(c_5_counts,c_3_counts,c_both_counts,c_mat_counts,c_exception_counts,t_5_counts,t_3_counts,t_both_counts,t_mat_counts,t_exception_counts) - - df=pd.DataFrame({ - 'group':['Controls','Treated'], - """5' and 3' isomiRs""":[c_both,t_both], - """3' isomiRs""":[c_3,t_3], - 'miRNA RefSeq':[c_mature,t_mature], - """5' isomiRs""":[c_5,t_5], - 'Others*':[c_exception,t_exception]}) - - df1=pd.DataFrame({ - 'group':['Controls','Treated'], - """5' and 3' isomiRs""":[c_both_counts,t_both_counts], - """3' isomiRs""":[c_3_counts,t_3_counts], - 'miRNA RefSeq':[c_mat_counts,t_mat_counts], - """5' isomiRs""":[c_5_counts,t_5_counts], - 'Others*':[c_exception_counts,t_exception_counts]}) - - spider_last(df,radar_max,1) - spider_last(df1,radar_max_counts,2) - - -############################################################################################################################################################################################################## - -def spider_last(df,radar_max,flag): - # ------- PART 1: Create background - fig = plt.figure() - # number of variable - categories=list(df)[1:] - N = len(categories) - - # What will be the angle of each axis in the plot? (we divide the plot / number of variable) - angles = [n / float(N) * 2 * pi for n in range(N)] - angles += angles[:1] - - # Initialise the spider plot - ax = plt.subplot(111, polar=True) - - # If you want the first axis to be on top: - ax.set_theta_offset(pi/2) - ax.set_theta_direction(-1) - - # Draw one axe per variable + add labels labels yet - plt.xticks(angles[:-1], categories, fontsize=11) - - # Draw ylabels - radar_max=round(radar_max+radar_max*0.1) - mul=len(str(radar_max))-1 - maxi=int(math.ceil(radar_max / pow(10,mul))) * pow(10,mul) - sep = round(maxi/4) - plt.yticks([sep, 2*sep, 3*sep, 4*sep, 5*sep], [str(sep)+'%', str(2*sep)+'%', str(3*sep)+'%', str(4*sep)+'%', str(5*sep)+'%'], color="grey", size=10) - plt.ylim(0, maxi) - - # ------- PART 2: Add plots - - # Plot each individual = each line of the data - # I don't do a loop, because plotting more than 3 groups makes the chart unreadable - - # Ind1 - values=df.loc[0].drop('group').values.flatten().tolist() - values += values[:1] - ax.plot(angles, values,'-o', linewidth=1, linestyle='solid', label="Controls") - ax.fill(angles, values, 'b', alpha=0.1) - - # Ind2 - values=df.loc[1].drop('group').values.flatten().tolist() - values += values[:1] - ax.plot(angles, values, '-o' ,linewidth=1, linestyle='solid', label="Treated") - ax.fill(angles, values, 'r', alpha=0.1) - - # Add legend - if flag==1: - plt.legend(loc='upper right', bbox_to_anchor=(0.0, 0.1)) - plt.savefig('spider_non_red.png',dpi=300) - else: - plt.legend(loc='upper right', bbox_to_anchor=(0.0, 0.1)) - plt.savefig('spider_red.png',dpi=300) - -############################################################################################################################################################################################################# - -def hist_red(samples,flag): - - lengths=[] - cat=[] - total_reads=0 - seq=[] - - if flag == "c": - title = "Length Distribution of Control group (Redudant reads)" - if flag == "t": - title = "Length Distribution of Treated group (Redudant reads)" - - for i in samples: - for x in i: - lengths.append(len(x[9])) - if x[1]=="0": - seq.append([x[9],x[0].split("-")[1],"Mapped"]) - cat.append("Mapped") - if x[1] == "4": - seq.append([x[9],x[0].split("-")[1],"Unmapped"]) - cat.append("Unmapped") - - uni_len=list(set(lengths)) - uni_len=[x for x in uni_len if x<=35] - low=min(lengths) - up=max(lengths) - seq.sort() - uni_seq=list(seq for seq,_ in itertools.groupby(seq)) - dim=up-low - - if dim>20: - s=5 - else: - s=8 - total_reads+=sum([int(x[1]) for x in uni_seq]) - - map_reads=[] - unmap_reads=[] - length=[] - for y in uni_len: - map_temp=0 - unmap_temp=0 - for x in uni_seq: - if len(x[0])==y and x[2]=="Mapped": - map_temp+=int(x[1]) - if len(x[0])==y and x[2]=="Unmapped": - unmap_temp+=int(x[1]) - if y<=35: - length.append(y) - map_reads.append(round(map_temp/total_reads*100,2)) - unmap_reads.append(round(unmap_temp/total_reads*100,2)) - - ylim=max([sum(x) for x in zip(unmap_reads, map_reads)]) - ylim=ylim+ylim*20/100 - fig, ax = plt.subplots() - width=0.8 - ax.bar(length, unmap_reads, width, label='Unmapped') - h=ax.bar(length, map_reads, width, bottom=unmap_reads, label='Mapped') - plt.xticks(np.arange(length[0], length[-1]+1, 1)) - plt.xlabel('Length (nt)',fontsize=14) - plt.ylabel('Percentage',fontsize=14) - plt.title(title,fontsize=14) - ax.legend() - plt.ylim([0, ylim]) - ax.grid(axis='y',linewidth=0.2) - - if flag=='c': - plt.savefig('c_hist_red.png',dpi=300) - - if flag=='t': - plt.savefig('t_hist_red.png',dpi=300) - -############################################################################################################################################################################################################################################## - -def logo_seq_red(merge, flag): - - if flag=="c": - titlos="Control group (Redundant)" - file_logo="c_logo.png" - file_bar="c_bar.png" - if flag=="t": - titlos="Treated group (Redundant)" - file_logo="t_logo.png" - file_bar="t_bar.png" - - c_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge] - - A=[0]*5 - C=[0]*5 - G=[0]*5 - T=[0]*5 - total_reads=0 - - for y in c_samples: - if "/" in y[0]: - length=[] - for x in y[0].split("/"): - length.append([len(x.split("_")[-1]),x.split("_")[-1],y[2]]) - - best=min(length) - total_reads+=best[2] - for i in range(5): - if i -# -# So, for example, if you had hg18 indexes stored in: -# -# /depot/data2/galaxy/hg19/bowtie2/ -# -# containing hg19 genome and hg19.*.bt2 files, such as: -# -rw-rw-r-- 1 james james 914M Feb 10 18:56 hg19canon.fa -# -rw-rw-r-- 1 james james 914M Feb 10 18:56 hg19canon.1.bt2 -# -rw-rw-r-- 1 james james 683M Feb 10 18:56 hg19canon.2.bt2 -# -rw-rw-r-- 1 james james 3.3K Feb 10 16:54 hg19canon.3.bt2 -# -rw-rw-r-- 1 james james 683M Feb 10 16:54 hg19canon.4.bt2 -# -rw-rw-r-- 1 james james 914M Feb 10 20:45 hg19canon.rev.1.bt2 -# -rw-rw-r-- 1 james james 683M Feb 10 20:45 hg19canon.rev.2.bt2 -# -# then the bowtie2_indices.loc entry could look like this: -# -Acacia auriculiformis Acacia auriculiformis Acacia auriculiformis -Acacia mangium Acacia mangium Acacia mangium -Acyrthosiphon pisum Acyrthosiphon pisum Acyrthosiphon pisum -Aedes aegypti Aedes aegypti Aedes aegypti -Aegilops tauschii Aegilops tauschii Aegilops tauschii -Alligator mississippiensis Alligator mississippiensis Alligator mississippiensis -Amborella trichopoda Amborella trichopoda Amborella trichopoda -Amphimedon queenslandica Amphimedon queenslandica Amphimedon queenslandica -Anas platyrhynchos Anas platyrhynchos Anas platyrhynchos -Anolis carolinensis Anolis carolinensis Anolis carolinensis -Anopheles gambiae Anopheles gambiae Anopheles gambiae -Apis mellifera Apis mellifera Apis mellifera -Aquilegia caerulea Aquilegia caerulea Aquilegia caerulea -Arabidopsis lyrata Arabidopsis lyrata Arabidopsis lyrata -Arabidopsis thaliana Arabidopsis thaliana Arabidopsis thaliana -Arachis hypogaea Arachis hypogaea Arachis hypogaea -Artibeus jamaicensis Artibeus jamaicensis Artibeus jamaicensis -Ascaris suum Ascaris suum Ascaris suum -Asparagus officinalis Asparagus officinalis Asparagus officinalis -Astatotilapia burtoni Astatotilapia burtoni Astatotilapia burtoni -Ateles geoffroyi Ateles geoffroyi Ateles geoffroyi -Avicennia marina Avicennia marina Avicennia marina -BK polyomavirus BK polyomavirus BK polyomavirus -Bactrocera dorsalis Bactrocera dorsalis Bactrocera dorsalis -Bandicoot papillomatosis Bandicoot papillomatosis Bandicoot papillomatosis -Biston betularia Biston betularia Biston betularia -Bombyx mori Bombyx mori Bombyx mori -Bos taurus Bos taurus Bos taurus -Bovine foamy Bovine foamy Bovine foamy -Bovine herpesvirus Bovine herpesvirus Bovine herpesvirus -Bovine leukemia Bovine leukemia Bovine leukemia -Brachypodium distachyon Brachypodium distachyon Brachypodium distachyon -Branchiostoma belcheri Branchiostoma belcheri Branchiostoma belcheri -Branchiostoma floridae Branchiostoma floridae Branchiostoma floridae -Brassica napus Brassica napus Brassica napus -Brassica oleracea Brassica oleracea Brassica oleracea -Brassica rapa Brassica rapa Brassica rapa -Brugia malayi Brugia malayi Brugia malayi -Bruguiera cylindrica Bruguiera cylindrica Bruguiera cylindrica -Bruguiera gymnorhiza Bruguiera gymnorhiza Bruguiera gymnorhiza -Caenorhabditis brenneri Caenorhabditis brenneri Caenorhabditis brenneri -Caenorhabditis briggsae Caenorhabditis briggsae Caenorhabditis briggsae -Caenorhabditis elegans Caenorhabditis elegans Caenorhabditis elegans -Caenorhabditis remanei Caenorhabditis remanei Caenorhabditis remanei -Callithrix jacchus Callithrix jacchus Callithrix jacchus -Camelina sativa Camelina sativa Camelina sativa -Canis familiaris Canis familiaris Canis familiaris -Capitella teleta Capitella teleta Capitella teleta -Capra hircus Capra hircus Capra hircus -Carica papaya Carica papaya Carica papaya -Cavia porcellus Cavia porcellus Cavia porcellus -Cerebratulus lacteus Cerebratulus lacteus Cerebratulus lacteus -Chlamydomonas reinhardtii Chlamydomonas reinhardtii Chlamydomonas reinhardtii -Chrysemys picta Chrysemys picta Chrysemys picta -Ciona intestinalis Ciona intestinalis Ciona intestinalis -Ciona savignyi Ciona savignyi Ciona savignyi -Citrus clementina Citrus clementina Citrus clementina -Citrus reticulata Citrus reticulata Citrus reticulata -Citrus sinensis Citrus sinensis Citrus sinensis -Citrus trifoliata Citrus trifoliata Citrus trifoliata -Columba livia Columba livia Columba livia -Cricetulus griseus Cricetulus griseus Cricetulus griseus -Cucumis melo Cucumis melo Cucumis melo -Cucumis sativus Cucumis sativus Cucumis sativus -Culex quinquefasciatus Culex quinquefasciatus Culex quinquefasciatus -Cunninghamia lanceolata Cunninghamia lanceolata Cunninghamia lanceolata -Cynara cardunculus Cynara cardunculus Cynara cardunculus -Cyprinus carpio Cyprinus carpio Cyprinus carpio -Danio rerio Danio rerio Danio rerio -Daphnia pulex Daphnia pulex Daphnia pulex -Dasypus novemcinctus Dasypus novemcinctus Dasypus novemcinctus -Daubentonia madagascariensis Daubentonia madagascariensis Daubentonia madagascariensis -Dictyostelium discoideum Dictyostelium discoideum Dictyostelium discoideum -Digitalis purpurea Digitalis purpurea Digitalis purpurea -Dinoponera quadriceps Dinoponera quadriceps Dinoponera quadriceps -Drosophila ananassae Drosophila ananassae Drosophila ananassae -Drosophila erecta Drosophila erecta Drosophila erecta -Drosophila grimshawi Drosophila grimshawi Drosophila grimshawi -Drosophila melanogaster Drosophila melanogaster Drosophila melanogaster -Drosophila mojavensis Drosophila mojavensis Drosophila mojavensis -Drosophila persimilis Drosophila persimilis Drosophila persimilis -Drosophila pseudoobscura Drosophila pseudoobscura Drosophila pseudoobscura -Drosophila sechellia Drosophila sechellia Drosophila sechellia -Drosophila simulans Drosophila simulans Drosophila simulans -Drosophila virilis Drosophila virilis Drosophila virilis -Drosophila willistoni Drosophila willistoni Drosophila willistoni -Drosophila yakuba Drosophila yakuba Drosophila yakuba -Duck enteritis Duck enteritis Duck enteritis -Echinococcus granulosus Echinococcus granulosus Echinococcus granulosus -Echinococcus multilocularis Echinococcus multilocularis Echinococcus multilocularis -Ectocarpus siliculosus Ectocarpus siliculosus Ectocarpus siliculosus -Elaeis guineensis Elaeis guineensis Elaeis guineensis -Electrophorus electricus Electrophorus electricus Electrophorus electricus -Epstein Barr Epstein Barr Epstein Barr -Eptesicus fuscus Eptesicus fuscus Eptesicus fuscus -Equus caballus Equus caballus Equus caballus -Eugenia uniflora Eugenia uniflora Eugenia uniflora -Fasciola hepatica Fasciola hepatica Fasciola hepatica -Festuca arundinacea Festuca arundinacea Festuca arundinacea -Fragaria vesca Fragaria vesca Fragaria vesca -Fugu rubripes Fugu rubripes Fugu rubripes -Gadus morhua Gadus morhua Gadus morhua -Gallus gallus Gallus gallus Gallus gallus -Glottidia pyramidata Glottidia pyramidata Glottidia pyramidata -Glycine max Glycine max Glycine max -Glycine soja Glycine soja Glycine soja -Gorilla gorilla Gorilla gorilla Gorilla gorilla -Gossypium arboreum Gossypium arboreum Gossypium arboreum -Gossypium herbaceum Gossypium herbaceum Gossypium herbaceum -Gossypium hirsutum Gossypium hirsutum Gossypium hirsutum -Gossypium raimondii Gossypium raimondii Gossypium raimondii -Gyrodactylus salaris Gyrodactylus salaris Gyrodactylus salaris -Haemonchus contortus Haemonchus contortus Haemonchus contortus -Haliotis rufescens Haliotis rufescens Haliotis rufescens -Helianthus annuus Helianthus annuus Helianthus annuus -Helianthus argophyllus Helianthus argophyllus Helianthus argophyllus -Helianthus ciliaris Helianthus ciliaris Helianthus ciliaris -Helianthus exilis Helianthus exilis Helianthus exilis -Helianthus paradoxus Helianthus paradoxus Helianthus paradoxus -Helianthus petiolaris Helianthus petiolaris Helianthus petiolaris -Helianthus tuberosus Helianthus tuberosus Helianthus tuberosus -Heliconius melpomene Heliconius melpomene Heliconius melpomene -Heligmosomoides polygyrus Heligmosomoides polygyrus Heligmosomoides polygyrus -Herpes B Herpes B Herpes B -Herpes Simplex Herpes Simplex Herpes Simplex -Herpesvirus of Herpesvirus of Herpesvirus of -Herpesvirus saimiri Herpesvirus saimiri Herpesvirus saimiri -Hevea brasiliensis Hevea brasiliensis Hevea brasiliensis -Hippoglossus hippoglossus Hippoglossus hippoglossus Hippoglossus hippoglossus -Homo sapiens Homo sapiens Homo sapiens -Hordeum vulgare Hordeum vulgare Hordeum vulgare -Human cytomegalovirus Human cytomegalovirus Human cytomegalovirus -Human herpesvirus Human herpesvirus Human herpesvirus -Human immunodeficiency Human immunodeficiency Human immunodeficiency -Hydra magnipapillata Hydra magnipapillata Hydra magnipapillata -Ictalurus punctatus Ictalurus punctatus Ictalurus punctatus -Infectious laryngotracheitis Infectious laryngotracheitis Infectious laryngotracheitis -Ixodes scapularis Ixodes scapularis Ixodes scapularis -JC polyomavirus JC polyomavirus JC polyomavirus -Kaposi sarcoma-associated Kaposi sarcoma-associated Kaposi sarcoma-associated -Lagothrix lagotricha Lagothrix lagotricha Lagothrix lagotricha -Lemur catta Lemur catta Lemur catta -Leucosolenia complicata Leucosolenia complicata Leucosolenia complicata -Linum usitatissimum Linum usitatissimum Linum usitatissimum -Locusta migratoria Locusta migratoria Locusta migratoria -Lottia gigantea Lottia gigantea Lottia gigantea -Lotus japonicus Lotus japonicus Lotus japonicus -Lytechinus variegatus Lytechinus variegatus Lytechinus variegatus -Macaca mulatta Macaca mulatta Macaca mulatta -Macaca nemestrina Macaca nemestrina Macaca nemestrina -Macropus eugenii Macropus eugenii Macropus eugenii -Malus domestica Malus domestica Malus domestica -Manduca sexta Manduca sexta Manduca sexta -Manihot esculenta Manihot esculenta Manihot esculenta -Mareks disease Mareks disease Mareks disease -Marsupenaeus japonicus Marsupenaeus japonicus Marsupenaeus japonicus -Medicago truncatula Medicago truncatula Medicago truncatula -Melibe leonina Melibe leonina Melibe leonina -Merkel cell Merkel cell Merkel cell -Mesocestoides corti Mesocestoides corti Mesocestoides corti -Metriaclima zebra Metriaclima zebra Metriaclima zebra -Microcebus murinus Microcebus murinus Microcebus murinus -Monodelphis domestica Monodelphis domestica Monodelphis domestica -Mouse cytomegalovirus Mouse cytomegalovirus Mouse cytomegalovirus -Mouse gammaherpesvirus Mouse gammaherpesvirus Mouse gammaherpesvirus -Mus musculus Mus musculus Mus musculus -Nasonia giraulti Nasonia giraulti Nasonia giraulti -Nasonia longicornis Nasonia longicornis Nasonia longicornis -Nasonia vitripennis Nasonia vitripennis Nasonia vitripennis -Nematostella vectensis Nematostella vectensis Nematostella vectensis -Neolamprologus brichardi Neolamprologus brichardi Neolamprologus brichardi -Nicotiana tabacum Nicotiana tabacum Nicotiana tabacum -Nomascus leucogenys Nomascus leucogenys Nomascus leucogenys -Oikopleura dioica Oikopleura dioica Oikopleura dioica -Ophiophagus hannah Ophiophagus hannah Ophiophagus hannah -Oreochromis niloticus Oreochromis niloticus Oreochromis niloticus -Ornithorhynchus anatinus Ornithorhynchus anatinus Ornithorhynchus anatinus -Oryctolagus cuniculus Oryctolagus cuniculus Oryctolagus cuniculus -Oryza sativa Oryza sativa Oryza sativa -Oryzias latipes Oryzias latipes Oryzias latipes -Otolemur garnettii Otolemur garnettii Otolemur garnettii -Ovis aries Ovis aries Ovis aries -Paeonia lactiflora Paeonia lactiflora Paeonia lactiflora -Pan paniscus Pan paniscus Pan paniscus -Pan troglodytes Pan troglodytes Pan troglodytes -Panagrellus redivivus Panagrellus redivivus Panagrellus redivivus -Panax ginseng Panax ginseng Panax ginseng -Papio hamadryas Papio hamadryas Papio hamadryas -Paralichthys olivaceus Paralichthys olivaceus Paralichthys olivaceus -Parasteatoda tepidariorum Parasteatoda tepidariorum Parasteatoda tepidariorum -Patiria miniata Patiria miniata Patiria miniata -Petromyzon marinus Petromyzon marinus Petromyzon marinus -Phaeodactylum tricornutum Phaeodactylum tricornutum Phaeodactylum tricornutum -Phaseolus vulgaris Phaseolus vulgaris Phaseolus vulgaris -Physcomitrella patens Physcomitrella patens Physcomitrella patens -Phytophthora infestans Phytophthora infestans Phytophthora infestans -Phytophthora ramorum Phytophthora ramorum Phytophthora ramorum -Phytophthora sojae Phytophthora sojae Phytophthora sojae -Picea abies Picea abies Picea abies -Pinus densata Pinus densata Pinus densata -Pinus taeda Pinus taeda Pinus taeda -Plutella xylostella Plutella xylostella Plutella xylostella -Polistes canadensis Polistes canadensis Polistes canadensis -Pongo pygmaeus Pongo pygmaeus Pongo pygmaeus -Populus euphratica Populus euphratica Populus euphratica -Populus trichocarpa Populus trichocarpa Populus trichocarpa -Pristionchus pacificus Pristionchus pacificus Pristionchus pacificus -Prunus persica Prunus persica Prunus persica -Pseudorabies virus Pseudorabies virus Pseudorabies virus -Pteropus alecto Pteropus alecto Pteropus alecto -Pundamilia nyererei Pundamilia nyererei Pundamilia nyererei -Pygathrix bieti Pygathrix bieti Pygathrix bieti -Python bivittatus Python bivittatus Python bivittatus -Raccoon polyomavirus Raccoon polyomavirus Raccoon polyomavirus -Rattus norvegicus Rattus norvegicus Rattus norvegicus -Rehmannia glutinosa Rehmannia glutinosa Rehmannia glutinosa -Rhesus lymphocryptovirus Rhesus lymphocryptovirus Rhesus lymphocryptovirus -Rhesus monkey Rhesus monkey Rhesus monkey -Rhipicephalus microplus Rhipicephalus microplus Rhipicephalus microplus -Ricinus communis Ricinus communis Ricinus communis -Saccharum officinarum Saccharum officinarum Saccharum officinarum -Saccharum sp. Saccharum sp. Saccharum sp. -Saccoglossus kowalevskii Saccoglossus kowalevskii Saccoglossus kowalevskii -Saguinus labiatus Saguinus labiatus Saguinus labiatus -Saimiri boliviensis Saimiri boliviensis Saimiri boliviensis -Salicornia europaea Salicornia europaea Salicornia europaea -Salmo salar Salmo salar Salmo salar -Salvia miltiorrhiza Salvia miltiorrhiza Salvia miltiorrhiza -Salvia sclarea Salvia sclarea Salvia sclarea -Sarcophilus harrisii Sarcophilus harrisii Sarcophilus harrisii -Schistosoma japonicum Schistosoma japonicum Schistosoma japonicum -Schistosoma mansoni Schistosoma mansoni Schistosoma mansoni -Schmidtea mediterranea Schmidtea mediterranea Schmidtea mediterranea -Selaginella moellendorffii Selaginella moellendorffii Selaginella moellendorffii -Simian foamy Simian foamy Simian foamy -Simian virus Simian virus Simian virus -Solanum lycopersicum Solanum lycopersicum Solanum lycopersicum -Solanum tuberosum Solanum tuberosum Solanum tuberosum -Sorghum bicolor Sorghum bicolor Sorghum bicolor -Spodoptera frugiperda Spodoptera frugiperda Spodoptera frugiperda -Strigamia maritima Strigamia maritima Strigamia maritima -Strongylocentrotus purpuratus Strongylocentrotus purpuratus Strongylocentrotus purpuratus -Strongyloides ratti Strongyloides ratti Strongyloides ratti -Sus scrofa Sus scrofa Sus scrofa -Sycon ciliatum Sycon ciliatum Sycon ciliatum -Symbiodinium microadriaticum Symbiodinium microadriaticum Symbiodinium microadriaticum -Symphalangus syndactylus Symphalangus syndactylus Symphalangus syndactylus -Taeniopygia guttata Taeniopygia guttata Taeniopygia guttata -Terebratulina retusa Terebratulina retusa Terebratulina retusa -Tetranychus urticae Tetranychus urticae Tetranychus urticae -Tetraodon nigroviridis Tetraodon nigroviridis Tetraodon nigroviridis -Theobroma cacao Theobroma cacao Theobroma cacao -Torque teno Torque teno Torque teno -Tribolium castaneum Tribolium castaneum Tribolium castaneum -Triops cancriformis Triops cancriformis Triops cancriformis -Triticum aestivum Triticum aestivum Triticum aestivum -Triticum turgidum Triticum turgidum Triticum turgidum -Tupaia chinensis Tupaia chinensis Tupaia chinensis -Vigna unguiculata Vigna unguiculata Vigna unguiculata -Vitis vinifera Vitis vinifera Vitis vinifera -Vriesea carinata Vriesea carinata Vriesea carinata -Xenopus laevis Xenopus laevis Xenopus laevis -Xenopus tropicalis Xenopus tropicalis Xenopus tropicalis -Xenoturbella bocki Xenoturbella bocki Xenoturbella bocki -Zea mays Zea mays Zea mays -# -#More examples: -# -#mm10 mm10 Mouse (mm10) /depot/data2/galaxy/mm10/bowtie2/mm10 -#dm3 dm3 D. melanogaster (dm3) /depot/data2/galaxy/mm10/bowtie2/dm3 -#