Mercurial > repos > eganrol > rnaseqdataannotation
changeset 26:f183f8648c5a draft default tip
Uploaded
author | eganrol |
---|---|
date | Wed, 10 Dec 2014 06:42:21 -0500 |
parents | ea3f7a6504e0 |
children | |
files | RNAseqDataAnnotation/RNAseqDataAnnotation.R RNAseqDataAnnotation/RNAseqDataAnnotation.xml |
diffstat | 2 files changed, 280 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/RNAseqDataAnnotation/RNAseqDataAnnotation.R Wed Dec 10 06:42:21 2014 -0500 @@ -0,0 +1,223 @@ +#Author : keime / lornage +#Date : 2014/11 + + +######################################################################################################## +#This function concatenates htseq-count result files, normalizes data and annotates data using Ensembl annotations + +#arguments +#Species : Name of the species +#ensversion : version of Ensembl to use +#fileout : tab-delimited file containing for each library ; gene id, raw read counts, normalized data as well as normalized data/gene length +#corresp : data.frame linking the file loaded into galaxy to the corresponding condition +#nfiles : number of files(conditions) + +#output : a data.frame with the following columns : +#ensembl gene id +#raw read counts for each library (one column per library) +#normalized data for each library (one column per library) +#normalized data divided by gene length for each library (one column per library) +#Gene name +#Description + +#require : biomaRt and DESeq2 Bioconductor packages / package plyr1.8.1 + +#Methods : +#Normalization is performed using the method described in Genome Biology 2010;11(10):R106 +#and implemented in the DESeq2 Bioconductor package +#Gene length correspond to the median of the size of all transcripts corresponding to this gene +######################################################################################################### + + + +RNAseqDataAnnotation = function(Species, ensversion, fileout, corresp ,nfiles){ + + #Create a string vector called libnames that contains the name of the samples + libnames=rep("",nfiles) + for (i in 1:nfiles){ + libnames[i]=toString(corresp$Sample_name[i]) + } + + #For all files in corresp read the corresponding file into R + suppressPackageStartupMessages(library(plyr, lib.loc = NULL, character.only = FALSE, logical.return = FALSE, warn.conflicts = FALSE, verbose=FALSE, quietly = TRUE)) + datalist = list() + for(i in 1:nfiles){ + rawdata=read.table(toString(corresp$Files[i]), header =T, sep ="\t") + #noSpikes_htseq. + nbrrows=nrow(rawdata) + datalist[[i]]=rawdata[1:(nbrrows-5), ] # skip the last 5 lines of HTSeq-count files + colnames(datalist[[i]]) = c("ID",libnames[i]) + } + + #Join all the files in a data.frame called datafile with rownames = gene id + datafile = join_all(datalist, by = "ID", type = "left", match = "all") + + #Calculate the number of geneID pro file + nbID=data.frame(rep("",nfiles)) + for(i in 1:nfiles){ + nbID[,i]=nrow(datalist[[i]]) + } + totalnbID=apply((nbID[,1:nfiles]),1,sum) + + #Verify that all the files contain the same gene ID + if (nrow(datafile)*nfiles==totalnbID[1]){ + + #Suppress genes not expressed in all samples + datafile = datafile[apply(datafile[,2:(nfiles+1)],1,sum)!=0,] + row.names(datafile)=datafile[,1] + data=datafile[,-1] + + #Number of libraries + nblib= dim(data)[2] + #Determine Data + normalization if the specie is not known + if (Species=="None"){ + #Normalized data calculation + nbcol = dim(data)[2] #nb of column in the data.frame + suppressPackageStartupMessages(library(DESeq2, lib.loc = NULL, character.only = FALSE, logical.return = FALSE, warn.conflicts = FALSE, verbose=FALSE, quietly = TRUE)) + conds = factor(1:nblib) + design = data.frame(Condition=conds) + dds = DESeqDataSetFromMatrix(countData=data, colData=design, design=~Condition) + dds = estimateSizeFactors(dds) + datanorm = t(t(data)/sizeFactors(dds)) + + #Data + normalization + dataall = data.frame(row.names(datafile), data, datanorm ) + + #Renames columns + colnames(dataall) = c("Ensembl gene id", paste(libnames,"(raw read counts)"), paste(libnames,"(normalized)")) + write.table(dataall, file=fileout, sep="\t", quote=F, row.names=F) + } + + #Determine Data + normalization + annotation if the specie is known + else{ + #Add annotations and calculate gene length + suppressPackageStartupMessages(library(biomaRt, lib.loc = NULL, character.only = FALSE, logical.return = FALSE, warn.conflicts = FALSE,verbose=FALSE, quietly = TRUE)) + + #Convert Ensembl version to host + correspondingdate = toString(ensversion) + host = paste(correspondingdate, ".archive.ensembl.org/biomart/martservice/", sep="") + + #Load the correct bmdataset + bmdataset = toString(Species) + ensembl=useMart("ENSEMBL_MART_ENSEMBL", host=host, dataset=bmdataset) + if (toString(ensversion)=="oct2014" | toString(ensversion)=="aug2014" ) { + annotation1 = getBM(attributes=c("ensembl_gene_id","external_gene_name","description", "ensembl_transcript_id","exon_chrom_start","exon_chrom_end"),filters="ensembl_gene_id", values=rownames(data), mart=ensembl) + } + else{ + annotation1 = getBM(attributes=c("ensembl_gene_id","external_gene_id","description", "ensembl_transcript_id","exon_chrom_start","exon_chrom_end"),filters="ensembl_gene_id", values=rownames(data), mart=ensembl) + } + + #because all the annotations are not always found in a first step + not = rownames(data)[!rownames(data) %in% unique(annotation1$ensembl_gene_id)] + if (length(not) !=0){ + if (toString(ensversion)=="oct2014" | toString(ensversion)=="aug2014" ) { + annotationnot = getBM(attributes=c("ensembl_gene_id","external_gene_name","description", "ensembl_transcript_id","exon_chrom_start","exon_chrom_end"),filters="ensembl_gene_id", values=not, mart=ensembl) + annotation2 = rbind(annotation1, annotationnot) + } + else { + annotationnot = getBM(attributes=c("ensembl_gene_id","external_gene_id","description", "ensembl_transcript_id","exon_chrom_start","exon_chrom_end"), filters="ensembl_gene_id", values=not, mart=ensembl) + annotation2 = rbind(annotation1, annotationnot) + } + } + else{ + annotation2 = annotation1 + } + + + #because all the annotations are not always found in a first or second step + not = rownames(data)[!rownames(data) %in% unique(annotation2$ensembl_gene_id)] + if (length(not) !=0){ + if (toString(ensversion)=="oct2014" | toString(ensversion)=="aug2014" ) { + annotationnot = getBM(attributes=c("ensembl_gene_id","external_gene_name","description", "ensembl_transcript_id","exon_chrom_start","exon_chrom_end"),filters="ensembl_gene_id", values=not, mart=ensembl) + annotation = rbind(annotation2, annotationnot) + } + else { + annotationnot = getBM(attributes=c("ensembl_gene_id","external_gene_id","description", "ensembl_transcript_id","exon_chrom_start","exon_chrom_end"), filters="ensembl_gene_id", values=not, mart=ensembl) + annotation = rbind(annotation2, annotationnot) + } + } + else{ + annotation = annotation2 + } + + #Exon length + ensinfos.exlen = data.frame(annotation$ensembl_gene_id, annotation$ensembl_transcript_id, abs(annotation$exon_chrom_start - annotation$exon_chrom_end)+1) + colnames(ensinfos.exlen) = c("ensembl_gene_id", "ensembl_transcript_id", "exon_length") + + #Transcript length + tlen = tapply(ensinfos.exlen$exon_length, ensinfos.exlen$ensembl_transcript_id, sum) + tlen.gene = merge(tlen, unique(ensinfos.exlen[,1:2]), by.x="row.names", by.y="ensembl_transcript_id") + colnames(tlen.gene) = c("ensembl_transcript_id", "transcript_length","ensembl_gene_id") + + #Gene length = median of the size of all transcripts corresponding to this gene + glen = tapply(tlen.gene$transcript_length, tlen.gene$ensembl_gene_id, median) + + #Data with gene length + datalen = merge(data, glen, by="row.names") + colnames(datalen) = c("Ensembl_gene_id",colnames(data), "Gene_length") + + #Data with annotations and gene length + annotationgene = unique(annotation[,1:3]) + dataannot = merge(datalen, annotationgene, by.x="Ensembl_gene_id", by.y="ensembl_gene_id") + + #To keep only the first part of the gene description (before [) + tmpdesc = strsplit(as.character(dataannot$description),"[", fixed=T) + f = function(l){ + if (length(l)>=1){ + return(l[[1]]) + } + else{ + return("") + } + } + tmpdescok = unlist(lapply(tmpdesc, f)) + dataannot$description = tmpdescok + + #Normalized data calculation + nbcol = dim(dataannot)[2] #nb of column in the data.frame + suppressPackageStartupMessages(library(DESeq2, lib.loc = NULL, character.only = FALSE, logical.return = FALSE, warn.conflicts = FALSE,verbose=FALSE, quietly = TRUE)) + conds = factor(1:nblib) + design = data.frame(Condition=conds) + dds = DESeqDataSetFromMatrix(countData=dataannot[,-c(1,nbcol,nbcol-1,nbcol-2)], colData=design, design=~Condition) + dds = estimateSizeFactors(dds) + datanorm = t(t(dataannot[,-c(1,nbcol,nbcol-1,nbcol-2)])/sizeFactors(dds)) + + #Normalized data adjusted for gene length (normalized data / gene length) + rpkn = datanorm / (as.vector(dataannot[,nbcol-2]/1000 )) + + #Data + annotations + rpkn + dataall = data.frame(dataannot[,-c(nbcol,nbcol-1,nbcol-2)] , datanorm, rpkn, dataannot[,c(nbcol-1,nbcol)] ) + + #Renames columns + colnames(dataall) = c("Ensembl gene id", paste(libnames,"(raw read counts)"), paste(libnames,"(normalized)"), paste(libnames,"(normalized and divided by gene length in kb)"), "Gene name", "Description") + write.table(dataall, file=fileout, sep="\t", quote=F, row.names=F) + + } + } + else{ + print("The files are not the same length") + } +} + +# Build a dataframe containing the files loaded into galaxy and their corresponding condition + +args <- commandArgs(trailingOnly = TRUE) + +Files=c() +Sample_name =c() +nbcells = (length(args)-3) +for (i in seq(1,nbcells,2)){ + Files = c(Files, args[3+i]) + Sample_name = c(Sample_name, args[4+i]) +} +nfiles=nbcells/2 +corresp = data.frame(Files=Files, Sample_name=Sample_name) + +# Take the informations given by the galaxy user to run the script +RNAseqDataAnnotation(args[1], args[2],args[3],corresp,nfiles) + + + + + +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/RNAseqDataAnnotation/RNAseqDataAnnotation.xml Wed Dec 10 06:42:21 2014 -0500 @@ -0,0 +1,57 @@ +<tool id="RNAseqDataAnnotation" name="RNAseqDataAnnotation" version="1.0.0"> + <description>tool for RNAseq Data Normalisation and Annotation</description> + <requirements> + </requirements> + + + <command interpreter="R --vanilla --silent --slave -f"> +./RNAseqDataAnnotation.R --args + $Species + $ensversion + $fileout + #for $i in $datafiles + ${i.file} + ${i.condition} + #end for + </command> + + <inputs> + <repeat name="datafiles" title="File" help="Upload the files to concatenate and annotate"> + <param name="file" label="Select file" type="data"/> + <param name="condition" label="Condition of the file" type="text"/> + </repeat> + <param name="Species" type="select" label="Select the specie for your data" help="If your specie of interest is not listed, your data will be normalized but no annotation will be added. Contact us if you want us to add your specie." > + <option value="hsapiens_gene_ensembl">Homo sapiens</option> + <option value="mmusculus_gene_ensembl">Mus musculus</option> + <option value="None">Other specie</option> + </param> + + <param name="ensversion" type="select" label="Select the version of Ensembl to use" > + <option value="oct2014">Version 77</option> + <option value="aug2014">Version 76</option> + <option value="feb2014">Version 75</option> + <option value="dec2013">Version 74</option> + <option value="sep2013">Version 73</option> + <option value="jun2013">Version 72</option> + <option value="apr2013">Version 71</option> + <option value="jan2013">Version 70</option> + <option value="oct2012">Version 69</option> + <option value="jul2012">Version 68</option> + <option value="may2012">Version 67</option> + </param> + </inputs> + + <outputs> + <data name="fileout" label="Data_normalization_annotation.txt" format="tabular"/> + </outputs> + <help> +**What it does** + This function concatenates htseq-count result files and normalizes data. + If the specie is known, data will be annotated using the specified Ensembl annotations. + +**How** + Normalization is performed using the method described in Genome Biology 2010;11(10):R106. + and implemented in the DESeq2 Bioconductor package. + Gene length correspond to the median of the size of all transcripts corresponding to this gene. + </help> + </tool>