Mercurial > repos > eganrol > rnaseqdataannotation

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/RNAseqDataAnnotation/RNAseqDataAnnotation.R	Wed Dec 10 06:42:21 2014 -0500
@@ -0,0 +1,223 @@
+#Author : keime / lornage
+#Date : 2014/11
+
+
+########################################################################################################
+#This function concatenates htseq-count result files, normalizes data and annotates data using Ensembl annotations
+
+#arguments
+#Species : Name of the species
+#ensversion : version of Ensembl to use
+#fileout : tab-delimited file containing for each library ; gene id, raw read counts, normalized data as well as normalized data/gene length
+#corresp : data.frame linking the file loaded into galaxy to the corresponding condition
+#nfiles : number of files(conditions)
+
+#output : a data.frame with the following columns :
+#ensembl gene id
+#raw read counts for each library (one column per library)
+#normalized data for each library (one column per library)
+#normalized data divided by gene length for each library (one column per library)
+#Gene name
+#Description
+
+#require : biomaRt and DESeq2 Bioconductor packages / package plyr1.8.1
+
+#Methods :
+#Normalization is performed using the method described in Genome Biology 2010;11(10):R106
+#and implemented in the DESeq2 Bioconductor package
+#Gene length correspond to the median of the size of all transcripts corresponding to this gene
+#########################################################################################################
+
+
+
+RNAseqDataAnnotation = function(Species, ensversion, fileout, corresp ,nfiles){
+
+  #Create a string vector called libnames that contains the name of the samples
+	libnames=rep("",nfiles)
+	for (i in 1:nfiles){
+		libnames[i]=toString(corresp$Sample_name[i])
+	}
+
+  #For all files in corresp read the corresponding file into R
+	suppressPackageStartupMessages(library(plyr, lib.loc = NULL, character.only = FALSE, logical.return = FALSE, warn.conflicts = FALSE, verbose=FALSE, quietly = TRUE))
+	datalist = list()
+	for(i in 1:nfiles){
+		rawdata=read.table(toString(corresp$Files[i]), header =T, sep ="\t")
+		#noSpikes_htseq.
+		nbrrows=nrow(rawdata)
+		datalist[[i]]=rawdata[1:(nbrrows-5), ] # skip the last 5 lines of HTSeq-count files
+		colnames(datalist[[i]]) = c("ID",libnames[i])
+	}
+
+  #Join all the files in a data.frame called datafile with rownames = gene id
+	datafile = join_all(datalist, by = "ID", type = "left", match = "all")
+
+  #Calculate the number of geneID pro file
+	nbID=data.frame(rep("",nfiles))
+	for(i in 1:nfiles){
+		nbID[,i]=nrow(datalist[[i]])
+	}
+	totalnbID=apply((nbID[,1:nfiles]),1,sum)
+
+  #Verify that all the files contain the same gene ID
+	if (nrow(datafile)*nfiles==totalnbID[1]){
+
+  #Suppress genes not expressed in all samples
+		datafile = datafile[apply(datafile[,2:(nfiles+1)],1,sum)!=0,]
+		row.names(datafile)=datafile[,1]
+		data=datafile[,-1]
+
+  #Number of libraries
+		nblib= dim(data)[2]
+  #Determine Data + normalization if the specie is not known
+		if (Species=="None"){
+  #Normalized data calculation
+			nbcol = dim(data)[2] #nb of column in the data.frame
+			suppressPackageStartupMessages(library(DESeq2, lib.loc = NULL, character.only = FALSE, logical.return = FALSE, warn.conflicts = FALSE, verbose=FALSE, quietly = TRUE))
+			conds = factor(1:nblib)
+			design = data.frame(Condition=conds)
+			dds = DESeqDataSetFromMatrix(countData=data, colData=design, design=~Condition)
+			dds = estimateSizeFactors(dds)
+			datanorm = t(t(data)/sizeFactors(dds))
+
+  #Data + normalization
+			dataall = data.frame(row.names(datafile), data, datanorm )
+
+  #Renames columns
+			colnames(dataall) = c("Ensembl gene id", paste(libnames,"(raw read counts)"), paste(libnames,"(normalized)"))
+			write.table(dataall, file=fileout, sep="\t", quote=F, row.names=F)
+		}
+
+  #Determine Data + normalization + annotation if the specie is known
+		else{
+  #Add annotations and calculate gene length
+			suppressPackageStartupMessages(library(biomaRt, lib.loc = NULL, character.only = FALSE, logical.return = FALSE, warn.conflicts = FALSE,verbose=FALSE, quietly = TRUE))
+
+  #Convert Ensembl version to host
+			correspondingdate = toString(ensversion)
+			host  = paste(correspondingdate, ".archive.ensembl.org/biomart/martservice/", sep="")
+
+  #Load the correct bmdataset
+			bmdataset = toString(Species)
+			ensembl=useMart("ENSEMBL_MART_ENSEMBL", host=host, dataset=bmdataset)
+			if (toString(ensversion)=="oct2014" | toString(ensversion)=="aug2014" ) {
+				annotation1 = getBM(attributes=c("ensembl_gene_id","external_gene_name","description", "ensembl_transcript_id","exon_chrom_start","exon_chrom_end"),filters="ensembl_gene_id", values=rownames(data), mart=ensembl)
+			}
+			else{
+				annotation1 = getBM(attributes=c("ensembl_gene_id","external_gene_id","description", "ensembl_transcript_id","exon_chrom_start","exon_chrom_end"),filters="ensembl_gene_id", values=rownames(data), mart=ensembl)
+			}
+
+  #because all the annotations are not always found in a first step
+			not = rownames(data)[!rownames(data) %in% unique(annotation1$ensembl_gene_id)]
+			if (length(not) !=0){
+				if (toString(ensversion)=="oct2014" | toString(ensversion)=="aug2014" ) {
+					annotationnot = getBM(attributes=c("ensembl_gene_id","external_gene_name","description", "ensembl_transcript_id","exon_chrom_start","exon_chrom_end"),filters="ensembl_gene_id", values=not, mart=ensembl)
+					annotation2 = rbind(annotation1, annotationnot)
+				}
+				else {
+					annotationnot = getBM(attributes=c("ensembl_gene_id","external_gene_id","description", "ensembl_transcript_id","exon_chrom_start","exon_chrom_end"), filters="ensembl_gene_id", values=not, mart=ensembl)
+					annotation2 = rbind(annotation1, annotationnot)
+				}
+			}
+			else{
+				annotation2 = annotation1
+			}
+
+
+  #because all the annotations are not always found in a first or second step
+			not = rownames(data)[!rownames(data) %in% unique(annotation2$ensembl_gene_id)]
+			if (length(not) !=0){
+				if (toString(ensversion)=="oct2014" | toString(ensversion)=="aug2014" ) {
+					annotationnot = getBM(attributes=c("ensembl_gene_id","external_gene_name","description", "ensembl_transcript_id","exon_chrom_start","exon_chrom_end"),filters="ensembl_gene_id", values=not, mart=ensembl)
+					annotation = rbind(annotation2, annotationnot)
+				}
+				else {
+					annotationnot = getBM(attributes=c("ensembl_gene_id","external_gene_id","description", "ensembl_transcript_id","exon_chrom_start","exon_chrom_end"), filters="ensembl_gene_id", values=not, mart=ensembl)
+					annotation = rbind(annotation2, annotationnot)
+				}
+			}
+			else{
+				annotation = annotation2
+			}
+
+  #Exon length
+			ensinfos.exlen = data.frame(annotation$ensembl_gene_id, annotation$ensembl_transcript_id, abs(annotation$exon_chrom_start - annotation$exon_chrom_end)+1)
+			colnames(ensinfos.exlen) = c("ensembl_gene_id", "ensembl_transcript_id", "exon_length")
+
+  #Transcript length
+			tlen = tapply(ensinfos.exlen$exon_length, ensinfos.exlen$ensembl_transcript_id, sum)
+			tlen.gene = merge(tlen, unique(ensinfos.exlen[,1:2]), by.x="row.names", by.y="ensembl_transcript_id")
+			colnames(tlen.gene) = c("ensembl_transcript_id", "transcript_length","ensembl_gene_id")
+
+  #Gene length = median of the size of all transcripts corresponding to this gene
+			glen = tapply(tlen.gene$transcript_length, tlen.gene$ensembl_gene_id, median)
+
+  #Data with gene length
+			datalen = merge(data, glen, by="row.names")
+			colnames(datalen) = c("Ensembl_gene_id",colnames(data), "Gene_length")
+
+  #Data with annotations and gene length
+			annotationgene = unique(annotation[,1:3])
+			dataannot = merge(datalen, annotationgene, by.x="Ensembl_gene_id", by.y="ensembl_gene_id")
+
+  #To keep only the first part of the gene description (before [)
+			tmpdesc = strsplit(as.character(dataannot$description),"[", fixed=T)
+			f = function(l){
+				if (length(l)>=1){
+					return(l[[1]])
+				}
+				else{
+					return("")
+				}
+			}
+			tmpdescok = unlist(lapply(tmpdesc, f))
+			dataannot$description = tmpdescok
+
+  #Normalized data calculation
+			nbcol = dim(dataannot)[2] #nb of column in the data.frame
+			suppressPackageStartupMessages(library(DESeq2, lib.loc = NULL, character.only = FALSE, logical.return = FALSE, warn.conflicts = FALSE,verbose=FALSE, quietly = TRUE))
+			conds = factor(1:nblib)
+			design = data.frame(Condition=conds)
+			dds = DESeqDataSetFromMatrix(countData=dataannot[,-c(1,nbcol,nbcol-1,nbcol-2)], colData=design, design=~Condition)
+			dds = estimateSizeFactors(dds)
+			datanorm = t(t(dataannot[,-c(1,nbcol,nbcol-1,nbcol-2)])/sizeFactors(dds))
+
+  #Normalized data adjusted for gene length (normalized data / gene length)
+			rpkn = datanorm / (as.vector(dataannot[,nbcol-2]/1000 ))
+
+  #Data + annotations + rpkn
+			dataall = data.frame(dataannot[,-c(nbcol,nbcol-1,nbcol-2)] , datanorm, rpkn, dataannot[,c(nbcol-1,nbcol)]  )
+
+  #Renames columns
+			colnames(dataall) = c("Ensembl gene id", paste(libnames,"(raw read counts)"), paste(libnames,"(normalized)"), paste(libnames,"(normalized and divided by gene length in kb)"), "Gene name", "Description")
+            write.table(dataall, file=fileout, sep="\t", quote=F, row.names=F)
+
+		}
+	}
+	else{
+		print("The files are not the same length")
+	}
+}
+
+# Build a dataframe containing the files loaded into galaxy and their corresponding condition
+
+args <- commandArgs(trailingOnly = TRUE)
+
+Files=c()
+Sample_name =c()
+nbcells = (length(args)-3)
+for (i in seq(1,nbcells,2)){
+	Files = c(Files, args[3+i])
+	Sample_name = c(Sample_name, args[4+i])
+}
+nfiles=nbcells/2
+corresp = data.frame(Files=Files, Sample_name=Sample_name)
+
+# Take the informations given by the galaxy user to run the script
+RNAseqDataAnnotation(args[1], args[2],args[3],corresp,nfiles)
+
+
+
+
+
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/RNAseqDataAnnotation/RNAseqDataAnnotation.xml	Wed Dec 10 06:42:21 2014 -0500
@@ -0,0 +1,57 @@
+<tool id="RNAseqDataAnnotation" name="RNAseqDataAnnotation" version="1.0.0">
+  <description>tool for RNAseq Data Normalisation and Annotation</description>
+  <requirements>
+  </requirements>
+
+
+  <command interpreter="R --vanilla --silent --slave -f">
+./RNAseqDataAnnotation.R --args
+	$Species
+	$ensversion
+	$fileout
+	#for $i in $datafiles
+        ${i.file}
+        ${i.condition}
+    #end for
+ </command>
+
+  <inputs>
+      <repeat name="datafiles" title="File" help="Upload the files to concatenate and annotate">
+      <param name="file" label="Select file" type="data"/>
+      <param name="condition" label="Condition of the file" type="text"/>
+      </repeat>
+	  <param name="Species" type="select" label="Select the specie for your data" help="If your specie of interest is not listed, your data will be normalized but no annotation will be added. Contact us if you want us to add your specie." >
+		<option value="hsapiens_gene_ensembl">Homo sapiens</option>
+		<option value="mmusculus_gene_ensembl">Mus musculus</option>
+		<option value="None">Other specie</option>
+	  </param>
+
+	  <param name="ensversion" type="select" label="Select the version of Ensembl to use" >
+		<option value="oct2014">Version 77</option>
+		<option value="aug2014">Version 76</option>
+		<option value="feb2014">Version 75</option>
+		<option value="dec2013">Version 74</option>
+		<option value="sep2013">Version 73</option>
+		<option value="jun2013">Version 72</option>
+		<option value="apr2013">Version 71</option>
+		<option value="jan2013">Version 70</option>
+		<option value="oct2012">Version 69</option>
+		<option value="jul2012">Version 68</option>
+		<option value="may2012">Version 67</option>
+	  </param>
+  </inputs>
+
+  <outputs>
+  <data name="fileout" label="Data_normalization_annotation.txt" format="tabular"/>
+  </outputs>
+ <help>
+**What it does**
+	This function concatenates htseq-count result files and normalizes data.
+	If the specie is known, data will be annotated using the specified Ensembl annotations.
+
+**How**
+	Normalization is performed using the method described in Genome Biology 2010;11(10):R106.
+	and implemented in the DESeq2 Bioconductor package.
+	Gene length correspond to the median of the size of all transcripts corresponding to this gene.
+ </help>
+ </tool>