# HG changeset patch
# User eganrol
# Date 1416472012 18000
# Node ID 30506ce512f09c622ffb1874f94e3adeac0b6343
# Parent 90a12c339d99e59fefe8c521cfbb6272974f45f1
Uploaded
diff -r 90a12c339d99 -r 30506ce512f0 RNAseqDataAnnotation/RNAseqDataAnnotation.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/RNAseqDataAnnotation/RNAseqDataAnnotation.R Thu Nov 20 03:26:52 2014 -0500
@@ -0,0 +1,209 @@
+#Author : keime / lornage
+#Date : 2014/11
+
+
+########################################################################################################
+#This function concatenates htseq-count result files, normalizes data and annotates data using Ensembl annotations
+
+#arguments
+#path2htseqfiles : path to htseq-count result files
+#samplenamefile : path ta a tabulated text file with 2 columns : 1. File name 2. Sample names and an header
+#Species : latin name of the species
+#ensversion : version of Ensembl to use
+#fileout : .txt file containing for each library ; gene id, raw read counts, normalized data as well as normalized data/gene length
+#conversionensembleversion : tab-delimited file allowing conversion of the Ensembl version to the host
+# (Column1 : Version Column2 : Host)
+#conversionensemblname : tab-delimited file allowing conversion of species name to the name of the Ensembl dataset to use
+# (Column1 : Specie Column2 : Dataset)
+
+#output : a data.frame with the following columns :
+#ensembl gene id
+#raw read counts for each library (one column per library)
+#normalized data for each library (one column per library)
+#normalized data divided by gene length for each library (one column per library)
+#Gene name
+#Description
+
+#require : biomaRt and DESeq2 Bioconductor packages / package plyr1.8.1
+
+#Methods :
+#Considering that the resulting files of HTSeq-count have 5 lines of comments in the end
+#Normalization is performed using the method described in Genome Biology 2010;11(10):R106
+#and implemented in the DESeq2 Bioconductor package
+#Gene length correspond to the median of the size of all transcripts corresponding to this gene
+#########################################################################################################
+
+
+
+RNAseqDataAnnotation = function(path2htseqfiles, samplenamefile, Species, ensversion, fileout, conversionensemblversion, conversionensemblname){
+
+ #Create a list with the file names in path2htseqfiles
+ sampleFiles=list.files(path2htseqfiles)
+ sampleFiles=strsplit(sampleFiles,".txt")
+ #_noSpikes_htseq
+ nfiles=length(sampleFiles)
+
+ #Read the data in samplenamefile. Create a data frame establishing the correspondence between file names and sample names
+ corresp = read.table(samplenamefile,header=T,sep="\t",colClasses=c("character","character"))
+ corresp$File = strsplit(corresp$File,".fastq.gz")
+
+ #Create a string vector called libnames that contains the name of the samples in the same order as in sampleFiles
+ libnames=rep("",nfiles)
+ for (i in 1:nfiles){
+ libnames[i]=corresp$Sample_name[corresp$File==sampleFiles[[i]]]
+ }
+
+ #For all files located in path2htseqfiles read the corresponding file into R
+ library(plyr)
+ datalist = list()
+ for(i in 1:nfiles){
+ rawdata=read.table(paste(paste(path2htseqfiles,sampleFiles[i],sep="/"),"txt",sep="."))
+ #noSpikes_htseq.
+ nbrrows=nrow(rawdata)
+ datalist[[i]]=rawdata[1:(nbrrows-5), ] # skip the last 5 lines of HTSeq-count files
+ colnames(datalist[[i]]) = c("ID",libnames[i])
+ }
+
+ #Join all the files in a data.frame called datafile with rownames = gene id
+ datafile = join_all(datalist, by = "ID", type = "left", match = "all")
+
+ #Calculate the number of geneID pro file
+ nbID=data.frame(rep("",nfiles))
+ for(i in 1:nfiles){
+ nbID[,i]=nrow(datalist[[i]])
+ }
+ totalnbID=apply((nbID[,1:4]),1,sum)
+
+ #Verify that all the files contain the same gene ID
+ if (nrow(datafile)*4==totalnbID[1]){
+
+ #Suppress genes not expressed in all samples
+ datafile = datafile[apply(datafile[,2:(nfiles+1)],1,sum)!=0,]
+ row.names(datafile)=datafile[,1]
+ data=datafile[,-1]
+
+ #Number of libraries
+ nblib= dim(data)[2]
+ #Determine Data + normalization if the specie is not known
+ if (Species==""){
+ #Normalized data calculation
+ nbcol = dim(data)[2] #nb of column in the data.frame
+ library(DESeq2)
+ conds = factor(1:nblib)
+ design = data.frame(Condition=conds)
+ dds = DESeqDataSetFromMatrix(countData=data, colData=design, design=~Condition)
+ dds = estimateSizeFactors(dds)
+ datanorm = t(t(data)/sizeFactors(dds))
+
+ #Data + normalization
+ dataall = data.frame(row.names(datafile), data, datanorm )
+
+ #Renames columns
+ colnames(dataall) = c("Ensembl gene id", paste(libnames,"(raw read counts)"), paste(libnames,"(normalized)"))
+ write.table(dataall, file=fileout, sep="\t", quote=F, row.names=F)
+ }
+
+ #Determine Data + normalization + annotation if the specie is known
+ else{
+ #Add annotations and calculate gene length
+ library(biomaRt)
+
+ #Convert Ensembl version to host
+ conversionfile = read.table(conversionensemblversion,header=T,sep="\t",colClasses=c("numeric","character"))
+ correspondingdate = conversionfile[conversionfile$Version == ensversion, 2]
+ host = paste(correspondingdate, ".archive.ensembl.org/biomart/martservice/", sep="")
+
+ #Convert species name to the name of the corresponding bmdataset
+ conversion = read.table(conversionensemblname,header=T,sep="\t",colClasses=c("character","character"))
+ bmdataset = conversion[conversion$Specie == Species, 2]
+ ensembl=useMart("ENSEMBL_MART_ENSEMBL", host=host, dataset=bmdataset)
+ if (ensversion<=75){
+ annotation1 = getBM(attributes=c("ensembl_gene_id","external_gene_id","description", "ensembl_transcript_id","exon_chrom_start","exon_chrom_end"),filters="ensembl_gene_id", values=rownames(data), mart=ensembl)
+ }
+ else{
+ annotation1 = getBM(attributes=c("ensembl_gene_id","external_gene_name","description", "ensembl_transcript_id","exon_chrom_start","exon_chrom_end"),filters="ensembl_gene_id", values=rownames(data), mart=ensembl)
+ }
+
+ #because all the annotations are not always found in a first step
+ not = rownames(data)[!rownames(data) %in% unique(annotation1$ensembl_gene_id)]
+ if (length(not) !=0){
+ annotationnot = getBM(attributes=c("ensembl_gene_id","external_gene_id","description", "ensembl_transcript_id","exon_chrom_start","exon_chrom_end"), filters="ensembl_gene_id", values=not, mart=ensembl)
+ annotation = rbind(annotation1, annotationnot)
+ }
+ else{
+ annotation = annotation1
+ }
+
+ #Exon length
+ ensinfos.exlen = data.frame(annotation$ensembl_gene_id, annotation$ensembl_transcript_id, abs(annotation$exon_chrom_start - annotation$exon_chrom_end)+1)
+ colnames(ensinfos.exlen) = c("ensembl_gene_id", "ensembl_transcript_id", "exon_length")
+
+ #Transcript length
+ tlen = tapply(ensinfos.exlen$exon_length, ensinfos.exlen$ensembl_transcript_id, sum)
+ tlen.gene = merge(tlen, unique(ensinfos.exlen[,1:2]), by.x="row.names", by.y="ensembl_transcript_id")
+ colnames(tlen.gene) = c("ensembl_transcript_id", "transcript_length","ensembl_gene_id")
+
+ #Gene length = median of the size of all transcripts corresponding to this gene
+ glen = tapply(tlen.gene$transcript_length, tlen.gene$ensembl_gene_id, median)
+
+ #Data with gene length
+ datalen = merge(data, glen, by="row.names")
+ colnames(datalen) = c("Ensembl_gene_id",colnames(data), "Gene_length")
+
+ #Data with annotations and gene length
+ annotationgene = unique(annotation[,1:3])
+ dataannot = merge(datalen, annotationgene, by.x="Ensembl_gene_id", by.y="ensembl_gene_id")
+
+ #To keep only the first part of the gene description (before [)
+ tmpdesc = strsplit(as.character(dataannot$description),"[", fixed=T)
+ f = function(l){
+ if (length(l)>=1){
+ return(l[[1]])
+ }
+ else{
+ return("")
+ }
+ }
+ tmpdescok = unlist(lapply(tmpdesc, f))
+ dataannot$description = tmpdescok
+
+ #Normalized data calculation
+ nbcol = dim(dataannot)[2] #nb of column in the data.frame
+ library(DESeq2)
+ conds = factor(1:nblib)
+ design = data.frame(Condition=conds)
+ dds = DESeqDataSetFromMatrix(countData=dataannot[,-c(1,nbcol,nbcol-1,nbcol-2)], colData=design, design=~Condition)
+ dds = estimateSizeFactors(dds)
+ datanorm = t(t(dataannot[,-c(1,nbcol,nbcol-1,nbcol-2)])/sizeFactors(dds))
+
+ #Normalized data adjusted for gene length (normalized data / gene length)
+ rpkn = datanorm / (as.vector(dataannot[,nbcol-2]/1000 ))
+
+ #Data + annotations + rpkn
+ dataall = data.frame(dataannot[,-c(nbcol,nbcol-1,nbcol-2)] , datanorm, rpkn, dataannot[,c(nbcol-1,nbcol)] )
+
+ #Renames columns
+ colnames(dataall) = c("Ensembl gene id", paste(libnames,"(raw read counts)"), paste(libnames,"(normalized)"), paste(libnames,"(normalized and divided by gene length in kb)"), "Gene name", "Description")
+ write.table(dataall, file=fileout, sep="\t", quote=F, row.names=F)
+
+ #Return(dataall)
+
+ }
+ }
+ else{
+ print("The files are not the same length")
+ }
+}
+
+args <- commandArgs(trailingOnly = TRUE)
+print(args)
+
+RNAseqDataAnnotation(args[1], args[2],args[3], args[4], args[5], args[6], args[7])
+
+#R --slave --vanilla --verbose --file=/home/lornage/Bureau/Pour_galaxy/RNAseqDataAnnotation.R --args /home/lornage/Bureau/Test_function /home/lornage/Bureau/ichierconvertitnames.txt Homo_sapiens 75 /home/lornage/Bureau/testttttt5.txt /home/lornage/Bureau/Script_R/Ensembl_Version_Host.txt /home/lornage/Bureau/Script_R/Ensemble_Specie_Dataset.txt
+
+
+
+
+
+
diff -r 90a12c339d99 -r 30506ce512f0 RNAseqDataAnnotation/RNAseqDataAnnotation.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/RNAseqDataAnnotation/RNAseqDataAnnotation.xml Thu Nov 20 03:26:52 2014 -0500
@@ -0,0 +1,52 @@
+
+ tool for RNAseq Data Normalisation and Annotation
+
+
+ R_3_0_2
+ DESeq2biomaRt
+
+
+
+ R --slave --vanilla --file=RNAseqDataAnnotation.R --args
+ $path2htseqfiles
+ $samplenamefile
+ $Species
+ $ensversion
+ $conversionensemblversion
+ $conversionensemblname
+ $fileout
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+**What it does*
+**Example**
+
+
diff -r 90a12c339d99 -r 30506ce512f0 RNAseqDataAnnotation/packages.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/RNAseqDataAnnotation/packages.R Thu Nov 20 03:26:52 2014 -0500
@@ -0,0 +1,5 @@
+source("http://bioconductor.org/biocLite.R")
+biocLite()
+biocLite("DESeq2")
+biocLite("biomaRt")
+install.packages("plyr")
diff -r 90a12c339d99 -r 30506ce512f0 RNAseqDataAnnotation/test-data/Ensembl_Version_Host.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/RNAseqDataAnnotation/test-data/Ensembl_Version_Host.txt Thu Nov 20 03:26:52 2014 -0500
@@ -0,0 +1,12 @@
+Version Host
+77 oct2014
+76 aug2014
+75 feb2014
+74 dec2013
+73 sep2013
+72 jun2013
+71 apr2013
+70 jan2013
+69 oct2012
+68 jul2012
+67 may2012
diff -r 90a12c339d99 -r 30506ce512f0 RNAseqDataAnnotation/test-data/Ensemble_Specie_Dataset.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/RNAseqDataAnnotation/test-data/Ensemble_Specie_Dataset.txt Thu Nov 20 03:26:52 2014 -0500
@@ -0,0 +1,3 @@
+Specie Dataset
+Homo_sapiens hsapiens_gene_ensembl
+Mus_musculus mmusculus_gene_ensembl
diff -r 90a12c339d99 -r 30506ce512f0 RNAseqDataAnnotation/test-data/Fichier1.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/RNAseqDataAnnotation/test-data/Fichier1.txt Thu Nov 20 03:26:52 2014 -0500
@@ -0,0 +1,19 @@
+ENSG00000000005 0
+ENSG00000000419 2661
+ENSG00000000457 602
+ENSG00000000460 2077
+ENSG00000000938 2
+ENSG00000000971 75
+ENSG00000001036 2389
+ENSG00000001084 1730
+ENSG00000001167 1473
+ENSG00000001460 387
+ENSG00000001461 905
+ENSG00000001497 2975
+ENSG00000001561 19
+ENSG00000001617 118
+ENSG00000001626 2
+ENSG00000001629 2559
+ENSG00000001630 314
+ENSG00000001631 1581
+ENSG00000002016 307
diff -r 90a12c339d99 -r 30506ce512f0 RNAseqDataAnnotation/test-data/Fichier2.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/RNAseqDataAnnotation/test-data/Fichier2.txt Thu Nov 20 03:26:52 2014 -0500
@@ -0,0 +1,19 @@
+ENSG00000000005 0
+ENSG00000000419 3409
+ENSG00000000457 706
+ENSG00000000460 2385
+ENSG00000000938 0
+ENSG00000000971 100
+ENSG00000001036 2876
+ENSG00000001084 2154
+ENSG00000001167 1695
+ENSG00000001460 405
+ENSG00000001461 1010
+ENSG00000001497 3344
+ENSG00000001561 27
+ENSG00000001617 132
+ENSG00000001626 1
+ENSG00000001629 3042
+ENSG00000001630 352
+ENSG00000001631 1865
+ENSG00000002016 375
diff -r 90a12c339d99 -r 30506ce512f0 RNAseqDataAnnotation/test-data/Fichier3.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/RNAseqDataAnnotation/test-data/Fichier3.txt Thu Nov 20 03:26:52 2014 -0500
@@ -0,0 +1,19 @@
+ENSG00000000005 0
+ENSG00000000419 2171
+ENSG00000000457 484
+ENSG00000000460 1056
+ENSG00000000938 1
+ENSG00000000971 157
+ENSG00000001036 2019
+ENSG00000001084 1580
+ENSG00000001167 1290
+ENSG00000001460 311
+ENSG00000001461 1607
+ENSG00000001497 2217
+ENSG00000001561 34
+ENSG00000001617 54
+ENSG00000001626 2
+ENSG00000001629 2116
+ENSG00000001630 207
+ENSG00000001631 1501
+ENSG00000002016 263
diff -r 90a12c339d99 -r 30506ce512f0 RNAseqDataAnnotation/test-data/Fichier4.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/RNAseqDataAnnotation/test-data/Fichier4.txt Thu Nov 20 03:26:52 2014 -0500
@@ -0,0 +1,19 @@
+ENSG00000000005 0
+ENSG00000000419 2495
+ENSG00000000457 521
+ENSG00000000460 1092
+ENSG00000000938 1
+ENSG00000000971 192
+ENSG00000001036 2217
+ENSG00000001084 1685
+ENSG00000001167 1509
+ENSG00000001460 362
+ENSG00000001461 1622
+ENSG00000001497 2369
+ENSG00000001561 41
+ENSG00000001617 69
+ENSG00000001626 4
+ENSG00000001629 2361
+ENSG00000001630 215
+ENSG00000001631 1626
+ENSG00000002016 295
diff -r 90a12c339d99 -r 30506ce512f0 RNAseqDataAnnotation/test-data/ichierconvertitnames.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/RNAseqDataAnnotation/test-data/ichierconvertitnames.txt Thu Nov 20 03:26:52 2014 -0500
@@ -0,0 +1,5 @@
+File Sample_name
+Fichier1.fastq.gz siLuc2
+Fichier2.fastq.gz siLuc3
+Fichier3.fastq.gz siMitf3
+Fichier4.fastq.gz siMitf4
diff -r 90a12c339d99 -r 30506ce512f0 RNAseqDataAnnotation/tool_dependencies.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/RNAseqDataAnnotation/tool_dependencies.xml Thu Nov 20 03:26:52 2014 -0500
@@ -0,0 +1,22 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+ R CMD BATCH packages.R
+ echo "export PATH=$PATH" > $INSTALL_DIR/env.sh
+ chmod 755 $INSTALL_DIR/env.sh
+
+
+
+
+
+