# HG changeset patch # User eganrol # Date 1416472012 18000 # Node ID 30506ce512f09c622ffb1874f94e3adeac0b6343 # Parent 90a12c339d99e59fefe8c521cfbb6272974f45f1 Uploaded diff -r 90a12c339d99 -r 30506ce512f0 RNAseqDataAnnotation/RNAseqDataAnnotation.R --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/RNAseqDataAnnotation/RNAseqDataAnnotation.R Thu Nov 20 03:26:52 2014 -0500 @@ -0,0 +1,209 @@ +#Author : keime / lornage +#Date : 2014/11 + + +######################################################################################################## +#This function concatenates htseq-count result files, normalizes data and annotates data using Ensembl annotations + +#arguments +#path2htseqfiles : path to htseq-count result files +#samplenamefile : path ta a tabulated text file with 2 columns : 1. File name 2. Sample names and an header +#Species : latin name of the species +#ensversion : version of Ensembl to use +#fileout : .txt file containing for each library ; gene id, raw read counts, normalized data as well as normalized data/gene length +#conversionensembleversion : tab-delimited file allowing conversion of the Ensembl version to the host +# (Column1 : Version Column2 : Host) +#conversionensemblname : tab-delimited file allowing conversion of species name to the name of the Ensembl dataset to use +# (Column1 : Specie Column2 : Dataset) + +#output : a data.frame with the following columns : +#ensembl gene id +#raw read counts for each library (one column per library) +#normalized data for each library (one column per library) +#normalized data divided by gene length for each library (one column per library) +#Gene name +#Description + +#require : biomaRt and DESeq2 Bioconductor packages / package plyr1.8.1 + +#Methods : +#Considering that the resulting files of HTSeq-count have 5 lines of comments in the end +#Normalization is performed using the method described in Genome Biology 2010;11(10):R106 +#and implemented in the DESeq2 Bioconductor package +#Gene length correspond to the median of the size of all transcripts corresponding to this gene +######################################################################################################### + + + +RNAseqDataAnnotation = function(path2htseqfiles, samplenamefile, Species, ensversion, fileout, conversionensemblversion, conversionensemblname){ + + #Create a list with the file names in path2htseqfiles + sampleFiles=list.files(path2htseqfiles) + sampleFiles=strsplit(sampleFiles,".txt") + #_noSpikes_htseq + nfiles=length(sampleFiles) + + #Read the data in samplenamefile. Create a data frame establishing the correspondence between file names and sample names + corresp = read.table(samplenamefile,header=T,sep="\t",colClasses=c("character","character")) + corresp$File = strsplit(corresp$File,".fastq.gz") + + #Create a string vector called libnames that contains the name of the samples in the same order as in sampleFiles + libnames=rep("",nfiles) + for (i in 1:nfiles){ + libnames[i]=corresp$Sample_name[corresp$File==sampleFiles[[i]]] + } + + #For all files located in path2htseqfiles read the corresponding file into R + library(plyr) + datalist = list() + for(i in 1:nfiles){ + rawdata=read.table(paste(paste(path2htseqfiles,sampleFiles[i],sep="/"),"txt",sep=".")) + #noSpikes_htseq. + nbrrows=nrow(rawdata) + datalist[[i]]=rawdata[1:(nbrrows-5), ] # skip the last 5 lines of HTSeq-count files + colnames(datalist[[i]]) = c("ID",libnames[i]) + } + + #Join all the files in a data.frame called datafile with rownames = gene id + datafile = join_all(datalist, by = "ID", type = "left", match = "all") + + #Calculate the number of geneID pro file + nbID=data.frame(rep("",nfiles)) + for(i in 1:nfiles){ + nbID[,i]=nrow(datalist[[i]]) + } + totalnbID=apply((nbID[,1:4]),1,sum) + + #Verify that all the files contain the same gene ID + if (nrow(datafile)*4==totalnbID[1]){ + + #Suppress genes not expressed in all samples + datafile = datafile[apply(datafile[,2:(nfiles+1)],1,sum)!=0,] + row.names(datafile)=datafile[,1] + data=datafile[,-1] + + #Number of libraries + nblib= dim(data)[2] + #Determine Data + normalization if the specie is not known + if (Species==""){ + #Normalized data calculation + nbcol = dim(data)[2] #nb of column in the data.frame + library(DESeq2) + conds = factor(1:nblib) + design = data.frame(Condition=conds) + dds = DESeqDataSetFromMatrix(countData=data, colData=design, design=~Condition) + dds = estimateSizeFactors(dds) + datanorm = t(t(data)/sizeFactors(dds)) + + #Data + normalization + dataall = data.frame(row.names(datafile), data, datanorm ) + + #Renames columns + colnames(dataall) = c("Ensembl gene id", paste(libnames,"(raw read counts)"), paste(libnames,"(normalized)")) + write.table(dataall, file=fileout, sep="\t", quote=F, row.names=F) + } + + #Determine Data + normalization + annotation if the specie is known + else{ + #Add annotations and calculate gene length + library(biomaRt) + + #Convert Ensembl version to host + conversionfile = read.table(conversionensemblversion,header=T,sep="\t",colClasses=c("numeric","character")) + correspondingdate = conversionfile[conversionfile$Version == ensversion, 2] + host = paste(correspondingdate, ".archive.ensembl.org/biomart/martservice/", sep="") + + #Convert species name to the name of the corresponding bmdataset + conversion = read.table(conversionensemblname,header=T,sep="\t",colClasses=c("character","character")) + bmdataset = conversion[conversion$Specie == Species, 2] + ensembl=useMart("ENSEMBL_MART_ENSEMBL", host=host, dataset=bmdataset) + if (ensversion<=75){ + annotation1 = getBM(attributes=c("ensembl_gene_id","external_gene_id","description", "ensembl_transcript_id","exon_chrom_start","exon_chrom_end"),filters="ensembl_gene_id", values=rownames(data), mart=ensembl) + } + else{ + annotation1 = getBM(attributes=c("ensembl_gene_id","external_gene_name","description", "ensembl_transcript_id","exon_chrom_start","exon_chrom_end"),filters="ensembl_gene_id", values=rownames(data), mart=ensembl) + } + + #because all the annotations are not always found in a first step + not = rownames(data)[!rownames(data) %in% unique(annotation1$ensembl_gene_id)] + if (length(not) !=0){ + annotationnot = getBM(attributes=c("ensembl_gene_id","external_gene_id","description", "ensembl_transcript_id","exon_chrom_start","exon_chrom_end"), filters="ensembl_gene_id", values=not, mart=ensembl) + annotation = rbind(annotation1, annotationnot) + } + else{ + annotation = annotation1 + } + + #Exon length + ensinfos.exlen = data.frame(annotation$ensembl_gene_id, annotation$ensembl_transcript_id, abs(annotation$exon_chrom_start - annotation$exon_chrom_end)+1) + colnames(ensinfos.exlen) = c("ensembl_gene_id", "ensembl_transcript_id", "exon_length") + + #Transcript length + tlen = tapply(ensinfos.exlen$exon_length, ensinfos.exlen$ensembl_transcript_id, sum) + tlen.gene = merge(tlen, unique(ensinfos.exlen[,1:2]), by.x="row.names", by.y="ensembl_transcript_id") + colnames(tlen.gene) = c("ensembl_transcript_id", "transcript_length","ensembl_gene_id") + + #Gene length = median of the size of all transcripts corresponding to this gene + glen = tapply(tlen.gene$transcript_length, tlen.gene$ensembl_gene_id, median) + + #Data with gene length + datalen = merge(data, glen, by="row.names") + colnames(datalen) = c("Ensembl_gene_id",colnames(data), "Gene_length") + + #Data with annotations and gene length + annotationgene = unique(annotation[,1:3]) + dataannot = merge(datalen, annotationgene, by.x="Ensembl_gene_id", by.y="ensembl_gene_id") + + #To keep only the first part of the gene description (before [) + tmpdesc = strsplit(as.character(dataannot$description),"[", fixed=T) + f = function(l){ + if (length(l)>=1){ + return(l[[1]]) + } + else{ + return("") + } + } + tmpdescok = unlist(lapply(tmpdesc, f)) + dataannot$description = tmpdescok + + #Normalized data calculation + nbcol = dim(dataannot)[2] #nb of column in the data.frame + library(DESeq2) + conds = factor(1:nblib) + design = data.frame(Condition=conds) + dds = DESeqDataSetFromMatrix(countData=dataannot[,-c(1,nbcol,nbcol-1,nbcol-2)], colData=design, design=~Condition) + dds = estimateSizeFactors(dds) + datanorm = t(t(dataannot[,-c(1,nbcol,nbcol-1,nbcol-2)])/sizeFactors(dds)) + + #Normalized data adjusted for gene length (normalized data / gene length) + rpkn = datanorm / (as.vector(dataannot[,nbcol-2]/1000 )) + + #Data + annotations + rpkn + dataall = data.frame(dataannot[,-c(nbcol,nbcol-1,nbcol-2)] , datanorm, rpkn, dataannot[,c(nbcol-1,nbcol)] ) + + #Renames columns + colnames(dataall) = c("Ensembl gene id", paste(libnames,"(raw read counts)"), paste(libnames,"(normalized)"), paste(libnames,"(normalized and divided by gene length in kb)"), "Gene name", "Description") + write.table(dataall, file=fileout, sep="\t", quote=F, row.names=F) + + #Return(dataall) + + } + } + else{ + print("The files are not the same length") + } +} + +args <- commandArgs(trailingOnly = TRUE) +print(args) + +RNAseqDataAnnotation(args[1], args[2],args[3], args[4], args[5], args[6], args[7]) + +#R --slave --vanilla --verbose --file=/home/lornage/Bureau/Pour_galaxy/RNAseqDataAnnotation.R --args /home/lornage/Bureau/Test_function /home/lornage/Bureau/ichierconvertitnames.txt Homo_sapiens 75 /home/lornage/Bureau/testttttt5.txt /home/lornage/Bureau/Script_R/Ensembl_Version_Host.txt /home/lornage/Bureau/Script_R/Ensemble_Specie_Dataset.txt + + + + + + diff -r 90a12c339d99 -r 30506ce512f0 RNAseqDataAnnotation/RNAseqDataAnnotation.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/RNAseqDataAnnotation/RNAseqDataAnnotation.xml Thu Nov 20 03:26:52 2014 -0500 @@ -0,0 +1,52 @@ + + tool for RNAseq Data Normalisation and Annotation + + + R_3_0_2 + DESeq2biomaRt + + + + R --slave --vanilla --file=RNAseqDataAnnotation.R --args + $path2htseqfiles + $samplenamefile + $Species + $ensversion + $conversionensemblversion + $conversionensemblname + $fileout + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +**What it does* +**Example** + + diff -r 90a12c339d99 -r 30506ce512f0 RNAseqDataAnnotation/packages.R --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/RNAseqDataAnnotation/packages.R Thu Nov 20 03:26:52 2014 -0500 @@ -0,0 +1,5 @@ +source("http://bioconductor.org/biocLite.R") +biocLite() +biocLite("DESeq2") +biocLite("biomaRt") +install.packages("plyr") diff -r 90a12c339d99 -r 30506ce512f0 RNAseqDataAnnotation/test-data/Ensembl_Version_Host.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/RNAseqDataAnnotation/test-data/Ensembl_Version_Host.txt Thu Nov 20 03:26:52 2014 -0500 @@ -0,0 +1,12 @@ +Version Host +77 oct2014 +76 aug2014 +75 feb2014 +74 dec2013 +73 sep2013 +72 jun2013 +71 apr2013 +70 jan2013 +69 oct2012 +68 jul2012 +67 may2012 diff -r 90a12c339d99 -r 30506ce512f0 RNAseqDataAnnotation/test-data/Ensemble_Specie_Dataset.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/RNAseqDataAnnotation/test-data/Ensemble_Specie_Dataset.txt Thu Nov 20 03:26:52 2014 -0500 @@ -0,0 +1,3 @@ +Specie Dataset +Homo_sapiens hsapiens_gene_ensembl +Mus_musculus mmusculus_gene_ensembl diff -r 90a12c339d99 -r 30506ce512f0 RNAseqDataAnnotation/test-data/Fichier1.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/RNAseqDataAnnotation/test-data/Fichier1.txt Thu Nov 20 03:26:52 2014 -0500 @@ -0,0 +1,19 @@ +ENSG00000000005 0 +ENSG00000000419 2661 +ENSG00000000457 602 +ENSG00000000460 2077 +ENSG00000000938 2 +ENSG00000000971 75 +ENSG00000001036 2389 +ENSG00000001084 1730 +ENSG00000001167 1473 +ENSG00000001460 387 +ENSG00000001461 905 +ENSG00000001497 2975 +ENSG00000001561 19 +ENSG00000001617 118 +ENSG00000001626 2 +ENSG00000001629 2559 +ENSG00000001630 314 +ENSG00000001631 1581 +ENSG00000002016 307 diff -r 90a12c339d99 -r 30506ce512f0 RNAseqDataAnnotation/test-data/Fichier2.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/RNAseqDataAnnotation/test-data/Fichier2.txt Thu Nov 20 03:26:52 2014 -0500 @@ -0,0 +1,19 @@ +ENSG00000000005 0 +ENSG00000000419 3409 +ENSG00000000457 706 +ENSG00000000460 2385 +ENSG00000000938 0 +ENSG00000000971 100 +ENSG00000001036 2876 +ENSG00000001084 2154 +ENSG00000001167 1695 +ENSG00000001460 405 +ENSG00000001461 1010 +ENSG00000001497 3344 +ENSG00000001561 27 +ENSG00000001617 132 +ENSG00000001626 1 +ENSG00000001629 3042 +ENSG00000001630 352 +ENSG00000001631 1865 +ENSG00000002016 375 diff -r 90a12c339d99 -r 30506ce512f0 RNAseqDataAnnotation/test-data/Fichier3.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/RNAseqDataAnnotation/test-data/Fichier3.txt Thu Nov 20 03:26:52 2014 -0500 @@ -0,0 +1,19 @@ +ENSG00000000005 0 +ENSG00000000419 2171 +ENSG00000000457 484 +ENSG00000000460 1056 +ENSG00000000938 1 +ENSG00000000971 157 +ENSG00000001036 2019 +ENSG00000001084 1580 +ENSG00000001167 1290 +ENSG00000001460 311 +ENSG00000001461 1607 +ENSG00000001497 2217 +ENSG00000001561 34 +ENSG00000001617 54 +ENSG00000001626 2 +ENSG00000001629 2116 +ENSG00000001630 207 +ENSG00000001631 1501 +ENSG00000002016 263 diff -r 90a12c339d99 -r 30506ce512f0 RNAseqDataAnnotation/test-data/Fichier4.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/RNAseqDataAnnotation/test-data/Fichier4.txt Thu Nov 20 03:26:52 2014 -0500 @@ -0,0 +1,19 @@ +ENSG00000000005 0 +ENSG00000000419 2495 +ENSG00000000457 521 +ENSG00000000460 1092 +ENSG00000000938 1 +ENSG00000000971 192 +ENSG00000001036 2217 +ENSG00000001084 1685 +ENSG00000001167 1509 +ENSG00000001460 362 +ENSG00000001461 1622 +ENSG00000001497 2369 +ENSG00000001561 41 +ENSG00000001617 69 +ENSG00000001626 4 +ENSG00000001629 2361 +ENSG00000001630 215 +ENSG00000001631 1626 +ENSG00000002016 295 diff -r 90a12c339d99 -r 30506ce512f0 RNAseqDataAnnotation/test-data/ichierconvertitnames.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/RNAseqDataAnnotation/test-data/ichierconvertitnames.txt Thu Nov 20 03:26:52 2014 -0500 @@ -0,0 +1,5 @@ +File Sample_name +Fichier1.fastq.gz siLuc2 +Fichier2.fastq.gz siLuc3 +Fichier3.fastq.gz siMitf3 +Fichier4.fastq.gz siMitf4 diff -r 90a12c339d99 -r 30506ce512f0 RNAseqDataAnnotation/tool_dependencies.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/RNAseqDataAnnotation/tool_dependencies.xml Thu Nov 20 03:26:52 2014 -0500 @@ -0,0 +1,22 @@ + + + + + + + + + + + + + + R CMD BATCH packages.R + echo "export PATH=$PATH" > $INSTALL_DIR/env.sh + chmod 755 $INSTALL_DIR/env.sh + + + + + +