# HG changeset patch # User eganrol # Date 1416416717 18000 # Node ID 80216215973c380e1c469e133b85e19f79f350ff # Parent 22da2dc0e10323afc9d3cd004645d6fea839460e Deleted selected files diff -r 22da2dc0e103 -r 80216215973c RNAseqDataAnnotation/RNAseqDataAnnotation.R --- a/RNAseqDataAnnotation/RNAseqDataAnnotation.R Wed Nov 19 12:00:59 2014 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,209 +0,0 @@ -#Author : keime / lornage -#Date : 2014/11 - - -######################################################################################################## -#This function concatenates htseq-count result files, normalizes data and annotates data using Ensembl annotations - -#arguments -#path2htseqfiles : path to htseq-count result files -#samplenamefile : path ta a tabulated text file with 2 columns : 1. File name 2. Sample names and an header -#Species : latin name of the species -#ensversion : version of Ensembl to use -#fileout : .txt file containing for each library ; gene id, raw read counts, normalized data as well as normalized data/gene length -#conversionensembleversion : tab-delimited file allowing conversion of the Ensembl version to the host -# (Column1 : Version Column2 : Host) -#conversionensemblname : tab-delimited file allowing conversion of species name to the name of the Ensembl dataset to use -# (Column1 : Specie Column2 : Dataset) - -#output : a data.frame with the following columns : -#ensembl gene id -#raw read counts for each library (one column per library) -#normalized data for each library (one column per library) -#normalized data divided by gene length for each library (one column per library) -#Gene name -#Description - -#require : biomaRt and DESeq2 Bioconductor packages / package plyr1.8.1 - -#Methods : -#Considering that the resulting files of HTSeq-count have 5 lines of comments in the end -#Normalization is performed using the method described in Genome Biology 2010;11(10):R106 -#and implemented in the DESeq2 Bioconductor package -#Gene length correspond to the median of the size of all transcripts corresponding to this gene -######################################################################################################### - - - -RNAseqDataAnnotation = function(path2htseqfiles, samplenamefile, Species, ensversion, fileout, conversionensemblversion, conversionensemblname){ - - #Create a list with the file names in path2htseqfiles - sampleFiles=list.files(path2htseqfiles) - sampleFiles=strsplit(sampleFiles,".txt") - #_noSpikes_htseq - nfiles=length(sampleFiles) - - #Read the data in samplenamefile. Create a data frame establishing the correspondence between file names and sample names - corresp = read.table(samplenamefile,header=T,sep="\t",colClasses=c("character","character")) - corresp$File = strsplit(corresp$File,".fastq.gz") - - #Create a string vector called libnames that contains the name of the samples in the same order as in sampleFiles - libnames=rep("",nfiles) - for (i in 1:nfiles){ - libnames[i]=corresp$Sample_name[corresp$File==sampleFiles[[i]]] - } - - #For all files located in path2htseqfiles read the corresponding file into R - library(plyr) - datalist = list() - for(i in 1:nfiles){ - rawdata=read.table(paste(paste(path2htseqfiles,sampleFiles[i],sep="/"),"txt",sep=".")) - #noSpikes_htseq. - nbrrows=nrow(rawdata) - datalist[[i]]=rawdata[1:(nbrrows-5), ] # skip the last 5 lines of HTSeq-count files - colnames(datalist[[i]]) = c("ID",libnames[i]) - } - - #Join all the files in a data.frame called datafile with rownames = gene id - datafile = join_all(datalist, by = "ID", type = "left", match = "all") - - #Calculate the number of geneID pro file - nbID=data.frame(rep("",nfiles)) - for(i in 1:nfiles){ - nbID[,i]=nrow(datalist[[i]]) - } - totalnbID=apply((nbID[,1:4]),1,sum) - - #Verify that all the files contain the same gene ID - if (nrow(datafile)*4==totalnbID[1]){ - - #Suppress genes not expressed in all samples - datafile = datafile[apply(datafile[,2:(nfiles+1)],1,sum)!=0,] - row.names(datafile)=datafile[,1] - data=datafile[,-1] - - #Number of libraries - nblib= dim(data)[2] - #Determine Data + normalization if the specie is not known - if (Species==""){ - #Normalized data calculation - nbcol = dim(data)[2] #nb of column in the data.frame - library(DESeq2) - conds = factor(1:nblib) - design = data.frame(Condition=conds) - dds = DESeqDataSetFromMatrix(countData=data, colData=design, design=~Condition) - dds = estimateSizeFactors(dds) - datanorm = t(t(data)/sizeFactors(dds)) - - #Data + normalization - dataall = data.frame(row.names(datafile), data, datanorm ) - - #Renames columns - colnames(dataall) = c("Ensembl gene id", paste(libnames,"(raw read counts)"), paste(libnames,"(normalized)")) - write.table(dataall, file=fileout, sep="\t", quote=F, row.names=F) - } - - #Determine Data + normalization + annotation if the specie is known - else{ - #Add annotations and calculate gene length - library(biomaRt) - - #Convert Ensembl version to host - conversionfile = read.table(conversionensemblversion,header=T,sep="\t",colClasses=c("numeric","character")) - correspondingdate = conversionfile[conversionfile$Version == ensversion, 2] - host = paste(correspondingdate, ".archive.ensembl.org/biomart/martservice/", sep="") - - #Convert species name to the name of the corresponding bmdataset - conversion = read.table(conversionensemblname,header=T,sep="\t",colClasses=c("character","character")) - bmdataset = conversion[conversion$Specie == Species, 2] - ensembl=useMart("ENSEMBL_MART_ENSEMBL", host=host, dataset=bmdataset) - if (ensversion<=75){ - annotation1 = getBM(attributes=c("ensembl_gene_id","external_gene_id","description", "ensembl_transcript_id","exon_chrom_start","exon_chrom_end"),filters="ensembl_gene_id", values=rownames(data), mart=ensembl) - } - else{ - annotation1 = getBM(attributes=c("ensembl_gene_id","external_gene_name","description", "ensembl_transcript_id","exon_chrom_start","exon_chrom_end"),filters="ensembl_gene_id", values=rownames(data), mart=ensembl) - } - - #because all the annotations are not always found in a first step - not = rownames(data)[!rownames(data) %in% unique(annotation1$ensembl_gene_id)] - if (length(not) !=0){ - annotationnot = getBM(attributes=c("ensembl_gene_id","external_gene_id","description", "ensembl_transcript_id","exon_chrom_start","exon_chrom_end"), filters="ensembl_gene_id", values=not, mart=ensembl) - annotation = rbind(annotation1, annotationnot) - } - else{ - annotation = annotation1 - } - - #Exon length - ensinfos.exlen = data.frame(annotation$ensembl_gene_id, annotation$ensembl_transcript_id, abs(annotation$exon_chrom_start - annotation$exon_chrom_end)+1) - colnames(ensinfos.exlen) = c("ensembl_gene_id", "ensembl_transcript_id", "exon_length") - - #Transcript length - tlen = tapply(ensinfos.exlen$exon_length, ensinfos.exlen$ensembl_transcript_id, sum) - tlen.gene = merge(tlen, unique(ensinfos.exlen[,1:2]), by.x="row.names", by.y="ensembl_transcript_id") - colnames(tlen.gene) = c("ensembl_transcript_id", "transcript_length","ensembl_gene_id") - - #Gene length = median of the size of all transcripts corresponding to this gene - glen = tapply(tlen.gene$transcript_length, tlen.gene$ensembl_gene_id, median) - - #Data with gene length - datalen = merge(data, glen, by="row.names") - colnames(datalen) = c("Ensembl_gene_id",colnames(data), "Gene_length") - - #Data with annotations and gene length - annotationgene = unique(annotation[,1:3]) - dataannot = merge(datalen, annotationgene, by.x="Ensembl_gene_id", by.y="ensembl_gene_id") - - #To keep only the first part of the gene description (before [) - tmpdesc = strsplit(as.character(dataannot$description),"[", fixed=T) - f = function(l){ - if (length(l)>=1){ - return(l[[1]]) - } - else{ - return("") - } - } - tmpdescok = unlist(lapply(tmpdesc, f)) - dataannot$description = tmpdescok - - #Normalized data calculation - nbcol = dim(dataannot)[2] #nb of column in the data.frame - library(DESeq2) - conds = factor(1:nblib) - design = data.frame(Condition=conds) - dds = DESeqDataSetFromMatrix(countData=dataannot[,-c(1,nbcol,nbcol-1,nbcol-2)], colData=design, design=~Condition) - dds = estimateSizeFactors(dds) - datanorm = t(t(dataannot[,-c(1,nbcol,nbcol-1,nbcol-2)])/sizeFactors(dds)) - - #Normalized data adjusted for gene length (normalized data / gene length) - rpkn = datanorm / (as.vector(dataannot[,nbcol-2]/1000 )) - - #Data + annotations + rpkn - dataall = data.frame(dataannot[,-c(nbcol,nbcol-1,nbcol-2)] , datanorm, rpkn, dataannot[,c(nbcol-1,nbcol)] ) - - #Renames columns - colnames(dataall) = c("Ensembl gene id", paste(libnames,"(raw read counts)"), paste(libnames,"(normalized)"), paste(libnames,"(normalized and divided by gene length in kb)"), "Gene name", "Description") - write.table(dataall, file=fileout, sep="\t", quote=F, row.names=F) - - #Return(dataall) - - } - } - else{ - print("The files are not the same length") - } -} - -args <- commandArgs(trailingOnly = TRUE) -print(args) - -RNAseqDataAnnotation(args[1], args[2],args[3], args[4], args[5], args[6], args[7]) - -#R --slave --vanilla --verbose --file=/home/lornage/Bureau/Pour_galaxy/RNAseqDataAnnotation.R --args /home/lornage/Bureau/Test_function /home/lornage/Bureau/ichierconvertitnames.txt Homo_sapiens 75 /home/lornage/Bureau/testttttt5.txt /home/lornage/Bureau/Script_R/Ensembl_Version_Host.txt /home/lornage/Bureau/Script_R/Ensemble_Specie_Dataset.txt - - - - - - diff -r 22da2dc0e103 -r 80216215973c RNAseqDataAnnotation/RNAseqDataAnnotation.xml --- a/RNAseqDataAnnotation/RNAseqDataAnnotation.xml Wed Nov 19 12:00:59 2014 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,52 +0,0 @@ - - tool for RNAseq Data Normalisation and Annotation - - - R_3_0_2 - DESeq2biomaRt - - - - R --slave --vanilla --file=RNAseqDataAnnotation.R --args - $path2htseqfiles - $samplenamefile - $Species - $ensversion - $conversionensemblversion - $conversionensemblname - $fileout - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does* -**Example** - - diff -r 22da2dc0e103 -r 80216215973c RNAseqDataAnnotation/packages.R --- a/RNAseqDataAnnotation/packages.R Wed Nov 19 12:00:59 2014 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,5 +0,0 @@ -source("http://bioconductor.org/biocLite.R") -biocLite() -biocLite("DESeq2") -biocLite("biomaRt") -install.packages("plyr") diff -r 22da2dc0e103 -r 80216215973c RNAseqDataAnnotation/tool_dependencies.xml --- a/RNAseqDataAnnotation/tool_dependencies.xml Wed Nov 19 12:00:59 2014 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,22 +0,0 @@ - - - - - - - - - - - - - - R CMD BATCH packages.R - echo "export PATH=$PATH" > $INSTALL_DIR/env.sh - chmod 755 $INSTALL_DIR/env.sh - - - - - -