# HG changeset patch
# User eganrol
# Date 1416416388 18000
# Node ID 2c23b93b1026ca09ad5ae9f7b1258a475af4344f
# Parent d30a5a491cea7ad8c3ba00b76298a7adaf90e128
Deleted selected files
diff -r d30a5a491cea -r 2c23b93b1026 RNAseqDataAnnotation/RNAseqDataAnnotation.R
--- a/RNAseqDataAnnotation/RNAseqDataAnnotation.R Wed Nov 19 11:59:20 2014 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,209 +0,0 @@
-#Author : keime / lornage
-#Date : 2014/11
-
-
-########################################################################################################
-#This function concatenates htseq-count result files, normalizes data and annotates data using Ensembl annotations
-
-#arguments
-#path2htseqfiles : path to htseq-count result files
-#samplenamefile : path ta a tabulated text file with 2 columns : 1. File name 2. Sample names and an header
-#Species : latin name of the species
-#ensversion : version of Ensembl to use
-#fileout : .txt file containing for each library ; gene id, raw read counts, normalized data as well as normalized data/gene length
-#conversionensembleversion : tab-delimited file allowing conversion of the Ensembl version to the host
-# (Column1 : Version Column2 : Host)
-#conversionensemblname : tab-delimited file allowing conversion of species name to the name of the Ensembl dataset to use
-# (Column1 : Specie Column2 : Dataset)
-
-#output : a data.frame with the following columns :
-#ensembl gene id
-#raw read counts for each library (one column per library)
-#normalized data for each library (one column per library)
-#normalized data divided by gene length for each library (one column per library)
-#Gene name
-#Description
-
-#require : biomaRt and DESeq2 Bioconductor packages / package plyr1.8.1
-
-#Methods :
-#Considering that the resulting files of HTSeq-count have 5 lines of comments in the end
-#Normalization is performed using the method described in Genome Biology 2010;11(10):R106
-#and implemented in the DESeq2 Bioconductor package
-#Gene length correspond to the median of the size of all transcripts corresponding to this gene
-#########################################################################################################
-
-
-
-RNAseqDataAnnotation = function(path2htseqfiles, samplenamefile, Species, ensversion, fileout, conversionensemblversion, conversionensemblname){
-
- #Create a list with the file names in path2htseqfiles
- sampleFiles=list.files(path2htseqfiles)
- sampleFiles=strsplit(sampleFiles,".txt")
- #_noSpikes_htseq
- nfiles=length(sampleFiles)
-
- #Read the data in samplenamefile. Create a data frame establishing the correspondence between file names and sample names
- corresp = read.table(samplenamefile,header=T,sep="\t",colClasses=c("character","character"))
- corresp$File = strsplit(corresp$File,".fastq.gz")
-
- #Create a string vector called libnames that contains the name of the samples in the same order as in sampleFiles
- libnames=rep("",nfiles)
- for (i in 1:nfiles){
- libnames[i]=corresp$Sample_name[corresp$File==sampleFiles[[i]]]
- }
-
- #For all files located in path2htseqfiles read the corresponding file into R
- library(plyr)
- datalist = list()
- for(i in 1:nfiles){
- rawdata=read.table(paste(paste(path2htseqfiles,sampleFiles[i],sep="/"),"txt",sep="."))
- #noSpikes_htseq.
- nbrrows=nrow(rawdata)
- datalist[[i]]=rawdata[1:(nbrrows-5), ] # skip the last 5 lines of HTSeq-count files
- colnames(datalist[[i]]) = c("ID",libnames[i])
- }
-
- #Join all the files in a data.frame called datafile with rownames = gene id
- datafile = join_all(datalist, by = "ID", type = "left", match = "all")
-
- #Calculate the number of geneID pro file
- nbID=data.frame(rep("",nfiles))
- for(i in 1:nfiles){
- nbID[,i]=nrow(datalist[[i]])
- }
- totalnbID=apply((nbID[,1:4]),1,sum)
-
- #Verify that all the files contain the same gene ID
- if (nrow(datafile)*4==totalnbID[1]){
-
- #Suppress genes not expressed in all samples
- datafile = datafile[apply(datafile[,2:(nfiles+1)],1,sum)!=0,]
- row.names(datafile)=datafile[,1]
- data=datafile[,-1]
-
- #Number of libraries
- nblib= dim(data)[2]
- #Determine Data + normalization if the specie is not known
- if (Species==""){
- #Normalized data calculation
- nbcol = dim(data)[2] #nb of column in the data.frame
- library(DESeq2)
- conds = factor(1:nblib)
- design = data.frame(Condition=conds)
- dds = DESeqDataSetFromMatrix(countData=data, colData=design, design=~Condition)
- dds = estimateSizeFactors(dds)
- datanorm = t(t(data)/sizeFactors(dds))
-
- #Data + normalization
- dataall = data.frame(row.names(datafile), data, datanorm )
-
- #Renames columns
- colnames(dataall) = c("Ensembl gene id", paste(libnames,"(raw read counts)"), paste(libnames,"(normalized)"))
- write.table(dataall, file=fileout, sep="\t", quote=F, row.names=F)
- }
-
- #Determine Data + normalization + annotation if the specie is known
- else{
- #Add annotations and calculate gene length
- library(biomaRt)
-
- #Convert Ensembl version to host
- conversionfile = read.table(conversionensemblversion,header=T,sep="\t",colClasses=c("numeric","character"))
- correspondingdate = conversionfile[conversionfile$Version == ensversion, 2]
- host = paste(correspondingdate, ".archive.ensembl.org/biomart/martservice/", sep="")
-
- #Convert species name to the name of the corresponding bmdataset
- conversion = read.table(conversionensemblname,header=T,sep="\t",colClasses=c("character","character"))
- bmdataset = conversion[conversion$Specie == Species, 2]
- ensembl=useMart("ENSEMBL_MART_ENSEMBL", host=host, dataset=bmdataset)
- if (ensversion<=75){
- annotation1 = getBM(attributes=c("ensembl_gene_id","external_gene_id","description", "ensembl_transcript_id","exon_chrom_start","exon_chrom_end"),filters="ensembl_gene_id", values=rownames(data), mart=ensembl)
- }
- else{
- annotation1 = getBM(attributes=c("ensembl_gene_id","external_gene_name","description", "ensembl_transcript_id","exon_chrom_start","exon_chrom_end"),filters="ensembl_gene_id", values=rownames(data), mart=ensembl)
- }
-
- #because all the annotations are not always found in a first step
- not = rownames(data)[!rownames(data) %in% unique(annotation1$ensembl_gene_id)]
- if (length(not) !=0){
- annotationnot = getBM(attributes=c("ensembl_gene_id","external_gene_id","description", "ensembl_transcript_id","exon_chrom_start","exon_chrom_end"), filters="ensembl_gene_id", values=not, mart=ensembl)
- annotation = rbind(annotation1, annotationnot)
- }
- else{
- annotation = annotation1
- }
-
- #Exon length
- ensinfos.exlen = data.frame(annotation$ensembl_gene_id, annotation$ensembl_transcript_id, abs(annotation$exon_chrom_start - annotation$exon_chrom_end)+1)
- colnames(ensinfos.exlen) = c("ensembl_gene_id", "ensembl_transcript_id", "exon_length")
-
- #Transcript length
- tlen = tapply(ensinfos.exlen$exon_length, ensinfos.exlen$ensembl_transcript_id, sum)
- tlen.gene = merge(tlen, unique(ensinfos.exlen[,1:2]), by.x="row.names", by.y="ensembl_transcript_id")
- colnames(tlen.gene) = c("ensembl_transcript_id", "transcript_length","ensembl_gene_id")
-
- #Gene length = median of the size of all transcripts corresponding to this gene
- glen = tapply(tlen.gene$transcript_length, tlen.gene$ensembl_gene_id, median)
-
- #Data with gene length
- datalen = merge(data, glen, by="row.names")
- colnames(datalen) = c("Ensembl_gene_id",colnames(data), "Gene_length")
-
- #Data with annotations and gene length
- annotationgene = unique(annotation[,1:3])
- dataannot = merge(datalen, annotationgene, by.x="Ensembl_gene_id", by.y="ensembl_gene_id")
-
- #To keep only the first part of the gene description (before [)
- tmpdesc = strsplit(as.character(dataannot$description),"[", fixed=T)
- f = function(l){
- if (length(l)>=1){
- return(l[[1]])
- }
- else{
- return("")
- }
- }
- tmpdescok = unlist(lapply(tmpdesc, f))
- dataannot$description = tmpdescok
-
- #Normalized data calculation
- nbcol = dim(dataannot)[2] #nb of column in the data.frame
- library(DESeq2)
- conds = factor(1:nblib)
- design = data.frame(Condition=conds)
- dds = DESeqDataSetFromMatrix(countData=dataannot[,-c(1,nbcol,nbcol-1,nbcol-2)], colData=design, design=~Condition)
- dds = estimateSizeFactors(dds)
- datanorm = t(t(dataannot[,-c(1,nbcol,nbcol-1,nbcol-2)])/sizeFactors(dds))
-
- #Normalized data adjusted for gene length (normalized data / gene length)
- rpkn = datanorm / (as.vector(dataannot[,nbcol-2]/1000 ))
-
- #Data + annotations + rpkn
- dataall = data.frame(dataannot[,-c(nbcol,nbcol-1,nbcol-2)] , datanorm, rpkn, dataannot[,c(nbcol-1,nbcol)] )
-
- #Renames columns
- colnames(dataall) = c("Ensembl gene id", paste(libnames,"(raw read counts)"), paste(libnames,"(normalized)"), paste(libnames,"(normalized and divided by gene length in kb)"), "Gene name", "Description")
- write.table(dataall, file=fileout, sep="\t", quote=F, row.names=F)
-
- #Return(dataall)
-
- }
- }
- else{
- print("The files are not the same length")
- }
-}
-
-args <- commandArgs(trailingOnly = TRUE)
-print(args)
-
-RNAseqDataAnnotation(args[1], args[2],args[3], args[4], args[5], args[6], args[7])
-
-#R --slave --vanilla --verbose --file=/home/lornage/Bureau/Pour_galaxy/RNAseqDataAnnotation.R --args /home/lornage/Bureau/Test_function /home/lornage/Bureau/ichierconvertitnames.txt Homo_sapiens 75 /home/lornage/Bureau/testttttt5.txt /home/lornage/Bureau/Script_R/Ensembl_Version_Host.txt /home/lornage/Bureau/Script_R/Ensemble_Specie_Dataset.txt
-
-
-
-
-
-
diff -r d30a5a491cea -r 2c23b93b1026 RNAseqDataAnnotation/RNAseqDataAnnotation.xml
--- a/RNAseqDataAnnotation/RNAseqDataAnnotation.xml Wed Nov 19 11:59:20 2014 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,52 +0,0 @@
-
- tool for RNAseq Data Normalisation and Annotation
-
-
- R
- DESeq2biomaRt
-
-
-
- R --slave --vanilla --file=RNAseqDataAnnotation.R --args
- $path2htseqfiles
- $samplenamefile
- $Species
- $ensversion
- $conversionensemblversion
- $conversionensemblname
- $fileout
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-**What it does*
-**Example**
-
-
diff -r d30a5a491cea -r 2c23b93b1026 RNAseqDataAnnotation/packages.R
--- a/RNAseqDataAnnotation/packages.R Wed Nov 19 11:59:20 2014 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,5 +0,0 @@
-source("http://bioconductor.org/biocLite.R")
-biocLite()
-biocLite("DESeq2")
-biocLite("biomaRt")
-install.packages("plyr")
diff -r d30a5a491cea -r 2c23b93b1026 RNAseqDataAnnotation/tool_dependencies.xml
--- a/RNAseqDataAnnotation/tool_dependencies.xml Wed Nov 19 11:59:20 2014 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,22 +0,0 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
- R CMD BATCH packages.R
- echo "export PATH=$PATH" > $INSTALL_DIR/env.sh
- chmod 755 $INSTALL_DIR/env.sh
-
-
-
-
-
-