Mercurial > repos > eganrol > rnaseqdataannotation

--- a/RNAseqDataAnnotation/RNAseqDataAnnotation.R	Thu Nov 20 03:51:36 2014 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,209 +0,0 @@
-#Author : keime / lornage
-#Date : 2014/11
-
-
-########################################################################################################
-#This function concatenates htseq-count result files, normalizes data and annotates data using Ensembl annotations
-
-#arguments
-#path2htseqfiles : path to htseq-count result files
-#samplenamefile : path ta a tabulated text file with 2 columns : 1. File name 2. Sample names and an header
-#Species : latin name of the species
-#ensversion : version of Ensembl to use
-#fileout : .txt file containing for each library ; gene id, raw read counts, normalized data as well as normalized data/gene length
-#conversionensembleversion : tab-delimited file allowing conversion of the Ensembl version to the host
-#							 (Column1 : Version	 Column2 : Host)
-#conversionensemblname : tab-delimited file allowing conversion of species name to the name of the Ensembl dataset to use
-#						 (Column1 : Specie Column2 : Dataset)
-
-#output : a data.frame with the following columns :
-#ensembl gene id
-#raw read counts for each library (one column per library)
-#normalized data for each library (one column per library)
-#normalized data divided by gene length for each library (one column per library)
-#Gene name
-#Description
-
-#require : biomaRt and DESeq2 Bioconductor packages / package plyr1.8.1
-
-#Methods :
-#Considering that the resulting files of HTSeq-count have 5 lines of comments in the end
-#Normalization is performed using the method described in Genome Biology 2010;11(10):R106
-#and implemented in the DESeq2 Bioconductor package
-#Gene length correspond to the median of the size of all transcripts corresponding to this gene
-#########################################################################################################
-
-
-
-RNAseqDataAnnotation = function(path2htseqfiles, samplenamefile, Species, ensversion, fileout, conversionensemblversion, conversionensemblname){
-
-  #Create a list with the file names in path2htseqfiles
-	sampleFiles=list.files(path2htseqfiles)
-	sampleFiles=strsplit(sampleFiles,".txt")
-	#_noSpikes_htseq
-	nfiles=length(sampleFiles)
-
-  #Read the data in samplenamefile. Create a data frame establishing the correspondence between file names and sample names
-	corresp = read.table(samplenamefile,header=T,sep="\t",colClasses=c("character","character"))
-	corresp$File = strsplit(corresp$File,".fastq.gz")
-
-  #Create a string vector called libnames that contains the name of the samples in the same order as in sampleFiles
-	libnames=rep("",nfiles)
-	for (i in 1:nfiles){
-		libnames[i]=corresp$Sample_name[corresp$File==sampleFiles[[i]]]
-	}
-
-  #For all files located in path2htseqfiles read the corresponding file into R
-	library(plyr)
-	datalist = list()
-	for(i in 1:nfiles){
-		rawdata=read.table(paste(paste(path2htseqfiles,sampleFiles[i],sep="/"),"txt",sep="."))
-		#noSpikes_htseq.
-		nbrrows=nrow(rawdata)
-		datalist[[i]]=rawdata[1:(nbrrows-5), ] # skip the last 5 lines of HTSeq-count files
-		colnames(datalist[[i]]) = c("ID",libnames[i])
-	}
-
-  #Join all the files in a data.frame called datafile with rownames = gene id
-	datafile = join_all(datalist, by = "ID", type = "left", match = "all")
-
-  #Calculate the number of geneID pro file
-	nbID=data.frame(rep("",nfiles))
-	for(i in 1:nfiles){
-		nbID[,i]=nrow(datalist[[i]])
-	}
-	totalnbID=apply((nbID[,1:4]),1,sum)
-
-  #Verify that all the files contain the same gene ID
-	if (nrow(datafile)*4==totalnbID[1]){
-
-  #Suppress genes not expressed in all samples
-		datafile = datafile[apply(datafile[,2:(nfiles+1)],1,sum)!=0,]
-		row.names(datafile)=datafile[,1]
-		data=datafile[,-1]
-
-  #Number of libraries
-		nblib= dim(data)[2]
-  #Determine Data + normalization if the specie is not known
-		if (Species==""){
-  #Normalized data calculation
-			nbcol = dim(data)[2] #nb of column in the data.frame
-			library(DESeq2)
-			conds = factor(1:nblib)
-			design = data.frame(Condition=conds)
-			dds = DESeqDataSetFromMatrix(countData=data, colData=design, design=~Condition)
-			dds = estimateSizeFactors(dds)
-			datanorm = t(t(data)/sizeFactors(dds))
-
-  #Data + normalization
-			dataall = data.frame(row.names(datafile), data, datanorm )
-
-  #Renames columns
-			colnames(dataall) = c("Ensembl gene id", paste(libnames,"(raw read counts)"), paste(libnames,"(normalized)"))
-			write.table(dataall, file=fileout, sep="\t", quote=F, row.names=F)
-		}
-
-  #Determine Data + normalization + annotation if the specie is known
-		else{
-  #Add annotations and calculate gene length
-			library(biomaRt)
-
-  #Convert Ensembl version to host
-			conversionfile = read.table(conversionensemblversion,header=T,sep="\t",colClasses=c("numeric","character"))
-			correspondingdate = conversionfile[conversionfile$Version == ensversion, 2]
-			host  = paste(correspondingdate, ".archive.ensembl.org/biomart/martservice/", sep="")
-
-  #Convert species name to the name of the corresponding bmdataset
-			conversion = read.table(conversionensemblname,header=T,sep="\t",colClasses=c("character","character"))
-			bmdataset = conversion[conversion$Specie == Species, 2]
-			ensembl=useMart("ENSEMBL_MART_ENSEMBL", host=host, dataset=bmdataset)
-			if (ensversion<=75){
-				annotation1 = getBM(attributes=c("ensembl_gene_id","external_gene_id","description", "ensembl_transcript_id","exon_chrom_start","exon_chrom_end"),filters="ensembl_gene_id", values=rownames(data), mart=ensembl)
-			}
-			else{
-				annotation1 = getBM(attributes=c("ensembl_gene_id","external_gene_name","description", "ensembl_transcript_id","exon_chrom_start","exon_chrom_end"),filters="ensembl_gene_id", values=rownames(data), mart=ensembl)
-			}
-
-  #because all the annotations are not always found in a first step
-			not = rownames(data)[!rownames(data) %in% unique(annotation1$ensembl_gene_id)]
-			if (length(not) !=0){
-				annotationnot = getBM(attributes=c("ensembl_gene_id","external_gene_id","description", "ensembl_transcript_id","exon_chrom_start","exon_chrom_end"), filters="ensembl_gene_id", values=not, mart=ensembl)
-			annotation = rbind(annotation1, annotationnot)
-			}
-			else{
-				annotation = annotation1
-			}
-
-  #Exon length
-			ensinfos.exlen = data.frame(annotation$ensembl_gene_id, annotation$ensembl_transcript_id, abs(annotation$exon_chrom_start - annotation$exon_chrom_end)+1)
-			colnames(ensinfos.exlen) = c("ensembl_gene_id", "ensembl_transcript_id", "exon_length")
-
-  #Transcript length
-			tlen = tapply(ensinfos.exlen$exon_length, ensinfos.exlen$ensembl_transcript_id, sum)
-			tlen.gene = merge(tlen, unique(ensinfos.exlen[,1:2]), by.x="row.names", by.y="ensembl_transcript_id")
-			colnames(tlen.gene) = c("ensembl_transcript_id", "transcript_length","ensembl_gene_id")
-
-  #Gene length = median of the size of all transcripts corresponding to this gene
-			glen = tapply(tlen.gene$transcript_length, tlen.gene$ensembl_gene_id, median)
-
-  #Data with gene length
-			datalen = merge(data, glen, by="row.names")
-			colnames(datalen) = c("Ensembl_gene_id",colnames(data), "Gene_length")
-
-  #Data with annotations and gene length
-			annotationgene = unique(annotation[,1:3])
-			dataannot = merge(datalen, annotationgene, by.x="Ensembl_gene_id", by.y="ensembl_gene_id")
-
-  #To keep only the first part of the gene description (before [)
-			tmpdesc = strsplit(as.character(dataannot$description),"[", fixed=T)
-			f = function(l){
-				if (length(l)>=1){
-					return(l[[1]])
-				}
-				else{
-					return("")
-				}
-			}
-			tmpdescok = unlist(lapply(tmpdesc, f))
-			dataannot$description = tmpdescok
-
-  #Normalized data calculation
-			nbcol = dim(dataannot)[2] #nb of column in the data.frame
-			library(DESeq2)
-			conds = factor(1:nblib)
-			design = data.frame(Condition=conds)
-			dds = DESeqDataSetFromMatrix(countData=dataannot[,-c(1,nbcol,nbcol-1,nbcol-2)], colData=design, design=~Condition)
-			dds = estimateSizeFactors(dds)
-			datanorm = t(t(dataannot[,-c(1,nbcol,nbcol-1,nbcol-2)])/sizeFactors(dds))
-
-  #Normalized data adjusted for gene length (normalized data / gene length)
-			rpkn = datanorm / (as.vector(dataannot[,nbcol-2]/1000 ))
-
-  #Data + annotations + rpkn
-			dataall = data.frame(dataannot[,-c(nbcol,nbcol-1,nbcol-2)] , datanorm, rpkn, dataannot[,c(nbcol-1,nbcol)]  )
-
-  #Renames columns
-			colnames(dataall) = c("Ensembl gene id", paste(libnames,"(raw read counts)"), paste(libnames,"(normalized)"), paste(libnames,"(normalized and divided by gene length in kb)"), "Gene name", "Description")
-			write.table(dataall, file=fileout, sep="\t", quote=F, row.names=F)
-
-  #Return(dataall)
-
-		}
-	}
-	else{
-		print("The files are not the same length")
-	}
-}
-
-args <- commandArgs(trailingOnly = TRUE)
-print(args)
-
-RNAseqDataAnnotation(args[1], args[2],args[3], args[4], args[5], args[6], args[7])
-
-#R --slave --vanilla --verbose --file=/home/lornage/Bureau/Pour_galaxy/RNAseqDataAnnotation.R --args /home/lornage/Bureau/Test_function /home/lornage/Bureau/ichierconvertitnames.txt Homo_sapiens 75 /home/lornage/Bureau/testttttt5.txt /home/lornage/Bureau/Script_R/Ensembl_Version_Host.txt /home/lornage/Bureau/Script_R/Ensemble_Specie_Dataset.txt
-
-
-
-
-
-
--- a/RNAseqDataAnnotation/RNAseqDataAnnotation.xml	Thu Nov 20 03:51:36 2014 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,52 +0,0 @@
-<tool id="RNAseqDataAnnotation" name="RNAseqDataAnnotation" version="1.0.0">
-  <description>tool for RNAseq Data Normalisation and Annotation</description>
-  <requirements>
-    <!--<requirement type="set_environment">SCRIPT_PATH</requirement>-->
-    <requirement type="package" version="3.0.2">R_3_0_2</requirement>
-    <requirement type="package" version="1.0">DESeq2biomaRt</requirement>
-  </requirements>
-
- <command>
-	R --slave --vanilla --file=RNAseqDataAnnotation.R --args
-	$path2htseqfiles
-	$samplenamefile
-	$Species
-	$ensversion
-	$conversionensemblversion
-	$conversionensemblname
-	$fileout
- </command>
-
-  <inputs>
-	  <param name="path2htseqfiles" label="Path to the directory containing the files from HTSeq-count" type="text"/>
-	  <param name="samplenamefile" label="Conversion file sample/conditions" type="data" format="tabular" help="file should be tab-delimited"/>
-	  <param name="Species" type="select" label="Select the specie for your data" help="If your specie of interest is not listed, your data will be normalized but no annotation will be added. Contact us if you want us to add your specie." >
-		<option value="Homo_sapiens">Homo sapiens</option>
-		<option value="Mus_musculus">Mus musculus</option>
-		<option value="">Other specie</option>
-	  </param>
-	  <param name="ensversion" type="select" label="Select the version of Ensembl to use" >
-		<option value="67">Version 67</option>
-		<option value="68">Version 68</option>
-		<option value="69">Version 69</option>
-		<option value="70">Version 70</option>
-		<option value="71">Version 71</option>
-		<option value="72">Version 72</option>
-		<option value="73">Version 73</option>
-		<option value="74">Version 74</option>
-		<option value="75">Version 75</option>
-		<option value="76">Version 76</option>
-		<option value="77">Version 77</option>
-	  </param>
-	  <param name="conversionensemblversion" label="File for conversion Ensembl to version" type="data" format="tabular" help="Tab-delimited input file" />
-	  <param name="conversionensemblname" label="File for conversion Ensemble name of the specie " type="data" format="tabular" help="Tab-delimited input file"/>
-  </inputs>
-
-  <outputs>
-<param name="fileout" label="Path where the resulting file should be stored" type="data" format="tabular"/>
-  </outputs>
- <help>
-**What it does*
-**Example**
- </help>
- </tool>
--- a/RNAseqDataAnnotation/packages.R	Thu Nov 20 03:51:36 2014 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,5 +0,0 @@
-source("http://bioconductor.org/biocLite.R")
-biocLite()
-biocLite("DESeq2")
-biocLite("biomaRt")
-install.packages("plyr")
--- a/RNAseqDataAnnotation/test-data/Ensembl_Version_Host.txt	Thu Nov 20 03:51:36 2014 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,12 +0,0 @@
-Version	Host
-77	oct2014
-76	aug2014
-75	feb2014
-74	dec2013
-73	sep2013
-72	jun2013
-71	apr2013
-70	jan2013
-69	oct2012
-68	jul2012
-67	may2012
--- a/RNAseqDataAnnotation/test-data/Ensemble_Specie_Dataset.txt	Thu Nov 20 03:51:36 2014 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,3 +0,0 @@
-Specie	Dataset
-Homo_sapiens	hsapiens_gene_ensembl
-Mus_musculus	mmusculus_gene_ensembl
--- a/RNAseqDataAnnotation/test-data/Fichier1.txt	Thu Nov 20 03:51:36 2014 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,19 +0,0 @@
-ENSG00000000005	0
-ENSG00000000419	2661
-ENSG00000000457	602
-ENSG00000000460	2077
-ENSG00000000938	2
-ENSG00000000971	75
-ENSG00000001036	2389
-ENSG00000001084	1730
-ENSG00000001167	1473
-ENSG00000001460	387
-ENSG00000001461	905
-ENSG00000001497	2975
-ENSG00000001561	19
-ENSG00000001617	118
-ENSG00000001626	2
-ENSG00000001629	2559
-ENSG00000001630	314
-ENSG00000001631	1581
-ENSG00000002016	307
--- a/RNAseqDataAnnotation/test-data/Fichier2.txt	Thu Nov 20 03:51:36 2014 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,19 +0,0 @@
-ENSG00000000005	0
-ENSG00000000419	3409
-ENSG00000000457	706
-ENSG00000000460	2385
-ENSG00000000938	0
-ENSG00000000971	100
-ENSG00000001036	2876
-ENSG00000001084	2154
-ENSG00000001167	1695
-ENSG00000001460	405
-ENSG00000001461	1010
-ENSG00000001497	3344
-ENSG00000001561	27
-ENSG00000001617	132
-ENSG00000001626	1
-ENSG00000001629	3042
-ENSG00000001630	352
-ENSG00000001631	1865
-ENSG00000002016	375
--- a/RNAseqDataAnnotation/test-data/Fichier3.txt	Thu Nov 20 03:51:36 2014 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,19 +0,0 @@
-ENSG00000000005	0
-ENSG00000000419	2171
-ENSG00000000457	484
-ENSG00000000460	1056
-ENSG00000000938	1
-ENSG00000000971	157
-ENSG00000001036	2019
-ENSG00000001084	1580
-ENSG00000001167	1290
-ENSG00000001460	311
-ENSG00000001461	1607
-ENSG00000001497	2217
-ENSG00000001561	34
-ENSG00000001617	54
-ENSG00000001626	2
-ENSG00000001629	2116
-ENSG00000001630	207
-ENSG00000001631	1501
-ENSG00000002016	263
--- a/RNAseqDataAnnotation/test-data/Fichier4.txt	Thu Nov 20 03:51:36 2014 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,19 +0,0 @@
-ENSG00000000005	0
-ENSG00000000419	2495
-ENSG00000000457	521
-ENSG00000000460	1092
-ENSG00000000938	1
-ENSG00000000971	192
-ENSG00000001036	2217
-ENSG00000001084	1685
-ENSG00000001167	1509
-ENSG00000001460	362
-ENSG00000001461	1622
-ENSG00000001497	2369
-ENSG00000001561	41
-ENSG00000001617	69
-ENSG00000001626	4
-ENSG00000001629	2361
-ENSG00000001630	215
-ENSG00000001631	1626
-ENSG00000002016	295
--- a/RNAseqDataAnnotation/test-data/ichierconvertitnames.txt	Thu Nov 20 03:51:36 2014 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,5 +0,0 @@
-File	Sample_name
-Fichier1.fastq.gz	siLuc2
-Fichier2.fastq.gz	siLuc3
-Fichier3.fastq.gz	siMitf3
-Fichier4.fastq.gz	siMitf4
--- a/RNAseqDataAnnotation/tool_dependencies.xml	Thu Nov 20 03:51:36 2014 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,26 +0,0 @@
-<?xml version="1.0"?>
-<tool_dependency>
-    <package name="R_3_0_2" version="3.0.2">
-		<repository changeset_revision="b6fe8ca3230d" name="package_r_3_0_2" owner="iuc" prior_installation_required="True" toolshed="https://testtoolshed.g2.bx.psu.edu" />
-    </package>
-    <package name="DESeq2biomaRt" version="1.0">
-		<install version="1.0">
-			<actions>
-                <action type="set_environment_for_install">
-					<repository changeset_revision="b6fe8ca3230d" name="package_r_3_0_2" owner="iuc" prior_installation_required="True" toolshed="https://testtoolshed.g2.bx.psu.edu">
-						<package name="R_3_0_2" version="3.0.2" />
-					</repository>
-				</action>
-                <action type="shell_command">R CMD BATCH source("http://bioconductor.org/biocLite.R")</action>
-                <action type="shell_command">R CMD BATCH biocLite()</action>
-                <action type="shell_command">R CMD BATCH biocLite("DESeq2")</action>
-                <action type="shell_command">R CMD BATCH biocLite("biomaRt")</action>
-                <action type="shell_command">R CMD BATCH install.packages("plyr")</action>
-				<!--<action type="shell_command">echo "export PATH=$PATH" > $INSTALL_DIR/env.sh </action>-->
-				<!--<action type="shell_command">chmod 755 $INSTALL_DIR/env.sh </action>-->
-			</actions>
-       </install>
-     <readme>
-     </readme>
-   </package>
-</tool_dependency>