comparison get_deseq_dataset.R @ 44:1cb33de18af5 draft default tip

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deseq2 commit 5bf5011af827e93ddeecbfba4815fe9b85f02594
author iuc
date Tue, 09 Dec 2025 17:43:58 +0000
parents 6ef2cba4e35a
children
comparison
equal deleted inserted replaced
43:621ddf5967d2 44:1cb33de18af5
1 get_deseq_dataset <- function(sample_table, header, design_formula, tximport, txtype, tx2gene) { 1 get_deseq_dataset <- function(sample_table, header, design_formula, tximport, txtype, tx2gene) {
2 dir <- ""
2 3
3 dir <- "" 4 has_header <- !is.null(header)
5 use_txi <- !is.null(tximport)
6 if (use_txi) {
7 if (is.null(tx2gene)) stop("A transcript-to-gene map or a GTF/GFF3 file is required for tximport")
8 if (tolower(file_ext(tx2gene)) == "gff") {
9 gff_file <- tx2gene
10 } else {
11 gff_file <- NULL
12 tx2gene <- read.table(tx2gene, header = has_header)
13 }
14 }
4 15
5 has_header <- !is.null(header) 16 if (!use_txi && has_header) {
6 use_txi <- !is.null(tximport) 17 countfiles <- lapply(as.character(sample_table$filename), read.delim, row.names = 1)
7 if (use_txi) { 18 tbl <- do.call("cbind", countfiles)
8 if (is.null(tx2gene)) stop("A transcript-to-gene map or a GTF/GFF3 file is required for tximport") 19 colnames(tbl) <- rownames(sample_table) # take sample ids from header
9 if (tolower(file_ext(tx2gene)) == "gff") { 20
10 gff_file <- tx2gene 21 # check for htseq report lines (from DESeqDataSetFromHTSeqCount function)
22 old_special_names <- c(
23 "no_feature",
24 "ambiguous",
25 "too_low_aQual",
26 "not_aligned",
27 "alignment_not_unique"
28 )
29 special_rows <- (substr(rownames(tbl), 1, 1) == "_") | rownames(tbl) %in% old_special_names
30 tbl <- tbl[!special_rows, , drop = FALSE]
31
32 dds <- DESeqDataSetFromMatrix(
33 countData = tbl,
34 colData = subset(sample_table, select = -filename),
35 design = design_formula
36 )
37 } else if (!use_txi && !has_header) {
38 # construct the object from HTSeq files
39 dds <- DESeqDataSetFromHTSeqCount(
40 sampleTable = sample_table,
41 directory = dir,
42 design = design_formula
43 )
44 colnames(dds) <- row.names(sample_table)
11 } else { 45 } else {
12 gff_file <- NULL 46 # construct the object using tximport
13 tx2gene <- read.table(tx2gene, header = has_header) 47 library("tximport")
48 txi_files <- as.character(sample_table$filename)
49 labs <- row.names(sample_table)
50 names(txi_files) <- labs
51 if (!is.null(gff_file)) {
52 # first need to make the tx2gene table
53 # this takes ~2-3 minutes using Bioconductor functions
54 suppressPackageStartupMessages({
55 library("GenomicFeatures")
56 })
57 txdb <- makeTxDbFromGFF(gff_file)
58 k <- keys(txdb, keytype = "TXNAME")
59 tx2gene <- select(txdb, keys = k, columns = "GENEID", keytype = "TXNAME")
60 # Remove 'transcript:' from transcript IDs (when gff_file is a GFF3 from Ensembl and the transcript does not have a Name)
61 tx2gene$TXNAME <- sub("^transcript:", "", tx2gene$TXNAME) # nolint
62 }
63 try(txi <- tximport(txi_files, type = txtype, tx2gene = tx2gene))
64 if (!exists("txi")) {
65 # Remove version from transcript IDs in tx2gene...
66 tx2gene$TXNAME <- sub("\\.[0-9]+$", "", tx2gene$TXNAME) # nolint
67 # ...and in txi_files
68 txi <- tximport(txi_files, type = txtype, tx2gene = tx2gene, ignoreTxVersion = TRUE)
69 }
70 dds <- DESeqDataSetFromTximport(
71 txi,
72 subset(sample_table, select = -c(filename)),
73 design_formula
74 )
14 } 75 }
15 } 76 return(dds)
16
17 if (!use_txi && has_header) {
18 countfiles <- lapply(as.character(sample_table$filename), read.delim, row.names = 1)
19 tbl <- do.call("cbind", countfiles)
20 colnames(tbl) <- rownames(sample_table) # take sample ids from header
21
22 # check for htseq report lines (from DESeqDataSetFromHTSeqCount function)
23 old_special_names <- c(
24 "no_feature",
25 "ambiguous",
26 "too_low_aQual",
27 "not_aligned",
28 "alignment_not_unique"
29 )
30 special_rows <- (substr(rownames(tbl), 1, 1) == "_") | rownames(tbl) %in% old_special_names
31 tbl <- tbl[!special_rows, , drop = FALSE]
32
33 dds <- DESeqDataSetFromMatrix(
34 countData = tbl,
35 colData = subset(sample_table, select = -filename),
36 design = design_formula
37 )
38 } else if (!use_txi && !has_header) {
39
40 # construct the object from HTSeq files
41 dds <- DESeqDataSetFromHTSeqCount(
42 sampleTable = sample_table,
43 directory = dir,
44 design = design_formula
45 )
46 colnames(dds) <- row.names(sample_table)
47
48 } else {
49 # construct the object using tximport
50 library("tximport")
51 txi_files <- as.character(sample_table$filename)
52 labs <- row.names(sample_table)
53 names(txi_files) <- labs
54 if (!is.null(gff_file)) {
55 # first need to make the tx2gene table
56 # this takes ~2-3 minutes using Bioconductor functions
57 suppressPackageStartupMessages({
58 library("GenomicFeatures")
59 })
60 txdb <- makeTxDbFromGFF(gff_file)
61 k <- keys(txdb, keytype = "TXNAME")
62 tx2gene <- select(txdb, keys = k, columns = "GENEID", keytype = "TXNAME")
63 # Remove 'transcript:' from transcript IDs (when gff_file is a GFF3 from Ensembl and the transcript does not have a Name)
64 tx2gene$TXNAME <- sub("^transcript:", "", tx2gene$TXNAME) # nolint
65 }
66 try(txi <- tximport(txi_files, type = txtype, tx2gene = tx2gene))
67 if (!exists("txi")) {
68 # Remove version from transcript IDs in tx2gene...
69 tx2gene$TXNAME <- sub("\\.[0-9]+$", "", tx2gene$TXNAME) # nolint
70 # ...and in txi_files
71 txi <- tximport(txi_files, type = txtype, tx2gene = tx2gene, ignoreTxVersion = TRUE)
72 }
73 dds <- DESeqDataSetFromTximport(
74 txi,
75 subset(sample_table, select = -c(filename)),
76 design_formula
77 )
78 }
79 return(dds)
80 } 77 }