17
|
1 # Name: region_motif_intersect.r
|
|
2 # Description: Takes a bed file of target regions and counts intersections
|
|
3 # of each motif (built in rdata database) and target regions.
|
|
4 # Author: Jeremy liu
|
|
5 # Email: jeremy.liu@yale.edu
|
|
6 # Date: 14/07/02
|
|
7 # Note: This script is meant to be invoked with the following command
|
|
8 # R --slave --vanilla -f ./region_motif_intersect.r --args <workingdir> <db> <inbed> <outtab>
|
|
9 # <workingdir> is working directory of galaxy installation
|
|
10 # <db> types: "t" test, "p" pouya, "j" jaspar jolma, "m" mouse
|
|
11 # Dependencies: none
|
|
12
|
|
13 # Auxiliary function to concatenate multiple strings
|
|
14 concat <- function(...) {
|
|
15 input_list <- list(...)
|
|
16 return(paste(input_list, sep="", collapse=""))
|
|
17 }
|
|
18
|
|
19 # Set common and data directories
|
|
20 args <- commandArgs()
|
|
21 workingDir = args[7]
|
|
22 dbDir = concat(workingDir, "/region_motif_db")
|
|
23 dbCode = args[8]
|
|
24 if (dbCode == "t") {
|
|
25 motifDB = concat(dbDir, "/pouya_test_motifs.bed.bgz")
|
|
26 } else if (dbCode == "p") {
|
|
27 motifDB = concat(dbDir, "/pouya_motifs.bed.bgz")
|
|
28 } else if (dbCode == "j") {
|
|
29 motifDB = concat(dbDir, "/jaspar_jolma_motifs.bed.bgz")
|
|
30 } else if (dbCode == "m") {
|
|
31 motifDB = concat(dbDir, "/mm9_motifs.bed.bgz")
|
|
32 } else {
|
|
33 motifDB = concat(dbDir, "/pouya_motifs.bed.bgz")
|
|
34 }
|
|
35
|
|
36 # Set input and reference files, comment to toggle commmand line arguments
|
|
37 inBed = args[9]
|
|
38 outTab = args[10]
|
|
39
|
|
40 # Auxiliary function to read in BED file
|
|
41 read_bed <- function(file) {
|
|
42 return(read.table(file, sep="\t", stringsAsFactors=FALSE))
|
|
43 }
|
|
44
|
|
45 startTime = Sys.time()
|
|
46 cat("Running ... Started at:", format(startTime, "%a %b %d %X %Y"), "...\n")
|
|
47
|
|
48 # Load dependencies
|
|
49 cat("Loading dependencies...\n")
|
|
50 suppressPackageStartupMessages(library(Rsamtools, quietly=TRUE))
|
|
51
|
|
52 # Initializing hash table (as env) with motif names and loading tabix file
|
|
53 cat("Loading motif database and initializing hash table...\n")
|
|
54 motifTable = new.env()
|
|
55 motifTbx <- TabixFile(motifDB)
|
|
56
|
|
57 # Loading input bed file, convert integer columns to numeric, name columns
|
|
58 cat("Loading region file...\n")
|
|
59 regionsDF = read_bed(inBed)
|
|
60 dfTemp = sapply(regionsDF, is.integer)
|
|
61 regionsDF[dfTemp] = lapply(regionsDF[dfTemp], as.numeric)
|
|
62 names(regionsDF)[names(regionsDF) == "V1"] = "chr"
|
|
63 names(regionsDF)[names(regionsDF) == "V2"] = "start"
|
|
64 names(regionsDF)[names(regionsDF) == "V3"] = "end"
|
|
65
|
|
66 # Filtering regions to exclude chromosomes not in motif database
|
|
67 cat("Determining intersection counts...\n")
|
|
68 motifTbxChrs = seqnamesTabix(motifTbx)
|
|
69 regionsDFFilter = subset(regionsDF, chr %in% motifTbxChrs)
|
|
70
|
|
71 # Loading regions into GRanges object and scanning motif tabix database
|
|
72 # Region end is incremented by 1 since scanTabix querying is inclusive for
|
|
73 # position start but exclusive for position end.
|
|
74 param = GRanges(regionsDFFilter$chr, IRanges(regionsDFFilter$start,
|
|
75 end=regionsDFFilter$end + 1))
|
|
76 regionsIntersects = scanTabix(motifTbx, param=param)
|
|
77
|
|
78 # Parsing result list and updating motif count hash table
|
|
79 cat("Parsing result list...\n")
|
|
80 for(regionIntersects in regionsIntersects) {
|
|
81 for(regionIntersect in strsplit(regionIntersects, " ")) {
|
|
82 intersectMotif = strsplit(regionIntersect, "\t")[[1]][4]
|
|
83 if(is.null(motifTable[[intersectMotif]])) {
|
|
84 motifTable[[intersectMotif]] = 1
|
|
85 } else {
|
|
86 motifTable[[intersectMotif]] = motifTable[[intersectMotif]] + 1
|
|
87 }
|
|
88 }
|
|
89 }
|
|
90
|
|
91 # Converting motif count hash table to an integer vector for output
|
|
92 counts = integer(length = length(ls(motifTable)))
|
|
93 names(counts) = ls(motifTable)
|
|
94 for(motifName in ls(motifTable)) {
|
|
95 counts[motifName] = as.integer(motifTable[[motifName]])
|
|
96 }
|
|
97
|
|
98 # Outputting intersection counts to tab delineated file
|
|
99 cat("Outputting to file...\n")
|
|
100 write.table(counts, outTab, quote=FALSE, sep="\t", row.names=TRUE, col.names=FALSE)
|
|
101 cat("Done. Job started at:", format(startTime, "%a %b %d %X %Y."),
|
|
102 "Job ended at:", format(Sys.time(), "%a %b %d %X %Y."), "\n")
|