21
|
1 # Name: region_motif_intersect.r
|
|
2 # Description: Takes a bed file of target regions and counts intersections
|
31
|
3 # of each motif (in separately installed tabix database) and target regions.
|
35
|
4 # Author: Jeremy Liu
|
21
|
5 # Email: jeremy.liu@yale.edu
|
35
|
6 # Date: 15/02/11
|
37
|
7 # Note: This script can be invoked with the following command
|
31
|
8 # R --slave --vanilla -f ./region_motif_intersect.r --args <db_bgz> <db_tbi> <inbed> <outtab>
|
37
|
9 # Dependencies: region_motif_data_manager, Rsamtools
|
21
|
10
|
|
11 # Auxiliary function to concatenate multiple strings
|
|
12 concat <- function(...) {
|
|
13 input_list <- list(...)
|
|
14 return(paste(input_list, sep="", collapse=""))
|
|
15 }
|
|
16
|
31
|
17 # Retrive motif database path
|
21
|
18 args <- commandArgs()
|
36
|
19 motifDB_bgz = unlist(strsplit(args[7], ','))[1] # Handles duplicate entries in data table
|
|
20 motifDB_tbi = unlist(strsplit(args[8], ','))[1] # Just takes the first one
|
21
|
21
|
|
22 # Set input and reference files, comment to toggle commmand line arguments
|
|
23 inBed = args[9]
|
|
24 outTab = args[10]
|
|
25
|
|
26 # Auxiliary function to read in BED file
|
|
27 read_bed <- function(file) {
|
|
28 return(read.table(file, sep="\t", stringsAsFactors=FALSE))
|
|
29 }
|
|
30
|
|
31 startTime = Sys.time()
|
|
32 cat("Running ... Started at:", format(startTime, "%a %b %d %X %Y"), "...\n")
|
|
33
|
|
34 # Load dependencies
|
|
35 cat("Loading dependencies...\n")
|
37
|
36 suppressPackageStartupMessages(library(Rsamtools, quietly=TRUE)) # NEED TO HANDLE INSTALLATION
|
21
|
37
|
|
38 # Initializing hash table (as env) with motif names and loading tabix file
|
|
39 cat("Loading motif database and initializing hash table...\n")
|
|
40 motifTable = new.env()
|
31
|
41 motifTbx <- TabixFile(motifDB_bgz)
|
21
|
42
|
|
43 # Loading input bed file, convert integer columns to numeric, name columns
|
|
44 cat("Loading region file...\n")
|
|
45 regionsDF = read_bed(inBed)
|
|
46 dfTemp = sapply(regionsDF, is.integer)
|
|
47 regionsDF[dfTemp] = lapply(regionsDF[dfTemp], as.numeric)
|
|
48 names(regionsDF)[names(regionsDF) == "V1"] = "chr"
|
|
49 names(regionsDF)[names(regionsDF) == "V2"] = "start"
|
|
50 names(regionsDF)[names(regionsDF) == "V3"] = "end"
|
|
51
|
|
52 # Filtering regions to exclude chromosomes not in motif database
|
|
53 cat("Determining intersection counts...\n")
|
|
54 motifTbxChrs = seqnamesTabix(motifTbx)
|
|
55 regionsDFFilter = subset(regionsDF, chr %in% motifTbxChrs)
|
|
56
|
|
57 # Loading regions into GRanges object and scanning motif tabix database
|
|
58 # Region end is incremented by 1 since scanTabix querying is inclusive for
|
|
59 # position start but exclusive for position end.
|
|
60 param = GRanges(regionsDFFilter$chr, IRanges(regionsDFFilter$start,
|
|
61 end=regionsDFFilter$end + 1))
|
|
62 regionsIntersects = scanTabix(motifTbx, param=param)
|
|
63
|
|
64 # Parsing result list and updating motif count hash table
|
|
65 cat("Parsing result list...\n")
|
|
66 for(regionIntersects in regionsIntersects) {
|
|
67 for(regionIntersect in strsplit(regionIntersects, " ")) {
|
|
68 intersectMotif = strsplit(regionIntersect, "\t")[[1]][4]
|
|
69 if(is.null(motifTable[[intersectMotif]])) {
|
|
70 motifTable[[intersectMotif]] = 1
|
|
71 } else {
|
|
72 motifTable[[intersectMotif]] = motifTable[[intersectMotif]] + 1
|
|
73 }
|
|
74 }
|
|
75 }
|
|
76
|
|
77 # Converting motif count hash table to an integer vector for output
|
|
78 counts = integer(length = length(ls(motifTable)))
|
|
79 names(counts) = ls(motifTable)
|
|
80 for(motifName in ls(motifTable)) {
|
|
81 counts[motifName] = as.integer(motifTable[[motifName]])
|
|
82 }
|
|
83
|
|
84 # Outputting intersection counts to tab delineated file
|
|
85 cat("Outputting to file...\n")
|
|
86 write.table(counts, outTab, quote=FALSE, sep="\t", row.names=TRUE, col.names=FALSE)
|
|
87 cat("Done. Job started at:", format(startTime, "%a %b %d %X %Y."),
|
|
88 "Job ended at:", format(Sys.time(), "%a %b %d %X %Y."), "\n")
|