Mercurial > repos > jeremyjliu > region_motif_enrichment
comparison region-motif-compare/region_motif_intersect.r @ 17:7afdfd4f4c1b draft
Uploaded
author | jeremyjliu |
---|---|
date | Wed, 12 Nov 2014 15:21:11 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
16:9a84f76db861 | 17:7afdfd4f4c1b |
---|---|
1 # Name: region_motif_intersect.r | |
2 # Description: Takes a bed file of target regions and counts intersections | |
3 # of each motif (built in rdata database) and target regions. | |
4 # Author: Jeremy liu | |
5 # Email: jeremy.liu@yale.edu | |
6 # Date: 14/07/02 | |
7 # Note: This script is meant to be invoked with the following command | |
8 # R --slave --vanilla -f ./region_motif_intersect.r --args <workingdir> <db> <inbed> <outtab> | |
9 # <workingdir> is working directory of galaxy installation | |
10 # <db> types: "t" test, "p" pouya, "j" jaspar jolma, "m" mouse | |
11 # Dependencies: none | |
12 | |
13 # Auxiliary function to concatenate multiple strings | |
14 concat <- function(...) { | |
15 input_list <- list(...) | |
16 return(paste(input_list, sep="", collapse="")) | |
17 } | |
18 | |
19 # Set common and data directories | |
20 args <- commandArgs() | |
21 workingDir = args[7] | |
22 dbDir = concat(workingDir, "/region_motif_db") | |
23 dbCode = args[8] | |
24 if (dbCode == "t") { | |
25 motifDB = concat(dbDir, "/pouya_test_motifs.bed.bgz") | |
26 } else if (dbCode == "p") { | |
27 motifDB = concat(dbDir, "/pouya_motifs.bed.bgz") | |
28 } else if (dbCode == "j") { | |
29 motifDB = concat(dbDir, "/jaspar_jolma_motifs.bed.bgz") | |
30 } else if (dbCode == "m") { | |
31 motifDB = concat(dbDir, "/mm9_motifs.bed.bgz") | |
32 } else { | |
33 motifDB = concat(dbDir, "/pouya_motifs.bed.bgz") | |
34 } | |
35 | |
36 # Set input and reference files, comment to toggle commmand line arguments | |
37 inBed = args[9] | |
38 outTab = args[10] | |
39 | |
40 # Auxiliary function to read in BED file | |
41 read_bed <- function(file) { | |
42 return(read.table(file, sep="\t", stringsAsFactors=FALSE)) | |
43 } | |
44 | |
45 startTime = Sys.time() | |
46 cat("Running ... Started at:", format(startTime, "%a %b %d %X %Y"), "...\n") | |
47 | |
48 # Load dependencies | |
49 cat("Loading dependencies...\n") | |
50 suppressPackageStartupMessages(library(Rsamtools, quietly=TRUE)) | |
51 | |
52 # Initializing hash table (as env) with motif names and loading tabix file | |
53 cat("Loading motif database and initializing hash table...\n") | |
54 motifTable = new.env() | |
55 motifTbx <- TabixFile(motifDB) | |
56 | |
57 # Loading input bed file, convert integer columns to numeric, name columns | |
58 cat("Loading region file...\n") | |
59 regionsDF = read_bed(inBed) | |
60 dfTemp = sapply(regionsDF, is.integer) | |
61 regionsDF[dfTemp] = lapply(regionsDF[dfTemp], as.numeric) | |
62 names(regionsDF)[names(regionsDF) == "V1"] = "chr" | |
63 names(regionsDF)[names(regionsDF) == "V2"] = "start" | |
64 names(regionsDF)[names(regionsDF) == "V3"] = "end" | |
65 | |
66 # Filtering regions to exclude chromosomes not in motif database | |
67 cat("Determining intersection counts...\n") | |
68 motifTbxChrs = seqnamesTabix(motifTbx) | |
69 regionsDFFilter = subset(regionsDF, chr %in% motifTbxChrs) | |
70 | |
71 # Loading regions into GRanges object and scanning motif tabix database | |
72 # Region end is incremented by 1 since scanTabix querying is inclusive for | |
73 # position start but exclusive for position end. | |
74 param = GRanges(regionsDFFilter$chr, IRanges(regionsDFFilter$start, | |
75 end=regionsDFFilter$end + 1)) | |
76 regionsIntersects = scanTabix(motifTbx, param=param) | |
77 | |
78 # Parsing result list and updating motif count hash table | |
79 cat("Parsing result list...\n") | |
80 for(regionIntersects in regionsIntersects) { | |
81 for(regionIntersect in strsplit(regionIntersects, " ")) { | |
82 intersectMotif = strsplit(regionIntersect, "\t")[[1]][4] | |
83 if(is.null(motifTable[[intersectMotif]])) { | |
84 motifTable[[intersectMotif]] = 1 | |
85 } else { | |
86 motifTable[[intersectMotif]] = motifTable[[intersectMotif]] + 1 | |
87 } | |
88 } | |
89 } | |
90 | |
91 # Converting motif count hash table to an integer vector for output | |
92 counts = integer(length = length(ls(motifTable))) | |
93 names(counts) = ls(motifTable) | |
94 for(motifName in ls(motifTable)) { | |
95 counts[motifName] = as.integer(motifTable[[motifName]]) | |
96 } | |
97 | |
98 # Outputting intersection counts to tab delineated file | |
99 cat("Outputting to file...\n") | |
100 write.table(counts, outTab, quote=FALSE, sep="\t", row.names=TRUE, col.names=FALSE) | |
101 cat("Done. Job started at:", format(startTime, "%a %b %d %X %Y."), | |
102 "Job ended at:", format(Sys.time(), "%a %b %d %X %Y."), "\n") |