| 
2
 | 
     1 # run_spp.R
 | 
| 
 | 
     2 # =============
 | 
| 
 | 
     3 # Author: Anshul Kundaje, Computer Science Dept., Stanford University
 | 
| 
 | 
     4 # Email: akundaje@stanford.edu
 | 
| 
 | 
     5 # Last updated: Oct 8, 2010
 | 
| 
 | 
     6 # =============
 | 
| 
 | 
     7 # MANDATORY ARGUMENTS
 | 
| 
 | 
     8 # -c=<ChIP_tagAlign/BAMFile>, full path and name of tagAlign/BAM file (can be gzipped) (FILE EXTENSION MUST BE tagAlign.gz, tagAlign, bam or bam.gz)
 | 
| 
 | 
     9 # MANDATORY ARGUMENT FOR PEAK CALLING
 | 
| 
 | 
    10 # -i=<Input_tagAlign/BAMFile>, full path and name of tagAlign/BAM file (can be gzipped) (FILE EXTENSION MUST BE tagAlign.gz, tagAlign, bam or bam.gz)
 | 
| 
 | 
    11 # OPTIONAL ARGUMENTS
 | 
| 
 | 
    12 # -s=<min>:<step>:<max> , strand shifts at which cross-correlation is evaluated, default=-100:5:600
 | 
| 
 | 
    13 # -speak=<strPeak>, user-defined cross-correlation peak strandshift
 | 
| 
 | 
    14 # -x=<min>:<max>, strand shifts to exclude (This is mainly to avoid phantom peaks) default=10:(readlen+10)
 | 
| 
 | 
    15 # -p=<nodes> , number of parallel processing nodes, default=NULL
 | 
| 
 | 
    16 # -fdr=<falseDisoveryRate> , false discovery rate threshold for peak calling
 | 
| 
 | 
    17 # -npeak=<numPeaks>, threshold on number of peaks to call
 | 
| 
 | 
    18 # -tmpdir=<tempdir> , Temporary directory (if not specified R function tempdir() is used)
 | 
| 
 | 
    19 # -filtchr=<chrnamePattern> , Pattern to use to remove tags that map to specific chromosomes e.g. _ will remove all tags that map to chromosomes with _ in their name
 | 
| 
 | 
    20 # OUTPUT PARAMETERS
 | 
| 
 | 
    21 # -odir=<outputDirectory> name of output directory (If not set same as ChIP file directory is used)
 | 
| 
 | 
    22 # -savn=<narrowpeakfilename> OR -savn NarrowPeak file name
 | 
| 
 | 
    23 # -savr=<regionpeakfilename> OR -savr RegionPeak file name
 | 
| 
 | 
    24 # -savd=<rdatafile> OR -savd , save Rdata file
 | 
| 
 | 
    25 # -savp=<plotdatafile> OR -savp , save cross-correlation plot
 | 
| 
 | 
    26 # -out=<resultfile>, append peakshift result to a file 
 | 
| 
 | 
    27 #      format:Filename<tab>numReads<tab>estFragLen<tab>corr_estFragLen<tab>PhantomPeak<tab>corr_phantomPeak<tab>argmin_corr<tab>min_corr<tab>phantomPeakCoef<tab>relPhantomPeakCoef<tab>QualityTag
 | 
| 
 | 
    28 # -rf , if plot or rdata or narrowPeak file exists replace it. If not used then the run is aborted if the plot or Rdata or narrowPeak file exists
 | 
| 
 | 
    29 # -clean, if present will remove the original chip and control files after reading them in. CAUTION: Use only if the script calling run_spp.R is creating temporary files
 | 
| 
 | 
    30 
 | 
| 
 | 
    31 args <- commandArgs(trailingOnly=TRUE); # Read Arguments from command line
 | 
| 
 | 
    32 nargs = length(args); # number of arguments
 | 
| 
 | 
    33 
 | 
| 
 | 
    34 # ###########################################################################
 | 
| 
 | 
    35 # AUXILIARY FUNCTIONS
 | 
| 
 | 
    36 # ###########################################################################
 | 
| 
 | 
    37 
 | 
| 
 | 
    38 print.usage <- function() {
 | 
| 
 | 
    39 # ===================================
 | 
| 
 | 
    40 # Function will print function usage
 | 
| 
 | 
    41 # ===================================	
 | 
| 
 | 
    42 	cat('Usage: Rscript run_spp.R <options>\n',file=stderr())
 | 
| 
 | 
    43 	cat('MANDATORY ARGUMENTS\n',file=stderr())
 | 
| 
 | 
    44 	cat('-c=<ChIP_alignFile>, full path and name (or URL) of tagAlign/BAM file (can be gzipped)(FILE EXTENSION MUST BE tagAlign.gz, tagAlign, bam or bam.gz) \n',file=stderr())
 | 
| 
 | 
    45 	cat('MANDATORY ARGUMENTS FOR PEAK CALLING\n',file=stderr())
 | 
| 
 | 
    46 	cat('-i=<Input_alignFile>, full path and name (or URL) of tagAlign/BAM file (can be gzipped) (FILE EXTENSION MUST BE tagAlign.gz, tagAlign, bam or bam.gz) \n',file=stderr())
 | 
| 
 | 
    47 	cat('OPTIONAL ARGUMENTS\n',file=stderr())
 | 
| 
 | 
    48 	cat('-s=<min>:<step>:<max> , strand shifts at which cross-correlation is evaluated, default=-100:5:600\n',file=stderr())
 | 
| 
 | 
    49 	cat('-speak=<strPeak>, user-defined cross-correlation peak strandshift\n',file=stderr())
 | 
| 
 | 
    50 	cat('-x=<min>:<max>, strand shifts to exclude (This is mainly to avoid region around phantom peak) default=10:(readlen+10)\n',file=stderr())
 | 
| 
 | 
    51 	cat('-p=<nodes> , number of parallel processing nodes, default=0\n',file=stderr())
 | 
| 
 | 
    52 	cat('-fdr=<falseDisoveryRate> , false discovery rate threshold for peak calling\n',file=stderr())
 | 
| 
 | 
    53 	cat('-npeak=<numPeaks>, threshold on number of peaks to call\n',file=stderr())    
 | 
| 
 | 
    54 	cat('-tmpdir=<tempdir> , Temporary directory (if not specified R function tempdir() is used)\n',file=stderr())
 | 
| 
 | 
    55 	cat('-filtchr=<chrnamePattern> , Pattern to use to remove tags that map to specific chromosomes e.g. _ will remove all tags that map to chromosomes with _ in their name\n',file=stderr())
 | 
| 
 | 
    56 	cat('OUTPUT ARGUMENTS\n',file=stderr())
 | 
| 
 | 
    57 	cat('-odir=<outputDirectory> name of output directory (If not set same as ChIP file directory is used)\n',file=stderr())
 | 
| 
 | 
    58 	cat('-savn=<narrowpeakfilename> OR -savn NarrowPeak file name (fixed width peaks)\n',file=stderr())
 | 
| 
 | 
    59 	cat('-savr=<regionpeakfilename> OR -savr RegionPeak file name (variable width peaks with regions of enrichment)\n',file=stderr())
 | 
| 
 | 
    60 	cat('-savd=<rdatafile> OR -savd, save Rdata file\n',file=stderr())
 | 
| 
 | 
    61 	cat('-savp=<plotdatafile> OR -savp, save cross-correlation plot\n',file=stderr())
 | 
| 
 | 
    62 	cat('-out=<resultfile>, append peakshift/phantomPeak results to a file\n',file=stderr())
 | 
| 
 | 
    63 	cat('     format:Filename<tab>numReads<tab>estFragLen<tab>corr_estFragLen<tab>PhantomPeak<tab>corr_phantomPeak<tab>argmin_corr<tab>min_corr<tab>phantomPeakCoef<tab>relPhantomPeakCoef<tab>QualityTag)\n',file=stderr())
 | 
| 
 | 
    64 	cat('-rf, if plot or rdata or narrowPeak file exists replace it. If not used then the run is aborted if the plot or Rdata or narrowPeak file exists\n',file=stderr())
 | 
| 
 | 
    65 	cat('-clean, if present will remove the original chip and control files after reading them in. CAUTION: Use only if the script calling run_spp.R is creating temporary files\n',file=stderr())
 | 
| 
 | 
    66 } # end: print.usage()
 | 
| 
 | 
    67 
 | 
| 
 | 
    68 get.file.parts <- function(file.fullpath) {
 | 
| 
 | 
    69 # ===================================
 | 
| 
 | 
    70 # Function will take a file name with path and split the file name into 
 | 
| 
 | 
    71 # path, fullname, name and ext
 | 
| 
 | 
    72 # ===================================	
 | 
| 
 | 
    73 	if (! is.character(file.fullpath)) {
 | 
| 
 | 
    74 		stop('File name must be a string')
 | 
| 
 | 
    75 	}
 | 
| 
 | 
    76 	
 | 
| 
 | 
    77 	file.parts <- strsplit(as.character(file.fullpath), .Platform$file.sep, fixed=TRUE)[[1]] # split on file separator
 | 
| 
 | 
    78 	
 | 
| 
 | 
    79 	if (length(file.parts) == 0) { # if empty file name
 | 
| 
 | 
    80 		return(list(path='',
 | 
| 
 | 
    81 						fullname='',
 | 
| 
 | 
    82 						name='',
 | 
| 
 | 
    83 						ext='')
 | 
| 
 | 
    84 		)
 | 
| 
 | 
    85 	} else {
 | 
| 
 | 
    86 		if (length(file.parts) == 1) { # if no path then just the file name itself
 | 
| 
 | 
    87 			file.path <- '.'
 | 
| 
 | 
    88 			file.fullname <- file.parts
 | 
| 
 | 
    89 		} else {
 | 
| 
 | 
    90 			file.path <- paste(file.parts[1:(length(file.parts)-1)], collapse=.Platform$file.sep) # 1:last-1 token is path
 | 
| 
 | 
    91 			file.fullname <- file.parts[length(file.parts)] # last token is filename
 | 
| 
 | 
    92 		}        
 | 
| 
 | 
    93 		file.fullname.parts <- strsplit(file.fullname,'.',fixed=TRUE)[[1]] # split on .
 | 
| 
 | 
    94 		if (length(file.fullname.parts) == 1) { # if no extension
 | 
| 
 | 
    95 			file.ext <- ''
 | 
| 
 | 
    96 			file.name <- file.fullname.parts
 | 
| 
 | 
    97 		} else {
 | 
| 
 | 
    98 			file.ext <- paste('.', file.fullname.parts[length(file.fullname.parts)], sep="") # add the . to the last token
 | 
| 
 | 
    99 			file.name <- paste(file.fullname.parts[1:(length(file.fullname.parts)-1)], collapse=".")
 | 
| 
 | 
   100 		}
 | 
| 
 | 
   101 		return(list(path=file.path,
 | 
| 
 | 
   102 						fullname=file.fullname,
 | 
| 
 | 
   103 						name=file.name,
 | 
| 
 | 
   104 						ext=file.ext))
 | 
| 
 | 
   105 	}         	
 | 
| 
 | 
   106 } # end: get.file.parts()
 | 
| 
 | 
   107 
 | 
| 
 | 
   108 parse.arguments <- function(args) {
 | 
| 
 | 
   109 # ===================================
 | 
| 
 | 
   110 # Function will parse arguments
 | 
| 
 | 
   111 # ===================================	
 | 
| 
 | 
   112 	# Set arguments to default values
 | 
| 
 | 
   113 	chip.file <- NA  # main ChIP tagAlign/BAM file name
 | 
| 
 | 
   114 	isurl.chip.file <- FALSE # flag indicating whether ChIP file is a URL
 | 
| 
 | 
   115 	control.file <- NA # control tagAlign/BAM file name
 | 
| 
 | 
   116 	isurl.control.file <- FALSE # flag indicating whether control file is a URL   
 | 
| 
 | 
   117 	sep.min <- -100  # min strand shift
 | 
| 
 | 
   118 	sep.max <- 600  # max strand shift
 | 
| 
 | 
   119 	sep.bin <- 5    # increment for strand shift
 | 
| 
 | 
   120 	sep.peak <- NA # user-defined peak shift
 | 
| 
 | 
   121 	exclude.min <- 10 # lowerbound of strand shift exclusion region
 | 
| 
 | 
   122 	exclude.max <- NaN # upperbound of strand shift exclusion region
 | 
| 
 | 
   123 	n.nodes <- NA # number of parallel processing nodes
 | 
| 
 | 
   124 	fdr <- 0.01 # false discovery rate threshold for peak calling
 | 
| 
 | 
   125 	npeak <- NA # threshold on number of peaks to call
 | 
| 
 | 
   126 	temp.dir <- tempdir() # temporary directory
 | 
| 
 | 
   127 	chrname.rm.pattern <- NA # chromosome name pattern used to remove tags
 | 
| 
 | 
   128 	output.odir <- NA # Output directory name
 | 
| 
 | 
   129 	output.npeak.file <- NA # Output narrowPeak file name
 | 
| 
 | 
   130 	output.rpeak.file <- NA # Output regionPeak file name
 | 
| 
 | 
   131 	output.rdata.file <- NA # Rdata file
 | 
| 
 | 
   132 	output.plot.file <- NA  # cross correlation plot file
 | 
| 
 | 
   133 	output.result.file <- NA # result file
 | 
| 
 | 
   134 	replace.flag <- FALSE # replace file flag
 | 
| 
 | 
   135 	clean.files.flag <- FALSE # file deletion flag
 | 
| 
 | 
   136 	
 | 
| 
 | 
   137 	# Parse arguments   
 | 
| 
 | 
   138 	for (each.arg in args) {
 | 
| 
 | 
   139 		
 | 
| 
 | 
   140 		if (grepl('^-c=',each.arg)) { #-c=<chip.file>
 | 
| 
 | 
   141 			
 | 
| 
 | 
   142 			arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on =
 | 
| 
 | 
   143 			if (! is.na(arg.split[2]) ) {
 | 
| 
 | 
   144 				chip.file <- arg.split[2] # second part is chip.file
 | 
| 
 | 
   145 			} else {
 | 
| 
 | 
   146 				stop('No tagAlign/BAM file name provided for parameter -c=')
 | 
| 
 | 
   147 			}
 | 
| 
 | 
   148 			
 | 
| 
 | 
   149 		} else if (grepl('^-i=',each.arg)) { #-i=<control.file>
 | 
| 
 | 
   150 			
 | 
| 
 | 
   151 			arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on =
 | 
| 
 | 
   152 			if (! is.na(arg.split[2]) ) {
 | 
| 
 | 
   153 				control.file <- arg.split[2] # second part is control.file
 | 
| 
 | 
   154 			} else {
 | 
| 
 | 
   155 				stop('No tagAlign/BAM file name provided for parameter -i=')
 | 
| 
 | 
   156 			}
 | 
| 
 | 
   157 			
 | 
| 
 | 
   158 		} else if (grepl('^-s=',each.arg)) { #-s=<sep.min>:<sep.bin>:<sep.max>
 | 
| 
 | 
   159 			
 | 
| 
 | 
   160 			arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on =
 | 
| 
 | 
   161 			if (! is.na(arg.split[2]) ) {
 | 
| 
 | 
   162 				sep.vals <- arg.split[2] # second part is sepmin:sepbin:sepmax
 | 
| 
 | 
   163 				sep.vals.split <- strsplit(sep.vals,':',fixed=TRUE)[[1]] # split on :                
 | 
| 
 | 
   164 				if (length(sep.vals.split) != 3) { # must have 3 parts
 | 
| 
 | 
   165 					stop('Strand shift limits must be specified as -s=sepmin:sepbin:sepmax')                    
 | 
| 
 | 
   166 				} else {
 | 
| 
 | 
   167 					if (any(is.na(as.numeric(sep.vals.split)))) { # check that sep vals are numeric
 | 
| 
 | 
   168 						stop('Strand shift limits must be numeric values')
 | 
| 
 | 
   169 					}
 | 
| 
 | 
   170 					sep.min <- round(as.numeric(sep.vals.split[1]))
 | 
| 
 | 
   171 					sep.bin <- round(as.numeric(sep.vals.split[2]))
 | 
| 
 | 
   172 					sep.max <- round(as.numeric(sep.vals.split[3]))
 | 
| 
 | 
   173 					if ((sep.min > sep.max) || (sep.bin > (sep.max - sep.min)) || (sep.bin < 0)) {
 | 
| 
 | 
   174 						stop('Illegal separation values -s=sepmin:sepbin:sepmax')
 | 
| 
 | 
   175 					}
 | 
| 
 | 
   176 				}                                    
 | 
| 
 | 
   177 			} else {
 | 
| 
 | 
   178 				stop('Strand shift limits must be specified as -s=sepmin:sepbin:sepmax')
 | 
| 
 | 
   179 			}
 | 
| 
 | 
   180 			
 | 
| 
 | 
   181 		} else if (grepl('^-speak=',each.arg)) { #-speak=<sep.peak> , user-defined cross-correlation peak strandshift
 | 
| 
 | 
   182 			
 | 
| 
 | 
   183 			arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on =
 | 
| 
 | 
   184 			if (! is.na(arg.split[2]) ) {
 | 
| 
 | 
   185 				sep.peak <- arg.split[2] # second part is <sep.peak>
 | 
| 
 | 
   186 				if (is.na(as.numeric(sep.peak))) { # check that sep.peak is numeric
 | 
| 
 | 
   187 					stop('-speak=<sep.peak>: User defined peak shift must be numeric')
 | 
| 
 | 
   188 				}
 | 
| 
 | 
   189 				sep.peak <- as.numeric(sep.peak)
 | 
| 
 | 
   190 			} else {
 | 
| 
 | 
   191 				stop('User defined peak shift must be provided as -speak=<sep.peak>')
 | 
| 
 | 
   192 			}
 | 
| 
 | 
   193 			
 | 
| 
 | 
   194 		} else if (grepl('^-x=',each.arg)) { #-x=<exclude.min>:<exclude.max>
 | 
| 
 | 
   195 			
 | 
| 
 | 
   196 			arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on =
 | 
| 
 | 
   197 			if (! is.na(arg.split[2]) ) {
 | 
| 
 | 
   198 				exclude.vals <- arg.split[2] # second part is excludemin:excludemax
 | 
| 
 | 
   199 				exclude.vals.split <- strsplit(exclude.vals,':',fixed=TRUE)[[1]] # split on :                
 | 
| 
 | 
   200 				if (length(exclude.vals.split) != 2) { # must have 2 parts
 | 
| 
 | 
   201 					stop('Exclusion limits must be specified as -x=excludemin:excludemax')
 | 
| 
 | 
   202 				} else {
 | 
| 
 | 
   203 					if (any(is.na(as.numeric(exclude.vals.split)))) { # check that exclude vals are numeric
 | 
| 
 | 
   204 						stop('Exclusion limits must be numeric values')
 | 
| 
 | 
   205 					}
 | 
| 
 | 
   206 					exclude.min <- round(as.numeric(exclude.vals.split[1]))                    
 | 
| 
 | 
   207 					exclude.max <- round(as.numeric(exclude.vals.split[2]))
 | 
| 
 | 
   208 					if (exclude.min > exclude.max) {
 | 
| 
 | 
   209 						stop('Illegal exclusion limits -x=excludemin:excludemax')
 | 
| 
 | 
   210 					}                    
 | 
| 
 | 
   211 				}                                    
 | 
| 
 | 
   212 			} else {
 | 
| 
 | 
   213 				stop('Exclusion limits must be specified as -x=excludemin:excludemax')
 | 
| 
 | 
   214 			}
 | 
| 
 | 
   215 			
 | 
| 
 | 
   216 		} else if (grepl('^-p=',each.arg)) { #-p=<n.nodes> , number of parallel processing nodes, default=NULL            
 | 
| 
 | 
   217 			
 | 
| 
 | 
   218 			arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on =
 | 
| 
 | 
   219 			if (! is.na(arg.split[2]) ) {
 | 
| 
 | 
   220 				n.nodes <- arg.split[2] # second part is numnodes
 | 
| 
 | 
   221 				if (is.na(as.numeric(n.nodes))) { # check that n.nodes is numeric
 | 
| 
 | 
   222 					stop('-p=<numnodes>: numnodes must be numeric')
 | 
| 
 | 
   223 				}
 | 
| 
 | 
   224 				n.nodes <- round(as.numeric(n.nodes))
 | 
| 
 | 
   225 			} else {
 | 
| 
 | 
   226 				stop('Number of parallel nodes must be provided as -p=<numnodes>')
 | 
| 
 | 
   227 			}
 | 
| 
 | 
   228 			
 | 
| 
 | 
   229 		} else if (grepl('^-fdr=',each.arg)) { #-fdr=<fdr> , false discovery rate, default=0.01            
 | 
| 
 | 
   230 			
 | 
| 
 | 
   231 			arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on =
 | 
| 
 | 
   232 			if (! is.na(arg.split[2]) ) {
 | 
| 
 | 
   233 				fdr <- arg.split[2] # second part is fdr
 | 
| 
 | 
   234 				if (is.na(as.numeric(fdr))) { # check that fdr is numeric
 | 
| 
 | 
   235 					stop('-fdr=<falseDiscoveryRate>: false discovery rate must be numeric')
 | 
| 
 | 
   236 				}
 | 
| 
 | 
   237 				fdr <- as.numeric(fdr)
 | 
| 
 | 
   238 			} else {
 | 
| 
 | 
   239 				stop('False discovery rate must be provided as -fdr=<fdr>')
 | 
| 
 | 
   240 			}
 | 
| 
 | 
   241 			
 | 
| 
 | 
   242 		} else if (grepl('^-npeak=',each.arg)) { #-npeak=<numPeaks> , number of peaks threshold, default=NA            
 | 
| 
 | 
   243 			
 | 
| 
 | 
   244 			arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on =
 | 
| 
 | 
   245 			if (! is.na(arg.split[2]) ) {
 | 
| 
 | 
   246 				npeak <- arg.split[2] # second part is npeak
 | 
| 
 | 
   247 				if (is.na(as.numeric(npeak))) { # check that npeak is numeric
 | 
| 
 | 
   248 					stop('-npeak=<numPeaks>: threshold on number of peaks must be numeric')
 | 
| 
 | 
   249 				}
 | 
| 
 | 
   250 				npeak <- round(as.numeric(npeak))
 | 
| 
 | 
   251 			} else {
 | 
| 
 | 
   252 				stop('Threshold on number of peaks must be provided as -npeak=<numPeaks>')
 | 
| 
 | 
   253 			}
 | 
| 
 | 
   254 			
 | 
| 
 | 
   255 		} else if (grepl('^-tmpdir=',each.arg)) { #-tmpdir=<temp.dir>
 | 
| 
 | 
   256 			
 | 
| 
 | 
   257 			arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on =
 | 
| 
 | 
   258 			if (! is.na(arg.split[2]) ) {
 | 
| 
 | 
   259 				temp.dir <- arg.split[2] # second part is temp.dir
 | 
| 
 | 
   260 			} else {
 | 
| 
 | 
   261 				stop('No temporary directory provided for parameter -tmpdir=')
 | 
| 
 | 
   262 			}
 | 
| 
 | 
   263 
 | 
| 
 | 
   264 		} else if (grepl('^-filtchr=',each.arg)) { #-filtchr=<chrname.rm.pattern>
 | 
| 
 | 
   265 			
 | 
| 
 | 
   266 			arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on =
 | 
| 
 | 
   267 			if (! is.na(arg.split[2]) ) {
 | 
| 
 | 
   268 				chrname.rm.pattern <- arg.split[2] # second part is chrname.rm.pattern
 | 
| 
 | 
   269 			} else {
 | 
| 
 | 
   270 				stop('No pattern provided for parameter -filtchr=')
 | 
| 
 | 
   271 			}
 | 
| 
 | 
   272 			
 | 
| 
 | 
   273 		} else if (grepl('^-odir=',each.arg)) { #-odir=<output.odir>
 | 
| 
 | 
   274 			
 | 
| 
 | 
   275 			arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on =
 | 
| 
 | 
   276 			if (! is.na(arg.split[2]) ) {
 | 
| 
 | 
   277 				output.odir <- arg.split[2] # second part is output.odir
 | 
| 
 | 
   278 			} else {
 | 
| 
 | 
   279 				stop('No output directory provided for parameter -odir=')
 | 
| 
 | 
   280 			}
 | 
| 
 | 
   281 			
 | 
| 
 | 
   282 		} else if (grepl('^-savn',each.arg)) { # -savn=<output.npeak.file> OR -savn , save narrowpeak
 | 
| 
 | 
   283 			
 | 
| 
 | 
   284 			arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on =
 | 
| 
 | 
   285 			if (! is.na(arg.split[2])) {                
 | 
| 
 | 
   286 				output.npeak.file <- arg.split[2] #-savn=
 | 
| 
 | 
   287 			} else if (each.arg=='-savn') {
 | 
| 
 | 
   288 				output.npeak.file <- NULL # NULL indicates get the name from the main file name
 | 
| 
 | 
   289 			} else {
 | 
| 
 | 
   290 				stop('Argument for saving narrowPeak file must be -savn or -savn=<filename>')
 | 
| 
 | 
   291 			}
 | 
| 
 | 
   292 			
 | 
| 
 | 
   293 		} else if (grepl('^-savr',each.arg)) { # -savr=<output.rpeak.file> OR -savr , save regionpeak
 | 
| 
 | 
   294 			
 | 
| 
 | 
   295 			arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on =
 | 
| 
 | 
   296 			if (! is.na(arg.split[2])) {                
 | 
| 
 | 
   297 				output.rpeak.file <- arg.split[2] #-savr=
 | 
| 
 | 
   298 			} else if (each.arg=='-savr') {
 | 
| 
 | 
   299 				output.rpeak.file <- NULL # NULL indicates get the name from the main file name
 | 
| 
 | 
   300 			} else {
 | 
| 
 | 
   301 				stop('Argument for saving regionPeak file must be -savr or -savr=<filename>')
 | 
| 
 | 
   302 			}
 | 
| 
 | 
   303 			
 | 
| 
 | 
   304 		} else if (grepl('^-savd',each.arg)) { # -savd=<output.rdata.file> OR -savd , save Rdata file
 | 
| 
 | 
   305 			
 | 
| 
 | 
   306 			arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on =
 | 
| 
 | 
   307 			if (! is.na(arg.split[2])) {                
 | 
| 
 | 
   308 				output.rdata.file <- arg.split[2] #-savd=
 | 
| 
 | 
   309 			} else if (each.arg=='-savd') {
 | 
| 
 | 
   310 				output.rdata.file <- NULL # NULL indicates get the name from the main file name
 | 
| 
 | 
   311 			} else {
 | 
| 
 | 
   312 				stop('Argument for saving Rdata file must be -savd or -savd=<filename>')
 | 
| 
 | 
   313 			}
 | 
| 
 | 
   314 			
 | 
| 
 | 
   315 		} else if (grepl('^-savp',each.arg)) { # -savp=<output.plot.file> OR -savp , save cross-correlation plot                       
 | 
| 
 | 
   316 			
 | 
| 
 | 
   317 			arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on =
 | 
| 
 | 
   318 			if (! is.na(arg.split[2])) {                
 | 
| 
 | 
   319 				output.plot.file <- arg.split[2] #-savp=
 | 
| 
 | 
   320 			} else if (each.arg=='-savp') {
 | 
| 
 | 
   321 				output.plot.file <- NULL # NULL indicates get the name from the main file name
 | 
| 
 | 
   322 			} else {
 | 
| 
 | 
   323 				stop('Argument for saving Rdata file must be -savp or -savp=<filename>')
 | 
| 
 | 
   324 			}
 | 
| 
 | 
   325 			
 | 
| 
 | 
   326 		} else if (grepl('^-out=',each.arg)) { #-out=<output.result.file>
 | 
| 
 | 
   327 			
 | 
| 
 | 
   328 			arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on =
 | 
| 
 | 
   329 			if (! is.na(arg.split[2]) ) {
 | 
| 
 | 
   330 				output.result.file <- arg.split[2] # second part is output.result.file
 | 
| 
 | 
   331 			} else {
 | 
| 
 | 
   332 				stop('No result file provided for parameter -out=')
 | 
| 
 | 
   333 			}
 | 
| 
 | 
   334 		
 | 
| 
 | 
   335 		} else if (each.arg == '-rf') {
 | 
| 
 | 
   336 			
 | 
| 
 | 
   337 			replace.flag <- TRUE
 | 
| 
 | 
   338 		
 | 
| 
 | 
   339 		} else if (each.arg == '-clean') {
 | 
| 
 | 
   340 			
 | 
| 
 | 
   341 			clean.files.flag <- TRUE
 | 
| 
 | 
   342 			
 | 
| 
 | 
   343 		} else {
 | 
| 
 | 
   344 			
 | 
| 
 | 
   345 			stop('Illegal argument ',each.arg)
 | 
| 
 | 
   346 		}        
 | 
| 
 | 
   347 	}
 | 
| 
 | 
   348 	# End: for loop
 | 
| 
 | 
   349 	
 | 
| 
 | 
   350 	# Check mandatory arguments
 | 
| 
 | 
   351 	if (is.na(chip.file)) {
 | 
| 
 | 
   352 		stop('-c=<tagAlign/BAMFileName> is a mandatory argument')
 | 
| 
 | 
   353 	}
 | 
| 
 | 
   354 	
 | 
| 
 | 
   355 	if (is.na(control.file) && ! is.na(output.npeak.file)) {
 | 
| 
 | 
   356 		stop('-i=<tagAlign/BAMFileName> is required for peak calling')
 | 
| 
 | 
   357 	}
 | 
| 
 | 
   358 	
 | 
| 
 | 
   359 	# Check if ChIP and control files are URLs
 | 
| 
 | 
   360 	if (grepl('^http://',chip.file)) {
 | 
| 
 | 
   361 		isurl.chip.file <- TRUE
 | 
| 
 | 
   362 	}
 | 
| 
 | 
   363 	if (grepl('^http://',control.file)) {
 | 
| 
 | 
   364 		isurl.control.file <- TRUE
 | 
| 
 | 
   365 	}
 | 
| 
 | 
   366 	
 | 
| 
 | 
   367 	# If ChIP file is a URL output.odir MUST be specified
 | 
| 
 | 
   368 	if (isurl.chip.file && is.na(output.odir)) {
 | 
| 
 | 
   369 		stop('If ChIP file is a URL, then output directory MUST be specified')
 | 
| 
 | 
   370 	}
 | 
| 
 | 
   371 	
 | 
| 
 | 
   372 	# Check that ChIP and control files exist
 | 
| 
 | 
   373 	if (isurl.chip.file) {
 | 
| 
 | 
   374 		if (system(paste('wget -q --spider',chip.file)) != 0) {
 | 
| 
 | 
   375 			stop('ChIP file URL not valid: ',chip.file)
 | 
| 
 | 
   376 		}
 | 
| 
 | 
   377 	} else if (!file.exists(chip.file)) {
 | 
| 
 | 
   378 		stop('ChIP File:',chip.file,' does not exist')
 | 
| 
 | 
   379 	}
 | 
| 
 | 
   380 	
 | 
| 
 | 
   381 	if (!is.na(control.file)) {
 | 
| 
 | 
   382 		if (isurl.control.file) {
 | 
| 
 | 
   383 			if (system(paste('wget -q --spider',control.file)) != 0) {
 | 
| 
 | 
   384 				stop('Control file URL not valid: ',control.file)
 | 
| 
 | 
   385 			}
 | 
| 
 | 
   386 		} else if (!file.exists(control.file)) {
 | 
| 
 | 
   387 			stop('Control File:',control.file,' does not exist')
 | 
| 
 | 
   388 		}   
 | 
| 
 | 
   389 	}
 | 
| 
 | 
   390 	
 | 
| 
 | 
   391 	# Correct other arguments
 | 
| 
 | 
   392 	if (is.na(output.odir)) { # Reconstruct output.odir if not provided
 | 
| 
 | 
   393 		output.odir <- get.file.parts(chip.file)$path
 | 
| 
 | 
   394 	}
 | 
| 
 | 
   395 	
 | 
| 
 | 
   396 	if (is.null(output.npeak.file)) { # Reconstruct output.npeak.file if NULL
 | 
| 
 | 
   397 		output.npeak.file <- file.path(output.odir, paste(get.file.parts(chip.file)$name, '_VS_', get.file.parts(control.file)$name,'.narrowPeak', sep=""))
 | 
| 
 | 
   398 	}
 | 
| 
 | 
   399 	
 | 
| 
 | 
   400 	if (is.null(output.rpeak.file)) { # Reconstruct output.rpeak.file if NULL
 | 
| 
 | 
   401 		output.rpeak.file <- file.path(output.odir, paste(get.file.parts(chip.file)$name, '_VS_', get.file.parts(control.file)$name,'.regionPeak', sep=""))
 | 
| 
 | 
   402 	}
 | 
| 
 | 
   403 	
 | 
| 
 | 
   404 	if (is.null(output.rdata.file)) { # Reconstruct output.rdata.file if NULL
 | 
| 
 | 
   405 		output.rdata.file <- file.path(output.odir, paste(get.file.parts(chip.file)$name, '.Rdata', sep=""))
 | 
| 
 | 
   406 	}
 | 
| 
 | 
   407 	
 | 
| 
 | 
   408 	if (is.null(output.plot.file)) { # Reconstruct output.plot.file if NULL
 | 
| 
 | 
   409 		output.plot.file <- file.path(output.odir, paste(get.file.parts(chip.file)$name, '.pdf', sep=""))
 | 
| 
 | 
   410 	}
 | 
| 
 | 
   411 	
 | 
| 
 | 
   412 	return(list(chip.file=chip.file,
 | 
| 
 | 
   413 					isurl.chip.file=isurl.chip.file,
 | 
| 
 | 
   414 					control.file=control.file,
 | 
| 
 | 
   415 					isurl.control.file=isurl.control.file,
 | 
| 
 | 
   416 					sep.range=c(sep.min,sep.bin,sep.max),
 | 
| 
 | 
   417 					sep.peak=sep.peak,
 | 
| 
 | 
   418 					ex.range=c(exclude.min,exclude.max),
 | 
| 
 | 
   419 					n.nodes=n.nodes,
 | 
| 
 | 
   420 					fdr=fdr,
 | 
| 
 | 
   421 					npeak=npeak,
 | 
| 
 | 
   422 					temp.dir=temp.dir,
 | 
| 
 | 
   423 					chrname.rm.pattern=chrname.rm.pattern,
 | 
| 
 | 
   424 					output.odir=output.odir,
 | 
| 
 | 
   425 					output.npeak.file=output.npeak.file,
 | 
| 
 | 
   426 					output.rpeak.file=output.rpeak.file,
 | 
| 
 | 
   427 					output.rdata.file=output.rdata.file,
 | 
| 
 | 
   428 					output.plot.file=output.plot.file,
 | 
| 
 | 
   429 					output.result.file=output.result.file,
 | 
| 
 | 
   430 					replace.flag=replace.flag,
 | 
| 
 | 
   431 					clean.files.flag=clean.files.flag))          
 | 
| 
 | 
   432 } # end: parse.arguments()
 | 
| 
 | 
   433 
 | 
| 
 | 
   434 read.align <- function(align.filename) {
 | 
| 
 | 
   435 # ===================================
 | 
| 
 | 
   436 # Function will read a tagAlign or BAM file
 | 
| 
 | 
   437 # ===================================	
 | 
| 
 | 
   438 	if (grepl('(\\.bam)?.*(\\.tagAlign)',align.filename)) { # if tagalign file
 | 
| 
 | 
   439 		chip.data <- read.tagalign.tags(align.filename)
 | 
| 
 | 
   440 		# get readlength info
 | 
| 
 | 
   441 		tmpDataRows <- read.table(align.filename,nrows=500)
 | 
| 
 | 
   442 		chip.data$read.length <- round(median(tmpDataRows$V3 - tmpDataRows$V2))
 | 
| 
 | 
   443 	} else if (grepl('(\\.tagAlign)?.*(\\.bam)',align.filename)) { # if bam file
 | 
| 
 | 
   444 		# create BAM file name
 | 
| 
 | 
   445 		bam2align.filename <- sub('\\.bam','.tagAlign',align.filename)
 | 
| 
 | 
   446 		# generate command to convert bam to tagalign
 | 
| 
 | 
   447 		command <- vector(length=2)
 | 
| 
 | 
   448 		command[1] <- sprintf("samtools view -F 0x0204 -o - %s",align.filename)
 | 
| 
 | 
   449 		command[2] <- paste("awk 'BEGIN{FS=" , '"\t"' , ";OFS=", '"\t"} {if (and($2,16) > 0) {print $3,($4-1),($4-1+length($10)),"N","1000","-"} else {print $3,($4-1),($4-1+length($10)),"N","1000","+"}}', "' 1> ", bam2align.filename, sep="")
 | 
| 
 | 
   450 		# command[2] <- paste("awk 'BEGIN{OFS=", '"\t"} {if (and($2,16) > 0) {print $3,($4-1),($4-1+length($10)),"N","1000","-"} else {print $3,($4-1),($4-1+length($10)),"N","1000","+"}}', "' 1> ", bam2align.filename, sep="")	
 | 
| 
 | 
   451 		command <- paste(command,collapse=" | ")
 | 
| 
 | 
   452 		# Run command
 | 
| 
 | 
   453 		status <- system(command,intern=FALSE,ignore.stderr=FALSE)
 | 
| 
 | 
   454 		if ((status != 0) || !file.exists(bam2align.filename)) {
 | 
| 
 | 
   455 			cat(sprintf("Error converting BAM to tagalign file: %s\n",align.filename),file=stderr())
 | 
| 
 | 
   456 			q(save="no",status=1)
 | 
| 
 | 
   457 		}
 | 
| 
 | 
   458 		# read converted BAM file
 | 
| 
 | 
   459 		chip.data <- read.tagalign.tags(bam2align.filename)
 | 
| 
 | 
   460 		# get readlength info
 | 
| 
 | 
   461 		tmpDataRows <- read.table(bam2align.filename,nrows=500)
 | 
| 
 | 
   462 		chip.data$read.length <- round(median(tmpDataRows$V3 - tmpDataRows$V2))
 | 
| 
 | 
   463 		# delete temporary tagalign file
 | 
| 
 | 
   464 		file.remove(bam2align.filename)	
 | 
| 
 | 
   465 	} else {
 | 
| 
 | 
   466 		cat(sprintf("Error:Unknown file format for file:%s\n",align.fname),file=stderr())
 | 
| 
 | 
   467 		q(save="no",status=1)	
 | 
| 
 | 
   468 	}
 | 
| 
 | 
   469 	return(chip.data)
 | 
| 
 | 
   470 } # end: read.align()
 | 
| 
 | 
   471 
 | 
| 
 | 
   472 print.run.params <- function(params){
 | 
| 
 | 
   473 # ===================================
 | 
| 
 | 
   474 # Output run parameters
 | 
| 
 | 
   475 # ===================================		
 | 
| 
 | 
   476 	cat('################\n',file=stdout())
 | 
| 
 | 
   477 	cat(iparams$chip.file,
 | 
| 
 | 
   478 			iparams$control.file,
 | 
| 
 | 
   479 			iparams$sep.range,
 | 
| 
 | 
   480 			iparams$sep.peak,
 | 
| 
 | 
   481 			iparams$ex.range,
 | 
| 
 | 
   482 			iparams$n.nodes,
 | 
| 
 | 
   483 			iparams$fdr,
 | 
| 
 | 
   484 			iparams$npeak,
 | 
| 
 | 
   485 			iparams$output.odir,
 | 
| 
 | 
   486 			iparams$output.npeak.file,
 | 
| 
 | 
   487 			iparams$output.rpeak.file,
 | 
| 
 | 
   488 			iparams$output.rdata.file,
 | 
| 
 | 
   489 			iparams$output.plot.file,
 | 
| 
 | 
   490 			iparams$output.result.file,
 | 
| 
 | 
   491 			iparams$replace.flag,  
 | 
| 
 | 
   492 			labels=c('ChIP data:','Control data:', 'strandshift(min):','strandshift(step):','strandshift(max)','user-defined peak shift',
 | 
| 
 | 
   493 					'exclusion(min):','exclusion(max):','num parallel nodes:','FDR threshold:','NumPeaks Threshold:','Output Directory:',
 | 
| 
 | 
   494 					'narrowPeak output file name:', 'regionPeak output file name:', 'Rdata filename:', 
 | 
| 
 | 
   495 					'plot pdf filename:','result filename:','Overwrite files?:'),
 | 
| 
 | 
   496 			fill=18,
 | 
| 
 | 
   497 			file=stdout())	
 | 
| 
 | 
   498 	cat('\n',file=stdout())	
 | 
| 
 | 
   499 } # end: print.run.parameters()
 | 
| 
 | 
   500 
 | 
| 
 | 
   501 check.replace.flag <- function(params){
 | 
| 
 | 
   502 # ===================================
 | 
| 
 | 
   503 # Check if files exist
 | 
| 
 | 
   504 # ===================================
 | 
| 
 | 
   505 # If replace.flag is NOT set, check if output files exist and abort if necessary
 | 
| 
 | 
   506 	if (! iparams$replace.flag) {
 | 
| 
 | 
   507 		if (! is.na(iparams$output.npeak.file)) {
 | 
| 
 | 
   508 			if (file.exists(iparams$output.npeak.file)) {
 | 
| 
 | 
   509 				cat('narrowPeak file already exists. Aborting Run. Use -rf if you want to overwrite\n',file=stderr())
 | 
| 
 | 
   510 				q(save="no",status=1)
 | 
| 
 | 
   511 			}
 | 
| 
 | 
   512 		}
 | 
| 
 | 
   513 		if (! is.na(iparams$output.rpeak.file)) {
 | 
| 
 | 
   514 			if (file.exists(iparams$output.rpeak.file)) {
 | 
| 
 | 
   515 				cat('regionPeak file already exists. Aborting Run. Use -rf if you want to overwrite\n',file=stderr())
 | 
| 
 | 
   516 				q(save="no",status=1)
 | 
| 
 | 
   517 			}
 | 
| 
 | 
   518 		}    
 | 
| 
 | 
   519 		if (! is.na(iparams$output.plot.file)) {
 | 
| 
 | 
   520 			if (file.exists(iparams$output.plot.file)) {
 | 
| 
 | 
   521 				cat('Plot file already exists. Aborting Run. Use -rf if you want to overwrite\n',file=stderr())
 | 
| 
 | 
   522 				q(save="no",status=1)
 | 
| 
 | 
   523 			}
 | 
| 
 | 
   524 		}
 | 
| 
 | 
   525 		if (! is.na(iparams$output.rdata.file)) {
 | 
| 
 | 
   526 			if (file.exists(iparams$output.rdata.file)) {
 | 
| 
 | 
   527 				cat('Rdata file already exists. Aborting Run. Use -rf if you want to overwrite\n',file=stderr())
 | 
| 
 | 
   528 				q(save="no",status=1)
 | 
| 
 | 
   529 			}
 | 
| 
 | 
   530 		}
 | 
| 
 | 
   531 	}	
 | 
| 
 | 
   532 }
 | 
| 
 | 
   533 
 | 
| 
 | 
   534 # #############################################################################
 | 
| 
 | 
   535 # MAIN FUNCTION
 | 
| 
 | 
   536 # #############################################################################
 | 
| 
 | 
   537 
 | 
| 
 | 
   538 # Check number of arguments
 | 
| 
 | 
   539 minargs = 1;
 | 
| 
 | 
   540 maxargs = 17;
 | 
| 
 | 
   541 if (nargs < minargs | nargs > maxargs) {
 | 
| 
 | 
   542 	print.usage()
 | 
| 
 | 
   543 	q(save="no",status=1)
 | 
| 
 | 
   544 }
 | 
| 
 | 
   545 
 | 
| 
 | 
   546 # Parse arguments
 | 
| 
 | 
   547 # iparams$chip.file
 | 
| 
 | 
   548 # iparams$isurl.chip.file
 | 
| 
 | 
   549 # iparams$control.file
 | 
| 
 | 
   550 # iparams$isurl.control.file
 | 
| 
 | 
   551 # iparams$sep.range
 | 
| 
 | 
   552 # iparams$sep.peak
 | 
| 
 | 
   553 # iparams$ex.range
 | 
| 
 | 
   554 # iparams$n.nodes
 | 
| 
 | 
   555 # iparams$fdr
 | 
| 
 | 
   556 # iparams$npeak
 | 
| 
 | 
   557 # iparams$temp.dir
 | 
| 
 | 
   558 # iparams$output.odir
 | 
| 
 | 
   559 # iparams$output.npeak.file
 | 
| 
 | 
   560 # iparams$output.rpeak.file
 | 
| 
 | 
   561 # iparams$output.rdata.file
 | 
| 
 | 
   562 # iparams$output.plot.file
 | 
| 
 | 
   563 # iparams$output.result.file
 | 
| 
 | 
   564 # iparams$replace.flag
 | 
| 
 | 
   565 # iparams$clean.files.flag
 | 
| 
 | 
   566 iparams <- parse.arguments(args)
 | 
| 
 | 
   567 
 | 
| 
 | 
   568 # Print run parameters
 | 
| 
 | 
   569 print.run.params(iparams)
 | 
| 
 | 
   570 
 | 
| 
 | 
   571 # Check if output files exist 
 | 
| 
 | 
   572 check.replace.flag(iparams)
 | 
| 
 | 
   573 
 | 
| 
 | 
   574 # curr.chip.file and curr.control.file always point to the original ChIP and control files on disk
 | 
| 
 | 
   575 # ta.chip.filename & ta.control.filename always point to the final but temporary versions of the ChIP and control files that will be passed to read.align
 | 
| 
 | 
   576 
 | 
| 
 | 
   577 # Download ChIP and control files if necessary to temp.dir
 | 
| 
 | 
   578 if (iparams$isurl.chip.file) {
 | 
| 
 | 
   579 	curr.chip.file <- file.path(iparams$temp.dir, get.file.parts(iparams$chip.file)$fullname) # file is downloaded to temp.dir. Has same name as URL suffix
 | 
| 
 | 
   580 	cat('Downloading ChIP file:',iparams$chip.file,"\n",file=stdout())
 | 
| 
 | 
   581 	if (system(paste('wget -N -q -P',iparams$temp.dir,iparams$chip.file)) != 0) {
 | 
| 
 | 
   582 		stop('Error downloading ChIP file:',iparams$chip.file)
 | 
| 
 | 
   583 	}
 | 
| 
 | 
   584 } else {
 | 
| 
 | 
   585 	curr.chip.file <- iparams$chip.file # file is in original directory
 | 
| 
 | 
   586 }
 | 
| 
 | 
   587 
 | 
| 
 | 
   588 if (iparams$isurl.control.file) {
 | 
| 
 | 
   589 	curr.control.file <- file.path(iparams$temp.dir, get.file.parts(iparams$control.file)$fullname) # file is downloaded to temp.dir. Has same name as URL suffix
 | 
| 
 | 
   590 	cat('Downloading control file:',iparams$control.file,"\n",file=stdout())
 | 
| 
 | 
   591 	if (system(paste('wget -N -q -P',iparams$temp.dir,iparams$control.file)) != 0) {
 | 
| 
 | 
   592 		stop('Error downloading Control file:',iparams$control.file)
 | 
| 
 | 
   593 	}
 | 
| 
 | 
   594 } else {
 | 
| 
 | 
   595 	curr.control.file <- iparams$control.file # file is in original directory
 | 
| 
 | 
   596 }
 | 
| 
 | 
   597 
 | 
| 
 | 
   598 # unzip ChIP and input files if required AND copy to temp directory
 | 
| 
 | 
   599 if (get.file.parts(curr.chip.file)$ext == '.gz') {
 | 
| 
 | 
   600 	ta.chip.filename <- tempfile(get.file.parts(curr.chip.file)$name, tmpdir=iparams$temp.dir) # unzip file to temp.dir/[filename with .gz removed][randsuffix]
 | 
| 
 | 
   601 	cat('Decompressing ChIP file\n',file=stdout())
 | 
| 
 | 
   602 	if (system(paste("gunzip -c",curr.chip.file,">",ta.chip.filename)) != 0) {
 | 
| 
 | 
   603 		stop('Unable to decompress file:', iparams$chip.file)
 | 
| 
 | 
   604 	}
 | 
| 
 | 
   605 	if (iparams$clean.files.flag) { # Remove original file if clean.files.flag is set
 | 
| 
 | 
   606 		file.remove(curr.chip.file)
 | 
| 
 | 
   607 	}
 | 
| 
 | 
   608 } else {
 | 
| 
 | 
   609 	ta.chip.filename <- tempfile(get.file.parts(curr.chip.file)$fullname, tmpdir=iparams$temp.dir)
 | 
| 
 | 
   610 	if (iparams$clean.files.flag) {
 | 
| 
 | 
   611 		file.rename(curr.chip.file,ta.chip.filename) # move file to temp.dir/[filename][randsuffix]
 | 
| 
 | 
   612 	} else {
 | 
| 
 | 
   613 		file.copy(curr.chip.file,ta.chip.filename) # copy file to temp.dir/[filename][randsuffix]		
 | 
| 
 | 
   614 	}	
 | 
| 
 | 
   615 }
 | 
| 
 | 
   616 
 | 
| 
 | 
   617 if (! is.na(iparams$control.file)) {
 | 
| 
 | 
   618 	if (get.file.parts(curr.control.file)$ext == '.gz') {
 | 
| 
 | 
   619 		ta.control.filename <- tempfile(get.file.parts(curr.control.file)$name, tmpdir=iparams$temp.dir) # unzip file to temp.dir/[filename with .gz removed][randsuffix]
 | 
| 
 | 
   620 		cat('Decompressing control file\n',file=stdout())        
 | 
| 
 | 
   621 		if (system(paste("gunzip -c",curr.control.file,">",ta.control.filename)) != 0) {
 | 
| 
 | 
   622 			stop('Unable to decompress file:', iparams$control.file)
 | 
| 
 | 
   623 		}
 | 
| 
 | 
   624 		if (iparams$clean.files.flag) { # Remove original file if clean.files.flag is set
 | 
| 
 | 
   625 			file.remove(curr.control.file)
 | 
| 
 | 
   626 		}				
 | 
| 
 | 
   627 	} else {
 | 
| 
 | 
   628 		ta.control.filename <- tempfile(get.file.parts(curr.control.file)$fullname, tmpdir=iparams$temp.dir) # copy file to temp.dir/[filename][randsuffix]
 | 
| 
 | 
   629 		
 | 
| 
 | 
   630 		if (iparams$clean.files.flag) {
 | 
| 
 | 
   631 			file.rename(curr.control.file,ta.control.filename) # move file to temp.dir/[filename][randsuffix]
 | 
| 
 | 
   632 		} else {
 | 
| 
 | 
   633 			file.copy(curr.control.file,ta.control.filename) # copy file to temp.dir/[filename][randsuffix]		
 | 
| 
 | 
   634 		}			
 | 
| 
 | 
   635 	}
 | 
| 
 | 
   636 }
 | 
| 
 | 
   637 
 | 
| 
 | 
   638 # Remove downloaded files
 | 
| 
 | 
   639 if (iparams$isurl.chip.file & file.exists(curr.chip.file))  {
 | 
| 
 | 
   640 	file.remove(curr.chip.file)
 | 
| 
 | 
   641 }
 | 
| 
 | 
   642 
 | 
| 
 | 
   643 if (! is.na(iparams$control.file)) {
 | 
| 
 | 
   644 	if (iparams$isurl.control.file & file.exists(curr.control.file)) {
 | 
| 
 | 
   645 		file.remove(curr.control.file)
 | 
| 
 | 
   646 	}
 | 
| 
 | 
   647 }
 | 
| 
 | 
   648 
 | 
| 
 | 
   649 # Load SPP library
 | 
| 
 | 
   650 library(spp)
 | 
| 
 | 
   651 
 | 
| 
 | 
   652 # Read ChIP tagAlign/BAM files
 | 
| 
 | 
   653 cat("Reading ChIP tagAlign/BAM file",iparams$chip.file,"\n",file=stdout())
 | 
| 
 | 
   654 chip.data <- read.align(ta.chip.filename)
 | 
| 
 | 
   655 cat("ChIP data read length",chip.data$read.length,"\n",file=stdout())
 | 
| 
 | 
   656 file.remove(ta.chip.filename) # Delete temporary file
 | 
| 
 | 
   657 if (length(chip.data$tags)==0) {
 | 
| 
 | 
   658 	stop('Error in ChIP file format:', iparams$chip.file)
 | 
| 
 | 
   659 }
 | 
| 
 | 
   660 # Remove illegal chromosome names
 | 
| 
 | 
   661 if (! is.na(iparams$chrname.rm.pattern)) {
 | 
| 
 | 
   662 	selectidx <- which(grepl(iparams$chrname.rm.pattern,names(chip.data$tags))==FALSE)
 | 
| 
 | 
   663 	chip.data$tags <- chip.data$tags[selectidx]
 | 
| 
 | 
   664 	chip.data$quality <- chip.data$quality[selectidx]
 | 
| 
 | 
   665 }
 | 
| 
 | 
   666 chip.data$num.tags <- sum(unlist(lapply(chip.data$tags,function(d) length(d))))
 | 
| 
 | 
   667 
 | 
| 
 | 
   668 # Read Control tagAlign/BAM files
 | 
| 
 | 
   669 if (! is.na(iparams$control.file)) {
 | 
| 
 | 
   670 	cat("Reading Control tagAlign/BAM file",iparams$control.file,"\n",file=stdout())
 | 
| 
 | 
   671 	control.data <- read.align(ta.control.filename)
 | 
| 
 | 
   672 	file.remove(ta.control.filename) # Delete temporary file    
 | 
| 
 | 
   673 	if (length(control.data$tags)==0) {
 | 
| 
 | 
   674 		stop('Error in control file format:', iparams$chip.file)
 | 
| 
 | 
   675 	}    
 | 
| 
 | 
   676 	cat("Control data read length",control.data$read.length,"\n",file=stdout())
 | 
| 
 | 
   677 	# Remove illegal chromosome names
 | 
| 
 | 
   678 	if (! is.na(iparams$chrname.rm.pattern)) {
 | 
| 
 | 
   679 		selectidx <- which(grepl(iparams$chrname.rm.pattern,names(control.data$tags))==FALSE)
 | 
| 
 | 
   680 		control.data$tags <- control.data$tags[selectidx]
 | 
| 
 | 
   681 		control.data$quality <- control.data$quality[selectidx]
 | 
| 
 | 
   682 	}
 | 
| 
 | 
   683 	control.data$num.tags <- sum(unlist(lapply(control.data$tags,function(d) length(d))))
 | 
| 
 | 
   684 }
 | 
| 
 | 
   685 
 | 
| 
 | 
   686 # Open multiple processes if required
 | 
| 
 | 
   687 if (is.na(iparams$n.nodes)) {
 | 
| 
 | 
   688 	cluster.nodes <- NULL
 | 
| 
 | 
   689 } else {
 | 
| 
 | 
   690 	library(snow)
 | 
| 
 | 
   691 	cluster.nodes <- makeCluster(iparams$n.nodes)
 | 
| 
 | 
   692 }
 | 
| 
 | 
   693 
 | 
| 
 | 
   694 # #################################    
 | 
| 
 | 
   695 # Calculate cross-correlation for various strand shifts
 | 
| 
 | 
   696 # #################################    
 | 
| 
 | 
   697 cat("Calculating peak characteristics\n",file=stdout())
 | 
| 
 | 
   698 # crosscorr
 | 
| 
 | 
   699 # $cross.correlation : Cross-correlation profile as an $x/$y data.frame
 | 
| 
 | 
   700 # $peak : Position ($x) and height ($y) of automatically detected cross-correlation peak.
 | 
| 
 | 
   701 # $whs: Optimized window half-size for binding detection (based on the width of the cross-correlation peak) 
 | 
| 
 | 
   702 crosscorr <- get.binding.characteristics(chip.data,
 | 
| 
 | 
   703 		srange=iparams$sep.range[c(1,3)],
 | 
| 
 | 
   704 		bin=iparams$sep.range[2],
 | 
| 
 | 
   705 		accept.all.tags=T,
 | 
| 
 | 
   706 		cluster=cluster.nodes)
 | 
| 
 | 
   707 if (!is.na(iparams$n.nodes)) {
 | 
| 
 | 
   708 	stopCluster(cluster.nodes)
 | 
| 
 | 
   709 }
 | 
| 
 | 
   710 
 | 
| 
 | 
   711 # Smooth the cross-correlation curve if required
 | 
| 
 | 
   712 cc <- crosscorr$cross.correlation
 | 
| 
 | 
   713 crosscorr$min.cc <- crosscorr$cross.correlation[ which.min(crosscorr$cross.correlation$y) , ] # minimum value and shift of cross-correlation
 | 
| 
 | 
   714 cat("Minimum cross-correlation value", crosscorr$min.cc$y,"\n",file=stdout())
 | 
| 
 | 
   715 cat("Minimum cross-correlation shift", crosscorr$min.cc$x,"\n",file=stdout())
 | 
| 
 | 
   716 sbw <- 2*floor(ceiling(5/iparams$sep.range[2]) / 2) + 1 # smoothing bandwidth
 | 
| 
 | 
   717 cc$y <- runmean(cc$y,sbw,alg="fast")
 | 
| 
 | 
   718 
 | 
| 
 | 
   719 # Compute cross-correlation peak
 | 
| 
 | 
   720 bw <- ceiling(2/iparams$sep.range[2]) # crosscorr[i] is compared to crosscorr[i+/-bw] to find peaks
 | 
| 
 | 
   721 peakidx <- (diff(cc$y,bw)>=0) # cc[i] > cc[i-bw]
 | 
| 
 | 
   722 peakidx <- diff(peakidx,bw)
 | 
| 
 | 
   723 peakidx <- which(peakidx==-1) + bw        
 | 
| 
 | 
   724 
 | 
| 
 | 
   725 # exclude peaks from the excluded region
 | 
| 
 | 
   726 if ( is.nan(iparams$ex.range[2]) ) {
 | 
| 
 | 
   727 	iparams$ex.range[2] <- chip.data$read.length+10
 | 
| 
 | 
   728 }
 | 
| 
 | 
   729 peakidx <- peakidx[(cc$x[peakidx] < iparams$ex.range[1]) | (cc$x[peakidx] > iparams$ex.range[2])]    
 | 
| 
 | 
   730 cc <- cc[peakidx,]
 | 
| 
 | 
   731 
 | 
| 
 | 
   732 # Find max peak position and other peaks within 0.9*max_peakvalue that are further away from maxpeakposition   
 | 
| 
 | 
   733 maxpeakidx <- which.max(cc$y)
 | 
| 
 | 
   734 maxpeakshift <- cc$x[maxpeakidx]
 | 
| 
 | 
   735 maxpeakval <- cc$y[maxpeakidx]
 | 
| 
 | 
   736 peakidx <-which((cc$y >= 0.9*maxpeakval) & (cc$x >= maxpeakshift)) 
 | 
| 
 | 
   737 cc <- cc[peakidx,]
 | 
| 
 | 
   738 
 | 
| 
 | 
   739 # sort the peaks and get the top 3
 | 
| 
 | 
   740 sortidx <- order(cc$y,decreasing=TRUE)
 | 
| 
 | 
   741 sortidx <- sortidx[c(1:min(3,length(sortidx)))]
 | 
| 
 | 
   742 cc.peak <- cc[sortidx,]
 | 
| 
 | 
   743 
 | 
| 
 | 
   744 # Override peak shift if user supplies peak shift
 | 
| 
 | 
   745 if (! is.na(iparams$sep.peak)) {
 | 
| 
 | 
   746 	cc.peak <- approx(crosscorr$cross.correlation$x,crosscorr$cross.correlation$y,iparams$sep.peak,rule=2)
 | 
| 
 | 
   747 }
 | 
| 
 | 
   748 cat("Peak cross-correlation value", paste(cc.peak$y,collapse=","),"\n",file=stdout())
 | 
| 
 | 
   749 cat("Peak strand shift",paste(cc.peak$x,collapse=","),"\n",file=stdout())
 | 
| 
 | 
   750 
 | 
| 
 | 
   751 # Reset values in crosscorr
 | 
| 
 | 
   752 crosscorr$peak$x <- cc.peak$x[1]
 | 
| 
 | 
   753 crosscorr$peak$y <- cc.peak$y[1]
 | 
| 
 | 
   754 
 | 
| 
 | 
   755 # Compute window half size
 | 
| 
 | 
   756 whs.thresh <- crosscorr$min.cc$y + (crosscorr$peak$y - crosscorr$min.cc$y)/3
 | 
| 
 | 
   757 crosscorr$whs <- max(crosscorr$cross.correlation$x[crosscorr$cross.correlation$y >= whs.thresh])
 | 
| 
 | 
   758 cat("Window half size",crosscorr$whs,"\n",file=stdout())
 | 
| 
 | 
   759 
 | 
| 
 | 
   760 # Compute phantom peak coefficient
 | 
| 
 | 
   761 ph.peakidx <- which( ( crosscorr$cross.correlation$x >= ( chip.data$read.length - round(2*iparams$sep.range[2]) ) ) & 
 | 
| 
 | 
   762                      ( crosscorr$cross.correlation$x <= ( chip.data$read.length + round(1.5*iparams$sep.range[2]) ) ) )
 | 
| 
 | 
   763 ph.peakidx <- ph.peakidx[ which.max(crosscorr$cross.correlation$y[ph.peakidx]) ]
 | 
| 
 | 
   764 crosscorr$phantom.cc <- crosscorr$cross.correlation[ph.peakidx,]
 | 
| 
 | 
   765 cat("Phantom peak location",crosscorr$phantom.cc$x,"\n",file=stdout())
 | 
| 
 | 
   766 cat("Phantom peak Correlation",crosscorr$phantom.cc$y,"\n",file=stdout())
 | 
| 
 | 
   767 crosscorr$phantom.coeff <- crosscorr$peak$y / crosscorr$phantom.cc$y
 | 
| 
 | 
   768 crosscorr$phantom.coeff <- crosscorr$peak$y / crosscorr$min.cc$y
 | 
| 
 | 
   769 cat("Normalized cross-correlation coefficient (NCCC)",crosscorr$phantom.coeff,"\n",file=stdout())
 | 
| 
 | 
   770 crosscorr$rel.phantom.coeff <- (crosscorr$peak$y - crosscorr$min.cc$y) / (crosscorr$phantom.cc$y - crosscorr$min.cc$y)
 | 
| 
 | 
   771 cat("Relative Cross correlation Coefficient (RCCC)",crosscorr$rel.phantom.coeff,"\n",file=stdout())
 | 
| 
 | 
   772 crosscorr$phantom.quality.tag <- NA
 | 
| 
 | 
   773 if ( (crosscorr$rel.phantom.coeff >= 0) & (crosscorr$rel.phantom.coeff < 0.25) ) {
 | 
| 
 | 
   774 	crosscorr$phantom.quality.tag <- -2
 | 
| 
 | 
   775 } else if ( (crosscorr$rel.phantom.coeff >= 0.25) & (crosscorr$rel.phantom.coeff < 0.5) ) {
 | 
| 
 | 
   776 	crosscorr$phantom.quality.tag <- -1
 | 
| 
 | 
   777 } else if ( (crosscorr$rel.phantom.coeff >= 0.5) & (crosscorr$rel.phantom.coeff < 1) ) {
 | 
| 
 | 
   778 	crosscorr$phantom.quality.tag <- 0
 | 
| 
 | 
   779 } else if ( (crosscorr$rel.phantom.coeff >= 1) & (crosscorr$rel.phantom.coeff < 1.5) ) {
 | 
| 
 | 
   780 	crosscorr$phantom.quality.tag <- 1
 | 
| 
 | 
   781 } else if ( (crosscorr$rel.phantom.coeff >= 1.5) ) {
 | 
| 
 | 
   782 	crosscorr$phantom.quality.tag <- 2
 | 
| 
 | 
   783 }
 | 
| 
 | 
   784 cat("Phantom Peak Quality Tag",crosscorr$phantom.quality.tag,"\n",file=stdout())
 | 
| 
 | 
   785 
 | 
| 
 | 
   786 # Output result to result file if required
 | 
| 
 | 
   787 #Filename\tnumReads\tPeak_shift\tPeak_Correlation\tRead_length\tPhantomPeak_Correlation\tMin_Correlation_Shift\tMin_Correlation\tNormalized_CrossCorrelation_Coefficient\tRelative_CrossCorrelation_Coefficient\tQualityTag)
 | 
| 
 | 
   788 if (! is.na(iparams$output.result.file)) {
 | 
| 
 | 
   789 	cat(get.file.parts(iparams$chip.file)$fullname,
 | 
| 
 | 
   790 			chip.data$num.tags,
 | 
| 
 | 
   791 			paste(cc.peak$x,collapse=","),
 | 
| 
 | 
   792 			paste(cc.peak$y,collapse=","),
 | 
| 
 | 
   793 			crosscorr$phantom.cc$x,
 | 
| 
 | 
   794 			crosscorr$phantom.cc$y,
 | 
| 
 | 
   795 			crosscorr$min.cc$x,
 | 
| 
 | 
   796 			crosscorr$min.cc$y,
 | 
| 
 | 
   797 			crosscorr$phantom.coeff,
 | 
| 
 | 
   798 			crosscorr$rel.phantom.coeff,
 | 
| 
 | 
   799 			crosscorr$phantom.quality.tag,
 | 
| 
 | 
   800 			sep="\t",
 | 
| 
 | 
   801 			file=iparams$output.result.file,
 | 
| 
 | 
   802 			append=TRUE)
 | 
| 
 | 
   803 	cat("\n",
 | 
| 
 | 
   804 			file=iparams$output.result.file,
 | 
| 
 | 
   805 			append=TRUE)    
 | 
| 
 | 
   806 }
 | 
| 
 | 
   807 
 | 
| 
 | 
   808 # Save figure if required
 | 
| 
 | 
   809 if (! is.na(iparams$output.plot.file)) {
 | 
| 
 | 
   810 	pdf(file=iparams$output.plot.file,width=5,height=5)
 | 
| 
 | 
   811 	par(mar = c(4,3.5,2,0.5), mgp = c(1.5,0.5,0), cex = 0.8);
 | 
| 
 | 
   812 	plot(crosscorr$cross.correlation,
 | 
| 
 | 
   813 			type='l',
 | 
| 
 | 
   814 			xlab=sprintf("strand-shift (%s)",paste(cc.peak$x,collapse=",")),
 | 
| 
 | 
   815 			ylab="cross-correlation")
 | 
| 
 | 
   816 	abline(v=cc.peak$x,lty=2,col=2)
 | 
| 
 | 
   817 	abline(v=crosscorr$phantom.cc$x,lty=2,col=4)
 | 
| 
 | 
   818 	title(main=get.file.parts(iparams$chip.file)$fullname,
 | 
| 
 | 
   819 	      sub=sprintf("NSC=%g,RSC=%g,Qtag=%d",crosscorr$phantom.coeff,crosscorr$rel.phantom.coeff,crosscorr$phantom.quality.tag))
 | 
| 
 | 
   820 	dev.off();    
 | 
| 
 | 
   821 }
 | 
| 
 | 
   822 
 | 
| 
 | 
   823 # Save RData file if required
 | 
| 
 | 
   824 if (! is.na(iparams$output.rdata.file)) {
 | 
| 
 | 
   825 	save(iparams,
 | 
| 
 | 
   826 			crosscorr,
 | 
| 
 | 
   827 			cc.peak,
 | 
| 
 | 
   828 			file=iparams$output.rdata.file);    
 | 
| 
 | 
   829 }
 | 
| 
 | 
   830 
 | 
| 
 | 
   831 # #################################    
 | 
| 
 | 
   832 # Call peaks
 | 
| 
 | 
   833 # #################################
 | 
| 
 | 
   834 
 | 
| 
 | 
   835 if ( !is.na(iparams$output.npeak.file) || !is.na(iparams$output.rpeak.file) ) {
 | 
| 
 | 
   836 	
 | 
| 
 | 
   837 	# Remove local tag anomalies
 | 
| 
 | 
   838 	cat('Removing read stacks\n',file=stdout())
 | 
| 
 | 
   839 	chip.data <- remove.local.tag.anomalies(chip.data$tags)
 | 
| 
 | 
   840 	control.data <- remove.local.tag.anomalies(control.data$tags)
 | 
| 
 | 
   841 	
 | 
| 
 | 
   842 	# Open multiple processes if required
 | 
| 
 | 
   843 	if (is.na(iparams$n.nodes)) {
 | 
| 
 | 
   844 		cluster.nodes <- NULL
 | 
| 
 | 
   845 	} else {
 | 
| 
 | 
   846 		cluster.nodes <- makeCluster(iparams$n.nodes)
 | 
| 
 | 
   847 	}
 | 
| 
 | 
   848 	
 | 
| 
 | 
   849 	# Find peaks
 | 
| 
 | 
   850 	cat('Finding peaks\n',file=stdout())
 | 
| 
 | 
   851 	if (!is.na(iparams$npeak)) {
 | 
| 
 | 
   852 		iparams$fdr <- 0.96
 | 
| 
 | 
   853 	}
 | 
| 
 | 
   854 	narrow.peaks <- find.binding.positions(signal.data=chip.data,control.data=control.data,fdr=iparams$fdr,method=tag.lwcc,whs=crosscorr$whs,cluster=cluster.nodes)
 | 
| 
 | 
   855 	if (!is.na(iparams$n.nodes)) {
 | 
| 
 | 
   856 		stopCluster(cluster.nodes)
 | 
| 
 | 
   857 	}
 | 
| 
 | 
   858 	cat(paste("Detected",sum(unlist(lapply(narrow.peaks$npl,function(d) length(d$x)))),"peaks"),"\n",file=stdout())
 | 
| 
 | 
   859 	
 | 
| 
 | 
   860 	# Write to narrowPeak file
 | 
| 
 | 
   861 	if (!is.na(iparams$output.npeak.file)) {
 | 
| 
 | 
   862 		write.narrowpeak.binding(narrow.peaks,iparams$output.npeak.file,margin=round(crosscorr$whs/2),npeaks=iparams$npeak)
 | 
| 
 | 
   863 		system(paste('gzip -f ',iparams$output.npeak.file))
 | 
| 
 | 
   864 	}
 | 
| 
 | 
   865 	
 | 
| 
 | 
   866 	# Compute and write regionPeak file
 | 
| 
 | 
   867 	if (!is.na(iparams$output.rpeak.file)) {
 | 
| 
 | 
   868 		region.peaks <- add.broad.peak.regions(chip.data,control.data,narrow.peaks,window.size=max(50,round(crosscorr$whs/4)),z.thr=10)
 | 
| 
 | 
   869 		write.narrowpeak.binding(region.peaks,iparams$output.rpeak.file,margin=round(crosscorr$whs/2),npeaks=iparams$npeak)
 | 
| 
 | 
   870 		system(paste('gzip -f ',iparams$output.rpeak.file))
 | 
| 
 | 
   871 	}
 | 
| 
 | 
   872 	
 | 
| 
 | 
   873 	# Save Rdata file    
 | 
| 
 | 
   874 	if (! is.na(iparams$output.rdata.file)) {
 | 
| 
 | 
   875 		save(iparams,
 | 
| 
 | 
   876 				crosscorr,
 | 
| 
 | 
   877 				cc.peak,
 | 
| 
 | 
   878 				narrow.peaks,
 | 
| 
 | 
   879 				region.peaks,
 | 
| 
 | 
   880 				file=iparams$output.rdata.file);    
 | 
| 
 | 
   881 	}
 | 
| 
 | 
   882 	
 | 
| 
 | 
   883 }
 | 
| 
 | 
   884 
 | 
| 
 | 
   885 
 |