Mercurial > repos > galaxyp > psm_to_sam
view PSM2SAM.R @ 2:e1bb35f6ca28 draft
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/bumbershoot/psm_to_sam commit 141369f97aa2804d2bbfd9ed620ea2a5574994c2-dirty
author | galaxyp |
---|---|
date | Thu, 14 Jan 2016 18:13:18 -0500 |
parents | 34f9e847dd4e |
children | ce09f1a1bbad |
line wrap: on
line source
#!/usr/bin/env Rscript initial.options <- commandArgs(trailingOnly = FALSE) script_parent_dir <- dirname(sub("--file=", "", initial.options[grep("--file=", initial.options)])) ## begin warning handler withCallingHandlers({ library(methods) # Because Rscript does not always do this options('useFancyQuotes' = FALSE) suppressPackageStartupMessages(library("optparse")) suppressPackageStartupMessages(library("RGalaxy")) option_list <- list() option_list$passedPSM <- make_option('--passedPSM', type='character') option_list$XScolumn <- make_option('--XScolumn', type='character') option_list$exon_anno <- make_option('--exon_anno', type='character') option_list$proteinseq <- make_option('--proteinseq', type='character') option_list$procodingseq <- make_option('--procodingseq', type='character') option_list$header <- make_option('--header', type='character') option_list$OutputFile <- make_option('--OutputFile', type='character') opt <- parse_args(OptionParser(option_list=option_list)) PSMtab2SAM <- function( passedPSM_file = GalaxyInputFile(required=TRUE), exon_anno_file = GalaxyInputFile(), proteinseq_file = GalaxyInputFile(), procodingseq_file = GalaxyInputFile(), header_file = GalaxyInputFile(), XScolumn = GalaxyCharacterParam(required=TRUE), OutputFile = GalaxyOutput("proSAM","sam")) { if (length(exon_anno_file) == 0) { exon_anno_file = paste(script_parent_dir, "tool-data/exon_anno.RData", sep="/") } if (length(proteinseq_file) == 0) { proteinseq_file = paste(script_parent_dir, "tool-data/proseq.RData", sep="/") } if (length(procodingseq_file) == 0) { procodingseq_file = paste(script_parent_dir, "tool-data/procodingseq.RData", sep="/") } if (length(header_file) == 0) { header_file = paste(script_parent_dir, "tool-data/header_refseq_hg19.txt", sep="/") } if (!file.exists(header_file)) { gstop("failed to read header file") } if (file.exists(OutputFile)) { if (file.info(OutputFile)$size > 0) { gstop("output file already exists") } else { tryCatch( { file.remove(OutputFile) }, error=function(err) { gstop("failed to read empty existing file") }) } } suppressPackageStartupMessages(library(GenomicRanges)) suppressPackageStartupMessages(library(Biostrings)) options(stringsAsFactors=FALSE) scoreName = XScolumn columnName = gsub("[^A-Z0-9]", "_", scoreName) passedPSM <- tryCatch({ #read.delim(passedPSM_file, row.names=1) suppressPackageStartupMessages(library(RSQLite)) drv <- dbDriver("SQLite") con <- dbConnect(drv, passedPSM_file) # do case-insensitive search for the score name res <- dbSendQuery(con, paste("SELECT Id, Name FROM PeptideSpectrumMatchScoreName WHERE lower(Name)=lower('", scoreName, "')", sep="")) scoreInfo = fetch(res, n=1) scoreId = scoreInfo["Id"] realScoreName = scoreInfo["Name"] # original case dbClearResult(res) sql <- paste("SELECT ss.Name as SourceName, s.NativeID", ", psm.ObservedNeutralMass AS precursor_neutral_mass", ", psm.Charge AS assumed_charge", ", s.ScanTimeInSeconds AS retention_time_sec", ", psm.Rank AS hit_rank", ", IFNULL(SUBSTR(pd.Sequence, pi.Offset+1, pi.Length), pep.DecoySequence) AS peptide", ", pro.Accession AS protein", ", COUNT(DISTINCT pro.Id) AS num_tot_proteins", ", psm.ObservedNeutralMass - (psm.MonoisotopicMassError - ROUND(psm.MonoisotopicMassError) * 1.0026) AS calc_neutral_pep_mass", ", psm.MonoisotopicMassError - ROUND(psm.MonoisotopicMassError) * 1.0026 AS massdiff", ", CTerminusIsSpecific+NTerminusIsSpecific AS num_tol_term", ", MissedCleavages AS num_missed_cleavages", ", psm.QValue AS qvalue", ", psmScore.Value AS ", columnName, ", GROUP_CONCAT(DISTINCT pm.Offset || ';' || mod.MonoMassDelta) AS modification", "FROM PeptideSpectrumMatch psm", "JOIN Spectrum s ON psm.Spectrum=s.Id", "JOIN SpectrumSource ss ON s.Source=ss.Id", "JOIN PeptideInstance pi ON psm.Peptide=pi.Peptide", "JOIN Protein pro ON pi.Protein=pro.Id", "JOIN Peptide pep ON pi.Peptide=pep.Id", "JOIN PeptideSpectrumMatchScore psmScore ON psmScore.PsmId=psm.Id AND ScoreNameId=", scoreId, "LEFT JOIN ProteinData pd ON pro.Id=pd.Id", "LEFT JOIN PeptideModification pm ON psm.Id=pm.PeptideSpectrumMatch", "LEFT JOIN Modification mod ON pm.Modification=mod.Id", "GROUP BY psm.Id" ) res <- dbSendQuery(con, sql) passedPSM <- fetch(res, n=-1) dbClearResult(res) dbDisconnect(con) passedPSM }, error=function(err) { gstop("failed to read passedPSM: ", err) }) tryCatch({ load(exon_anno_file) exon_anno <- exon }, error=function(err) { gstop("failed to read exon_anno: ", conditionMessage(err)) }) tryCatch({ load(proteinseq_file) }, error=function(err) { gstop("failed to read proteinseq: ", conditionMessage(err)) }) tryCatch({ load(procodingseq_file) }, error=function(err) { gstop("failed to read procodingseq: ", conditionMessage(err)) }) PEP <- passedPSM[, 'peptide'] #Spectrumid <- paste(passedPSM[, 'SourceName'], gsub(" ", "_", passedPSM[, 'NativeID']), sep="/") Spectrumid <- paste(passedPSM[, 'SourceName'], passedPSM[, 'NativeID'], sep="/") #PEP_SEQ <- formatPep(spectra[, 'Sequence']) SAM <- c() spectrumcount <- table(Spectrumid) for(i in 1:dim(passedPSM)[1]){ #print(i) peptide <- PEP[i] QNAME <- Spectrumid[i] idx <- grep(peptide, proteinseq[, 'peptide'], fixed=TRUE) if(length(idx) == 0){ RNAME <- '*' MAPQ <- 255 RNEXT <- '*' PNEXT <- 0 TLEN <- 0 QUAL <- '*' POS <- 0 SEQ <- '*' CIGAR <- '*' FLAG <- 0x4 annoted <- '?' XA <- paste('XA:Z:', annoted, sep='') res <- c(FLAG, RNAME, POS, MAPQ, CIGAR, RNEXT, PNEXT, TLEN, as.character(SEQ), QUAL, XA) res <- unique(data.frame(t(res))) }else{ pro <- proteinseq[idx, ] sta_pos <- unlist(lapply(pro[, 'peptide'], function(x) regexpr(peptide, x, fixed=TRUE))) pep_len <- nchar(peptide) end_pos <- sta_pos + pep_len - 1 coding <- procodingseq[match(pro[, 'pro_name'], procodingseq[, 'pro_name']), ] code_s <- (sta_pos-1) * 3 + 1 code_e <- end_pos * 3 codingseq <- substring(coding[, 'coding'], code_s, code_e) exonp <- lapply(pro[, 'tx_name'], function(x) exon_anno[exon_anno[, 'tx_name']==x, ]) exonp <- lapply(exonp, function(x){ if(length(unique(x[, 'tx_id'])) > 1){ x[grep(x[1, 'tx_id'], x[, 'tx_id'], fixed=TRUE), ] }else x }) if(passedPSM[i, 'hit_rank'] == 1) pri <- TRUE else pri <- FALSE res <- mapply(function(x, y, z, m) if(dim(z)[1] == 0){ .proteinUnannotated(x, y, z, m, primary=pri) }else{ if((nchar(m) != 3*pep_len) | (y > max(z[, 'cds_end'], na.rm = TRUE))){ #if(toString(translate(DNAString(m))) != peptide){ .peptideUnannotated(x, y, z, m, primary=pri) }else{ .MapCoding2genome(x, y, z, m, primary=pri) } }, code_s, code_e, exonp, codingseq) res <- unique(data.frame(t(res))) } XL <- paste('XL:i:', as.numeric(spectrumcount[QNAME]), sep='') NH <- paste('NH:i:', dim(res)[1], sep='') XP <- paste('XP:Z:', peptide, sep='') #XF <- paste('XF:f:', round(passedPSM[i, XFcolumn], digits=4), sep='') XC <- paste('XC:i:', passedPSM[i, 'assumed_charge'], sep='') XS <- paste('XS:f:', round(as.numeric(passedPSM[i, columnName]), digits=4), sep='') #XA <- paste('XA:Z:', annoted, sep='') XN <- paste('XN:i:', passedPSM[i, 'num_missed_cleavages'], sep='') XT <- paste('XT:i:', passedPSM[i, 'num_tol_term'], sep='') XM <- ifelse(is.na(passedPSM[i, 'modification']), paste('XM:Z:-'), paste('XM:Z:', passedPSM[i, 'modification'], sep='')) res <- cbind(QNAME, res, NH, XL, XP, XC, XS, XM, XN, XT) SAM <- rbind(SAM, res) } file.copy(header_file, OutputFile) write.table(SAM, file=OutputFile, sep='\t', quote=FALSE, row.names=FALSE, col.names=FALSE, append=TRUE) } .proteinUnannotated <-function(c_sta, c_end, exon_anno, cseq, primary=TRUE, ...) { RNAME <- '*' MAPQ <- 255 RNEXT <- '*' PNEXT <- 0 TLEN <- 0 QUAL <- '*' POS <- 0 SEQ <- '*' CIGAR <- '*' annoted <- 2 XA <- paste('XA:Z:', annoted, sep='') FLAG <- 0x4 tmp <- c(FLAG, RNAME, POS, MAPQ, CIGAR, RNEXT, PNEXT, TLEN, as.character(SEQ), QUAL, XA) tmp } .peptideUnannotated <- function(c_sta, c_end, exon_anno, cseq, primary=TRUE, ...) { #RNAME <- as.character(exon_anno[1, 'chromosome_name']) RNAME <- '*' MAPQ <- 255 RNEXT <- '*' PNEXT <- 0 TLEN <- 0 QUAL <- '*' POS <- 0 SEQ <- '*' CIGAR <- '*' annoted <- 1 XA <- paste('XA:Z:', annoted, sep='') FLAG <- 0x4 #if(exon_anno[1, 'strand'] == '+'){ # FLAG <- ifelse(primary==TRUE, 0x00, 0x00+0x100) #}else{ # FLAG <- ifelse(primary==TRUE, 0x10, 0x10+0x100) #} tmp <- c(FLAG, RNAME, POS, MAPQ, CIGAR, RNEXT, PNEXT, TLEN, as.character(SEQ), QUAL, XA) tmp } .MapCoding2genome <- function(c_sta, c_end, exon_anno, cseq, primary=TRUE, ...) { idxs <- intersect(which(exon_anno[, 'cds_start'] <= c_sta), which(exon_anno[, 'cds_end'] >= c_sta)) idxe <- intersect(which(exon_anno[, 'cds_start'] <= c_end), which(exon_anno[, 'cds_end'] >= c_end)) len <- c_end - c_sta + 1 RNAME <- as.character(exon_anno[1, 'chromosome_name']) MAPQ <- 255 RNEXT <- '*' PNEXT <- 0 TLEN <- 0 QUAL <- '*' annoted <- 0 XA <- paste('XA:Z:', annoted, sep='') if(exon_anno[1, 'strand'] == '+'){ POS <- exon_anno[idxs, 'cds_chr_start'] + c_sta - exon_anno[idxs, 'cds_start'] SEQ <- DNAString(cseq) FLAG <- ifelse(primary==TRUE, 0x00, 0x00+0x100) }else{ POS <- exon_anno[idxe, 'cds_chr_start'] + exon_anno[idxe, 'cds_end'] - c_end SEQ <- reverseComplement(DNAString(cseq)) FLAG <- ifelse(primary==TRUE, 0x10, 0x10 + 0x100) } if(idxe == idxs){ CIGAR <- paste(len, 'M', sep='') }else{ if(exon_anno[1, 'strand'] == '+'){ #insert <- exon_anno[idxe, 'cds_chr_start'] - exon_anno[idxs, # 'cds_chr_end']- 1 part1 <- exon_anno[idxs, 'cds_end'] - c_sta + 1 part2 <- c_end - exon_anno[idxe, 'cds_start'] + 1 insert <- unlist(lapply(1:(idxe - idxs), function(x) paste(exon_anno[idxs + x, 'cds_chr_start'] - exon_anno[idxs+x-1, 'cds_chr_end']- 1, 'N', sep=''))) if(idxe-idxs >1){ innerexon <- unlist(lapply(1:(idxe-idxs-1), function(x) paste(exon_anno[idxs + x, 'cds_chr_end'] - exon_anno[idxs + x, 'cds_chr_start'] + 1, 'M', sep=''))) }else{ innerexon <- ''} #ifelse(idxe-idxs >1, unlist(lapply(1:(idxe-idxs-1), function(x) #paste(exon_anno[idxs+x, 'cds_chr_end'] - # exon_anno[idxs+x, 'cds_chr_start']+1, # 'M', sep=''))), '') midpattern <- rep(NA, length(insert) + length(innerexon)) midpattern[seq(1, length(insert) + length(innerexon), by=2)] <- insert midpattern[seq(2, length(insert) + length(innerexon), by=2)] <- innerexon midpattern <- paste(midpattern, collapse='') }else{ #insert <- exon_anno[idxs, 'cds_chr_start'] - # exon_anno[idxe, 'cds_chr_end']- 1 part1 <- c_end- exon_anno[idxe, 'cds_start'] + 1 part2 <- exon_anno[idxs, 'cds_end'] - c_sta + 1 insert <- unlist(lapply(1:(idxe-idxs), function(x) paste(exon_anno[idxe - x, 'cds_chr_start'] - exon_anno[idxe-x + 1, 'cds_chr_end']- 1, 'N', sep=''))) if(idxe-idxs >1){ innerexon <- unlist(lapply(1:(idxe-idxs-1), function(x) paste(exon_anno[idxe-x, 'cds_chr_end'] - exon_anno[idxe-x, 'cds_chr_start']+1, 'M', sep=''))) }else{ innerexon <-''} midpattern <- rep(NA, length(insert)+length(innerexon)) midpattern[seq(1, length(insert) + length(innerexon), by=2)] <- insert midpattern[seq(2, length(insert) + length(innerexon), by=2)] <- innerexon midpattern <- paste(midpattern, collapse='') } CIGAR <- paste(part1, 'M', midpattern, part2, 'M', sep='') } tmp <- c(FLAG, RNAME, POS, MAPQ, CIGAR, RNEXT, PNEXT, TLEN, as.character(SEQ), QUAL, XA) tmp } params <- list() for(param in names(opt)) { if (!param == "help") params[param] <- opt[param] } setClass("GalaxyRemoteError", contains="character") wrappedFunction <- function(f) { tryCatch(do.call(f, params), error=function(e) new("GalaxyRemoteError", conditionMessage(e))) } suppressPackageStartupMessages(library(RGalaxy)) do.call(PSMtab2SAM, params) ## end warning handler }, warning = function(w) { cat(paste("Warning:", conditionMessage(w), "\n")) invokeRestart("muffleWarning") })