Mercurial > repos > galaxyp > psm_to_sam
diff PSM2SAM.R @ 0:c506e5dac2bb draft
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/bumbershoot/psm_to_sam commit b37186806a83fb3a59a1b4ccb1d44667d5224277-dirty
author | galaxyp |
---|---|
date | Fri, 02 Oct 2015 14:14:15 -0400 |
parents | |
children | 34f9e847dd4e |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PSM2SAM.R Fri Oct 02 14:14:15 2015 -0400 @@ -0,0 +1,400 @@ +#!/usr/bin/env Rscript + +## begin warning handler +withCallingHandlers({ + +library(methods) # Because Rscript does not always do this + +options('useFancyQuotes' = FALSE) + +suppressPackageStartupMessages(library("optparse")) +suppressPackageStartupMessages(library("RGalaxy")) + + +option_list <- list() + +option_list$passedPSM <- make_option('--passedPSM', type='character') +option_list$XScolumn <- make_option('--XScolumn', type='character') +option_list$exon_anno <- make_option('--exon_anno', type='character') +option_list$proteinseq <- make_option('--proteinseq', type='character') +option_list$procodingseq <- make_option('--procodingseq', type='character') +option_list$header <- make_option('--header', type='character') +option_list$OutputFile <- make_option('--OutputFile', type='character') + + +opt <- parse_args(OptionParser(option_list=option_list)) + + + + +PSMtab2SAM <- function( + passedPSM_file = GalaxyInputFile(required=TRUE), + exon_anno_file = GalaxyInputFile(), + proteinseq_file = GalaxyInputFile(), + procodingseq_file = GalaxyInputFile(), + header_file = GalaxyInputFile(), + XScolumn = GalaxyCharacterParam(required=TRUE), + OutputFile = GalaxyOutput("proSAM","sam")) +{ + if (length(exon_anno_file) == 0) { exon_anno_file = "/export/src/tools-galaxyp-chambm/tools/bumbershoot/psm_to_sam/tool-data/exon_anno.RData" } + if (length(proteinseq_file) == 0) { proteinseq_file = "/export/src/tools-galaxyp-chambm/tools/bumbershoot/psm_to_sam/tool-data/proseq.RData" } + if (length(procodingseq_file) == 0) { procodingseq_file = "/export/src/tools-galaxyp-chambm/tools/bumbershoot/psm_to_sam/tool-data/procodingseq.RData" } + if (length(header_file) == 0) { header_file = "/export/src/tools-galaxyp-chambm/tools/bumbershoot/psm_to_sam/tool-data/header_refseq_hg19.txt" } + + if (!file.exists(header_file)) { gstop("failed to read header file") } + if (file.exists(OutputFile)) + { + if (file.info(OutputFile)$size > 0) { gstop("output file already exists") } + else + { + tryCatch( + { + file.remove(OutputFile) + }, error=function(err) + { + gstop("failed to read empty existing file") + }) + } + } + + suppressPackageStartupMessages(library(GenomicRanges)) + suppressPackageStartupMessages(library(Biostrings)) + + options(stringsAsFactors=FALSE) + + scoreName = XScolumn + columnName = gsub(":", "_", scoreName) + + passedPSM <- tryCatch({ + #read.delim(passedPSM_file, row.names=1) + suppressPackageStartupMessages(library(RSQLite)) + drv <- dbDriver("SQLite") + con <- dbConnect(drv, passedPSM_file) + + # do case-insensitive search for the score name + res <- dbSendQuery(con, paste("SELECT Id, Name FROM PeptideSpectrumMatchScoreName WHERE lower(Name)=lower('", scoreName, "')", sep="")) + scoreInfo = fetch(res, n=1) + scoreId = scoreInfo["Id"] + realScoreName = scoreInfo["Name"] # original case + dbClearResult(res) + + sql <- paste("SELECT ss.Name as SourceName, s.NativeID", + ", psm.ObservedNeutralMass AS precursor_neutral_mass", + ", psm.Charge AS assumed_charge", + ", s.ScanTimeInSeconds AS retention_time_sec", + ", psm.Rank AS hit_rank", + ", IFNULL(SUBSTR(pd.Sequence, pi.Offset+1, pi.Length), pep.DecoySequence) AS peptide", + ", pro.Accession AS protein", + ", COUNT(DISTINCT pro.Id) AS num_tot_proteins", + ", psm.ObservedNeutralMass - (psm.MonoisotopicMassError - ROUND(psm.MonoisotopicMassError) * 1.0026) AS calc_neutral_pep_mass", + ", psm.MonoisotopicMassError - ROUND(psm.MonoisotopicMassError) * 1.0026 AS massdiff", + ", CTerminusIsSpecific+NTerminusIsSpecific AS num_tol_term", + ", MissedCleavages AS num_missed_cleavages", + ", psm.QValue AS qvalue", + ", psmScore.Value AS ", columnName, + ", GROUP_CONCAT(DISTINCT pm.Offset || ';' || mod.MonoMassDelta) AS modification", + "FROM PeptideSpectrumMatch psm", + "JOIN Spectrum s ON psm.Spectrum=s.Id", + "JOIN SpectrumSource ss ON s.Source=ss.Id", + "JOIN PeptideInstance pi ON psm.Peptide=pi.Peptide", + "JOIN Protein pro ON pi.Protein=pro.Id", + "JOIN Peptide pep ON pi.Peptide=pep.Id", + "JOIN PeptideSpectrumMatchScore psmScore ON psmScore.PsmId=psm.Id AND ScoreNameId=", scoreId, + "LEFT JOIN ProteinData pd ON pro.Id=pd.Id", + "LEFT JOIN PeptideModification pm ON psm.Id=pm.PeptideSpectrumMatch", + "LEFT JOIN Modification mod ON pm.Modification=mod.Id", + "GROUP BY psm.Id" + ) + + res <- dbSendQuery(con, sql) + passedPSM <- fetch(res, n=-1) + dbClearResult(res) + dbDisconnect(con) + passedPSM + }, error=function(err) { + gstop("failed to read passedPSM: ", err) + }) + + tryCatch({ + load(exon_anno_file) + exon_anno <- exon + }, error=function(err) { + gstop("failed to read exon_anno: ", conditionMessage(err)) + }) + + tryCatch({ + load(proteinseq_file) + }, error=function(err) { + gstop("failed to read proteinseq: ", conditionMessage(err)) + }) + + tryCatch({ + load(procodingseq_file) + }, error=function(err) { + gstop("failed to read procodingseq: ", conditionMessage(err)) + }) + + PEP <- passedPSM[, 'peptide'] + #Spectrumid <- paste(passedPSM[, 'SourceName'], gsub(" ", "_", passedPSM[, 'NativeID']), sep="/") + Spectrumid <- paste(passedPSM[, 'SourceName'], passedPSM[, 'NativeID'], sep="/") + #PEP_SEQ <- formatPep(spectra[, 'Sequence']) + + SAM <- c() + + spectrumcount <- table(Spectrumid) + + for(i in 1:dim(passedPSM)[1]){ + #print(i) + peptide <- PEP[i] + QNAME <- Spectrumid[i] + idx <- grep(peptide, proteinseq[, 'peptide'], fixed=TRUE) + if(length(idx) == 0){ + RNAME <- '*' + MAPQ <- 255 + RNEXT <- '*' + PNEXT <- 0 + TLEN <- 0 + QUAL <- '*' + POS <- 0 + SEQ <- '*' + CIGAR <- '*' + FLAG <- 0x4 + annoted <- '?' + XA <- paste('XA:Z:', annoted, sep='') + res <- c(FLAG, RNAME, POS, MAPQ, CIGAR, RNEXT, PNEXT, TLEN, + as.character(SEQ), QUAL, XA) + res <- unique(data.frame(t(res))) + }else{ + pro <- proteinseq[idx, ] + sta_pos <- unlist(lapply(pro[, 'peptide'], function(x) + regexpr(peptide, x, fixed=TRUE))) + pep_len <- nchar(peptide) + end_pos <- sta_pos + pep_len - 1 + + coding <- procodingseq[match(pro[, 'pro_name'], + procodingseq[, 'pro_name']), ] + code_s <- (sta_pos-1) * 3 + 1 + code_e <- end_pos * 3 + codingseq <- substring(coding[, 'coding'], code_s, code_e) + + + + exonp <- lapply(pro[, 'tx_name'], function(x) + exon_anno[exon_anno[, 'tx_name']==x, ]) + + exonp <- lapply(exonp, function(x){ + if(length(unique(x[, 'tx_id'])) > 1){ + x[grep(x[1, 'tx_id'], x[, 'tx_id'], + fixed=TRUE), ] + }else x + }) + + if(passedPSM[i, 'hit_rank'] == 1) pri <- TRUE else pri <- FALSE + + res <- mapply(function(x, y, z, m) + if(dim(z)[1] == 0){ + .proteinUnannotated(x, y, z, m, primary=pri) + }else{ + if((nchar(m) != 3*pep_len) | (y > max(z[, 'cds_end'], + na.rm = TRUE))){ + #if(toString(translate(DNAString(m))) != peptide){ + .peptideUnannotated(x, y, z, m, primary=pri) + }else{ + .MapCoding2genome(x, y, z, m, primary=pri) + } + }, + code_s, code_e, exonp, codingseq) + + res <- unique(data.frame(t(res))) + + } + XL <- paste('XL:i:', as.numeric(spectrumcount[QNAME]), sep='') + NH <- paste('NH:i:', dim(res)[1], sep='') + XP <- paste('XP:Z:', peptide, sep='') + #XF <- paste('XF:f:', round(passedPSM[i, XFcolumn], digits=4), sep='') + XC <- paste('XC:i:', passedPSM[i, 'assumed_charge'], sep='') + XS <- paste('XS:f:', round(as.numeric(passedPSM[i, columnName]), + digits=4), sep='') + #XA <- paste('XA:Z:', annoted, sep='') + XN <- paste('XN:i:', passedPSM[i, 'num_missed_cleavages'], sep='') + XT <- paste('XT:i:', passedPSM[i, 'num_tol_term'], sep='') + + XM <- ifelse(is.na(passedPSM[i, 'modification']), paste('XM:Z:-'), + paste('XM:Z:', passedPSM[i, 'modification'], sep='')) + + res <- cbind(QNAME, res, NH, XL, XP, XC, XS, XM, XN, XT) + SAM <- rbind(SAM, res) + } + + file.copy(header_file, OutputFile) + write.table(SAM, file=OutputFile, sep='\t', quote=FALSE, row.names=FALSE, col.names=FALSE, append=TRUE) +} + + + +.proteinUnannotated <-function(c_sta, c_end, exon_anno, cseq, primary=TRUE, ...) +{ + RNAME <- '*' + MAPQ <- 255 + RNEXT <- '*' + PNEXT <- 0 + TLEN <- 0 + QUAL <- '*' + POS <- 0 + SEQ <- '*' + CIGAR <- '*' + annoted <- 2 + XA <- paste('XA:Z:', annoted, sep='') + FLAG <- 0x4 + + tmp <- c(FLAG, RNAME, POS, MAPQ, CIGAR, RNEXT, PNEXT, TLEN, + as.character(SEQ), QUAL, XA) + tmp +} + + + +.peptideUnannotated <- function(c_sta, c_end, exon_anno, cseq, primary=TRUE, ...) +{ + #RNAME <- as.character(exon_anno[1, 'chromosome_name']) + RNAME <- '*' + MAPQ <- 255 + RNEXT <- '*' + PNEXT <- 0 + TLEN <- 0 + QUAL <- '*' + POS <- 0 + SEQ <- '*' + CIGAR <- '*' + annoted <- 1 + XA <- paste('XA:Z:', annoted, sep='') + FLAG <- 0x4 + #if(exon_anno[1, 'strand'] == '+'){ + # FLAG <- ifelse(primary==TRUE, 0x00, 0x00+0x100) + #}else{ + # FLAG <- ifelse(primary==TRUE, 0x10, 0x10+0x100) + #} + + tmp <- c(FLAG, RNAME, POS, MAPQ, CIGAR, RNEXT, PNEXT, TLEN, + as.character(SEQ), QUAL, XA) + tmp +} + + + +.MapCoding2genome <- function(c_sta, c_end, exon_anno, cseq, primary=TRUE, ...) +{ + idxs <- intersect(which(exon_anno[, 'cds_start'] <= c_sta), + which(exon_anno[, 'cds_end'] >= c_sta)) + idxe <- intersect(which(exon_anno[, 'cds_start'] <= c_end), + which(exon_anno[, 'cds_end'] >= c_end)) + len <- c_end - c_sta + 1 + RNAME <- as.character(exon_anno[1, 'chromosome_name']) + MAPQ <- 255 + RNEXT <- '*' + PNEXT <- 0 + TLEN <- 0 + QUAL <- '*' + annoted <- 0 + XA <- paste('XA:Z:', annoted, sep='') + + if(exon_anno[1, 'strand'] == '+'){ + POS <- exon_anno[idxs, 'cds_chr_start'] + c_sta - + exon_anno[idxs, 'cds_start'] + SEQ <- DNAString(cseq) + FLAG <- ifelse(primary==TRUE, 0x00, 0x00+0x100) + + }else{ + POS <- exon_anno[idxe, 'cds_chr_start'] + + exon_anno[idxe, 'cds_end'] - c_end + SEQ <- reverseComplement(DNAString(cseq)) + FLAG <- ifelse(primary==TRUE, 0x10, 0x10 + 0x100) + } + + if(idxe == idxs){ + CIGAR <- paste(len, 'M', sep='') + }else{ + if(exon_anno[1, 'strand'] == '+'){ + #insert <- exon_anno[idxe, 'cds_chr_start'] - exon_anno[idxs, + # 'cds_chr_end']- 1 + part1 <- exon_anno[idxs, 'cds_end'] - c_sta + 1 + part2 <- c_end - exon_anno[idxe, 'cds_start'] + 1 + + insert <- unlist(lapply(1:(idxe - idxs), function(x) + paste(exon_anno[idxs + x, 'cds_chr_start'] - + exon_anno[idxs+x-1, 'cds_chr_end']- 1, 'N', sep=''))) + if(idxe-idxs >1){ + innerexon <- unlist(lapply(1:(idxe-idxs-1), function(x) + paste(exon_anno[idxs + x, 'cds_chr_end'] - + exon_anno[idxs + x, 'cds_chr_start'] + 1, 'M', + sep=''))) + }else{ innerexon <- ''} + + #ifelse(idxe-idxs >1, unlist(lapply(1:(idxe-idxs-1), function(x) + #paste(exon_anno[idxs+x, 'cds_chr_end'] - + # exon_anno[idxs+x, 'cds_chr_start']+1, + # 'M', sep=''))), '') + midpattern <- rep(NA, length(insert) + length(innerexon)) + midpattern[seq(1, length(insert) + length(innerexon), + by=2)] <- insert + midpattern[seq(2, length(insert) + length(innerexon), + by=2)] <- innerexon + midpattern <- paste(midpattern, collapse='') + + }else{ + #insert <- exon_anno[idxs, 'cds_chr_start'] - + # exon_anno[idxe, 'cds_chr_end']- 1 + part1 <- c_end- exon_anno[idxe, 'cds_start'] + 1 + part2 <- exon_anno[idxs, 'cds_end'] - c_sta + 1 + + insert <- unlist(lapply(1:(idxe-idxs), function(x) + paste(exon_anno[idxe - x, 'cds_chr_start'] - + exon_anno[idxe-x + 1, 'cds_chr_end']- 1, + 'N', sep=''))) + if(idxe-idxs >1){ + innerexon <- unlist(lapply(1:(idxe-idxs-1), function(x) + paste(exon_anno[idxe-x, 'cds_chr_end'] - + exon_anno[idxe-x, 'cds_chr_start']+1, 'M', sep=''))) + }else{ innerexon <-''} + + midpattern <- rep(NA, length(insert)+length(innerexon)) + midpattern[seq(1, length(insert) + length(innerexon), + by=2)] <- insert + midpattern[seq(2, length(insert) + length(innerexon), + by=2)] <- innerexon + midpattern <- paste(midpattern, collapse='') + + } + + CIGAR <- paste(part1, 'M', midpattern, part2, 'M', sep='') + } + + tmp <- c(FLAG, RNAME, POS, MAPQ, CIGAR, RNEXT, PNEXT, TLEN, + as.character(SEQ), QUAL, XA) + tmp +} + + +params <- list() +for(param in names(opt)) +{ + if (!param == "help") + params[param] <- opt[param] +} + +setClass("GalaxyRemoteError", contains="character") +wrappedFunction <- function(f) +{ + tryCatch(do.call(f, params), + error=function(e) new("GalaxyRemoteError", conditionMessage(e))) +} + + +suppressPackageStartupMessages(library(RGalaxy)) +do.call(PSMtab2SAM, params) + +## end warning handler +}, warning = function(w) { + cat(paste("Warning:", conditionMessage(w), "\n")) + invokeRestart("muffleWarning") +})