Mercurial > repos > galaxyp > psm_to_sam

diff PSM2SAM.R @ 0:c506e5dac2bb draft
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/bumbershoot/psm_to_sam commit b37186806a83fb3a59a1b4ccb1d44667d5224277-dirty
author: galaxyp
date: Fri, 02 Oct 2015 14:14:15 -0400
children: 34f9e847dd4e
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/PSM2SAM.R	Fri Oct 02 14:14:15 2015 -0400
@@ -0,0 +1,400 @@
+#!/usr/bin/env Rscript
+
+## begin warning handler
+withCallingHandlers({
+
+library(methods) # Because Rscript does not always do this
+
+options('useFancyQuotes' = FALSE)
+
+suppressPackageStartupMessages(library("optparse"))
+suppressPackageStartupMessages(library("RGalaxy"))
+
+
+option_list <- list()
+
+option_list$passedPSM <- make_option('--passedPSM', type='character')
+option_list$XScolumn <- make_option('--XScolumn', type='character')
+option_list$exon_anno <- make_option('--exon_anno', type='character')
+option_list$proteinseq <- make_option('--proteinseq', type='character')
+option_list$procodingseq <- make_option('--procodingseq', type='character')
+option_list$header <- make_option('--header', type='character')
+option_list$OutputFile <- make_option('--OutputFile', type='character')
+
+
+opt <- parse_args(OptionParser(option_list=option_list))
+
+
+
+
+PSMtab2SAM <- function(
+	passedPSM_file = GalaxyInputFile(required=TRUE), 
+	exon_anno_file = GalaxyInputFile(),
+	proteinseq_file = GalaxyInputFile(),
+	procodingseq_file = GalaxyInputFile(),
+	header_file = GalaxyInputFile(),
+	XScolumn = GalaxyCharacterParam(required=TRUE), 
+	OutputFile = GalaxyOutput("proSAM","sam"))
+{
+    if (length(exon_anno_file) == 0) { exon_anno_file = "/export/src/tools-galaxyp-chambm/tools/bumbershoot/psm_to_sam/tool-data/exon_anno.RData" }
+    if (length(proteinseq_file) == 0) { proteinseq_file = "/export/src/tools-galaxyp-chambm/tools/bumbershoot/psm_to_sam/tool-data/proseq.RData" }
+    if (length(procodingseq_file) == 0) { procodingseq_file = "/export/src/tools-galaxyp-chambm/tools/bumbershoot/psm_to_sam/tool-data/procodingseq.RData" }
+    if (length(header_file) == 0) { header_file = "/export/src/tools-galaxyp-chambm/tools/bumbershoot/psm_to_sam/tool-data/header_refseq_hg19.txt" }
+
+    if (!file.exists(header_file)) { gstop("failed to read header file") }
+    if (file.exists(OutputFile))
+    {
+        if (file.info(OutputFile)$size > 0) { gstop("output file already exists") }
+        else
+        {
+            tryCatch(
+            {
+                file.remove(OutputFile)
+            }, error=function(err)
+            {
+                gstop("failed to read empty existing file")
+            })
+        }
+    }
+
+    suppressPackageStartupMessages(library(GenomicRanges))
+    suppressPackageStartupMessages(library(Biostrings))
+    
+    options(stringsAsFactors=FALSE)
+
+    scoreName = XScolumn
+    columnName = gsub(":", "_", scoreName)
+    
+    passedPSM <- tryCatch({
+        #read.delim(passedPSM_file, row.names=1)
+        suppressPackageStartupMessages(library(RSQLite))
+        drv <- dbDriver("SQLite")
+        con <- dbConnect(drv, passedPSM_file)
+        
+        # do case-insensitive search for the score name
+        res <- dbSendQuery(con, paste("SELECT Id, Name FROM PeptideSpectrumMatchScoreName WHERE lower(Name)=lower('", scoreName, "')", sep=""))
+        scoreInfo = fetch(res, n=1)
+        scoreId = scoreInfo["Id"]
+        realScoreName = scoreInfo["Name"] # original case
+        dbClearResult(res)
+
+        sql <- paste("SELECT ss.Name as SourceName, s.NativeID",
+                     ", psm.ObservedNeutralMass AS precursor_neutral_mass",
+                     ", psm.Charge AS assumed_charge",
+                     ", s.ScanTimeInSeconds AS retention_time_sec",
+                     ", psm.Rank AS hit_rank",
+                     ", IFNULL(SUBSTR(pd.Sequence, pi.Offset+1, pi.Length), pep.DecoySequence) AS peptide",
+                     ", pro.Accession AS protein",
+                     ", COUNT(DISTINCT pro.Id) AS num_tot_proteins",
+                     ", psm.ObservedNeutralMass - (psm.MonoisotopicMassError - ROUND(psm.MonoisotopicMassError) * 1.0026) AS calc_neutral_pep_mass",
+                     ", psm.MonoisotopicMassError - ROUND(psm.MonoisotopicMassError) * 1.0026 AS massdiff",
+                     ", CTerminusIsSpecific+NTerminusIsSpecific AS num_tol_term",
+                     ", MissedCleavages AS num_missed_cleavages",
+                     ", psm.QValue AS qvalue",
+                     ", psmScore.Value AS ", columnName,
+                     ", GROUP_CONCAT(DISTINCT pm.Offset || ';' || mod.MonoMassDelta) AS modification",
+                     "FROM PeptideSpectrumMatch psm",
+                     "JOIN Spectrum s ON psm.Spectrum=s.Id",
+                     "JOIN SpectrumSource ss ON s.Source=ss.Id",
+                     "JOIN PeptideInstance pi ON psm.Peptide=pi.Peptide",
+                     "JOIN Protein pro ON pi.Protein=pro.Id",
+                     "JOIN Peptide pep ON pi.Peptide=pep.Id",
+                     "JOIN PeptideSpectrumMatchScore psmScore ON psmScore.PsmId=psm.Id AND ScoreNameId=", scoreId,
+                     "LEFT JOIN ProteinData pd ON pro.Id=pd.Id",
+                     "LEFT JOIN PeptideModification pm ON psm.Id=pm.PeptideSpectrumMatch",
+                     "LEFT JOIN Modification mod ON pm.Modification=mod.Id",
+                     "GROUP BY psm.Id"
+                    )
+
+        res <- dbSendQuery(con, sql)
+        passedPSM <- fetch(res, n=-1)
+        dbClearResult(res)
+        dbDisconnect(con)
+        passedPSM
+    }, error=function(err) {
+        gstop("failed to read passedPSM: ", err)
+    })
+
+    tryCatch({
+        load(exon_anno_file)
+        exon_anno <- exon
+    }, error=function(err) {
+        gstop("failed to read exon_anno: ", conditionMessage(err))
+    })
+
+    tryCatch({
+        load(proteinseq_file)
+    }, error=function(err) {
+        gstop("failed to read proteinseq: ", conditionMessage(err))
+    })
+
+    tryCatch({
+        load(procodingseq_file)
+    }, error=function(err) {
+        gstop("failed to read procodingseq: ", conditionMessage(err))
+    })
+
+    PEP <- passedPSM[, 'peptide']
+    #Spectrumid <- paste(passedPSM[, 'SourceName'], gsub(" ", "_", passedPSM[, 'NativeID']), sep="/")
+    Spectrumid <- paste(passedPSM[, 'SourceName'], passedPSM[, 'NativeID'], sep="/")
+    #PEP_SEQ <- formatPep(spectra[, 'Sequence'])
+
+    SAM <- c()
+
+    spectrumcount <- table(Spectrumid)
+
+    for(i in 1:dim(passedPSM)[1]){
+        #print(i)
+        peptide <- PEP[i]
+        QNAME <- Spectrumid[i]
+        idx <- grep(peptide, proteinseq[, 'peptide'], fixed=TRUE)
+        if(length(idx) == 0){
+            RNAME <- '*' 
+            MAPQ <- 255
+            RNEXT <- '*'
+            PNEXT <- 0
+            TLEN <- 0
+            QUAL <- '*'
+            POS <- 0
+            SEQ <- '*'
+            CIGAR <- '*'            
+            FLAG <- 0x4 
+            annoted <- '?'
+            XA <- paste('XA:Z:', annoted, sep='')
+            res <- c(FLAG, RNAME, POS, MAPQ, CIGAR, RNEXT, PNEXT, TLEN, 
+                    as.character(SEQ), QUAL, XA)
+            res <-  unique(data.frame(t(res))) 
+        }else{
+            pro <- proteinseq[idx, ]
+            sta_pos <- unlist(lapply(pro[, 'peptide'], function(x) 
+                        regexpr(peptide, x, fixed=TRUE)))
+            pep_len <- nchar(peptide)
+            end_pos <- sta_pos + pep_len - 1
+            
+            coding <- procodingseq[match(pro[, 'pro_name'], 
+                                        procodingseq[, 'pro_name']), ]
+            code_s <- (sta_pos-1) * 3 + 1
+            code_e <- end_pos * 3
+            codingseq <- substring(coding[, 'coding'], code_s, code_e)
+            
+            
+            
+            exonp <- lapply(pro[, 'tx_name'], function(x)
+                        exon_anno[exon_anno[, 'tx_name']==x, ])
+            
+            exonp <- lapply(exonp, function(x){
+                            if(length(unique(x[, 'tx_id'])) > 1){
+                                x[grep(x[1, 'tx_id'], x[, 'tx_id'], 
+                                fixed=TRUE), ]
+                            }else x
+                        })
+                       
+            if(passedPSM[i, 'hit_rank'] == 1) pri <- TRUE else pri <- FALSE
+                
+            res <- mapply(function(x, y, z, m) 
+                    if(dim(z)[1] == 0){
+                        .proteinUnannotated(x, y, z, m, primary=pri)
+                    }else{
+                        if((nchar(m) != 3*pep_len) | (y > max(z[, 'cds_end'], 
+                            na.rm = TRUE))){
+                        #if(toString(translate(DNAString(m))) != peptide){
+                            .peptideUnannotated(x, y, z, m, primary=pri)
+                        }else{
+                            .MapCoding2genome(x, y, z, m, primary=pri)
+                        }
+                    }, 
+                    code_s, code_e, exonp, codingseq) 
+                    
+            res <-  unique(data.frame(t(res)))
+            
+        }
+        XL <- paste('XL:i:', as.numeric(spectrumcount[QNAME]), sep='')
+        NH <- paste('NH:i:', dim(res)[1], sep='')
+        XP <- paste('XP:Z:', peptide, sep='')
+        #XF <- paste('XF:f:', round(passedPSM[i, XFcolumn], digits=4), sep='')
+        XC <- paste('XC:i:', passedPSM[i, 'assumed_charge'], sep='')
+        XS <- paste('XS:f:', round(as.numeric(passedPSM[i, columnName]), 
+                        digits=4), sep='')
+        #XA <- paste('XA:Z:', annoted, sep='')
+        XN <- paste('XN:i:', passedPSM[i, 'num_missed_cleavages'], sep='')
+        XT <- paste('XT:i:', passedPSM[i, 'num_tol_term'], sep='')
+
+        XM <-  ifelse(is.na(passedPSM[i, 'modification']), paste('XM:Z:-'),
+                    paste('XM:Z:', passedPSM[i, 'modification'], sep=''))
+
+        res <- cbind(QNAME, res, NH, XL, XP, XC, XS, XM, XN, XT)    
+        SAM <- rbind(SAM, res)
+    }
+
+    file.copy(header_file, OutputFile)
+    write.table(SAM, file=OutputFile, sep='\t', quote=FALSE, row.names=FALSE, col.names=FALSE, append=TRUE)
+}
+
+
+
+.proteinUnannotated <-function(c_sta, c_end, exon_anno, cseq, primary=TRUE, ...)
+{ 
+    RNAME <- '*' 
+    MAPQ <- 255
+    RNEXT <- '*'
+    PNEXT <- 0
+    TLEN <- 0
+    QUAL <- '*'
+    POS <- 0
+    SEQ <- '*'
+    CIGAR <- '*'
+    annoted <- 2
+    XA <- paste('XA:Z:', annoted, sep='')
+    FLAG <- 0x4 
+    
+    tmp <- c(FLAG, RNAME, POS, MAPQ, CIGAR, RNEXT, PNEXT, TLEN, 
+                as.character(SEQ), QUAL, XA)
+    tmp
+}
+
+
+
+.peptideUnannotated <- function(c_sta, c_end, exon_anno, cseq, primary=TRUE, ...)
+{    
+    #RNAME <- as.character(exon_anno[1, 'chromosome_name']) 
+    RNAME <- '*'
+    MAPQ <- 255
+    RNEXT <- '*'
+    PNEXT <- 0
+    TLEN <- 0
+    QUAL <- '*'
+    POS <- 0
+    SEQ <- '*'
+    CIGAR <- '*'
+    annoted <- 1
+    XA <- paste('XA:Z:', annoted, sep='')
+    FLAG <- 0x4 
+    #if(exon_anno[1, 'strand'] == '+'){        
+    #    FLAG <- ifelse(primary==TRUE, 0x00, 0x00+0x100) 
+    #}else{
+    #    FLAG <- ifelse(primary==TRUE, 0x10, 0x10+0x100)   
+    #}
+    
+    tmp <- c(FLAG, RNAME, POS, MAPQ, CIGAR, RNEXT, PNEXT, TLEN, 
+                as.character(SEQ), QUAL, XA)
+    tmp
+}
+
+
+
+.MapCoding2genome <- function(c_sta, c_end, exon_anno, cseq, primary=TRUE, ...)
+{
+    idxs <- intersect(which(exon_anno[, 'cds_start'] <= c_sta), 
+                        which(exon_anno[, 'cds_end'] >= c_sta))
+    idxe <- intersect(which(exon_anno[, 'cds_start'] <= c_end), 
+                        which(exon_anno[, 'cds_end'] >= c_end))              
+    len <- c_end - c_sta + 1 
+    RNAME <- as.character(exon_anno[1, 'chromosome_name']) 
+    MAPQ <- 255
+    RNEXT <- '*'
+    PNEXT <- 0
+    TLEN <- 0
+    QUAL <- '*'
+    annoted <- 0
+    XA <- paste('XA:Z:', annoted, sep='')
+    
+    if(exon_anno[1, 'strand'] == '+'){
+            POS <- exon_anno[idxs, 'cds_chr_start'] + c_sta - 
+                    exon_anno[idxs, 'cds_start']
+            SEQ <- DNAString(cseq)
+            FLAG <- ifelse(primary==TRUE, 0x00, 0x00+0x100) 
+            
+     }else{
+            POS <- exon_anno[idxe, 'cds_chr_start'] +  
+                    exon_anno[idxe, 'cds_end'] - c_end
+            SEQ <- reverseComplement(DNAString(cseq))
+            FLAG <- ifelse(primary==TRUE, 0x10, 0x10 + 0x100) 
+     }
+    
+    if(idxe == idxs){
+        CIGAR <- paste(len, 'M', sep='')        
+    }else{
+         if(exon_anno[1, 'strand'] == '+'){
+            #insert <- exon_anno[idxe, 'cds_chr_start'] - exon_anno[idxs, 
+            #                        'cds_chr_end']- 1
+            part1 <- exon_anno[idxs, 'cds_end'] - c_sta + 1 
+            part2 <- c_end - exon_anno[idxe, 'cds_start'] + 1  
+            
+            insert <- unlist(lapply(1:(idxe - idxs), function(x) 
+                paste(exon_anno[idxs + x, 'cds_chr_start'] - 
+                    exon_anno[idxs+x-1, 'cds_chr_end']- 1, 'N', sep='')))
+            if(idxe-idxs >1){
+                innerexon <- unlist(lapply(1:(idxe-idxs-1), function(x) 
+                        paste(exon_anno[idxs + x, 'cds_chr_end'] - 
+                        exon_anno[idxs + x, 'cds_chr_start'] + 1, 'M', 
+                        sep=''))) 
+            }else{ innerexon <- ''}
+            
+            #ifelse(idxe-idxs >1, unlist(lapply(1:(idxe-idxs-1), function(x) 
+            #paste(exon_anno[idxs+x, 'cds_chr_end'] - 
+            #             exon_anno[idxs+x, 'cds_chr_start']+1, 
+            #             'M', sep=''))), '')
+            midpattern <- rep(NA, length(insert) + length(innerexon))
+            midpattern[seq(1, length(insert) + length(innerexon), 
+                                                        by=2)] <- insert
+            midpattern[seq(2, length(insert) + length(innerexon), 
+                                                        by=2)] <- innerexon
+            midpattern <- paste(midpattern, collapse='') 
+            
+        }else{
+            #insert <- exon_anno[idxs, 'cds_chr_start'] - 
+            #            exon_anno[idxe, 'cds_chr_end']- 1
+            part1 <- c_end- exon_anno[idxe, 'cds_start'] + 1 
+            part2 <- exon_anno[idxs, 'cds_end'] - c_sta + 1  
+            
+            insert <- unlist(lapply(1:(idxe-idxs), function(x) 
+                paste(exon_anno[idxe - x, 'cds_chr_start'] - 
+                        exon_anno[idxe-x + 1, 'cds_chr_end']- 1, 
+                        'N', sep='')))
+            if(idxe-idxs >1){
+                innerexon <- unlist(lapply(1:(idxe-idxs-1), function(x) 
+                        paste(exon_anno[idxe-x, 'cds_chr_end'] - 
+                        exon_anno[idxe-x, 'cds_chr_start']+1, 'M', sep='')))
+            }else{ innerexon <-''}
+        
+            midpattern <- rep(NA, length(insert)+length(innerexon))
+            midpattern[seq(1, length(insert) + length(innerexon), 
+                                                        by=2)] <- insert
+            midpattern[seq(2, length(insert) + length(innerexon), 
+                                                        by=2)] <- innerexon
+            midpattern <- paste(midpattern, collapse='') 
+
+        }
+
+        CIGAR <- paste(part1, 'M', midpattern, part2, 'M', sep='')    
+    }
+
+    tmp <- c(FLAG, RNAME, POS, MAPQ, CIGAR, RNEXT, PNEXT, TLEN, 
+                as.character(SEQ), QUAL, XA)
+    tmp
+}
+
+
+params <- list()
+for(param in names(opt))
+{
+    if (!param == "help")
+        params[param] <- opt[param]
+}
+
+setClass("GalaxyRemoteError", contains="character")
+wrappedFunction <- function(f)
+{
+    tryCatch(do.call(f, params),
+        error=function(e) new("GalaxyRemoteError", conditionMessage(e)))
+}
+
+
+suppressPackageStartupMessages(library(RGalaxy))
+do.call(PSMtab2SAM, params)
+
+## end warning handler
+}, warning = function(w) {
+    cat(paste("Warning:", conditionMessage(w), "\n"))
+    invokeRestart("muffleWarning")
+})
author	galaxyp
date	Fri, 02 Oct 2015 14:14:15 -0400
parents
children	34f9e847dd4e