view PSM2SAM.R @ 1:34f9e847dd4e draft

planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/bumbershoot/psm_to_sam commit cf9086847f3153e28b697b5f4b0da1a677eb165a-dirty
author galaxyp
date Thu, 12 Nov 2015 18:11:28 -0500
parents c506e5dac2bb
children ce09f1a1bbad
line wrap: on
line source

#!/usr/bin/env Rscript

initial.options <- commandArgs(trailingOnly = FALSE)
script_parent_dir <- dirname(sub("--file=", "", initial.options[grep("--file=", initial.options)]))

## begin warning handler
withCallingHandlers({

library(methods) # Because Rscript does not always do this

options('useFancyQuotes' = FALSE)

suppressPackageStartupMessages(library("optparse"))
suppressPackageStartupMessages(library("RGalaxy"))


option_list <- list()

option_list$passedPSM <- make_option('--passedPSM', type='character')
option_list$XScolumn <- make_option('--XScolumn', type='character')
option_list$exon_anno <- make_option('--exon_anno', type='character')
option_list$proteinseq <- make_option('--proteinseq', type='character')
option_list$procodingseq <- make_option('--procodingseq', type='character')
option_list$header <- make_option('--header', type='character')
option_list$OutputFile <- make_option('--OutputFile', type='character')


opt <- parse_args(OptionParser(option_list=option_list))




PSMtab2SAM <- function(
	passedPSM_file = GalaxyInputFile(required=TRUE), 
	exon_anno_file = GalaxyInputFile(),
	proteinseq_file = GalaxyInputFile(),
	procodingseq_file = GalaxyInputFile(),
	header_file = GalaxyInputFile(),
	XScolumn = GalaxyCharacterParam(required=TRUE), 
	OutputFile = GalaxyOutput("proSAM","sam"))
{
    if (length(exon_anno_file) == 0) { exon_anno_file = paste(script_parent_dir, "tool-data/exon_anno.RData", sep="/") }
    if (length(proteinseq_file) == 0) { proteinseq_file = paste(script_parent_dir, "tool-data/proseq.RData", sep="/") }
    if (length(procodingseq_file) == 0) { procodingseq_file = paste(script_parent_dir, "tool-data/procodingseq.RData", sep="/") }
    if (length(header_file) == 0) { header_file = paste(script_parent_dir, "tool-data/header_refseq_hg19.txt", sep="/") }

    if (!file.exists(header_file)) { gstop("failed to read header file") }
    if (file.exists(OutputFile))
    {
        if (file.info(OutputFile)$size > 0) { gstop("output file already exists") }
        else
        {
            tryCatch(
            {
                file.remove(OutputFile)
            }, error=function(err)
            {
                gstop("failed to read empty existing file")
            })
        }
    }

    suppressPackageStartupMessages(library(GenomicRanges))
    suppressPackageStartupMessages(library(Biostrings))
    
    options(stringsAsFactors=FALSE)

    scoreName = XScolumn
    columnName = gsub("[^A-Z0-9]", "_", scoreName)
    
    passedPSM <- tryCatch({
        #read.delim(passedPSM_file, row.names=1)
        suppressPackageStartupMessages(library(RSQLite))
        drv <- dbDriver("SQLite")
        con <- dbConnect(drv, passedPSM_file)
        
        # do case-insensitive search for the score name
        res <- dbSendQuery(con, paste("SELECT Id, Name FROM PeptideSpectrumMatchScoreName WHERE lower(Name)=lower('", scoreName, "')", sep=""))
        scoreInfo = fetch(res, n=1)
        scoreId = scoreInfo["Id"]
        realScoreName = scoreInfo["Name"] # original case
        dbClearResult(res)

        sql <- paste("SELECT ss.Name as SourceName, s.NativeID",
                     ", psm.ObservedNeutralMass AS precursor_neutral_mass",
                     ", psm.Charge AS assumed_charge",
                     ", s.ScanTimeInSeconds AS retention_time_sec",
                     ", psm.Rank AS hit_rank",
                     ", IFNULL(SUBSTR(pd.Sequence, pi.Offset+1, pi.Length), pep.DecoySequence) AS peptide",
                     ", pro.Accession AS protein",
                     ", COUNT(DISTINCT pro.Id) AS num_tot_proteins",
                     ", psm.ObservedNeutralMass - (psm.MonoisotopicMassError - ROUND(psm.MonoisotopicMassError) * 1.0026) AS calc_neutral_pep_mass",
                     ", psm.MonoisotopicMassError - ROUND(psm.MonoisotopicMassError) * 1.0026 AS massdiff",
                     ", CTerminusIsSpecific+NTerminusIsSpecific AS num_tol_term",
                     ", MissedCleavages AS num_missed_cleavages",
                     ", psm.QValue AS qvalue",
                     ", psmScore.Value AS ", columnName,
                     ", GROUP_CONCAT(DISTINCT pm.Offset || ';' || mod.MonoMassDelta) AS modification",
                     "FROM PeptideSpectrumMatch psm",
                     "JOIN Spectrum s ON psm.Spectrum=s.Id",
                     "JOIN SpectrumSource ss ON s.Source=ss.Id",
                     "JOIN PeptideInstance pi ON psm.Peptide=pi.Peptide",
                     "JOIN Protein pro ON pi.Protein=pro.Id",
                     "JOIN Peptide pep ON pi.Peptide=pep.Id",
                     "JOIN PeptideSpectrumMatchScore psmScore ON psmScore.PsmId=psm.Id AND ScoreNameId=", scoreId,
                     "LEFT JOIN ProteinData pd ON pro.Id=pd.Id",
                     "LEFT JOIN PeptideModification pm ON psm.Id=pm.PeptideSpectrumMatch",
                     "LEFT JOIN Modification mod ON pm.Modification=mod.Id",
                     "GROUP BY psm.Id"
                    )

        res <- dbSendQuery(con, sql)
        passedPSM <- fetch(res, n=-1)
        dbClearResult(res)
        dbDisconnect(con)
        passedPSM
    }, error=function(err) {
        gstop("failed to read passedPSM: ", err)
    })

    tryCatch({
        load(exon_anno_file)
        exon_anno <- exon
    }, error=function(err) {
        gstop("failed to read exon_anno: ", conditionMessage(err))
    })

    tryCatch({
        load(proteinseq_file)
    }, error=function(err) {
        gstop("failed to read proteinseq: ", conditionMessage(err))
    })

    tryCatch({
        load(procodingseq_file)
    }, error=function(err) {
        gstop("failed to read procodingseq: ", conditionMessage(err))
    })

    PEP <- passedPSM[, 'peptide']
    #Spectrumid <- paste(passedPSM[, 'SourceName'], gsub(" ", "_", passedPSM[, 'NativeID']), sep="/")
    Spectrumid <- paste(passedPSM[, 'SourceName'], passedPSM[, 'NativeID'], sep="/")
    #PEP_SEQ <- formatPep(spectra[, 'Sequence'])

    SAM <- c()

    spectrumcount <- table(Spectrumid)

    for(i in 1:dim(passedPSM)[1]){
        #print(i)
        peptide <- PEP[i]
        QNAME <- Spectrumid[i]
        idx <- grep(peptide, proteinseq[, 'peptide'], fixed=TRUE)
        if(length(idx) == 0){
            RNAME <- '*' 
            MAPQ <- 255
            RNEXT <- '*'
            PNEXT <- 0
            TLEN <- 0
            QUAL <- '*'
            POS <- 0
            SEQ <- '*'
            CIGAR <- '*'            
            FLAG <- 0x4 
            annoted <- '?'
            XA <- paste('XA:Z:', annoted, sep='')
            res <- c(FLAG, RNAME, POS, MAPQ, CIGAR, RNEXT, PNEXT, TLEN, 
                    as.character(SEQ), QUAL, XA)
            res <-  unique(data.frame(t(res))) 
        }else{
            pro <- proteinseq[idx, ]
            sta_pos <- unlist(lapply(pro[, 'peptide'], function(x) 
                        regexpr(peptide, x, fixed=TRUE)))
            pep_len <- nchar(peptide)
            end_pos <- sta_pos + pep_len - 1
            
            coding <- procodingseq[match(pro[, 'pro_name'], 
                                        procodingseq[, 'pro_name']), ]
            code_s <- (sta_pos-1) * 3 + 1
            code_e <- end_pos * 3
            codingseq <- substring(coding[, 'coding'], code_s, code_e)
            
            
            
            exonp <- lapply(pro[, 'tx_name'], function(x)
                        exon_anno[exon_anno[, 'tx_name']==x, ])
            
            exonp <- lapply(exonp, function(x){
                            if(length(unique(x[, 'tx_id'])) > 1){
                                x[grep(x[1, 'tx_id'], x[, 'tx_id'], 
                                fixed=TRUE), ]
                            }else x
                        })
                       
            if(passedPSM[i, 'hit_rank'] == 1) pri <- TRUE else pri <- FALSE
                
            res <- mapply(function(x, y, z, m) 
                    if(dim(z)[1] == 0){
                        .proteinUnannotated(x, y, z, m, primary=pri)
                    }else{
                        if((nchar(m) != 3*pep_len) | (y > max(z[, 'cds_end'], 
                            na.rm = TRUE))){
                        #if(toString(translate(DNAString(m))) != peptide){
                            .peptideUnannotated(x, y, z, m, primary=pri)
                        }else{
                            .MapCoding2genome(x, y, z, m, primary=pri)
                        }
                    }, 
                    code_s, code_e, exonp, codingseq) 
                    
            res <-  unique(data.frame(t(res)))
            
        }
        XL <- paste('XL:i:', as.numeric(spectrumcount[QNAME]), sep='')
        NH <- paste('NH:i:', dim(res)[1], sep='')
        XP <- paste('XP:Z:', peptide, sep='')
        #XF <- paste('XF:f:', round(passedPSM[i, XFcolumn], digits=4), sep='')
        XC <- paste('XC:i:', passedPSM[i, 'assumed_charge'], sep='')
        XS <- paste('XS:f:', round(as.numeric(passedPSM[i, columnName]), 
                        digits=4), sep='')
        #XA <- paste('XA:Z:', annoted, sep='')
        XN <- paste('XN:i:', passedPSM[i, 'num_missed_cleavages'], sep='')
        XT <- paste('XT:i:', passedPSM[i, 'num_tol_term'], sep='')

        XM <-  ifelse(is.na(passedPSM[i, 'modification']), paste('XM:Z:-'),
                    paste('XM:Z:', passedPSM[i, 'modification'], sep=''))

        res <- cbind(QNAME, res, NH, XL, XP, XC, XS, XM, XN, XT)    
        SAM <- rbind(SAM, res)
    }

    file.copy(header_file, OutputFile)
    write.table(SAM, file=OutputFile, sep='\t', quote=FALSE, row.names=FALSE, col.names=FALSE, append=TRUE)
}



.proteinUnannotated <-function(c_sta, c_end, exon_anno, cseq, primary=TRUE, ...)
{ 
    RNAME <- '*' 
    MAPQ <- 255
    RNEXT <- '*'
    PNEXT <- 0
    TLEN <- 0
    QUAL <- '*'
    POS <- 0
    SEQ <- '*'
    CIGAR <- '*'
    annoted <- 2
    XA <- paste('XA:Z:', annoted, sep='')
    FLAG <- 0x4 
    
    tmp <- c(FLAG, RNAME, POS, MAPQ, CIGAR, RNEXT, PNEXT, TLEN, 
                as.character(SEQ), QUAL, XA)
    tmp
}



.peptideUnannotated <- function(c_sta, c_end, exon_anno, cseq, primary=TRUE, ...)
{    
    #RNAME <- as.character(exon_anno[1, 'chromosome_name']) 
    RNAME <- '*'
    MAPQ <- 255
    RNEXT <- '*'
    PNEXT <- 0
    TLEN <- 0
    QUAL <- '*'
    POS <- 0
    SEQ <- '*'
    CIGAR <- '*'
    annoted <- 1
    XA <- paste('XA:Z:', annoted, sep='')
    FLAG <- 0x4 
    #if(exon_anno[1, 'strand'] == '+'){        
    #    FLAG <- ifelse(primary==TRUE, 0x00, 0x00+0x100) 
    #}else{
    #    FLAG <- ifelse(primary==TRUE, 0x10, 0x10+0x100)   
    #}
    
    tmp <- c(FLAG, RNAME, POS, MAPQ, CIGAR, RNEXT, PNEXT, TLEN, 
                as.character(SEQ), QUAL, XA)
    tmp
}



.MapCoding2genome <- function(c_sta, c_end, exon_anno, cseq, primary=TRUE, ...)
{
    idxs <- intersect(which(exon_anno[, 'cds_start'] <= c_sta), 
                        which(exon_anno[, 'cds_end'] >= c_sta))
    idxe <- intersect(which(exon_anno[, 'cds_start'] <= c_end), 
                        which(exon_anno[, 'cds_end'] >= c_end))              
    len <- c_end - c_sta + 1 
    RNAME <- as.character(exon_anno[1, 'chromosome_name']) 
    MAPQ <- 255
    RNEXT <- '*'
    PNEXT <- 0
    TLEN <- 0
    QUAL <- '*'
    annoted <- 0
    XA <- paste('XA:Z:', annoted, sep='')
    
    if(exon_anno[1, 'strand'] == '+'){
            POS <- exon_anno[idxs, 'cds_chr_start'] + c_sta - 
                    exon_anno[idxs, 'cds_start']
            SEQ <- DNAString(cseq)
            FLAG <- ifelse(primary==TRUE, 0x00, 0x00+0x100) 
            
     }else{
            POS <- exon_anno[idxe, 'cds_chr_start'] +  
                    exon_anno[idxe, 'cds_end'] - c_end
            SEQ <- reverseComplement(DNAString(cseq))
            FLAG <- ifelse(primary==TRUE, 0x10, 0x10 + 0x100) 
     }
    
    if(idxe == idxs){
        CIGAR <- paste(len, 'M', sep='')        
    }else{
         if(exon_anno[1, 'strand'] == '+'){
            #insert <- exon_anno[idxe, 'cds_chr_start'] - exon_anno[idxs, 
            #                        'cds_chr_end']- 1
            part1 <- exon_anno[idxs, 'cds_end'] - c_sta + 1 
            part2 <- c_end - exon_anno[idxe, 'cds_start'] + 1  
            
            insert <- unlist(lapply(1:(idxe - idxs), function(x) 
                paste(exon_anno[idxs + x, 'cds_chr_start'] - 
                    exon_anno[idxs+x-1, 'cds_chr_end']- 1, 'N', sep='')))
            if(idxe-idxs >1){
                innerexon <- unlist(lapply(1:(idxe-idxs-1), function(x) 
                        paste(exon_anno[idxs + x, 'cds_chr_end'] - 
                        exon_anno[idxs + x, 'cds_chr_start'] + 1, 'M', 
                        sep=''))) 
            }else{ innerexon <- ''}
            
            #ifelse(idxe-idxs >1, unlist(lapply(1:(idxe-idxs-1), function(x) 
            #paste(exon_anno[idxs+x, 'cds_chr_end'] - 
            #             exon_anno[idxs+x, 'cds_chr_start']+1, 
            #             'M', sep=''))), '')
            midpattern <- rep(NA, length(insert) + length(innerexon))
            midpattern[seq(1, length(insert) + length(innerexon), 
                                                        by=2)] <- insert
            midpattern[seq(2, length(insert) + length(innerexon), 
                                                        by=2)] <- innerexon
            midpattern <- paste(midpattern, collapse='') 
            
        }else{
            #insert <- exon_anno[idxs, 'cds_chr_start'] - 
            #            exon_anno[idxe, 'cds_chr_end']- 1
            part1 <- c_end- exon_anno[idxe, 'cds_start'] + 1 
            part2 <- exon_anno[idxs, 'cds_end'] - c_sta + 1  
            
            insert <- unlist(lapply(1:(idxe-idxs), function(x) 
                paste(exon_anno[idxe - x, 'cds_chr_start'] - 
                        exon_anno[idxe-x + 1, 'cds_chr_end']- 1, 
                        'N', sep='')))
            if(idxe-idxs >1){
                innerexon <- unlist(lapply(1:(idxe-idxs-1), function(x) 
                        paste(exon_anno[idxe-x, 'cds_chr_end'] - 
                        exon_anno[idxe-x, 'cds_chr_start']+1, 'M', sep='')))
            }else{ innerexon <-''}
        
            midpattern <- rep(NA, length(insert)+length(innerexon))
            midpattern[seq(1, length(insert) + length(innerexon), 
                                                        by=2)] <- insert
            midpattern[seq(2, length(insert) + length(innerexon), 
                                                        by=2)] <- innerexon
            midpattern <- paste(midpattern, collapse='') 

        }

        CIGAR <- paste(part1, 'M', midpattern, part2, 'M', sep='')    
    }

    tmp <- c(FLAG, RNAME, POS, MAPQ, CIGAR, RNEXT, PNEXT, TLEN, 
                as.character(SEQ), QUAL, XA)
    tmp
}


params <- list()
for(param in names(opt))
{
    if (!param == "help")
        params[param] <- opt[param]
}

setClass("GalaxyRemoteError", contains="character")
wrappedFunction <- function(f)
{
    tryCatch(do.call(f, params),
        error=function(e) new("GalaxyRemoteError", conditionMessage(e)))
}


suppressPackageStartupMessages(library(RGalaxy))
do.call(PSMtab2SAM, params)

## end warning handler
}, warning = function(w) {
    cat(paste("Warning:", conditionMessage(w), "\n"))
    invokeRestart("muffleWarning")
})