view msi_qualitycontrol.xml @ 1:f5d0417ed59b draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/msi_qualitycontrol commit 93af5acf54999bcfea4c803680ca5a9c2a92fee5
author galaxyp
date Thu, 02 Nov 2017 10:32:21 -0400
parents 348d4134ba6a
children 4c81a6bffcf0
line wrap: on
line source

<tool id="mass_spectrometry_imaging_qc" name="MSI Qualitycontrol" version="1.7.0">
    <description>
        mass spectrometry imaging QC
    </description>
    <!-- Conda packages needed by the R script; the package names must not carry
         surrounding whitespace. -->
    <requirements>
        <requirement type="package" version="1.7.0">bioconductor-cardinal</requirement>
        <requirement type="package" version="2.2.1">r-ggplot2</requirement>
        <requirement type="package" version="1.1_2">r-rcolorbrewer</requirement>
        <requirement type="package" version="2.2.1">r-gridextra</requirement>
        <requirement type="package" version="2.23_15">r-kernsmooth</requirement>
    </requirements>
    <!-- Stages the input under a fixed file name in the working directory, prints the
         rendered R script to standard output and then runs it with Rscript. -->
    <command detect_errors="exit_code">
    <![CDATA[

        ## composite datatypes: copy the member files out of the extra files directory
        #if $infile.ext == 'imzml'
            cp '${infile.extra_files_path}/imzml' infile.imzML &&
            cp '${infile.extra_files_path}/ibd' infile.ibd &&
        #elif $infile.ext == 'analyze75'
            cp '${infile.extra_files_path}/hdr' infile.hdr &&
            cp '${infile.extra_files_path}/img' infile.img &&
            cp '${infile.extra_files_path}/t2m' infile.t2m &&
        #else
            ## every other datatype is linked as infile.RData
            ln -s '$infile' infile.RData &&
        #end if
        ## print the rendered configfile before it is executed
        cat '${cardinal_qualitycontrol_script}' &&
        Rscript '${cardinal_qualitycontrol_script}'
    ]]>
    </command>
    <configfiles>
        <configfile name="cardinal_qualitycontrol_script"><![CDATA[

################################# load libraries and read file #########################
library(Cardinal)
library(ggplot2)
library(RColorBrewer)
library(gridExtra)
library(KernSmooth)

## Read the MALDI imaging dataset; the template picks the statement that
## matches the datatype of the Galaxy input. The RData branch only loads the
## file, the tool help asks for an object called msidata in it.

#if $infile.ext == 'imzml'
    msidata <- readMSIData("infile.imzML")
#elif $infile.ext == 'analyze75'
    msidata <- readMSIData("infile.hdr")

#else
    load("infile.RData")
#end if

#if $inputpeptidefile:
    ## Read tabular file with peptide masses (column 1) and names (column 2)
    ## for plots and heatmap images
    input_list <- read.delim(
        file = "$inputpeptidefile",
        header = FALSE,
        na.strings = c("", "NA", "#NUM!", "#ZAHL!"),
        stringsAsFactors = FALSE
    )
#else
    ## placeholder so that the later filtering works without a peptide file
    input_list <- data.frame(0, 0)
#end if

###################################### file properties in numbers ######################

## The dense intensity matrix (rows = mz features, columns = spectra) and the
## pixel coordinates are extracted once and shared by all statistics below
intensity_matrix <- spectra(msidata)[]
pixel_coords <- coord(msidata)

## Number of features (mz)
maxfeatures <- length(features(msidata))
## Range mz
minmz <- round(min(mz(msidata)), digits=2)
maxmz <- round(max(mz(msidata)), digits=2)
## Number of spectra (pixels)
pixelcount <- length(pixels(msidata))
## Range x coordinates
minimumx <- min(pixel_coords[,1])
maximumx <- max(pixel_coords[,1])
## Range y coordinates
minimumy <- min(pixel_coords[,2])
maximumy <- max(pixel_coords[,2])
## Range of intensities
minint <- round(min(intensity_matrix), digits=2)
maxint <- round(max(intensity_matrix), digits=2)
medint <- round(median(intensity_matrix), digits=2)
## Number of intensities > 0; counted per spectrum first because colSums
## returns doubles, so the total cannot overflow the integer range
npeaks <- sum(colSums(intensity_matrix > 0))
## Spectra multiplied with mz (potential number of peaks); computed in double
## precision because the integer product becomes NA for very large datasets
numpeaks <- as.numeric(nrow(intensity_matrix)) * ncol(intensity_matrix)
## Percentage of intensities > 0
percpeaks <- round(npeaks/numpeaks*100, digits=2)
## Number of empty TICs (TIC = sum of all intensities of one spectrum)
TICs <- colSums(intensity_matrix)
NumemptyTIC <- sum(TICs == 0)

## the helper objects are not needed under these names any more
rm(intensity_matrix, pixel_coords)

## Processing information stored with the dataset
processinginfo <- processingData(msidata)
centroidedinfo <- processinginfo@centroided # TRUE or FALSE

## A filled processing slot is reported as it is, an empty one as the
## string 'FALSE'
info_or_false <- function(slot_value) {
  if (length(slot_value) > 0) slot_value else 'FALSE'
}

## normalization
normalizationinfo <- info_or_false(processinginfo@normalization)
## smoothing
smoothinginfo <- info_or_false(processinginfo@smoothing)
## baseline
baselinereductioninfo <- info_or_false(processinginfo@baselineReduction)
## peak picking
peakpickinginfo <- info_or_false(processinginfo@peakPicking)


## calculate how many input peptide masses are valid, i.e. lie strictly inside
## the mz range of the dataset.
## The first column is coerced to numeric so that header lines or unparsable
## cells become NA, and which() drops those entries instead of turning them
## into all-NA rows that would later be handed to the image and plot calls.
candidate_masses <- suppressWarnings(as.numeric(input_list[,1]))
valid_rows <- which(candidate_masses > minmz & candidate_masses < maxmz)
inputpeptides <- input_list[valid_rows,]
inputmasses <- candidate_masses[valid_rows]
inputnames <- inputpeptides[,2]

#############################################################################

## Row labels of the overview table
properties <- c(
  "Number of mz features",
  "Range of mz values [Da]",
  "Number of pixels",
  "Range of x coordinates",
  "Range of y coordinates",
  "Range of intensities",
  "Median of intensities",
  "Intensities > 0",
  "Number of zero TICs",
  "Preprocessing",
  "Normalization",
  "Smoothing",
  "Baseline reduction",
  "Peak picking",
  "Centroided",
  "# valid peptidemasses"
)

## Matching values rendered as text; "Preprocessing" is only a heading for the
## rows below it and therefore gets a blank value
values <- c(
  as.character(maxfeatures),
  paste(minmz, "-", maxmz),
  as.character(pixelcount),
  paste(minimumx, "-", maximumx),
  paste(minimumy, "-", maximumy),
  paste(minint, "-", maxint),
  as.character(medint),
  paste(percpeaks, "%"),
  as.character(NumemptyTIC),
  " ",
  as.character(normalizationinfo),
  as.character(smoothinginfo),
  as.character(baselinereductioninfo),
  as.character(peakpickinginfo),
  as.character(centroidedinfo),
  as.character(length(inputmasses))
)


property_df <- data.frame(properties, values)


## Variables for plots: coordinate limits widened by a margin of 1 on each side
xrange <- 1
yrange <- 1
x_extent <- range(coord(msidata)[,1])
y_extent <- range(coord(msidata)[,2])
minx <- x_extent[1] - xrange
maxx <- x_extent[2] + xrange
miny <- y_extent[1] - yrange
maxy <- y_extent[2] + yrange


####################################### Preparation of images #########################

## Acquisition order: running number of every spectrum next to its coordinates
pixelnumber <- seq(from = 1, to = pixelcount)
pixelxyarray <- cbind(coord(msidata), pixelnumber)

## Number of peaks (intensities > 0) in every spectrum
peaksperpixel <- colSums(spectra(msidata)[] > 0)
peakscoordarray <- cbind(coord(msidata), peaksperpixel)

## Most abundant mz: row index of the largest intensity of every spectrum,
## translated into the mz value of that row
highestmz <- apply(spectra(msidata)[], MARGIN = 2, FUN = which.max)
highestmz_matrix <- cbind(coord(msidata), mz(msidata)[highestmz])
colnames(highestmz_matrix)[3] <- "highestmzinDa"

###################################### Preparation of plots ############################

## Scatterplot of x2 against x1 (x axis drawn automatically) in which every
## point is coloured by the local point density, taken from a 256 step palette
## that runs from dark blue over cyan, green, yellow and orange to red.
##   x1, x2      numeric vectors of equal length (x and y values)
##   ylim, xlim  axis limits, by default the range of the data
##   xlab, ylab, main  axis labels and title, empty by default
plot_colorByDensity <- function(x1, x2,
                                ylim = c(min(x2), max(x2)),
                                xlim = c(min(x1), max(x1)),
                                xlab = "", ylab = "", main = "") {

  density_palette <- colorRampPalette(
    c("#000099", "#00FEFF", "#45FE4F", "#FCFF00", "#FF9400", "#FF3100"))(256)

  ## densCols encodes the local density as a grey level; the red channel
  ## (0-255) of that grey plus one is the index into the palette
  grey_levels <- densCols(x1, x2, colramp = colorRampPalette(c("black", "white")))
  df <- data.frame(x1, x2)
  df[["dens"]] <- col2rgb(grey_levels)[1, ] + 1L
  df[["col"]] <- density_palette[df[["dens"]]]

  ## rows are sorted by the density index before plotting; col is looked up in
  ## the data frame that is handed to plot, i.e. it is the col column
  draw_order <- order(df[["dens"]])
  plot(x2 ~ x1, data = df[draw_order, ],
       ylim = ylim, xlim = xlim, pch = 20, col = col,
       cex = 1, xlab = xlab, ylab = ylab, las = 1,
       main = main)
}

## Number of peaks per mz: in how many spectra the intensity of a feature is > 0
peakspermz <- rowSums(spectra(msidata)[] > 0)

## Sum of all intensities for each mz (like TIC, but for mz instead of pixel)
mzTIC <- rowSums(spectra(msidata)[])



######################################## PDF #############################################
##########################################################################################
##########################################################################################


## Open the report file; it stays the active graphics device until dev.off()
pdf("qualitycontrol.pdf", fonts = "Times", pointsize = 12)
## empty plot without axes and annotation that carries the report title
plot(0,type='n',axes=FALSE,ann=FALSE)
## fall back to the name of the input dataset when no title was entered
#if not $filename:
    #set $filename = $infile.display_name
#end if
## NOTE(review): the title is pasted into a double quoted R string, a double
## quote inside it would presumably break the script; confirm how Galaxy
## sanitizes both values
title(main=paste("Quality control of MSI data\n\n", "Filename:", "$filename"))

############################# I) numbers ####################################
#############################################################################
## table of the key values collected in property_df, without row names
grid.table(property_df, rows= NULL)

############################# II) ion images #################################
##############################################################################

## 1) Acquisition image: every pixel is filled according to its running number
acquisition_plot <- ggplot(pixelxyarray, aes(x=x, y=y, fill=pixelnumber)) +
  scale_y_reverse() +
  geom_tile() +
  coord_fixed() +
  ggtitle("1) Order of Acquisition") +
  theme_bw() +
  scale_fill_gradientn(colours = c("blue", "purple" , "red","orange"),
                       space = "Lab", na.value = "black", name = "Acq")
print(acquisition_plot)

## 2) Calibrant images: one ion image per valid input mass, numbered 2A, 2B, ...

if (length(inputmasses) == 0) {
  print("The inputpeptide masses were outside the mass range")
} else {
  for (mass_index in seq_along(inputmasses)) {
    calibrant_mz <- inputmasses[mass_index]
    image_title <- paste0("2", LETTERS[mass_index], ") ", inputnames[mass_index],
                          " (", round(calibrant_mz, digits = 2), " Da)")
    ## plusminus is the mass window entered in the tool form
    image(msidata, mz = calibrant_mz, plusminus = $plusminusinDalton,
          main = image_title,
          contrast.enhance = "histogram")
  }
}

## 3) Number of peaks per pixel - image
## Only data and aesthetics are passed to ggplot(); the fill colours come from
## scale_fill_gradientn. No extra colour argument is given, ggplot() has no use
## for one.
peaks_image <- ggplot(peakscoordarray, aes(x=x, y=y, fill=peaksperpixel)) +
  scale_y_reverse(limits=c(maxy,miny)) +
  geom_tile() +
  coord_fixed() +
  ggtitle("3) Number of peaks per pixel") +
  theme_bw() +
  theme(text=element_text(family="ArialMT", face="bold", size=12)) +
  scale_fill_gradientn(colours = c("blue", "purple" , "red","orange"),
                       space = "Lab", na.value = "black", name = "# peaks")
print(peaks_image)


## 4) TIC image: sum of all intensities of every spectrum at its coordinates
TICcoordarray <- cbind(coord(msidata), TICs)
tic_image <- ggplot(TICcoordarray, aes(x=x, y=y, fill=TICs)) +
  scale_y_reverse(limits=c(maxy,miny)) +
  geom_tile() +
  coord_fixed() +
  ggtitle("4) Total Ion Chromatogram") +
  theme_bw() +
  theme(text=element_text(family="ArialMT", face="bold", size=12)) +
  scale_fill_gradientn(colours = c("blue", "purple" , "red","orange"),
                       space = "Lab", na.value = "black", name = "TIC")
print(tic_image)

## 5) Most abundant mass image: every pixel is filled according to the mz with
## the highest intensity; the legend shows every second pretty break
highestmz_values <- highestmz_matrix[["highestmzinDa"]]
legend_breaks <- pretty(highestmz_values)[c(1,3,5,7)]

highestmz_plot <- ggplot(highestmz_matrix, aes(x=x, y=y, fill=highestmzinDa)) +
  scale_y_reverse(lim=c(maxy,miny)) +
  geom_tile() +
  coord_fixed() +
  ggtitle("5) Most abundant m/z in each pixel") +
  theme_bw() +
  scale_fill_gradientn(colours = c("blue", "purple" , "red","orange"),
                       space = "Lab", na.value = "black", name = "m/z",
                       labels = as.character(legend_breaks),
                       breaks = legend_breaks,
                       limits = c(min(highestmz_values), max(highestmz_values))) +
  theme(text=element_text(family="ArialMT", face="bold", size=12))
print(highestmz_plot)

## which mz are highest: the most abundant mz of every pixel is rounded to an
## integer and the rounded values are counted once, most frequent first
rounded_highestmz <- round(highestmz_matrix[["highestmzinDa"]], digits=0)
highestmz_counts <- sort(table(rounded_highestmz), decreasing=TRUE)

## first pixel whose most abundant mz is the most frequent one
highestmz_peptides <- names(highestmz_counts)[1]
highestmz_pixel <- which(rounded_highestmz == highestmz_peptides)[1]

## first pixel of the second most frequent mz; when all pixels share the same
## rounded mz there is no second entry, the most frequent one is used again so
## that the pixel index never becomes NA
if (length(highestmz_counts) >= 2) {
  secondhighestmz <- names(highestmz_counts)[2]
} else {
  secondhighestmz <- highestmz_peptides
}
secondhighestmz_pixel <- which(rounded_highestmz == secondhighestmz)[1]



## 6) PCA with two components: the plot of the PCA result on top, its image
## below
pca <- PCA(msidata, ncomp = 2)
par(mfrow = c(2, 1))
plot(pca, main = "6) PCA for two components", col = c("black", "darkgrey"))
image(pca, col = c("black", "white"), ylim = c(-1, maxy))


############################# III) properties over acquisition (spectra index)##########
##############################################################################

## two plots per page with a wider left margin for the shifted y axis titles
two_row_layout <- list(mfrow = c(2,1), mar = c(5,6,4,2))
acquisition_label <- "Spectra index \n (= Acquisition time)"
frequency_label <- "Frequency = # spectra"

par(two_row_layout)

## 7a) number of peaks per spectrum - scatterplot
plot_colorByDensity(pixels(msidata), peaksperpixel,
                    xlab = "", ylab = "",
                    main = "7a) Number of peaks per spectrum")
title(xlab = acquisition_label, line = 3)
title(ylab = "Number of peaks", line = 4)

## 7b) number of peaks per spectrum - histogram, median marked in blue
hist(peaksperpixel, main = "", xlab = "Number of peaks per spectrum", ylab = "", las = 1)
title(main = "7b) Number of peaks per spectrum", line = 2)
title(ylab = frequency_label, line = 4)
abline(v = median(peaksperpixel), col = "blue")

## 8a) TIC per spectrum - density scatterplot
zero <- 0
par(two_row_layout)
plot_colorByDensity(pixels(msidata), TICs,
                    xlab = "", ylab = "",
                    main = "8a) TIC per pixel")
title(xlab = acquisition_label, line = 3)
title(ylab = "Total ion chromatogram intensity", line = 4)

## 8b) TIC per spectrum - histogram of the log TICs; the median line is
## computed from the TICs above zero only
hist(log(TICs), main = "", xlab = "log(TIC per spectrum)", ylab = "", las = 1)
title(main = "8b) TIC per spectrum", line = 2)
title(ylab = frequency_label, line = 4)
abline(v = median(log(TICs[TICs > 0])), col = "blue")


## 9) intensity of chosen peptides over acquisition (pixel index):
## one scatterplot per valid input mass, six plots per page

if (length(inputmasses) != 0)
{

    par(mfrow = c(3, 2))
    ## the dense intensity matrix is extracted once instead of once per mass
    intensity_matrix <- spectra(msidata)[]
    for (mass in seq_along(inputmasses))
    {
        ## index of the mz feature that belongs to the input mass
        mznumber <- features(msidata, mz = inputmasses[mass])
        ## the variable name doubles as y axis label of the plot
        intensityvector <- intensity_matrix[mznumber,]
        plot(intensityvector, main=inputnames[mass], xlab="Spectra index \n (= Acquisition time)")
    }
    rm(intensity_matrix)
} else {print("The inputpeptide masses were outside the mass range")}

################################## IV) changes over mz ############################
###################################################################################

## 10) Number of peaks per mz

## wider right margin, it holds the second y axis of plot 10a
par(mfrow = c(2,1), mar = c(5,6,4,4.5))

## 10a) Number of peaks per mz - scatterplot
plot_colorByDensity(mz(msidata), peakspermz,
                    main = "10a) Number of peaks for each mz", ylab = "")
title(xlab = "mz in Dalton", line = 2.5)
title(ylab = "Number of peaks", line = 4)
## second y axis: the same tick positions expressed as percentage of all spectra
peak_ticks <- pretty(peakspermz)
coverage_labels <- as.character(round((peak_ticks/pixelcount*100), digits = 1))
axis(4, at = peak_ticks, labels = coverage_labels, las = 1)
mtext("Coverage of spectra [%]", 4, line = 3, adj = 1)

## 10b) Number of peaks per mz - histogram, median marked in blue
hist(peakspermz, main = "", xlab = "", ylab = "", las = 1)
title(ylab = "Frequency", line = 4)
title(main = "10b) Number of peaks per mz", xlab = "Number of peaks per mz", line = 2)
abline(v = median(peakspermz), col = "blue")


## 11) Sum of intensities per mz

par(mfrow = c(2,1), mar = c(5,6,4,2))

## 11a) sum of intensities per mz - scatterplot
plot_colorByDensity(mz(msidata), mzTIC,
                    main = "11a) Sum of all peak intensities for each mz", ylab = "")
title(xlab = "mz in Dalton", line = 2.5)
title(ylab = "Intensity sum", line = 4)

## 11b) sum of intensities per mz - histogram of the log sums; the median line
## is computed from the sums above zero only
hist(log(mzTIC), main = "", xlab = "", ylab = "", las = 1)
title(main = "11b) Sum of intensities per mz", line = 2, ylab = "")
title(xlab = "log (sum of intensities per mz)")
title(ylab = "Frequency", line = 4)
abline(v = median(log(mzTIC[mzTIC > 0])), col = "blue")



################################## V) general plots ############################
###################################################################################


## 12) Intensity distribution

par(mfrow = c(2,1), mar = c(5,6,4,2))

## 12a) Histogram of all log2 intensities; the median line is computed from
## the intensities above zero only
hist(log2(spectra(msidata)[]), main = "", xlab = "", ylab = "", las = 1)
title(main = "12a) Log2-transformed intensities", line = 2)
title(xlab = "log2 intensities")
title(ylab = "Frequency", line = 4)
positive_intensities <- spectra(msidata)[(spectra(msidata)>0)]
abline(v = median(log2(positive_intensities)), col = "blue")

## 12b) Median intensity of every spectrum against the spectra index
medianint_spectra <- apply(spectra(msidata), MARGIN = 2, FUN = median)
plot(medianint_spectra,
     main = "12b) Median intensity per spectrum",
     xlab = "Spectra index \n (= Acquisition time)",
     ylab = "",
     las = 1)
title(ylab = "Median spectrum intensity", line = 4)

## 13) Mass spectra: four spectra on one page

par(mfrow = c(2, 2))
## spectrum over all pixels
plot(msidata, pixel = seq_along(pixelnumber), main = "Average spectrum")
## single spectrum half way through the pixel list
middle_pixel <- round(length(pixelnumber)/2, digits = 0)
plot(msidata, pixel = middle_pixel, main = "Spectrum in middle of acquisition")
## spectra of the pixels picked for the two most frequent most-abundant mz;
## the title carries the row name of the pixel coordinates
for (spectrum_pixel in c(highestmz_pixel, secondhighestmz_pixel)) {
  spectrum_title <- paste0("Spectrum at ", rownames(coord(msidata)[spectrum_pixel,]))
  plot(msidata, pixel = spectrum_pixel, main = spectrum_title)
}

## close the PDF device so that the report is written completely
dev.off()

    ]]></configfile>
    </configfiles>
    <inputs>
        <param name="infile" type="data" format="imzml, rdata, analyze75" label="Inputfile as imzML, Analyze7.5 or Cardinal MSImageSet saved as RData"
            help="Upload composite datatype imzml (ibd+imzML) or analyze75 (hdr+img+t2m) or regular upload .RData (Cardinal MSImageSet)"/>
        <param name="filename" type="text" value="" optional="true" label="Title" help="will appear in the quality report. If nothing given it will take the dataset name."/>
        <param name="inputpeptidefile" type="data" optional="true" format="txt, csv" label="Text file with peptidemasses and names"
            help="first column peptide m/z, second column peptide name, tab separated file"/>
        <!-- float instead of text: the value is pasted unquoted into the R script as the
             plusminus argument, so Galaxy has to guarantee that it is a number -->
        <param name="plusminusinDalton" value="0.25" type="float" label="Mass range" help="plusminus mass window in Dalton"/>
    </inputs>
    <outputs>
        <!-- the PDF report, collected from qualitycontrol.pdf in the job working directory -->
        <data format="pdf" name="plots" from_work_dir="qualitycontrol.pdf" label="${tool.name} on $infile.display_name"/>
    </outputs>
    <!-- One test per supported input datatype. The reports are compared by file size
         only (sim_size with a delta of 20000 bytes). -->
    <tests>
        <!-- composite imzML input (imzML + ibd), peptide list given as csv -->
        <test>
            <param name="infile" value="" ftype="imzml">
                <composite_data value="Example_Continuous.imzML" ftype="imzml"/>
                <composite_data value="Example_Continuous.ibd" ftype="ibd"/>
            </param>
            <param name="inputpeptidefile" value="inputpeptides.csv" ftype="csv"/>
            <param name="plusminusinDalton" value="0.25"/>
            <param name="filename" value="Testfile_imzml"/>
            <output name="plots" file="Testfile_qualitycontrol_imzml.pdf" compare="sim_size" delta="20000"/>
        </test>
        <!-- composite Analyze7.5 input (hdr + img + t2m), peptide list given as txt -->
        <test>
            <param name="infile" value="" ftype="analyze75">
                <composite_data value="Analyze75.hdr" ftype="hdr"/>
                <composite_data value="Analyze75.img" ftype="img"/>
                <composite_data value="Analyze75.t2m" ftype="t2m"/>
            </param>
            <param name="inputpeptidefile" value="inputpeptides.txt" ftype="txt"/>
            <param name="plusminusinDalton" value="0.5"/>
            <param name="filename" value="Testfile_analyze75"/>
            <output name="plots" file="Testfile_qualitycontrol_analyze75.pdf" compare="sim_size" delta="20000"/>
        </test>
        <!-- RData input; the csv peptide list is uploaded with datatype txt here -->
        <test>
            <param name="infile" value="example_continousS042.RData" ftype="rdata"/>
            <param name="inputpeptidefile" value="inputpeptides.csv" ftype="txt"/>
            <param name="plusminusinDalton" value="0.1"/>
            <param name="filename" value="Testfile_rdata"/>
            <output name="plots" file="Testfile_qualitycontrol_rdata.pdf" compare="sim_size" delta="20000"/>
        </test>
    </tests>
    <help>
        <![CDATA[
Quality control for MALDI imaging mass spectrometry data.

Input data: 3 types of input data can be used:

- imzml file (upload imzml and ibd file via the "composite" function) `Introduction to the imzml format <http://ms-imaging.org/wp/introduction/>`_
- Analyze7.5 (upload hdr, img and t2m file via the "composite" function)
- Cardinal "MSImageSet" data (with variable name "msidata", saved as .RData)

Only continuous imzML files are supported so far.

The output of this tool contains key values and plots of the imaging data as pdf. 

        ]]>
    </help>
    <citations>
        <citation type="doi">10.1093/bioinformatics/btv146</citation>
    </citations>
</tool>