# HG changeset patch
# User stef
# Date 1426856183 14400
# Node ID 5287f72e08916f8d4a07f73e52c85810e34431e7
# Parent 13eee0859f94a6a9f38ac0695750e733ba2c8444
Uploaded
diff -r 13eee0859f94 -r 5287f72e0891 README.rst
--- a/README.rst Fri Feb 27 10:41:30 2015 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,24 +0,0 @@
-FALCO Amplicon Analysis Pipeline & Galaxy wrapper
-========================================
-
-FALCO variant-caller is part of the Amplicon Analysis Pipeline (AAP).
-
-The typical workflow is as follows:
-* paired-end amplicon sequencing
-* merge pairs (so only overlapping pairs are included)
-* map the single read fastq with BWA
-* perform variant calling with FALCO
-* create (html) report of the results
-
-FALCO uses samtools and straight-forward statistics to determine wether a
-potential variant is likely a (technical) artifact or not.
-
-Input / Output
-
-Input of the caller is a BAM file, output VCF
-
-
-Bug Reports & other questions
-=============================
-
-Issues can be reported via http://www.tgac.nl/
diff -r 13eee0859f94 -r 5287f72e0891 falco-call.sh
--- a/falco-call.sh Fri Feb 27 10:41:30 2015 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,123 +0,0 @@
-#!/bin/bash
-TOOLDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
-
-echo "Started FALCO calling"
-
-## ----------
-## Variables setup ($1 contains the bash config file path)
-## ----------
-source $1
-
-## ----------
-## make sure all is ok
-## ----------
-if [ ! -f $REF_FILE".fai" ]
-then
- echo "No FAI index (fai) found for reference fasta [$REF_FILE]"
- exit "No FAI index (fai) found for reference fasta [$REF_FILE]"
-fi
-
-## ----------
-## set params
-## ----------
-if [[ $filter_file != 'None' && $filter_file != '' ]] # Galaxy default is "None" for some reason
-then
- FILTER_PARAM=" --filter "$filter_file
-else
- FILTER_PARAM=""
-fi
-
-if [[ $manifest_file != 'None' && $manifest_file != 'None' ]] # Galaxy default is "None" for some reason
-then
- MANIFEST_PARAM=" --manifest "$manifest_file
-else
- MANIFEST_PARAM=""
-fi
-
-## name of file in galaxy not always set so will use a user-set job_name instead
-#bam_base=`echo $bam_name | sed 's#.bam$##' - `
-bam_base=$job_name
-
-## ----------
-## Status / debug
-## ----------
-DEBUG=1
-if [ $DEBUG ]
-then
- DBS="[INFO] "
- echo $DBS"FILTER: "$filter_file
- echo $DBS"MANIFEST: "$manifest_file
- echo $DBS"REF FILE: "$REF_FILE
- echo $DBS"DB KEY: "$DB_KEY
- echo $DBS"REF SRC: "$REF_SOURCE
- echo $DBS"BAM FILE: "$bam_file
- echo $DBS"BAM NAME: "$bam_name
- echo $DBS"BAM BASE: "$bam_base
- echo $DBS"OUT PATH: "$out_path
-fi
-
-## ----------
-## create output files dir
-## ----------
-mkdir $out_path
-
-## ----------
-## running analysis
-## ----------
-echo "[INFO] Starting variant calling"
-## NOTE: if $FILTER_PARAM is set it includes the param name (--filter)
-## NOTE: if $MANIFEST_PARAM is set it includes the param name (--manifest)
-CALL_STRING="$TOOLDIR/falco/bin/falco --bam $bam_file --output $bam_base --ref $REF_FILE $FILTER_PARAM $MANIFEST_PARAM"
-echo "[INFO] "$CALL_STRING
-perl $CALL_STRING
-echo "[INFO] done with variant calling"
-
-
-## ----------
-## create index html for main galaxy output
-## ----------
-echo "" >> $html_out
-echo "" >> $html_out
-echo "
" >> $html_out
-echo "" >> $html_out
-echo "" >> $html_out
-echo "" >> $html_out
-echo " FALCO
" >> $html_out
-echo " This page is way to get output files that are not implemented in galaxy history, it is not intended to be a user-friendly way of displaying anything ;)
" >> $html_out
-#echo " HTML" >> $html_out
-echo " " >> $html_out
-for file in *.vcf *.txt *stderr *stdout
-#for file in *
-do
- lineCount=`wc -l $file | cut -f 1 -d " "`
- echo " $file has $lineCount lines |
" >> $html_out
- echo " --> " `head -1 $file` " |
" >> $html_out
-done
-echo "
" >> $html_out
-echo "" >> $html_out
-echo "" >> $html_out
-
-## ----------
-## creating galaxy history outputs
-## ----------
-#cp 'index.html' $html_out # this is the overview of samples html
-#cp $bam_base'.html' $out_path/'out.html' # this is the sample html
-cp $bam_base'.falco.vcf' $vcf_out
-cp $bam_base'.qc.ann.qual.txt' $qc_ann_qual_out
-cp $bam_base'.qc2.ann.txt' $qc2_ann_txt_out
-cp $bam_base'.qc.targets.txt' $qc_targets_txt_out
-
-## ----------
-## copy files to keep to output path
-## ----------
-#cp -r ./$bam_base/*png $out_path/$bam_base/
-#cp -r ./* $out_path
-cp *.vcf $out_path; cp *.txt $out_path; cp *_std* $out_path
-
-## ----------
-echo "END falco sh"
-exit 0
diff -r 13eee0859f94 -r 5287f72e0891 falco-call.xml
--- a/falco-call.xml Fri Feb 27 10:41:30 2015 -0500
+++ b/falco-call.xml Fri Mar 20 08:56:23 2015 -0400
@@ -46,7 +46,7 @@
-
+
@@ -91,9 +91,9 @@
-
-
-
+
+
+
diff -r 13eee0859f94 -r 5287f72e0891 falco-filter-report.sh
--- a/falco-filter-report.sh Fri Feb 27 10:41:30 2015 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,120 +0,0 @@
-#!/bin/bash
-TOOLDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
-
-echo "Started FALCO calling"
-
-## ----------
-## Variables setup ($1 contains the bash config file path)
-## ----------
-source $1
-
-## ----------
-## make sure all is ok
-## ----------
-#if [ ! -f $REF_FILE".fai" ]
-#then
-# echo "No FAI index (fai) found for reference fasta [$REF_FILE]"
-# exit "No FAI index (fai) found for reference fasta [$REF_FILE]"
-#fi
-
-## ----------
-## set params
-## ----------
-if [[ $vcf2_file != 'None' && $vcf2_file != '' ]] # Galaxy default is "None" for some reason
-then
- VCF2_PARAM=" --vcfOther "$vcf2_file
-else
- VCF2_PARAM=""
-fi
-
-# if [[ $manifest_file != 'None' && $manifest_file != 'None' ]] # Galaxy default is "None" for some reason
-# then
-# MANIFEST_PARAM=" --manifest "$manifest_file
-# else
-# MANIFEST_PARAM=""
-# fi
-
-## name of file in galaxy not always set so will use a user-set job_name instead
-#bam_base=`echo $bam_name | sed 's#.bam$##' - `
-vcf_base=$job_name
-sample_html_file=$vcf_base'.html'
-
-## ----------
-## Status / debug
-## ----------
-DEBUG=1
-# if [ $DEBUG ]
-# then
-# DBS="[INFO] "
-# echo $DBS"FILTER: "$filter_file
-# echo $DBS"MANIFEST: "$manifest_file
-# echo $DBS"REF FILE: "$REF_FILE
-# echo $DBS"DB KEY: "$DB_KEY
-# echo $DBS"REF SRC: "$REF_SOURCE
-# echo $DBS"BAM FILE: "$bam_file
-# echo $DBS"BAM NAME: "$bam_name
-# echo $DBS"BAM BASE: "$bam_base
-# echo $DBS"OUT PATH: "$out_path
-# fi
-
-## ----------
-## create output files dir
-## ----------
-mkdir $out_path
-
-## ----------
-## running analysis
-## ----------
-echo "[INFO] Starting FALCO reporting"
-CMD_STRING="$TOOLDIR/falco/bin/falco-filter-report --vcf $vcf_file --output $vcf_base --qc_ann_qual_txt $qc_ann_qual_file --qc2_ann_txt $qc2_ann_txt_file --qc_targets_txt $qc_targets_txt_file $VCF2_PARAM"
-echo "[INFO] "$CMD_STRING
-perl $CMD_STRING
-echo "[INFO] done with FALCO reporting"
-
-
-## ----------
-## create index html for main galaxy output
-## ----------
-echo "" >> $html_out
-echo "" >> $html_out
-echo "" >> $html_out
-echo "" >> $html_out
-echo "" >> $html_out
-echo "" >> $html_out
-echo " FALCO
" >> $html_out
-echo " RESULTS" >> $html_out
-#echo " This page is way to get output files that are not implemented in galaxy history, it is not intended to be a user-friendly way of displaying anything ;)
" >> $html_out
-#echo " HTML" >> $html_out
-#echo " " >> $html_out
-#for file in *.vcf *.txt *stderr *stdout
-#for file in *
-#do
-# lineCount=`wc -l $file | cut -f 1 -d " "`
-# echo " $file has $lineCount lines |
" >> $html_out
-# echo " --> " `head -1 $file` " |
" >> $html_out
-#done
-#echo "
" >> $html_out
-echo "" >> $html_out
-echo "" >> $html_out
-
-## ----------
-## creating galaxy history outputs
-## ----------
-#cp 'index.html' $html_out # this is the overview of samples html
-cp $sample_html_file $out_path # this is the sample html
-#cp $vcf_base'.html' $html_out # this is the sample html
-
-## ----------
-## copy files to keep to output path
-## ----------
-#cp -r ./$bam_base/*png $out_path/$bam_base/
-cp -r ./* $out_path
-#cp *.vcf $out_path; cp *.txt $out_path; cp *_std* $out_path
-
-## ----------
-echo "END falco sh"
-exit 0
diff -r 13eee0859f94 -r 5287f72e0891 falco-filter-report.xml
--- a/falco-filter-report.xml Fri Feb 27 10:41:30 2015 -0500
+++ b/falco-filter-report.xml Fri Mar 20 08:56:23 2015 -0400
@@ -47,9 +47,9 @@
-
-
-
+
+
+
diff -r 13eee0859f94 -r 5287f72e0891 falco/README
--- a/falco/README Fri Feb 27 10:41:30 2015 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,33 +0,0 @@
-Usage:
-
-falco
-
-input:
-- bam file
-- manifest file
-- ref + fai
-- locifilt
-- manifest
-
-output:
-- *.vcf file
-- *.qc.ann.txt -> catalog of each assayed bp
-- *.qc2.ann.txt -> catalog of transitions
-- *.qc.targets.txt -> amplicon centered information, eg depth, start/end positions.
-
-example:
-
-OUTBASE="HCT116-TEST"
-
-## variant calling
-falco --bam data/HCT-116/HCT116.bam --output $OUTBASE --ref genome.fa
-
-file1=$OUTBASE".qc.ann.qual.txt"
-file2=$OUTBASE".qc2.ann.txt"
-file3=$OUTBASE".qc.targets.txt"
-file4=$OUTBASE".res.filtered.tsv"
-
-## variant filtering and reporting
-falco-filter-report --vcf snpeff.vcf --output $OUTBASE --qc_ann_qual_txt $file1 --qc2_ann_txt $file2 --qc_targets_txt $file3 --res_filtered_tsv $file4
-
-
diff -r 13eee0859f94 -r 5287f72e0891 falco/bin/falco
--- a/falco/bin/falco Fri Feb 27 10:41:30 2015 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,75 +0,0 @@
-#!/usr/bin/perl
-
-use strict;
-use Cwd 'abs_path';
-use Getopt::Long;
-use File::Basename;
-
-my $absPath = abs_path($0);
-my $dn = dirname($absPath);
-
-# Set options
-
-my $bam;
-my $ref = "$dn/../ref/hg19/hg19.fa";
-my $locifilt = "$dn/../ref/filters/filter.tsv";
-my $manifest = "$dn/../ref/TSACP/TruSeq_Amplicon_Cancer_Panel_Manifest_AFP1_PN15032433.txt";
-my $base = $bam;
-
-GetOptions (
- "bam=s" => \$bam,
- "ref=s" => \$ref,
- "filter=s" => \$locifilt,
- "manifest=s" => \$manifest,
- "output=s" => \$base,
-)
-or die("Error in command line arguments\n");
-
-## sanity checks
-unless ( -f $ref ){
- print STDERR "Genome Reference file not found [$ref]\n";
- exit 1;
-}
-unless ( -f $ref.'.fai' ){
- print STDERR "Genome Reference index file not found [$ref\.fai]\n";
- exit 1;
-}
-
-
-# Set paths to external script
-# TODO make modules
-
-my $perAmpliconAnalysis = "$dn/../lib/perl/perAmpliconAnalysis.pl";
-my $addQual = "$dn/../lib/R/addQual.R";
-my $func = "$dn/../lib/R/func.R";
-my $qcFilt = "$dn/../lib/perl/qcFilt.pl";
-my $qc2vcf = "$dn/../lib/perl/qc2vcf.pl";
-
-# Check Rscript
-my $rscript = `which Rscript`;
-chomp $rscript;
-if ($rscript !~ /Rscript$/) {
- print STDERR "No Rscript present in PATH\n";
- exit 1;
-}
-
-# Check samtools
-my $samtools = `which samtools`;
-chomp $samtools;
-if ($samtools !~ /samtools$/) {
- print STDERR "No samtools present in PATH\n";
- exit 1;
-}
-
-print STDOUT localtime() . " [$$] $perAmpliconAnalysis $bam $ref $manifest $base $samtools\n";
-system("$perAmpliconAnalysis $bam $ref $manifest $base $samtools");
-
-print STDOUT localtime() . " [$$] $rscript $addQual $base.qc.ann.txt $base.qc.ann.qual.txt $func $locifilt\n";
-system("$rscript $addQual $base.qc.ann.txt $base.qc.ann.qual.txt $func $locifilt");
-
-print STDOUT localtime() . " [$$] $qcFilt $base.qc.ann.qual.txt $base\n";
-system("$qcFilt $base.qc.ann.qual.txt $base");
-
-print STDOUT localtime() . " [$$] $qc2vcf $base.qc.ann.qual.filt.txt > $base.falco.vcf\n";
-system("$qc2vcf $base.qc.ann.qual.filt.txt > $base.falco.vcf");
-
diff -r 13eee0859f94 -r 5287f72e0891 falco/bin/falco-filter-report
--- a/falco/bin/falco-filter-report Fri Feb 27 10:41:30 2015 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,93 +0,0 @@
-#!/usr/bin/perl
-use strict;
-use Cwd 'abs_path';
-use Getopt::Long;
-use File::Basename;
-
-my $absPath = abs_path($0);
-my $dir = dirname($absPath);
-my $lib = "$dir/../lib/";
-
-my $vcf2tsv = "$lib/perl/vcf2tsv.pl";
-my $spliteff = "$lib/perl/splitEff.pl";
-my $filter = "$lib/perl/filter.pl";
-my $plotPng = "$lib/R/plotsPng.R";
-my $mkReport = "$lib/perl/mkHtmlReportGalaxy.pl";
-
-my $locifilt = "$dir/../ref/filters/filter.tsv";
-my $manifest = "$dir/../ref/TSACP/TruSeq_Amplicon_Cancer_Panel_Manifest_AFP1_PN15032433.txt";
-
-my $canonicals = "$dir/../ref/TSACP/canonicals.tsv";
-my $clinvar = "$dir/../ref/filters/clinvar_00-latest.f.vcf";
-my $cosmic = "$dir/../ref/filters/CosmicCodingMuts_v64_26032013_noLimit_wgs.f.vcf";
-my $cosmicNC = "$dir/../ref/filters/CosmicNonCodingVariants_v64_26032013_noLimit_wgs.f.vcf";
-
-my $base = undef;
-my $vcf = undef;
-my $vcfOther = undef;
-my $noFilt = undef;
-my $noPlot = undef;
-
-my $qc_ann_qual_txt = undef;
-my $qc2_ann_txt = undef
-my $qc_targets_txt = undef;
-
-GetOptions (
- "vcf=s" => \$vcf,
- "vcfOther=s" => \$vcfOther,
- "output=s" => \$base,
- "clinvar=s" => \$clinvar,
- "cosmic=s" => \$cosmic,
- "cosmicNC=s" => \$cosmicNC,
- "noFilt" => \$noFilt,
- "noPlot" => \$noPlot,
- "qc_ann_qual_txt=s" => \$qc_ann_qual_txt,
- "qc2_ann_txt=s" => \$qc2_ann_txt,
- "qc_targets_txt=s" => \$qc_targets_txt,
-)
-or die("Error in command line arguments\n");
-
-## sanity checks
-die( "No base name provided [-output]\n" ) unless defined($base) and $base ne '';
-die( "No VCF file provided [-vcf]\n" ) unless defined($vcf) and -f $vcf;
-die( "Missing input [-qc_ann_qual_txt]\n" ) unless defined($qc_ann_qual_txt) and -f $qc_ann_qual_txt;
-die( "Missing input [-qc2_ann_txt]\n" ) unless defined($qc2_ann_txt) and -f $qc2_ann_txt;
-die( "Missing input [-qc_targets_txt]\n" ) unless defined($qc_targets_txt) and -f $qc_targets_txt;
-die( "Required file does not exists [$canonicals]\n" ) unless -f $canonicals;
-die( "Required file does not exists [$clinvar]\n" ) unless -f $clinvar;
-die( "Required file does not exists [$cosmic]\n" ) unless -f $cosmic;
-die( "Required file does not exists [$cosmicNC]\n" ) unless -f $cosmicNC;
-
-
-## Rscript check
-my $rscript = `which Rscript`;
-chomp $rscript;
-if ($rscript !~ /Rscript$/) {
- print STDERR "No Rscript present in PATH\n";
- exit 1;
-}
-
-## FILTERING
-print STDOUT localtime() . " [$$] converting vcf to tsv\n";
-system( "$vcf2tsv $vcf > $base\.tsv" );
-
-print STDOUT localtime() . " [$$] splitting vcf columns\n";
-system( "$spliteff $base\.tsv Falco >> $base\.res\.tsv" );
-
-if ( defined($vcfOther) ){
- print STDOUT localtime() . " [$$] converting vcf to tsv\n";
- system( "$vcf2tsv $vcf > $base\.Other\.tsv" );
- print STDOUT localtime() . " [$$] splitting vcf columns\n";
- system( "$spliteff $base\.Other\.tsv Other >> $base\.res\.tsv" );
-}
-
-print STDOUT localtime() . " [$$] filtering data\n";
-system( "$filter $base\.res\.tsv $canonicals $clinvar $cosmic $cosmicNC > $base\.res\.filtered\.tsv" );
-
-## PLOTTING
-print STDOUT localtime() . " [$$] Creating plots\n";
-system( "Rscript $plotPng $qc_ann_qual_txt $qc2_ann_txt $qc_targets_txt $base\.res\.filtered\.tsv $clinvar $locifilt $base" );
-
-## REPORTING
-print STDOUT localtime() . " [$$] Creating HTML report\n";
-system( "perl $mkReport $base $qc_targets_txt" );
diff -r 13eee0859f94 -r 5287f72e0891 falco/lib/R/addQual.R
--- a/falco/lib/R/addQual.R Fri Feb 27 10:41:30 2015 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,21 +0,0 @@
-#!/usr/bin/Rscript
-args <- commandArgs(T)
-qcFile <- args[1]
-output <- args[2]
-funcR <- args[3]
-lociFiltFile <- args[4]
-
-read.delim(lociFiltFile, header=F) -> lociFiltD
-lociFilt <- paste(paste("chr", lociFiltD[,1], sep=""), lociFiltD[,2], sep=":")
-
-source(funcR)
-read.delim(qcFile, header=T) -> qc
-
-#addQual(qc) -> qc.qual
-addQualTransCor(qc, lociFilt) -> qc.qual
-#addQualVarCor(qc) -> qc.qual
-
-write.table(qc.qual, output, row.names=F, col.names=T, quote=F, sep="\t")
-#write.table(qc[selection,], output, row.names=F, col.names=T, quote=F, sep="\t")
-
-
diff -r 13eee0859f94 -r 5287f72e0891 falco/lib/R/func.R
--- a/falco/lib/R/func.R Fri Feb 27 10:41:30 2015 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,575 +0,0 @@
-#!/usr/bin/Rscript
-#args <- commandArgs(T)
-#qcFile <- args[1]
-#output <- args[2]
-
-#read.delim(qcFile, header=T) -> qc
-
-addQual <- function(qc) {
-
-qc$del <- as.character(as.vector(qc$del))
-qc$ins <- as.vector(qc$ins)
-qc$pVal <- 1
-qc$pValAdj <- 1
-#qc$pValPois <- 1
-qc$qScore <- 0
-qc$qScoreI <- 0
-qc$qScoreD <- 0
-
-pVal.I <- rep(1, nrow(qc))
-pVal.D <- rep(1, nrow(qc))
-
-delSel <- qc$del != "."
-data.frame(matrix(unlist(strsplit(unlist(strsplit(as.vector(qc$del[delSel]), "\\|")), ":")), ncol=2, byrow=T)) -> dels
-dels$ln <- nchar(as.vector(dels$X1))
-colnames(dels) <- c("seq", "occ", "ln")
-delPees <- ppois(1:50, mean(rep(dels$ln, dels$occ)), lower.tail=F)
-
-insSel <- qc$ins != "."
-data.frame(matrix(unlist(strsplit(unlist(strsplit(as.vector(qc$ins[insSel]), "\\|")), ":")), ncol=2, byrow=T)) -> inss
-inss$ln <- nchar(as.vector(inss$X1))
-colnames(inss) <- c("seq", "occ", "ln")
-insPees <- ppois(1:50, mean(rep(dels$ln, dels$occ)), lower.tail=F)
-
-# Calculate errorrate per amplicon
-amps <- unique(qc$amp)
-for (i in amps) {
- #print(i)
- sel <- qc$amp == i
- prob <- sum(qc$nVar[sel]) / (sum(qc$nRef[sel]) + sum(qc$nVar[sel]))
- #prob <- median(qc$nVar[sel] / (qc$nRef[sel] + qc$nVar[sel]))
- pees <- pbinom(qc$nVar[sel], qc$nRef[sel] + qc$nVar[sel], prob, lower.tail=F)
- qc$pVal[sel] <- pees
-# pois <- ppois(qc$nVar[sel], (qc$nRef + qc$nVar) * prob, lower.tail=F)
-# qc$pValPois[sel] <- pois
-
- # Seperate for indel/deletion events
- selDels <- delSel & sel
- if (sum(selDels) > 0) {
- data.frame(matrix(unlist(strsplit(unlist(strsplit(as.vector(qc$del[selDels]), "\\|")), ":")), ncol=2, byrow=T)) -> dels2
- dels2$ln <- nchar(as.vector(dels2$X1))
- colnames(dels2) <- c("seq", "occ", "ln")
- dels2$occ <- as.numeric(as.vector(dels2$occ))
- }
- for (j in which(selDels)) {
- data.frame(matrix(unlist(strsplit(unlist(strsplit(as.vector(qc$del[j]), "\\|")), ":")), ncol=2, byrow=T)) -> dels
- dels$ln <- nchar(as.vector(dels$X1))
- colnames(dels) <- c("seq", "occ", "ln")
- dels$occ <- as.numeric(as.vector(dels$occ))
- #dels$pDel <- delPees[as.integer(dels$ln)]
- #dels$pVal <- ppois(as.numeric(dels$occ), dels$pDel, lower.tail=F)
- dels$pDel <- sapply(dels$ln, function(x) {sum(dels2$occ[dels2$ln == x]) / (sum(qc$nRef[sel]) + sum(qc$nVar[sel])) })
- dels$pVal <- pbinom(dels$occ, qc$dp[j], dels$pDel, lower.tail=F)
- dels$Q <- -10 * log10(dels$pVal)
-
- high <- order(dels$occ, -dels$Q, decreasing=T)[1]
- qc$del[j] <- paste(dels$seq[high], dels$occ[high], sep=":")
- qc$qScoreD[j] <- dels$Q[high]
- pVal.D[j] <- dels$pVal[high]
- }
- selIns <- insSel & sel
- if (sum(selIns) > 0) {
- data.frame(matrix(unlist(strsplit(unlist(strsplit(as.vector(qc$ins[selIns]), "\\|")), ":")), ncol=2, byrow=T)) -> inss2
- inss2$ln <- nchar(as.vector(inss2$X1))
- colnames(inss2) <- c("seq", "occ", "ln")
- inss2$occ <- as.numeric(as.vector(inss2$occ))
- }
- for (j in which(selIns)) {
- data.frame(matrix(unlist(strsplit(unlist(strsplit(as.vector(qc$ins[j]), "\\|")), ":")), ncol=2, byrow=T)) -> inss
- inss$ln <- nchar(as.vector(inss$X1))
- colnames(inss) <- c("seq", "occ", "ln")
- inss$occ <- as.numeric(as.vector(inss$occ))
- #inss$pIns <- insPees[as.integer(inss$ln)]
- #inss$pVal <- ppois(as.numeric(inss$occ), inss$pIns, lower.tail=F)
- inss$pIns <- sapply(inss$ln, function(x) {sum(inss2$occ[inss2$ln == x]) / (sum(qc$nRef[sel]) + sum(qc$nVar[sel])) })
- inss$pVal <- pbinom(inss$occ, qc$dp[j], inss$pIns, lower.tail=F)
- inss$Q <- -10 * log10(inss$pVal)
-
- high <- order(inss$occ, -inss$Q, decreasing=T)[1]
- qc$ins[j] <- paste(inss$seq[high], inss$occ[high], sep=":")
- qc$qScoreI[j] <- inss$Q[high]
- pVal.I[j] <- inss$pVal[high]
- }
- #print(sum(-10 * log10(pees) > 100))
- #print(sum(qc$qScoreI[selIns] > 100))
- #print(sum(qc$qScoreD[selDels] > 100))
-}
-
-p.adjust(qc$pVal) -> qc$pValAdj
--10 * log(p.adjust(pVal.I)) -> qc$qScoreI
--10 * log(p.adjust(pVal.D)) -> qc$qScoreD
-qc$qScore <- -10 * log10(qc$pValAdj)
-#qc$qScore[qc$pValAdj == 0] <- 1000
-qc$qScore[qc$qScore > 1000] <- 1000
-qc$qScoreI[qc$qScoreI > 1000] <- 1000
-qc$qScoreD[qc$qScoreD > 1000] <- 1000
-qsel <- (qc$qScore >= 100 & qc$nVar >= 10) | (qc$qScoreI >= 200) | (qc$qScoreD >= 200)
-dpfilt <- as.numeric(qc$dp) >= 100
-#varfilt <- as.numeric(qc$nVar) >= 10
-
-selection <- qsel & dpfilt
-
-print(paste("Filtered out: ", sum(!selection), sep=""))
-print(paste("Variants selected: ", sum(selection), sep=""))
-return(qc)
-}
-
-addQualTransCor <- function(qc, lociFilt) {
-
-
-# lociFilt contains snp, cosmic and clinvar positions
-
-qc$del <- as.character(as.vector(qc$del))
-#qc$del <- as.character(as.vector(qc$del))
-qc$ins <- as.character(as.vector(qc$ins))
-qc$pVal <- 1
-qc$pValAdj <- 1
-#qc$pValPois <- 1
-qc$qScore <- 0
-qc$qScoreI <- 0
-qc$qScoreD <- 0
-
-pVal.I <- rep(1, nrow(qc))
-pVal.D <- rep(1, nrow(qc))
-
-rle(as.vector(qc$ntRef)) -> srle
-hompols <- srle$lengths >= 5
-homPosS <- cumsum(srle$lengths)[hompols]
-homPosE <- cumsum(srle$lengths)[which(hompols) - 1]
-homFilt <- rep(F, nrow(qc))
-homFilt[homPosS] <- T
-homFilt[homPosE] <- T
-
-delSel <- qc$del != "." & !homFilt
-if (sum(delSel > 0)) {
-
-
-data.frame(matrix(unlist(strsplit(unlist(strsplit(qc$del[delSel], "\\|")), ":")), ncol=2, byrow=T)) -> dels
-dels$ln <- nchar(as.vector(dels$X1))
-colnames(dels) <- c("seq", "occ", "ln")
-delPees <- ppois(1:50, mean(rep(dels$ln, dels$occ)), lower.tail=F)
-}
-
-insSel <- qc$ins != "." & !homFilt
-if (sum(insSel) > 0) {
-data.frame(matrix(unlist(strsplit(unlist(strsplit(qc$ins[insSel], "\\|")), ":")), ncol=2, byrow=T)) -> inss
-inss$ln <- nchar(as.vector(inss$X1))
-colnames(inss) <- c("seq", "occ", "ln")
-insPees <- ppois(1:50, mean(rep(inss$ln, inss$occ)), lower.tail=F)
-}
-# Calculate transition bias
-
-
-#calcTransCor(qc) -> pCor
-globalProb <- sum(qc$nVar) / (sum(qc$nRef) + sum(qc$nVar))
-# Calculate errorrate per amplicon
-amps <- unique(qc$amp)
-for (i in amps) {
- #print(i)
- sel <- qc$amp == i
- prob1 <- sum(qc$nVar[sel]) / (sum(qc$nRef[sel]) + sum(qc$nVar[sel]))
- prob <- prob1 * calcTransCor(qc[sel,], lociFilt)
- pees <- pbinom(qc$nVar[sel], qc$nRef[sel] + qc$nVar[sel], prob , lower.tail=F)
- qc$pVal[sel] <- pees
-# pois <- ppois(qc$nVar[sel], (qc$nRef + qc$nVar) * prob, lower.tail=F)
-# qc$pValPois[sel] <- pois
-
- # Seperate for indel/deletion events
- selDels <- delSel & sel
- if (sum(selDels) > 0) {
- data.frame(matrix(unlist(strsplit(unlist(strsplit(as.vector(qc$del[selDels]), "\\|")), ":")), ncol=2, byrow=T)) -> dels2
- dels2$ln <- nchar(as.vector(dels2$X1))
- colnames(dels2) <- c("seq", "occ", "ln")
- dels2$occ <- as.numeric(as.vector(dels2$occ))
-
- for (j in which(selDels)) {
- data.frame(matrix(unlist(strsplit(unlist(strsplit(as.vector(qc$del[j]), "\\|")), ":")), ncol=2, byrow=T)) -> dels
- dels$ln <- nchar(as.vector(dels$X1))
- colnames(dels) <- c("seq", "occ", "ln")
- dels$occ <- as.numeric(as.vector(dels$occ))
- #dels$pDel <- delPees[as.integer(dels$ln)]
- #dels$pVal <- ppois(as.numeric(dels$occ), dels$pDel, lower.tail=F)
- dels$pDel <- sapply(dels$ln, function(x) {sum(dels2$occ[dels2$ln == x]) / (sum(qc$nRef[sel]) + sum(qc$nVar[sel])) })
- dels$pVal <- pbinom(dels$occ, qc$dp[j], dels$pDel, lower.tail=F)
- dels$Q <- -10 * log10(dels$pVal)
-
- high <- order(dels$occ, -dels$Q, decreasing=T)[1]
- qc$del[j] <- paste(dels$seq[high], dels$occ[high], sep=":")
- qc$qScoreD[j] <- dels$Q[high]
- pVal.D[j] <- dels$pVal[high]
- }
- }
- selIns <- insSel & sel
- if (sum(selIns) > 0) {
- data.frame(matrix(unlist(strsplit(unlist(strsplit(as.vector(qc$ins[selIns]), "\\|")), ":")), ncol=2, byrow=T)) -> inss2
- inss2$ln <- nchar(as.vector(inss2$X1))
- colnames(inss2) <- c("seq", "occ", "ln")
- inss2$occ <- as.numeric(as.vector(inss2$occ))
-
- for (j in which(selIns)) {
- data.frame(matrix(unlist(strsplit(unlist(strsplit(as.vector(qc$ins[j]), "\\|")), ":")), ncol=2, byrow=T)) -> inss
- inss$ln <- nchar(as.vector(inss$X1))
- colnames(inss) <- c("seq", "occ", "ln")
- inss$occ <- as.numeric(as.vector(inss$occ))
- #inss$pIns <- insPees[as.integer(inss$ln)]
- #inss$pVal <- ppois(as.numeric(inss$occ), inss$pIns, lower.tail=F)
- inss$pIns <- sapply(inss$ln, function(x) {sum(inss2$occ[inss2$ln == x]) / (sum(qc$nRef[sel]) + sum(qc$nVar[sel])) })
- inss$pVal <- pbinom(inss$occ, qc$dp[j], inss$pIns, lower.tail=F)
- inss$Q <- -10 * log10(inss$pVal)
-
- high <- order(inss$occ, -inss$Q, decreasing=T)[1]
- qc$ins[j] <- paste(inss$seq[high], inss$occ[high], sep=":")
- qc$qScoreI[j] <- inss$Q[high]
- pVal.I[j] <- inss$pVal[high]
-} }
- #print(sum(-10 * log10(pees) > 100))
- #print(sum(qc$qScoreI[selIns] > 100))
- #print(sum(qc$qScoreD[selDels] > 100))
-}
-
-p.adjust(qc$pVal) -> qc$pValAdj
--10 * log(p.adjust(pVal.I)) -> qc$qScoreI
--10 * log(p.adjust(pVal.D)) -> qc$qScoreD
-qc$qScore <- -10 * log10(qc$pValAdj)
-#qc$qScore[qc$pValAdj == 0] <- 1000
-qc$qScore[qc$qScore > 1000] <- 1000
-qc$qScoreI[qc$qScoreI > 1000] <- 1000
-qc$qScoreD[qc$qScoreD > 1000] <- 1000
-qsel <- (qc$qScore >= 100 & qc$nVar >= 10) | (qc$qScoreI >= 200) | (qc$qScoreD >= 200)
-dpfilt <- as.numeric(qc$dp) >= 100
-#varfilt <- as.numeric(qc$nVar) >= 10
-
-selection <- qsel & dpfilt
-
-print(paste("Filtered out: ", sum(!selection), sep=""))
-print(paste("Variants selected: ", sum(selection), sep=""))
-return(qc)
-}
-
-
-addQualVarCor <- function(qc) {
-# Corrected for variant occurence
-
-qc$del <- as.character(as.vector(qc$del))
-qc$ins <- as.vector(qc$ins)
-qc$pVal <- 1
-qc$pValAdj <- 1
-#qc$pValPois <- 1
-qc$qScore <- 0
-qc$qScoreI <- 0
-qc$qScoreD <- 0
-
-pVal.I <- rep(1, nrow(qc))
-pVal.D <- rep(1, nrow(qc))
-
-delSel <- qc$del != "."
-data.frame(matrix(unlist(strsplit(unlist(strsplit(as.vector(qc$del[delSel]), "\\|")), ":")), ncol=2, byrow=T)) -> dels
-dels$ln <- nchar(as.vector(dels$X1))
-colnames(dels) <- c("seq", "occ", "ln")
-delPees <- ppois(1:50, mean(rep(dels$ln, dels$occ)), lower.tail=F)
-
-insSel <- qc$ins != "."
-data.frame(matrix(unlist(strsplit(unlist(strsplit(as.vector(qc$ins[insSel]), "\\|")), ":")), ncol=2, byrow=T)) -> inss
-inss$ln <- nchar(as.vector(inss$X1))
-colnames(inss) <- c("seq", "occ", "ln")
-insPees <- ppois(1:50, mean(rep(dels$ln, dels$occ)), lower.tail=F)
-
-# Calculate base bias per run
-sum(qc$nA[grepl("A", qc$ntRef, ignore.case=T)]) -> refA
-sum(qc$nA[!grepl("A", qc$ntRef, ignore.case=T)]) -> varA
-sum(qc$nC[grepl("C", qc$ntRef, ignore.case=T)]) -> refC
-sum(qc$nC[!grepl("C", qc$ntRef, ignore.case=T)]) ->varC
-sum(qc$nG[grepl("G", qc$ntRef, ignore.case=T)]) -> refG
-sum(qc$nG[!grepl("G", qc$ntRef, ignore.case=T)]) -> varG
-sum(qc$nT[grepl("T", qc$ntRef, ignore.case=T)]) -> refT
-sum(qc$nT[!grepl("T", qc$ntRef, ignore.case=T)]) -> varT
-
-varA / (refA + varA) -> errA
-varC / (refC + varC) -> errC
-varG / (refG + varG) -> errG
-varT / (refT + varT) -> errT
-c(errA, errC, errG, errT) / mean(c(errA, errC, errG, errT)) -> errACGT
-
-errMat <- qc[,9:12]
-errMat[grepl("A", qc$ntRef, ignore.case=T),1] <- 0
-errMat[grepl("C", qc$ntRef, ignore.case=T),2] <- 0
-errMat[grepl("G", qc$ntRef, ignore.case=T),3] <- 0
-errMat[grepl("T", qc$ntRef, ignore.case=T),4] <- 0
-
-pCor <- rep(1, nrow(qc))
-pCor[grepl("^A", qc$ntVar, ignore.case=T)] <- errACGT[1]
-pCor[grepl("^C", qc$ntVar, ignore.case=T)] <- errACGT[2]
-pCor[grepl("^G", qc$ntVar, ignore.case=T)] <- errACGT[3]
-pCor[grepl("^T", qc$ntVar, ignore.case=T)] <- errACGT[4]
-
-# Calculate errorrate per amplicon
-amps <- unique(qc$amp)
-for (i in amps) {
- #print(i)
- sel <- qc$amp == i
- prob <- sum(qc$nVar[sel]) / (sum(qc$nRef[sel]) + sum(qc$nVar[sel]))
- #prob <- median(qc$nVar[sel] / (qc$nRef[sel] + qc$nVar[sel]))
- pees <- pbinom(qc$nVar[sel], qc$nRef[sel] + qc$nVar[sel], prob * pCor[sel], lower.tail=F)
- qc$pVal[sel] <- pees
-# pois <- ppois(qc$nVar[sel], (qc$nRef + qc$nVar) * prob, lower.tail=F)
-# qc$pValPois[sel] <- pois
-
- # Seperate for indel/deletion events
- selDels <- delSel & sel
- if (sum(selDels) > 0) {
- data.frame(matrix(unlist(strsplit(unlist(strsplit(as.vector(qc$del[selDels]), "\\|")), ":")), ncol=2, byrow=T)) -> dels2
- dels2$ln <- nchar(as.vector(dels2$X1))
- colnames(dels2) <- c("seq", "occ", "ln")
- dels2$occ <- as.numeric(as.vector(dels2$occ))
- }
- for (j in which(selDels)) {
- data.frame(matrix(unlist(strsplit(unlist(strsplit(as.vector(qc$del[j]), "\\|")), ":")), ncol=2, byrow=T)) -> dels
- dels$ln <- nchar(as.vector(dels$X1))
- colnames(dels) <- c("seq", "occ", "ln")
- dels$occ <- as.numeric(as.vector(dels$occ))
- #dels$pDel <- delPees[as.integer(dels$ln)]
- #dels$pVal <- ppois(as.numeric(dels$occ), dels$pDel, lower.tail=F)
- dels$pDel <- sapply(dels$ln, function(x) {sum(dels2$occ[dels2$ln == x]) / (sum(qc$nRef[sel]) + sum(qc$nVar[sel])) })
- dels$pVal <- pbinom(dels$occ, qc$dp[j], dels$pDel, lower.tail=F)
- dels$Q <- -10 * log10(dels$pVal)
-
- high <- order(dels$occ, -dels$Q, decreasing=T)[1]
- qc$del[j] <- paste(dels$seq[high], dels$occ[high], sep=":")
- qc$qScoreD[j] <- dels$Q[high]
- pVal.D[j] <- dels$pVal[high]
- }
- selIns <- insSel & sel
- if (sum(selIns) > 0) {
- data.frame(matrix(unlist(strsplit(unlist(strsplit(as.vector(qc$ins[selIns]), "\\|")), ":")), ncol=2, byrow=T)) -> inss2
- inss2$ln <- nchar(as.vector(inss2$X1))
- colnames(inss2) <- c("seq", "occ", "ln")
- inss2$occ <- as.numeric(as.vector(inss2$occ))
- }
- for (j in which(selIns)) {
- data.frame(matrix(unlist(strsplit(unlist(strsplit(as.vector(qc$ins[j]), "\\|")), ":")), ncol=2, byrow=T)) -> inss
- inss$ln <- nchar(as.vector(inss$X1))
- colnames(inss) <- c("seq", "occ", "ln")
- inss$occ <- as.numeric(as.vector(inss$occ))
- #inss$pIns <- insPees[as.integer(inss$ln)]
- #inss$pVal <- ppois(as.numeric(inss$occ), inss$pIns, lower.tail=F)
- inss$pIns <- sapply(inss$ln, function(x) {sum(inss2$occ[inss2$ln == x]) / (sum(qc$nRef[sel]) + sum(qc$nVar[sel])) })
- inss$pVal <- pbinom(inss$occ, qc$dp[j], inss$pIns, lower.tail=F)
- inss$Q <- -10 * log10(inss$pVal)
-
- high <- order(inss$occ, -inss$Q, decreasing=T)[1]
- qc$ins[j] <- paste(inss$seq[high], inss$occ[high], sep=":")
- qc$qScoreI[j] <- inss$Q[high]
- pVal.I[j] <- inss$pVal[high]
- }
- #print(sum(-10 * log10(pees) > 100))
- #print(sum(qc$qScoreI[selIns] > 100))
- #print(sum(qc$qScoreD[selDels] > 100))
-}
-
-p.adjust(qc$pVal) -> qc$pValAdj
--10 * log(p.adjust(pVal.I)) -> qc$qScoreI
--10 * log(p.adjust(pVal.D)) -> qc$qScoreD
-qc$qScore <- -10 * log10(qc$pValAdj)
-#qc$qScore[qc$pValAdj == 0] <- 1000
-qc$qScore[qc$qScore > 1000] <- 1000
-qc$qScoreI[qc$qScoreI > 1000] <- 1000
-qc$qScoreD[qc$qScoreD > 1000] <- 1000
-qsel <- (qc$qScore >= 100 & qc$nVar >= 10) | (qc$qScoreI >= 200) | (qc$qScoreD >= 200)
-dpfilt <- as.numeric(qc$dp) >= 100
-#varfilt <- as.numeric(qc$nVar) >= 10
-
-selection <- qsel & dpfilt
-
-print(paste("Filtered out: ", sum(!selection), sep=""))
-print(paste("Variants selected: ", sum(selection), sep=""))
-return(qc)
-}
-
-#addQual(qc) -> qc
-#write.table(qc, output, row.names=F, col.names=T, quote=F, sep="\t")
-#write.table(qc[selection,], output, row.names=F, col.names=T, quote=F, sep="\t")
-
-calcTrans <- function(qcRaw, lociFilt = NULL) {
-
-sel <- rep(T, nrow(qcRaw))
-if (!is.null(lociFilt)) {
-paste(qcRaw$X.chr, qcRaw$pos, sep=":") -> qcpos
-sel <- ! qcpos %in% lociFilt
-}
-
-qc <- qcRaw[sel,]
-
-# Calculate transition bias
-refA <- grepl("A", qc$ntRef, ignore.case=T)
-refC <- grepl("C", qc$ntRef, ignore.case=T)
-refG <- grepl("G", qc$ntRef, ignore.case=T)
-refT <- grepl("T", qc$ntRef, ignore.case=T)
-varA <- grepl("A", qc$ntVar, ignore.case=T)
-varC <- grepl("C", qc$ntVar, ignore.case=T)
-varG <- grepl("G", qc$ntVar, ignore.case=T)
-varT <- grepl("T", qc$ntVar, ignore.case=T)
-varN <- grepl(".", qc$ntVar, ignore.case=T)
-callA <- grepl("^A", qc$ntVar, ignore.case=T)
-callC <- grepl("^C", qc$ntVar, ignore.case=T)
-callG <- grepl("^G", qc$ntVar, ignore.case=T)
-callT <- grepl("^T", qc$ntVar, ignore.case=T)
-ntA <- sum(refA)
-ntC <- sum(refC)
-ntG <- sum(refG)
-ntT <- sum(refT)
-
-callList <- list(callA, callC, callG, callT)
-varList <- list(varA, varC, varG, varT)
-refList <- list(refA, refC, refG, refT)
-
-#tots <- c(
-#rep(sum(qc$nRef[refA], qc$nVar[refA]), 4),
-#rep(sum(qc$nRef[refC], qc$nVar[refC]), 4),
-#rep(sum(qc$nRef[refG], qc$nVar[refG]), 4),
-#rep(sum(qc$nRef[refT], qc$nVar[refT]), 4))
-
-nvar <- c(
-rep(sum(qc$nC[refA], qc$nG[refA], qc$nT[refA]), 4),
-rep(sum(qc$nA[refC], qc$nG[refC], qc$nT[refC]), 4),
-rep(sum(qc$nA[refG], qc$nC[refG], qc$nT[refG]), 4),
-rep(sum(qc$nA[refT], qc$nC[refT], qc$nG[refT]), 4))
-
-ncall <- c(
-rep(sum(qc$nVar[refA]), 4),
-rep(sum(qc$nVar[refC]), 4),
-rep(sum(qc$nVar[refG]), 4),
-rep(sum(qc$nVar[refT]), 4))
-
-nref <- c(
-rep(sum(qc$nRef[refA]), 4),
-rep(sum(qc$nRef[refC]), 4),
-rep(sum(qc$nRef[refG]), 4),
-rep(sum(qc$nRef[refT]), 4))
-
-tots <- nref + nvar
-
-nts <- c("A", "C", "G", "T")
-sumTrans <- data.frame(trans=paste(rep(nts, each=4), rep(nts, 4), sep=":"), npos=rep(c(ntA, ntC, ntG, ntT), each=4), cnt=rep(0, 16), tot=tots, nvar=nvar, nref=nref, ncall=ncall)
-
-for (i in 1:4) {
- pos <- (i - 1) * 4
-for (j in 1:4) {
-# print(sum(refList[[i]] & varList[[j]]))
-
- cnt <- sum(qc[refList[[i]] & varList[[j]], 8 + j])
- if (i == j) {
-# cnt <- sum(qc[refList[[i]] & varN, 8 + j])
- cnt <- NA
- }
- sumTrans$cnt[pos + j] <- cnt
- #print(cnt)
-}}
-
-#sumTrans$cnt / sumTrans$nvar / sumTrans$tot * 1000000 -> tmp
-sumTrans$cnt / sumTrans$tot * 1000000 -> tmp
-
-#totalNT <- sum(qc$nVar) + sum(qc$nRef)
-#sumTrans$cnt / median(sumTrans$cnt, na.rm=T) -> sumTrans$cor
-sumTrans$cnt / median(sumTrans$cnt, na.rm=T) -> sumTrans$cor
-tmp / median(tmp, na.rm=T) -> sumTrans$cor2
-
-sumTrans$cor[sumTrans$npos <= 15] <- 1
-sumTrans$cor2[sumTrans$npos <= 15] <- 1
-
-#sumTrans$cnt / totalNT -> sumTrans$cor
-#sumTrans$cnt / min(sumTrans$cnt, na.rm=T) -> sumTrans$cor
-
-return(sumTrans)
-
-}
-
-calcTransCor <- function(qcRaw, lociFilt = NULL) {
-
-calcTrans(qcRaw, lociFilt) -> sumTrans
-
-pCor <- rep(1, nrow(qcRaw))
-paste(qcRaw$ntRef, substr(qcRaw$ntVar, 1, 1), sep=":") -> qcTrans
-
-for (i in c("A", "C", "G", "T")) {
-for (j in c("A", "C", "G", "T")) {
- if (i != j) {
- tTrans <- paste(i, j, sep=":")
- pCor[grepl(tTrans, qcTrans, ignore.case=T)] <- sumTrans$cor2[sumTrans$trans == tTrans]
- }
-
-}}
-return(pCor)
-}
-
-
-calcTransCorOld <- function(qc) {
-# Calculate transition bias
-refA <- grepl("A", qc$ntRef, ignore.case=T)
-refC <- grepl("C", qc$ntRef, ignore.case=T)
-refG <- grepl("G", qc$ntRef, ignore.case=T)
-refT <- grepl("T", qc$ntRef, ignore.case=T)
-varA <- grepl("A", qc$ntVar, ignore.case=T)
-varC <- grepl("C", qc$ntVar, ignore.case=T)
-varG <- grepl("G", qc$ntVar, ignore.case=T)
-varT <- grepl("T", qc$ntVar, ignore.case=T)
-varN <- grepl(".", qc$ntVar, ignore.case=T)
-callA <- grepl("^A", qc$ntVar, ignore.case=T)
-callC <- grepl("^C", qc$ntVar, ignore.case=T)
-callG <- grepl("^G", qc$ntVar, ignore.case=T)
-callT <- grepl("^T", qc$ntVar, ignore.case=T)
-
-callList <- list(callA, callC, callG, callT)
-varList <- list(varA, varC, varG, varT)
-refList <- list(refA, refC, refG, refT)
-
-tots <- c(
-rep(sum(qc$nRef[refA], qc$nVar[refA]), 4),
-rep(sum(qc$nRef[refC], qc$nVar[refC]), 4),
-rep(sum(qc$nRef[refG], qc$nVar[refG]), 4),
-rep(sum(qc$nRef[refT], qc$nVar[refT]), 4))
-
-nvar <- c(
-rep(sum(qc$nVar[refA]), 4),
-rep(sum(qc$nVar[refC]), 4),
-rep(sum(qc$nVar[refG]), 4),
-rep(sum(qc$nVar[refT]), 4))
-
-nref <- c(
-rep(sum(qc$nRef[refA]), 4),
-rep(sum(qc$nRef[refC]), 4),
-rep(sum(qc$nRef[refG]), 4),
-rep(sum(qc$nRef[refT]), 4))
-
-nts <- c("A", "C", "G", "T")
-sumTrans <- data.frame(trans=paste(rep(nts, each=4), rep(nts, 4), sep=":"), cnt=rep(0, 16), tot=tots, nvar=nvar, nref=nref)
-
-for (i in 1:4) {
- pos <- (i - 1) * 4
-for (j in 1:4) {
-# print(sum(refList[[i]] & varList[[j]]))
-
- cnt <- sum(qc[refList[[i]] & varList[[j]], 8 + j])
- if (i == j) {
-# cnt <- sum(qc[refList[[i]] & varN, 8 + j])
- cnt <- NA
- }
- sumTrans$cnt[pos + j] <- cnt
- #print(cnt)
-}}
-#totalNT <- sum(qc$nVar) + sum(qc$nRef)
-sumTrans$cnt / median(sumTrans$cnt, na.rm=T) -> sumTrans$cor
-#sumTrans$cnt / totalNT -> sumTrans$cor
-#sumTrans$cnt / min(sumTrans$cnt, na.rm=T) -> sumTrans$cor
-#print(sumTrans)
-
-pCor <- rep(1, nrow(qc))
-for (i in 1:4) {
- pos <- (i - 1) * 4
-for (j in 1:4) {
- if (i != j) {
- pCor[refList[[i]] & callList[[j]]] <- sumTrans$cor[pos + j]
- }
-
-}}
-return(pCor)
-}
diff -r 13eee0859f94 -r 5287f72e0891 falco/lib/R/plotsPng.R
--- a/falco/lib/R/plotsPng.R Fri Feb 27 10:41:30 2015 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,504 +0,0 @@
-#!/usr/bin/Rscript
-
-args <- commandArgs(T)
-
-print(args[1]) # qc.ann.qual.txt
-print(args[2]) # qc2.ann.qtxt
-print(args[3]) # qc.targets.txt
-print(args[4]) # res.txt (unfiltered)
-print(args[5]) # clinvar
-print(args[6]) # lociFilt
-print(args[7]) # basename
-options(warn=1)
-print(paste("Reading", args[1], sep=" "));
-read.delim(args[1], header=T, stringsAsFactors=F) -> d
-print(paste("Reading", args[2], sep=" "));
-read.delim(args[2], header=T, stringsAsFactors=F) -> d2
-print(paste("Reading", args[3], sep=" "));
-read.delim(args[3], header=T, stringsAsFactors=F) -> qc
-print(paste("Reading", args[4], sep=" "));
-read.delim(args[4], header=T, stringsAsFactors=F, row.names=NULL) -> res
-print(paste("Reading", args[5], sep=" "));
-## clinvar is a VCF with "##" header lines to skip
-headerLineCount <- length( grep('^##', readLines(args[5])) )
-read.delim(args[5], header=T, stringsAsFactors=F, skip=headerLineCount) -> dclin
-print(paste("Reading", args[6], sep=" "));
-read.delim(args[6], header=F, stringsAsFactors=F) -> lociFilt
-base <- args[7]
-
-# Error rate per cycle
-# VAF distribution
-# varDepth distribution ?
-# Print coverage plots per amplicon
-
-tmpCP <- paste(d$X.chr, d$pos, sep=":")
-tmpClin <- paste(paste("chr", dclin[,1], sep=""), dclin[,2], sep=":")
-tmpSnp <- paste(paste("chr", lociFilt[,1], sep=""), lociFilt[,2], sep=":")
-
-inClin <- tmpCP %in% tmpClin
-
-inSnp <- tmpCP %in% tmpSnp
-
-dir.create(base)
-setwd(base)
-#unique(unlist(strsplit(d$V3, "\\|"))) -> targets
-targets <- qc[,1]
-if (1) {
- print("Plotting variant heatmap")
- # Including off targets increases the amplicon length set at median amplicon length + 10%.
- median(qc$end - qc$start) -> maxAmpLn
- maxAmpLn <- ceiling(1.2 * maxAmpLn)
- print(maxAmpLn)
- ampHeatF <- matrix(ncol=maxAmpLn, nrow=nrow(qc))
- ampHeatR <- matrix(ncol=maxAmpLn, nrow=nrow(qc))
- #dPos <- paste(d$chr, d$pos, sep=":")
- #errPos <- paste(dclin$chr, dclin$pos, sep=":")
- #dPosClin <- dPos %in% errPos
-
- dPosClin <- !inClin & !inSnp
-
- for (i in 1:nrow(qc)) {
- d$pos %in% qc$start[i]:qc$end[i] -> sel
- poss <- d$pos[sel & !dPosClin] - qc$start[i] + 1
- poss <- poss[poss <= maxAmpLn]
- ampHeatF[i, poss] <- d$nVar[sel & !dPosClin] + d$nN[sel & !dPosClin]
- ampHeatR[i, poss] <- rev(d$nVar[sel & !dPosClin] + d$nN[sel & !dPosClin])
- }
- png(paste(base,"heat","png", sep="."), width=960, height=480)
- par(mar=c(4,4,4,2) + .1)
-
- ampHeatF[is.na(ampHeatF)] <- 0
- ampHeatR[is.na(ampHeatR)] <- 0
- layout(matrix(1:2, nrow=2))
- maxN <- quantile(ampHeatF, .99)
- #heatmap(ampHeat)
- boxplot(ampHeatF, ylim=c(0, maxN), pch=20, cex=.4, main="Non reference counts R1")
- rect(150, 0, ncol(ampHeatF), maxN, col="#00000040")
- boxplot(ampHeatR, ylim=c(0, maxN), pch=20, cex=.4, main="Non reference counts R2")
- rect(150, 0, ncol(ampHeatR), maxN, col="#00000040")
- dev.off()
-
- #d[,7]/d[,4] * 100 -> d$pctREF
- #d[,8]/d[,4] * 100 -> d$pctVAR
- d$nRef/d$dp * 100 -> d$pctREF
- d$nVar/d$dp * 100 -> d$pctVAR
- d$vaf <- d$nVar / (d$nVar + d$nRef) * 100
- print("Plotting ref frequencies")
- png(paste(base,"raf","png", sep="."), width=480, height=480)
- dref <- density(d$pctREF[!dPosClin])
- #plot(dref, ylim=range(dref$y[dref$x < 80] ))
- hist(d$pctREF[!dPosClin], breaks=100, ylim=c(0, 30), col="#00000040", border=NA)
- dev.off()
-
- print("Plotting var frequencies")
- png(paste(base,"vaf","png", sep="."), width=480, height=480)
- dvar <- density(d$pctVAR[!dPosClin])
- #plot(dvar, ylim=range(dvar$y[dvar$x > 20] ))
- hist(d$pctVAR[!dPosClin], breaks=100, ylim=c(0, 30), col="#00000040", border=NA)
- dev.off()
-
- # Set vaf cutoff
- #q95 <- quantile(d$qScore, .95, na.rm=T)
- q95 <- 20
- vs95 <- sd(d$pctVAR[!dPosClin & d$qScore < q95], na.rm=T)
- v95 <- mean(d$pctVAR[!dPosClin & d$qScore < q95], na.rm=T) + vs95 * 3
-
- nCall <- sum((d$pctVAR > v95) & d$qScore > q95 & !inSnp, na.rm=T)
- print(nCall)
- print("Plotting vaf vs q")
- png(paste(base,"snv-q","png", sep="."), width=480, height=480)
- plot(d$qScore, d$pctVAR, pch=20, cex=.5, main=paste("q:", q95, "-", "v:", v95, "--", "s:", nCall, sep=" "))
- abline(h=c(1,5), v=c(20,13), lty=2)
- abline(v=q95, h=v95, lty=3, col=2)
- points(d$qScore[d$dp < 100], d$pctVAR[d$dp < 100], pch=20, cex=1.2, col=2)
- points(d$qScore[d$nVar < 10], d$pctVAR[d$nVar < 10], pch=20, cex=1, col=4)
- legend(30,80, c("pass", "dp < 100", "nVar < 10"), col=c(1,2,4), pch=20)
- dev.off()
-
- print("Plotting vaf vs q zoomed")
- png(paste(base,"snv-q-zoom","png", sep="."), width=480, height=480)
- plot(d$qScore, d$pctVAR, pch=20, cex=.5, xlim=c(0, 40),ylim=c(0,10))
- abline(h=c(1,5), v=c(20,13), lty=2)
- abline(v=q95, h=v95, lty=3, col=2)
- points(d$qScore[d$dp < 100], d$pctVAR[d$dp < 100], pch=20, cex=1.2, col=2)
- points(d$qScore[d$nVar < 10], d$pctVAR[d$nVar < 10], pch=20, cex=1, col=4)
- legend("topright", c("pass", "dp < 100", "nVar < 10"), col=c(1,2,4), pch=20)
- dev.off()
-
- print("Plotting ins vs q")
- d$vafI <- 0
- d$occI <- 0
- if (sum(d$ins != ".") > 0) {
- png(paste(base,"ins-q","png", sep="."), width=480, height=480)
- data.frame(matrix(unlist(strsplit(unlist(strsplit(as.character(d$ins[d$ins != "."]), "\\|")), ":")), ncol=2, byrow=T)) -> insDat
- colnames(insDat) <- c("seq", "occ")
- insDat$seq <- as.character(insDat$seq)
- insDat$occ <- as.numeric(as.vector(insDat$occ))
- nchar(insDat$seq) -> insDat$ln
- d$occI[d$ins != "."] <- insDat$occ
- d$vafI[d$ins != "."] <- insDat$occ / d$nRef[d$ins != "."]
- d$qScoreI[d$qScoreI > 1000] <- 1000
- plot(d$qScoreI, d$occI, pch=20, cex=1)
- abline(h=c(10), v=c(20,13), lty=2)
- # dp 500
- sel <- d$dp < 100
- points(d$qScoreI[sel], d$occI[sel], pch=20, cex=1.2, col=2)
- # nVar 10
- #sel <- d$occI < 10
- #points(d$qScoreI[sel], d$occI[sel], pch=20, cex=1, col=3)
- # vaf < 0.01
- sel <- d$vafI < 0.01
- points(d$qScoreI[sel], d$occI[sel], pch=20, cex=1, col=4)
- legend("topleft", c("pass", "dp < 100", "vaf < 0.01"), col=c(1,2,4), pch=20)
- dev.off()
- }
-
- print("Plotting del vs q")
- d$vafD <- 0
- d$occD <- 0
- if (sum(d$del != "." ) > 0) {
- png(paste(base,"del-q","png", sep="."), width=480, height=480)
- data.frame(matrix(unlist(strsplit(unlist(strsplit(as.character(d$del[d$del != "."]), "\\|")), ":")), ncol=2, byrow=T)) -> delDat
- colnames(delDat) <- c("seq", "occ")
- delDat$seq <- as.character(delDat$seq)
- delDat$occ <- as.numeric(as.vector(delDat$occ))
- nchar(delDat$seq) -> delDat$ln
- d$occD[d$del != "."] <- delDat$occ
- d$vafD[d$del != "."] <- d$occD[d$del != "."] / d$nRef[d$del != "."]
- d$qScoreD[d$qScoreD > 1000] <- 1000
- #hist(d$qScoreD, breaks=1000, ylim=c(0,100))
- #print(sum(d$qScoreD > 20))
- #Sys.sleep(1)
- plot(d$qScoreD, d$occD, pch=20, cex=1)
- abline(h=c(10), v=c(20,13), lty=2)
- # dp 500
- sel <- d$dp < 100
- points(d$qScoreD[sel], d$occD[sel], pch=20, cex=1.2, col=2)
- # nVar 10
- #sel <- d$occD < 10
- #points(d$qScoreD[sel], d$occD[sel], pch=20, cex=1, col=3)
- # vaf < 0.01
- sel <- d$vafD < 0.01
- points(d$qScoreD[sel], d$occD[sel], pch=20, cex=1, col=4)
- legend("topleft", c("pass", "dp < 100", "vaf < 0.01"), col=c(1,2,4), pch=20)
- dev.off()
- }
-
- sub("chr", "", qc$chr) -> qc$chrn
- ampOrd <- order(qc$chrn, qc$start)
-
- print("Plotting amplicon depths")
- png(paste(base,"amp-dp","png", sep="."), width=960, height=480)
- barplot(qc$depth[ampOrd])
- abline(h=100, lty=2)
- totRead <- sum(qc$depth, na.rm=T)
- legend("topright", paste("Total read count: ", totRead, sep=""))
- dev.off()
-}
-
-res$POS <- as.numeric(as.vector(res$POS))
-d$pos <- as.numeric(as.vector(d$pos))
-
-grepl("a", d$ntRef, ignore.case=T) -> Asel
-grepl("c", d$ntRef, ignore.case=T) -> Csel
-grepl("g", d$ntRef, ignore.case=T) -> Gsel
-grepl("t", d$ntRef, ignore.case=T) -> Tsel
-cntr <- 0
-
-biasTable <- data.frame(matrix(NA, nrow=length(targets), ncol=12), row.names=targets)
-colnames(biasTable) <- c("AC", "AG", "AT", "CA", "CG", "CT", "GA", "GC", "GT", "TA", "TC", "TG")
-
-for (i in targets) {
- sel <- grepl(i, d$amp)
- print(i)
- for (c in unique(d[sel,1])) {
- csel <- d[,1] == c
- print(c)
- reg <- paste(range(d[sel & csel,2]), collapse="-", sep="-")
- creg <- paste(c, reg, sep=":")
-
- qcsel <- qc[,1] == i
- allsel <- sel & csel
- clinSel <- dclin$amp == i
- clinPos <- dclin$pos[clinSel]
-# inClin <- d$pos %in% clinPos
-
- #dels <- (d[,15] != ".") & allsel
- #ins <- (d[,14] != ".") & sel
- # print(reg)
- # print(creg)
-
- # Select annotations
- # calculate quantiles and plot them
-# quantile(d$pctVAR[sel & csel], c(.75,.90,.95,.99), na.rm=T) -> cutoffs
- tmp <- d[sel & csel,]
- #dels <- tmp[,15] != "."
- #ins <- tmp[,14] != "."
- dels <- allsel & d$del != "."
- ins <- allsel & d$ins != "."
-
- resTmp <- res[(res$POS >= min(tmp[,2])) & (res$POS <= max(tmp[,2])) & (res$X.CHROM == c),]
- #resSel <- res$TARGET == i
- png(paste(base, i, "cov", "png", sep="."), width=960, height=480)
-
- xvec <- c(0, rep(1:(sum(allsel) - 1), each=2), sum(allsel))
- layout(matrix(1:2, ncol=1), heights=c(2,1))
- par(mar=c(0, 4, 4, 7) + .1, xaxt="n")
- print("Cov")
- plot(NULL, type="n", main=i, xlab="Position", ylab="Depth", xlim=range(d$pos[allsel]), ylim=range(0, d$dp[allsel]))
- xvec <- d$pos[allsel]
- yvec <- d$dp[allsel]
-
- polygon(rep(c(xvec, max(xvec) + 1), each=2) - .5, c(0, rep(yvec, each=2), 0), col="#00000010", border=1)
- polygon(rep(c(xvec, max(xvec) + 1), each=2) - .5, c(0, rep(rowSums(d[allsel, 7:8]), each=2), 0), col="#00000010")
-# polygon(rep(c(xvec, max(xvec) + 1), each=2) - .5, c(0, rep(d$nVar[allsel], each=2), 0), col=2)
- varCol <- rep(2, sum(allsel))
- varCol[grepl("^a", d$ntVar[allsel], ignore.case=T)] <- "green"
- varCol[grepl("^c", d$ntVar[allsel], ignore.case=T)] <- "blue"
- varCol[grepl("^g", d$ntVar[allsel], ignore.case=T)] <- "black"
- varCol[grepl("^t", d$ntVar[allsel], ignore.case=T)] <- "red"
- rect(xvec-.5,d$nVar[allsel],xvec+.5,0, col=varCol)
-
-
- ampMax <- max(d[allsel,4])
- #abline(v=resTmp$POS, lty=2, col=2)
- #delMark <- list()
- #insMark <- list()
-
- for (idx in which(dels)) {
- matrix(unlist(strsplit(as.character(as.vector(d[idx,15])), "[:|]")), ncol=2, byrow=T) -> delDat
- nchar(delDat[,1]) -> lngts
- dpths <- as.numeric(delDat[,2])
- x1 <- d[idx,2] + .5
- x2 <- x1 + lngts - 1
- if (x2 > max(d$pos[allsel])) {
- next
- }
- # delMark[[length(delMark) + 1]] <- c(x1, x2)
- rect(x1, ampMax - dpths , x2, ampMax, col="#00000040")
- }
-
- for (idx in which(ins)) {
- matrix(unlist(strsplit(as.character(as.vector(d[idx,14])), "[:|]")), ncol=2, byrow=T) -> insDat
- nchar(insDat[,1]) -> lngts
- dpths <- as.numeric(insDat[,2])
- x1 <- d$pos[idx] + 1
- # insMark[length(insMark) + 1] <- x1
- for (ii in 1:ncol(insDat)) {
- polygon(
- c(x1, x1 - (.5 * lngts[ii]), x1 + (.5 * lngts[ii])),
- c(ampMax, ampMax - dpths[ii], ampMax - dpths[ii]),
- , col="#00000040")
- }
- }
- par(xpd=NA)
- legend(max(d$pos[allsel]) + sum(allsel) * 0.05, ampMax, legend=c("Filtered nt", "Ref nt", "Non-ref nt"), col=c("#00000040", "#00000080", "red"), pch=15)
- legend(max(d$pos[allsel]) + sum(allsel) * 0.05, 0, legend=c("A", "C", "G", "T"), col=c("green", "blue", "black", "red"), pch=15)
- par(xpd=F)
- par(mar=c(5, 4, 0, 7) + .1, xaxt="s")
- plot(NULL, type="n", xlab="Position", ylab=NA, xlim=range(d$pos[allsel]), ylim=c(0,4.5), yaxt="n")
- axis(side=2, at=1:4, labels=c("ins", "del", "snv", "ref"), las=2)
- abline(h=1:4, col="#00000040")
- sres <- nchar(as.character(resTmp$REF)) == nchar(as.character(resTmp$ALT))
- dres <- nchar(resTmp$REF) > nchar(resTmp$ALT)
- ires <- nchar(resTmp$REF) < nchar(resTmp$ALT)
-
- resTmp$marks <- rep(3.4, nrow(resTmp))
-
- resTmp$cols <- rep("#00000040", nrow(resTmp))
- resTmp$cols[grepl("^a", resTmp$ALT, ignore.case=T) & sres] <- "green"
- resTmp$cols[grepl("^c", resTmp$ALT, ignore.case=T) & sres] <- "blue"
- resTmp$cols[grepl("^g", resTmp$ALT, ignore.case=T) & sres] <- "black"
- resTmp$cols[grepl("^t", resTmp$ALT, ignore.case=T) & sres] <- "red"
-
- #resTmp$cols[dres] <- 2
- #resTmp$cols[ires] <- 2
- resTmp$marks[dres] <- 2.4
- resTmp$marks[ires] <- 1.4
- resTmp$markP1 <- resTmp$POS -.5
- resTmp$markP2 <- resTmp$POS +.5
- resTmp$markP1[dres] <- resTmp$POS[dres] +.5
- resTmp$markP2[dres] <- resTmp$POS[dres] +.5 + nchar(resTmp$REF[dres]) - 1
- resTmp$markP1[ires] <- resTmp$POS[ires] +.5
- resTmp$markP2[ires] <- resTmp$POS[ires] +.5 + 1
- ntCol <- rep(1, sum(allsel))
- ntCol[grepl("a", d$ntRef[allsel], ignore.case=T)] <- "green"
- ntCol[grepl("c", d$ntRef[allsel], ignore.case=T)] <- "blue"
- ntCol[grepl("g", d$ntRef[allsel], ignore.case=T)] <- "black"
- ntCol[grepl("t", d$ntRef[allsel], ignore.case=T)] <- "red"
-
- rect(d$pos[allsel] -.5, 4.4, d$pos[allsel] +.5, 3.6, col=ntCol, border=0)
- rect(resTmp$markP1, resTmp$marks, resTmp$markP2, resTmp$marks - .8, col=resTmp$cols, border=0)
-
- if(length(clinPos != 0)) {
- rect(clinPos - .5, 3.5, clinPos + .5, .5, border="black", lwd=2)
- }
- #lapply(delMark, function(x) { rect(x[1], 2, x[2], 1, col=2, border=NA) })
- #lapply(insMark, function(x) { rect(x[1], 1, x[1] + 1, 0, col=3, border=NA) })
-
- # Highlight clinically relevant loci
- #for (i in which(dclin$amp == i)) {
- # if (dclin$REF[i] == nchar
- #}
-
- dev.off()
-
- sansClin <- allsel & !inClin
- sansSnp <- allsel & !inSnp
-
- d$nA[sansClin & sansSnp & Asel] -> AA
- d$nC[sansClin & sansSnp & Asel] -> AC
- d$nG[sansClin & sansSnp & Asel] -> AG
- d$nT[sansClin & sansSnp & Asel] -> AT
-
- d$nA[sansClin & sansSnp & Csel] -> CA
- d$nC[sansClin & sansSnp & Csel] -> CC
- d$nG[sansClin & sansSnp & Csel] -> CG
- d$nT[sansClin & sansSnp & Csel] -> CT
-
- d$nA[sansClin & sansSnp & Gsel] -> GA
- d$nC[sansClin & sansSnp & Gsel] -> GC
- d$nG[sansClin & sansSnp & Gsel] -> GG
- d$nT[sansClin & sansSnp & Gsel] -> GT
-
- d$nA[sansClin & sansSnp & Tsel] -> TA
- d$nC[sansClin & sansSnp & Tsel] -> TC
- d$nG[sansClin & sansSnp & Tsel] -> TG
- d$nT[sansClin & sansSnp & Tsel] -> TT
-
- print("Bias")
- png(paste(base, i, "bias", "png", sep="."), width=960, height=480)
-
- Acnt <- sum(sum(AA), sum(AC), sum(AG), sum(AT))
- Ccnt <- sum(sum(CA), sum(CC), sum(CG), sum(CT))
- Gcnt <- sum(sum(GA), sum(GC), sum(GG), sum(GT))
- Tcnt <- sum(sum(TA), sum(TC), sum(TG), sum(TT))
-
- NTsums <- c(
- sum(AC)/Acnt, sum(AG)/Acnt, sum(AT)/Acnt,
- sum(CA)/Ccnt, sum(CG)/Ccnt, sum(CT)/Ccnt,
- sum(GA)/Gcnt, sum(GC)/Gcnt, sum(GT)/Gcnt,
- sum(TA)/Tcnt, sum(TC)/Tcnt, sum(TG)/Tcnt)
- NTsums[!is.finite(NTsums)] <- 0
- NTsums[is.nan(NTsums)] <- 0
- NTsums[is.na(NTsums)] <- 0
-
- barplot(NTsums, names.arg=c("AC", "AG", "AT", "CA", "CG", "CT", "GA", "GC", "GT", "TA", "TC", "TG"), main=i)
- dev.off()
- biasTable[rownames(biasTable) == i, ] <- NTsums
-
- print("Cov2")
- png(paste(base, i, "cov2", "png", sep="."), width=960, height=480)
- layout(matrix(1:2, ncol=1), heights=c(2,1))
- par(mar=c(0, 4, 4, 7) + .1, xaxt="n")
-
-
- dat <- d2[d2$amp == i,]
- rled <- rle(sort(dat$pos))
- rds <- sort(unique(dat$X.read))
- lst <- c()
- for (ii in order(rled$lengths, decreasing=T))
- {
-
- rled$values[ii] -> pos
- dat$X.read[dat$pos == pos]
- lst <- c(lst, rds %in% dat$X.read[dat$pos == pos])
-
- }
- #xlim <- range(dat$pos)
- xlim=range(d$pos[allsel])
- ylim=range(0, d$dp[allsel])
- plot(NULL, xlim=xlim, ylim=ylim, main=i, ylab="Depth")
- rect(xlim[1] - 0.5, 0, xlim[2] + .5, ylim[2], col="#00000010")
- if (length(lst) > 0) {
- matrix(lst, nrow=length(rds), byrow=F) -> mat
- do.call( order, data.frame(mat) ) -> ord
- for (e in 1:length(ord)) {
- sel <- dat$X.read == rds[ord[e]]
- colvec <- rep(1, sum(sel))
- colvec[grepl("A$", dat$event[sel], ignore.case=T)] <- "green"
- colvec[grepl("C$", dat$event[sel], ignore.case=T)] <- "blue"
- colvec[grepl("G$", dat$event[sel], ignore.case=T)] <- "black"
- colvec[grepl("T$", dat$event[sel], ignore.case=T)] <- "red"
- #points(dat$pos[ sel ], rep(e, sum(sel)), col=colvec, pch=20, cex=.2)
- segments(dat$pos[ sel ] - .5, rep(e, sum(sel)), dat$pos[ sel ] + .5, rep(e, sum(sel)), col=colvec, pch=20, cex=.2)
- }
- }
-
- par(xpd=NA)
- legend(max(d$pos[allsel]) + sum(allsel) * 0.05, ampMax, legend=c("Filtered nt", "Ref nt", "Non-ref nt"), col=c("#00000040", "#00000080", "red"), pch=15)
- legend(max(d$pos[allsel]) + sum(allsel) * 0.05, 0, legend=c("A", "C", "G", "T"), col=c("green", "blue", "black", "red"), pch=15)
- par(xpd=F)
- par(mar=c(5, 4, 0, 7) + .1, xaxt="s")
- plot(NULL, type="n", xlab="Position", ylab=NA, xlim=range(d$pos[allsel]), ylim=c(0,4.5), yaxt="n")
- axis(side=2, at=1:4, labels=c("ins", "del", "snv", "ref"), las=2)
- abline(h=1:4, col="#00000040")
- sres <- nchar(as.character(resTmp$REF)) == nchar(as.character(resTmp$ALT))
- dres <- nchar(resTmp$REF) > nchar(resTmp$ALT)
- ires <- nchar(resTmp$REF) < nchar(resTmp$ALT)
-
- resTmp$marks <- rep(3.4, nrow(resTmp))
-
- resTmp$cols <- rep("#00000040", nrow(resTmp))
- resTmp$cols[grepl("^a", resTmp$ALT, ignore.case=T) & sres] <- "green"
- resTmp$cols[grepl("^c", resTmp$ALT, ignore.case=T) & sres] <- "blue"
- resTmp$cols[grepl("^g", resTmp$ALT, ignore.case=T) & sres] <- "black"
- resTmp$cols[grepl("^t", resTmp$ALT, ignore.case=T) & sres] <- "red"
-
- #resTmp$cols[dres] <- 2
- #resTmp$cols[ires] <- 2
- resTmp$marks[dres] <- 2.4
- resTmp$marks[ires] <- 1.4
- resTmp$markP1 <- resTmp$POS -.5
- resTmp$markP2 <- resTmp$POS +.5
- resTmp$markP1[dres] <- resTmp$POS[dres] +.5
- resTmp$markP2[dres] <- resTmp$POS[dres] +.5 + nchar(resTmp$REF[dres]) - 1
- resTmp$markP1[ires] <- resTmp$POS[ires] +.5
- resTmp$markP2[ires] <- resTmp$POS[ires] +.5 + 1
- ntCol <- rep(1, sum(allsel))
- ntCol[grepl("a", d$ntRef[allsel], ignore.case=T)] <- "green"
- ntCol[grepl("c", d$ntRef[allsel], ignore.case=T)] <- "blue"
- ntCol[grepl("g", d$ntRef[allsel], ignore.case=T)] <- "black"
- ntCol[grepl("t", d$ntRef[allsel], ignore.case=T)] <- "red"
-
- rect(d$pos[allsel] -.5, 4.4, d$pos[allsel] +.5, 3.6, col=ntCol, border=0)
- rect(resTmp$markP1, resTmp$marks, resTmp$markP2, resTmp$marks - .8, col=resTmp$cols, border=0)
-
- if(length(clinPos != 0)) {
- rect(clinPos - .5, 3.5, clinPos + .5, .5, border="black", lwd=2)
- }
-
-
- dev.off()
-
-# png(paste(base, i, "vaf", "png", sep="."), width=960, height=480)
-# plot(d[sel & csel,2], d$vaf[sel & csel], type="l", ylim=c(0,100), main=paste(i, creg, cutoffs[3], sep=" "), xlab="position", ylab="%", col=2)
- #lines(d[sel & csel,2], d$pctVAR[sel & csel], type="l", col=2)
-# abline(h=100)
-# abline(h=c(1,5,10,50,90,95,99), lty=3, col=1)
-# abline(h=cutoffs[3], lty=4, col=2)
-# abline(v=c(limP[1], limP[1] + 150, limP[2] - 150, limP[2]),lty=4)
-# abline(v=c(qc[qcsel,8:9]), lty=5, col=2)
-# if (length(labs != 0)) {
-# text(resTmp$POS[resTmp$vaf > (cutoffs[3]/100)], 50, labels=resTmp$Amino_Acid_change[resTmp$vaf > (cutoffs[3]/100)])
-# abline(v=resTmp$POS[resTmp$vaf > (cutoffs[3]/100)], col="grey", lty=4)
-# }
-# dev.off()
- }
-}
-
-png(paste(base, "biasheat", "png", sep="."), width=960, height=960)
-write.table(biasTable, paste(base, "bias", "tsv", sep="."), sep="\t")
-biasTable[is.na(biasTable)] <- 0
-heatmap(data.matrix(biasTable), Colv=NA, scale="r")
-dev.off()
-png(paste(base, "bias", "png", sep="."), width=960, height=480)
-boxplot(data.matrix(biasTable), pch=20, cex=.5)
-dev.off()
-warnings()
-#hist(d$pctVAR[d$pctVAR < 20], breaks=40, ylim=c(0,1000))
-#hist(d$pctREF[d$pctREF > 80], breaks=40, ylim=c(0,1000))
-
-#print(summary(d$pctVAR))
-
-
diff -r 13eee0859f94 -r 5287f72e0891 falco/lib/perl/filter.pl
--- a/falco/lib/perl/filter.pl Fri Feb 27 10:41:30 2015 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,133 +0,0 @@
-#!/usr/bin/perl -w
-
-use strict;
-
-my $file = shift;
-my $canonicals = shift;
-my $clinvar = shift;
-my $cosmic = shift;
-my $cosmicNC = shift;
-
-my $minDP = 100;
-my $minVAR = 10;
-my $minVAF = 0.01;
-
-my %can = ();
-my %nm = ();
-my %cvHsh = ();
-
-print STDERR "Reading in canonicals,...";
-open CAN, "<$canonicals";
-while () {
- chomp;
- my @row = split(/\t/, $_);
- $can{$row[0]} = @row[1 .. $#row];
- my $pri = 0;
- $nm{$_} = $pri++ foreach @row[1 .. $#row];
-}
-close CAN;
-
-print STDERR "Reading clinvars...\n";
-open CV, "<$clinvar";
-while () { last if (/#CHROM/); }
-while () {
- chomp;
- my @row = split(/\t/, $_);
- push @{$cvHsh{"chr".$row[0].":".$row[1]}}, [@row];
-}
-close CV;
-
-print STDERR "Reading COSMIC...\n";
-open CV, "<$cosmic";
-while () { last if (/#CHROM/); }
-while () {
- chomp;
- my @row = split(/\t/, $_);
- push @{$cvHsh{"chr".$row[0].":".$row[1]}}, [@row];
-}
-close CV;
-
-print STDERR "Reading COSMICNC...\n";
-open CV, "<$cosmicNC";
-while () { last if (/#CHROM/); }
-while () {
- chomp;
- my @row = split(/\t/, $_);
- push @{$cvHsh{"chr".$row[0].":".$row[1]}}, [@row];
-}
-
-close CV;
-print STDERR "done!\n";
-
-print STDERR "Processing $file\n";
-
-open F, "<$file";
-my $head = readline(F);
-chomp $head;
-
-print $head . "\tvaf\n";
-
-my @rowHead = split(/\t/, $head);
-my $colN = 0;
-my %col = map {
- $_ => $colN++ } @rowHead;
-
-while () {
- chomp;
- my $line = $_;
- my @row = split(/\t/, $_);
- next unless (exists($can{$row[$col{Gene_Name}]}));
- my $trans = $row[$col{Transcript_ID}];
- $trans =~ s/\..*$//;
-
- ## impossible to switch off these two transcript filters
- ## so commented out
- # if (not exists($nm{$trans})) {
- # print STDERR "Unknown transcript: $row[$col{Gene_Name}] $trans\n";
- # next;
- # }
- # elsif ($nm{$trans} != 0) {
- # print STDERR "Non cannonical: $row[$col{Gene_Name}] $trans\n";
- # next;
- # }
-
- # Annotate clinincal variants and cosmic mutations
-
- my $chr = $row[$col{"#CHROM"}];
- my $pos = $row[$col{POS}];
-
- if (exists($cvHsh{"$chr:$pos"})) {
- my $rows = $cvHsh{"$chr:$pos"};
- my $cv = join("|", map { $_->[2] } @$rows);
- $row[2] .= "|$cv";
- }
-
- # Filter INTERGENIC
- next if ($row[$col{Context}] eq "INTERGENIC");
-
- # Filter SYNONYMOUS_CODING
- # Dubbel mutatie BRAF
-# next if ($row[$col{Context}] eq "SYNONYMOUS_CODING");
-
- # Filter INTRON
- next if ($row[$col{Context}] eq "INTRON");
-
- # Filter dbSNP
-# next if ($row[2] ne ".");
-
- # Filter depth
- next if ($row[$col{DP}] < $minDP);
-
- my ($nref, $nvar) = split(/,/, $row[$col{AD}]);
- # Filter var depth
-# print STDERR "DEBUG: " . $nvar . "\n"; sleep 1;
- next if ($nvar < $minVAR);
-
- my $vaf = $nvar / ($nref + $nvar);
-# print STDERR "DEBUG: " . $vaf . "\n";
- # Filter vaf
- next if ($vaf < $minVAF);
-
- print join("\t", @row, $vaf) . "\n";
-}
-close F;
diff -r 13eee0859f94 -r 5287f72e0891 falco/lib/perl/mkHtmlReport.pl
--- a/falco/lib/perl/mkHtmlReport.pl Fri Feb 27 10:41:30 2015 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,234 +0,0 @@
-#!/usr/bin/perl -w
-
-use strict;
-use Cwd;
-use Spreadsheet::WriteExcel;
-
-$| = 1;
-my $dir = shift || "./";
-my $outdir = shift || "./";
-my $pat = shift || "";
-my $cwd = cwd();
-(my $runName = $cwd) =~ s/^.*\/(.*?)$/$1/;
-
-# QC
-# Results
-my @samples = ();
-#open INDEX, ">$outdir/index.html";
-my $htmlHead = qq(
-
-
-
-
-
-
-);
-#print INDEX $htmlHead;
-opendir DIR, "$dir";
-while (my $cd = readdir DIR) {
- if ($cd =~ /(.*$pat)\.qc\.targets\.txt$/) {
- my $sam = $1;
-# next if ($sam =~ /R[12]/);
- print STDERR $1 . "\n";
- push @samples, $1;
- }
-}
-close DIR;
-
-#print INDEX "";
-#print INDEX "Download: | runQC.xls | ";
-#print INDEX "
---|
Sample | BAM | snp | indel | readCnt | Amp > 100 | \n";
-#foreach my $sam (sort @samples) {
-# print INDEX "
---|
$sam | BAM | BAI |
\n";
-
-#}
-my %link = ();
-my $excelBook0 = Spreadsheet::WriteExcel->new("$outdir/runQC.xls");
-my $excel0 = $excelBook0->add_worksheet("table1");
-my $excel0Ref = [[qw/sampleName runName totalReads pct100 ntbGenes/]];
-
-
-foreach my $sample (sort @samples) {
- print STDERR "Processing $sample\n";
-# next if ($sample =~ /R[12]/);
-
- my $readCnt = 0;
- my $amp100 = 0;
- my %ntbGenes = ();
-
- open OUT, ">$outdir/$sample.html";
- open OUT2, ">$outdir/$sample.tsv";
- my $excelBook = Spreadsheet::WriteExcel->new("$outdir/$sample.xls");
- my $excel1 = $excelBook->add_worksheet("table1");
- my $excel2 = $excelBook->add_worksheet("table2");
- print OUT $htmlHead;
- my %QC = ();
- open QC, "<$dir/$sample.qc.targets.txt";
- readline QC;
- print STDERR "Reading in $sample.qc.targets.txt\n";
- while () {
- chomp;
- my @row = split(/\t/, $_);
- my @id = split(/[\_\.\-:]/, $row[0]);
- $row[-1] = 0 if ($row[-1] eq "NA");
- $readCnt += $row[-1]; # DP
- if ($#id != 10) {
- $id[0] =~ /(\D+)(\d+)/;
- $id[0] = $2;
- unshift @id, $1;
- }
-
- if ($row[-1] >= 100) {
- $amp100++
- }
- else {
- $ntbGenes{$row[0]}{dp} = $row[-1];
- $ntbGenes{$row[0]}{id} = [@id];
- }
-
- $QC{$row[0]}{QC} = [@id, @row];# if ($id[0]);
- foreach my $c ($row[4] .. $row[5]) {
- $link{$id[-3] . ":" . $c}{$row[0]} = "Assay";
- }
- foreach my $c ($row[2] .. $row[4], $row[5] .. $row[3]) {
- $link{$id[-3] . ":" . $c}{$row[0]} = "LSO";
- }
- }
- close QC;
-
- open RES, "<$dir/$sample.res.filtered.tsv" or die "Unable to open $dir/$sample\n";
- my %uniq = ();
- my $colCnt = 0;
- my $resHead = readline(RES);
- chomp $resHead;
- $resHead =~ s/^#//;
- $resHead =~ s/\s+$//;
- my %resCol = map { $_ => $colCnt++ } split(/\t/, $resHead);
- my @keyColsN = qw/QUAL Gene_Name Codon_Change Amino_Acid_change vaf DP AD Tag CHROM POS ID REF ALT Context Effect_Impact Functional_Class Amino_Acid_length Gene_Coding Transcript_ID Exon_Rank/;
-# my @keyColsN = qw/CHROM POS ID REF ALT QUAL DP AD vaf Context Effect_Impact Functional_Class Codon_Change Amino_Acid_change Amino_Acid_length Gene_Name Coding Transcript Exon Tag/;
- my @keyColsI = map { $resCol{$_} } @keyColsN;
- foreach my $i (0 .. $#keyColsN) {
- print STDERR join(":", $i, $keyColsN[$i], $keyColsI[$i]) . "\n";
- }
-
- print STDERR "Processing results\n";
- while () {
- chomp;
- my @row = split(/\t/, $_);
- my $cpos = join(":", @row[0, 1]);
- if (exists $link{$cpos}) {
-# my $key = join(":", @row[0 .. 4,6, 9,10,11,12,14 .. 21]);
- my $key = join(":", @row[@keyColsI]);
- #print STDERR join(":", @keyColsI) . "\n";
-# print STDERR join(":", @row) . "\n"; sleep 1;
- if (not exists($uniq{$key})) {
- foreach my $locus (keys(%{$link{$cpos}})) {
- next if ($link{$cpos}{$locus} eq "LSO");
- push @{$QC{$locus}{RES}}, [@row, $link{$cpos}{$locus}];
- # print STDERR "Adding $key to $locus\n\n"; #sleep 1;
- }
- $uniq{$key} = 0;
- }
- else {
- # print STDERR $key . " : Exists\n\n"; #sleep 1;
- }
- }
- }
- close RES;
-
- ## Rplots that are not self-explanatory enough
- #print OUT "
";
- #print OUT "
";
- #print OUT "
";
- #print OUT "
";
- #print OUT "
";
- #print OUT "
";
-
- print OUT "
";
- #print OUT "
";
- #print OUT "
";
- #print OUT "
";
- #print OUT "
";
- print OUT "Download: | TSV | ";
- print OUT "XLS |
---|
";
- print OUT "\n";
-# my @colnames = qw/depth chr pos id ref var qual DP AD vaf context impact effectClass codonChange AAChange RefSeqLength geneName RefSeqClass RefSeqID Exon Tag/;
- my @colnames = ("depth", @keyColsN);
- print OUT "" . join("", map { "$_ | " } ("Amplicon", "c","c2", "b", @colnames)) . "
";
- print OUT2 join("\t", "Amplicon", @colnames) . "\n";
- my $excelAref = [["Amplicon", @colnames]];
- my $excelAref2 = [[qw/gene exon protein vaf func depth /]];
- my %rescnt = ();
- foreach my $locus (sort keys(%QC)) {
- # my @targets = keys(%{$QC{$locus}{QC}});
-# print OUT "";
- my $nres = 1;
- $nres = scalar(@{$QC{$locus}{RES}}) if ($QC{$locus}{RES});
-
- print OUT "$locus | \n";
- print OUT "c | \n";
- print OUT "c2 | \n";
- print OUT "b | \n";
- print OUT "$QC{$locus}{QC}->[-1] | \n"; #";
-
- foreach my $res (@{$QC{$locus}{RES}}) {
- $res = [map {$_ || ""} @$res];
- print OUT " | \n";
- print OUT join(" | \n", @{$res}[@keyColsI]);
- print OUT " |
\n";
- print OUT2 join("\t", $locus, $QC{$locus}{QC}->[-1], @{$res}[@keyColsI]) . "\n";
- push @$excelAref, [$locus, $QC{$locus}{QC}->[-1], @{$res}[@keyColsI]];
-
- # qw/QUAL Gene_Name Codon_Change Amino_Acid_change vaf DP AD Tag CHROM POS ID REF ALT Context Effect_Impact Functional_Class Amino_Acid_length Coding Transcript Exon/;
- push @$excelAref2, [map {$_ || "NA"} @{$res}[map { $resCol{$_} } (qw/Gene_Name Exon_Rank Amino_Acid_change vaf Functional_Class/)], $QC{$locus}{QC}->[-1]];
-# push @$excelAref2, [map {$_ || "NA"} @{$res}[[qw/Gene_Name Exon_Rank Cdna_change Amino_Acid_change vaf Functional_Class/]], $QC{$locus}{QC}->[-1]];
- my $pl = $res->[$keyColsI[7]];
- my $ref = $res->[$keyColsI[11]];
- my $var = $res->[$keyColsI[12]];
-# print STDERR "$pl $ref $var\n"; sleep 1;
- if (length($ref) == length($var)) {
- $rescnt{$pl . "snp"}++;
- }
- else {
- $rescnt{$pl . "indel"}++;
- }
-# $rescnt++;
- }
- print STDERR $locus . ":" . $nres . "\n";
- print STDERR $locus . ":" . join("-", @{$QC{$locus}{RES}}) . "\n";
- if (scalar(@{$QC{$locus}{RES}}) == 0) { #$nres == 0) {
- print OUT2 join("\t", $locus, $QC{$locus}{QC}->[-1], ("-") x scalar(@keyColsI)) . "\n";
- push @$excelAref, [$locus, $QC{$locus}{QC}->[-1], ("-") x scalar(@keyColsI)];
- }
- print OUT "
\n";
- }
- print OUT "
\n";
- print OUT "\n";
- close OUT;
- close OUT2;
- $excel1->write_col(0,0, $excelAref);
- $excel2->write_col(0,0, $excelAref2);
- $excelBook->close();
- my @resCnt = map { $rescnt{$_} || 0 } qw/Falcosnp Falcoindel/;
- my $pctGood = sprintf("%.2f", $amp100 / scalar(keys(%QC)) * 100);
- #print INDEX "$sample | BAM | $resCnt[0] | $resCnt[1] | $readCnt | $pctGood |
\n";
- my $ntbsAmps = join(",", map { s/^(.*?)\.chr.*$/$1/; $_; } keys(%ntbGenes));
- push @{$excel0Ref}, [$sample, $runName, $readCnt, $pctGood, $ntbsAmps];
-}
-
-$excel0->write_col(0,0, $excel0Ref);
-$excelBook0->close();
-
-#print INDEX "
";
-#print INDEX "
";
-#print INDEX "
";
-#print INDEX "
";
-#print INDEX "