# HG changeset patch
# User stef
# Date 1426856297 14400
# Node ID ebb4c3b8b9db23c4bfc26f78ed21f02dc6faa8e9
# Parent 5287f72e08916f8d4a07f73e52c85810e34431e7
Uploaded
diff -r 5287f72e0891 -r ebb4c3b8b9db README.rst
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/README.rst Fri Mar 20 08:58:17 2015 -0400
@@ -0,0 +1,24 @@
+FALCO Amplicon Analysis Pipeline & Galaxy wrapper
+========================================
+
+FALCO variant-caller is part of the Amplicon Analysis Pipeline (AAP).
+
+The typical workflow is as follows:
+* paired-end amplicon sequencing
+* merge pairs (so only overlapping pairs are included)
+* map the single read fastq with BWA
+* perform variant calling with FALCO
+* create (html) report of the results
+
+FALCO uses samtools and straight-forward statistics to determine wether a
+potential variant is likely a (technical) artifact or not.
+
+Input / Output
+
+Input of the caller is a BAM file, output VCF
+
+
+Bug Reports & other questions
+=============================
+
+Issues can be reported via http://www.tgac.nl/
diff -r 5287f72e0891 -r ebb4c3b8b9db falco-call.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/falco-call.sh Fri Mar 20 08:58:17 2015 -0400
@@ -0,0 +1,123 @@
+#!/bin/bash
+TOOLDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+
+echo "Started FALCO calling"
+
+## ----------
+## Variables setup ($1 contains the bash config file path)
+## ----------
+source $1
+
+## ----------
+## make sure all is ok
+## ----------
+if [ ! -f $REF_FILE".fai" ]
+then
+ echo "No FAI index (fai) found for reference fasta [$REF_FILE]"
+ exit "No FAI index (fai) found for reference fasta [$REF_FILE]"
+fi
+
+## ----------
+## set params
+## ----------
+if [[ $filter_file != 'None' && $filter_file != '' ]] # Galaxy default is "None" for some reason
+then
+ FILTER_PARAM=" --filter "$filter_file
+else
+ FILTER_PARAM=""
+fi
+
+if [[ $manifest_file != 'None' && $manifest_file != 'None' ]] # Galaxy default is "None" for some reason
+then
+ MANIFEST_PARAM=" --manifest "$manifest_file
+else
+ MANIFEST_PARAM=""
+fi
+
+## name of file in galaxy not always set so will use a user-set job_name instead
+#bam_base=`echo $bam_name | sed 's#.bam$##' - `
+bam_base=$job_name
+
+## ----------
+## Status / debug
+## ----------
+DEBUG=1
+if [ $DEBUG ]
+then
+ DBS="[INFO] "
+ echo $DBS"FILTER: "$filter_file
+ echo $DBS"MANIFEST: "$manifest_file
+ echo $DBS"REF FILE: "$REF_FILE
+ echo $DBS"DB KEY: "$DB_KEY
+ echo $DBS"REF SRC: "$REF_SOURCE
+ echo $DBS"BAM FILE: "$bam_file
+ echo $DBS"BAM NAME: "$bam_name
+ echo $DBS"BAM BASE: "$bam_base
+ echo $DBS"OUT PATH: "$out_path
+fi
+
+## ----------
+## create output files dir
+## ----------
+mkdir $out_path
+
+## ----------
+## running analysis
+## ----------
+echo "[INFO] Starting variant calling"
+## NOTE: if $FILTER_PARAM is set it includes the param name (--filter)
+## NOTE: if $MANIFEST_PARAM is set it includes the param name (--manifest)
+CALL_STRING="$TOOLDIR/falco/bin/falco --bam $bam_file --output $bam_base --ref $REF_FILE $FILTER_PARAM $MANIFEST_PARAM"
+echo "[INFO] "$CALL_STRING
+perl $CALL_STRING
+echo "[INFO] done with variant calling"
+
+
+## ----------
+## create index html for main galaxy output
+## ----------
+echo "" >> $html_out
+echo "" >> $html_out
+echo "
" >> $html_out
+echo "" >> $html_out
+echo "" >> $html_out
+echo "" >> $html_out
+echo " FALCO
" >> $html_out
+echo " This page is way to get output files that are not implemented in galaxy history, it is not intended to be a user-friendly way of displaying anything ;)
" >> $html_out
+#echo " HTML" >> $html_out
+echo " " >> $html_out
+for file in *.vcf *.txt *stderr *stdout
+#for file in *
+do
+ lineCount=`wc -l $file | cut -f 1 -d " "`
+ echo " $file has $lineCount lines |
" >> $html_out
+ echo " --> " `head -1 $file` " |
" >> $html_out
+done
+echo "
" >> $html_out
+echo "" >> $html_out
+echo "" >> $html_out
+
+## ----------
+## creating galaxy history outputs
+## ----------
+#cp 'index.html' $html_out # this is the overview of samples html
+#cp $bam_base'.html' $out_path/'out.html' # this is the sample html
+cp $bam_base'.falco.vcf' $vcf_out
+cp $bam_base'.qc.ann.qual.txt' $qc_ann_qual_out
+cp $bam_base'.qc2.ann.txt' $qc2_ann_txt_out
+cp $bam_base'.qc.targets.txt' $qc_targets_txt_out
+
+## ----------
+## copy files to keep to output path
+## ----------
+#cp -r ./$bam_base/*png $out_path/$bam_base/
+#cp -r ./* $out_path
+cp *.vcf $out_path; cp *.txt $out_path; cp *_std* $out_path
+
+## ----------
+echo "END falco sh"
+exit 0
diff -r 5287f72e0891 -r ebb4c3b8b9db falco-filter-report.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/falco-filter-report.sh Fri Mar 20 08:58:17 2015 -0400
@@ -0,0 +1,120 @@
+#!/bin/bash
+TOOLDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+
+echo "Started FALCO calling"
+
+## ----------
+## Variables setup ($1 contains the bash config file path)
+## ----------
+source $1
+
+## ----------
+## make sure all is ok
+## ----------
+#if [ ! -f $REF_FILE".fai" ]
+#then
+# echo "No FAI index (fai) found for reference fasta [$REF_FILE]"
+# exit "No FAI index (fai) found for reference fasta [$REF_FILE]"
+#fi
+
+## ----------
+## set params
+## ----------
+if [[ $vcf2_file != 'None' && $vcf2_file != '' ]] # Galaxy default is "None" for some reason
+then
+ VCF2_PARAM=" --vcfOther "$vcf2_file
+else
+ VCF2_PARAM=""
+fi
+
+# if [[ $manifest_file != 'None' && $manifest_file != 'None' ]] # Galaxy default is "None" for some reason
+# then
+# MANIFEST_PARAM=" --manifest "$manifest_file
+# else
+# MANIFEST_PARAM=""
+# fi
+
+## name of file in galaxy not always set so will use a user-set job_name instead
+#bam_base=`echo $bam_name | sed 's#.bam$##' - `
+vcf_base=$job_name
+sample_html_file=$vcf_base'.html'
+
+## ----------
+## Status / debug
+## ----------
+DEBUG=1
+# if [ $DEBUG ]
+# then
+# DBS="[INFO] "
+# echo $DBS"FILTER: "$filter_file
+# echo $DBS"MANIFEST: "$manifest_file
+# echo $DBS"REF FILE: "$REF_FILE
+# echo $DBS"DB KEY: "$DB_KEY
+# echo $DBS"REF SRC: "$REF_SOURCE
+# echo $DBS"BAM FILE: "$bam_file
+# echo $DBS"BAM NAME: "$bam_name
+# echo $DBS"BAM BASE: "$bam_base
+# echo $DBS"OUT PATH: "$out_path
+# fi
+
+## ----------
+## create output files dir
+## ----------
+mkdir $out_path
+
+## ----------
+## running analysis
+## ----------
+echo "[INFO] Starting FALCO reporting"
+CMD_STRING="$TOOLDIR/falco/bin/falco-filter-report --vcf $vcf_file --output $vcf_base --qc_ann_qual_txt $qc_ann_qual_file --qc2_ann_txt $qc2_ann_txt_file --qc_targets_txt $qc_targets_txt_file $VCF2_PARAM"
+echo "[INFO] "$CMD_STRING
+perl $CMD_STRING
+echo "[INFO] done with FALCO reporting"
+
+
+## ----------
+## create index html for main galaxy output
+## ----------
+echo "" >> $html_out
+echo "" >> $html_out
+echo "" >> $html_out
+echo "" >> $html_out
+echo "" >> $html_out
+echo "" >> $html_out
+echo " FALCO
" >> $html_out
+echo " RESULTS" >> $html_out
+#echo " This page is way to get output files that are not implemented in galaxy history, it is not intended to be a user-friendly way of displaying anything ;)
" >> $html_out
+#echo " HTML" >> $html_out
+#echo " " >> $html_out
+#for file in *.vcf *.txt *stderr *stdout
+#for file in *
+#do
+# lineCount=`wc -l $file | cut -f 1 -d " "`
+# echo " $file has $lineCount lines |
" >> $html_out
+# echo " --> " `head -1 $file` " |
" >> $html_out
+#done
+#echo "
" >> $html_out
+echo "" >> $html_out
+echo "" >> $html_out
+
+## ----------
+## creating galaxy history outputs
+## ----------
+#cp 'index.html' $html_out # this is the overview of samples html
+cp $sample_html_file $out_path # this is the sample html
+#cp $vcf_base'.html' $html_out # this is the sample html
+
+## ----------
+## copy files to keep to output path
+## ----------
+#cp -r ./$bam_base/*png $out_path/$bam_base/
+cp -r ./* $out_path
+#cp *.vcf $out_path; cp *.txt $out_path; cp *_std* $out_path
+
+## ----------
+echo "END falco sh"
+exit 0
diff -r 5287f72e0891 -r ebb4c3b8b9db falco/README
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/falco/README Fri Mar 20 08:58:17 2015 -0400
@@ -0,0 +1,33 @@
+Usage:
+
+falco
+
+input:
+- bam file
+- manifest file
+- ref + fai
+- locifilt
+- manifest
+
+output:
+- *.vcf file
+- *.qc.ann.txt -> catalog of each assayed bp
+- *.qc2.ann.txt -> catalog of transitions
+- *.qc.targets.txt -> amplicon centered information, eg depth, start/end positions.
+
+example:
+
+OUTBASE="HCT116-TEST"
+
+## variant calling
+falco --bam data/HCT-116/HCT116.bam --output $OUTBASE --ref genome.fa
+
+file1=$OUTBASE".qc.ann.qual.txt"
+file2=$OUTBASE".qc2.ann.txt"
+file3=$OUTBASE".qc.targets.txt"
+file4=$OUTBASE".res.filtered.tsv"
+
+## variant filtering and reporting
+falco-filter-report --vcf snpeff.vcf --output $OUTBASE --qc_ann_qual_txt $file1 --qc2_ann_txt $file2 --qc_targets_txt $file3 --res_filtered_tsv $file4
+
+
diff -r 5287f72e0891 -r ebb4c3b8b9db falco/bin/falco
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/falco/bin/falco Fri Mar 20 08:58:17 2015 -0400
@@ -0,0 +1,75 @@
+#!/usr/bin/perl
+
+use strict;
+use Cwd 'abs_path';
+use Getopt::Long;
+use File::Basename;
+
+my $absPath = abs_path($0);
+my $dn = dirname($absPath);
+
+# Set options
+
+my $bam;
+my $ref = "$dn/../ref/hg19/hg19.fa";
+my $locifilt = "$dn/../ref/filters/filter.tsv";
+my $manifest = "$dn/../ref/TSACP/TruSeq_Amplicon_Cancer_Panel_Manifest_AFP1_PN15032433.txt";
+my $base = $bam;
+
+GetOptions (
+ "bam=s" => \$bam,
+ "ref=s" => \$ref,
+ "filter=s" => \$locifilt,
+ "manifest=s" => \$manifest,
+ "output=s" => \$base,
+)
+or die("Error in command line arguments\n");
+
+## sanity checks
+unless ( -f $ref ){
+ print STDERR "Genome Reference file not found [$ref]\n";
+ exit 1;
+}
+unless ( -f $ref.'.fai' ){
+ print STDERR "Genome Reference index file not found [$ref\.fai]\n";
+ exit 1;
+}
+
+
+# Set paths to external script
+# TODO make modules
+
+my $perAmpliconAnalysis = "$dn/../lib/perl/perAmpliconAnalysis.pl";
+my $addQual = "$dn/../lib/R/addQual.R";
+my $func = "$dn/../lib/R/func.R";
+my $qcFilt = "$dn/../lib/perl/qcFilt.pl";
+my $qc2vcf = "$dn/../lib/perl/qc2vcf.pl";
+
+# Check Rscript
+my $rscript = `which Rscript`;
+chomp $rscript;
+if ($rscript !~ /Rscript$/) {
+ print STDERR "No Rscript present in PATH\n";
+ exit 1;
+}
+
+# Check samtools
+my $samtools = `which samtools`;
+chomp $samtools;
+if ($samtools !~ /samtools$/) {
+ print STDERR "No samtools present in PATH\n";
+ exit 1;
+}
+
+print STDOUT localtime() . " [$$] $perAmpliconAnalysis $bam $ref $manifest $base $samtools\n";
+system("$perAmpliconAnalysis $bam $ref $manifest $base $samtools");
+
+print STDOUT localtime() . " [$$] $rscript $addQual $base.qc.ann.txt $base.qc.ann.qual.txt $func $locifilt\n";
+system("$rscript $addQual $base.qc.ann.txt $base.qc.ann.qual.txt $func $locifilt");
+
+print STDOUT localtime() . " [$$] $qcFilt $base.qc.ann.qual.txt $base\n";
+system("$qcFilt $base.qc.ann.qual.txt $base");
+
+print STDOUT localtime() . " [$$] $qc2vcf $base.qc.ann.qual.filt.txt > $base.falco.vcf\n";
+system("$qc2vcf $base.qc.ann.qual.filt.txt > $base.falco.vcf");
+
diff -r 5287f72e0891 -r ebb4c3b8b9db falco/bin/falco-filter-report
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/falco/bin/falco-filter-report Fri Mar 20 08:58:17 2015 -0400
@@ -0,0 +1,93 @@
+#!/usr/bin/perl
+use strict;
+use Cwd 'abs_path';
+use Getopt::Long;
+use File::Basename;
+
+my $absPath = abs_path($0);
+my $dir = dirname($absPath);
+my $lib = "$dir/../lib/";
+
+my $vcf2tsv = "$lib/perl/vcf2tsv.pl";
+my $spliteff = "$lib/perl/splitEff.pl";
+my $filter = "$lib/perl/filter.pl";
+my $plotPng = "$lib/R/plotsPng.R";
+my $mkReport = "$lib/perl/mkHtmlReportGalaxy.pl";
+
+my $locifilt = "$dir/../ref/filters/filter.tsv";
+my $manifest = "$dir/../ref/TSACP/TruSeq_Amplicon_Cancer_Panel_Manifest_AFP1_PN15032433.txt";
+
+my $canonicals = "$dir/../ref/TSACP/canonicals.tsv";
+my $clinvar = "$dir/../ref/filters/clinvar_00-latest.f.vcf";
+my $cosmic = "$dir/../ref/filters/CosmicCodingMuts_v64_26032013_noLimit_wgs.f.vcf";
+my $cosmicNC = "$dir/../ref/filters/CosmicNonCodingVariants_v64_26032013_noLimit_wgs.f.vcf";
+
+my $base = undef;
+my $vcf = undef;
+my $vcfOther = undef;
+my $noFilt = undef;
+my $noPlot = undef;
+
+my $qc_ann_qual_txt = undef;
+my $qc2_ann_txt = undef
+my $qc_targets_txt = undef;
+
+GetOptions (
+ "vcf=s" => \$vcf,
+ "vcfOther=s" => \$vcfOther,
+ "output=s" => \$base,
+ "clinvar=s" => \$clinvar,
+ "cosmic=s" => \$cosmic,
+ "cosmicNC=s" => \$cosmicNC,
+ "noFilt" => \$noFilt,
+ "noPlot" => \$noPlot,
+ "qc_ann_qual_txt=s" => \$qc_ann_qual_txt,
+ "qc2_ann_txt=s" => \$qc2_ann_txt,
+ "qc_targets_txt=s" => \$qc_targets_txt,
+)
+or die("Error in command line arguments\n");
+
+## sanity checks
+die( "No base name provided [-output]\n" ) unless defined($base) and $base ne '';
+die( "No VCF file provided [-vcf]\n" ) unless defined($vcf) and -f $vcf;
+die( "Missing input [-qc_ann_qual_txt]\n" ) unless defined($qc_ann_qual_txt) and -f $qc_ann_qual_txt;
+die( "Missing input [-qc2_ann_txt]\n" ) unless defined($qc2_ann_txt) and -f $qc2_ann_txt;
+die( "Missing input [-qc_targets_txt]\n" ) unless defined($qc_targets_txt) and -f $qc_targets_txt;
+die( "Required file does not exists [$canonicals]\n" ) unless -f $canonicals;
+die( "Required file does not exists [$clinvar]\n" ) unless -f $clinvar;
+die( "Required file does not exists [$cosmic]\n" ) unless -f $cosmic;
+die( "Required file does not exists [$cosmicNC]\n" ) unless -f $cosmicNC;
+
+
+## Rscript check
+my $rscript = `which Rscript`;
+chomp $rscript;
+if ($rscript !~ /Rscript$/) {
+ print STDERR "No Rscript present in PATH\n";
+ exit 1;
+}
+
+## FILTERING
+print STDOUT localtime() . " [$$] converting vcf to tsv\n";
+system( "$vcf2tsv $vcf > $base\.tsv" );
+
+print STDOUT localtime() . " [$$] splitting vcf columns\n";
+system( "$spliteff $base\.tsv Falco >> $base\.res\.tsv" );
+
+if ( defined($vcfOther) ){
+ print STDOUT localtime() . " [$$] converting vcf to tsv\n";
+ system( "$vcf2tsv $vcf > $base\.Other\.tsv" );
+ print STDOUT localtime() . " [$$] splitting vcf columns\n";
+ system( "$spliteff $base\.Other\.tsv Other >> $base\.res\.tsv" );
+}
+
+print STDOUT localtime() . " [$$] filtering data\n";
+system( "$filter $base\.res\.tsv $canonicals $clinvar $cosmic $cosmicNC > $base\.res\.filtered\.tsv" );
+
+## PLOTTING
+print STDOUT localtime() . " [$$] Creating plots\n";
+system( "Rscript $plotPng $qc_ann_qual_txt $qc2_ann_txt $qc_targets_txt $base\.res\.filtered\.tsv $clinvar $locifilt $base" );
+
+## REPORTING
+print STDOUT localtime() . " [$$] Creating HTML report\n";
+system( "perl $mkReport $base $qc_targets_txt" );
diff -r 5287f72e0891 -r ebb4c3b8b9db falco/lib/R/addQual.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/falco/lib/R/addQual.R Fri Mar 20 08:58:17 2015 -0400
@@ -0,0 +1,21 @@
+#!/usr/bin/Rscript
+args <- commandArgs(T)
+qcFile <- args[1]
+output <- args[2]
+funcR <- args[3]
+lociFiltFile <- args[4]
+
+read.delim(lociFiltFile, header=F) -> lociFiltD
+lociFilt <- paste(paste("chr", lociFiltD[,1], sep=""), lociFiltD[,2], sep=":")
+
+source(funcR)
+read.delim(qcFile, header=T) -> qc
+
+#addQual(qc) -> qc.qual
+addQualTransCor(qc, lociFilt) -> qc.qual
+#addQualVarCor(qc) -> qc.qual
+
+write.table(qc.qual, output, row.names=F, col.names=T, quote=F, sep="\t")
+#write.table(qc[selection,], output, row.names=F, col.names=T, quote=F, sep="\t")
+
+
diff -r 5287f72e0891 -r ebb4c3b8b9db falco/lib/R/func.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/falco/lib/R/func.R Fri Mar 20 08:58:17 2015 -0400
@@ -0,0 +1,575 @@
+#!/usr/bin/Rscript
+#args <- commandArgs(T)
+#qcFile <- args[1]
+#output <- args[2]
+
+#read.delim(qcFile, header=T) -> qc
+
+addQual <- function(qc) {
+
+qc$del <- as.character(as.vector(qc$del))
+qc$ins <- as.vector(qc$ins)
+qc$pVal <- 1
+qc$pValAdj <- 1
+#qc$pValPois <- 1
+qc$qScore <- 0
+qc$qScoreI <- 0
+qc$qScoreD <- 0
+
+pVal.I <- rep(1, nrow(qc))
+pVal.D <- rep(1, nrow(qc))
+
+delSel <- qc$del != "."
+data.frame(matrix(unlist(strsplit(unlist(strsplit(as.vector(qc$del[delSel]), "\\|")), ":")), ncol=2, byrow=T)) -> dels
+dels$ln <- nchar(as.vector(dels$X1))
+colnames(dels) <- c("seq", "occ", "ln")
+delPees <- ppois(1:50, mean(rep(dels$ln, dels$occ)), lower.tail=F)
+
+insSel <- qc$ins != "."
+data.frame(matrix(unlist(strsplit(unlist(strsplit(as.vector(qc$ins[insSel]), "\\|")), ":")), ncol=2, byrow=T)) -> inss
+inss$ln <- nchar(as.vector(inss$X1))
+colnames(inss) <- c("seq", "occ", "ln")
+insPees <- ppois(1:50, mean(rep(dels$ln, dels$occ)), lower.tail=F)
+
+# Calculate errorrate per amplicon
+amps <- unique(qc$amp)
+for (i in amps) {
+ #print(i)
+ sel <- qc$amp == i
+ prob <- sum(qc$nVar[sel]) / (sum(qc$nRef[sel]) + sum(qc$nVar[sel]))
+ #prob <- median(qc$nVar[sel] / (qc$nRef[sel] + qc$nVar[sel]))
+ pees <- pbinom(qc$nVar[sel], qc$nRef[sel] + qc$nVar[sel], prob, lower.tail=F)
+ qc$pVal[sel] <- pees
+# pois <- ppois(qc$nVar[sel], (qc$nRef + qc$nVar) * prob, lower.tail=F)
+# qc$pValPois[sel] <- pois
+
+ # Seperate for indel/deletion events
+ selDels <- delSel & sel
+ if (sum(selDels) > 0) {
+ data.frame(matrix(unlist(strsplit(unlist(strsplit(as.vector(qc$del[selDels]), "\\|")), ":")), ncol=2, byrow=T)) -> dels2
+ dels2$ln <- nchar(as.vector(dels2$X1))
+ colnames(dels2) <- c("seq", "occ", "ln")
+ dels2$occ <- as.numeric(as.vector(dels2$occ))
+ }
+ for (j in which(selDels)) {
+ data.frame(matrix(unlist(strsplit(unlist(strsplit(as.vector(qc$del[j]), "\\|")), ":")), ncol=2, byrow=T)) -> dels
+ dels$ln <- nchar(as.vector(dels$X1))
+ colnames(dels) <- c("seq", "occ", "ln")
+ dels$occ <- as.numeric(as.vector(dels$occ))
+ #dels$pDel <- delPees[as.integer(dels$ln)]
+ #dels$pVal <- ppois(as.numeric(dels$occ), dels$pDel, lower.tail=F)
+ dels$pDel <- sapply(dels$ln, function(x) {sum(dels2$occ[dels2$ln == x]) / (sum(qc$nRef[sel]) + sum(qc$nVar[sel])) })
+ dels$pVal <- pbinom(dels$occ, qc$dp[j], dels$pDel, lower.tail=F)
+ dels$Q <- -10 * log10(dels$pVal)
+
+ high <- order(dels$occ, -dels$Q, decreasing=T)[1]
+ qc$del[j] <- paste(dels$seq[high], dels$occ[high], sep=":")
+ qc$qScoreD[j] <- dels$Q[high]
+ pVal.D[j] <- dels$pVal[high]
+ }
+ selIns <- insSel & sel
+ if (sum(selIns) > 0) {
+ data.frame(matrix(unlist(strsplit(unlist(strsplit(as.vector(qc$ins[selIns]), "\\|")), ":")), ncol=2, byrow=T)) -> inss2
+ inss2$ln <- nchar(as.vector(inss2$X1))
+ colnames(inss2) <- c("seq", "occ", "ln")
+ inss2$occ <- as.numeric(as.vector(inss2$occ))
+ }
+ for (j in which(selIns)) {
+ data.frame(matrix(unlist(strsplit(unlist(strsplit(as.vector(qc$ins[j]), "\\|")), ":")), ncol=2, byrow=T)) -> inss
+ inss$ln <- nchar(as.vector(inss$X1))
+ colnames(inss) <- c("seq", "occ", "ln")
+ inss$occ <- as.numeric(as.vector(inss$occ))
+ #inss$pIns <- insPees[as.integer(inss$ln)]
+ #inss$pVal <- ppois(as.numeric(inss$occ), inss$pIns, lower.tail=F)
+ inss$pIns <- sapply(inss$ln, function(x) {sum(inss2$occ[inss2$ln == x]) / (sum(qc$nRef[sel]) + sum(qc$nVar[sel])) })
+ inss$pVal <- pbinom(inss$occ, qc$dp[j], inss$pIns, lower.tail=F)
+ inss$Q <- -10 * log10(inss$pVal)
+
+ high <- order(inss$occ, -inss$Q, decreasing=T)[1]
+ qc$ins[j] <- paste(inss$seq[high], inss$occ[high], sep=":")
+ qc$qScoreI[j] <- inss$Q[high]
+ pVal.I[j] <- inss$pVal[high]
+ }
+ #print(sum(-10 * log10(pees) > 100))
+ #print(sum(qc$qScoreI[selIns] > 100))
+ #print(sum(qc$qScoreD[selDels] > 100))
+}
+
+p.adjust(qc$pVal) -> qc$pValAdj
+-10 * log(p.adjust(pVal.I)) -> qc$qScoreI
+-10 * log(p.adjust(pVal.D)) -> qc$qScoreD
+qc$qScore <- -10 * log10(qc$pValAdj)
+#qc$qScore[qc$pValAdj == 0] <- 1000
+qc$qScore[qc$qScore > 1000] <- 1000
+qc$qScoreI[qc$qScoreI > 1000] <- 1000
+qc$qScoreD[qc$qScoreD > 1000] <- 1000
+qsel <- (qc$qScore >= 100 & qc$nVar >= 10) | (qc$qScoreI >= 200) | (qc$qScoreD >= 200)
+dpfilt <- as.numeric(qc$dp) >= 100
+#varfilt <- as.numeric(qc$nVar) >= 10
+
+selection <- qsel & dpfilt
+
+print(paste("Filtered out: ", sum(!selection), sep=""))
+print(paste("Variants selected: ", sum(selection), sep=""))
+return(qc)
+}
+
+addQualTransCor <- function(qc, lociFilt) {
+
+
+# lociFilt contains snp, cosmic and clinvar positions
+
+qc$del <- as.character(as.vector(qc$del))
+#qc$del <- as.character(as.vector(qc$del))
+qc$ins <- as.character(as.vector(qc$ins))
+qc$pVal <- 1
+qc$pValAdj <- 1
+#qc$pValPois <- 1
+qc$qScore <- 0
+qc$qScoreI <- 0
+qc$qScoreD <- 0
+
+pVal.I <- rep(1, nrow(qc))
+pVal.D <- rep(1, nrow(qc))
+
+rle(as.vector(qc$ntRef)) -> srle
+hompols <- srle$lengths >= 5
+homPosS <- cumsum(srle$lengths)[hompols]
+homPosE <- cumsum(srle$lengths)[which(hompols) - 1]
+homFilt <- rep(F, nrow(qc))
+homFilt[homPosS] <- T
+homFilt[homPosE] <- T
+
+delSel <- qc$del != "." & !homFilt
+if (sum(delSel > 0)) {
+
+
+data.frame(matrix(unlist(strsplit(unlist(strsplit(qc$del[delSel], "\\|")), ":")), ncol=2, byrow=T)) -> dels
+dels$ln <- nchar(as.vector(dels$X1))
+colnames(dels) <- c("seq", "occ", "ln")
+delPees <- ppois(1:50, mean(rep(dels$ln, dels$occ)), lower.tail=F)
+}
+
+insSel <- qc$ins != "." & !homFilt
+if (sum(insSel) > 0) {
+data.frame(matrix(unlist(strsplit(unlist(strsplit(qc$ins[insSel], "\\|")), ":")), ncol=2, byrow=T)) -> inss
+inss$ln <- nchar(as.vector(inss$X1))
+colnames(inss) <- c("seq", "occ", "ln")
+insPees <- ppois(1:50, mean(rep(inss$ln, inss$occ)), lower.tail=F)
+}
+# Calculate transition bias
+
+
+#calcTransCor(qc) -> pCor
+globalProb <- sum(qc$nVar) / (sum(qc$nRef) + sum(qc$nVar))
+# Calculate errorrate per amplicon
+amps <- unique(qc$amp)
+for (i in amps) {
+ #print(i)
+ sel <- qc$amp == i
+ prob1 <- sum(qc$nVar[sel]) / (sum(qc$nRef[sel]) + sum(qc$nVar[sel]))
+ prob <- prob1 * calcTransCor(qc[sel,], lociFilt)
+ pees <- pbinom(qc$nVar[sel], qc$nRef[sel] + qc$nVar[sel], prob , lower.tail=F)
+ qc$pVal[sel] <- pees
+# pois <- ppois(qc$nVar[sel], (qc$nRef + qc$nVar) * prob, lower.tail=F)
+# qc$pValPois[sel] <- pois
+
+ # Seperate for indel/deletion events
+ selDels <- delSel & sel
+ if (sum(selDels) > 0) {
+ data.frame(matrix(unlist(strsplit(unlist(strsplit(as.vector(qc$del[selDels]), "\\|")), ":")), ncol=2, byrow=T)) -> dels2
+ dels2$ln <- nchar(as.vector(dels2$X1))
+ colnames(dels2) <- c("seq", "occ", "ln")
+ dels2$occ <- as.numeric(as.vector(dels2$occ))
+
+ for (j in which(selDels)) {
+ data.frame(matrix(unlist(strsplit(unlist(strsplit(as.vector(qc$del[j]), "\\|")), ":")), ncol=2, byrow=T)) -> dels
+ dels$ln <- nchar(as.vector(dels$X1))
+ colnames(dels) <- c("seq", "occ", "ln")
+ dels$occ <- as.numeric(as.vector(dels$occ))
+ #dels$pDel <- delPees[as.integer(dels$ln)]
+ #dels$pVal <- ppois(as.numeric(dels$occ), dels$pDel, lower.tail=F)
+ dels$pDel <- sapply(dels$ln, function(x) {sum(dels2$occ[dels2$ln == x]) / (sum(qc$nRef[sel]) + sum(qc$nVar[sel])) })
+ dels$pVal <- pbinom(dels$occ, qc$dp[j], dels$pDel, lower.tail=F)
+ dels$Q <- -10 * log10(dels$pVal)
+
+ high <- order(dels$occ, -dels$Q, decreasing=T)[1]
+ qc$del[j] <- paste(dels$seq[high], dels$occ[high], sep=":")
+ qc$qScoreD[j] <- dels$Q[high]
+ pVal.D[j] <- dels$pVal[high]
+ }
+ }
+ selIns <- insSel & sel
+ if (sum(selIns) > 0) {
+ data.frame(matrix(unlist(strsplit(unlist(strsplit(as.vector(qc$ins[selIns]), "\\|")), ":")), ncol=2, byrow=T)) -> inss2
+ inss2$ln <- nchar(as.vector(inss2$X1))
+ colnames(inss2) <- c("seq", "occ", "ln")
+ inss2$occ <- as.numeric(as.vector(inss2$occ))
+
+ for (j in which(selIns)) {
+ data.frame(matrix(unlist(strsplit(unlist(strsplit(as.vector(qc$ins[j]), "\\|")), ":")), ncol=2, byrow=T)) -> inss
+ inss$ln <- nchar(as.vector(inss$X1))
+ colnames(inss) <- c("seq", "occ", "ln")
+ inss$occ <- as.numeric(as.vector(inss$occ))
+ #inss$pIns <- insPees[as.integer(inss$ln)]
+ #inss$pVal <- ppois(as.numeric(inss$occ), inss$pIns, lower.tail=F)
+ inss$pIns <- sapply(inss$ln, function(x) {sum(inss2$occ[inss2$ln == x]) / (sum(qc$nRef[sel]) + sum(qc$nVar[sel])) })
+ inss$pVal <- pbinom(inss$occ, qc$dp[j], inss$pIns, lower.tail=F)
+ inss$Q <- -10 * log10(inss$pVal)
+
+ high <- order(inss$occ, -inss$Q, decreasing=T)[1]
+ qc$ins[j] <- paste(inss$seq[high], inss$occ[high], sep=":")
+ qc$qScoreI[j] <- inss$Q[high]
+ pVal.I[j] <- inss$pVal[high]
+} }
+ #print(sum(-10 * log10(pees) > 100))
+ #print(sum(qc$qScoreI[selIns] > 100))
+ #print(sum(qc$qScoreD[selDels] > 100))
+}
+
+p.adjust(qc$pVal) -> qc$pValAdj
+-10 * log(p.adjust(pVal.I)) -> qc$qScoreI
+-10 * log(p.adjust(pVal.D)) -> qc$qScoreD
+qc$qScore <- -10 * log10(qc$pValAdj)
+#qc$qScore[qc$pValAdj == 0] <- 1000
+qc$qScore[qc$qScore > 1000] <- 1000
+qc$qScoreI[qc$qScoreI > 1000] <- 1000
+qc$qScoreD[qc$qScoreD > 1000] <- 1000
+qsel <- (qc$qScore >= 100 & qc$nVar >= 10) | (qc$qScoreI >= 200) | (qc$qScoreD >= 200)
+dpfilt <- as.numeric(qc$dp) >= 100
+#varfilt <- as.numeric(qc$nVar) >= 10
+
+selection <- qsel & dpfilt
+
+print(paste("Filtered out: ", sum(!selection), sep=""))
+print(paste("Variants selected: ", sum(selection), sep=""))
+return(qc)
+}
+
+
+addQualVarCor <- function(qc) {
+# Corrected for variant occurence
+
+qc$del <- as.character(as.vector(qc$del))
+qc$ins <- as.vector(qc$ins)
+qc$pVal <- 1
+qc$pValAdj <- 1
+#qc$pValPois <- 1
+qc$qScore <- 0
+qc$qScoreI <- 0
+qc$qScoreD <- 0
+
+pVal.I <- rep(1, nrow(qc))
+pVal.D <- rep(1, nrow(qc))
+
+delSel <- qc$del != "."
+data.frame(matrix(unlist(strsplit(unlist(strsplit(as.vector(qc$del[delSel]), "\\|")), ":")), ncol=2, byrow=T)) -> dels
+dels$ln <- nchar(as.vector(dels$X1))
+colnames(dels) <- c("seq", "occ", "ln")
+delPees <- ppois(1:50, mean(rep(dels$ln, dels$occ)), lower.tail=F)
+
+insSel <- qc$ins != "."
+data.frame(matrix(unlist(strsplit(unlist(strsplit(as.vector(qc$ins[insSel]), "\\|")), ":")), ncol=2, byrow=T)) -> inss
+inss$ln <- nchar(as.vector(inss$X1))
+colnames(inss) <- c("seq", "occ", "ln")
+insPees <- ppois(1:50, mean(rep(dels$ln, dels$occ)), lower.tail=F)
+
+# Calculate base bias per run
+sum(qc$nA[grepl("A", qc$ntRef, ignore.case=T)]) -> refA
+sum(qc$nA[!grepl("A", qc$ntRef, ignore.case=T)]) -> varA
+sum(qc$nC[grepl("C", qc$ntRef, ignore.case=T)]) -> refC
+sum(qc$nC[!grepl("C", qc$ntRef, ignore.case=T)]) ->varC
+sum(qc$nG[grepl("G", qc$ntRef, ignore.case=T)]) -> refG
+sum(qc$nG[!grepl("G", qc$ntRef, ignore.case=T)]) -> varG
+sum(qc$nT[grepl("T", qc$ntRef, ignore.case=T)]) -> refT
+sum(qc$nT[!grepl("T", qc$ntRef, ignore.case=T)]) -> varT
+
+varA / (refA + varA) -> errA
+varC / (refC + varC) -> errC
+varG / (refG + varG) -> errG
+varT / (refT + varT) -> errT
+c(errA, errC, errG, errT) / mean(c(errA, errC, errG, errT)) -> errACGT
+
+errMat <- qc[,9:12]
+errMat[grepl("A", qc$ntRef, ignore.case=T),1] <- 0
+errMat[grepl("C", qc$ntRef, ignore.case=T),2] <- 0
+errMat[grepl("G", qc$ntRef, ignore.case=T),3] <- 0
+errMat[grepl("T", qc$ntRef, ignore.case=T),4] <- 0
+
+pCor <- rep(1, nrow(qc))
+pCor[grepl("^A", qc$ntVar, ignore.case=T)] <- errACGT[1]
+pCor[grepl("^C", qc$ntVar, ignore.case=T)] <- errACGT[2]
+pCor[grepl("^G", qc$ntVar, ignore.case=T)] <- errACGT[3]
+pCor[grepl("^T", qc$ntVar, ignore.case=T)] <- errACGT[4]
+
+# Calculate errorrate per amplicon
+amps <- unique(qc$amp)
+for (i in amps) {
+ #print(i)
+ sel <- qc$amp == i
+ prob <- sum(qc$nVar[sel]) / (sum(qc$nRef[sel]) + sum(qc$nVar[sel]))
+ #prob <- median(qc$nVar[sel] / (qc$nRef[sel] + qc$nVar[sel]))
+ pees <- pbinom(qc$nVar[sel], qc$nRef[sel] + qc$nVar[sel], prob * pCor[sel], lower.tail=F)
+ qc$pVal[sel] <- pees
+# pois <- ppois(qc$nVar[sel], (qc$nRef + qc$nVar) * prob, lower.tail=F)
+# qc$pValPois[sel] <- pois
+
+ # Seperate for indel/deletion events
+ selDels <- delSel & sel
+ if (sum(selDels) > 0) {
+ data.frame(matrix(unlist(strsplit(unlist(strsplit(as.vector(qc$del[selDels]), "\\|")), ":")), ncol=2, byrow=T)) -> dels2
+ dels2$ln <- nchar(as.vector(dels2$X1))
+ colnames(dels2) <- c("seq", "occ", "ln")
+ dels2$occ <- as.numeric(as.vector(dels2$occ))
+ }
+ for (j in which(selDels)) {
+ data.frame(matrix(unlist(strsplit(unlist(strsplit(as.vector(qc$del[j]), "\\|")), ":")), ncol=2, byrow=T)) -> dels
+ dels$ln <- nchar(as.vector(dels$X1))
+ colnames(dels) <- c("seq", "occ", "ln")
+ dels$occ <- as.numeric(as.vector(dels$occ))
+ #dels$pDel <- delPees[as.integer(dels$ln)]
+ #dels$pVal <- ppois(as.numeric(dels$occ), dels$pDel, lower.tail=F)
+ dels$pDel <- sapply(dels$ln, function(x) {sum(dels2$occ[dels2$ln == x]) / (sum(qc$nRef[sel]) + sum(qc$nVar[sel])) })
+ dels$pVal <- pbinom(dels$occ, qc$dp[j], dels$pDel, lower.tail=F)
+ dels$Q <- -10 * log10(dels$pVal)
+
+ high <- order(dels$occ, -dels$Q, decreasing=T)[1]
+ qc$del[j] <- paste(dels$seq[high], dels$occ[high], sep=":")
+ qc$qScoreD[j] <- dels$Q[high]
+ pVal.D[j] <- dels$pVal[high]
+ }
+ selIns <- insSel & sel
+ if (sum(selIns) > 0) {
+ data.frame(matrix(unlist(strsplit(unlist(strsplit(as.vector(qc$ins[selIns]), "\\|")), ":")), ncol=2, byrow=T)) -> inss2
+ inss2$ln <- nchar(as.vector(inss2$X1))
+ colnames(inss2) <- c("seq", "occ", "ln")
+ inss2$occ <- as.numeric(as.vector(inss2$occ))
+ }
+ for (j in which(selIns)) {
+ data.frame(matrix(unlist(strsplit(unlist(strsplit(as.vector(qc$ins[j]), "\\|")), ":")), ncol=2, byrow=T)) -> inss
+ inss$ln <- nchar(as.vector(inss$X1))
+ colnames(inss) <- c("seq", "occ", "ln")
+ inss$occ <- as.numeric(as.vector(inss$occ))
+ #inss$pIns <- insPees[as.integer(inss$ln)]
+ #inss$pVal <- ppois(as.numeric(inss$occ), inss$pIns, lower.tail=F)
+ inss$pIns <- sapply(inss$ln, function(x) {sum(inss2$occ[inss2$ln == x]) / (sum(qc$nRef[sel]) + sum(qc$nVar[sel])) })
+ inss$pVal <- pbinom(inss$occ, qc$dp[j], inss$pIns, lower.tail=F)
+ inss$Q <- -10 * log10(inss$pVal)
+
+ high <- order(inss$occ, -inss$Q, decreasing=T)[1]
+ qc$ins[j] <- paste(inss$seq[high], inss$occ[high], sep=":")
+ qc$qScoreI[j] <- inss$Q[high]
+ pVal.I[j] <- inss$pVal[high]
+ }
+ #print(sum(-10 * log10(pees) > 100))
+ #print(sum(qc$qScoreI[selIns] > 100))
+ #print(sum(qc$qScoreD[selDels] > 100))
+}
+
+p.adjust(qc$pVal) -> qc$pValAdj
+-10 * log(p.adjust(pVal.I)) -> qc$qScoreI
+-10 * log(p.adjust(pVal.D)) -> qc$qScoreD
+qc$qScore <- -10 * log10(qc$pValAdj)
+#qc$qScore[qc$pValAdj == 0] <- 1000
+qc$qScore[qc$qScore > 1000] <- 1000
+qc$qScoreI[qc$qScoreI > 1000] <- 1000
+qc$qScoreD[qc$qScoreD > 1000] <- 1000
+qsel <- (qc$qScore >= 100 & qc$nVar >= 10) | (qc$qScoreI >= 200) | (qc$qScoreD >= 200)
+dpfilt <- as.numeric(qc$dp) >= 100
+#varfilt <- as.numeric(qc$nVar) >= 10
+
+selection <- qsel & dpfilt
+
+print(paste("Filtered out: ", sum(!selection), sep=""))
+print(paste("Variants selected: ", sum(selection), sep=""))
+return(qc)
+}
+
+#addQual(qc) -> qc
+#write.table(qc, output, row.names=F, col.names=T, quote=F, sep="\t")
+#write.table(qc[selection,], output, row.names=F, col.names=T, quote=F, sep="\t")
+
+calcTrans <- function(qcRaw, lociFilt = NULL) {
+
+sel <- rep(T, nrow(qcRaw))
+if (!is.null(lociFilt)) {
+paste(qcRaw$X.chr, qcRaw$pos, sep=":") -> qcpos
+sel <- ! qcpos %in% lociFilt
+}
+
+qc <- qcRaw[sel,]
+
+# Calculate transition bias
+refA <- grepl("A", qc$ntRef, ignore.case=T)
+refC <- grepl("C", qc$ntRef, ignore.case=T)
+refG <- grepl("G", qc$ntRef, ignore.case=T)
+refT <- grepl("T", qc$ntRef, ignore.case=T)
+varA <- grepl("A", qc$ntVar, ignore.case=T)
+varC <- grepl("C", qc$ntVar, ignore.case=T)
+varG <- grepl("G", qc$ntVar, ignore.case=T)
+varT <- grepl("T", qc$ntVar, ignore.case=T)
+varN <- grepl(".", qc$ntVar, ignore.case=T)
+callA <- grepl("^A", qc$ntVar, ignore.case=T)
+callC <- grepl("^C", qc$ntVar, ignore.case=T)
+callG <- grepl("^G", qc$ntVar, ignore.case=T)
+callT <- grepl("^T", qc$ntVar, ignore.case=T)
+ntA <- sum(refA)
+ntC <- sum(refC)
+ntG <- sum(refG)
+ntT <- sum(refT)
+
+callList <- list(callA, callC, callG, callT)
+varList <- list(varA, varC, varG, varT)
+refList <- list(refA, refC, refG, refT)
+
+#tots <- c(
+#rep(sum(qc$nRef[refA], qc$nVar[refA]), 4),
+#rep(sum(qc$nRef[refC], qc$nVar[refC]), 4),
+#rep(sum(qc$nRef[refG], qc$nVar[refG]), 4),
+#rep(sum(qc$nRef[refT], qc$nVar[refT]), 4))
+
+nvar <- c(
+rep(sum(qc$nC[refA], qc$nG[refA], qc$nT[refA]), 4),
+rep(sum(qc$nA[refC], qc$nG[refC], qc$nT[refC]), 4),
+rep(sum(qc$nA[refG], qc$nC[refG], qc$nT[refG]), 4),
+rep(sum(qc$nA[refT], qc$nC[refT], qc$nG[refT]), 4))
+
+ncall <- c(
+rep(sum(qc$nVar[refA]), 4),
+rep(sum(qc$nVar[refC]), 4),
+rep(sum(qc$nVar[refG]), 4),
+rep(sum(qc$nVar[refT]), 4))
+
+nref <- c(
+rep(sum(qc$nRef[refA]), 4),
+rep(sum(qc$nRef[refC]), 4),
+rep(sum(qc$nRef[refG]), 4),
+rep(sum(qc$nRef[refT]), 4))
+
+tots <- nref + nvar
+
+nts <- c("A", "C", "G", "T")
+sumTrans <- data.frame(trans=paste(rep(nts, each=4), rep(nts, 4), sep=":"), npos=rep(c(ntA, ntC, ntG, ntT), each=4), cnt=rep(0, 16), tot=tots, nvar=nvar, nref=nref, ncall=ncall)
+
+for (i in 1:4) {
+ pos <- (i - 1) * 4
+for (j in 1:4) {
+# print(sum(refList[[i]] & varList[[j]]))
+
+ cnt <- sum(qc[refList[[i]] & varList[[j]], 8 + j])
+ if (i == j) {
+# cnt <- sum(qc[refList[[i]] & varN, 8 + j])
+ cnt <- NA
+ }
+ sumTrans$cnt[pos + j] <- cnt
+ #print(cnt)
+}}
+
+#sumTrans$cnt / sumTrans$nvar / sumTrans$tot * 1000000 -> tmp
+sumTrans$cnt / sumTrans$tot * 1000000 -> tmp
+
+#totalNT <- sum(qc$nVar) + sum(qc$nRef)
+#sumTrans$cnt / median(sumTrans$cnt, na.rm=T) -> sumTrans$cor
+sumTrans$cnt / median(sumTrans$cnt, na.rm=T) -> sumTrans$cor
+tmp / median(tmp, na.rm=T) -> sumTrans$cor2
+
+sumTrans$cor[sumTrans$npos <= 15] <- 1
+sumTrans$cor2[sumTrans$npos <= 15] <- 1
+
+#sumTrans$cnt / totalNT -> sumTrans$cor
+#sumTrans$cnt / min(sumTrans$cnt, na.rm=T) -> sumTrans$cor
+
+return(sumTrans)
+
+}
+
+calcTransCor <- function(qcRaw, lociFilt = NULL) {
+
+calcTrans(qcRaw, lociFilt) -> sumTrans
+
+pCor <- rep(1, nrow(qcRaw))
+paste(qcRaw$ntRef, substr(qcRaw$ntVar, 1, 1), sep=":") -> qcTrans
+
+for (i in c("A", "C", "G", "T")) {
+for (j in c("A", "C", "G", "T")) {
+ if (i != j) {
+ tTrans <- paste(i, j, sep=":")
+ pCor[grepl(tTrans, qcTrans, ignore.case=T)] <- sumTrans$cor2[sumTrans$trans == tTrans]
+ }
+
+}}
+return(pCor)
+}
+
+
+calcTransCorOld <- function(qc) {
+# Calculate transition bias
+refA <- grepl("A", qc$ntRef, ignore.case=T)
+refC <- grepl("C", qc$ntRef, ignore.case=T)
+refG <- grepl("G", qc$ntRef, ignore.case=T)
+refT <- grepl("T", qc$ntRef, ignore.case=T)
+varA <- grepl("A", qc$ntVar, ignore.case=T)
+varC <- grepl("C", qc$ntVar, ignore.case=T)
+varG <- grepl("G", qc$ntVar, ignore.case=T)
+varT <- grepl("T", qc$ntVar, ignore.case=T)
+varN <- grepl(".", qc$ntVar, ignore.case=T)
+callA <- grepl("^A", qc$ntVar, ignore.case=T)
+callC <- grepl("^C", qc$ntVar, ignore.case=T)
+callG <- grepl("^G", qc$ntVar, ignore.case=T)
+callT <- grepl("^T", qc$ntVar, ignore.case=T)
+
+callList <- list(callA, callC, callG, callT)
+varList <- list(varA, varC, varG, varT)
+refList <- list(refA, refC, refG, refT)
+
+tots <- c(
+rep(sum(qc$nRef[refA], qc$nVar[refA]), 4),
+rep(sum(qc$nRef[refC], qc$nVar[refC]), 4),
+rep(sum(qc$nRef[refG], qc$nVar[refG]), 4),
+rep(sum(qc$nRef[refT], qc$nVar[refT]), 4))
+
+nvar <- c(
+rep(sum(qc$nVar[refA]), 4),
+rep(sum(qc$nVar[refC]), 4),
+rep(sum(qc$nVar[refG]), 4),
+rep(sum(qc$nVar[refT]), 4))
+
+nref <- c(
+rep(sum(qc$nRef[refA]), 4),
+rep(sum(qc$nRef[refC]), 4),
+rep(sum(qc$nRef[refG]), 4),
+rep(sum(qc$nRef[refT]), 4))
+
+nts <- c("A", "C", "G", "T")
+sumTrans <- data.frame(trans=paste(rep(nts, each=4), rep(nts, 4), sep=":"), cnt=rep(0, 16), tot=tots, nvar=nvar, nref=nref)
+
+for (i in 1:4) {
+ pos <- (i - 1) * 4
+for (j in 1:4) {
+# print(sum(refList[[i]] & varList[[j]]))
+
+ cnt <- sum(qc[refList[[i]] & varList[[j]], 8 + j])
+ if (i == j) {
+# cnt <- sum(qc[refList[[i]] & varN, 8 + j])
+ cnt <- NA
+ }
+ sumTrans$cnt[pos + j] <- cnt
+ #print(cnt)
+}}
+#totalNT <- sum(qc$nVar) + sum(qc$nRef)
+sumTrans$cnt / median(sumTrans$cnt, na.rm=T) -> sumTrans$cor
+#sumTrans$cnt / totalNT -> sumTrans$cor
+#sumTrans$cnt / min(sumTrans$cnt, na.rm=T) -> sumTrans$cor
+#print(sumTrans)
+
+pCor <- rep(1, nrow(qc))
+for (i in 1:4) {
+ pos <- (i - 1) * 4
+for (j in 1:4) {
+ if (i != j) {
+ pCor[refList[[i]] & callList[[j]]] <- sumTrans$cor[pos + j]
+ }
+
+}}
+return(pCor)
+}
diff -r 5287f72e0891 -r ebb4c3b8b9db falco/lib/R/plotsPng.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/falco/lib/R/plotsPng.R Fri Mar 20 08:58:17 2015 -0400
@@ -0,0 +1,504 @@
+#!/usr/bin/Rscript
+
+args <- commandArgs(T)
+
+print(args[1]) # qc.ann.qual.txt
+print(args[2]) # qc2.ann.qtxt
+print(args[3]) # qc.targets.txt
+print(args[4]) # res.txt (unfiltered)
+print(args[5]) # clinvar
+print(args[6]) # lociFilt
+print(args[7]) # basename
+options(warn=1)
+print(paste("Reading", args[1], sep=" "));
+read.delim(args[1], header=T, stringsAsFactors=F) -> d
+print(paste("Reading", args[2], sep=" "));
+read.delim(args[2], header=T, stringsAsFactors=F) -> d2
+print(paste("Reading", args[3], sep=" "));
+read.delim(args[3], header=T, stringsAsFactors=F) -> qc
+print(paste("Reading", args[4], sep=" "));
+read.delim(args[4], header=T, stringsAsFactors=F, row.names=NULL) -> res
+print(paste("Reading", args[5], sep=" "));
+## clinvar is a VCF with "##" header lines to skip
+headerLineCount <- length( grep('^##', readLines(args[5])) )
+read.delim(args[5], header=T, stringsAsFactors=F, skip=headerLineCount) -> dclin
+print(paste("Reading", args[6], sep=" "));
+read.delim(args[6], header=F, stringsAsFactors=F) -> lociFilt
+base <- args[7]
+
+# Error rate per cycle
+# VAF distribution
+# varDepth distribution ?
+# Print coverage plots per amplicon
+
+tmpCP <- paste(d$X.chr, d$pos, sep=":")
+tmpClin <- paste(paste("chr", dclin[,1], sep=""), dclin[,2], sep=":")
+tmpSnp <- paste(paste("chr", lociFilt[,1], sep=""), lociFilt[,2], sep=":")
+
+inClin <- tmpCP %in% tmpClin
+
+inSnp <- tmpCP %in% tmpSnp
+
+dir.create(base)
+setwd(base)
+#unique(unlist(strsplit(d$V3, "\\|"))) -> targets
+targets <- qc[,1]
+if (1) {
+ print("Plotting variant heatmap")
+ # Including off targets increases the amplicon length set at median amplicon length + 10%.
+ median(qc$end - qc$start) -> maxAmpLn
+ maxAmpLn <- ceiling(1.2 * maxAmpLn)
+ print(maxAmpLn)
+ ampHeatF <- matrix(ncol=maxAmpLn, nrow=nrow(qc))
+ ampHeatR <- matrix(ncol=maxAmpLn, nrow=nrow(qc))
+ #dPos <- paste(d$chr, d$pos, sep=":")
+ #errPos <- paste(dclin$chr, dclin$pos, sep=":")
+ #dPosClin <- dPos %in% errPos
+
+ dPosClin <- !inClin & !inSnp
+
+ for (i in 1:nrow(qc)) {
+ d$pos %in% qc$start[i]:qc$end[i] -> sel
+ poss <- d$pos[sel & !dPosClin] - qc$start[i] + 1
+ poss <- poss[poss <= maxAmpLn]
+ ampHeatF[i, poss] <- d$nVar[sel & !dPosClin] + d$nN[sel & !dPosClin]
+ ampHeatR[i, poss] <- rev(d$nVar[sel & !dPosClin] + d$nN[sel & !dPosClin])
+ }
+ png(paste(base,"heat","png", sep="."), width=960, height=480)
+ par(mar=c(4,4,4,2) + .1)
+
+ ampHeatF[is.na(ampHeatF)] <- 0
+ ampHeatR[is.na(ampHeatR)] <- 0
+ layout(matrix(1:2, nrow=2))
+ maxN <- quantile(ampHeatF, .99)
+ #heatmap(ampHeat)
+ boxplot(ampHeatF, ylim=c(0, maxN), pch=20, cex=.4, main="Non reference counts R1")
+ rect(150, 0, ncol(ampHeatF), maxN, col="#00000040")
+ boxplot(ampHeatR, ylim=c(0, maxN), pch=20, cex=.4, main="Non reference counts R2")
+ rect(150, 0, ncol(ampHeatR), maxN, col="#00000040")
+ dev.off()
+
+ #d[,7]/d[,4] * 100 -> d$pctREF
+ #d[,8]/d[,4] * 100 -> d$pctVAR
+ d$nRef/d$dp * 100 -> d$pctREF
+ d$nVar/d$dp * 100 -> d$pctVAR
+ d$vaf <- d$nVar / (d$nVar + d$nRef) * 100
+ print("Plotting ref frequencies")
+ png(paste(base,"raf","png", sep="."), width=480, height=480)
+ dref <- density(d$pctREF[!dPosClin])
+ #plot(dref, ylim=range(dref$y[dref$x < 80] ))
+ hist(d$pctREF[!dPosClin], breaks=100, ylim=c(0, 30), col="#00000040", border=NA)
+ dev.off()
+
+ print("Plotting var frequencies")
+ png(paste(base,"vaf","png", sep="."), width=480, height=480)
+ dvar <- density(d$pctVAR[!dPosClin])
+ #plot(dvar, ylim=range(dvar$y[dvar$x > 20] ))
+ hist(d$pctVAR[!dPosClin], breaks=100, ylim=c(0, 30), col="#00000040", border=NA)
+ dev.off()
+
+ # Set vaf cutoff
+ #q95 <- quantile(d$qScore, .95, na.rm=T)
+ q95 <- 20
+ vs95 <- sd(d$pctVAR[!dPosClin & d$qScore < q95], na.rm=T)
+ v95 <- mean(d$pctVAR[!dPosClin & d$qScore < q95], na.rm=T) + vs95 * 3
+
+ nCall <- sum((d$pctVAR > v95) & d$qScore > q95 & !inSnp, na.rm=T)
+ print(nCall)
+ print("Plotting vaf vs q")
+ png(paste(base,"snv-q","png", sep="."), width=480, height=480)
+ plot(d$qScore, d$pctVAR, pch=20, cex=.5, main=paste("q:", q95, "-", "v:", v95, "--", "s:", nCall, sep=" "))
+ abline(h=c(1,5), v=c(20,13), lty=2)
+ abline(v=q95, h=v95, lty=3, col=2)
+ points(d$qScore[d$dp < 100], d$pctVAR[d$dp < 100], pch=20, cex=1.2, col=2)
+ points(d$qScore[d$nVar < 10], d$pctVAR[d$nVar < 10], pch=20, cex=1, col=4)
+ legend(30,80, c("pass", "dp < 100", "nVar < 10"), col=c(1,2,4), pch=20)
+ dev.off()
+
+ print("Plotting vaf vs q zoomed")
+ png(paste(base,"snv-q-zoom","png", sep="."), width=480, height=480)
+ plot(d$qScore, d$pctVAR, pch=20, cex=.5, xlim=c(0, 40),ylim=c(0,10))
+ abline(h=c(1,5), v=c(20,13), lty=2)
+ abline(v=q95, h=v95, lty=3, col=2)
+ points(d$qScore[d$dp < 100], d$pctVAR[d$dp < 100], pch=20, cex=1.2, col=2)
+ points(d$qScore[d$nVar < 10], d$pctVAR[d$nVar < 10], pch=20, cex=1, col=4)
+ legend("topright", c("pass", "dp < 100", "nVar < 10"), col=c(1,2,4), pch=20)
+ dev.off()
+
+ print("Plotting ins vs q")
+ d$vafI <- 0
+ d$occI <- 0
+ if (sum(d$ins != ".") > 0) {
+ png(paste(base,"ins-q","png", sep="."), width=480, height=480)
+ data.frame(matrix(unlist(strsplit(unlist(strsplit(as.character(d$ins[d$ins != "."]), "\\|")), ":")), ncol=2, byrow=T)) -> insDat
+ colnames(insDat) <- c("seq", "occ")
+ insDat$seq <- as.character(insDat$seq)
+ insDat$occ <- as.numeric(as.vector(insDat$occ))
+ nchar(insDat$seq) -> insDat$ln
+ d$occI[d$ins != "."] <- insDat$occ
+ d$vafI[d$ins != "."] <- insDat$occ / d$nRef[d$ins != "."]
+ d$qScoreI[d$qScoreI > 1000] <- 1000
+ plot(d$qScoreI, d$occI, pch=20, cex=1)
+ abline(h=c(10), v=c(20,13), lty=2)
+ # dp 500
+ sel <- d$dp < 100
+ points(d$qScoreI[sel], d$occI[sel], pch=20, cex=1.2, col=2)
+ # nVar 10
+ #sel <- d$occI < 10
+ #points(d$qScoreI[sel], d$occI[sel], pch=20, cex=1, col=3)
+ # vaf < 0.01
+ sel <- d$vafI < 0.01
+ points(d$qScoreI[sel], d$occI[sel], pch=20, cex=1, col=4)
+ legend("topleft", c("pass", "dp < 100", "vaf < 0.01"), col=c(1,2,4), pch=20)
+ dev.off()
+ }
+
+ print("Plotting del vs q")
+ d$vafD <- 0
+ d$occD <- 0
+ if (sum(d$del != "." ) > 0) {
+ png(paste(base,"del-q","png", sep="."), width=480, height=480)
+ data.frame(matrix(unlist(strsplit(unlist(strsplit(as.character(d$del[d$del != "."]), "\\|")), ":")), ncol=2, byrow=T)) -> delDat
+ colnames(delDat) <- c("seq", "occ")
+ delDat$seq <- as.character(delDat$seq)
+ delDat$occ <- as.numeric(as.vector(delDat$occ))
+ nchar(delDat$seq) -> delDat$ln
+ d$occD[d$del != "."] <- delDat$occ
+ d$vafD[d$del != "."] <- d$occD[d$del != "."] / d$nRef[d$del != "."]
+ d$qScoreD[d$qScoreD > 1000] <- 1000
+ #hist(d$qScoreD, breaks=1000, ylim=c(0,100))
+ #print(sum(d$qScoreD > 20))
+ #Sys.sleep(1)
+ plot(d$qScoreD, d$occD, pch=20, cex=1)
+ abline(h=c(10), v=c(20,13), lty=2)
+ # dp 500
+ sel <- d$dp < 100
+ points(d$qScoreD[sel], d$occD[sel], pch=20, cex=1.2, col=2)
+ # nVar 10
+ #sel <- d$occD < 10
+ #points(d$qScoreD[sel], d$occD[sel], pch=20, cex=1, col=3)
+ # vaf < 0.01
+ sel <- d$vafD < 0.01
+ points(d$qScoreD[sel], d$occD[sel], pch=20, cex=1, col=4)
+ legend("topleft", c("pass", "dp < 100", "vaf < 0.01"), col=c(1,2,4), pch=20)
+ dev.off()
+ }
+
+ sub("chr", "", qc$chr) -> qc$chrn
+ ampOrd <- order(qc$chrn, qc$start)
+
+ print("Plotting amplicon depths")
+ png(paste(base,"amp-dp","png", sep="."), width=960, height=480)
+ barplot(qc$depth[ampOrd])
+ abline(h=100, lty=2)
+ totRead <- sum(qc$depth, na.rm=T)
+ legend("topright", paste("Total read count: ", totRead, sep=""))
+ dev.off()
+}
+
+res$POS <- as.numeric(as.vector(res$POS))
+d$pos <- as.numeric(as.vector(d$pos))
+
+grepl("a", d$ntRef, ignore.case=T) -> Asel
+grepl("c", d$ntRef, ignore.case=T) -> Csel
+grepl("g", d$ntRef, ignore.case=T) -> Gsel
+grepl("t", d$ntRef, ignore.case=T) -> Tsel
+cntr <- 0
+
+biasTable <- data.frame(matrix(NA, nrow=length(targets), ncol=12), row.names=targets)
+colnames(biasTable) <- c("AC", "AG", "AT", "CA", "CG", "CT", "GA", "GC", "GT", "TA", "TC", "TG")
+
+for (i in targets) {
+ sel <- grepl(i, d$amp)
+ print(i)
+ for (c in unique(d[sel,1])) {
+ csel <- d[,1] == c
+ print(c)
+ reg <- paste(range(d[sel & csel,2]), collapse="-", sep="-")
+ creg <- paste(c, reg, sep=":")
+
+ qcsel <- qc[,1] == i
+ allsel <- sel & csel
+ clinSel <- dclin$amp == i
+ clinPos <- dclin$pos[clinSel]
+# inClin <- d$pos %in% clinPos
+
+ #dels <- (d[,15] != ".") & allsel
+ #ins <- (d[,14] != ".") & sel
+ # print(reg)
+ # print(creg)
+
+ # Select annotations
+ # calculate quantiles and plot them
+# quantile(d$pctVAR[sel & csel], c(.75,.90,.95,.99), na.rm=T) -> cutoffs
+ tmp <- d[sel & csel,]
+ #dels <- tmp[,15] != "."
+ #ins <- tmp[,14] != "."
+ dels <- allsel & d$del != "."
+ ins <- allsel & d$ins != "."
+
+ resTmp <- res[(res$POS >= min(tmp[,2])) & (res$POS <= max(tmp[,2])) & (res$X.CHROM == c),]
+ #resSel <- res$TARGET == i
+ png(paste(base, i, "cov", "png", sep="."), width=960, height=480)
+
+ xvec <- c(0, rep(1:(sum(allsel) - 1), each=2), sum(allsel))
+ layout(matrix(1:2, ncol=1), heights=c(2,1))
+ par(mar=c(0, 4, 4, 7) + .1, xaxt="n")
+ print("Cov")
+ plot(NULL, type="n", main=i, xlab="Position", ylab="Depth", xlim=range(d$pos[allsel]), ylim=range(0, d$dp[allsel]))
+ xvec <- d$pos[allsel]
+ yvec <- d$dp[allsel]
+
+ polygon(rep(c(xvec, max(xvec) + 1), each=2) - .5, c(0, rep(yvec, each=2), 0), col="#00000010", border=1)
+ polygon(rep(c(xvec, max(xvec) + 1), each=2) - .5, c(0, rep(rowSums(d[allsel, 7:8]), each=2), 0), col="#00000010")
+# polygon(rep(c(xvec, max(xvec) + 1), each=2) - .5, c(0, rep(d$nVar[allsel], each=2), 0), col=2)
+ varCol <- rep(2, sum(allsel))
+ varCol[grepl("^a", d$ntVar[allsel], ignore.case=T)] <- "green"
+ varCol[grepl("^c", d$ntVar[allsel], ignore.case=T)] <- "blue"
+ varCol[grepl("^g", d$ntVar[allsel], ignore.case=T)] <- "black"
+ varCol[grepl("^t", d$ntVar[allsel], ignore.case=T)] <- "red"
+ rect(xvec-.5,d$nVar[allsel],xvec+.5,0, col=varCol)
+
+
+ ampMax <- max(d[allsel,4])
+ #abline(v=resTmp$POS, lty=2, col=2)
+ #delMark <- list()
+ #insMark <- list()
+
+ for (idx in which(dels)) {
+ matrix(unlist(strsplit(as.character(as.vector(d[idx,15])), "[:|]")), ncol=2, byrow=T) -> delDat
+ nchar(delDat[,1]) -> lngts
+ dpths <- as.numeric(delDat[,2])
+ x1 <- d[idx,2] + .5
+ x2 <- x1 + lngts - 1
+ if (x2 > max(d$pos[allsel])) {
+ next
+ }
+ # delMark[[length(delMark) + 1]] <- c(x1, x2)
+ rect(x1, ampMax - dpths , x2, ampMax, col="#00000040")
+ }
+
+ for (idx in which(ins)) {
+ matrix(unlist(strsplit(as.character(as.vector(d[idx,14])), "[:|]")), ncol=2, byrow=T) -> insDat
+ nchar(insDat[,1]) -> lngts
+ dpths <- as.numeric(insDat[,2])
+ x1 <- d$pos[idx] + 1
+ # insMark[length(insMark) + 1] <- x1
+ for (ii in 1:ncol(insDat)) {
+ polygon(
+ c(x1, x1 - (.5 * lngts[ii]), x1 + (.5 * lngts[ii])),
+ c(ampMax, ampMax - dpths[ii], ampMax - dpths[ii]),
+ , col="#00000040")
+ }
+ }
+ par(xpd=NA)
+ legend(max(d$pos[allsel]) + sum(allsel) * 0.05, ampMax, legend=c("Filtered nt", "Ref nt", "Non-ref nt"), col=c("#00000040", "#00000080", "red"), pch=15)
+ legend(max(d$pos[allsel]) + sum(allsel) * 0.05, 0, legend=c("A", "C", "G", "T"), col=c("green", "blue", "black", "red"), pch=15)
+ par(xpd=F)
+ par(mar=c(5, 4, 0, 7) + .1, xaxt="s")
+ plot(NULL, type="n", xlab="Position", ylab=NA, xlim=range(d$pos[allsel]), ylim=c(0,4.5), yaxt="n")
+ axis(side=2, at=1:4, labels=c("ins", "del", "snv", "ref"), las=2)
+ abline(h=1:4, col="#00000040")
+ sres <- nchar(as.character(resTmp$REF)) == nchar(as.character(resTmp$ALT))
+ dres <- nchar(resTmp$REF) > nchar(resTmp$ALT)
+ ires <- nchar(resTmp$REF) < nchar(resTmp$ALT)
+
+ resTmp$marks <- rep(3.4, nrow(resTmp))
+
+ resTmp$cols <- rep("#00000040", nrow(resTmp))
+ resTmp$cols[grepl("^a", resTmp$ALT, ignore.case=T) & sres] <- "green"
+ resTmp$cols[grepl("^c", resTmp$ALT, ignore.case=T) & sres] <- "blue"
+ resTmp$cols[grepl("^g", resTmp$ALT, ignore.case=T) & sres] <- "black"
+ resTmp$cols[grepl("^t", resTmp$ALT, ignore.case=T) & sres] <- "red"
+
+ #resTmp$cols[dres] <- 2
+ #resTmp$cols[ires] <- 2
+ resTmp$marks[dres] <- 2.4
+ resTmp$marks[ires] <- 1.4
+ resTmp$markP1 <- resTmp$POS -.5
+ resTmp$markP2 <- resTmp$POS +.5
+ resTmp$markP1[dres] <- resTmp$POS[dres] +.5
+ resTmp$markP2[dres] <- resTmp$POS[dres] +.5 + nchar(resTmp$REF[dres]) - 1
+ resTmp$markP1[ires] <- resTmp$POS[ires] +.5
+ resTmp$markP2[ires] <- resTmp$POS[ires] +.5 + 1
+ ntCol <- rep(1, sum(allsel))
+ ntCol[grepl("a", d$ntRef[allsel], ignore.case=T)] <- "green"
+ ntCol[grepl("c", d$ntRef[allsel], ignore.case=T)] <- "blue"
+ ntCol[grepl("g", d$ntRef[allsel], ignore.case=T)] <- "black"
+ ntCol[grepl("t", d$ntRef[allsel], ignore.case=T)] <- "red"
+
+ rect(d$pos[allsel] -.5, 4.4, d$pos[allsel] +.5, 3.6, col=ntCol, border=0)
+ rect(resTmp$markP1, resTmp$marks, resTmp$markP2, resTmp$marks - .8, col=resTmp$cols, border=0)
+
+ if(length(clinPos != 0)) {
+ rect(clinPos - .5, 3.5, clinPos + .5, .5, border="black", lwd=2)
+ }
+ #lapply(delMark, function(x) { rect(x[1], 2, x[2], 1, col=2, border=NA) })
+ #lapply(insMark, function(x) { rect(x[1], 1, x[1] + 1, 0, col=3, border=NA) })
+
+ # Highlight clinically relevant loci
+ #for (i in which(dclin$amp == i)) {
+ # if (dclin$REF[i] == nchar
+ #}
+
+ dev.off()
+
+ sansClin <- allsel & !inClin
+ sansSnp <- allsel & !inSnp
+
+ d$nA[sansClin & sansSnp & Asel] -> AA
+ d$nC[sansClin & sansSnp & Asel] -> AC
+ d$nG[sansClin & sansSnp & Asel] -> AG
+ d$nT[sansClin & sansSnp & Asel] -> AT
+
+ d$nA[sansClin & sansSnp & Csel] -> CA
+ d$nC[sansClin & sansSnp & Csel] -> CC
+ d$nG[sansClin & sansSnp & Csel] -> CG
+ d$nT[sansClin & sansSnp & Csel] -> CT
+
+ d$nA[sansClin & sansSnp & Gsel] -> GA
+ d$nC[sansClin & sansSnp & Gsel] -> GC
+ d$nG[sansClin & sansSnp & Gsel] -> GG
+ d$nT[sansClin & sansSnp & Gsel] -> GT
+
+ d$nA[sansClin & sansSnp & Tsel] -> TA
+ d$nC[sansClin & sansSnp & Tsel] -> TC
+ d$nG[sansClin & sansSnp & Tsel] -> TG
+ d$nT[sansClin & sansSnp & Tsel] -> TT
+
+ print("Bias")
+ png(paste(base, i, "bias", "png", sep="."), width=960, height=480)
+
+ Acnt <- sum(sum(AA), sum(AC), sum(AG), sum(AT))
+ Ccnt <- sum(sum(CA), sum(CC), sum(CG), sum(CT))
+ Gcnt <- sum(sum(GA), sum(GC), sum(GG), sum(GT))
+ Tcnt <- sum(sum(TA), sum(TC), sum(TG), sum(TT))
+
+ NTsums <- c(
+ sum(AC)/Acnt, sum(AG)/Acnt, sum(AT)/Acnt,
+ sum(CA)/Ccnt, sum(CG)/Ccnt, sum(CT)/Ccnt,
+ sum(GA)/Gcnt, sum(GC)/Gcnt, sum(GT)/Gcnt,
+ sum(TA)/Tcnt, sum(TC)/Tcnt, sum(TG)/Tcnt)
+ NTsums[!is.finite(NTsums)] <- 0
+ NTsums[is.nan(NTsums)] <- 0
+ NTsums[is.na(NTsums)] <- 0
+
+ barplot(NTsums, names.arg=c("AC", "AG", "AT", "CA", "CG", "CT", "GA", "GC", "GT", "TA", "TC", "TG"), main=i)
+ dev.off()
+ biasTable[rownames(biasTable) == i, ] <- NTsums
+
+ print("Cov2")
+ png(paste(base, i, "cov2", "png", sep="."), width=960, height=480)
+ layout(matrix(1:2, ncol=1), heights=c(2,1))
+ par(mar=c(0, 4, 4, 7) + .1, xaxt="n")
+
+
+ dat <- d2[d2$amp == i,]
+ rled <- rle(sort(dat$pos))
+ rds <- sort(unique(dat$X.read))
+ lst <- c()
+ for (ii in order(rled$lengths, decreasing=T))
+ {
+
+ rled$values[ii] -> pos
+ dat$X.read[dat$pos == pos]
+ lst <- c(lst, rds %in% dat$X.read[dat$pos == pos])
+
+ }
+ #xlim <- range(dat$pos)
+ xlim=range(d$pos[allsel])
+ ylim=range(0, d$dp[allsel])
+ plot(NULL, xlim=xlim, ylim=ylim, main=i, ylab="Depth")
+ rect(xlim[1] - 0.5, 0, xlim[2] + .5, ylim[2], col="#00000010")
+ if (length(lst) > 0) {
+ matrix(lst, nrow=length(rds), byrow=F) -> mat
+ do.call( order, data.frame(mat) ) -> ord
+ for (e in 1:length(ord)) {
+ sel <- dat$X.read == rds[ord[e]]
+ colvec <- rep(1, sum(sel))
+ colvec[grepl("A$", dat$event[sel], ignore.case=T)] <- "green"
+ colvec[grepl("C$", dat$event[sel], ignore.case=T)] <- "blue"
+ colvec[grepl("G$", dat$event[sel], ignore.case=T)] <- "black"
+ colvec[grepl("T$", dat$event[sel], ignore.case=T)] <- "red"
+ #points(dat$pos[ sel ], rep(e, sum(sel)), col=colvec, pch=20, cex=.2)
+ segments(dat$pos[ sel ] - .5, rep(e, sum(sel)), dat$pos[ sel ] + .5, rep(e, sum(sel)), col=colvec, pch=20, cex=.2)
+ }
+ }
+
+ par(xpd=NA)
+ legend(max(d$pos[allsel]) + sum(allsel) * 0.05, ampMax, legend=c("Filtered nt", "Ref nt", "Non-ref nt"), col=c("#00000040", "#00000080", "red"), pch=15)
+ legend(max(d$pos[allsel]) + sum(allsel) * 0.05, 0, legend=c("A", "C", "G", "T"), col=c("green", "blue", "black", "red"), pch=15)
+ par(xpd=F)
+ par(mar=c(5, 4, 0, 7) + .1, xaxt="s")
+ plot(NULL, type="n", xlab="Position", ylab=NA, xlim=range(d$pos[allsel]), ylim=c(0,4.5), yaxt="n")
+ axis(side=2, at=1:4, labels=c("ins", "del", "snv", "ref"), las=2)
+ abline(h=1:4, col="#00000040")
+ sres <- nchar(as.character(resTmp$REF)) == nchar(as.character(resTmp$ALT))
+ dres <- nchar(resTmp$REF) > nchar(resTmp$ALT)
+ ires <- nchar(resTmp$REF) < nchar(resTmp$ALT)
+
+ resTmp$marks <- rep(3.4, nrow(resTmp))
+
+ resTmp$cols <- rep("#00000040", nrow(resTmp))
+ resTmp$cols[grepl("^a", resTmp$ALT, ignore.case=T) & sres] <- "green"
+ resTmp$cols[grepl("^c", resTmp$ALT, ignore.case=T) & sres] <- "blue"
+ resTmp$cols[grepl("^g", resTmp$ALT, ignore.case=T) & sres] <- "black"
+ resTmp$cols[grepl("^t", resTmp$ALT, ignore.case=T) & sres] <- "red"
+
+ #resTmp$cols[dres] <- 2
+ #resTmp$cols[ires] <- 2
+ resTmp$marks[dres] <- 2.4
+ resTmp$marks[ires] <- 1.4
+ resTmp$markP1 <- resTmp$POS -.5
+ resTmp$markP2 <- resTmp$POS +.5
+ resTmp$markP1[dres] <- resTmp$POS[dres] +.5
+ resTmp$markP2[dres] <- resTmp$POS[dres] +.5 + nchar(resTmp$REF[dres]) - 1
+ resTmp$markP1[ires] <- resTmp$POS[ires] +.5
+ resTmp$markP2[ires] <- resTmp$POS[ires] +.5 + 1
+ ntCol <- rep(1, sum(allsel))
+ ntCol[grepl("a", d$ntRef[allsel], ignore.case=T)] <- "green"
+ ntCol[grepl("c", d$ntRef[allsel], ignore.case=T)] <- "blue"
+ ntCol[grepl("g", d$ntRef[allsel], ignore.case=T)] <- "black"
+ ntCol[grepl("t", d$ntRef[allsel], ignore.case=T)] <- "red"
+
+ rect(d$pos[allsel] -.5, 4.4, d$pos[allsel] +.5, 3.6, col=ntCol, border=0)
+ rect(resTmp$markP1, resTmp$marks, resTmp$markP2, resTmp$marks - .8, col=resTmp$cols, border=0)
+
+ if(length(clinPos != 0)) {
+ rect(clinPos - .5, 3.5, clinPos + .5, .5, border="black", lwd=2)
+ }
+
+
+ dev.off()
+
+# png(paste(base, i, "vaf", "png", sep="."), width=960, height=480)
+# plot(d[sel & csel,2], d$vaf[sel & csel], type="l", ylim=c(0,100), main=paste(i, creg, cutoffs[3], sep=" "), xlab="position", ylab="%", col=2)
+ #lines(d[sel & csel,2], d$pctVAR[sel & csel], type="l", col=2)
+# abline(h=100)
+# abline(h=c(1,5,10,50,90,95,99), lty=3, col=1)
+# abline(h=cutoffs[3], lty=4, col=2)
+# abline(v=c(limP[1], limP[1] + 150, limP[2] - 150, limP[2]),lty=4)
+# abline(v=c(qc[qcsel,8:9]), lty=5, col=2)
+# if (length(labs != 0)) {
+# text(resTmp$POS[resTmp$vaf > (cutoffs[3]/100)], 50, labels=resTmp$Amino_Acid_change[resTmp$vaf > (cutoffs[3]/100)])
+# abline(v=resTmp$POS[resTmp$vaf > (cutoffs[3]/100)], col="grey", lty=4)
+# }
+# dev.off()
+ }
+}
+
+png(paste(base, "biasheat", "png", sep="."), width=960, height=960)
+write.table(biasTable, paste(base, "bias", "tsv", sep="."), sep="\t")
+biasTable[is.na(biasTable)] <- 0
+heatmap(data.matrix(biasTable), Colv=NA, scale="r")
+dev.off()
+png(paste(base, "bias", "png", sep="."), width=960, height=480)
+boxplot(data.matrix(biasTable), pch=20, cex=.5)
+dev.off()
+warnings()
+#hist(d$pctVAR[d$pctVAR < 20], breaks=40, ylim=c(0,1000))
+#hist(d$pctREF[d$pctREF > 80], breaks=40, ylim=c(0,1000))
+
+#print(summary(d$pctVAR))
+
+
diff -r 5287f72e0891 -r ebb4c3b8b9db falco/lib/perl/filter.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/falco/lib/perl/filter.pl Fri Mar 20 08:58:17 2015 -0400
@@ -0,0 +1,133 @@
+#!/usr/bin/perl -w
+
+use strict;
+
+my $file = shift;
+my $canonicals = shift;
+my $clinvar = shift;
+my $cosmic = shift;
+my $cosmicNC = shift;
+
+my $minDP = 100;
+my $minVAR = 10;
+my $minVAF = 0.01;
+
+my %can = ();
+my %nm = ();
+my %cvHsh = ();
+
+print STDERR "Reading in canonicals,...";
+open CAN, "<$canonicals";
+while () {
+ chomp;
+ my @row = split(/\t/, $_);
+ $can{$row[0]} = @row[1 .. $#row];
+ my $pri = 0;
+ $nm{$_} = $pri++ foreach @row[1 .. $#row];
+}
+close CAN;
+
+print STDERR "Reading clinvars...\n";
+open CV, "<$clinvar";
+while () { last if (/#CHROM/); }
+while () {
+ chomp;
+ my @row = split(/\t/, $_);
+ push @{$cvHsh{"chr".$row[0].":".$row[1]}}, [@row];
+}
+close CV;
+
+print STDERR "Reading COSMIC...\n";
+open CV, "<$cosmic";
+while () { last if (/#CHROM/); }
+while () {
+ chomp;
+ my @row = split(/\t/, $_);
+ push @{$cvHsh{"chr".$row[0].":".$row[1]}}, [@row];
+}
+close CV;
+
+print STDERR "Reading COSMICNC...\n";
+open CV, "<$cosmicNC";
+while () { last if (/#CHROM/); }
+while () {
+ chomp;
+ my @row = split(/\t/, $_);
+ push @{$cvHsh{"chr".$row[0].":".$row[1]}}, [@row];
+}
+
+close CV;
+print STDERR "done!\n";
+
+print STDERR "Processing $file\n";
+
+open F, "<$file";
+my $head = readline(F);
+chomp $head;
+
+print $head . "\tvaf\n";
+
+my @rowHead = split(/\t/, $head);
+my $colN = 0;
+my %col = map {
+ $_ => $colN++ } @rowHead;
+
+while () {
+ chomp;
+ my $line = $_;
+ my @row = split(/\t/, $_);
+ next unless (exists($can{$row[$col{Gene_Name}]}));
+ my $trans = $row[$col{Transcript_ID}];
+ $trans =~ s/\..*$//;
+
+ ## impossible to switch off these two transcript filters
+ ## so commented out
+ # if (not exists($nm{$trans})) {
+ # print STDERR "Unknown transcript: $row[$col{Gene_Name}] $trans\n";
+ # next;
+ # }
+ # elsif ($nm{$trans} != 0) {
+ # print STDERR "Non cannonical: $row[$col{Gene_Name}] $trans\n";
+ # next;
+ # }
+
+ # Annotate clinincal variants and cosmic mutations
+
+ my $chr = $row[$col{"#CHROM"}];
+ my $pos = $row[$col{POS}];
+
+ if (exists($cvHsh{"$chr:$pos"})) {
+ my $rows = $cvHsh{"$chr:$pos"};
+ my $cv = join("|", map { $_->[2] } @$rows);
+ $row[2] .= "|$cv";
+ }
+
+ # Filter INTERGENIC
+ next if ($row[$col{Context}] eq "INTERGENIC");
+
+ # Filter SYNONYMOUS_CODING
+ # Dubbel mutatie BRAF
+# next if ($row[$col{Context}] eq "SYNONYMOUS_CODING");
+
+ # Filter INTRON
+ next if ($row[$col{Context}] eq "INTRON");
+
+ # Filter dbSNP
+# next if ($row[2] ne ".");
+
+ # Filter depth
+ next if ($row[$col{DP}] < $minDP);
+
+ my ($nref, $nvar) = split(/,/, $row[$col{AD}]);
+ # Filter var depth
+# print STDERR "DEBUG: " . $nvar . "\n"; sleep 1;
+ next if ($nvar < $minVAR);
+
+ my $vaf = $nvar / ($nref + $nvar);
+# print STDERR "DEBUG: " . $vaf . "\n";
+ # Filter vaf
+ next if ($vaf < $minVAF);
+
+ print join("\t", @row, $vaf) . "\n";
+}
+close F;
diff -r 5287f72e0891 -r ebb4c3b8b9db falco/lib/perl/mkHtmlReport.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/falco/lib/perl/mkHtmlReport.pl Fri Mar 20 08:58:17 2015 -0400
@@ -0,0 +1,234 @@
+#!/usr/bin/perl -w
+
+use strict;
+use Cwd;
+use Spreadsheet::WriteExcel;
+
+$| = 1;
+my $dir = shift || "./";
+my $outdir = shift || "./";
+my $pat = shift || "";
+my $cwd = cwd();
+(my $runName = $cwd) =~ s/^.*\/(.*?)$/$1/;
+
+# QC
+# Results
+my @samples = ();
+#open INDEX, ">$outdir/index.html";
+my $htmlHead = qq(
+
+
+
+
+
+
+);
+#print INDEX $htmlHead;
+opendir DIR, "$dir";
+while (my $cd = readdir DIR) {
+ if ($cd =~ /(.*$pat)\.qc\.targets\.txt$/) {
+ my $sam = $1;
+# next if ($sam =~ /R[12]/);
+ print STDERR $1 . "\n";
+ push @samples, $1;
+ }
+}
+close DIR;
+
+#print INDEX "";
+#print INDEX "Download: | runQC.xls | ";
+#print INDEX "
---|
Sample | BAM | snp | indel | readCnt | Amp > 100 | \n";
+#foreach my $sam (sort @samples) {
+# print INDEX "
---|
$sam | BAM | BAI |
\n";
+
+#}
+my %link = ();
+my $excelBook0 = Spreadsheet::WriteExcel->new("$outdir/runQC.xls");
+my $excel0 = $excelBook0->add_worksheet("table1");
+my $excel0Ref = [[qw/sampleName runName totalReads pct100 ntbGenes/]];
+
+
+foreach my $sample (sort @samples) {
+ print STDERR "Processing $sample\n";
+# next if ($sample =~ /R[12]/);
+
+ my $readCnt = 0;
+ my $amp100 = 0;
+ my %ntbGenes = ();
+
+ open OUT, ">$outdir/$sample.html";
+ open OUT2, ">$outdir/$sample.tsv";
+ my $excelBook = Spreadsheet::WriteExcel->new("$outdir/$sample.xls");
+ my $excel1 = $excelBook->add_worksheet("table1");
+ my $excel2 = $excelBook->add_worksheet("table2");
+ print OUT $htmlHead;
+ my %QC = ();
+ open QC, "<$dir/$sample.qc.targets.txt";
+ readline QC;
+ print STDERR "Reading in $sample.qc.targets.txt\n";
+ while () {
+ chomp;
+ my @row = split(/\t/, $_);
+ my @id = split(/[\_\.\-:]/, $row[0]);
+ $row[-1] = 0 if ($row[-1] eq "NA");
+ $readCnt += $row[-1]; # DP
+ if ($#id != 10) {
+ $id[0] =~ /(\D+)(\d+)/;
+ $id[0] = $2;
+ unshift @id, $1;
+ }
+
+ if ($row[-1] >= 100) {
+ $amp100++
+ }
+ else {
+ $ntbGenes{$row[0]}{dp} = $row[-1];
+ $ntbGenes{$row[0]}{id} = [@id];
+ }
+
+ $QC{$row[0]}{QC} = [@id, @row];# if ($id[0]);
+ foreach my $c ($row[4] .. $row[5]) {
+ $link{$id[-3] . ":" . $c}{$row[0]} = "Assay";
+ }
+ foreach my $c ($row[2] .. $row[4], $row[5] .. $row[3]) {
+ $link{$id[-3] . ":" . $c}{$row[0]} = "LSO";
+ }
+ }
+ close QC;
+
+ open RES, "<$dir/$sample.res.filtered.tsv" or die "Unable to open $dir/$sample\n";
+ my %uniq = ();
+ my $colCnt = 0;
+ my $resHead = readline(RES);
+ chomp $resHead;
+ $resHead =~ s/^#//;
+ $resHead =~ s/\s+$//;
+ my %resCol = map { $_ => $colCnt++ } split(/\t/, $resHead);
+ my @keyColsN = qw/QUAL Gene_Name Codon_Change Amino_Acid_change vaf DP AD Tag CHROM POS ID REF ALT Context Effect_Impact Functional_Class Amino_Acid_length Gene_Coding Transcript_ID Exon_Rank/;
+# my @keyColsN = qw/CHROM POS ID REF ALT QUAL DP AD vaf Context Effect_Impact Functional_Class Codon_Change Amino_Acid_change Amino_Acid_length Gene_Name Coding Transcript Exon Tag/;
+ my @keyColsI = map { $resCol{$_} } @keyColsN;
+ foreach my $i (0 .. $#keyColsN) {
+ print STDERR join(":", $i, $keyColsN[$i], $keyColsI[$i]) . "\n";
+ }
+
+ print STDERR "Processing results\n";
+ while () {
+ chomp;
+ my @row = split(/\t/, $_);
+ my $cpos = join(":", @row[0, 1]);
+ if (exists $link{$cpos}) {
+# my $key = join(":", @row[0 .. 4,6, 9,10,11,12,14 .. 21]);
+ my $key = join(":", @row[@keyColsI]);
+ #print STDERR join(":", @keyColsI) . "\n";
+# print STDERR join(":", @row) . "\n"; sleep 1;
+ if (not exists($uniq{$key})) {
+ foreach my $locus (keys(%{$link{$cpos}})) {
+ next if ($link{$cpos}{$locus} eq "LSO");
+ push @{$QC{$locus}{RES}}, [@row, $link{$cpos}{$locus}];
+ # print STDERR "Adding $key to $locus\n\n"; #sleep 1;
+ }
+ $uniq{$key} = 0;
+ }
+ else {
+ # print STDERR $key . " : Exists\n\n"; #sleep 1;
+ }
+ }
+ }
+ close RES;
+
+ ## Rplots that are not self-explanatory enough
+ #print OUT "
";
+ #print OUT "
";
+ #print OUT "
";
+ #print OUT "
";
+ #print OUT "
";
+ #print OUT "
";
+
+ print OUT "
";
+ #print OUT "
";
+ #print OUT "
";
+ #print OUT "
";
+ #print OUT "
";
+ print OUT "Download: | TSV | ";
+ print OUT "XLS |
---|
";
+ print OUT "\n";
+# my @colnames = qw/depth chr pos id ref var qual DP AD vaf context impact effectClass codonChange AAChange RefSeqLength geneName RefSeqClass RefSeqID Exon Tag/;
+ my @colnames = ("depth", @keyColsN);
+ print OUT "" . join("", map { "$_ | " } ("Amplicon", "c","c2", "b", @colnames)) . "
";
+ print OUT2 join("\t", "Amplicon", @colnames) . "\n";
+ my $excelAref = [["Amplicon", @colnames]];
+ my $excelAref2 = [[qw/gene exon protein vaf func depth /]];
+ my %rescnt = ();
+ foreach my $locus (sort keys(%QC)) {
+ # my @targets = keys(%{$QC{$locus}{QC}});
+# print OUT "";
+ my $nres = 1;
+ $nres = scalar(@{$QC{$locus}{RES}}) if ($QC{$locus}{RES});
+
+ print OUT "$locus | \n";
+ print OUT "c | \n";
+ print OUT "c2 | \n";
+ print OUT "b | \n";
+ print OUT "$QC{$locus}{QC}->[-1] | \n"; #";
+
+ foreach my $res (@{$QC{$locus}{RES}}) {
+ $res = [map {$_ || ""} @$res];
+ print OUT " | \n";
+ print OUT join(" | \n", @{$res}[@keyColsI]);
+ print OUT " |
\n";
+ print OUT2 join("\t", $locus, $QC{$locus}{QC}->[-1], @{$res}[@keyColsI]) . "\n";
+ push @$excelAref, [$locus, $QC{$locus}{QC}->[-1], @{$res}[@keyColsI]];
+
+ # qw/QUAL Gene_Name Codon_Change Amino_Acid_change vaf DP AD Tag CHROM POS ID REF ALT Context Effect_Impact Functional_Class Amino_Acid_length Coding Transcript Exon/;
+ push @$excelAref2, [map {$_ || "NA"} @{$res}[map { $resCol{$_} } (qw/Gene_Name Exon_Rank Amino_Acid_change vaf Functional_Class/)], $QC{$locus}{QC}->[-1]];
+# push @$excelAref2, [map {$_ || "NA"} @{$res}[[qw/Gene_Name Exon_Rank Cdna_change Amino_Acid_change vaf Functional_Class/]], $QC{$locus}{QC}->[-1]];
+ my $pl = $res->[$keyColsI[7]];
+ my $ref = $res->[$keyColsI[11]];
+ my $var = $res->[$keyColsI[12]];
+# print STDERR "$pl $ref $var\n"; sleep 1;
+ if (length($ref) == length($var)) {
+ $rescnt{$pl . "snp"}++;
+ }
+ else {
+ $rescnt{$pl . "indel"}++;
+ }
+# $rescnt++;
+ }
+ print STDERR $locus . ":" . $nres . "\n";
+ print STDERR $locus . ":" . join("-", @{$QC{$locus}{RES}}) . "\n";
+ if (scalar(@{$QC{$locus}{RES}}) == 0) { #$nres == 0) {
+ print OUT2 join("\t", $locus, $QC{$locus}{QC}->[-1], ("-") x scalar(@keyColsI)) . "\n";
+ push @$excelAref, [$locus, $QC{$locus}{QC}->[-1], ("-") x scalar(@keyColsI)];
+ }
+ print OUT "
\n";
+ }
+ print OUT "
\n";
+ print OUT "\n";
+ close OUT;
+ close OUT2;
+ $excel1->write_col(0,0, $excelAref);
+ $excel2->write_col(0,0, $excelAref2);
+ $excelBook->close();
+ my @resCnt = map { $rescnt{$_} || 0 } qw/Falcosnp Falcoindel/;
+ my $pctGood = sprintf("%.2f", $amp100 / scalar(keys(%QC)) * 100);
+ #print INDEX "$sample | BAM | $resCnt[0] | $resCnt[1] | $readCnt | $pctGood |
\n";
+ my $ntbsAmps = join(",", map { s/^(.*?)\.chr.*$/$1/; $_; } keys(%ntbGenes));
+ push @{$excel0Ref}, [$sample, $runName, $readCnt, $pctGood, $ntbsAmps];
+}
+
+$excel0->write_col(0,0, $excel0Ref);
+$excelBook0->close();
+
+#print INDEX "
";
+#print INDEX "
";
+#print INDEX "
";
+#print INDEX "
";
+#print INDEX "