# HG changeset patch
# User stef
# Date 1419351993 18000
# Node ID a793ffd0cd040500e3dcb62473c331ea8233e4be
# Parent f1b32c09deafc1f81a9d845e8226d5e68e48d258
Uploaded
diff -r f1b32c09deaf -r a793ffd0cd04 README.rst
--- a/README.rst Fri Oct 10 11:28:57 2014 -0400
+++ b/README.rst Tue Dec 23 11:26:33 2014 -0500
@@ -1,21 +1,35 @@
-Falco Amplicon Analysis Pipeline & Galaxy wrapper
+FALCO Amplicon Analysis Pipeline & Galaxy wrapper
========================================
-Text
+FALCO variant-caller is part of the Amplicon Analysis Pipeline (AAP).
+
+The typical workflow is as follows:
+* paired-end amplicon sequencing
+* merge pairs by overlapping them
+* map the single read fastq with BWA
+* perform variant calling with FALCO
+* create (html) report of the results
+
+FALCO uses samtools and straight-forward statistics to determine wether a
+potential variant is likely a (technical) artifact or not.
+
+Input / Output
+=======
+
+Input of the caller is a BAM file, output VCF
History
=======
-============== ========================================================================
+============== ================================================================
Date Changes
--------------- ------------------------------------------------------------------------
-September 2014 * first release in Tool Shed
-July 2014 * first release in Test Tool Shed
-============== ========================================================================
+-------------- ----------------------------------------------------------------
+December 2014 * first release in Test Tool Shed
+============== ================================================================
Bug Reports & other questions
=============================
-Issues can be reported via the contact form at
+Issues can be reported via http://www.tgac.nl/
diff -r f1b32c09deaf -r a793ffd0cd04 falco-call.sh
--- a/falco-call.sh Fri Oct 10 11:28:57 2014 -0400
+++ b/falco-call.sh Tue Dec 23 11:26:33 2014 -0500
@@ -1,7 +1,7 @@
#!/bin/bash
TOOLDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
-echo "Start falco sh"
+echo "Started FALCO calling"
## ----------
## Variables setup
@@ -13,105 +13,51 @@
## ----------
## Create extra init script
## ----------
-if [ $design_file != 'None' ] # Galaxy default is "None"
+if [[ $filter_file != 'None' && $filter_file != '' ]] # Galaxy default somehow is "None"
then
- BED_PATH=$design_file
+ FILTER_PARAM=" --filter "$filter_file
else
- BED_PATH="$TOOLDIR/tool-data/GALAXY_TSACP.bed"
-fi
-
-if [ $manifest_file != 'None' ] # Galaxy default is "None"
-then
- MAN_PATH=$manifest_file
-else
- MAN_PATH="$TOOLDIR/tool-data/GALAXY_TruSeq_Amplicon_Cancer_Panel_Manifest_AFP1_PN15032433.txt"
+ FILTER_PARAM=""
fi
-echo "BED:"$design_file
-echo "MAN:"$manifest_file
-echo "BED chosen:"$BED_PATH
-echo "MAN chosen:"$MAN_PATH
-#echo "VARSCAN_PATH1:"$SNPEFF_JAR_PATH
-#echo "VARSCAN_PATH2:"\$SNPEFF_JAR_PATH
-
-init_file="galaxy_init.sh"
-echo '#!/bin/bash' > $init_file
-
-## executables
-#echo "snpEff=/ccagc/lib/snpEff/snpEff_v3_4" >> $init_file
-#echo "snpEff=\$SNPEFF_JAR_PATH/" >> $init_file
-#echo "snpSift=/ccagc/lib/SnpSift" >> $init_file
-#echo "snpSift=\$SNPEFF_JAR_PATH/" >> $init_file
-#echo "varscan=/ccagc/lib/VarScan/VarScan.v2.2.11.jar" >> $init_file
-#echo "varscan=\$JAVA_JAR_PATH/VarScan.v2.3.5.jar" >> $init_file
-echo "samtools=samtools" >> $init_file
-echo "bcftools=bcftools" >> $init_file
-
-## files
-echo "targetBed=$BED_PATH" >> $init_file
-echo "manifest=$MAN_PATH" >> $init_file
+if [[ $manifest_file != 'None' && $manifest_file != 'None' ]] # Galaxy default is "None"
+then
+ MANIFEST_PARAM=" --manifest "$manifest_file
+else
+ MANIFEST_PARAM=""
+fi
## ----------
## Status / debug
## ----------
-echo "BAM: "$bam_file
-echo "OUT: "$bam_name
-echo "BAMBASE: "$bam_base
-#echo "OUTPATH: "$out_path
-#echo "HTMLOUT: "$html_out
-echo "VCFOUT : "$vcf_out
-echo "INIT : "$init_file
-
-echo "------ sourcing init_file"
-source $init_file
-
-#echo "------ --- ------"
-#echo "------ sam ------"
-#echo "------ --- ------"
-#$samtools
-#echo "------ --- ------"
-#echo "------ bcf ------"
-#echo "------ --- ------"
-#$bcftools
-#echo "--- snpsift ---"
-#java -jar $snpSift/SnpSift.jar
-#echo "--- snpeff ---"
-#java -jar $snpEff/snpeff.jar
-#echo "--- varscan ---"
-#java -jar $varScan
-
-
-
+DEBUG=1
+if [ $DEBUG ]
+then
+ DBS="[INFO] "
+ echo $DBS"FILTER: "$filter_file
+ echo $DBS"MANIFES: "$manifest_file
+ echo $DBS"REFFILE: "$REF_FILE
+ echo $DBS"DB_KEY: "$DB_KEY
+ echo $DBS"REF_SRC: "$REF_SOURCE
+ echo $DBS"BAMFILE: "$bam_file
+ echo $DBS"BAMNAME: "$bam_name
+ echo $DBS"BAMBASE: "$bam_base
+ echo $DBS"OUTPATH: "$out_path
+fi
## ----------
## create output files dir
## ----------
mkdir $out_path
-mkdir $out_path/$bam_base
## ----------
## running analysis
## ----------
-
-## DEBUG
-#exit 0
-echo "Starting VcAn"
-$TOOLDIR/falco/amplicon-VcAn-TSACP.sh $bam_file $bam_base
-echo "...done with VcAn"
-#$TOOLDIR/falco/amplicon-AlVcAn-TSACP.sh ## this contains more
-
-#echo "Starting amplicon-Report"
-#$TOOLDIR/falco/amplicon-Report.sh $bam_base
-
-#echo "Starting QC"
-#$TOOLDIR/falco/amplicon-QC.sh $bam_file
-
-#echo "Starting plots.R"
-#$TOOLDIR/falco/misc/plots.R $bam_base.qc.ann.txt $bam_base.qc.targets.txt $bam_base.pdf
-
-#echo "Starting mkHtmlReport"
-#$TOOLDIR/falco/misc/mkHtmlReport.pl ./ ./
-
+echo "Starting variant calling"
+CALL_STRING="$TOOLDIR/falco/bin/falco --bam $bam_file --output $bam_base --ref $REF_FILE $FILTER_PARAM $MANIFEST_PARAM"
+echo "[INFO] "$CALL_STRING
+perl $CALL_STRING
+echo "...done with variant calling"
## ----------
@@ -128,14 +74,15 @@
echo "" >> $html_out
echo "
" >> $html_out
echo " FALCO " >> $html_out
-echo " HTML " >> $html_out
+echo " This page is way to get output files that are not implemented in galaxy history, it is not intended to be a user-friendly way of displaying anything ;)
" >> $html_out
+#echo " HTML " >> $html_out
echo " " >> $html_out
-#for file in *.vcf *.txt *.tsv
-for file in *
+for file in *.vcf *.txt *stderr *stdout
+#for file in *
do
lineCount=`wc -l $file | cut -f 1 -d " "`
echo " $file has $lineCount lines " >> $html_out
- #echo " HEAD: " `head $file` " " >> $html_out
+ echo " --> " `head -1 $file` " " >> $html_out
done
echo "
" >> $html_out
echo "" >> $html_out
@@ -146,14 +93,14 @@
## ----------
#cp 'index.html' $html_out # this is the overview of samples html
#cp $bam_base'.html' $out_path/'out.html' # this is the sample html
-cp $bam_base'.vcf' $vcf_out
+cp $bam_base'.falco.vcf' $vcf_out
## ----------
## copy files to keep to output path
## ----------
#cp -r ./$bam_base/*png $out_path/$bam_base/
#cp -r ./* $out_path
-#cp *.vcf $out_path; cp *.tsv $out_path; cp *.txt $out_path; cp *.html $out_path
+cp *.vcf $out_path; cp *.txt $out_path; cp *_std* $out_path
## ----------
echo "END falco sh"
diff -r f1b32c09deaf -r a793ffd0cd04 falco-call.xml
--- a/falco-call.xml Fri Oct 10 11:28:57 2014 -0400
+++ b/falco-call.xml Tue Dec 23 11:26:33 2014 -0500
@@ -6,48 +6,16 @@
R
bcftools
-
- TOOL_DIR
FALCO: Amplicon Analysis Pipeline
-
-
-
-
-
-
falco-call.sh $falco_cfg;
-
@@ -58,14 +26,31 @@
+
+
+ Use a built-in index
+ Use one from history
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\S+
-
-
+
@@ -79,27 +64,32 @@
- design_file=$design_file
+ filter_file=$filter_file
manifest_file=$manifest_file
bam_file=$bam
bam_name=$bam.name
html_out=$html_output
vcf_out=$vcf_output
- out_path=${vcf_output.files_path}
+ out_path=${html_output.files_path}
+ ## reference source
+ REF_SOURCE="${genomeSource.refGenomeSource}"
+ #if $genomeSource.refGenomeSource == "history":
+ ##build index on the fly
+ REF_FILE="${genomeSource.ownFile}"
+ DB_KEY="${dbkey}"
+ #else:
+ ##use precomputed indexes
+ REF_FILE="${genomeSource.indices.fields.path}"
+ #end if
-
+
-
-
+
+
@@ -120,17 +110,23 @@
**Introduction**
-This tool is a wrapper for the Amplicon Analysis Pipeline (AAP) FALCO_
+This tool is a wrapper for the variant caller FALCO_ which is part of the Amplicon Analysis Pipeline (AAP)
-.. _FALCO: https://bitbucket.org/daoud/falco
+.. _FALCO: https://github.com/tgac-vumc/falco/
Calls and annotates genomic variants for each amplicon in a design.
+**Notes**
+
+Because each amplicon is considered separate test, if two amplicons overlap this can cause the same variant position to be present twice in the output VCF. This is intentional, you can use this to evaluate the quality of the variant call beyond the amplicon.
+
-----
**Citation**
-For the underlying tool please cite: Daoud Sie et al. Performance of amplicon-based next generation DNA sequencing for diagnostic gene mutation profiling in oncopathology (Cell Oncol 2014 Oct;37(5):353-61).
+For the underlying tool please cite: Daoud Sie et al. Performance of amplicon-based next generation DNA sequencing for diagnostic gene mutation profiling in oncopathology (Cell Oncol 2014 Oct;37(5):353-61). [Pubmed]
+
+.. _Pubmed: http://www.ncbi.nlm.nih.gov/pubmed/25209392
diff -r f1b32c09deaf -r a793ffd0cd04 falco-report.sh
--- a/falco-report.sh Fri Oct 10 11:28:57 2014 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,158 +0,0 @@
-#!/bin/bash
-TOOLDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
-
-echo "Start falco sh"
-
-## ----------
-## Variables setup
-## ----------
-#bam_file=$1
-#bam_name=$2
-#html_out=$3
-#vcf_out=$4
-#out_path=$5
-source $1
-#JAR_PATH=$3
-
-bam_base=`echo $bam_name | sed 's#.bam$##' - `
-
-## ----------
-## Create extra init script
-## ----------
-#if [ $design_file != 'None' && $design_file != '' ]; then BED_PATH=$design_file; fi
-#if [ $manifest_file != 'None' && $manifest_file != '' ]; then MAN_PATH=$manifest_file; fi
-if [ $design_file != 'None' ] # Galaxy default is "None"
-then
- BED_PATH=$design_file
-else
- BED_PATH="$TOOLDIR/tool-data/GALAXY_TSACP.bed"
-fi
-
-if [ $manifest_file != 'None' ] # Galaxy default is "None"
-then
- MAN_PATH=$manifest_file
-else
- MAN_PATH="$TOOLDIR/tool-data/GALAXY_TruSeq_Amplicon_Cancer_Panel_Manifest_AFP1_PN15032433.txt"
-fi
-
-echo "BED:"$design_file
-echo "MAN:"$manifest_file
-echo "BED chosen:"$BED_PATH
-echo "MAN chosen:"$MAN_PATH
-#echo "VARSCAN_PATH1:"$SNPEFF_JAR_PATH
-#echo "VARSCAN_PATH2:"\$SNPEFF_JAR_PATH
-echo "JAR_PATH1:"$JAR_PATH
-#echo "JAR_PATH2:"$jar_path2
-
-init_file="galaxy_init.sh"
-echo '#!/bin/bash' > $init_file
-
-## executables
-#echo "snpEff=/ccagc/lib/snpEff/snpEff_v3_4" >> $init_file
-#echo "snpEff=\$SNPEFF_JAR_PATH/" >> $init_file
-#echo "snpSift=/ccagc/lib/SnpSift" >> $init_file
-#echo "snpSift=\$SNPEFF_JAR_PATH/" >> $init_file
-#echo "varscan=/ccagc/lib/VarScan/VarScan.v2.2.11.jar" >> $init_file
-#echo "varscan=\$JAVA_JAR_PATH/VarScan.v2.3.5.jar" >> $init_file
-
-## files
-echo "targetBed=$BED_PATH" >> $init_file
-echo "manifest=$MAN_PATH" >> $init_file
-
-## ----------
-## Status / debug
-## ----------
-echo "BAM: "$bam_file
-echo "OUT: "$bam_name
-echo "BAMBASE: "$bam_base
-echo "OUTPATH: "$out_path
-echo "HTMLOUT: "$html_out
-echo "VCFOUT : "$vcf_out
-echo "INIT : "$init_file
-
-source $init_file
-
-echo "--- sam ---"
-samtools
-echo "--- bcf ---"
-bcftools
-#echo "--- snpsift ---"
-#java -jar $snpSift/SnpSift.jar
-#echo "--- snpeff ---"
-#java -jar $snpEff/snpeff.jar
-#echo "--- varscan ---"
-#java -jar $varScan
-
-exit 0
-
-## ----------
-## create output files dir
-## ----------
-mkdir $out_path
-mkdir $out_path/$bam_base
-
-## ----------
-## running analysis
-## ----------
-echo "Starting VcAn"
-$TOOLDIR/falco/amplicon-VcAn-TSACP.sh $bam_file $bam_base
-#$TOOLDIR/falco/amplicon-AlVcAn-TSACP.sh ## this contains more
-
-echo "Starting amplicon-Report"
-$TOOLDIR/falco/amplicon-Report.sh $bam_base
-
-echo "Starting QC"
-$TOOLDIR/falco/amplicon-QC.sh $bam_file
-
-#echo "Starting plots.R"
-#$TOOLDIR/falco/misc/plots.R $bam_base.qc.ann.txt $bam_base.qc.targets.txt $bam_base.pdf
-
-echo "Starting mkHtmlReport"
-$TOOLDIR/falco/misc/mkHtmlReport.pl ./ ./
-
-
-
-## ----------
-## create index html for main galaxy output
-## ----------
-echo "" >> $html_out
-echo "" >> $html_out
-echo "" >> $html_out
-echo "" >> $html_out
-echo "" >> $html_out
-echo "" >> $html_out
-echo " FALCO " >> $html_out
-echo " HTML " >> $html_out
-echo " " >> $html_out
-#for file in *.vcf *.txt *.tsv
-for file in *
-do
- lineCount=`wc -l $file | cut -f 1 -d " "`
- echo " $file has $lineCount lines " >> $html_out
- #echo " HEAD: " `head $file` " " >> $html_out
-done
-echo "
" >> $html_out
-echo "" >> $html_out
-echo "" >> $html_out
-
-## ----------
-## creating galaxy history outputs
-## ----------
-#cp 'index.html' $html_out # this is the overview of samples html
-cp $bam_base'.html' $out_path/'out.html' # this is the sample html
-cp $bam_base'.vcf' $vcf_out
-
-## ----------
-## copy files to keep to output path
-## ----------
-cp -r ./$bam_base/*png $out_path/$bam_base/
-cp -r ./* $out_path
-#cp *.vcf $out_path; cp *.tsv $out_path; cp *.txt $out_path; cp *.html $out_path
-
-## ----------
-echo "END falco sh"
-exit 0
\ No newline at end of file
diff -r f1b32c09deaf -r a793ffd0cd04 falco-report.xml
--- a/falco-report.xml Fri Oct 10 11:28:57 2014 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,137 +0,0 @@
-
-
-
-
- samtools
-
- R
-
-
- TOOL_DIR
-
-
-
- FALCO: Amplicon Analysis Pipeline
-
-
-
-
-
-
-
-
- falco-report.sh $falco_cfg;
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- \S+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- design_file=$design_file
- manifest_file=$manifest_file
- bam_file=$bam
- bam_name=$bam.name
-
- vcf_out=$vcf_output
- out_path=${vcf_output.files_path}
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-.. class:: infomark
-
-**Introduction**
-
-This tool is a wrapper for the Amplicon Analysis Pipeline (AAP) FALCO_
-
-.. _FALCO: https://bitbucket.org/daoud/falco
-
-Calls and annotates genomic variants for each amplicon in a design.
-
------
-
-**Citation**
-
-For the underlying tool please cite: Daoud Sie et al. Performance of amplicon-based next generation DNA sequencing for diagnostic gene mutation profiling in oncopathology (Cell Oncol 2014 Oct;37(5):353-61).
-
-
-
-
diff -r f1b32c09deaf -r a793ffd0cd04 falco/README
--- a/falco/README Fri Oct 10 11:28:57 2014 -0400
+++ b/falco/README Tue Dec 23 11:26:33 2014 -0500
@@ -1,31 +1,24 @@
-#
-# Author Daoud Sie
-# e-mail d.sie@vumc.nl
-#
-######
+Usage:
+
+falco
+
+input:
-Let's put some useful stuff here...
-
-The Amplicon Analysis Pipeline (AAP) takes fastq files as a starting point. However, each
-stage of the pipeline can be run separately.
-
-amplicon-Al-TSACP.sh R1.fq R2.fq output
-
-This will run the following:
+- bam file
+- manifest file
+- ref + fai
+- locifilt
+- manifest
-- flash read merging
-- adapter clipping
-- quality trimmer
-- alignemt using BWA
+output:
-- amplicon-VcAn-TSACP.sh
--- variant calling and annotation:
--- samtools variant caller
--- custom variant caller
--- snpEff effect prediction
--- dbSNP annotations
+- *.vcf file
+- *.qc.ann.txt -> catalog of each assayed bp
+- *.qc2.ann.txt -> catalog of transitions
+- *.qc.targets.txt -> amplicon centered information, eg depth, start/end positions.
-- amplicon-Report.sh
--- filtering of variants
--- report generation
+example:
+falco --bam input.bam --output outputBase --ref [hg19.fa]
+
+
diff -r f1b32c09deaf -r a793ffd0cd04 falco/amplicon-Al-TSACP.sh
--- a/falco/amplicon-Al-TSACP.sh Fri Oct 10 11:28:57 2014 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,105 +0,0 @@
-#!/bin/bash
-
-DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
-
-source $DIR/init.sh
-
-TMP=./tmp
-if [ ! -e $TMP ]
-then
- mkdir $TMP
-fi
-
-fq1=$1
-fq2=$2
-output=$3
-
-bfq1=`basename $fq1`
-bfq2=`basename $fq2`
-
-if [ ! -e $TMP/$bfq1.trim ]
-then
-echo `date` " [$$] - $adapt_clipper -a $illmP5 $fq1 > $TMP/$bfq1.trim"
-$adapt_clipper -a $illmP5 $fq1 > $TMP/$bfq1.trim
-echo `date` " [$$] - Done!"
-fi
-
-if [ ! -e $TMP/$bfq1.qtrim ]
-then
-echo `date` " [$$] - $qTrim -t30 -i $TMP/$bfq1.trim -o $TMP/$bfq1.qtrim -Q34"
-$qTrim -t30 -i $TMP/$bfq1.trim -o $TMP/$bfq1.qtrim -Q34
-echo `date` " [$$] - Done!"
-fi
-
-if [ ! -e $TMP/$bfq2.trim ]
-then
-echo `date` " [$$] - $adapt_clipper -a $illmP7 $fq2 > $TMP/$bfq2.trim"
-$adapt_clipper -a $illmP7 $fq2 > $TMP/$bfq2.trim
-echo `date` " [$$] - Done!"
-fi
-
-if [ ! -e $TMP/$bfq2.qtrim ]
-then
-echo `date` " [$$] - $qTrim -t10 -i $TMP/$bfq2.trim -o $TMP/$bfq2.qtrim -Q34"
-$qTrim -t30 -i $TMP/$bfq2.trim -o $TMP/$bfq2.qtrim -Q34
-echo `date` " [$$] - Done!"
-fi
-
-if [ ! -e $TMP/$bfq1.qtrim.paired ]
-then
-echo `date` " [$$] - perl $pairUp $TMP/$bfq1.qtrim $TMP/$bfq2.qtrim"
-perl $pairUp $TMP/$bfq1.qtrim $TMP/$bfq2.qtrim
-echo `date` " [$$] - Done!"
-fi
-
-# FLASH Fastq joiner
-if [ ! -e $TMP/$output.flashed.fq ]
-then
-
-$flash -r150 -f180 -c $TMP/$bfq1.qtrim.paired $TMP/$bfq2.qtrim.paired > $TMP/$output.flashed.fq
-
-fi
-
-#if [ ! -e $TMP/$output ]
-#then
-#echo `date` " [$$] - $stampy -g $index -h $hash --readgroup=ID:$output,SM:$output --gatkcigarworkaround --bwaoptions=\"$bwaref\" --bwa="$bwa" --bwamark -o $TMP/$output -M $TMP/$bfq1.qtrim.paired $TMP/$bfq2.qtrim.paired"
-#$stampy -g $index -h $hash --readgroup=ID:$output,SM:$output --gatkcigarworkaround --bwaoptions="$bwaref" --bwa="$bwa" --bwamark -o $TMP/$output -M $TMP/$bfq1.qtrim.paired $TMP/$bfq2.qtrim.paired
-#echo `date` " [$$] - Done!"
-#fi
-
-# Added -e 40 for maximum indel size of 40,...
-#if [ ! -e $TMP/$output ]
-#then
-#echo `date` " [$$] - $stampy -g $index -h $hash --readgroup=ID:$output,SM:$output --gatkcigarworkaround --bwaoptions=\"-e 40 $bwaref\" --bwa="$bwa" --bwamark -o $TMP/$output -M $TMP/$output.flashed.fq"
-#$stampy -g $index -h $hash --readgroup=ID:$output,SM:$output --gatkcigarworkaround --bwaoptions="-e 40 $bwaref" --bwa="$bwa" --bwamark -o $TMP/$output -M $TMP/$output.flashed.fq
-#echo `date` " [$$] - Done!"
-#fi
-
-if [ ! -e $output.bam ]
-then
-echo `date` " [$$] - $bwa aln -e 40 $bwaref $TMP/$output.flashed.fq > $TMP/$output.sai"
-$bwa aln -e 40 $bwaref $TMP/$output.flashed.fq > $TMP/$output.sai
-echo `date` " [$$] - Done!"
-
-echo `date` " [$$] - $bwa samse -r \"@RG\\tID:$output\\tSM:$output\" $bwaref $TMP/$output.sai $TMP/$output.flashed.fq > $TMP/$output.sam"
-$bwa samse -r "@RG\tID:$output\tSM:$output" $bwaref $TMP/$output.sai $TMP/$output.flashed.fq > $TMP/$output.sam
-echo `date` " [$$] - Done!"
-
-echo `date` " [$$] - $samtools view -uS $TMP/$output.sam | $samtools sort - $output"
-$samtools view -uS $TMP/$output.sam | $samtools sort - $output
-echo `date` " [$$] - Done!"
-
-echo `date` " [$$] - $samtools index $output.bam"
-$samtools index $output.bam
-echo `date` " [$$] - Done!"
-fi
-
-echo `date` "[$$] - $VcAn $output.bam $output"
-$VcAn $output.bam $output
-echo `date` "[$$] - Done!"
-
-echo `date` "[$$] - $report $output"
-$report $output
-echo `date` "[$$] - Done!"
-
-exit
diff -r f1b32c09deaf -r a793ffd0cd04 falco/amplicon-AlVcAn-HaloPlex.sh
--- a/falco/amplicon-AlVcAn-HaloPlex.sh Fri Oct 10 11:28:57 2014 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,127 +0,0 @@
-#!/bin/bash
-
-index=/scratch/data/ref/hg19/stampy/hg19
-hash=/scratch/data/ref/hg19/stampy/hg19
-bwaref=/scratch/data/ref/hg19/bwa-5/hg19.fa
-
-adapt_clipper=/opt/cutadapt/cutadapt-1.1/bin/cutadapt
-illmP5="AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC"
-illmP7="AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT"
-qTrim=/opt/bin/fastq_quality_trimmer
-pairUp=/storage/shark1/data/Git/ampliconPipeLine/pairUp.pl
-stampy=/opt/stampy/stampy-1.0.18/stampy.py
-samtools=/opt/samtools/samtools-0.1.18/samtools
-pileup2cov=/storage/shark1/data/Git/ampliconPipeLine/pileup2cov.pl
-attachManifest=/storage/shark1/data/Git/ampliconPipeLine/attachGff.pl
-manifest=/scratch/data/ref/manifests/HaloPlex/Analyzable-region-01355-1335878719.gff
-snpEff=/opt/snpEff/snpEff_3_0a
-
-plots=/storage/shark1/data/Git/ampliconPipeLine/plots.R
-
-fq1=$1
-fq2=$2
-output=$3
-
-if [ ! -e $fq1.trim ]
-then
-echo `date` " [$$] - $adapt_clipper -a $illmP5 $fq1 > $fq1.trim"
-$adapt_clipper -a $illmP5 $fq1 > $fq1.trim
-echo `date` " [$$] - Done!"
-fi
-
-if [ ! -e $fq1.qtrim ]
-then
-echo `date` " [$$] - $qTrim -t30 -i $fq1.trim -o $fq1.qtrim -Q34"
-$qTrim -t30 -i $fq1.trim -o $fq1.qtrim -Q34
-echo `date` " [$$] - Done!"
-fi
-
-if [ ! -e $fq2.trim ]
-then
-echo `date` " [$$] - $adapt_clipper -a $illmP7 $fq2 > $fq2.trim"
-$adapt_clipper -a $illmP7 $fq2 > $fq2.trim
-echo `date` " [$$] - Done!"
-fi
-
-if [ ! -e $fq2.qtrim ]
-then
-echo `date` " [$$] - $qTrim -t30 -i $fq2.trim -o $fq2.qtrim -Q34"
-$qTrim -t30 -i $fq2.trim -o $fq2.qtrim -Q34
-echo `date` " [$$] - Done!"
-fi
-
-if [ ! -e $fq1.qtrim.paired ]
-then
-echo `date` " [$$] - perl $pairUp $fq1.qtrim $fq2.qtrim"
-perl $pairUp $fq1.qtrim $fq2.qtrim
-echo `date` " [$$] - Done!"
-fi
-
-if [ ! -e $output ]
-then
-echo `date` " [$$] - $stampy -g $index -h $hash --bwaoptions=\"$bwaref\" -o $output -M $fq1.qtrim.paired $fq2.qtrim.paired"
-$stampy -g $index -h $hash --bwaoptions="$bwaref" -o $output -M $fq1.qtrim.paired $fq2.qtrim.paired
-echo `date` " [$$] - Done!"
-fi
-
-if [ ! -e $output.bam ]
-then
-echo `date` " [$$] - $samtools view -uS $output | $samtools sort - $output"
-$samtools view -uS $output | $samtools sort - $output
-echo `date` " [$$] - Done!"
-
-echo `date` " [$$] - $samtools index $output.bam"
-$samtools index $output.bam
-echo `date` " [$$] - Done!"
-fi
-
-if [ ! -e $output.vcf ]
-then
-echo `date` " [$$] - $samtools mpileup -Euf $bwaref -L 1000000 -d 1000000 -F 0.01 -m 10 -o 20 $output.bam | bcftools view -bvcg - > $output.bcf"
-$samtools mpileup -Euf $bwaref -L 1000000 -d 1000000 -F 0.01 -m 10 -o 20 $output.bam | bcftools view -bvcg - > $output.bcf
-echo `date` " [$$] - Done!"
-echo `date` " [$$] - bcftools view $output.bcf > $output.vcf"
-bcftools view $output.bcf > $output.vcf
-echo `date` " [$$] - Done!"
-fi
-
-if [ ! -e $output.txt ]
-then
-echo `date` " [$$] - java -jar $snpEff/snpEff.jar eff hg19 -noStats -noLog -o txt -c $snpEff/snpEff.config $output.vcf > $output.snpeff.vcf"
-java -jar $snpEff/snpEff.jar eff hg19 -noStats -noLog -o txt -c $snpEff/snpEff.config $output.vcf > $output.snpeff.vcf
-echo `date` " [$$] - Done!"
-
-dbSnp=/scratch/data/ref/dbSNP/dbSnp_120616.vcf
-dbNSFP=/scratch/data/ref/dbNSFP/dbNSFP2.0b3.txt
-snpSift=/opt/SnpSift
-
-echo `date` " [$$] [$$] - java -jar /opt/SnpSift/SnpSift_latest.jar annotate -v $dbSnp $output.snpeff.vcf > $output.snpeff.2.vcf"
-java -jar /opt/SnpSift/SnpSift_latest.jar annotate -v $dbSnp $output.snpeff.vcf > $output.snpeff.2.vcf
-echo `date` " [$$] [$$] - Done!"
-
-echo `date` " [$$] [$$] - java -jar /opt/SnpSift/SnpSift_latest.jar dbnsfp -v $dbNSFP $output.snpeff.2.vcf > $output.snpeff.3.vcf"
-java -jar /opt/SnpSift/SnpSift_latest.jar dbnsfp -v $dbNSFP $output.snpeff.2.vcf > $output.snpeff.3.vcf
-echo `date` " [$$] [$$] - Done!"
-
-fi
-
-if [ ! -e $output.qc.txt ]
-then
-echo `date` " [$$] - $pileup2cov $bwaref $output.bam > $output.qc.txt"
-$samtools view -Xq10 $output.bam | $pileup2cov $bwaref > $output.qc.txt
-echo `date` " [$$] - Done!"
-fi
-
-if [ ! -e $output.qc.ann.txt ]
-then
-echo `date` " [$$] - $attachManifest $manifest $output.qc.txt $output.qc.targets.txt > $output.qc.ann.txt"
-$attachManifest $manifest $output.qc.txt $output.qc.targets.txt > $output.qc.ann.txt
-echo `date` " [$$] - Done!"
-fi
-
-if [ ! -e $output.pdf ]
-then
-echo `date` " [$$] - $plots $output.qc.ann.targets.txt $output.qc.targets.txt"
-$plots $output.qc.ann.txt $output.qc.targets.txt $output.pdf
-echo `date` " [$$] - Done!"
-fi
diff -r f1b32c09deaf -r a793ffd0cd04 falco/amplicon-AlVcAn-TSACP.sh
--- a/falco/amplicon-AlVcAn-TSACP.sh Fri Oct 10 11:28:57 2014 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,236 +0,0 @@
-#!/bin/bash
-
-DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
-TMP=./tmp
-if [ ! -e $TMP ]
-then
- mkdir $TMP
-fi
-
-index=$DIR/ref/hg19/stampy/hg19
-hash=$DIR/ref/hg19/stampy/hg19
-bwaref=$DIR/ref/hg19/bwa-5/hg19.fa
-manifest=$DIR/ref/manifests/TSACP/TruSeq_Amplicon_Cancer_Panel_Manifest_AFP1_PN15032433.txt
-
-illmP5="AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC"
-illmP7="AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT"
-
-adapt_clipper=$DIR/bin/cutadapt/cutadapt-1.1/bin/cutadapt
-qTrim=$DIR/bin/fastq_quality_trimmer
-stampy=$DIR/bin/stampy/stampy-1.0.18/stampy.py
-samtools=$DIR/bin/samtools/samtools-0.1.18/samtools
-snpEff=$DIR/bin/snpEff/snpEff_3_0a
-varscan=$DIR/bin/VarScan/VarScan.v2.2.11.jar
-java=/usr/java/latest/bin/java
-bwa=$DIR/bin/bwa/bwa-0.5.9/bwa
-
-pairUp=$DIR/misc/pairUp.pl
-pileup2cov=$DIR/misc/pileup2cov.pl
-attachManifest=$DIR/misc/attachManifest.pl
-plots=$DIR/misc/plots.R
-VcAn=$DIR/amplicon-VcAn-TSACP.sh
-report=$DIR/amplicon-Report.sh
-
-fq1=$1
-fq2=$2
-output=$3
-
-bfq1=`basename $fq1`
-bfq2=`basename $fq2`
-
-if [ ! -e $TMP/$bfq1.trim ]
-then
-echo `date` " [$$] - $adapt_clipper -a $illmP5 $fq1 > $TMP/$bfq1.trim"
-$adapt_clipper -a $illmP5 $fq1 > $TMP/$bfq1.trim
-echo `date` " [$$] - Done!"
-fi
-
-if [ ! -e $TMP/$bfq1.qtrim ]
-then
-echo `date` " [$$] - $qTrim -t30 -i $TMP/$bfq1.trim -o $TMP/$bfq1.qtrim -Q34"
-$qTrim -t30 -i $TMP/$bfq1.trim -o $TMP/$bfq1.qtrim -Q34
-echo `date` " [$$] - Done!"
-fi
-
-if [ ! -e $TMP/$bfq2.trim ]
-then
-echo `date` " [$$] - $adapt_clipper -a $illmP7 $fq2 > $TMP/$bfq2.trim"
-$adapt_clipper -a $illmP7 $fq2 > $TMP/$bfq2.trim
-echo `date` " [$$] - Done!"
-fi
-
-if [ ! -e $TMP/$bfq2.qtrim ]
-then
-echo `date` " [$$] - $qTrim -t10 -i $TMP/$bfq2.trim -o $TMP/$bfq2.qtrim -Q34"
-$qTrim -t30 -i $TMP/$bfq2.trim -o $TMP/$bfq2.qtrim -Q34
-echo `date` " [$$] - Done!"
-fi
-
-if [ ! -e $TMP/$bfq1.qtrim.paired ]
-then
-echo `date` " [$$] - perl $pairUp $TMP/$bfq1.qtrim $TMP/$bfq2.qtrim"
-perl $pairUp $TMP/$bfq1.qtrim $TMP/$bfq2.qtrim
-echo `date` " [$$] - Done!"
-fi
-
-if [ ! -e $TMP/$output ]
-then
-echo `date` " [$$] - $stampy -g $index -h $hash --readgroup=ID:$output,SM:$output --gatkcigarworkaround --bwaoptions=\"$bwaref\" --bwa="$bwa" --bwamark -o $TMP/$output -M $TMP/$bfq1.qtrim.paired $TMP/$bfq2.qtrim.paired"
-$stampy -g $index -h $hash --readgroup=ID:$output,SM:$output --gatkcigarworkaround --bwaoptions="$bwaref" --bwa="$bwa" --bwamark -o $TMP/$output -M $TMP/$bfq1.qtrim.paired $TMP/$bfq2.qtrim.paired
-echo `date` " [$$] - Done!"
-fi
-
-if [ ! -e $output.bam ]
-then
-echo `date` " [$$] - $samtools view -uS $TMP/$output | $samtools sort - $output"
-$samtools view -uS $TMP/$output | $samtools sort - $output
-echo `date` " [$$] - Done!"
-
-echo `date` " [$$] - $samtools index $output.bam"
-$samtools index $output.bam
-echo `date` " [$$] - Done!"
-fi
-
-echo `date` "[$$] - $VcAn $output.bam $output"
-$VcAn $output.bam $output
-echo `date` "[$$] - Done!"
-
-echo `date` "[$$] - $report $output"
-$report $output
-echo `date` "[$$] - Done!"
-
-exit
-if [ ! -e $output.vcf ]
-then
-# Use Extended BAQ!
-# Tweak indel settings -m and -F
-echo `date` " [$$] - $samtools mpileup -Euf $bwaref -L 1000000 -d 1000000 -m 150 -F 0.01 -o 20 $output.bam | bcftools view -bvcg - > $output.bcf"
-$samtools mpileup -Euf $bwaref -L 1000000 -d 1000000 -m 150 -F 0.01 -o 20 $output.bam | bcftools view -bvcg - > $output.bcf
-echo `date` " [$$] - Done!"
-echo `date` " [$$] - bcftools view $output.bcf > $output.vcf"
-bcftools view $output.bcf > $output.vcf
-echo `date` " [$$] - Done!"
-fi
-
-if [ ! -e $output.vs.snp.vcf ]
-then
-# VarScan seems a better alternative for smaller VAF's Samtools seems to obscure some false negatives....
-echo `date` " [$$] - $samtools mpileup -Bf $bwaref $output.bam | $java -jar $varscan mpileup2snp --min-var-freq 0.05 --output-vcf > $output.vs.snp.vcf"
-$samtools mpileup -Bf $bwaref $output.bam | $java -jar $varscan mpileup2snp --min-var-freq 0.05 --output-vcf > $output.vs.snp.vcf
-echo `date` " [$$] - Done!"
-fi
-
-if [ ! -e $output.vs.snp.tsv ]
-then
-# VarScan seems a better alternative for smaller VAF's Samtools seems to obscure some false negatives....
-echo `date` " [$$] - $samtools mpileup -Bf $bwaref $output.bam | $java -jar $varscan mpileup2snp --min-var-freq 0.05 > $output.vs.snp.tsv"
-$samtools mpileup -Bf $bwaref $output.bam | $java -jar $varscan mpileup2snp --min-var-freq 0.05 > $output.vs.snp.tsv
-echo `date` " [$$] - Done!"
-fi
-
-if [ ! -e $output.vs.indel.vcf ]
-then
-echo `date` " [$$] - $samtools mpileup -Bf $bwaref $output.bam | $java -jar $varscan mpileup2indel --min-var-freq 0.05 --output-vcf > $output.vs.indel.vcf"
-$samtools mpileup -Bf $bwaref $output.bam | $java -jar $varscan mpileup2indel --min-var-freq 0.05 --output-vcf > $output.vs.indel.vcf
-echo `date` " [$$] - Done!"
-fi
-
-# Annotation
-dbSnp=/scratch/data/ref/dbSNP/dbSnp_120616.vcf
-dbNSFP=/scratch/data/ref/dbNSFP/dbNSFP2.0b3.txt
-snpSift=/opt/SnpSift
-
-VcfIn=$output.vcf
-VcfOut=$output.samtools.vcf
-
-if [ ! -e $VcfOut ]
-then
-# Samtools annotation
-echo `date` " [$$] - $java -jar $snpEff/snpEff.jar eff hg19 -noStats -noLog -o vcf -c $snpEff/snpEff.config $VcfIn > tmp.snpeff.vcf"
-$java -jar $snpEff/snpEff.jar eff hg19 -noStats -noLog -o vcf -c $snpEff/snpEff.config $VcfIn > $output.snpeff.vcf
-mv $output.snpeff.vcf $VcfOut
-echo `date` " [$$] - Done!"
-
-echo `date` " [$$] - $java -jar /opt/SnpSift/SnpSift_latest.jar annotate -v $dbSnp $output.snpeff.vcf > $output.snpeff.2.vcf"
-#$java -jar /opt/SnpSift/SnpSift_latest.jar annotate -v $dbSnp $output.snpeff.vcf > $output.snpeff.2.vcf
-echo `date` " [$$] - Done!"
-
-echo `date` " [$$] - $java -jar /opt/SnpSift/SnpSift_latest.jar dbnsfp -v $dbNSFP $output.snpeff.2.vcf > $output.snpeff.3.vcf"
-#$java -jar /opt/SnpSift/SnpSift_latest.jar dbnsfp -v $dbNSFP $output.snpeff.2.vcf > $output.snpeff.3.vcf
-#mv $output.snpeff.3.vcf $VcfOut
-echo `date` " [$$] - Done!"
-fi
-
-VcfIn=$output.vs.snp.vcf
-VcfOut=$output.vs.snp.a.vcf
-
-if [ ! -e $VcfOut ]
-then
-# VarScan SNP annotation
-echo `date` " [$$] - $java -jar $snpEff/snpEff.jar eff hg19 -noStats -noLog -o vcf -c $snpEff/snpEff.config $VcfIn > tmp.snpeff.vcf"
-$java -jar $snpEff/snpEff.jar eff hg19 -noStats -noLog -o vcf -c $snpEff/snpEff.config $VcfIn > $output.snpeff.vcf
-mv $output.snpeff.vcf $VcfOut
-echo `date` " [$$] - Done!"
-
-echo `date` " [$$] - $java -jar /opt/SnpSift/SnpSift_latest.jar annotate -v $dbSnp $output.snpeff.vcf > $output.snpeff.2.vcf"
-#$java -jar /opt/SnpSift/SnpSift_latest.jar annotate -v $dbSnp $output.snpeff.vcf > $output.snpeff.2.vcf
-echo `date` " [$$] - Done!"
-
-echo `date` " [$$] - $java -jar /opt/SnpSift/SnpSift_latest.jar dbnsfp -v $dbNSFP $output.snpeff.2.vcf > $output.snpeff.3.vcf"
-#$java -jar /opt/SnpSift/SnpSift_latest.jar dbnsfp -v $dbNSFP $output.snpeff.2.vcf > $output.snpeff.3.vcf
-#mv $output.snpeff.3.vcf $VcfOut
-echo `date` " [$$] - Done!"
-fi
-
-VcfIn=$output.vs.indel.vcf
-VcfOut=$output.vs.indel.a.vcf
-
-if [ ! -e $VcfOut ]
-then
-# VarScan INDEL annotation
-echo `date` " [$$] - $java -jar $snpEff/snpEff.jar eff hg19 -noStats -noLog -o vcf -c $snpEff/snpEff.config $VcfIn > tmp.snpeff.vcf"
-$java -jar $snpEff/snpEff.jar eff hg19 -noStats -noLog -o vcf -c $snpEff/snpEff.config $VcfIn > $output.snpeff.vcf
-mv $output.snpeff.vcf $VcfOut
-echo `date` " [$$] - Done!"
-
-echo `date` " [$$] - $java -jar /opt/SnpSift/SnpSift_latest.jar annotate -v $dbSnp $output.snpeff.vcf > $output.snpeff.2.vcf"
-#$java -jar /opt/SnpSift/SnpSift_latest.jar annotate -v $dbSnp $output.snpeff.vcf > $output.snpeff.2.vcf
-echo `date` " [$$] - Done!"
-
-echo `date` " [$$] - $java -jar /opt/SnpSift/SnpSift_latest.jar dbnsfp -v $dbNSFP $output.snpeff.2.vcf > $output.snpeff.3.vcf"
-#$java -jar /opt/SnpSift/SnpSift_latest.jar dbnsfp -v $dbNSFP $output.snpeff.2.vcf > $output.snpeff.3.vcf
-#mv $output.snpeff.3.vcf $VcfOut
-echo `date` " [$$] - Done!"
-fi
-
-
-if [ ! -e $output.qc.txt ]
-then
-echo `date` " [$$] - $pileup2cov $bwaref $output.bam > $output.qc.txt"
-$samtools view -Xq10 $output.bam | $pileup2cov $bwaref > $output.qc.txt
-echo `date` " [$$] - Done!"
-
-echo `date` " [$$] - $samtools view -Xq10 -f0x0040 $output.bam | $pileup2cov $bwaref $output.bam > $output.R1.qc.txt"
-$samtools view -Xq10 -f0x0040 $output.bam | $pileup2cov $bwaref > $output.R1.qc.txt
-echo `date` " [$$] - Done!"
-
-echo `date` " [$$] - $samtools view -Xq10 -f0x0080 $output.bam | $pileup2cov $bwaref $output.bam > $output.R2.qc.txt"
-$samtools view -Xq10 -f0x0080 $output.bam | $pileup2cov $bwaref > $output.R2.qc.txt
-echo `date` " [$$] - Done!"
-
-echo `date` " [$$] - $attachManifest $manifest $output.qc.txt $output.qc.targets.txt > $output.qc.ann.txt"
-$attachManifest $manifest $output.qc.txt $output.qc.targets.txt > $output.qc.ann.txt
-echo `date` " [$$] - Done!"
-
-echo `date` " [$$] - $attachManifest $manifest $output.R1.qc.txt $output.R1.qc.targets.txt > $output.R1.qc.ann.txt"
-$attachManifest $manifest $output.R1.qc.txt $output.R1.qc.targets.txt > $output.R1.qc.ann.txt
-echo `date` " [$$] - Done!"
-
-echo `date` " [$$] - $attachManifest $manifest $output.R2.qc.txt $output.R2.qc.targets.txt > $output.R2.qc.ann.txt"
-$attachManifest $manifest $output.R2.qc.txt $output.R2.qc.targets.txt > $output.R2.qc.ann.txt
-echo `date` " [$$] - Done!"
-
-echo `date` " [$$] - $plots $output.qc.ann.targets.txt $output.qc.targets.txt"
-$plots $output.qc.ann.txt $output.qc.targets.txt $output.pdf
-echo `date` " [$$] - Done!"
-fi
diff -r f1b32c09deaf -r a793ffd0cd04 falco/amplicon-QC.sh
--- a/falco/amplicon-QC.sh Fri Oct 10 11:28:57 2014 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,33 +0,0 @@
-#!/bin/bash
-
-bam=$1
-DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
-TMP=./tmp
-if [ ! -e $TMP ]
-then
- mkdir $TMP
-fi
-
-source $DIR/init.sh
-
-#targetBed=$DIR/ref/manifests/TSACP/TSACP.bed
-#samtools=$DIR/bin/samtools/samtools-0.1.18/samtools
-#alignmentStats=$DIR/misc/alignmentStats.pl
-#errorRate=$DIR/misc/errorRate.pl
-#plotQc=$DIR/misc/plotsQC.R
-
-#export PATH=$PATH:$DIR/bin/samtools/samtools-0.1.18/
-
-if [ ! -e alignmentStats.tsv ]
-then
-$alignmentStats $targetBed $bam > alignmentStats.tsv
-fi
-
-if [ ! -e errorStats.tsv ]
-then
-$errorRate *.qc.ann.txt > errorStats.tsv
-fi
-
-Rscript $plotQc alignmentStats.tsv errorStats.tsv
-
-
diff -r f1b32c09deaf -r a793ffd0cd04 falco/amplicon-Report.sh
--- a/falco/amplicon-Report.sh Fri Oct 10 11:28:57 2014 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,44 +0,0 @@
-#!/bin/bash
-
-DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
-
-source $DIR/init.sh
-
-
-base=$1
-
-vcf2tsv=$DIR/misc/vcf2tsv.pl
-spliteff=$DIR/misc/splitEff.pl
-addQc=$DIR/misc/addQc.pl
-filterR=$DIR/misc/filter.R
-filter=$DIR/misc/filter.pl
-plotPng=$DIR/misc/plotsPng.R
-
-# convert to tsv
-
-echo `date` " [$$] - converting vcf to tsv"
-$vcf2tsv $base.samtools.vcf > $base.samtools.tsv
-#$vcf2tsv $base.vs.snp.a.vcf > $base.vs.snp.a.tsv
-$vcf2tsv $base.qc.ann.filt.a.vcf > $base.qc.ann.filt.a.tsv
-$vcf2tsv $base.qc.ann.clinvar.a.vcf > $base.qc.ann.clinvar.a.tsv
-#$vcf2tsv $base.qc.ann.2.a.vcf > $base.qc.ann.2.a.tsv
-
-# split eff columns
-echo `date` " [$$] - splitting vcf columns"
-$spliteff $base.samtools.tsv Samtools > $base.res.tsv
-#$spliteff $base.vs.snp.a.tsv >> $base.res.tsv
-$spliteff $base.qc.ann.filt.a.tsv Custom >> $base.res.tsv
-$spliteff $base.qc.ann.clinvar.a.tsv Clinvar >> $base.res.tsv
-#$spliteff $base.qc.ann.2.a.tsv Custom2 >> $base.res.tsv
-
-# add count data
-#$addQc $base.res.tmp $base.qc.ann.txt > $base.res.tsv
-
-# Filter
-echo `date` " [$$] - filtering data"
-#Rscript $filterR $base.res.tsv $base.qc.ann.txt 10 100 .95 $base
-$filter $base.res.tsv $canonicals > $base.res.filtered.tsv
-
-# Plot
-echo `date` " [$$] - plotting data"
-Rscript $plotPng $base.qc.ann.qual.txt $base.qc2.ann.txt $base.qc.targets.txt $base.res.filtered.tsv $base.qc.ann.qual.clinvar.txt $locifilt $base
diff -r f1b32c09deaf -r a793ffd0cd04 falco/amplicon-VcAn-TSACP.sh
--- a/falco/amplicon-VcAn-TSACP.sh Fri Oct 10 11:28:57 2014 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,121 +0,0 @@
-#!/bin/bash
-echo `date` " [$$] - Starting now in amplicon-VcAn-TSACP.sh!"
-
-DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
-
-source $DIR/init.sh
-
-TMP=./tmp
-if [ ! -e $TMP ]
-then
- mkdir $TMP
-fi
-
-bam=$1
-output=$2
-
-if [ ! -e $output.vcf ]
-then
-# Use Extended BAQ!
-# Tweak indel settings -m and -F
-echo `date` " [$$] - $samtools mpileup -Euf $faref -L 1000000 -d 1000000 -m 150 -F 0.01 -o 20 $bam | bcftools view -bvcg - > $output.bcf"
-$samtools mpileup -Euf $faref -L 1000000 -d 1000000 -m 150 -F 0.01 -o 20 $bam | $bcftools view -bvcg - > $output.bcf
-echo `date` " [$$] - Done!"
-echo `date` " [$$] - bcftools view $output.bcf > $output.vcf"
-$bcftools view $output.bcf > $output.vcf
-echo `date` " [$$] - Done!"
-fi
-
-# Annotation
-#VcfIn=$output.vcf
-#VcfOut=$output.samtools.vcf
-
-#if [ ! -e $VcfOut ]
-#then
-# Samtools annotation
-#echo `date` " [$$] - $java -jar $snpEff/snpEff.jar eff hg19 -noStats -noLog -o vcf -c $snpEff/snpEff.config $VcfIn > tmp.snpeff.vcf"
-#$java -jar $snpEff/snpEff.jar eff hg19 -noStats -noLog -o vcf -c $snpEff/snpEff.config $VcfIn > $output.snpeff.vcf
-#echo `date` " [$$] - Done!"
-
-#echo `date` " [$$] - $java -jar $snpSift/SnpSift_latest.jar annotate -v $dbSnp $output.snpeff.vcf > $output.snpeff.2.vcf"
-#$java -jar $snpSift/SnpSift_latest.jar annotate -v $dbSnp $output.snpeff.vcf > $output.snpeff.2.vcf
-#mv $output.snpeff.2.vcf $VcfOut
-#echo `date` " [$$] - Done!"
-
-#echo `date` " [$$] - $java -jar /opt/SnpSift/SnpSift_latest.jar dbnsfp -v $dbNSFP $output.snpeff.2.vcf > $output.snpeff.3.vcf"
-#$java -jar /opt/SnpSift/SnpSift_latest.jar dbnsfp -v $dbNSFP $output.snpeff.2.vcf > $output.snpeff.3.vcf
-#mv $output.snpeff.3.vcf $VcfOut
-#echo `date` " [$$] - Done!"
-#fi
-
-if [ ! -e $output.qc.ann.txt ]
-then
-echo `date` " [$$] - $pileup2cov2 $bam $faref $manifest $output $samtools"
-$pileup2cov2 $bam $faref $manifest $output $samtools
-echo `date` " [$$] - Done!"
-fi
-
-if [ ! -e $output.qc.ann.qual.txt ]
-then
-echo `date` " [$$] - RScript $addQual $output.qc.ann.txt $output.qc.ann.qual.txt $DIR/misc/func.R $locifilt"
-Rscript $addQual $output.qc.ann.txt $output.qc.ann.qual.txt $DIR/misc/func.R $locifilt
-echo `date` " [$$] - Done!"
-fi
-
-echo `date` " [$$] - $qcFilt $output.qc.ann.qual.txt $clinvar $cosmic $cosmicNC $output"
-$qcFilt $output.qc.ann.qual.txt $clinvar $cosmic $cosmicNC $output
-echo `date` " [$$] - Done!"
-
-echo `date` " [$$] - $qc2vcf $output.qc.ann.qual.filt.txt > $output.qc.ann.filt.vcf"
-$qc2vcf $output.qc.ann.qual.filt.txt > $output.qc.ann.filt.vcf
-echo `date` " [$$] - Done!"
-
-echo `date` " [$$] - $qc2vcf $output.qc.ann.qual.clinvar.txt > $output.qc.ann.clinvar.vcf"
-$qc2vcf $output.qc.ann.qual.clinvar.txt > $output.qc.ann.clinvar.vcf
-echo `date` " [$$] - Done!"
-
-
-
-#VcfIn=$output.qc.ann.filt.vcf
-#VcfOut=$output.qc.ann.filt.a.vcf
-
-#if [ ! -e $VcfOut ]
-#then
-
-# qc vcf annotation
-#echo `date` " [$$] - $java -jar $snpEff/snpEff.jar eff hg19 -noStats -noLog -o vcf -c $snpEff/snpEff.config $VcfIn > tmp.snpeff.vcf"
-#$java -jar $snpEff/snpEff.jar eff hg19 -noStats -noLog -o vcf -c $snpEff/snpEff.config $VcfIn > $output.snpeff.vcf
-#echo `date` " [$$] - Done!"
-
-#echo `date` " [$$] - $java -jar $snpSift/SnpSift_latest.jar annotate -v $dbSnp $output.snpeff.vcf > $output.snpeff.2.vcf"
-#$java -jar $snpSift/SnpSift_latest.jar annotate -v $dbSnp $output.snpeff.vcf > $output.snpeff.2.vcf
-#mv $output.snpeff.2.vcf $VcfOut
-#echo `date` " [$$] - Done!"
-
-#echo `date` " [$$] - $java -jar /opt/SnpSift/SnpSift_latest.jar dbnsfp -v $dbNSFP $output.snpeff.2.vcf > $output.snpeff.3.vcf"
-#$java -jar /opt/SnpSift/SnpSift_latest.jar dbnsfp -v $dbNSFP $output.snpeff.2.vcf > $output.snpeff.3.vcf
-#mv $output.snpeff.3.vcf $VcfOut
-#echo `date` " [$$] - Done!"
-#fi
-
-#VcfIn=$output.qc.ann.clinvar.vcf
-# VcfOut=$output.qc.ann.clinvar.a.vcf
-
-#if [ ! -e $VcfOut ]
-#then
-
-# qc vcf annotation
-#echo `date` " [$$] - $java -jar $snpEff/snpEff.jar eff hg19 -noStats -noLog -o vcf -c $snpEff/snpEff.config $VcfIn > tmp.snpeff.vcf"
-#$java -jar $snpEff/snpEff.jar eff hg19 -noStats -noLog -o vcf -c $snpEff/snpEff.config $VcfIn > $output.snpeff.vcf
-#echo `date` " [$$] - Done!"
-
-#echo `date` " [$$] - $java -jar $snpSift/SnpSift_latest.jar annotate -v $dbSnp $output.snpeff.vcf > $output.snpeff.2.vcf"
-#$java -jar $snpSift/SnpSift_latest.jar annotate -v $dbSnp $output.snpeff.vcf > $output.snpeff.2.vcf
-#mv $output.snpeff.2.vcf $VcfOut
-#echo `date` " [$$] - Done!"
-
-#echo `date` " [$$] - $java -jar /opt/SnpSift/SnpSift_latest.jar dbnsfp -v $dbNSFP $output.snpeff.2.vcf > $output.snpeff.3.vcf"
-#$java -jar /opt/SnpSift/SnpSift_latest.jar dbnsfp -v $dbNSFP $output.snpeff.2.vcf > $output.snpeff.3.vcf
-#mv $output.snpeff.3.vcf $VcfOut
-#echo `date` " [$$] - Done!"
-#fi
diff -r f1b32c09deaf -r a793ffd0cd04 falco/bin/falco
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/falco/bin/falco Tue Dec 23 11:26:33 2014 -0500
@@ -0,0 +1,69 @@
+#!/usr/bin/perl
+
+use strict;
+use Cwd 'abs_path';
+use Getopt::Long;
+use File::Basename;
+
+my $absPath = abs_path($0);
+my $dn = dirname($absPath);
+
+# Set options
+
+my $bam;
+my $ref = "$dn/../ref/hg19/hg19.fa";
+my $locifilt = "$dn/../ref/filters/filter.tsv";
+my $manifest = "$dn/../ref/TSACP/TruSeq_Amplicon_Cancer_Panel_Manifest_AFP1_PN15032433.txt";
+my $base = $bam;
+
+GetOptions (
+ "bam=s" => \$bam,
+ "ref=s" => \$ref,
+ "filter=s" => \$locifilt,
+ "manifest=s" => \$manifest,
+ "output=s" => \$base,
+ )
+or die("Error in command line arguments\n");
+
+# Set paths to external script
+# TODO make modules
+
+my $perAmpliconAnalysis = "$dn/../lib/perl/perAmpliconAnalysis.pl";
+my $addQual = "$dn/../lib/R/addQual.R";
+my $func = "$dn/../lib/R/func.R";
+my $qcFilt = "$dn/../lib/perl/qcFilt.pl";
+my $qc2vcf = "$dn/../lib/perl/qc2vcf.pl";
+
+# Check Rscript
+my $rscript = `which Rscript`;
+chomp $rscript;
+if ($rscript !~ /Rscript$/) {
+ print STDERR "No Rscript present in PATH\n";
+ exit 1;
+}
+
+# Check samtools
+my $samtools = `which samtools`;
+chomp $samtools;
+if ($samtools !~ /samtools$/) {
+ print STDERR "No samtools present in PATH\n";
+ exit 1;
+}
+
+# perl perAmpliconAnalysis
+# R addQual.R
+# perl qcFilt
+# perl qc2vcf
+
+print STDOUT localtime() . " [$$] $perAmpliconAnalysis $bam $ref $manifest $base $samtools\n";
+system("$perAmpliconAnalysis $bam $ref $manifest $base $samtools");
+
+print STDOUT localtime() . " [$$] $rscript $addQual $base.qc.ann.txt $base.qc.ann.qual.txt $func $locifilt\n";
+system("$rscript $addQual $base.qc.ann.txt $base.qc.ann.qual.txt $func $locifilt");
+
+print STDOUT localtime() . " [$$] $qcFilt $base.qc.ann.qual.txt $base\n";
+system("$qcFilt $base.qc.ann.qual.txt $base");
+
+print STDOUT localtime() . " [$$] $qc2vcf $base.qc.ann.qual.filt.txt > $base.falco.vcf\n";
+system("$qc2vcf $base.qc.ann.qual.filt.txt > $base.falco.vcf");
+
diff -r f1b32c09deaf -r a793ffd0cd04 falco/bin/fastq_quality_trimmer
Binary file falco/bin/fastq_quality_trimmer has changed
diff -r f1b32c09deaf -r a793ffd0cd04 falco/data/HCT-116/HCT116.bam
Binary file falco/data/HCT-116/HCT116.bam has changed
diff -r f1b32c09deaf -r a793ffd0cd04 falco/data/HCT-116/HCT116.bam.bai
Binary file falco/data/HCT-116/HCT116.bam.bai has changed
diff -r f1b32c09deaf -r a793ffd0cd04 falco/init.sh
--- a/falco/init.sh Fri Oct 10 11:28:57 2014 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,57 +0,0 @@
-#!/bin/bash
-
-DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
-
-index=/ccagc/data/ref/hg19/stampy/hg19
-hash=/ccagc/data/ref/hg19/stampy/hg19
-faref=/ccagc/data/ref/iGenomes/Homo_sapiens/UCSC/hg19/Sequence/WholeGenomeFasta/genome.fa
-bwaref=/ccagc/data/ref/iGenomes/Homo_sapiens/UCSC/hg19/Sequence/BWAIndex/version0.5.x/genome.fa
-#manifest=/ccagc/data/ref/manifests/TSACP/TruSeq_Amplicon_Cancer_Panel_Manifest_AFP1_PN15032433.txt
-#targetBed=/ccagc/data/ref/manifests/TSACP/TSACP.bed
-
-dbSnp=/ccagc/data/ref/dbSNP/common_no_known_medical_impact_00-latest.f.vcf
-dbNSFP=/ccagc/data/ref/dbNSFP/dbNSFP2.0b3.txt
-clinvar=/ccagc/data/ref/dbSNP/clinvar_00-latest.f.vcf
-cosmic=/ccagc/data/ref/cosmic/CosmicCodingMuts_v64_26032013_noLimit_wgs.f.vcf
-cosmicNC=/ccagc/data/ref/cosmic/CosmicNonCodingVariants_v64_26032013_noLimit_wgs.f.vcf
-locifilt=/ccagc/data/ref/dbSNP/locifilt.tsv
-canonicals=/ccagc/data/ref/manifests/TSACP/canonicals.tsv
-
-illmP5="AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC"
-illmP7="AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT"
-
-adapt_clipper=/ccagc/lib/cutadapt/cutadapt-1.1/bin/cutadapt
-qTrim=/ccagc/bin/fastq_quality_trimmer
-stampy=/ccagc/lib/stampy/stampy-1.0.18/stampy.py
-#samtools=/ccagc/lib/samtools/samtools-0.1.18/samtools
-#bcftools=/ccagc/lib/samtools/samtools-0.1.18/bcftools/bcftools
-snpEff=/ccagc/lib/snpEff/snpEff_3_0a
-snpEff=/ccagc/lib/snpEff/snpEff_v3_4
-snpSift=/ccagc/lib/SnpSift
-
-varscan=/ccagc/lib/VarScan/VarScan.v2.2.11.jar
-java=/usr/bin/java
-bwa=/ccagc/lib/bwa/bwa-0.5.9/bwa
-flash=/ccagc/lib/flash/FLASH-1.2.2/flash
-
-pairUp=$DIR/misc/pairUp.pl
-pileup2cov=$DIR/misc/pileup2cov.pl
-pileup2cov2=$DIR/misc/perAmpliconAnalysis.pl
-addQual=$DIR/misc/addQual.R
-
-qc2vcf=$DIR/misc/qc2vcf.pl
-qcFilt=$DIR/misc/qcFilt.pl
-plots=$DIR/misc/plots.R
-plotsPng=$DIR/misc/plotsPng.R
-
-attachManifest=$DIR/misc/attachManifest.pl
-plots=$DIR/misc/plots.R
-VcAn=$DIR/amplicon-VcAn-TSACP.sh
-report=$DIR/amplicon-Report.sh
-
-alignmentStats=$DIR/misc/alignmentStats.pl
-errorRate=$DIR/misc/errorRate.pl
-plotQc=$DIR/misc/plotsQC.R
-
-#export PATH=$PATH:/ccagc/lib/samtools/samtools-0.1.18/
-source ./galaxy_init.sh
diff -r f1b32c09deaf -r a793ffd0cd04 falco/lib/R/addQual.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/falco/lib/R/addQual.R Tue Dec 23 11:26:33 2014 -0500
@@ -0,0 +1,21 @@
+#!/usr/bin/Rscript
+args <- commandArgs(T)
+qcFile <- args[1]
+output <- args[2]
+funcR <- args[3]
+lociFiltFile <- args[4]
+
+read.delim(lociFiltFile, header=F) -> lociFiltD
+lociFilt <- paste(paste("chr", lociFiltD[,1], sep=""), lociFiltD[,2], sep=":")
+
+source(funcR)
+read.delim(qcFile, header=T) -> qc
+
+#addQual(qc) -> qc.qual
+addQualTransCor(qc, lociFilt) -> qc.qual
+#addQualVarCor(qc) -> qc.qual
+
+write.table(qc.qual, output, row.names=F, col.names=T, quote=F, sep="\t")
+#write.table(qc[selection,], output, row.names=F, col.names=T, quote=F, sep="\t")
+
+
diff -r f1b32c09deaf -r a793ffd0cd04 falco/lib/R/func.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/falco/lib/R/func.R Tue Dec 23 11:26:33 2014 -0500
@@ -0,0 +1,575 @@
+#!/usr/bin/Rscript
+#args <- commandArgs(T)
+#qcFile <- args[1]
+#output <- args[2]
+
+#read.delim(qcFile, header=T) -> qc
+
+addQual <- function(qc) {
+
+qc$del <- as.character(as.vector(qc$del))
+qc$ins <- as.vector(qc$ins)
+qc$pVal <- 1
+qc$pValAdj <- 1
+#qc$pValPois <- 1
+qc$qScore <- 0
+qc$qScoreI <- 0
+qc$qScoreD <- 0
+
+pVal.I <- rep(1, nrow(qc))
+pVal.D <- rep(1, nrow(qc))
+
+delSel <- qc$del != "."
+data.frame(matrix(unlist(strsplit(unlist(strsplit(as.vector(qc$del[delSel]), "\\|")), ":")), ncol=2, byrow=T)) -> dels
+dels$ln <- nchar(as.vector(dels$X1))
+colnames(dels) <- c("seq", "occ", "ln")
+delPees <- ppois(1:50, mean(rep(dels$ln, dels$occ)), lower.tail=F)
+
+insSel <- qc$ins != "."
+data.frame(matrix(unlist(strsplit(unlist(strsplit(as.vector(qc$ins[insSel]), "\\|")), ":")), ncol=2, byrow=T)) -> inss
+inss$ln <- nchar(as.vector(inss$X1))
+colnames(inss) <- c("seq", "occ", "ln")
+insPees <- ppois(1:50, mean(rep(dels$ln, dels$occ)), lower.tail=F)
+
+# Calculate errorrate per amplicon
+amps <- unique(qc$amp)
+for (i in amps) {
+ #print(i)
+ sel <- qc$amp == i
+ prob <- sum(qc$nVar[sel]) / (sum(qc$nRef[sel]) + sum(qc$nVar[sel]))
+ #prob <- median(qc$nVar[sel] / (qc$nRef[sel] + qc$nVar[sel]))
+ pees <- pbinom(qc$nVar[sel], qc$nRef[sel] + qc$nVar[sel], prob, lower.tail=F)
+ qc$pVal[sel] <- pees
+# pois <- ppois(qc$nVar[sel], (qc$nRef + qc$nVar) * prob, lower.tail=F)
+# qc$pValPois[sel] <- pois
+
+ # Seperate for indel/deletion events
+ selDels <- delSel & sel
+ if (sum(selDels) > 0) {
+ data.frame(matrix(unlist(strsplit(unlist(strsplit(as.vector(qc$del[selDels]), "\\|")), ":")), ncol=2, byrow=T)) -> dels2
+ dels2$ln <- nchar(as.vector(dels2$X1))
+ colnames(dels2) <- c("seq", "occ", "ln")
+ dels2$occ <- as.numeric(as.vector(dels2$occ))
+ }
+ for (j in which(selDels)) {
+ data.frame(matrix(unlist(strsplit(unlist(strsplit(as.vector(qc$del[j]), "\\|")), ":")), ncol=2, byrow=T)) -> dels
+ dels$ln <- nchar(as.vector(dels$X1))
+ colnames(dels) <- c("seq", "occ", "ln")
+ dels$occ <- as.numeric(as.vector(dels$occ))
+ #dels$pDel <- delPees[as.integer(dels$ln)]
+ #dels$pVal <- ppois(as.numeric(dels$occ), dels$pDel, lower.tail=F)
+ dels$pDel <- sapply(dels$ln, function(x) {sum(dels2$occ[dels2$ln == x]) / (sum(qc$nRef[sel]) + sum(qc$nVar[sel])) })
+ dels$pVal <- pbinom(dels$occ, qc$dp[j], dels$pDel, lower.tail=F)
+ dels$Q <- -10 * log10(dels$pVal)
+
+ high <- order(dels$occ, -dels$Q, decreasing=T)[1]
+ qc$del[j] <- paste(dels$seq[high], dels$occ[high], sep=":")
+ qc$qScoreD[j] <- dels$Q[high]
+ pVal.D[j] <- dels$pVal[high]
+ }
+ selIns <- insSel & sel
+ if (sum(selIns) > 0) {
+ data.frame(matrix(unlist(strsplit(unlist(strsplit(as.vector(qc$ins[selIns]), "\\|")), ":")), ncol=2, byrow=T)) -> inss2
+ inss2$ln <- nchar(as.vector(inss2$X1))
+ colnames(inss2) <- c("seq", "occ", "ln")
+ inss2$occ <- as.numeric(as.vector(inss2$occ))
+ }
+ for (j in which(selIns)) {
+ data.frame(matrix(unlist(strsplit(unlist(strsplit(as.vector(qc$ins[j]), "\\|")), ":")), ncol=2, byrow=T)) -> inss
+ inss$ln <- nchar(as.vector(inss$X1))
+ colnames(inss) <- c("seq", "occ", "ln")
+ inss$occ <- as.numeric(as.vector(inss$occ))
+ #inss$pIns <- insPees[as.integer(inss$ln)]
+ #inss$pVal <- ppois(as.numeric(inss$occ), inss$pIns, lower.tail=F)
+ inss$pIns <- sapply(inss$ln, function(x) {sum(inss2$occ[inss2$ln == x]) / (sum(qc$nRef[sel]) + sum(qc$nVar[sel])) })
+ inss$pVal <- pbinom(inss$occ, qc$dp[j], inss$pIns, lower.tail=F)
+ inss$Q <- -10 * log10(inss$pVal)
+
+ high <- order(inss$occ, -inss$Q, decreasing=T)[1]
+ qc$ins[j] <- paste(inss$seq[high], inss$occ[high], sep=":")
+ qc$qScoreI[j] <- inss$Q[high]
+ pVal.I[j] <- inss$pVal[high]
+ }
+ #print(sum(-10 * log10(pees) > 100))
+ #print(sum(qc$qScoreI[selIns] > 100))
+ #print(sum(qc$qScoreD[selDels] > 100))
+}
+
+p.adjust(qc$pVal) -> qc$pValAdj
+-10 * log(p.adjust(pVal.I)) -> qc$qScoreI
+-10 * log(p.adjust(pVal.D)) -> qc$qScoreD
+qc$qScore <- -10 * log10(qc$pValAdj)
+#qc$qScore[qc$pValAdj == 0] <- 1000
+qc$qScore[qc$qScore > 1000] <- 1000
+qc$qScoreI[qc$qScoreI > 1000] <- 1000
+qc$qScoreD[qc$qScoreD > 1000] <- 1000
+qsel <- (qc$qScore >= 100 & qc$nVar >= 10) | (qc$qScoreI >= 200) | (qc$qScoreD >= 200)
+dpfilt <- as.numeric(qc$dp) >= 100
+#varfilt <- as.numeric(qc$nVar) >= 10
+
+selection <- qsel & dpfilt
+
+print(paste("Filtered out: ", sum(!selection), sep=""))
+print(paste("Variants selected: ", sum(selection), sep=""))
+return(qc)
+}
+
+addQualTransCor <- function(qc, lociFilt) {
+
+
+# lociFilt contains snp, cosmic and clinvar positions
+
+qc$del <- as.character(as.vector(qc$del))
+#qc$del <- as.character(as.vector(qc$del))
+qc$ins <- as.character(as.vector(qc$ins))
+qc$pVal <- 1
+qc$pValAdj <- 1
+#qc$pValPois <- 1
+qc$qScore <- 0
+qc$qScoreI <- 0
+qc$qScoreD <- 0
+
+pVal.I <- rep(1, nrow(qc))
+pVal.D <- rep(1, nrow(qc))
+
+rle(as.vector(qc$ntRef)) -> srle
+hompols <- srle$lengths >= 5
+homPosS <- cumsum(srle$lengths)[hompols]
+homPosE <- cumsum(srle$lengths)[which(hompols) - 1]
+homFilt <- rep(F, nrow(qc))
+homFilt[homPosS] <- T
+homFilt[homPosE] <- T
+
+delSel <- qc$del != "." & !homFilt
+if (sum(delSel > 0)) {
+
+
+data.frame(matrix(unlist(strsplit(unlist(strsplit(qc$del[delSel], "\\|")), ":")), ncol=2, byrow=T)) -> dels
+dels$ln <- nchar(as.vector(dels$X1))
+colnames(dels) <- c("seq", "occ", "ln")
+delPees <- ppois(1:50, mean(rep(dels$ln, dels$occ)), lower.tail=F)
+}
+
+insSel <- qc$ins != "." & !homFilt
+if (sum(insSel) > 0) {
+data.frame(matrix(unlist(strsplit(unlist(strsplit(qc$ins[insSel], "\\|")), ":")), ncol=2, byrow=T)) -> inss
+inss$ln <- nchar(as.vector(inss$X1))
+colnames(inss) <- c("seq", "occ", "ln")
+insPees <- ppois(1:50, mean(rep(inss$ln, inss$occ)), lower.tail=F)
+}
+# Calculate transition bias
+
+
+#calcTransCor(qc) -> pCor
+globalProb <- sum(qc$nVar) / (sum(qc$nRef) + sum(qc$nVar))
+# Calculate errorrate per amplicon
+amps <- unique(qc$amp)
+for (i in amps) {
+ #print(i)
+ sel <- qc$amp == i
+ prob1 <- sum(qc$nVar[sel]) / (sum(qc$nRef[sel]) + sum(qc$nVar[sel]))
+ prob <- prob1 * calcTransCor(qc[sel,], lociFilt)
+ pees <- pbinom(qc$nVar[sel], qc$nRef[sel] + qc$nVar[sel], prob , lower.tail=F)
+ qc$pVal[sel] <- pees
+# pois <- ppois(qc$nVar[sel], (qc$nRef + qc$nVar) * prob, lower.tail=F)
+# qc$pValPois[sel] <- pois
+
+ # Seperate for indel/deletion events
+ selDels <- delSel & sel
+ if (sum(selDels) > 0) {
+ data.frame(matrix(unlist(strsplit(unlist(strsplit(as.vector(qc$del[selDels]), "\\|")), ":")), ncol=2, byrow=T)) -> dels2
+ dels2$ln <- nchar(as.vector(dels2$X1))
+ colnames(dels2) <- c("seq", "occ", "ln")
+ dels2$occ <- as.numeric(as.vector(dels2$occ))
+
+ for (j in which(selDels)) {
+ data.frame(matrix(unlist(strsplit(unlist(strsplit(as.vector(qc$del[j]), "\\|")), ":")), ncol=2, byrow=T)) -> dels
+ dels$ln <- nchar(as.vector(dels$X1))
+ colnames(dels) <- c("seq", "occ", "ln")
+ dels$occ <- as.numeric(as.vector(dels$occ))
+ #dels$pDel <- delPees[as.integer(dels$ln)]
+ #dels$pVal <- ppois(as.numeric(dels$occ), dels$pDel, lower.tail=F)
+ dels$pDel <- sapply(dels$ln, function(x) {sum(dels2$occ[dels2$ln == x]) / (sum(qc$nRef[sel]) + sum(qc$nVar[sel])) })
+ dels$pVal <- pbinom(dels$occ, qc$dp[j], dels$pDel, lower.tail=F)
+ dels$Q <- -10 * log10(dels$pVal)
+
+ high <- order(dels$occ, -dels$Q, decreasing=T)[1]
+ qc$del[j] <- paste(dels$seq[high], dels$occ[high], sep=":")
+ qc$qScoreD[j] <- dels$Q[high]
+ pVal.D[j] <- dels$pVal[high]
+ }
+ }
+ selIns <- insSel & sel
+ if (sum(selIns) > 0) {
+ data.frame(matrix(unlist(strsplit(unlist(strsplit(as.vector(qc$ins[selIns]), "\\|")), ":")), ncol=2, byrow=T)) -> inss2
+ inss2$ln <- nchar(as.vector(inss2$X1))
+ colnames(inss2) <- c("seq", "occ", "ln")
+ inss2$occ <- as.numeric(as.vector(inss2$occ))
+
+ for (j in which(selIns)) {
+ data.frame(matrix(unlist(strsplit(unlist(strsplit(as.vector(qc$ins[j]), "\\|")), ":")), ncol=2, byrow=T)) -> inss
+ inss$ln <- nchar(as.vector(inss$X1))
+ colnames(inss) <- c("seq", "occ", "ln")
+ inss$occ <- as.numeric(as.vector(inss$occ))
+ #inss$pIns <- insPees[as.integer(inss$ln)]
+ #inss$pVal <- ppois(as.numeric(inss$occ), inss$pIns, lower.tail=F)
+ inss$pIns <- sapply(inss$ln, function(x) {sum(inss2$occ[inss2$ln == x]) / (sum(qc$nRef[sel]) + sum(qc$nVar[sel])) })
+ inss$pVal <- pbinom(inss$occ, qc$dp[j], inss$pIns, lower.tail=F)
+ inss$Q <- -10 * log10(inss$pVal)
+
+ high <- order(inss$occ, -inss$Q, decreasing=T)[1]
+ qc$ins[j] <- paste(inss$seq[high], inss$occ[high], sep=":")
+ qc$qScoreI[j] <- inss$Q[high]
+ pVal.I[j] <- inss$pVal[high]
+} }
+ #print(sum(-10 * log10(pees) > 100))
+ #print(sum(qc$qScoreI[selIns] > 100))
+ #print(sum(qc$qScoreD[selDels] > 100))
+}
+
+p.adjust(qc$pVal) -> qc$pValAdj
+-10 * log(p.adjust(pVal.I)) -> qc$qScoreI
+-10 * log(p.adjust(pVal.D)) -> qc$qScoreD
+qc$qScore <- -10 * log10(qc$pValAdj)
+#qc$qScore[qc$pValAdj == 0] <- 1000
+qc$qScore[qc$qScore > 1000] <- 1000
+qc$qScoreI[qc$qScoreI > 1000] <- 1000
+qc$qScoreD[qc$qScoreD > 1000] <- 1000
+qsel <- (qc$qScore >= 100 & qc$nVar >= 10) | (qc$qScoreI >= 200) | (qc$qScoreD >= 200)
+dpfilt <- as.numeric(qc$dp) >= 100
+#varfilt <- as.numeric(qc$nVar) >= 10
+
+selection <- qsel & dpfilt
+
+print(paste("Filtered out: ", sum(!selection), sep=""))
+print(paste("Variants selected: ", sum(selection), sep=""))
+return(qc)
+}
+
+
+addQualVarCor <- function(qc) {
+# Corrected for variant occurence
+
+qc$del <- as.character(as.vector(qc$del))
+qc$ins <- as.vector(qc$ins)
+qc$pVal <- 1
+qc$pValAdj <- 1
+#qc$pValPois <- 1
+qc$qScore <- 0
+qc$qScoreI <- 0
+qc$qScoreD <- 0
+
+pVal.I <- rep(1, nrow(qc))
+pVal.D <- rep(1, nrow(qc))
+
+delSel <- qc$del != "."
+data.frame(matrix(unlist(strsplit(unlist(strsplit(as.vector(qc$del[delSel]), "\\|")), ":")), ncol=2, byrow=T)) -> dels
+dels$ln <- nchar(as.vector(dels$X1))
+colnames(dels) <- c("seq", "occ", "ln")
+delPees <- ppois(1:50, mean(rep(dels$ln, dels$occ)), lower.tail=F)
+
+insSel <- qc$ins != "."
+data.frame(matrix(unlist(strsplit(unlist(strsplit(as.vector(qc$ins[insSel]), "\\|")), ":")), ncol=2, byrow=T)) -> inss
+inss$ln <- nchar(as.vector(inss$X1))
+colnames(inss) <- c("seq", "occ", "ln")
+insPees <- ppois(1:50, mean(rep(dels$ln, dels$occ)), lower.tail=F)
+
+# Calculate base bias per run
+sum(qc$nA[grepl("A", qc$ntRef, ignore.case=T)]) -> refA
+sum(qc$nA[!grepl("A", qc$ntRef, ignore.case=T)]) -> varA
+sum(qc$nC[grepl("C", qc$ntRef, ignore.case=T)]) -> refC
+sum(qc$nC[!grepl("C", qc$ntRef, ignore.case=T)]) ->varC
+sum(qc$nG[grepl("G", qc$ntRef, ignore.case=T)]) -> refG
+sum(qc$nG[!grepl("G", qc$ntRef, ignore.case=T)]) -> varG
+sum(qc$nT[grepl("T", qc$ntRef, ignore.case=T)]) -> refT
+sum(qc$nT[!grepl("T", qc$ntRef, ignore.case=T)]) -> varT
+
+varA / (refA + varA) -> errA
+varC / (refC + varC) -> errC
+varG / (refG + varG) -> errG
+varT / (refT + varT) -> errT
+c(errA, errC, errG, errT) / mean(c(errA, errC, errG, errT)) -> errACGT
+
+errMat <- qc[,9:12]
+errMat[grepl("A", qc$ntRef, ignore.case=T),1] <- 0
+errMat[grepl("C", qc$ntRef, ignore.case=T),2] <- 0
+errMat[grepl("G", qc$ntRef, ignore.case=T),3] <- 0
+errMat[grepl("T", qc$ntRef, ignore.case=T),4] <- 0
+
+pCor <- rep(1, nrow(qc))
+pCor[grepl("^A", qc$ntVar, ignore.case=T)] <- errACGT[1]
+pCor[grepl("^C", qc$ntVar, ignore.case=T)] <- errACGT[2]
+pCor[grepl("^G", qc$ntVar, ignore.case=T)] <- errACGT[3]
+pCor[grepl("^T", qc$ntVar, ignore.case=T)] <- errACGT[4]
+
+# Calculate errorrate per amplicon
+amps <- unique(qc$amp)
+for (i in amps) {
+ #print(i)
+ sel <- qc$amp == i
+ prob <- sum(qc$nVar[sel]) / (sum(qc$nRef[sel]) + sum(qc$nVar[sel]))
+ #prob <- median(qc$nVar[sel] / (qc$nRef[sel] + qc$nVar[sel]))
+ pees <- pbinom(qc$nVar[sel], qc$nRef[sel] + qc$nVar[sel], prob * pCor[sel], lower.tail=F)
+ qc$pVal[sel] <- pees
+# pois <- ppois(qc$nVar[sel], (qc$nRef + qc$nVar) * prob, lower.tail=F)
+# qc$pValPois[sel] <- pois
+
+ # Seperate for indel/deletion events
+ selDels <- delSel & sel
+ if (sum(selDels) > 0) {
+ data.frame(matrix(unlist(strsplit(unlist(strsplit(as.vector(qc$del[selDels]), "\\|")), ":")), ncol=2, byrow=T)) -> dels2
+ dels2$ln <- nchar(as.vector(dels2$X1))
+ colnames(dels2) <- c("seq", "occ", "ln")
+ dels2$occ <- as.numeric(as.vector(dels2$occ))
+ }
+ for (j in which(selDels)) {
+ data.frame(matrix(unlist(strsplit(unlist(strsplit(as.vector(qc$del[j]), "\\|")), ":")), ncol=2, byrow=T)) -> dels
+ dels$ln <- nchar(as.vector(dels$X1))
+ colnames(dels) <- c("seq", "occ", "ln")
+ dels$occ <- as.numeric(as.vector(dels$occ))
+ #dels$pDel <- delPees[as.integer(dels$ln)]
+ #dels$pVal <- ppois(as.numeric(dels$occ), dels$pDel, lower.tail=F)
+ dels$pDel <- sapply(dels$ln, function(x) {sum(dels2$occ[dels2$ln == x]) / (sum(qc$nRef[sel]) + sum(qc$nVar[sel])) })
+ dels$pVal <- pbinom(dels$occ, qc$dp[j], dels$pDel, lower.tail=F)
+ dels$Q <- -10 * log10(dels$pVal)
+
+ high <- order(dels$occ, -dels$Q, decreasing=T)[1]
+ qc$del[j] <- paste(dels$seq[high], dels$occ[high], sep=":")
+ qc$qScoreD[j] <- dels$Q[high]
+ pVal.D[j] <- dels$pVal[high]
+ }
+ selIns <- insSel & sel
+ if (sum(selIns) > 0) {
+ data.frame(matrix(unlist(strsplit(unlist(strsplit(as.vector(qc$ins[selIns]), "\\|")), ":")), ncol=2, byrow=T)) -> inss2
+ inss2$ln <- nchar(as.vector(inss2$X1))
+ colnames(inss2) <- c("seq", "occ", "ln")
+ inss2$occ <- as.numeric(as.vector(inss2$occ))
+ }
+ for (j in which(selIns)) {
+ data.frame(matrix(unlist(strsplit(unlist(strsplit(as.vector(qc$ins[j]), "\\|")), ":")), ncol=2, byrow=T)) -> inss
+ inss$ln <- nchar(as.vector(inss$X1))
+ colnames(inss) <- c("seq", "occ", "ln")
+ inss$occ <- as.numeric(as.vector(inss$occ))
+ #inss$pIns <- insPees[as.integer(inss$ln)]
+ #inss$pVal <- ppois(as.numeric(inss$occ), inss$pIns, lower.tail=F)
+ inss$pIns <- sapply(inss$ln, function(x) {sum(inss2$occ[inss2$ln == x]) / (sum(qc$nRef[sel]) + sum(qc$nVar[sel])) })
+ inss$pVal <- pbinom(inss$occ, qc$dp[j], inss$pIns, lower.tail=F)
+ inss$Q <- -10 * log10(inss$pVal)
+
+ high <- order(inss$occ, -inss$Q, decreasing=T)[1]
+ qc$ins[j] <- paste(inss$seq[high], inss$occ[high], sep=":")
+ qc$qScoreI[j] <- inss$Q[high]
+ pVal.I[j] <- inss$pVal[high]
+ }
+ #print(sum(-10 * log10(pees) > 100))
+ #print(sum(qc$qScoreI[selIns] > 100))
+ #print(sum(qc$qScoreD[selDels] > 100))
+}
+
+p.adjust(qc$pVal) -> qc$pValAdj
+-10 * log(p.adjust(pVal.I)) -> qc$qScoreI
+-10 * log(p.adjust(pVal.D)) -> qc$qScoreD
+qc$qScore <- -10 * log10(qc$pValAdj)
+#qc$qScore[qc$pValAdj == 0] <- 1000
+qc$qScore[qc$qScore > 1000] <- 1000
+qc$qScoreI[qc$qScoreI > 1000] <- 1000
+qc$qScoreD[qc$qScoreD > 1000] <- 1000
+qsel <- (qc$qScore >= 100 & qc$nVar >= 10) | (qc$qScoreI >= 200) | (qc$qScoreD >= 200)
+dpfilt <- as.numeric(qc$dp) >= 100
+#varfilt <- as.numeric(qc$nVar) >= 10
+
+selection <- qsel & dpfilt
+
+print(paste("Filtered out: ", sum(!selection), sep=""))
+print(paste("Variants selected: ", sum(selection), sep=""))
+return(qc)
+}
+
+#addQual(qc) -> qc
+#write.table(qc, output, row.names=F, col.names=T, quote=F, sep="\t")
+#write.table(qc[selection,], output, row.names=F, col.names=T, quote=F, sep="\t")
+
+calcTrans <- function(qcRaw, lociFilt = NULL) {
+
+sel <- rep(T, nrow(qcRaw))
+if (!is.null(lociFilt)) {
+paste(qcRaw$X.chr, qcRaw$pos, sep=":") -> qcpos
+sel <- ! qcpos %in% lociFilt
+}
+
+qc <- qcRaw[sel,]
+
+# Calculate transition bias
+refA <- grepl("A", qc$ntRef, ignore.case=T)
+refC <- grepl("C", qc$ntRef, ignore.case=T)
+refG <- grepl("G", qc$ntRef, ignore.case=T)
+refT <- grepl("T", qc$ntRef, ignore.case=T)
+varA <- grepl("A", qc$ntVar, ignore.case=T)
+varC <- grepl("C", qc$ntVar, ignore.case=T)
+varG <- grepl("G", qc$ntVar, ignore.case=T)
+varT <- grepl("T", qc$ntVar, ignore.case=T)
+varN <- grepl(".", qc$ntVar, ignore.case=T)
+callA <- grepl("^A", qc$ntVar, ignore.case=T)
+callC <- grepl("^C", qc$ntVar, ignore.case=T)
+callG <- grepl("^G", qc$ntVar, ignore.case=T)
+callT <- grepl("^T", qc$ntVar, ignore.case=T)
+ntA <- sum(refA)
+ntC <- sum(refC)
+ntG <- sum(refG)
+ntT <- sum(refT)
+
+callList <- list(callA, callC, callG, callT)
+varList <- list(varA, varC, varG, varT)
+refList <- list(refA, refC, refG, refT)
+
+#tots <- c(
+#rep(sum(qc$nRef[refA], qc$nVar[refA]), 4),
+#rep(sum(qc$nRef[refC], qc$nVar[refC]), 4),
+#rep(sum(qc$nRef[refG], qc$nVar[refG]), 4),
+#rep(sum(qc$nRef[refT], qc$nVar[refT]), 4))
+
+nvar <- c(
+rep(sum(qc$nC[refA], qc$nG[refA], qc$nT[refA]), 4),
+rep(sum(qc$nA[refC], qc$nG[refC], qc$nT[refC]), 4),
+rep(sum(qc$nA[refG], qc$nC[refG], qc$nT[refG]), 4),
+rep(sum(qc$nA[refT], qc$nC[refT], qc$nG[refT]), 4))
+
+ncall <- c(
+rep(sum(qc$nVar[refA]), 4),
+rep(sum(qc$nVar[refC]), 4),
+rep(sum(qc$nVar[refG]), 4),
+rep(sum(qc$nVar[refT]), 4))
+
+nref <- c(
+rep(sum(qc$nRef[refA]), 4),
+rep(sum(qc$nRef[refC]), 4),
+rep(sum(qc$nRef[refG]), 4),
+rep(sum(qc$nRef[refT]), 4))
+
+tots <- nref + nvar
+
+nts <- c("A", "C", "G", "T")
+sumTrans <- data.frame(trans=paste(rep(nts, each=4), rep(nts, 4), sep=":"), npos=rep(c(ntA, ntC, ntG, ntT), each=4), cnt=rep(0, 16), tot=tots, nvar=nvar, nref=nref, ncall=ncall)
+
+for (i in 1:4) {
+ pos <- (i - 1) * 4
+for (j in 1:4) {
+# print(sum(refList[[i]] & varList[[j]]))
+
+ cnt <- sum(qc[refList[[i]] & varList[[j]], 8 + j])
+ if (i == j) {
+# cnt <- sum(qc[refList[[i]] & varN, 8 + j])
+ cnt <- NA
+ }
+ sumTrans$cnt[pos + j] <- cnt
+ #print(cnt)
+}}
+
+#sumTrans$cnt / sumTrans$nvar / sumTrans$tot * 1000000 -> tmp
+sumTrans$cnt / sumTrans$tot * 1000000 -> tmp
+
+#totalNT <- sum(qc$nVar) + sum(qc$nRef)
+#sumTrans$cnt / median(sumTrans$cnt, na.rm=T) -> sumTrans$cor
+sumTrans$cnt / median(sumTrans$cnt, na.rm=T) -> sumTrans$cor
+tmp / median(tmp, na.rm=T) -> sumTrans$cor2
+
+sumTrans$cor[sumTrans$npos <= 15] <- 1
+sumTrans$cor2[sumTrans$npos <= 15] <- 1
+
+#sumTrans$cnt / totalNT -> sumTrans$cor
+#sumTrans$cnt / min(sumTrans$cnt, na.rm=T) -> sumTrans$cor
+
+return(sumTrans)
+
+}
+
+calcTransCor <- function(qcRaw, lociFilt = NULL) {
+
+calcTrans(qcRaw, lociFilt) -> sumTrans
+
+pCor <- rep(1, nrow(qcRaw))
+paste(qcRaw$ntRef, substr(qcRaw$ntVar, 1, 1), sep=":") -> qcTrans
+
+for (i in c("A", "C", "G", "T")) {
+for (j in c("A", "C", "G", "T")) {
+ if (i != j) {
+ tTrans <- paste(i, j, sep=":")
+ pCor[grepl(tTrans, qcTrans, ignore.case=T)] <- sumTrans$cor2[sumTrans$trans == tTrans]
+ }
+
+}}
+return(pCor)
+}
+
+
+calcTransCorOld <- function(qc) {
+# Calculate transition bias
+refA <- grepl("A", qc$ntRef, ignore.case=T)
+refC <- grepl("C", qc$ntRef, ignore.case=T)
+refG <- grepl("G", qc$ntRef, ignore.case=T)
+refT <- grepl("T", qc$ntRef, ignore.case=T)
+varA <- grepl("A", qc$ntVar, ignore.case=T)
+varC <- grepl("C", qc$ntVar, ignore.case=T)
+varG <- grepl("G", qc$ntVar, ignore.case=T)
+varT <- grepl("T", qc$ntVar, ignore.case=T)
+varN <- grepl(".", qc$ntVar, ignore.case=T)
+callA <- grepl("^A", qc$ntVar, ignore.case=T)
+callC <- grepl("^C", qc$ntVar, ignore.case=T)
+callG <- grepl("^G", qc$ntVar, ignore.case=T)
+callT <- grepl("^T", qc$ntVar, ignore.case=T)
+
+callList <- list(callA, callC, callG, callT)
+varList <- list(varA, varC, varG, varT)
+refList <- list(refA, refC, refG, refT)
+
+tots <- c(
+rep(sum(qc$nRef[refA], qc$nVar[refA]), 4),
+rep(sum(qc$nRef[refC], qc$nVar[refC]), 4),
+rep(sum(qc$nRef[refG], qc$nVar[refG]), 4),
+rep(sum(qc$nRef[refT], qc$nVar[refT]), 4))
+
+nvar <- c(
+rep(sum(qc$nVar[refA]), 4),
+rep(sum(qc$nVar[refC]), 4),
+rep(sum(qc$nVar[refG]), 4),
+rep(sum(qc$nVar[refT]), 4))
+
+nref <- c(
+rep(sum(qc$nRef[refA]), 4),
+rep(sum(qc$nRef[refC]), 4),
+rep(sum(qc$nRef[refG]), 4),
+rep(sum(qc$nRef[refT]), 4))
+
+nts <- c("A", "C", "G", "T")
+sumTrans <- data.frame(trans=paste(rep(nts, each=4), rep(nts, 4), sep=":"), cnt=rep(0, 16), tot=tots, nvar=nvar, nref=nref)
+
+for (i in 1:4) {
+ pos <- (i - 1) * 4
+for (j in 1:4) {
+# print(sum(refList[[i]] & varList[[j]]))
+
+ cnt <- sum(qc[refList[[i]] & varList[[j]], 8 + j])
+ if (i == j) {
+# cnt <- sum(qc[refList[[i]] & varN, 8 + j])
+ cnt <- NA
+ }
+ sumTrans$cnt[pos + j] <- cnt
+ #print(cnt)
+}}
+#totalNT <- sum(qc$nVar) + sum(qc$nRef)
+sumTrans$cnt / median(sumTrans$cnt, na.rm=T) -> sumTrans$cor
+#sumTrans$cnt / totalNT -> sumTrans$cor
+#sumTrans$cnt / min(sumTrans$cnt, na.rm=T) -> sumTrans$cor
+#print(sumTrans)
+
+pCor <- rep(1, nrow(qc))
+for (i in 1:4) {
+ pos <- (i - 1) * 4
+for (j in 1:4) {
+ if (i != j) {
+ pCor[refList[[i]] & callList[[j]]] <- sumTrans$cor[pos + j]
+ }
+
+}}
+return(pCor)
+}
diff -r f1b32c09deaf -r a793ffd0cd04 falco/lib/perl/perAmpliconAnalysis.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/falco/lib/perl/perAmpliconAnalysis.pl Tue Dec 23 11:26:33 2014 -0500
@@ -0,0 +1,366 @@
+#!/usr/bin/perl -w
+
+use strict;
+
+my $bam = shift;
+my $ref = shift || "/gsdata/tmp/TSACP/ampliconPipeLine/ref/hg19/bwa-5/hg19.fa";
+my $manifest = shift || "/gsdata/tmp/TSACP/ampliconPipeLine/ref/manifests/TSACP/TruSeq_Amplicon_Cancer_Panel_Manifest_AFP1_PN15032433.txt";
+my $base = shift || "output";
+my $samtools = shift || "/gsdata/tmp/TSACP/ampliconPipeLine/bin/samtools/samtools-0.1.18/samtools";
+my $fai = $ref . ".fai";
+my $Qthreshold = 10;
+my $QavgLim = 20;
+
+open FAI, "<$fai";
+my %idx = ();
+print STDOUT "Reading fa index ... ";
+while () {
+ chomp;
+ my @row = split(/\t/, $_);
+ $idx{$row[0]} = [@row];
+}
+print STDOUT "done\n";
+
+close FAI;
+open REF, "<$ref";
+
+my %stats = ();
+my %manif = ();
+my %targets = ();
+my %probe = ();
+my %hash = ();
+
+open M, "<$manifest";
+while () { last if (/\[Probes\]/) }
+readline(M);
+while () {
+ chomp;
+ if (/\[Targets\]/) {
+ my $h = readline M;
+ chomp $h;
+ my @H = split(/\t/, $h);
+ last;
+ }
+ my @row = split(/\t/, $_);
+ $probe{$row[2]}{probe} = [@row];
+ $probe{$row[2]}{ULSO} = $row[9];
+ $probe{$row[2]}{DLSO} = $row[11];
+}
+
+while () {
+ chomp;
+ s/\s+$//;
+ my @row = split(/\t/, $_);
+ push @{$targets{$row[3]}{$_}}, $row[0] foreach ($row[4] .. $row[5]);
+ # $manif{$aTarget} = [@row];
+ my $aTarget = $row[0] . "-" . $row[3] . ":". "$row[4]-$row[5]";
+ if (not exists($stats{$aTarget}{target})) {
+ $stats{$aTarget}{target} = [@row];
+ $stats{$aTarget}{ln} = $row[5] - $row[4] + 1;
+ $stats{$aTarget}{n} = 0;
+ $stats{$aTarget}{sam} = [];
+ $stats{$aTarget}{depth} = 0;
+ $stats{$aTarget}{covs} = [];
+ $stats{$aTarget}{chr} = $row[3];
+ $stats{$aTarget}{start} = $row[4];
+ $stats{$aTarget}{end} = $row[5];
+ $stats{$aTarget}{ULSO} = $probe{$row[0]}{ULSO};
+ $stats{$aTarget}{DLSO} = $probe{$row[0]}{DLSO};
+ $stats{$aTarget}{assayStart} = $row[4] + length($probe{$row[0]}{ULSO});
+ $stats{$aTarget}{assayEnd} = $row[5] - length($probe{$row[0]}{DLSO});
+ }
+# print $row[0] . "\n";
+ foreach my $i (0 .. 5) {
+ foreach my $j (0 .. 5) {
+# print $row[4] + $i .":". ($row[5] - $row[4] + $j) .":". $row[0] . "\n";
+# print $row[4] + $i .":". ($row[5] - $row[4] - $j) .":". $row[0] . "\n";
+# print $row[4] - $i .":". ($row[5] - $row[4] + $j) .":". $row[0] . "\n";
+# print $row[4] - $i .":". ($row[5] - $row[4] - $j) .":". $row[0] . "\n";
+ $hash{$row[3]}{$row[4] + $i}{$row[5] - $row[4] + $j} = $aTarget;
+ $hash{$row[3]}{$row[4] + $i}{$row[5] - $row[4] - $j} = $aTarget;
+ $hash{$row[3]}{$row[4] - $i}{$row[5] - $row[4] + $j} = $aTarget;
+ $hash{$row[3]}{$row[4] - $i}{$row[5] - $row[4] - $j} = $aTarget;
+ }
+ }
+# sleep 1;
+}
+close M;
+
+# samtools view -b 25MUT5.bam chr9:133738302-133738491 | samtools calmd - /gsdata/tmp/TSACP/ampliconPipeLine/ref/hg19/bwa-5/hg19.fa | head
+
+#foreach my $amp (keys(%stats)) {
+
+# print $amp . "\n";
+# my @SAM = ();
+ # Fetch reads
+# my $coord = $stats{$amp}{chr} .":".$stats{$amp}{start}."-".$stats{$amp}{end};
+# my $ln = $stats{$amp}{ln};
+# print $coord . " $ln\n";
+
+# Open output
+open QC, ">$base.qc.ann.txt";
+open QC2, ">$base.qc2.ann.txt";
+open TARGET, ">$base.qc.targets.txt";
+
+#TODO Remove stampy check
+my $stampy = 0;
+open SAMH, "$samtools view -H $bam |";
+while () {
+ $stampy = 1 if (/PN:stampy/);
+ print STDERR "STAMPY SPOTTED!!!" if (/PN:stampy/);
+}
+close SAMH;
+
+open SAM, "$samtools view $bam |";
+#open SAM, "$samtools view -b $bam | $samtools calmd - $ref |";
+# open SAM, "$samtools view -b $bam $coord | $samtools calmd - $ref |";
+while () {
+ last unless (/^@/);
+}
+my $cnt = 0;
+my %hsh = ();
+my $pamp = ".";
+
+#Print header
+
+print QC "#" . join("\t", qw/chr pos amp dp ntRef ntVar nRef nVar nA nC nG nT nN ins del/) . "\n";
+print QC2 "#" . join("\t", qw/read chr pos amp event/) . "\n";
+print TARGET "#" . join("\t", qw/amp chr start end assayStart assayEnd depth/) . "\n";
+while () {
+ # Skip stampy reads? Nope, stampy is no longer needed
+ #TODO Remove stampy check
+# next if ($stampy == 1 && ! /XP:Z:BWA/);
+ chomp;
+ my @row = split(/\t/, $_);
+ my $l = 0;
+ my $cig = $row[5];
+ $cig =~ s/(\d+)[MD]/$l+=$1/eg;
+
+ my $amp = $hash{$row[2]}{$row[3]}{$l} || next;
+ #print "$row[2] $row[3] $l $amp\n";
+
+ push @{$stats{$amp}{sam}}, [@row];
+ $stats{$amp}{depth}++;
+ if (($pamp ne ".") && ($amp ne $pamp)) {
+ print STDOUT "Flushing $pamp $stats{$pamp}{depth}\n";
+# print TARGET join("\t", $pamp, map { $stats{$pamp}{$_} } qw/chr start end assayStart assayEnd depth/) . "\n";
+ my $data = &call($stats{$pamp}{sam});
+ &flush($data, $stats{$pamp}{chr}, $pamp, $stats{$pamp}{assayStart}, $stats{$pamp}{assayEnd});
+ $stats{$pamp}{sam} = [];
+ }
+ $pamp = $amp;
+ $cnt++;
+}
+
+close QC;
+close QC2;
+foreach my $pamp (keys %stats) {
+ print TARGET join("\t", $pamp, map { $stats{$pamp}{$_} || "NA"} qw/chr start end assayStart assayEnd depth/) . "\n";
+}
+close TARGET;
+
+
+# print join("\n", map {$hsh{$_} . ":" . $_} keys(%hsh)) . "\n";
+# print $cnt . "\n"; sleep 1;
+ close SAM;
+# $stats{$amp}{depth} = $cnt;
+# my $data = &call(\@SAM);
+# &flush($data, $stats{$amp}{chr}, $amp);
+
+#}
+
+#foreach my $amp (keys(%stats)) {
+
+# print $amp . ":" . scalar(@{$stats{$amp}{sam}}) . "\n";
+#}
+
+sub call {
+ my $SAM = shift;
+
+ my $ppos = 0;
+ my $pchr = "";
+ my %data = ();
+ my $refSeq = "";
+ my $c = 0;
+ my $col_chr = 2;
+ my $col_read = 9;
+ my $col_qual = 10;
+ my $col_pos = 3;
+ my $col_cigar = 5;
+ my $refSeqPad = "P";
+ my %cooc = ();
+ my $cnt = 1;
+ foreach my $r (0 .. $#$SAM) {
+ my @row = @{$SAM->[$r]};
+ my $read = $row[$col_read];
+ my $qual = $row[$col_qual];
+ my $pos = $row[$col_pos];
+ my @cig = ();
+ my $aln = 0;
+
+ # Calculate mean read quality and skip if it's below threshold
+ my $QSum = 0;
+ my $Qlen = length($qual);
+
+ $QSum += ord($qual) foreach 1 .. $Qlen;
+
+ my $Qavg = $QSum / $Qlen - 33;
+ next if ($Qavg < $QavgLim);
+
+ while ($row[$col_cigar] =~ /(\d+)(\D)/g) {
+ push @cig, [$1, $2];
+ $aln += $1;
+ }
+ if ($ppos != $pos) {
+ my $offset = $idx{$row[$col_chr]}->[2];
+ my $nl = int (($row[$col_pos] - 1) / $idx{$row[$col_chr]}->[3]);
+ $offset += $row[$col_pos] + $nl - 1;
+ seek REF, $offset, 0;
+ $refSeqPad = substr($refSeq, -1, 1);
+ read REF, $refSeq, 1000;# length($read) * 5;
+ $refSeq =~ s/\s//g;
+ $data{$row[$col_chr]}{$row[$col_pos] + $_}{"refNT"} = substr($refSeq, $_, 1) foreach (0 .. $aln);
+ $ppos = $pos;
+ }
+
+ foreach my $cigE (@cig) {
+ my $n = $cigE->[0];
+ if ($cigE->[1] eq "I") {
+ my $rpos = $pos - $row[$col_pos];
+ my $insert = substr($refSeq, $rpos - 1, 1) or print STDERR "Outside?" . $rpos . ":" . length($refSeq) . " $pos\n";
+ $insert .= substr($read, 0, $n, "");
+ my $qinsert .= substr($qual, 0, $n, "");
+ push @{$data{$row[$col_chr]}{$pos - 1}{"IR"}{$insert}}, $r;
+ $data{$row[$col_chr]}{$pos - 1}{"I"}{$insert}++;
+# push @{$cooc{$pos - 1 }{$insert}}, $cnt;
+ # $cooc{($pos - 1) .":".$insert}{$cnt} = 0;
+ }
+ elsif ($cigE->[1] eq "M") {
+ my $mseq = substr($read, 0, $n, "");
+ my $mqual = substr($qual, 0, $n, "");
+ foreach my $nt (0 .. (length($mseq) - 1)) {
+ $data{$row[$col_chr]}{$pos + $nt}{"DP"}++;
+ my $ntq = substr($mqual, $nt, 1);
+ my $ntn = substr($mseq, $nt, 1);
+ # if (ord($ntq) - 33 < $Qthreshold) {
+ # $ntn = "N";
+ # }
+ $data{$row[$col_chr]}{$pos + $nt}{"NT"}{$ntn}++;
+ push @{$data{$row[$col_chr]}{$pos + $nt}{"NTR"}{$ntn}}, $r if (uc($ntn) ne uc($data{$row[$col_chr]}{$pos + $nt}{refNT}));
+# if (($ntn ne $data{$row[$col_chr]}{$pos + $nt}{"refNT"}) && ($ntn ne "N")) {
+ #push @{$cooc{$pos + $nt}{$ntn}}, $cnt;
+ # $cooc{($pos + $nt) .":".$ntn}{$cnt} = 0;
+# }
+ }
+ $pos += $n;
+
+ }
+ elsif ($cigE->[1] eq "D") {
+ my $rpos = $pos - $row[$col_pos];
+ my $deletion = substr($refSeq, $rpos - 1, $n + 1); # Substring out of string error ?
+ push @{$data{$row[$col_chr]}{$pos - 1}{"DR"}{$deletion}}, $r;
+ $data{$row[$col_chr]}{$pos - 1}{"D"}{$deletion}++;
+ #push @{$cooc{$pos - 1 }{$deletion}}, $cnt;
+ # $cooc{($pos - 1) . ":" . $deletion}{$cnt} = 0;
+ $pos += $n;
+ }
+ }
+ $cnt++;
+ }
+ #my $thresh = scalar(@$SAM) * .05;
+ #&coocCalc(\%cooc, \%data, $thresh);
+ return \%data;
+}
+
+sub coocCalc {
+ my $cooc = shift;
+ my $data = shift;
+ my $thresh = shift;
+ my %rev = ();
+ # Filter low frequency variants
+ foreach my $k (keys(%$cooc)) {
+ if (scalar(keys(%{$cooc->{$k}})) < $thresh) {
+ delete($cooc->{$k});
+ next;
+ }
+ print STDERR "$k : $thresh : " . join(":", sort {$a cmp $b} keys(%{$cooc->{$k}})) . "\n";
+ }
+ my @muts = keys(%{$cooc});
+ foreach my $i (0 .. $#muts) {
+# my %hsh = map { $_ => 0 } keys(%{$cooc{$muts[$i]}});
+ foreach my $j (($i + 1) .. $#muts) {
+ my $score = 0;
+ foreach my $m (keys(%{$cooc->{$muts[$i]}})) {
+ if (exists($cooc->{$muts[$j]}{$m})) {
+ $score++;
+ }
+ }
+ print STDERR $muts[$i] .":". $muts[$j] . ":" . $score . "\n"; sleep 1;
+ }
+ }
+
+ #{read}{mutation}
+ #{mutation}{read}
+ sleep 1;
+}
+
+sub flush {
+ my $data = shift;
+ my $chr = shift;
+ my $amp = shift;
+ my $s = shift;
+ my $e = shift;
+ return unless (exists($data->{$chr}));
+ print STDOUT "printing ... ";
+ my %reads = (); # read->pos->nt
+# foreach my $p (sort {$a <=> $b} keys(%{$data->{$chr}})) {
+# unless (($p >= $s) && ($p <= $e)) {
+# next;
+# }
+ foreach my $p ($s .. $e) {
+ my $posdat = $data->{$chr}{$p} || next;
+ next unless (exists($posdat->{DP}));
+ my $refNT = uc($posdat->{refNT});
+ my @varOrd = sort { $posdat->{NT}{$b} <=> $posdat->{NT}{$a} } grep {$_ ne "N"} grep { $_ ne $refNT } keys(%{$posdat->{NT}});
+ my $seqNT = join("/", @varOrd);
+ #my $refCNT = scalar(@{$posdat->{NT}{$refNT} || []});
+ my $refCNT = $posdat->{NT}{$refNT} || 0;
+ my $varCNT = 0;
+ #$varCNT += scalar(@{$posdat->{NT}{$_}}) foreach (@varOrd);
+ #$varCNT += $posdat->{NT}{$_} foreach (@varOrd);
+ $varCNT = ($varOrd[0])?$posdat->{NT}{$varOrd[0]}:0;
+ my $Istr = ".";
+ my $Dstr = ".";
+ if ($varOrd[0]) {
+ $reads{$_}{$p} = "$refNT/$varOrd[0]" foreach (@{$posdat->{NTR}{$varOrd[0]}});
+ }
+ if (exists($posdat->{I})) {
+ $Istr = join("|", map { $_ . ":" . $posdat->{I}{$_} } keys(%{$posdat->{I}}));
+ foreach my $I (keys(%{$posdat->{IR}})) {
+ $reads{$_}{$p} = "$refNT/$I" foreach (@{$posdat->{IR}{$I}});
+ }
+ }
+ if (exists($posdat->{D})) {
+ $Dstr = join("|", map { $_ . ":" . $posdat->{D}{$_} } keys(%{$posdat->{D}}));
+ foreach my $D (keys(%{$posdat->{DR}})) {
+ $reads{$_}{$p} = "$D/$refNT" foreach (@{$posdat->{DR}{$D}});
+ }
+ }
+ #$Dstr = join("|", map { $_ . ":" . $posdat->{D}{$_} } keys(%{$posdat->{D}})) if exists($posdat->{D});
+ print QC join("\t", $chr, $p, $amp,
+ $posdat->{DP} || 0,
+ $posdat->{refNT} || ".",
+ $seqNT || ".",
+ $refCNT, $varCNT,
+ #(map { scalar(@{$posdat->{NT}{$_} || []})} qw(A C G T N)), $Istr, $Dstr,
+ (map { $posdat->{NT}{$_} || 0 } qw(A C G T N)), $Istr, $Dstr,
+ ) . "\n";
+ # delete $data->{$chr}{$p};
+ }
+ delete $data->{$chr};
+ foreach my $r (sort { $a <=> $b } keys(%reads)) {
+ print QC2 join("\t", $r, $chr, $_, $amp, $reads{$r}{$_}) . "\n" foreach (sort {$a <=> $b} keys(%{$reads{$r}}));
+ }
+ print STDOUT "done\n";
+}
+
diff -r f1b32c09deaf -r a793ffd0cd04 falco/lib/perl/qc2vcf.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/falco/lib/perl/qc2vcf.pl Tue Dec 23 11:26:33 2014 -0500
@@ -0,0 +1,135 @@
+#!/usr/bin/perl
+
+use strict;
+
+my $qc = shift;
+
+my $minDp = 100;
+my $minVarDp = 5;
+my $minVaf = 0.01;
+
+my $base = (split(/\//, $qc))[-1];
+my $header = qq(##fileformat=VCFv4.1
+##FORMAT=
+##FORMAT=
+##FORMAT=
+##INFO=
+#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT $base
+);
+
+my $chr = 0;
+my $pos = 1;
+my $target = 2;
+my $dp = 3;
+my $ref = 4;
+my $var = 5;
+my $nref = 6;
+my $nvar = 7;
+my $nA = 8;
+my $nC = 9;
+my $nG = 10;
+my $nT = 11;
+my $nN = 12;
+my $ins = 13;
+my $del = 14;
+my $pval = 15;
+my $pvala = 16;
+my $varQual = 17;
+my $insQual = 18;
+my $delQual = 19;
+
+my %acgt = (A => 0, C => 1, G => 2, T => 3);
+print $header;
+open QC, "<$qc";
+while () {
+ chomp;
+ next if (/^#/);
+ next if (/^X\./);
+ my @row = split(/\t/, $_);
+ next if ($row[$target] eq ".");
+
+ my $VAR = substr($row[$var], 0, 1);
+ my $REF = $row[$ref];
+ my $nvarall = $row[$nvar];
+ my $indel = $row[$ins] . $row[$del];
+ if ($row[$nvar] > 0 || $indel eq "..") {
+ my %ord = ( A => $row[$nA], C => $row[$nC], G => $row[$nG], T => $row[$nT], N => $row[$nN] );
+ $ord{uc($REF)} = -1; # Exclude ref from sort
+ $ord{N} = -1; # Exclude N!
+ my @order = sort { $ord{$b} <=> $ord{$a} } keys(%ord);
+ my $dpVar = $ord{$order[0]};
+ my $VAR = $order[0];
+ my $skip = 0;
+ my $tot = $row[$nref] + $dpVar;
+ my $vaf = ($tot > 0)?$dpVar / $tot:"NA"; # Excluding other alleles...
+ #$skip++ if ($tot == 0);
+ #$skip++ if ($ord{$order[0]} == $ord{$order[1]}); # Multiallelic, skipping for now
+ #$skip++ if ($dpVar == 0); # Skip if dpVar is zero, can occur when N is the only variant. Above line also protects for this.
+ #$skip++ if ($dpVar < $minVarDp);
+ #$skip++ if ($row[$dp] < $minDp);
+ #$skip++ if ($vaf < $minVaf);
+ my $gt = "$row[$ref]/$VAR";
+ my $qual = $row[$varQual] || 0;
+ $skip++ if ($qual == 0);
+ my $filter = 0;
+ my $info = "TARGET=$row[$target]";
+ my $format = "DP:AD:AB";
+ my $sample = "$row[$dp]:$row[$nref],$dpVar:$vaf";
+ print join("\t", @row[$chr, $pos], ".", $REF, $VAR, $qual, $filter, $info, $format, $sample) . "\n" if ($skip == 0);
+ }
+ # INDEL
+ if ($row[$ins] ne ".") {
+ my $skip = 0;
+ # Modify coordinte to conform to VCF?
+ my %inss = split(/[\|\:]/, $row[$ins]);
+ my @keys = sort { $inss{$b} <=> $inss{$a} } keys %inss;
+# if ($inss{$keys[0]} > $row[$nvar]) {
+# $nvarall = $inss{$keys[0]};
+# $VAR = $keys[0];
+# $row[$dp] += $nvarall;
+# }
+ my $nins = $inss{$keys[0]};
+ my $DP = $row[$dp];
+ my $nREF = $DP - $nins;
+ #$skip++ if ($nins < $minVarDp);
+ #$skip++ if ($DP < $minDp);
+ my $vaf = $nins / $row[$dp];
+ #$skip++ if ($vaf < $minVaf);
+ my $gt = "$row[$ref]/$keys[0]";
+ my $qual = $row[$insQual];
+ $skip++ if ($qual == 0);
+ my $filter = 0;
+ my $info = "TARGET=$row[$target]";
+ my $format = "DP:AD:AB";
+ my $sample = "$DP:$nREF,$nins:$vaf";
+ print join("\t", @row[$chr, $pos], ".", $REF, $keys[0], $qual, $filter, $info, $format, $sample) . "\n" if ($skip == 0);
+ }
+ if ($row[$del] ne ".") {
+ my $skip = 0;
+ # Modify coordinte to conform to VCF?
+ my %dels = split(/[\|\:]/, $row[$del]);
+ my @keys = sort { $dels{$b} <=> $dels{$a} } keys %dels;
+# if ($dels{$keys[0]} > $row[$nvar]) {
+# $nvarall = $dels{$keys[0]};
+# $REF = $keys[0];
+# $row[$dp] += $nvarall;
+# }
+ my $ndel = $dels{$keys[0]};
+ my $DP = $row[$dp];
+ my $nREF = $DP - $ndel;
+ #$skip++ if ($ndel < $minVarDp);
+ #$skip++ if ($DP < $minDp);
+ my $vaf = $ndel / $DP;
+ #$skip++ if ($vaf < $minVaf);
+ my $gt = "$keys[0]/$row[$ref]";
+ my $qual = $row[$delQual];
+ $skip++ if ($qual == 0);
+ my $filter = 0;
+ my $info = "TARGET=$row[$target]";
+ my $format = "DP:AD:AB";
+ my $sample = "$DP:$nREF,$ndel:$vaf";
+ print join("\t", @row[$chr, $pos], ".", $keys[0], $row[$ref], $qual, $filter, $info, $format, $sample) . "\n" if ($skip == 0);
+ }
+}
+close QC;
+
diff -r f1b32c09deaf -r a793ffd0cd04 falco/lib/perl/qcFilt.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/falco/lib/perl/qcFilt.pl Tue Dec 23 11:26:33 2014 -0500
@@ -0,0 +1,54 @@
+#!/usr/bin/perl
+
+use strict;
+
+my $qcQual = shift;
+my $output = shift;
+
+my $minDp = shift || 100;
+my $minVar = shift || 10;
+my $minIns = shift || $minVar;
+my $minDel = shift || $minVar;
+my $minVarQ = shift || 100;
+my $minInsQ = shift || 200;
+my $minDelQ = shift || 200;
+
+open QC, "<$qcQual";
+my $head = readline QC;
+chomp;
+$head =~ s/\s+$//;
+my @rhead = split(/\t/, $head);
+my $coln = 0;
+my %col = map { $_ => $coln++ } @rhead;
+open OUT, ">$output.qc.ann.qual.filt.txt";
+print OUT "$head\n";
+
+while () {
+ my $line = $_;
+ my @row = split(/\t/, $_);
+ my $key = $row[$col{"X.chr"}] . ":" . $row[$col{pos}];
+ next if ($row[$col{dp}] < $minDp);
+ # Var
+ if ($row[$col{qScore}] >= $minVarQ && $row[$col{nVar}] > $minVar) {
+ print OUT $line;
+ next;
+ }
+ # Ins
+ if ($row[$col{qScoreI}] >= $minInsQ) {
+ $row[$col{ins}] =~ /:(\d+)/;
+ if ($1 >= $minIns) {
+ print OUT $line;
+ next;
+ }
+ }
+ # Del
+ if ($row[$col{qScoreD}] >= $minDelQ) {
+ $row[$col{del}] =~ /:(\d+)/;
+ if ($1 >= $minDel) {
+ print OUT $line;
+ next;
+ }
+ }
+}
+
+close OUT;
diff -r f1b32c09deaf -r a793ffd0cd04 falco/misc/addQc.pl
--- a/falco/misc/addQc.pl Fri Oct 10 11:28:57 2014 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,74 +0,0 @@
-#!/usr/bin/perl
-
-use strict;
-
-my $file = shift;
-my $qc = shift;
-my %crds = ();
-open F, "<$file";
-my $fhead = readline F;
-chomp $fhead;
-print $fhead . "\t" . join("\t", qw/dp dpref dpvar vaf/) . "\n";
-while () {
- chomp;
- my @row = split(/\t/, $_, -1);
-# print STDERR scalar(@row) . "\n";
- push @{$crds{$row[0]}{$row[1]}}, [@row];
-}
-close F;
-
-# INDELS?
-my %ntcol = (A => 8, C => 9, G => 10, T => 11);
-open QC, "<$qc";
-while () {
- chomp;
- my @row = split(/\t/, $_);
- next if ($row[2] eq ".");
- my $muts = $crds{$row[0]}{$row[1]} || undef;
- next unless $muts;
- foreach my $mut (@$muts) {
- my $rl = length($mut->[3]);
- my $vl = length($mut->[4]);
-
- my $refDP = $row[$ntcol{uc($row[4])}];
-
- if ($rl > $vl && $row[14] ne ".") { # Deletion
- my %dels = split(/[\|\:]/, $row[14]);
- my @del = sort { $dels{$b} <=> $dels{$a} } keys(%dels);
- my $deldp = $dels{$del[0]};
- print join("\t", @{$mut}, $row[3], $row[6] - $deldp, $deldp, $deldp / $row[3]) . "\n";
-
- }
- elsif ($rl < $vl && $row[13] ne ".") { # Insertion
- my %inss = split(/[\|:]/, $row[13]);
- my @ins = sort { $inss{$b} <=> $inss{$a} } keys(%inss);
- my $insdp = $inss{$ins[0]};
- print join("\t", @{$mut}, $row[3], $row[6] - $insdp, $insdp, $insdp / $row[3]) . "\n";
- }
- elsif ($rl == $vl) {
- my @varnts = sort { $row[$ntcol{$b}] <=> $row[$ntcol{$a}] } split(/\//, $row[5]);
- next if ($row[$ntcol{$varnts[0]}] == $row[$ntcol{$varnts[1]}]); # Equeal variant counts
- my $vardp = $row[$ntcol{$varnts[0]}];
- print join("\t", @{$mut}, $row[3], $row[6], $vardp, $vardp / ($row[6] + $vardp)) . "\n";
- }
-
- }
- next;
-# if (exists($crds{$row[0]}{$row[1]}) && length($crds{$row[0]}{$row[1]}->[3]) > 1) { # In case of snp
- if (exists($crds{$row[0]}{$row[1]})) { # && length($crds{$row[0]}{$row[1]}->[3]) > 1) { # In case of snp
- print join("\t", @{$_}, $row[3], $row[6], $row[7], $row[7] / $row[3]) . "\n" foreach @{$crds{$row[0]}{$row[1]}};
- }
- if ($row[14] ne "." && exists($crds{$row[0]}{$row[1]})) { # In case of deletion!
- my %dels = split(/[\|\:]/, $row[14]);
- my @del = sort { $dels{$b} <=> $dels{$a} } keys(%dels);
- $row[7] = $dels{$del[0]};
- print join("\t", @{$_}, $row[3], $row[6] - $row[7], $row[7], $row[7] / $row[3]) . "\n" foreach @{$crds{$row[0]}{$row[1]}};
- }
- if ($row[13] ne "." && exists($crds{$row[0]}{$row[1]})) { # In case of insertion
- my %inss = split(/[\|:]/, $row[13]);
- my @ins = sort { $inss{$b} <=> $inss{$a} } keys(%inss);
- $row[7] = $inss{$ins[0]};
- print join("\t", @{$_}, $row[3], $row[6] - $row[7], $row[7], $row[7] / $row[3]) . "\n" foreach @{$crds{$row[0]}{$row[1]}};
- }
-}
-close QC;
diff -r f1b32c09deaf -r a793ffd0cd04 falco/misc/addQual.R
--- a/falco/misc/addQual.R Fri Oct 10 11:28:57 2014 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,21 +0,0 @@
-#!/usr/bin/Rscript
-args <- commandArgs(T)
-qcFile <- args[1]
-output <- args[2]
-funcR <- args[3]
-lociFiltFile <- args[4]
-
-read.delim(lociFiltFile, header=F) -> lociFiltD
-lociFilt <- paste(paste("chr", lociFiltD[,1], sep=""), lociFiltD[,2], sep=":")
-
-source(funcR)
-read.delim(qcFile, header=T) -> qc
-
-#addQual(qc) -> qc.qual
-addQualTransCor(qc, lociFilt) -> qc.qual
-#addQualVarCor(qc) -> qc.qual
-
-write.table(qc.qual, output, row.names=F, col.names=T, quote=F, sep="\t")
-#write.table(qc[selection,], output, row.names=F, col.names=T, quote=F, sep="\t")
-
-
diff -r f1b32c09deaf -r a793ffd0cd04 falco/misc/alignmentStats.pl
--- a/falco/misc/alignmentStats.pl Fri Oct 10 11:28:57 2014 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,69 +0,0 @@
-#!/usr/bin/perl -w
-
-use strict;
-
-my $targetBed = shift;
-my @bams = @ARGV;
-$|= 1;
-
-foreach my $bam (@bams) {
- my $file = `basename $bam`;
- chomp $file;
- my $res0 = `samtools idxstats $bam`;
- my @Qdat = ();
- my %Q = ();
- my $mapped = 0;
- my $ontarget = 0;
- my $unmapped = 0;
- while ($res0 =~ /(\S+)\s(\S+)\s(\S+)\s(\S+)\n/g) {
- if ($1 ne "*") {
- $mapped += $3;
- $unmapped += $4;
- }
- else {
- $unmapped += $3;
- $unmapped += $4;
- }
- }
- my %Qcon = map { chr($_ + 33) => $_ } (0 .. 40);
- open PIPE, "samtools view -XF0x014 -L $targetBed $bam | cut -f11 |";
- while () {
- my @is = (0 .. (length($_) - 2));
- $ontarget++;
- foreach my $i (@is) {
- $Qdat[$i][$Qcon{substr($_, $i, 1)}]++;
- }
- }
-
- open PIPE, "samtools view -XF0x004 -f0x010 -L $targetBed $bam | cut -f11 |";
- while () {
- my @is = reverse(0 .. (length($_) - 2));
- $ontarget++;
- foreach my $i (@is) {
- $Qdat[$i][$Qcon{substr($_, $i, 1)}]++;
- }
- }
-
- open QUAL, ">$file.qual.tsv";
- print QUAL join("\t", "pos", 0 .. $#Qdat) . "\n";
- my $q0 = 0;
- my $q10 = 0;
- my $q20 = 0;
- my $q30 = 0;
- my $qtot = 0;
- foreach my $p (0 .. $#Qdat) {
- print QUAL join("\t", $p, map { $Qdat[$p][$_] || 0 } (0 .. $#{$Qdat[$p]})) . "\n";
- foreach my $q (0 .. $#{$Qdat[$p]}) {
- $qtot += $Qdat[$p][$q] || 0;
- $q30 += $Qdat[$p][$q] || 0 if ($q >= 30);
- $q20 += $Qdat[$p][$q] || 0 if ($q >= 20 && $q < 30);
- $q10 += $Qdat[$p][$q] || 0 if ($q >= 10 && $q < 20);
- $q0 += $Qdat[$p][$q] || 0 if ($q < 10);
- }
- }
- close QUAL;
-
- my $offtarget = $mapped - $ontarget;
- print join("\t", $file, $mapped, $ontarget, $offtarget, $unmapped, $q30/$qtot, $q20/$qtot, $q10/$qtot, $q0/$qtot) . "\n";
-
-}
diff -r f1b32c09deaf -r a793ffd0cd04 falco/misc/attachGff.pl
--- a/falco/misc/attachGff.pl Fri Oct 10 11:28:57 2014 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,69 +0,0 @@
-#!/usr/bin/perl -w
-#
-# TODO: tiled targets give wrong length...
-use strict;
-
-my $gff = shift;
-my $covtable = shift;
-
-my $QCrep = shift || "qc.txt";
-
-my %targets = ();
-my %stats = ();
-my %manif = ();
-open M, "<$gff";
-while () {
- if (/^track/) {
- my $h = readline M;
- chomp $h;
- my @H = split(/\t/, $h);
-# print $_ . ":" . $H[$_] . "\n" foreach (0 .. $#H);
- last;
- }
-}
-while () {
- chomp;
- s/\s+$//;
- my @row = split(/\t/, $_);
- push @{$targets{$row[0]}{$_}}, $row[8] foreach ($row[3] .. $row[4]);
- $manif{$row[8]} = [@row];
- if (not exists($stats{$row[8]}{dat})) {
- $stats{$row[8]}{dat} = [@row];
- $stats{$row[8]}{ln} = $row[4] - $row[3] + 1;
- $stats{$row[8]}{n} = 0;
- $stats{$row[8]}{covs} = [];
- }
-}
-close M;
-
-open C, "<$covtable";
-while () {
- chomp;
- my @row = split(/\t/, $_);
- $#row = 13;
- if (exists($targets{$row[0]}{$row[1]})) {
- foreach my $t (@{$targets{$row[0]}{$row[1]}}) {
- $stats{$t}{n}++;
- $stats{$t}{covSum} += $row[2];
- push @{$stats{$t}{covs}}, $row[2];
- }
- }
- print join("\t", @row[0 .. 1],
- (join("|", @{$targets{$row[0]}{$row[1]} || ["."]} )),
- (@row[2 .. 11]),
- (map { $_ || "." } @row[12 .. $#row]),
- ). "\n";
-# sleep 1;
-}
-close C;
-
-# Calc assay stats
-open QC, ">$QCrep";
-foreach my $t (sort keys %stats) {
- my @covs = sort {$a <=> $b} @{$stats{$t}{covs}};
- my $mean = ($stats{$t}{n} != 0)?$stats{$t}{covSum} / $stats{$t}{n}:0;
- #print join("\t", @{$stats{$t}{dat}}, $mean, map { $stats{$t}{$_} } qw/ln n/). "\n";
- print QC join("\t", $t, sprintf("%.2f", $mean), map { $stats{$t}{$_} } qw/ln n/). "\n";
-
-}
-close QC;
diff -r f1b32c09deaf -r a793ffd0cd04 falco/misc/attachManifest.pl
--- a/falco/misc/attachManifest.pl Fri Oct 10 11:28:57 2014 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,83 +0,0 @@
-#!/usr/bin/perl -w
-#
-# TODO: tiled targets give wrong length...
-use strict;
-
-my $manifest = shift;
-my $covtable = shift;
-
-my $QCrep = shift || "qc.txt";
-
-my %targets = ();
-my %stats = ();
-my %manif = ();
-open M, "<$manifest";
-while () { last if (/\[Probes\]/) }
-readline(M);
-while () {
- chomp;
- if (/\[Targets\]/) {
- my $h = readline M;
- chomp $h;
- my @H = split(/\t/, $h);
-# print $_ . ":" . $H[$_] . "\n" foreach (0 .. $#H);
- last;
- }
- my @row = split(/\t/, $_);
- $stats{$row[2]}{probe} = [@row];
- $stats{$row[2]}{ULSO} = $row[9];
- $stats{$row[2]}{DLSO} = $row[11];
-
-
-}
-while () {
- chomp;
- s/\s+$//;
- my @row = split(/\t/, $_);
- push @{$targets{$row[3]}{$_}}, $row[0] foreach ($row[4] .. $row[5]);
- $manif{$row[0]} = [@row];
- if (not exists($stats{$row[0]}{target})) {
- $stats{$row[0]}{target} = [@row];
- $stats{$row[0]}{ln} = $row[5] - $row[4] + 1;
- $stats{$row[0]}{n} = 0;
- $stats{$row[0]}{covs} = [];
- $stats{$row[0]}{chr} = $row[3];
- $stats{$row[0]}{start} = $row[4];
- $stats{$row[0]}{end} = $row[5];
- $stats{$row[0]}{assayStart} = $row[4] + length($stats{$row[0]}{ULSO});
- $stats{$row[0]}{assayEnd} = $row[5] - length($stats{$row[0]}{DLSO});
- }
-}
-close M;
-
-open C, "<$covtable";
-while () {
- chomp;
- my @row = split(/\t/, $_);
- $#row = 13;
- if (exists($targets{$row[0]}{$row[1]})) {
- foreach my $t (@{$targets{$row[0]}{$row[1]}}) {
- $stats{$t}{n}++;
- $stats{$t}{covSum} += $row[2];
- push @{$stats{$t}{covs}}, $row[2];
- }
- }
- print join("\t", @row[0 .. 1],
- (join("|", @{$targets{$row[0]}{$row[1]} || ["."]} )),
- (@row[2 .. 11]),
- (map { $_ || "." } @row[12 .. $#row]),
- ). "\n";
-# sleep 1;
-}
-close C;
-
-# Calc assay stats
-open QC, ">$QCrep";
-foreach my $t (sort keys %stats) {
- my @covs = sort {$a <=> $b} @{$stats{$t}{covs} || []};
- my $mean = ($stats{$t}{n} != 0)?$stats{$t}{covSum} / $stats{$t}{n}:0;
- #print join("\t", @{$stats{$t}{dat}}, $mean, map { $stats{$t}{$_} } qw/ln n/). "\n";
- print QC join("\t", $t, sprintf("%.2f", $mean), map { $stats{$t}{$_} } qw/ln n chr start end assayStart assayEnd/). "\n";
-
-}
-close QC;
diff -r f1b32c09deaf -r a793ffd0cd04 falco/misc/checkRunComplete.pl
--- a/falco/misc/checkRunComplete.pl Fri Oct 10 11:28:57 2014 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,61 +0,0 @@
-#!/usr/bin/perl
-
-use strict;
-
-# Check RTAcomplete.txt
-# Check existence in AnalisysOutput
-
-my $base = "/ccagc/lib/pipelines/TSACP/falco/misc";
-my $getData = "$base/getData.pl";
-my $startAn = "$base/startAnalysis.pl";
-
-my $Project = shift or die "No project specified!\n";
-my $AnalysisOutput = shift || "./"; #"/ccagc/data/SeqAnalysis/$Project";
-
-my $MiSeqOutput = shift || "/ccagc/data/processing/MiSeqOutput/";
-
-
-my $LOG = shift || "$AnalysisOutput/log/$Project.log";
-
-if (! -d "$AnalysisOutput/log") {
- mkdir "$AnalysisOutput/log";
-}
-
-my %log = ();
-open LOG, "<$LOG";
-while () {
- chomp;
- # runDir date
- my @row = split(/\t/, $_);
- $log{$row[0]} = [@row];
-}
-close LOG;
-
-my @runs = `find $MiSeqOutput -maxdepth 2 -iname "RTAComplete.txt"`;
-my %runHsh = ();
-
-foreach my $r (@runs) {
- chomp;
- my $runDir = `dirname $r`;
- chomp $runDir;
- if (exists($log{$runDir})) {
- print STDERR "$runDir previously checked\n";
- next;
- }
- my @row = split(/\//, $runDir);
- my @run = split(/\_/, $row[$#row]);
- my $outDir = $AnalysisOutput . "/" . join("_", @run[0 .. 2]);
- my $baseDir = $AnalysisOutput;
- if (-e $outDir) {
- print STDERR "$outDir exists, skipping\n";
- } else {
- print STDERR "perl $getData $runDir $Project\n";
- system("perl $getData $runDir $Project $baseDir");
- print STDERR "perl $startAn $outDir\n";
- system("perl $startAn $outDir");
- open LOG, ">>$LOG";
- print LOG join("\t", $runDir, scalar(localtime())) . "\n";
- close LOG;
- }
-}
-
diff -r f1b32c09deaf -r a793ffd0cd04 falco/misc/errorRate.pl
--- a/falco/misc/errorRate.pl Fri Oct 10 11:28:57 2014 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,36 +0,0 @@
-#!/usr/bin/perl -w
-#
-
-use strict;
-my @files = @ARGV;
-foreach my $file (@files) {
-my $ref = 0;
-my $var = 0;
-
-my $file = shift;
-
-open F, "<$file";
-my $header = readline F; # Skip header
-chomp $header;
-my @hrow = split(/\t/, $header);
-my $ci = 0;
-my %col = ();
-$col{$_} = $ci++ foreach (@hrow);
-
-
-while () {
- chomp;
- my @row = split(/\t/, $_);
- #next if ($row[2] eq ".");
- #print join("\t", $row[6], $row[7]) . "\n"; sleep 1;
- my $ntRef = uc($row[4]);
- $ref += $row[$col{"n$ntRef"}];
- $var += $row[$col{"n$_"}] foreach (grep { $_ ne $ntRef } qw/A C G T/);
-}
-
-close F;
-my @path = split(/\//, $file);
-my $rate = $var / ($ref + $var) * 100;
-#print STDERR "$var/$ref -> $rate%\n";
-print "$path[$#path]\t$var\t$ref\t$rate\n";
-}
diff -r f1b32c09deaf -r a793ffd0cd04 falco/misc/filter.R
--- a/falco/misc/filter.R Fri Oct 10 11:28:57 2014 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,38 +0,0 @@
-#!/usr/bin/Rscript
-args <- commandArgs(T)
-resFile <- args[1]
-qcFile <- args[2]
-varLim <- args[3]
-dpLim <- args[4]
-quants <- args[5]
-output <- args[6]
-
-read.delim(resFile, header=T) -> d
-read.delim(qcFile, header=T) -> qc
-
-# Calculate errorrate per amplicon
-amps <- unique(qc$X.amp)
-for (i in amps) {
- sel <- qc$X.amp == amps
- prob <- sum(qc$nVar[sel]) / (sum(qc$nRef) + sum(qc$nVar))
- pees <- pbinom(qc$nVar[sel], qc$nRef + qc$nVar, prob, lower.tail=F)
- Qscore <- -10 * log10(pees)
-
-}
-
-quantile(d$vaf, as.numeric(quants), na.rm=T) -> vafCut
-
-sel1 <- d$vaf > vafCut
-sel2 <- d$dp > dpLim
-sel3 <- d$dpvar > varLim
-
-sel <- sel1 & sel2 & sel3
-write.table(d[sel,], paste(output, "res.f1.tsv", sep="."), row.names=F, col.names=T, quote=F, sep="\t")
-
-dir.create(output)
-setwd(output)
-
-png(paste(output, "vafcut", "png", sep="."))
-plot(density(log10(d$vaf), na.rm=T), main=paste("VAF cutoff", vafCut, sep=" "))
-abline(v=log10(vafCut), col=2)
-dev.off()
diff -r f1b32c09deaf -r a793ffd0cd04 falco/misc/filter.pl
--- a/falco/misc/filter.pl Fri Oct 10 11:28:57 2014 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,100 +0,0 @@
-#!/usr/bin/perl -w
-#
-
-use strict;
-
-my $file = shift;
-my $canonicals = shift;
-
-my %can = ();
-my %nm = ();
-
-open CAN, "<$canonicals";
-
-while () {
- chomp;
- my @row = split(/\t/, $_);
- $can{$row[0]} = @row[1 .. $#row];
- my $pri = 0;
- $nm{$_} = $pri++ foreach @row[1 .. $#row];
-# print STDERR join(":", @row[0,1]) . "\n"; sleep 1;
-}
-
-open F, "<$file";
-my $head = readline(F);
-chomp $head;
-print $head . "\tvaf\n";
-my @rowHead = split(/\t/, $head);
-my $colN = 0;
-my %col = map {
-# print STDERR $_ . " $colN\n";
- $_ => $colN++ } @rowHead;
-
-while () {
- chomp;
- my $line = $_;
- my @row = split(/\t/, $_);
- #g12 t15
- next unless (exists($can{$row[$col{Gene_Name}]}));
- my $trans = $row[$col{Transcript_ID}];
- $trans =~ s/\..*$//;
-
- if (not exists($nm{$trans})) {
- print STDERR "Unknown transcript: $row[$col{Gene_Name}] $trans\n";
- next;
- }
- elsif ($nm{$trans} != 0) {
- print STDERR "Non cannonical: $row[$col{Gene_Name}] $trans\n";
- next;
- }
-
- #next unless ($nm{$trans} == 0);
-
- # Do not filter out clinicvar loci
- #print $_ . "\n" if ($row[$col{Tag}] eq "Clinvar");
- next if ($row[$col{Tag}] eq "Clinvar");
-# print STDERR $row[15]; sleep 1;
-# print STDERR $nm{$row[15]}; sleep 1;
-
-#CODON_CHANGE_PLUS_CODON_DELETION
-#CODON_DELETION
-#DOWNSTREAM
-#EXON
-#FRAME_SHIFT
-#INTRON
-#NON_SYNONYMOUS_CODING
-#SPLICE_SITE_ACCEPTOR
-#SYNONYMOUS_CODING
-#UPSTREAM
-#UTR_3_PRIME
-
- # Filter INTERGENIC
- next if ($row[$col{Context}] eq "INTERGENIC");
-
- # Filter SYNONYMOUS_CODING
- # Dubbel mutatie BRAF
-# next if ($row[$col{Context}] eq "SYNONYMOUS_CODING");
-
- # Filter INTRON
- next if ($row[$col{Context}] eq "INTRON");
-
- # Filter dbSNP
-# next if ($row[2] ne ".");
-
- # Filter depth
- next if ($row[$col{DP}] < 100);
-
- my ($nref, $nvar) = split(/,/, $row[$col{AD}]);
- # Filter var depth
-# print STDERR "DEBUG: " . $nvar . "\n"; sleep 1;
- next if ($nvar < 10);
-
- my $vaf = $nvar / ($nref + $nvar);
-# print STDERR "DEBUG: " . $vaf . "\n";
- # Filter vaf
- next if ($vaf < 0.01);
-
-
- print $_ . "\t$vaf\n";
-}
-close F;
diff -r f1b32c09deaf -r a793ffd0cd04 falco/misc/filterQC.R
--- a/falco/misc/filterQC.R Fri Oct 10 11:28:57 2014 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,118 +0,0 @@
-#!/usr/bin/Rscript
-args <- commandArgs(T)
-qcFile <- args[1]
-varLim <- as.numeric(args[2])
-dpLim <- as.numeric(args[3])
-quants <- as.numeric(args[4])
-output <- args[5]
-
-read.delim(qcFile, header=T) -> d
-
-vaf <- d$nVar / (d$nRef + d$nVar)
-
-sapply(1:nrow(d), function(x) {
- if (d$del[x] != ".") {
- dels <- matrix(unlist(strsplit(as.character(d$del[x]), c("[:|]"))), byrow=T, ncol=2)
- delRanks <- rank(dels[,2])
- delDp <- as.numeric(dels[rev(delRanks)[1],2])
- if (delDp > varLim) {
- return(T)
- }
- }
- if (d$ins[x] != ".") {
- ins <- matrix(unlist(strsplit(as.character(d$ins[x]), c("[:|]"))), byrow=T, ncol=2)
- insRanks <- rank(ins[,2])
- insDp <- as.numeric(ins[rev(insRanks)[1],2])
- if (insDp > varLim) {
- return(T)
- }
-
- }
- return(F)
-}) -> indelSel
-
-
-#dels <- matrix(unlist(strsplit(d$del, ":|"))
-#inss <- strsplit(d$ins, ":|")
-#d$vafDel
-#d$vafIns
-
-quantile(vaf, as.numeric(quants), na.rm=T) -> vafCut
-
-sel1 <- vaf > vafCut
-sel2 <- d$dp > dpLim
-sel3 <- (d$nVar > varLim) | indelSel
-
-sel <- sel1 & sel2 & sel3
-write.table(d[sel,], paste(output, "res.sigfilt.tsv", sep="."), row.names=F, col.names=F, quote=F, sep="\t")
-
-dir.create(output)
-setwd(output)
-
-png(paste(output, "vafcut", "png", sep="."))
-plot(density(log10(vaf), na.rm=T), main=paste("VAF cutoff", vafCut, sep=" "))
-abline(v=log10(vafCut), col=2)
-dev.off()
-
-ampls <- unique(d$amp)
-
-pdf(paste(output, "pdf", sep="."), width=14)
-sapply(ampls, function(x) {
- sel <- d[,3] == x
- main = x
-# plot(density(as.numeric(as.vector(d[sel,8]))), main=main)
-# hist(as.numeric(as.vector(d[sel,8])), breaks=range(d[sel,8])[2], main=main)
-
- # Get indels
- dels <- (d[,15] != ".") & sel
- ins <- (d[,14] != ".") & sel
-
- pNoise <- sum(d$nVar[sel]) / (sum(d$nRef[sel]) + sum(d$nVar[sel]))
- print(pNoise)
- pVals <- ppois(d$nVar[sel], (d$nVar[sel] + d$nRef[sel]) * pNoise, lower.tail=F)
- Qval <- -10*log10(pVals)
- noiseThrs <- qpois(0.01, (d$nVar[sel] + d$nRef[sel]) * pNoise)
-
- plot(d[sel,2], d[sel,4], pch=20, type="s", ylim=c(0, max(d[sel,4])), main=main)
- lines(d[sel,2], d[sel,7], pch=20, type="s", col=2)
- lines(d[sel,2], d[sel,8], pch=20, type="s",col=3)
-
- ampMax <- max(d[sel,4])
-
- for (idx in which(dels)) {
- matrix(unlist(strsplit(as.character(as.vector(d[idx,15])), "[:|]")), ncol=2, byrow=T) -> delDat
- nchar(delDat[,1]) -> lngts
- dpths <- as.numeric(delDat[,2])
- rect(d[idx,2] + 1, ampMax - dpths , d[idx,2] + lngts, ampMax, col="#00000040")
- }
-
- for (idx in which(ins)) {
- matrix(unlist(strsplit(as.character(as.vector(d[idx,14])), "[:|]")), ncol=2, byrow=T) -> insDat
- nchar(insDat[,1]) -> lngts
- dpths <- as.numeric(insDat[,2])
- for (i in 1:ncol(insDat)) {
- polygon(
- c(d[idx,2], d[idx,2] - (.5 * lngts[i]), d[idx, 2] + (.5 * lngts[i])),
- c(ampMax, ampMax - dpths[i], ampMax - dpths[i]),
- , col="#00000040")
- }
- }
-
- #abline(v=d[dels,2], col=4)
- #abline(v=d[ins,2], col=5)
-
- vaf <- d[sel,8]/(d[sel,7] + d[sel,8])
-
- plot(density(vaf), main=main)
-
- hist(vaf, breaks=100, main=main)
-
- plot(d[sel,2], d[sel,8]/(d[sel,7] + d[sel,8]), pch=20, cex=.5, ylim=c(0,1), main=main)
- points(d[sel,2], noiseThrs / (d[sel,7] + d[sel,8]), col=2, pch=20)
- plot(d[sel,2], log10(d[sel,8]/(d[sel,7] + d[sel,8])), pch=20, cex=.5, ylim=c(-4,0), main=main)
- points(d[sel,2], log10(noiseThrs / (d[sel,7] + d[sel,8])), col=2, pch=20)
- print(x)
-# print(sum(sel))
- })
-
-dev.off()
diff -r f1b32c09deaf -r a793ffd0cd04 falco/misc/func.R
--- a/falco/misc/func.R Fri Oct 10 11:28:57 2014 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,569 +0,0 @@
-#!/usr/bin/Rscript
-#args <- commandArgs(T)
-#qcFile <- args[1]
-#output <- args[2]
-
-#read.delim(qcFile, header=T) -> qc
-
-addQual <- function(qc) {
-
-qc$del <- as.character(as.vector(qc$del))
-qc$ins <- as.vector(qc$ins)
-qc$pVal <- 1
-qc$pValAdj <- 1
-#qc$pValPois <- 1
-qc$qScore <- 0
-qc$qScoreI <- 0
-qc$qScoreD <- 0
-
-pVal.I <- rep(1, nrow(qc))
-pVal.D <- rep(1, nrow(qc))
-
-delSel <- qc$del != "."
-data.frame(matrix(unlist(strsplit(unlist(strsplit(as.vector(qc$del[delSel]), "\\|")), ":")), ncol=2, byrow=T)) -> dels
-dels$ln <- nchar(as.vector(dels$X1))
-colnames(dels) <- c("seq", "occ", "ln")
-delPees <- ppois(1:50, mean(rep(dels$ln, dels$occ)), lower.tail=F)
-
-insSel <- qc$ins != "."
-data.frame(matrix(unlist(strsplit(unlist(strsplit(as.vector(qc$ins[insSel]), "\\|")), ":")), ncol=2, byrow=T)) -> inss
-inss$ln <- nchar(as.vector(inss$X1))
-colnames(inss) <- c("seq", "occ", "ln")
-insPees <- ppois(1:50, mean(rep(dels$ln, dels$occ)), lower.tail=F)
-
-# Calculate errorrate per amplicon
-amps <- unique(qc$amp)
-for (i in amps) {
- print(i)
- sel <- qc$amp == i
- prob <- sum(qc$nVar[sel]) / (sum(qc$nRef[sel]) + sum(qc$nVar[sel]))
- #prob <- median(qc$nVar[sel] / (qc$nRef[sel] + qc$nVar[sel]))
- pees <- pbinom(qc$nVar[sel], qc$nRef[sel] + qc$nVar[sel], prob, lower.tail=F)
- qc$pVal[sel] <- pees
-# pois <- ppois(qc$nVar[sel], (qc$nRef + qc$nVar) * prob, lower.tail=F)
-# qc$pValPois[sel] <- pois
-
- # Seperate for indel/deletion events
- selDels <- delSel & sel
- if (sum(selDels) > 0) {
- data.frame(matrix(unlist(strsplit(unlist(strsplit(as.vector(qc$del[selDels]), "\\|")), ":")), ncol=2, byrow=T)) -> dels2
- dels2$ln <- nchar(as.vector(dels2$X1))
- colnames(dels2) <- c("seq", "occ", "ln")
- dels2$occ <- as.numeric(as.vector(dels2$occ))
- }
- for (j in which(selDels)) {
- data.frame(matrix(unlist(strsplit(unlist(strsplit(as.vector(qc$del[j]), "\\|")), ":")), ncol=2, byrow=T)) -> dels
- dels$ln <- nchar(as.vector(dels$X1))
- colnames(dels) <- c("seq", "occ", "ln")
- dels$occ <- as.numeric(as.vector(dels$occ))
- #dels$pDel <- delPees[as.integer(dels$ln)]
- #dels$pVal <- ppois(as.numeric(dels$occ), dels$pDel, lower.tail=F)
- dels$pDel <- sapply(dels$ln, function(x) {sum(dels2$occ[dels2$ln == x]) / (sum(qc$nRef[sel]) + sum(qc$nVar[sel])) })
- dels$pVal <- pbinom(dels$occ, qc$dp[j], dels$pDel, lower.tail=F)
- dels$Q <- -10 * log10(dels$pVal)
-
- high <- order(dels$occ, -dels$Q, decreasing=T)[1]
- qc$del[j] <- paste(dels$seq[high], dels$occ[high], sep=":")
- qc$qScoreD[j] <- dels$Q[high]
- pVal.D[j] <- dels$pVal[high]
- }
- selIns <- insSel & sel
- if (sum(selIns) > 0) {
- data.frame(matrix(unlist(strsplit(unlist(strsplit(as.vector(qc$ins[selIns]), "\\|")), ":")), ncol=2, byrow=T)) -> inss2
- inss2$ln <- nchar(as.vector(inss2$X1))
- colnames(inss2) <- c("seq", "occ", "ln")
- inss2$occ <- as.numeric(as.vector(inss2$occ))
- }
- for (j in which(selIns)) {
- data.frame(matrix(unlist(strsplit(unlist(strsplit(as.vector(qc$ins[j]), "\\|")), ":")), ncol=2, byrow=T)) -> inss
- inss$ln <- nchar(as.vector(inss$X1))
- colnames(inss) <- c("seq", "occ", "ln")
- inss$occ <- as.numeric(as.vector(inss$occ))
- #inss$pIns <- insPees[as.integer(inss$ln)]
- #inss$pVal <- ppois(as.numeric(inss$occ), inss$pIns, lower.tail=F)
- inss$pIns <- sapply(inss$ln, function(x) {sum(inss2$occ[inss2$ln == x]) / (sum(qc$nRef[sel]) + sum(qc$nVar[sel])) })
- inss$pVal <- pbinom(inss$occ, qc$dp[j], inss$pIns, lower.tail=F)
- inss$Q <- -10 * log10(inss$pVal)
-
- high <- order(inss$occ, -inss$Q, decreasing=T)[1]
- qc$ins[j] <- paste(inss$seq[high], inss$occ[high], sep=":")
- qc$qScoreI[j] <- inss$Q[high]
- pVal.I[j] <- inss$pVal[high]
- }
- print(sum(-10 * log10(pees) > 100))
- print(sum(qc$qScoreI[selIns] > 100))
- print(sum(qc$qScoreD[selDels] > 100))
-}
-
-p.adjust(qc$pVal) -> qc$pValAdj
--10 * log(p.adjust(pVal.I)) -> qc$qScoreI
--10 * log(p.adjust(pVal.D)) -> qc$qScoreD
-qc$qScore <- -10 * log10(qc$pValAdj)
-#qc$qScore[qc$pValAdj == 0] <- 1000
-qc$qScore[qc$qScore > 1000] <- 1000
-qc$qScoreI[qc$qScoreI > 1000] <- 1000
-qc$qScoreD[qc$qScoreD > 1000] <- 1000
-qsel <- (qc$qScore >= 100 & qc$nVar >= 10) | (qc$qScoreI >= 200) | (qc$qScoreD >= 200)
-dpfilt <- as.numeric(qc$dp) >= 100
-#varfilt <- as.numeric(qc$nVar) >= 10
-
-selection <- qsel & dpfilt
-
-print(paste("Filtered out: ", sum(!selection), sep=""))
-print(paste("Variants selected: ", sum(selection), sep=""))
-return(qc)
-}
-
-addQualTransCor <- function(qc, lociFilt) {
-
-
-# lociFilt contains snp, cosmic and clinvar positions
-
-qc$del <- as.character(as.vector(qc$del))
-#qc$del <- as.character(as.vector(qc$del))
-qc$ins <- as.character(as.vector(qc$ins))
-qc$pVal <- 1
-qc$pValAdj <- 1
-#qc$pValPois <- 1
-qc$qScore <- 0
-qc$qScoreI <- 0
-qc$qScoreD <- 0
-
-pVal.I <- rep(1, nrow(qc))
-pVal.D <- rep(1, nrow(qc))
-
-rle(as.vector(qc$ntRef)) -> srle
-hompols <- srle$lengths >= 5
-homPosS <- cumsum(srle$lengths)[hompols]
-homPosE <- cumsum(srle$lengths)[which(hompols) - 1]
-homFilt <- rep(F, nrow(qc))
-homFilt[homPosS] <- T
-homFilt[homPosE] <- T
-
-delSel <- qc$del != "." & !homFilt
-if (sum(delSel > 0)) {
-
-
-data.frame(matrix(unlist(strsplit(unlist(strsplit(qc$del[delSel], "\\|")), ":")), ncol=2, byrow=T)) -> dels
-dels$ln <- nchar(as.vector(dels$X1))
-colnames(dels) <- c("seq", "occ", "ln")
-delPees <- ppois(1:50, mean(rep(dels$ln, dels$occ)), lower.tail=F)
-}
-
-insSel <- qc$ins != "." & !homFilt
-if (sum(insSel) > 0) {
-data.frame(matrix(unlist(strsplit(unlist(strsplit(qc$ins[insSel], "\\|")), ":")), ncol=2, byrow=T)) -> inss
-inss$ln <- nchar(as.vector(inss$X1))
-colnames(inss) <- c("seq", "occ", "ln")
-insPees <- ppois(1:50, mean(rep(inss$ln, inss$occ)), lower.tail=F)
-}
-# Calculate transition bias
-
-
-#calcTransCor(qc) -> pCor
-globalProb <- sum(qc$nVar) / (sum(qc$nRef) + sum(qc$nVar))
-# Calculate errorrate per amplicon
-amps <- unique(qc$amp)
-for (i in amps) {
- print(i)
- sel <- qc$amp == i
- prob1 <- sum(qc$nVar[sel]) / (sum(qc$nRef[sel]) + sum(qc$nVar[sel]))
- prob <- prob1 * calcTransCor(qc[sel,], lociFilt)
- pees <- pbinom(qc$nVar[sel], qc$nRef[sel] + qc$nVar[sel], prob , lower.tail=F)
- qc$pVal[sel] <- pees
-# pois <- ppois(qc$nVar[sel], (qc$nRef + qc$nVar) * prob, lower.tail=F)
-# qc$pValPois[sel] <- pois
-
- # Seperate for indel/deletion events
- selDels <- delSel & sel
- if (sum(selDels) > 0) {
- data.frame(matrix(unlist(strsplit(unlist(strsplit(as.vector(qc$del[selDels]), "\\|")), ":")), ncol=2, byrow=T)) -> dels2
- dels2$ln <- nchar(as.vector(dels2$X1))
- colnames(dels2) <- c("seq", "occ", "ln")
- dels2$occ <- as.numeric(as.vector(dels2$occ))
-
- for (j in which(selDels)) {
- data.frame(matrix(unlist(strsplit(unlist(strsplit(as.vector(qc$del[j]), "\\|")), ":")), ncol=2, byrow=T)) -> dels
- dels$ln <- nchar(as.vector(dels$X1))
- colnames(dels) <- c("seq", "occ", "ln")
- dels$occ <- as.numeric(as.vector(dels$occ))
- #dels$pDel <- delPees[as.integer(dels$ln)]
- #dels$pVal <- ppois(as.numeric(dels$occ), dels$pDel, lower.tail=F)
- dels$pDel <- sapply(dels$ln, function(x) {sum(dels2$occ[dels2$ln == x]) / (sum(qc$nRef[sel]) + sum(qc$nVar[sel])) })
- dels$pVal <- pbinom(dels$occ, qc$dp[j], dels$pDel, lower.tail=F)
- dels$Q <- -10 * log10(dels$pVal)
-
- high <- order(dels$occ, -dels$Q, decreasing=T)[1]
- qc$del[j] <- paste(dels$seq[high], dels$occ[high], sep=":")
- qc$qScoreD[j] <- dels$Q[high]
- pVal.D[j] <- dels$pVal[high]
- }
- }
- selIns <- insSel & sel
- if (sum(selIns) > 0) {
- data.frame(matrix(unlist(strsplit(unlist(strsplit(as.vector(qc$ins[selIns]), "\\|")), ":")), ncol=2, byrow=T)) -> inss2
- inss2$ln <- nchar(as.vector(inss2$X1))
- colnames(inss2) <- c("seq", "occ", "ln")
- inss2$occ <- as.numeric(as.vector(inss2$occ))
-
- for (j in which(selIns)) {
- data.frame(matrix(unlist(strsplit(unlist(strsplit(as.vector(qc$ins[j]), "\\|")), ":")), ncol=2, byrow=T)) -> inss
- inss$ln <- nchar(as.vector(inss$X1))
- colnames(inss) <- c("seq", "occ", "ln")
- inss$occ <- as.numeric(as.vector(inss$occ))
- #inss$pIns <- insPees[as.integer(inss$ln)]
- #inss$pVal <- ppois(as.numeric(inss$occ), inss$pIns, lower.tail=F)
- inss$pIns <- sapply(inss$ln, function(x) {sum(inss2$occ[inss2$ln == x]) / (sum(qc$nRef[sel]) + sum(qc$nVar[sel])) })
- inss$pVal <- pbinom(inss$occ, qc$dp[j], inss$pIns, lower.tail=F)
- inss$Q <- -10 * log10(inss$pVal)
-
- high <- order(inss$occ, -inss$Q, decreasing=T)[1]
- qc$ins[j] <- paste(inss$seq[high], inss$occ[high], sep=":")
- qc$qScoreI[j] <- inss$Q[high]
- pVal.I[j] <- inss$pVal[high]
-} }
- print(sum(-10 * log10(pees) > 100))
- print(sum(qc$qScoreI[selIns] > 100))
- print(sum(qc$qScoreD[selDels] > 100))
-}
-
-p.adjust(qc$pVal) -> qc$pValAdj
--10 * log(p.adjust(pVal.I)) -> qc$qScoreI
--10 * log(p.adjust(pVal.D)) -> qc$qScoreD
-qc$qScore <- -10 * log10(qc$pValAdj)
-#qc$qScore[qc$pValAdj == 0] <- 1000
-qc$qScore[qc$qScore > 1000] <- 1000
-qc$qScoreI[qc$qScoreI > 1000] <- 1000
-qc$qScoreD[qc$qScoreD > 1000] <- 1000
-qsel <- (qc$qScore >= 100 & qc$nVar >= 10) | (qc$qScoreI >= 200) | (qc$qScoreD >= 200)
-dpfilt <- as.numeric(qc$dp) >= 100
-#varfilt <- as.numeric(qc$nVar) >= 10
-
-selection <- qsel & dpfilt
-
-print(paste("Filtered out: ", sum(!selection), sep=""))
-print(paste("Variants selected: ", sum(selection), sep=""))
-return(qc)
-}
-
-
-addQualVarCor <- function(qc) {
-# Corrected for variant occurence
-
-qc$del <- as.character(as.vector(qc$del))
-qc$ins <- as.vector(qc$ins)
-qc$pVal <- 1
-qc$pValAdj <- 1
-#qc$pValPois <- 1
-qc$qScore <- 0
-qc$qScoreI <- 0
-qc$qScoreD <- 0
-
-pVal.I <- rep(1, nrow(qc))
-pVal.D <- rep(1, nrow(qc))
-
-delSel <- qc$del != "."
-data.frame(matrix(unlist(strsplit(unlist(strsplit(as.vector(qc$del[delSel]), "\\|")), ":")), ncol=2, byrow=T)) -> dels
-dels$ln <- nchar(as.vector(dels$X1))
-colnames(dels) <- c("seq", "occ", "ln")
-delPees <- ppois(1:50, mean(rep(dels$ln, dels$occ)), lower.tail=F)
-
-insSel <- qc$ins != "."
-data.frame(matrix(unlist(strsplit(unlist(strsplit(as.vector(qc$ins[insSel]), "\\|")), ":")), ncol=2, byrow=T)) -> inss
-inss$ln <- nchar(as.vector(inss$X1))
-colnames(inss) <- c("seq", "occ", "ln")
-insPees <- ppois(1:50, mean(rep(dels$ln, dels$occ)), lower.tail=F)
-
-# Calculate base bias per run
-sum(qc$nA[grepl("A", qc$ntRef, ignore.case=T)]) -> refA
-sum(qc$nA[!grepl("A", qc$ntRef, ignore.case=T)]) -> varA
-sum(qc$nC[grepl("C", qc$ntRef, ignore.case=T)]) -> refC
-sum(qc$nC[!grepl("C", qc$ntRef, ignore.case=T)]) ->varC
-sum(qc$nG[grepl("G", qc$ntRef, ignore.case=T)]) -> refG
-sum(qc$nG[!grepl("G", qc$ntRef, ignore.case=T)]) -> varG
-sum(qc$nT[grepl("T", qc$ntRef, ignore.case=T)]) -> refT
-sum(qc$nT[!grepl("T", qc$ntRef, ignore.case=T)]) -> varT
-
-varA / (refA + varA) -> errA
-varC / (refC + varC) -> errC
-varG / (refG + varG) -> errG
-varT / (refT + varT) -> errT
-c(errA, errC, errG, errT) / mean(c(errA, errC, errG, errT)) -> errACGT
-
-errMat <- qc[,9:12]
-errMat[grepl("A", qc$ntRef, ignore.case=T),1] <- 0
-errMat[grepl("C", qc$ntRef, ignore.case=T),2] <- 0
-errMat[grepl("G", qc$ntRef, ignore.case=T),3] <- 0
-errMat[grepl("T", qc$ntRef, ignore.case=T),4] <- 0
-
-pCor <- rep(1, nrow(qc))
-pCor[grepl("^A", qc$ntVar, ignore.case=T)] <- errACGT[1]
-pCor[grepl("^C", qc$ntVar, ignore.case=T)] <- errACGT[2]
-pCor[grepl("^G", qc$ntVar, ignore.case=T)] <- errACGT[3]
-pCor[grepl("^T", qc$ntVar, ignore.case=T)] <- errACGT[4]
-
-# Calculate errorrate per amplicon
-amps <- unique(qc$amp)
-for (i in amps) {
- print(i)
- sel <- qc$amp == i
- prob <- sum(qc$nVar[sel]) / (sum(qc$nRef[sel]) + sum(qc$nVar[sel]))
- #prob <- median(qc$nVar[sel] / (qc$nRef[sel] + qc$nVar[sel]))
- pees <- pbinom(qc$nVar[sel], qc$nRef[sel] + qc$nVar[sel], prob * pCor[sel], lower.tail=F)
- qc$pVal[sel] <- pees
-# pois <- ppois(qc$nVar[sel], (qc$nRef + qc$nVar) * prob, lower.tail=F)
-# qc$pValPois[sel] <- pois
-
- # Seperate for indel/deletion events
- selDels <- delSel & sel
- if (sum(selDels) > 0) {
- data.frame(matrix(unlist(strsplit(unlist(strsplit(as.vector(qc$del[selDels]), "\\|")), ":")), ncol=2, byrow=T)) -> dels2
- dels2$ln <- nchar(as.vector(dels2$X1))
- colnames(dels2) <- c("seq", "occ", "ln")
- dels2$occ <- as.numeric(as.vector(dels2$occ))
- }
- for (j in which(selDels)) {
- data.frame(matrix(unlist(strsplit(unlist(strsplit(as.vector(qc$del[j]), "\\|")), ":")), ncol=2, byrow=T)) -> dels
- dels$ln <- nchar(as.vector(dels$X1))
- colnames(dels) <- c("seq", "occ", "ln")
- dels$occ <- as.numeric(as.vector(dels$occ))
- #dels$pDel <- delPees[as.integer(dels$ln)]
- #dels$pVal <- ppois(as.numeric(dels$occ), dels$pDel, lower.tail=F)
- dels$pDel <- sapply(dels$ln, function(x) {sum(dels2$occ[dels2$ln == x]) / (sum(qc$nRef[sel]) + sum(qc$nVar[sel])) })
- dels$pVal <- pbinom(dels$occ, qc$dp[j], dels$pDel, lower.tail=F)
- dels$Q <- -10 * log10(dels$pVal)
-
- high <- order(dels$occ, -dels$Q, decreasing=T)[1]
- qc$del[j] <- paste(dels$seq[high], dels$occ[high], sep=":")
- qc$qScoreD[j] <- dels$Q[high]
- pVal.D[j] <- dels$pVal[high]
- }
- selIns <- insSel & sel
- if (sum(selIns) > 0) {
- data.frame(matrix(unlist(strsplit(unlist(strsplit(as.vector(qc$ins[selIns]), "\\|")), ":")), ncol=2, byrow=T)) -> inss2
- inss2$ln <- nchar(as.vector(inss2$X1))
- colnames(inss2) <- c("seq", "occ", "ln")
- inss2$occ <- as.numeric(as.vector(inss2$occ))
- }
- for (j in which(selIns)) {
- data.frame(matrix(unlist(strsplit(unlist(strsplit(as.vector(qc$ins[j]), "\\|")), ":")), ncol=2, byrow=T)) -> inss
- inss$ln <- nchar(as.vector(inss$X1))
- colnames(inss) <- c("seq", "occ", "ln")
- inss$occ <- as.numeric(as.vector(inss$occ))
- #inss$pIns <- insPees[as.integer(inss$ln)]
- #inss$pVal <- ppois(as.numeric(inss$occ), inss$pIns, lower.tail=F)
- inss$pIns <- sapply(inss$ln, function(x) {sum(inss2$occ[inss2$ln == x]) / (sum(qc$nRef[sel]) + sum(qc$nVar[sel])) })
- inss$pVal <- pbinom(inss$occ, qc$dp[j], inss$pIns, lower.tail=F)
- inss$Q <- -10 * log10(inss$pVal)
-
- high <- order(inss$occ, -inss$Q, decreasing=T)[1]
- qc$ins[j] <- paste(inss$seq[high], inss$occ[high], sep=":")
- qc$qScoreI[j] <- inss$Q[high]
- pVal.I[j] <- inss$pVal[high]
- }
- print(sum(-10 * log10(pees) > 100))
- print(sum(qc$qScoreI[selIns] > 100))
- print(sum(qc$qScoreD[selDels] > 100))
-}
-
-p.adjust(qc$pVal) -> qc$pValAdj
--10 * log(p.adjust(pVal.I)) -> qc$qScoreI
--10 * log(p.adjust(pVal.D)) -> qc$qScoreD
-qc$qScore <- -10 * log10(qc$pValAdj)
-#qc$qScore[qc$pValAdj == 0] <- 1000
-qc$qScore[qc$qScore > 1000] <- 1000
-qc$qScoreI[qc$qScoreI > 1000] <- 1000
-qc$qScoreD[qc$qScoreD > 1000] <- 1000
-qsel <- (qc$qScore >= 100 & qc$nVar >= 10) | (qc$qScoreI >= 200) | (qc$qScoreD >= 200)
-dpfilt <- as.numeric(qc$dp) >= 100
-#varfilt <- as.numeric(qc$nVar) >= 10
-
-selection <- qsel & dpfilt
-
-print(paste("Filtered out: ", sum(!selection), sep=""))
-print(paste("Variants selected: ", sum(selection), sep=""))
-return(qc)
-}
-
-#addQual(qc) -> qc
-#write.table(qc, output, row.names=F, col.names=T, quote=F, sep="\t")
-#write.table(qc[selection,], output, row.names=F, col.names=T, quote=F, sep="\t")
-
-calcTrans <- function(qcRaw, lociFilt = NULL) {
-
-sel <- rep(T, nrow(qcRaw))
-if (!is.null(lociFilt)) {
-paste(qcRaw$X.chr, qcRaw$pos, sep=":") -> qcpos
-sel <- ! qcpos %in% lociFilt
-}
-qc <- qcRaw[sel,]
-
-print(sum(sel))
-
-# Calculate transition bias
-refA <- grepl("A", qc$ntRef, ignore.case=T)
-refC <- grepl("C", qc$ntRef, ignore.case=T)
-refG <- grepl("G", qc$ntRef, ignore.case=T)
-refT <- grepl("T", qc$ntRef, ignore.case=T)
-varA <- grepl("A", qc$ntVar, ignore.case=T)
-varC <- grepl("C", qc$ntVar, ignore.case=T)
-varG <- grepl("G", qc$ntVar, ignore.case=T)
-varT <- grepl("T", qc$ntVar, ignore.case=T)
-varN <- grepl(".", qc$ntVar, ignore.case=T)
-callA <- grepl("^A", qc$ntVar, ignore.case=T)
-callC <- grepl("^C", qc$ntVar, ignore.case=T)
-callG <- grepl("^G", qc$ntVar, ignore.case=T)
-callT <- grepl("^T", qc$ntVar, ignore.case=T)
-
-callList <- list(callA, callC, callG, callT)
-varList <- list(varA, varC, varG, varT)
-refList <- list(refA, refC, refG, refT)
-
-#tots <- c(
-#rep(sum(qc$nRef[refA], qc$nVar[refA]), 4),
-#rep(sum(qc$nRef[refC], qc$nVar[refC]), 4),
-#rep(sum(qc$nRef[refG], qc$nVar[refG]), 4),
-#rep(sum(qc$nRef[refT], qc$nVar[refT]), 4))
-
-nvar <- c(
-rep(sum(qc$nC[refA], qc$nG[refA], qc$nT[refA]), 4),
-rep(sum(qc$nA[refC], qc$nG[refC], qc$nT[refC]), 4),
-rep(sum(qc$nA[refG], qc$nC[refG], qc$nT[refG]), 4),
-rep(sum(qc$nA[refT], qc$nC[refT], qc$nG[refT]), 4))
-
-ncall <- c(
-rep(sum(qc$nVar[refA]), 4),
-rep(sum(qc$nVar[refC]), 4),
-rep(sum(qc$nVar[refG]), 4),
-rep(sum(qc$nVar[refT]), 4))
-
-nref <- c(
-rep(sum(qc$nRef[refA]), 4),
-rep(sum(qc$nRef[refC]), 4),
-rep(sum(qc$nRef[refG]), 4),
-rep(sum(qc$nRef[refT]), 4))
-
-tots <- nref + nvar
-
-nts <- c("A", "C", "G", "T")
-sumTrans <- data.frame(trans=paste(rep(nts, each=4), rep(nts, 4), sep=":"), cnt=rep(0, 16), tot=tots, nvar=nvar, nref=nref, ncall=ncall)
-
-for (i in 1:4) {
- pos <- (i - 1) * 4
-for (j in 1:4) {
-# print(sum(refList[[i]] & varList[[j]]))
-
- cnt <- sum(qc[refList[[i]] & varList[[j]], 8 + j])
- if (i == j) {
-# cnt <- sum(qc[refList[[i]] & varN, 8 + j])
- cnt <- NA
- }
- sumTrans$cnt[pos + j] <- cnt
- print(cnt)
-}}
-
-#sumTrans$cnt / sumTrans$nvar / sumTrans$tot * 1000000 -> tmp
-sumTrans$cnt / sumTrans$tot * 1000000 -> tmp
-
-#totalNT <- sum(qc$nVar) + sum(qc$nRef)
-#sumTrans$cnt / median(sumTrans$cnt, na.rm=T) -> sumTrans$cor
-sumTrans$cnt / median(sumTrans$cnt, na.rm=T) -> sumTrans$cor
-tmp / median(tmp, na.rm=T) -> sumTrans$cor2
-#sumTrans$cnt / totalNT -> sumTrans$cor
-#sumTrans$cnt / min(sumTrans$cnt, na.rm=T) -> sumTrans$cor
-
-
-return(sumTrans)
-
-}
-
-calcTransCor <- function(qcRaw, lociFilt = NULL) {
-
-calcTrans(qcRaw, lociFilt) -> sumTrans
-
-pCor <- rep(1, nrow(qcRaw))
-paste(qcRaw$ntRef, substr(qcRaw$ntVar, 1, 1), sep=":") -> qcTrans
-
-for (i in c("A", "C", "G", "T")) {
-for (j in c("A", "C", "G", "T")) {
- if (i != j) {
- tTrans <- paste(i, j, sep=":")
- pCor[grepl(tTrans, qcTrans, ignore.case=T)] <- sumTrans$cor2[sumTrans$trans == tTrans]
- }
-
-}}
-return(pCor)
-}
-
-
-calcTransCorOld <- function(qc) {
-# Calculate transition bias
-refA <- grepl("A", qc$ntRef, ignore.case=T)
-refC <- grepl("C", qc$ntRef, ignore.case=T)
-refG <- grepl("G", qc$ntRef, ignore.case=T)
-refT <- grepl("T", qc$ntRef, ignore.case=T)
-varA <- grepl("A", qc$ntVar, ignore.case=T)
-varC <- grepl("C", qc$ntVar, ignore.case=T)
-varG <- grepl("G", qc$ntVar, ignore.case=T)
-varT <- grepl("T", qc$ntVar, ignore.case=T)
-varN <- grepl(".", qc$ntVar, ignore.case=T)
-callA <- grepl("^A", qc$ntVar, ignore.case=T)
-callC <- grepl("^C", qc$ntVar, ignore.case=T)
-callG <- grepl("^G", qc$ntVar, ignore.case=T)
-callT <- grepl("^T", qc$ntVar, ignore.case=T)
-
-callList <- list(callA, callC, callG, callT)
-varList <- list(varA, varC, varG, varT)
-refList <- list(refA, refC, refG, refT)
-
-tots <- c(
-rep(sum(qc$nRef[refA], qc$nVar[refA]), 4),
-rep(sum(qc$nRef[refC], qc$nVar[refC]), 4),
-rep(sum(qc$nRef[refG], qc$nVar[refG]), 4),
-rep(sum(qc$nRef[refT], qc$nVar[refT]), 4))
-
-nvar <- c(
-rep(sum(qc$nVar[refA]), 4),
-rep(sum(qc$nVar[refC]), 4),
-rep(sum(qc$nVar[refG]), 4),
-rep(sum(qc$nVar[refT]), 4))
-
-nref <- c(
-rep(sum(qc$nRef[refA]), 4),
-rep(sum(qc$nRef[refC]), 4),
-rep(sum(qc$nRef[refG]), 4),
-rep(sum(qc$nRef[refT]), 4))
-
-nts <- c("A", "C", "G", "T")
-sumTrans <- data.frame(trans=paste(rep(nts, each=4), rep(nts, 4), sep=":"), cnt=rep(0, 16), tot=tots, nvar=nvar, nref=nref)
-
-for (i in 1:4) {
- pos <- (i - 1) * 4
-for (j in 1:4) {
-# print(sum(refList[[i]] & varList[[j]]))
-
- cnt <- sum(qc[refList[[i]] & varList[[j]], 8 + j])
- if (i == j) {
-# cnt <- sum(qc[refList[[i]] & varN, 8 + j])
- cnt <- NA
- }
- sumTrans$cnt[pos + j] <- cnt
- print(cnt)
-}}
-#totalNT <- sum(qc$nVar) + sum(qc$nRef)
-sumTrans$cnt / median(sumTrans$cnt, na.rm=T) -> sumTrans$cor
-#sumTrans$cnt / totalNT -> sumTrans$cor
-#sumTrans$cnt / min(sumTrans$cnt, na.rm=T) -> sumTrans$cor
-print(sumTrans)
-
-pCor <- rep(1, nrow(qc))
-for (i in 1:4) {
- pos <- (i - 1) * 4
-for (j in 1:4) {
- if (i != j) {
- pCor[refList[[i]] & callList[[j]]] <- sumTrans$cor[pos + j]
- }
-
-}}
-return(pCor)
-}
diff -r f1b32c09deaf -r a793ffd0cd04 falco/misc/getData.pl
--- a/falco/misc/getData.pl Fri Oct 10 11:28:57 2014 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,140 +0,0 @@
-#!/usr/bin/perl
-
-use strict;
-
-# Read in path(s) and run bcl2fastq to fetch data into appropriate directories
-
-my $runDir = shift;
-my $project = shift || die "Project required";
-my $baseDir = shift || "/ccagc/data/SeqAnalysis/$project";
-my $dry = 0;
-my $outDir = undef;
-(my $FCID = $runDir) =~ s/.*_(.*?)\/?$/$1/;
-
-if (not defined($outDir)) {
- ($outDir = $runDir) =~ s/.*\/(.*_.*_.*)_.*/$1/;
- $outDir = $baseDir ."/". $outDir;
-}
-
-print STDERR localtime() . " : PATH: " . $runDir . "\n";
-print STDERR localtime() . " : BASEDIR: " . $baseDir . "\n";
-print STDERR localtime() . " : OUTDIR: " . $outDir . "\n";
-print STDERR localtime() . " : FCID: " . $FCID . "\n";
-
-# open samplesheet
-# Check run type and convert for bcl2fastq
-my $col = 0;
-
-my @headOut = qw/FCID Lane SampleID SampleRef Index Description Control Recipe Operator SampleProject/;
-my $Lane = 1;
-my $SampleRef = "human";
-my $Control = "N";
-my $SampleProject = $project;
-my %header = ();
-open SAMPLE, "<$runDir/SampleSheet.csv" or die;
-while () {
- chomp;
- s/\s+$//;
- my @row = split(/,/, $_);
-# if (/^(.*),(.*)$/) {
-# $header{$1} = $2;
-# }
- $header{$row[0]} = $row[1];
- last if (/\[Data\]/);
-}
-
-if ($header{Assay} !~ /Amplicon/) {
- print STDERR localtime() . " : Wrong assay, exiting.\n";
- exit;
-}
-
-# Skipping project name check for now
-#if ($header{"Project Name"} !~ /$project/) {
-# print STDERR localtime() . " : Wrong project, exiting.\n";
-# exit;
-#}
-
-my $Operator = $header{"Investigator Name"};
-my $Recipe = $header{"Workflow"};
-
-my $head = readline(SAMPLE);
-chomp $head;
-my %headIn = map { $_ => $col++ } split(/,/, $head);#"Sample_ID,Sample_Name,Sample_Plate,Sample_Well,I7_Index_ID,index,I5_Index_ID,index2,Sample_Project,Description,Manifest,GenomeFolder");
-
-my @samples = ();
-my @ctrls = ();
-while () {
- chomp;
- my @row = split(/,/, $_);
- print STDERR localtime() . " : " . join(":", @row) . "\n";
- my $bc = substr($row[$headIn{index}],0,7) ."-". substr($row[$headIn{index2}],0,7);
- if ($row[$headIn{Sample_Project}] != /$SampleProject/) {
- push @samples, join(",", $FCID, $Lane, $row[$headIn{Sample_ID}], $SampleRef, $bc, $row[$headIn{Description}], $Control, $Recipe, $Operator, $SampleProject) . "\n";
- }
- elsif ($row[$headIn{Sample_Project}] != /CONTROL/i) {
- push @ctrls, join(",", $FCID, $Lane, $row[$headIn{Sample_ID}], $SampleRef, $bc, $row[$headIn{Description}], $Control, $Recipe, $Operator, $SampleProject) . "\n";
- }
-
-}
-close SAMPLE;
-
-if ($dry == 1) {
- print STDERR localtime() . " : mkdir $outDir\n";
- print STDERR localtime() . " : " . join(",", @headOut) . "\n";
- print STDERR localtime() . " : " . join("", @samples);
-}
-elsif (-e $outDir) {
- print STDERR localtime() . " : Folder $outDir exists, exiting\n";
- exit;
-}
-elsif ($#samples >= 0) {
- print STDERR localtime() . " : mkdir $outDir\n";
- system("mkdir -p $outDir");
- open SAMPLEOUT, ">$outDir/SampleSheet.csv";
- print SAMPLEOUT join(",", @headOut) . "\n";
- print STDERR localtime() . " : " . join(",", @headOut) . "\n";
- print SAMPLEOUT join("", @samples, @ctrls);
- print STDERR localtime() . " : " . join("", @samples, @ctrls);
- close SAMPLEOUT;
-}
-else {
- # Nothing to do exit!
- print STDERR localtime() . " : Nothing to do for $SampleProject, exiting!";
- exit;
-}
-
-
-# run configureBCL2fastq
-my $bcl2fqbin = "/usr/local/bin/configureBclToFastq.pl";
-my $bcl2fq = "$bcl2fqbin --no-eamss --input-dir $runDir/Data/Intensities/BaseCalls --output-dir $outDir/Unaligned --sample-sheet $outDir/SampleSheet.csv --mismatches 0 --fastq-cluster-count 0";
-my $make = "cd $outDir/Unaligned && nohup make && touch bcl2fq.done && cd ..";
-
-print STDERR localtime() . " : $bcl2fq\n";
-print STDERR localtime() . " : $make\n";
-if ($dry == 1) {
- exit;
-}
-
-my $pid1 = open BCL2FQ, "$bcl2fq 2>&1 |";
-while () {
- print STDERR localtime() . " - $_";
-}
-close BCL2FQ;
-
-my $pid2 = open MAKE, "$make 2>&1 |";
-while () {
- print STDERR localtime() . " - $_";
-}
-close MAKE;
-
-
-#my $rc1 = 0; #system($bcl2fq);
-# run make
-#if ($rc1 == 0) {
-# my $rc2 = system($make);
-#}
-#else {
-# print STDERR localtime() . " : Error or warning encountered :$rc1\n";
-# print STDERR localtime() . " : Run make manually\n";
-# print STDERR localtime() . " : $make\n";
-#}
diff -r f1b32c09deaf -r a793ffd0cd04 falco/misc/mkHtmlReport.pl
--- a/falco/misc/mkHtmlReport.pl Fri Oct 10 11:28:57 2014 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,201 +0,0 @@
-#!/usr/bin/perl -w
-
-use strict;
-use Spreadsheet::WriteExcel;
-
-$| = 1;
-my $dir = shift;
-my $outdir = shift || "./";
-my $pat = shift || "";
-
-# QC
-# Results
-my @samples = ();
-open INDEX, ">$outdir/index.html";
-my $htmlHead = qq(
-
-
-
-
-
-
-);
-print INDEX $htmlHead;
-opendir DIR, "$dir";
-while (my $cd = readdir DIR) {
- if ($cd =~ /(.*$pat)\.qc\.targets\.txt$/) {
- my $sam = $1;
-# next if ($sam =~ /R[12]/);
- print STDERR $1 . "\n";
- push @samples, $1;
- }
-}
-close DIR;
-
-print INDEX "";
-print INDEX "Sample BAM snp indel readCnt Amp > 100 \n";
-#foreach my $sam (sort @samples) {
-# print INDEX "$sam BAM BAI \n";
-
-#}
-my %link = ();
-
-foreach my $sample (sort @samples) {
- print STDERR "Processing $sample\n";
-# next if ($sample =~ /R[12]/);
-
- my $readCnt = 0;
- my $amp100 = 0;
-
- open OUT, ">$outdir/$sample.html";
- open OUT2, ">$outdir/$sample.tsv";
- my $excelBook = Spreadsheet::WriteExcel->new("$outdir/$sample.xls");
- my $excel = $excelBook->add_worksheet();
- print OUT $htmlHead;
- my %QC = ();
- open QC, "<$dir/$sample.qc.targets.txt";
- readline QC;
- print STDERR "Reading in $sample.qc.targets.txt\n";
- while () {
- chomp;
- my @row = split(/\t/, $_);
- my @id = split(/[\_\.\-:]/, $row[0]);
- $readCnt += $row[-1]; # DP
- $amp100++ if ($row[-1] >= 100); #
- if ($#id != 10) {
- $id[0] =~ /(\D+)(\d+)/;
- $id[0] = $2;
- unshift @id, $1;
- }
- $QC{$row[0]}{QC} = [@id, @row];# if ($id[0]);
- foreach my $c ($row[4] .. $row[5]) {
- $link{$id[-3] . ":" . $c}{$row[0]} = "Assay";
- }
- foreach my $c ($row[2] .. $row[4], $row[5] .. $row[3]) {
- $link{$id[-3] . ":" . $c}{$row[0]} = "LSO";
- }
- }
- close QC;
-
- open RES, "<$dir/$sample.res.filtered.tsv" or die "Unable to open $dir/$sample\n";
- my %uniq = ();
- my $colCnt = 0;
- my $resHead = readline(RES);
- chomp $resHead;
- $resHead =~ s/^#//;
- $resHead =~ s/\s+$//;
- my %resCol = map { $_ => $colCnt++ } split(/\t/, $resHead);
- my @keyColsN = qw/QUAL Gene_Name Codon_Change Amino_Acid_change vaf DP AD Tag CHROM POS ID REF ALT Context Effect_Impact Functional_Class Amino_Acid_length Gene_Coding Transcript_ID Exon_Rank/;
- #my @keyColsN = qw/QUAL Gene_Name Codon_Change Amino_Acid_change vaf DP AD Tag CHROM POS ID REF ALT Context Effect_Impact Functional_Class Amino_Acid_length Coding Transcript Exon/;
-# my @keyColsN = qw/CHROM POS ID REF ALT QUAL DP AD vaf Context Effect_Impact Functional_Class Codon_Change Amino_Acid_change Amino_Acid_length Gene_Name Coding Transcript Exon Tag/;
- my @keyColsI = map { $resCol{$_} } @keyColsN;
-# print STDERR join(":", @keyColsN) . "\n";
-# print STDERR join(":", @keyColsI) . "\n";
-# sleep 10;
- print STDERR "Processing results\n";
- while () {
- chomp;
- my @row = split(/\t/, $_);
- my $cpos = join(":", @row[0, 1]);
- if (exists $link{$cpos}) {
-# my $key = join(":", @row[0 .. 4,6, 9,10,11,12,14 .. 21]);
- my $key = join(":", @row[@keyColsI]);
- #print STDERR join(":", @keyColsI) . "\n";
-# print STDERR join(":", @row) . "\n"; sleep 1;
- if (not exists($uniq{$key})) {
- foreach my $locus (keys(%{$link{$cpos}})) {
- next if ($link{$cpos}{$locus} eq "LSO");
- push @{$QC{$locus}{RES}}, [@row, $link{$cpos}{$locus}];
- # print STDERR "Adding $key to $locus\n\n"; #sleep 1;
- }
- $uniq{$key} = 0;
- }
- else {
- # print STDERR $key . " : Exists\n\n"; #sleep 1;
- }
- }
- }
- close RES;
- print OUT " ";
- print OUT " ";
- print OUT " ";
- print OUT " ";
- print OUT " ";
- print OUT " ";
- print OUT " ";
- print OUT " ";
- print OUT " ";
- print OUT " ";
- #print OUT " ";
- print OUT "Download: TSV ";
- print OUT "XLS
";
- print OUT "\n";
-# my @colnames = qw/depth chr pos id ref var qual DP AD vaf context impact effectClass codonChange AAChange RefSeqLength geneName RefSeqClass RefSeqID Exon Tag/;
- my @colnames = ("depth", @keyColsN);
- print OUT "" . join("", map { "$_ " } ("Amplicon", "c","c2", "b", @colnames)) . " ";
- print OUT2 join("\t", "Amplicon", @colnames) . "\n";
- my $excelAref = [["Amplicon", @colnames]];
- my %rescnt = ();
- foreach my $locus (sort keys(%QC)) {
- # my @targets = keys(%{$QC{$locus}{QC}});
-# print OUT "";
- my $nres = 1;
- $nres = scalar(@{$QC{$locus}{RES}}) if ($QC{$locus}{RES});
-
- print OUT "$locus \n";
- print OUT "c \n";
- print OUT "c2 \n";
- print OUT "b \n";
- print OUT "$QC{$locus}{QC}->[-1] \n"; #";
-
- foreach my $res (@{$QC{$locus}{RES}}) {
- print OUT " \n";
- print OUT join(" \n", @{$res}[@keyColsI]);
- print OUT " \n";
- print OUT2 join("\t", $locus, $QC{$locus}{QC}->[-1], @{$res}[@keyColsI]) . "\n";
- push @$excelAref, [$locus, $QC{$locus}{QC}->[-1], @{$res}[@keyColsI]];
- my $pl = $res->[$keyColsI[7]];
- my $ref = $res->[$keyColsI[11]];
- my $var = $res->[$keyColsI[12]];
-# print STDERR "$pl $ref $var\n"; sleep 1;
- if (length($ref) == length($var)) {
- $rescnt{$pl . "snp"}++;
- }
- else {
- $rescnt{$pl . "indel"}++;
- }
-# $rescnt++;
- }
- print STDERR $locus . ":" . $nres . "\n";
- print STDERR $locus . ":" . join("-", @{$QC{$locus}{RES}}) . "\n";
- if (scalar(@{$QC{$locus}{RES}}) == 0) { #$nres == 0) {
- print OUT2 join("\t", $locus, $QC{$locus}{QC}->[-1], ("-") x scalar(@keyColsI)) . "\n";
- push @$excelAref, [$locus, $QC{$locus}{QC}->[-1], ("-") x scalar(@keyColsI)];
- }
- print OUT " \n";
- }
- print OUT "
\n";
- print OUT "\n";
- close OUT;
- close OUT2;
- $excel->write_col(0,0, $excelAref);
- $excelBook->close();
- my @resCnt = map { $rescnt{$_} || 0 } qw/Customsnp Customindel/;
- my $pctGood = sprintf("%.2f", $amp100 / scalar(keys(%QC)) * 100);
- print INDEX "$sample BAM $resCnt[0] $resCnt[1] $readCnt $pctGood \n";
-
-}
-
-print INDEX "
";
-print INDEX " ";
-print INDEX " ";
-print INDEX " ";
-print INDEX "