Mercurial > repos > mingchen0919 > aurora_fastqc_site
changeset 1:645291efd2e7 draft
working version
author | mingchen0919 |
---|---|
date | Tue, 27 Feb 2018 10:36:24 -0500 |
parents | f74fdae99053 |
children | 7abe0e473013 |
files | 01_evaluation_overview.Rmd 02_per_base_sequence_quality.Rmd 03_per_tile_sequence_quality.Rmd 04_per_sequence_quality_score.Rmd 05_per_base_sequence_content.Rmd 06_per_sequence_gc_content.Rmd 07_per_base_n_content.Rmd 08_sequence_length_distribution.Rmd 09_sequence_duplication_levels.Rmd 10_adapter_content.Rmd 11_kmer_content.Rmd _site.yml fastqc_report.Rmd fastqc_site.xml fastqc_site_render.R |
diffstat | 15 files changed, 482 insertions(+), 391 deletions(-) [+] |
line wrap: on
line diff
--- a/01_evaluation_overview.Rmd Tue Feb 27 00:39:56 2018 -0500 +++ b/01_evaluation_overview.Rmd Tue Feb 27 10:36:24 2018 -0500 @@ -1,5 +1,20 @@ +--- +title: 'Short reads evaluation with [FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/)' +output: + html_document: + number_sections: true + toc: true + theme: cosmo + highlight: tango +--- - +```{r setup, include=FALSE, warning=FALSE, message=FALSE} +knitr::opts_chunk$set( + echo = as.logical(opt$X_e), + error = TRUE, + eval = TRUE +) +``` # Run FastQC @@ -8,7 +23,6 @@ cp ${X_r} ${X_d}/read_1.fq cp ${X_R} ${X_d}/read_2.fq -mkdir -p read_1 read_2 cat >temp.sh <<EOL fastqc \\ -q \\
--- a/02_per_base_sequence_quality.Rmd Tue Feb 27 00:39:56 2018 -0500 +++ b/02_per_base_sequence_quality.Rmd Tue Feb 27 10:36:24 2018 -0500 @@ -0,0 +1,62 @@ +--- +output: html_document +--- + +```{r setup, include=FALSE, warning=FALSE, message=FALSE} +knitr::opts_chunk$set( + echo = as.logical(opt$X_e), + error = TRUE, + eval = TRUE +) +``` + + +```{r 'function definition', echo=FALSE} +# Define a function to extract outputs for each module from fastqc output +extract_data_module = function(fastqc_data, module_name, header = TRUE, comment.char = "") { + f = readLines(fastqc_data) + start_line = grep(module_name, f) + end_module_lines = grep('END_MODULE', f) + end_line = end_module_lines[which(end_module_lines > start_line)[1]] + module_data = f[(start_line+1):(end_line-1)] + writeLines(module_data, '/tmp/temp.txt') + read.csv('/tmp/temp.txt', sep = '\t', header = header, comment.char = comment.char) +} +``` + +# Per base sequence quality + +```{r 'per base sequence quality', fig.width=10} +## reads 1 +pbsq_1 = extract_data_module(paste0(opt$X_d, '/read_1_fastqc/fastqc_data.txt'), 'Per base sequence quality') +pbsq_1$id = 1:length(pbsq_1$X.Base) +pbsq_1$trim = 'before' + +## reads 2 +pbsq_2 = extract_data_module(paste0(opt$X_d, '/read_2_fastqc/fastqc_data.txt'), 'Per base sequence quality') +pbsq_2$id = 1:length(pbsq_2$X.Base) +pbsq_2$trim = 'after' + +comb_pbsq = rbind(pbsq_1, pbsq_2) +comb_pbsq$trim = factor(levels = c('before', 'after'), comb_pbsq$trim) + +p = ggplot(data = comb_pbsq) + + geom_boxplot(mapping = aes(x = id, + lower = Lower.Quartile, + upper = Upper.Quartile, + middle = Median, + ymin = X10th.Percentile, + ymax = X90th.Percentile, + fill = "yellow"), + stat = 'identity') + + geom_line(mapping = aes(x = id, y = Mean, color = "red")) + + scale_x_continuous(breaks = pbsq_2$id, labels = pbsq_2$X.Base) + + scale_fill_identity() + + scale_color_identity() + + ylim(0, max(comb_pbsq$Upper.Quartile) + 5) + + xlab('Position in read (bp)') + + facet_grid(. ~ trim) + + theme(axis.text.x = element_text(angle=45)) +p + +```
--- a/03_per_tile_sequence_quality.Rmd Tue Feb 27 00:39:56 2018 -0500 +++ b/03_per_tile_sequence_quality.Rmd Tue Feb 27 10:36:24 2018 -0500 @@ -0,0 +1,44 @@ +--- +output: html_document +--- + +```{r setup, include=FALSE, warning=FALSE, message=FALSE} +knitr::opts_chunk$set( + echo = as.logical(opt$X_e), + error = TRUE, + eval = TRUE +) +``` + +# Per tile sequence quality + +```{r 'per tile sequence quality', fig.width=10} +## check if 'per tile sequence quality' module exits or not +check_ptsq = grep('Per tile sequence quality', readLines(paste0(opt$X_d, '/read_1_fastqc/fastqc_data.txt'))) +if (length(check_ptsq) > 0) { + ## reads 1 + ptsq_1 = extract_data_module(paste0(opt$X_d, '/read_1_fastqc/fastqc_data.txt'), 'Per tile sequence quality') + ptsq_1$trim = 'before' + + ## reads 2 + ptsq_2 = extract_data_module(paste0(opt$X_d, '/read_2_fastqc/fastqc_data.txt'), 'Per tile sequence quality') + ptsq_2$trim = 'after' + + comb_ptsq = rbind(ptsq_1, ptsq_2) + comb_ptsq$trim = factor(levels = c('before', 'after'), comb_ptsq$trim) + comb_ptsq$Base = factor(levels = unique(comb_ptsq$Base), comb_ptsq$Base) + + # convert integers to charaters + comb_ptsq$Tile = as.character(comb_ptsq$X.Tile) + + p = ggplot(data = comb_ptsq, aes(x = Base, y = Tile, fill = Mean)) + + geom_raster() + + facet_grid(. ~ trim) + + xlab('Position in read (bp)') + + ylab('') + + theme(axis.text.x = element_text(angle=45)) + ggplotly(p) +} else { + print('No "per tile sequence quality" data') +} +``` \ No newline at end of file
--- a/04_per_sequence_quality_score.Rmd Tue Feb 27 00:39:56 2018 -0500 +++ b/04_per_sequence_quality_score.Rmd Tue Feb 27 10:36:24 2018 -0500 @@ -0,0 +1,35 @@ +--- +output: html_document +--- + +```{r setup, include=FALSE, warning=FALSE, message=FALSE} +knitr::opts_chunk$set( + echo = as.logical(opt$X_e), + error = TRUE, + eval = TRUE +) +``` + + +# Per sequence quality score + +```{r 'Per sequence quality score', fig.width=10} +## reads 1 +psqs_1 = extract_data_module(paste0(opt$X_d, '/read_1_fastqc/fastqc_data.txt'), 'Per sequence quality scores') +psqs_1$trim = 'before' + +## reads 2 +psqs_2 = extract_data_module(paste0(opt$X_d, '/read_2_fastqc/fastqc_data.txt'), 'Per sequence quality scores') +psqs_2$trim = 'after' + +comb_psqs = rbind(psqs_1, psqs_2) +comb_psqs$trim = factor(levels = c('before', 'after'), comb_psqs$trim) + +p = ggplot(data = comb_psqs, aes(x = X.Quality, y = Count)) + + geom_line(color = 'red') + + facet_grid(. ~ trim) + + xlim(min(comb_psqs$X.Quality), max(comb_psqs$X.Quality)) + + xlab('Mean Sequence Qaulity (Phred Score)') + + ylab('') +ggplotly(p) +```
--- a/05_per_base_sequence_content.Rmd Tue Feb 27 00:39:56 2018 -0500 +++ b/05_per_base_sequence_content.Rmd Tue Feb 27 10:36:24 2018 -0500 @@ -0,0 +1,43 @@ +--- +output: html_document +--- + +```{r setup, include=FALSE, warning=FALSE, message=FALSE} +knitr::opts_chunk$set( + echo = as.logical(opt$X_e), + error = TRUE, + eval = TRUE +) +``` + + +# Per base sequence content + +```{r 'Per base sequence content', fig.width=10} +## reads 1 +pbsc_1 = extract_data_module(paste0(opt$X_d, '/read_1_fastqc/fastqc_data.txt'), 'Per base sequence content') +pbsc_1$id = 1:length(pbsc_1$X.Base) + +melt_pbsc_1 = melt(pbsc_1, id=c('X.Base', 'id')) +melt_pbsc_1$trim = 'before' + + +## reads 2 +pbsc_2 = extract_data_module(paste0(opt$X_d, '/read_2_fastqc/fastqc_data.txt'), 'Per base sequence content') +pbsc_2$id = 1:length(pbsc_2$X.Base) + +melt_pbsc_2 = melt(pbsc_2, id=c('X.Base', 'id')) +melt_pbsc_2$trim = 'after' + +comb_pbsc = rbind(melt_pbsc_1, melt_pbsc_2) +comb_pbsc$trim = factor(levels = c('before', 'after'), comb_pbsc$trim) + +p = ggplot(data = comb_pbsc, aes(x = id, y = value, color = variable)) + + geom_line() + + facet_grid(. ~ trim) + + xlim(min(comb_pbsc$id), max(comb_pbsc$id)) + + ylim(0, 100) + + xlab('Position in read (bp)') + + ylab('') +ggplotly(p) +``` \ No newline at end of file
--- a/06_per_sequence_gc_content.Rmd Tue Feb 27 00:39:56 2018 -0500 +++ b/06_per_sequence_gc_content.Rmd Tue Feb 27 10:36:24 2018 -0500 @@ -0,0 +1,33 @@ +--- +output: html_document +--- + +```{r setup, include=FALSE, warning=FALSE, message=FALSE} +knitr::opts_chunk$set( + echo = as.logical(opt$X_e), + error = TRUE, + eval = TRUE +) +``` + +# Per sequence GC content + +```{r 'Per sequence GC content', fig.width=10} +## reads 1 +psGCc_1 = extract_data_module(paste0(opt$X_d, '/read_1_fastqc/fastqc_data.txt'), 'Per sequence GC content') +psGCc_1$trim = 'before' + +## reads 2 +psGCc_2 = extract_data_module(paste0(opt$X_d, '/read_2_fastqc/fastqc_data.txt'), 'Per sequence GC content') +psGCc_2$trim = 'after' + +comb_psGCc = rbind(psGCc_1, psGCc_2) +comb_psGCc$trim = factor(levels = c('before', 'after'), comb_psGCc$trim) + +p = ggplot(data = comb_psGCc, aes(x = X.GC.Content, y = Count)) + + geom_line(color = 'red') + + facet_grid(. ~ trim) + + xlab('Mean Sequence Qaulity (Phred Score)') + + ylab('') +ggplotly(p) +``` \ No newline at end of file
--- a/07_per_base_n_content.Rmd Tue Feb 27 00:39:56 2018 -0500 +++ b/07_per_base_n_content.Rmd Tue Feb 27 10:36:24 2018 -0500 @@ -0,0 +1,38 @@ +--- +output: html_document +--- + +```{r setup, include=FALSE, warning=FALSE, message=FALSE} +knitr::opts_chunk$set( + echo = as.logical(opt$X_e), + error = TRUE, + eval = TRUE +) +``` + +# Per base N content + +```{r 'Per base N content', fig.width=10} +## reads 1 +pbNc_1 = extract_data_module(paste0(opt$X_d, '/read_1_fastqc/fastqc_data.txt'), 'Per base N content') +pbNc_1$id = 1:length(pbNc_1$X.Base) +pbNc_1$trim = 'before' + +## reads 2 +pbNc_2 = extract_data_module(paste0(opt$X_d, '/read_2_fastqc/fastqc_data.txt'), 'Per base N content') +pbNc_2$id = 1:length(pbNc_2$X.Base) +pbNc_2$trim = 'after' + +comb_pbNc = rbind(pbNc_1, pbNc_2) +comb_pbNc$trim = factor(levels = c('before', 'after'), comb_pbNc$trim) + +p = ggplot(data = comb_pbNc, aes(x = id, y = N.Count)) + + geom_line(color = 'red') + + scale_x_continuous(breaks = pbNc_2$id, labels = pbNc_2$X.Base) + + facet_grid(. ~ trim) + + ylim(0, 1) + + xlab('N-Count') + + ylab('') + + theme(axis.text.x = element_text(angle=45)) +ggplotly(p) +``` \ No newline at end of file
--- a/08_sequence_length_distribution.Rmd Tue Feb 27 00:39:56 2018 -0500 +++ b/08_sequence_length_distribution.Rmd Tue Feb 27 10:36:24 2018 -0500 @@ -0,0 +1,37 @@ +--- +output: html_document +--- + +```{r setup, include=FALSE, warning=FALSE, message=FALSE} +knitr::opts_chunk$set( + echo = as.logical(opt$X_e), + error = TRUE, + eval = TRUE +) +``` + +# Sequence Length Distribution + +```{r 'Sequence Length Distribution', fig.width=10} +## reads 1 +sld_1 = extract_data_module(paste0(opt$X_d, '/read_1_fastqc/fastqc_data.txt'), 'Sequence Length Distribution') +sld_1$id = 1:length(sld_1$X.Length) +sld_1$trim = 'before' + +## reads 2 +sld_2 = extract_data_module(paste0(opt$X_d, '/read_2_fastqc/fastqc_data.txt'), 'Sequence Length Distribution') +sld_2$id = 1:length(sld_2$X.Length) +sld_2$trim = 'after' + +comb_sld = rbind(sld_1, sld_2) +comb_sld$trim = factor(levels = c('before', 'after'), comb_sld$trim) + +p = ggplot(data = comb_sld, aes(x = id, y = Count)) + + geom_line(color = 'red') + + scale_x_continuous(breaks = sld_2$id, labels = sld_2$X.Length) + + facet_grid(. ~ trim) + + xlab('Sequence Length (bp)') + + ylab('') + + theme(axis.text.x = element_text(angle=45)) +ggplotly(p) +``` \ No newline at end of file
--- a/09_sequence_duplication_levels.Rmd Tue Feb 27 00:39:56 2018 -0500 +++ b/09_sequence_duplication_levels.Rmd Tue Feb 27 10:36:24 2018 -0500 @@ -0,0 +1,45 @@ +--- +output: html_document +--- + +```{r setup, include=FALSE, warning=FALSE, message=FALSE} +knitr::opts_chunk$set( + echo = as.logical(opt$X_e), + error = TRUE, + eval = TRUE +) +``` + + +# Sequence Duplication Levels + +```{r 'Sequence Duplication Levels', fig.width=10} +## reads 1 +sdl_1 = extract_data_module(paste0(opt$X_d, '/read_1_fastqc/fastqc_data.txt'), 'Sequence Duplication Levels', header = FALSE, comment.char = '#') +names(sdl_1) = c('Duplication_Level', 'Percentage_of_deduplicated', 'Percentage_of_total') +sdl_1$id = 1:length(sdl_1$Duplication_Level) + +melt_sdl_1 = melt(sdl_1, id=c('Duplication_Level', 'id')) +melt_sdl_1$trim = 'before' + + +## reads 2 +sdl_2 = extract_data_module(paste0(opt$X_d, '/read_2_fastqc/fastqc_data.txt'), 'Sequence Duplication Levels', header = FALSE, comment.char = '#') +names(sdl_2) = c('Duplication_Level', 'Percentage_of_deduplicated', 'Percentage_of_total') +sdl_2$id = 1:length(sdl_2$Duplication_Level) + +melt_sdl_2 = melt(sdl_2, id=c('Duplication_Level', 'id')) +melt_sdl_2$trim = 'after' + +comb_sdl = rbind(melt_sdl_1, melt_sdl_2) +comb_sdl$trim = factor(levels = c('before', 'after'), comb_sdl$trim) + +p = ggplot(data = comb_sdl, aes(x = id, y = value, color = variable)) + + geom_line() + + scale_x_continuous(breaks = sdl_2$id, labels = sdl_2$Duplication_Level) + + facet_grid(. ~ trim) + + xlab('Sequence Duplication Level') + + ylab('') + + theme(axis.text.x = element_text(angle=45)) +ggplotly(p) +``` \ No newline at end of file
--- a/10_adapter_content.Rmd Tue Feb 27 00:39:56 2018 -0500 +++ b/10_adapter_content.Rmd Tue Feb 27 10:36:24 2018 -0500 @@ -0,0 +1,41 @@ +--- +output: html_document +--- + +```{r setup, include=FALSE, warning=FALSE, message=FALSE} +knitr::opts_chunk$set( + echo = as.logical(opt$X_e), + error = TRUE, + eval = TRUE +) +``` + +# Adapter Content + +```{r 'Adapter Content', fig.width=10} +## reads 1 +ac_1 = extract_data_module(paste0(opt$X_d, '/read_1_fastqc/fastqc_data.txt'), 'Adapter Content') +ac_1$id = 1:length(ac_1$X.Position) + +melt_ac_1 = melt(ac_1, id=c('X.Position', 'id')) +melt_ac_1$trim = 'before' + +## reads 2 +ac_2 = extract_data_module(paste0(opt$X_d, '/read_2_fastqc/fastqc_data.txt'), 'Adapter Content') +ac_2$id = 1:length(ac_2$X.Position) + +melt_ac_2 = melt(ac_2, id=c('X.Position', 'id')) +melt_ac_2$trim = 'after' + +comb_ac = rbind(melt_ac_1, melt_ac_2) +comb_ac$trim = factor(levels = c('before', 'after'), comb_ac$trim) + +p = ggplot(data = comb_ac, aes(x = id, y = value, color = variable)) + + geom_line() + + facet_grid(. ~ trim) + + xlim(min(comb_ac$id), max(comb_ac$id)) + + ylim(0, 1) + + xlab('Position in read (bp)') + + ylab('') +ggplotly(p) +``` \ No newline at end of file
--- a/11_kmer_content.Rmd Tue Feb 27 00:39:56 2018 -0500 +++ b/11_kmer_content.Rmd Tue Feb 27 10:36:24 2018 -0500 @@ -0,0 +1,26 @@ +--- +output: html_document +--- + +```{r setup, include=FALSE, warning=FALSE, message=FALSE} +knitr::opts_chunk$set( + echo = as.logical(opt$X_e), + error = TRUE, + eval = TRUE +) +``` + +# Kmer Content {.tabset} + +## Before + +```{r 'Kmer Content (before)', fig.width=10} +kc_1 = extract_data_module(paste0(opt$X_d, '/read_1_fastqc/fastqc_data.txt'), 'Kmer Content') +knitr::kable(kc_1) +``` + +## After +```{r 'Kmer Content (after)', fig.width=10} +kc_2 = extract_data_module(paste0(opt$X_d, '/read_2_fastqc/fastqc_data.txt'), 'Kmer Content') +knitr::kable(kc_2) +```
--- a/_site.yml Tue Feb 27 00:39:56 2018 -0500 +++ b/_site.yml Tue Feb 27 10:36:24 2018 -0500 @@ -1,5 +1,5 @@ name: "FastQC Website" -output_dir: opt$X_d +output_dir: "../_site" navbar: title: "FastQC" type: inverse @@ -8,29 +8,29 @@ icon: fa-home href: index.html - text: "Evaluation Overview" - href: x01_evaluation_overview.html + href: 01_evaluation_overview.html - text: "Evaluation by data module" -# menu: -# - text: "Per Base Sequence Quality" -# href: x02_per_base_sequence_quality.html -# - text: "Per Tile Sequence Quality" -# href: x03_per_tile_sequence_quality.html -# - text: "Per Sequence Quality Score" -# href: x04_per_sequence_quality_score.html -# - text: "Per Base Sequence Content" -# href: x05_per_base_sequence_content.html -# - text: "Per Sequence GC Content" -# href: x06_per_sequence_gc_content.html -# - text: "Per Base N Content" -# href: x07_per_base_n_content.html -# - text: "Sequence Length Distribution" -# href: x08_sequence_length_distribution.html -# - text: "Sequence Duplication Levels" -# href: x09_sequence_duplication_levels.html -# - text: "Adapter Content" -# href: x10_adapter_content.html -# - text: "Kmer Content" -# href: x11_kmer_content.html + menu: + - text: "Per Base Sequence Quality" + href: 02_per_base_sequence_quality.html + - text: "Per Tile Sequence Quality" + href: 03_per_tile_sequence_quality.html + - text: "Per Sequence Quality Score" + href: 04_per_sequence_quality_score.html + - text: "Per Base Sequence Content" + href: 05_per_base_sequence_content.html + - text: "Per Sequence GC Content" + href: 06_per_sequence_gc_content.html + - text: "Per Base N Content" + href: 07_per_base_n_content.html + - text: "Sequence Length Distribution" + href: 08_sequence_length_distribution.html + - text: "Sequence Duplication Levels" + href: 09_sequence_duplication_levels.html + - text: "Adapter Content" + href: 10_adapter_content.html + - text: "Kmer Content" + href: 11_kmer_content.html output: html_document: theme: cosmo
--- a/fastqc_report.Rmd Tue Feb 27 00:39:56 2018 -0500 +++ b/fastqc_report.Rmd Tue Feb 27 10:36:24 2018 -0500 @@ -17,372 +17,22 @@ ``` -```{bash eval=TRUE,echo=FALSE} -#create extra file directory -mkdir -p ${X_d} -``` -# Run FastQC - -```{bash eval=TRUE,echo=FALSE} -cd ${X_d} -cp ${X_r} ${X_d}/read_1.fq -cp ${X_R} ${X_d}/read_2.fq - -mkdir -p read_1 read_2 -cat >temp.sh <<EOL -fastqc \\ - -q \\ - -c ${X_c} \\ - -l ${X_l} \\ - ${X_d}/read_1.fq > /dev/null 2>&1 - -fastqc \\ - -q \\ - -c ${X_c} \\ - -l ${X_l} \\ - ${X_d}/read_2.fq > /dev/null 2>&1 -EOL - -grep -v None temp.sh > fastqc.sh - -# run fastqc -sh fastqc.sh - -# unzip outputs -unzip -q read_1_fastqc.zip -unzip -q read_2_fastqc.zip -``` - -```{r} -# display fastqc job script -fastqc_sh = paste0(opt$X_d, '/fastqc.sh') -tags$code(tags$pre(readChar(fastqc_sh, file.info(fastqc_sh)$size ))) -``` - -# Raw FastQC reports - -## Before trimming -```{r eval=TRUE} -ori_html = tags$a(href = 'read_1_fastqc/fastqc_report.html', opt$X_n) -ori_fastqc_data = tags$a(href = 'read_1_fastqc/fastqc_data.txt', 'fastqc_data.txt') -ori_summary = tags$a(href = 'read_1_fastqc/summary.txt', 'summary.txt') -tags$ul( - tags$li(ori_html), - tags$li(ori_fastqc_data), - tags$li(ori_summary) - ) -``` - -## After trimming -```{r eval=TRUE} -ori_html = tags$a(href = 'read_2_fastqc/fastqc_report.html', opt$X_n) -ori_fastqc_data = tags$a(href = 'read_2_fastqc/fastqc_data.txt', 'fastqc_data.txt') -ori_summary = tags$a(href = 'read_2_fastqc/summary.txt', 'summary.txt') -tags$ul( - tags$li(ori_html), - tags$li(ori_fastqc_data), - tags$li(ori_summary) - ) -``` -# Fastqc Output Visualization - -## Overview - -```{r eval=TRUE} -read_1_summary = read.csv(paste0(opt$X_d, '/read_1_fastqc/summary.txt'), header = FALSE, sep = '\t')[, 2:1] -read_2_summary = read.csv(paste0(opt$X_d, '/read_2_fastqc/summary.txt'), header = FALSE, sep = '\t')[, 1] -combined_summary = cbind(read_1_summary, read_2_summary) -names(combined_summary) = c('MODULE', paste0(opt$X_n, '(before)'), paste0(opt$X_N, '(after)')) -combined_summary[combined_summary == 'FAIL'] = 'FAIL (X)' -combined_summary[combined_summary == 'WARN'] = 'WARN (!)' -knitr::kable(combined_summary) -``` - -## Visualization by data module {.tabset} - -* Define a function to extract outputs for each module from fastqc output - -```{r 'function definition'} -extract_data_module = function(fastqc_data, module_name, header = TRUE, comment.char = "") { - f = readLines(fastqc_data) - start_line = grep(module_name, f) - end_module_lines = grep('END_MODULE', f) - end_line = end_module_lines[which(end_module_lines > start_line)[1]] - module_data = f[(start_line+1):(end_line-1)] - writeLines(module_data, '/tmp/temp.txt') - read.csv('/tmp/temp.txt', sep = '\t', header = header, comment.char = comment.char) -} -``` - -### Per base sequence quality - -```{r 'per base sequence quality', fig.width=10} -## reads 1 -pbsq_1 = extract_data_module(paste0(opt$X_d, '/read_1_fastqc/fastqc_data.txt'), 'Per base sequence quality') -pbsq_1$id = 1:length(pbsq_1$X.Base) -pbsq_1$trim = 'before' - -## reads 2 -pbsq_2 = extract_data_module(paste0(opt$X_d, '/read_2_fastqc/fastqc_data.txt'), 'Per base sequence quality') -pbsq_2$id = 1:length(pbsq_2$X.Base) -pbsq_2$trim = 'after' - -comb_pbsq = rbind(pbsq_1, pbsq_2) -comb_pbsq$trim = factor(levels = c('before', 'after'), comb_pbsq$trim) +## -p = ggplot(data = comb_pbsq) + - geom_boxplot(mapping = aes(x = id, - lower = Lower.Quartile, - upper = Upper.Quartile, - middle = Median, - ymin = X10th.Percentile, - ymax = X90th.Percentile, - fill = "yellow"), - stat = 'identity') + - geom_line(mapping = aes(x = id, y = Mean, color = "red")) + - scale_x_continuous(breaks = pbsq_2$id, labels = pbsq_2$X.Base) + - scale_fill_identity() + - scale_color_identity() + - ylim(0, max(comb_pbsq$Upper.Quartile) + 5) + - xlab('Position in read (bp)') + - facet_grid(. ~ trim) + - theme(axis.text.x = element_text(angle=45)) -p - -``` - -### Per tile sequence quality - -```{r 'per tile sequence quality', fig.width=10} -## check if 'per tile sequence quality' module exits or not -check_ptsq = grep('Per tile sequence quality', readLines(paste0(opt$X_d, '/read_1_fastqc/fastqc_data.txt'))) -if (length(check_ptsq) > 0) { - ## reads 1 - ptsq_1 = extract_data_module(paste0(opt$X_d, '/read_1_fastqc/fastqc_data.txt'), 'Per tile sequence quality') - ptsq_1$trim = 'before' - - ## reads 2 - ptsq_2 = extract_data_module(paste0(opt$X_d, '/read_2_fastqc/fastqc_data.txt'), 'Per tile sequence quality') - ptsq_2$trim = 'after' - - comb_ptsq = rbind(ptsq_1, ptsq_2) - comb_ptsq$trim = factor(levels = c('before', 'after'), comb_ptsq$trim) - comb_ptsq$Base = factor(levels = unique(comb_ptsq$Base), comb_ptsq$Base) - - # convert integers to charaters - comb_ptsq$Tile = as.character(comb_ptsq$X.Tile) - - p = ggplot(data = comb_ptsq, aes(x = Base, y = Tile, fill = Mean)) + - geom_raster() + - facet_grid(. ~ trim) + - xlab('Position in read (bp)') + - ylab('') + - theme(axis.text.x = element_text(angle=45)) - ggplotly(p) -} else { - print('No "per tile sequence quality" data') -} +## -``` - -### Per sequence quality score - -```{r 'Per sequence quality score', fig.width=10} -## reads 1 -psqs_1 = extract_data_module(paste0(opt$X_d, '/read_1_fastqc/fastqc_data.txt'), 'Per sequence quality scores') -psqs_1$trim = 'before' - -## reads 2 -psqs_2 = extract_data_module(paste0(opt$X_d, '/read_2_fastqc/fastqc_data.txt'), 'Per sequence quality scores') -psqs_2$trim = 'after' - -comb_psqs = rbind(psqs_1, psqs_2) -comb_psqs$trim = factor(levels = c('before', 'after'), comb_psqs$trim) - -p = ggplot(data = comb_psqs, aes(x = X.Quality, y = Count)) + - geom_line(color = 'red') + - facet_grid(. ~ trim) + - xlim(min(comb_psqs$X.Quality), max(comb_psqs$X.Quality)) + - xlab('Mean Sequence Qaulity (Phred Score)') + - ylab('') -ggplotly(p) -``` - - -### Per base sequence content - -```{r 'Per base sequence content', fig.width=10} -## reads 1 -pbsc_1 = extract_data_module(paste0(opt$X_d, '/read_1_fastqc/fastqc_data.txt'), 'Per base sequence content') -pbsc_1$id = 1:length(pbsc_1$X.Base) - -melt_pbsc_1 = melt(pbsc_1, id=c('X.Base', 'id')) -melt_pbsc_1$trim = 'before' - - -## reads 2 -pbsc_2 = extract_data_module(paste0(opt$X_d, '/read_2_fastqc/fastqc_data.txt'), 'Per base sequence content') -pbsc_2$id = 1:length(pbsc_2$X.Base) - -melt_pbsc_2 = melt(pbsc_2, id=c('X.Base', 'id')) -melt_pbsc_2$trim = 'after' - -comb_pbsc = rbind(melt_pbsc_1, melt_pbsc_2) -comb_pbsc$trim = factor(levels = c('before', 'after'), comb_pbsc$trim) - -p = ggplot(data = comb_pbsc, aes(x = id, y = value, color = variable)) + - geom_line() + - facet_grid(. ~ trim) + - xlim(min(comb_pbsc$id), max(comb_pbsc$id)) + - ylim(0, 100) + - xlab('Position in read (bp)') + - ylab('') -ggplotly(p) -``` - -### Per sequence GC content - -```{r 'Per sequence GC content', fig.width=10} -## reads 1 -psGCc_1 = extract_data_module(paste0(opt$X_d, '/read_1_fastqc/fastqc_data.txt'), 'Per sequence GC content') -psGCc_1$trim = 'before' - -## reads 2 -psGCc_2 = extract_data_module(paste0(opt$X_d, '/read_2_fastqc/fastqc_data.txt'), 'Per sequence GC content') -psGCc_2$trim = 'after' - -comb_psGCc = rbind(psGCc_1, psGCc_2) -comb_psGCc$trim = factor(levels = c('before', 'after'), comb_psGCc$trim) - -p = ggplot(data = comb_psGCc, aes(x = X.GC.Content, y = Count)) + - geom_line(color = 'red') + - facet_grid(. ~ trim) + - xlab('Mean Sequence Qaulity (Phred Score)') + - ylab('') -ggplotly(p) -``` +## -### Per base N content - -```{r 'Per base N content', fig.width=10} -## reads 1 -pbNc_1 = extract_data_module(paste0(opt$X_d, '/read_1_fastqc/fastqc_data.txt'), 'Per base N content') -pbNc_1$id = 1:length(pbNc_1$X.Base) -pbNc_1$trim = 'before' - -## reads 2 -pbNc_2 = extract_data_module(paste0(opt$X_d, '/read_2_fastqc/fastqc_data.txt'), 'Per base N content') -pbNc_2$id = 1:length(pbNc_2$X.Base) -pbNc_2$trim = 'after' - -comb_pbNc = rbind(pbNc_1, pbNc_2) -comb_pbNc$trim = factor(levels = c('before', 'after'), comb_pbNc$trim) - -p = ggplot(data = comb_pbNc, aes(x = id, y = N.Count)) + - geom_line(color = 'red') + - scale_x_continuous(breaks = pbNc_2$id, labels = pbNc_2$X.Base) + - facet_grid(. ~ trim) + - ylim(0, 1) + - xlab('N-Count') + - ylab('') + - theme(axis.text.x = element_text(angle=45)) -ggplotly(p) -``` - - -### Sequence Length Distribution - -```{r 'Sequence Length Distribution', fig.width=10} -## reads 1 -sld_1 = extract_data_module(paste0(opt$X_d, '/read_1_fastqc/fastqc_data.txt'), 'Sequence Length Distribution') -sld_1$id = 1:length(sld_1$X.Length) -sld_1$trim = 'before' - -## reads 2 -sld_2 = extract_data_module(paste0(opt$X_d, '/read_2_fastqc/fastqc_data.txt'), 'Sequence Length Distribution') -sld_2$id = 1:length(sld_2$X.Length) -sld_2$trim = 'after' - -comb_sld = rbind(sld_1, sld_2) -comb_sld$trim = factor(levels = c('before', 'after'), comb_sld$trim) - -p = ggplot(data = comb_sld, aes(x = id, y = Count)) + - geom_line(color = 'red') + - scale_x_continuous(breaks = sld_2$id, labels = sld_2$X.Length) + - facet_grid(. ~ trim) + - xlab('Sequence Length (bp)') + - ylab('') + - theme(axis.text.x = element_text(angle=45)) -ggplotly(p) -``` - -### Sequence Duplication Levels +## -```{r 'Sequence Duplication Levels', fig.width=10} -## reads 1 -sdl_1 = extract_data_module(paste0(opt$X_d, '/read_1_fastqc/fastqc_data.txt'), 'Sequence Duplication Levels', header = FALSE, comment.char = '#') -names(sdl_1) = c('Duplication_Level', 'Percentage_of_deduplicated', 'Percentage_of_total') -sdl_1$id = 1:length(sdl_1$Duplication_Level) - -melt_sdl_1 = melt(sdl_1, id=c('Duplication_Level', 'id')) -melt_sdl_1$trim = 'before' - - -## reads 2 -sdl_2 = extract_data_module(paste0(opt$X_d, '/read_2_fastqc/fastqc_data.txt'), 'Sequence Duplication Levels', header = FALSE, comment.char = '#') -names(sdl_2) = c('Duplication_Level', 'Percentage_of_deduplicated', 'Percentage_of_total') -sdl_2$id = 1:length(sdl_2$Duplication_Level) - -melt_sdl_2 = melt(sdl_2, id=c('Duplication_Level', 'id')) -melt_sdl_2$trim = 'after' - -comb_sdl = rbind(melt_sdl_1, melt_sdl_2) -comb_sdl$trim = factor(levels = c('before', 'after'), comb_sdl$trim) +## -p = ggplot(data = comb_sdl, aes(x = id, y = value, color = variable)) + - geom_line() + - scale_x_continuous(breaks = sdl_2$id, labels = sdl_2$Duplication_Level) + - facet_grid(. ~ trim) + - xlab('Sequence Duplication Level') + - ylab('') + - theme(axis.text.x = element_text(angle=45)) -ggplotly(p) -``` - -### Adapter Content - -```{r 'Adapter Content', fig.width=10} -## reads 1 -ac_1 = extract_data_module(paste0(opt$X_d, '/read_1_fastqc/fastqc_data.txt'), 'Adapter Content') -ac_1$id = 1:length(ac_1$X.Position) - -melt_ac_1 = melt(ac_1, id=c('X.Position', 'id')) -melt_ac_1$trim = 'before' - -## reads 2 -ac_2 = extract_data_module(paste0(opt$X_d, '/read_2_fastqc/fastqc_data.txt'), 'Adapter Content') -ac_2$id = 1:length(ac_2$X.Position) - -melt_ac_2 = melt(ac_2, id=c('X.Position', 'id')) -melt_ac_2$trim = 'after' - -comb_ac = rbind(melt_ac_1, melt_ac_2) -comb_ac$trim = factor(levels = c('before', 'after'), comb_ac$trim) - -p = ggplot(data = comb_ac, aes(x = id, y = value, color = variable)) + - geom_line() + - facet_grid(. ~ trim) + - xlim(min(comb_ac$id), max(comb_ac$id)) + - ylim(0, 1) + - xlab('Position in read (bp)') + - ylab('') -ggplotly(p) -``` +## ### Kmer Content {.tabset}
--- a/fastqc_site.xml Tue Feb 27 00:39:56 2018 -0500 +++ b/fastqc_site.xml Tue Feb 27 10:36:24 2018 -0500 @@ -25,7 +25,7 @@ <command><![CDATA[ - Rscript '${__tool_directory__}/fastqc_report_render.R' + Rscript '${__tool_directory__}/fastqc_site_render.R' -e $echo -r $reads_1 -n '$reads_1.name' @@ -38,7 +38,7 @@ -d $report.files_path -s $sink_message - -p '${__tool_directory__}/fastqc_site.Rmd' + -t '${__tool_directory__}' ]]></command> <inputs>
--- a/fastqc_site_render.R Tue Feb 27 00:39:56 2018 -0500 +++ b/fastqc_site_render.R Tue Feb 27 10:36:24 2018 -0500 @@ -19,27 +19,50 @@ # getopt_specification_matrix(extract_short_flags('fastqc_report.xml')) %>% # write.table(file = 'spec.txt', sep = ',', row.names = FALSE, col.names = TRUE, quote = FALSE) + +# get arguments into R spec_matrix = as.matrix( data.frame(stringsAsFactors=FALSE, long_flags = c("X_e", "X_r", "X_n", "X_R", "X_N", "X_c", "X_l", - "X_o", "X_d", "X_s", "X_p"), + "X_o", "X_d", "X_s", "X_t"), short_flags = c("e", "r", "n", "R", "N", "c", "l", "o", "d", "s", - "p"), + "t"), argument_mask_flags = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), data_type_flags = c("character", "character", "character", "character", "character", "character", "character", "character", "character", "character", "character") ) ) -# get arguments into R opt = getopt(spec_matrix) -# define system environment variables + +# using passed arguments in R to define system environment variables do.call(Sys.setenv, opt[-1]) -# render website -# render(input = opt$X_p, output_file = opt$X_o) -render_site() -file.copy(paste0(opt$X_d, '/index.html'), opt$X_o, recursive = TRUE) +# create the output associated directory to store all outputs +dir.create(opt$X_d, recursive = TRUE) + +#-----------------render site-------------- +# copy site generating materials into folder "_site" within the output associated directory +file.copy(opt$X_t, opt$X_d, recursive = TRUE) +# render site to the output associated directory +render_site(input = paste0(opt$X_d, '/aurora_fastqc_site')) +# remove site generating materials from output associated directory +unlink(paste0(opt$X_d, '/aurora_fastqc_site'), recursive = TRUE) +# move _site/* into output associated directory +move_cmd = paste0('mv ', opt$X_d, '/_site/* ', opt$X_d) +system(move_cmd) +#------------------------------------------ + +#-----link index.html to output----- +cp_index = paste0('cp ', opt$X_d, '/index.html ', opt$X_o) +system(cp_index) +#----------------------------------- + +#==============the end============== + + + +# file.copy(paste0(opt$X_d, '/index.html'), opt$X_o, recursive = TRUE) ##--------end of code rendering .Rmd templates---------------- sink()