# HG changeset patch # User mingchen0919 # Date 1527608230 14400 # Node ID 48e4e22dab0e8fd4290f8f04b560a554baaa9b53 # Parent 9b1ebab47c0de2a9d39f48f66994bd75a37cd73c planemo upload diff -r 9b1ebab47c0d -r 48e4e22dab0e command-line-arguments.csv --- a/command-line-arguments.csv Tue May 29 10:38:12 2018 -0400 +++ b/command-line-arguments.csv Tue May 29 11:37:10 2018 -0400 @@ -1,3 +1,5 @@ short flag,argument mask,data type,variable name o,1,character,report -d,1,character,report.files_path \ No newline at end of file +d,1,character,report.files_path +v,1,character,vcf_input +s,1,character,fSNPs_matrix \ No newline at end of file diff -r 9b1ebab47c0d -r 48e4e22dab0e expose-outputs-to-galaxy-history.sh --- a/expose-outputs-to-galaxy-history.sh Tue May 29 10:38:12 2018 -0400 +++ b/expose-outputs-to-galaxy-history.sh Tue May 29 11:37:10 2018 -0400 @@ -9,3 +9,7 @@ if [ -e "index.html" ]; then cp index.html ${REPORT} fi + +if [ -e "fSNPS.csv" ]; then + cp fSNPS.csv ${X_s} +fi \ No newline at end of file diff -r 9b1ebab47c0d -r 48e4e22dab0e rmarkdown_report.Rmd --- a/rmarkdown_report.Rmd Tue May 29 10:38:12 2018 -0400 +++ b/rmarkdown_report.Rmd Tue May 29 11:37:10 2018 -0400 @@ -55,3 +55,65 @@ +## Load Data + +```{r} +VCF <- read.vcfR(opt$X_v) # import the VCF in R +``` + +## Create a Genotype Matrix + +The VCF object contains several additional information concerning genetic variant quality that are not needed in this exercise. We want to obtain a Genotype Matrix out of the VCF object, we can do it as shown here below: + +```{r} +GT <- vcfR2loci(VCF) + +GT[1:10,1:10] # The GT matrix shows the genotype for each individual (row) and genetic variant (column) + +dim(GT) +``` + +For our calculations, we prefer having the genotypes coded as 0,1,2 (homozygous 0/0, heterozygous, homozygous 1/1). The code here below performs this transformation creating the SNPs matrix. + +```{r} +GT <- as.matrix(GT) + +GT[GT=='0/0'] <- 0 +GT[GT=='0/1'] <- 1 +GT[GT=='1/0'] <- 1 +GT[GT=='1/1'] <- 2 + +SNPS <- GT +SNPS <- apply(SNPS,2,as.numeric) +rownames(SNPS) <- rownames(GT) +``` + + +The SNPS matrix has the genotype values coded as 0,1,2: + +```{r} +SNPS[1:10,1:10] +``` + + +## Check for genetic diversity metrics + +Before starting, we create a fSNPS matrix identical to the SNPS matrix. We will perform all the calculation on the fSNPS matrix keeping the SNPS matrix as backup. + +```{r} +fSNPS <- SNPS +``` + + +There are R packages that allow to compute this metrics directly with built-in function. In this exercise we use the hand-written formula to better show the meaning of each procedure. + +### Missingness by SNP + +For each SNP, it computes the % of missing observation across the individuals. + +```{r} +MNsnp <- apply(fSNPS, 2, function(x) { sum(is.na(x))/length(x) }) +hist(MNsnp, breaks=10) # We can observe that most of the SNPS have a Missingness below 0.01 (i.e. ~ 2 individual ot of 160). +fSNPS <- fSNPS[,MNsnp<0.01] # We apply a 1% threshold +write.csv(fSNPS, file = "fSNPS.csv", quote = FALSE, col.names = TRUE) +``` diff -r 9b1ebab47c0d -r 48e4e22dab0e rmarkdown_report.xml --- a/rmarkdown_report.xml Tue May 29 10:38:12 2018 -0400 +++ b/rmarkdown_report.xml Tue May 29 11:37:10 2018 -0400 @@ -31,12 +31,17 @@ -o $report -d $report.files_path + -v $vcf_input + -s $fSNPs_matrix + ]]> + +