# HG changeset patch
# User mingchen0919
# Date 1527608230 14400
# Node ID 48e4e22dab0e8fd4290f8f04b560a554baaa9b53
# Parent 9b1ebab47c0de2a9d39f48f66994bd75a37cd73c
planemo upload
diff -r 9b1ebab47c0d -r 48e4e22dab0e command-line-arguments.csv
--- a/command-line-arguments.csv Tue May 29 10:38:12 2018 -0400
+++ b/command-line-arguments.csv Tue May 29 11:37:10 2018 -0400
@@ -1,3 +1,5 @@
short flag,argument mask,data type,variable name
o,1,character,report
-d,1,character,report.files_path
\ No newline at end of file
+d,1,character,report.files_path
+v,1,character,vcf_input
+s,1,character,fSNPs_matrix
\ No newline at end of file
diff -r 9b1ebab47c0d -r 48e4e22dab0e expose-outputs-to-galaxy-history.sh
--- a/expose-outputs-to-galaxy-history.sh Tue May 29 10:38:12 2018 -0400
+++ b/expose-outputs-to-galaxy-history.sh Tue May 29 11:37:10 2018 -0400
@@ -9,3 +9,7 @@
if [ -e "index.html" ]; then
cp index.html ${REPORT}
fi
+
+if [ -e "fSNPS.csv" ]; then
+ cp fSNPS.csv ${X_s}
+fi
\ No newline at end of file
diff -r 9b1ebab47c0d -r 48e4e22dab0e rmarkdown_report.Rmd
--- a/rmarkdown_report.Rmd Tue May 29 10:38:12 2018 -0400
+++ b/rmarkdown_report.Rmd Tue May 29 11:37:10 2018 -0400
@@ -55,3 +55,65 @@
+## Load Data
+
+```{r}
+VCF <- read.vcfR(opt$X_v) # import the VCF in R
+```
+
+## Create a Genotype Matrix
+
+The VCF object contains several additional information concerning genetic variant quality that are not needed in this exercise. We want to obtain a Genotype Matrix out of the VCF object, we can do it as shown here below:
+
+```{r}
+GT <- vcfR2loci(VCF)
+
+GT[1:10,1:10] # The GT matrix shows the genotype for each individual (row) and genetic variant (column)
+
+dim(GT)
+```
+
+For our calculations, we prefer having the genotypes coded as 0,1,2 (homozygous 0/0, heterozygous, homozygous 1/1). The code here below performs this transformation creating the SNPs matrix.
+
+```{r}
+GT <- as.matrix(GT)
+
+GT[GT=='0/0'] <- 0
+GT[GT=='0/1'] <- 1
+GT[GT=='1/0'] <- 1
+GT[GT=='1/1'] <- 2
+
+SNPS <- GT
+SNPS <- apply(SNPS,2,as.numeric)
+rownames(SNPS) <- rownames(GT)
+```
+
+
+The SNPS matrix has the genotype values coded as 0,1,2:
+
+```{r}
+SNPS[1:10,1:10]
+```
+
+
+## Check for genetic diversity metrics
+
+Before starting, we create a fSNPS matrix identical to the SNPS matrix. We will perform all the calculation on the fSNPS matrix keeping the SNPS matrix as backup.
+
+```{r}
+fSNPS <- SNPS
+```
+
+
+There are R packages that allow to compute this metrics directly with built-in function. In this exercise we use the hand-written formula to better show the meaning of each procedure.
+
+### Missingness by SNP
+
+For each SNP, it computes the % of missing observation across the individuals.
+
+```{r}
+MNsnp <- apply(fSNPS, 2, function(x) { sum(is.na(x))/length(x) })
+hist(MNsnp, breaks=10) # We can observe that most of the SNPS have a Missingness below 0.01 (i.e. ~ 2 individual ot of 160).
+fSNPS <- fSNPS[,MNsnp<0.01] # We apply a 1% threshold
+write.csv(fSNPS, file = "fSNPS.csv", quote = FALSE, col.names = TRUE)
+```
diff -r 9b1ebab47c0d -r 48e4e22dab0e rmarkdown_report.xml
--- a/rmarkdown_report.xml Tue May 29 10:38:12 2018 -0400
+++ b/rmarkdown_report.xml Tue May 29 11:37:10 2018 -0400
@@ -31,12 +31,17 @@
-o $report
-d $report.files_path
+ -v $vcf_input
+ -s $fSNPs_matrix
+
]]>
+
+