# HG changeset patch
# User cropgeeks
# Date 1524242526 14400
# Node ID ae6b94d10bffade9b75455d396235df065706092
# Parent 27b626c1c12019507ef83840f7c02343c0fef49a
Uploaded
diff -r 27b626c1c120 -r ae6b94d10bff dartseq_seeduk_1.R
--- a/dartseq_seeduk_1.R Fri Apr 20 11:15:45 2018 -0400
+++ b/dartseq_seeduk_1.R Fri Apr 20 12:42:06 2018 -0400
@@ -65,129 +65,3 @@
#[1] "Completed"
#
#
-gl_call_rate <- gl.filter.callrate(gl,method = 'loc', t=0.75)
-#Reporting for a genlight object
-#Note: Missing values most commonly arise from restriction site mutation.
-#
-#Initial no. of loci = 113138
-# No. of loci deleted = 31509
-#Summary of filtered dataset
-# Call Rate > 0.75
-# No. of loci: 81629
-# No. of individuals: 376
-# No. of populations: 0
-#
-gl_rep <- gl.filter.repavg(gl_call_rate,t=0.98)
-#Reporting for a genlight object
-#Note: RepAvg is a DArT statistic reporting reproducibility averaged across alleles for each locus.
-
-#Initial no. of loci = 81629
-#No. of loci deleted = 6446
-#Summary of filtered dataset
-# Reproducibility >= 0.98
-# No. of loci: 75183
-# No. of individuals: 376
-# No. of populations: 0
-
-gl.report.callrate(gl_rep,method='ind' )
-#Reporting for a genlight object
-#Note: Missing values most commonly arise from restriction site mutation.
-
-#Individuals no missing values = 0 [0%] across loci
-#Individuals with less than 5% missing values = 161 [42.8%]
-#Individuals with less than 10% missing values = 245 [65.2%]
-#Individuals with less than 15% missing values = 301 [80.1%]
-#Individuals with less than 20% missing values = 337 [89.6%]
-#Individuals with less than 25% missing values = 347 [92.3%]
-#Individuals with less than 30% missing values = 358 [95.2%]
-#Individuals with less than 35% missing values = 359 [95.5%]
-#Individuals with less than 40% missing values = 364 [96.8%]
-#Individuals with less than 45% missing values = 372 [98.9%]
-#Individuals with less than 50% missing values = 373 [99.2%]
-#Individuals with less than 55% missing values = 374 [99.5%]
-#Individuals with less than 60% missing values = 375 [99.7%]
-#[1] "Completed"
-
-gl_final <- gl.filter.callrate(gl_rep,method = 'ind', t=0.8)
-#Reporting for a genlight object
-#Note: Missing values most commonly arise from restriction site mutation.
-
-#Initial no. of individuals = 376
-#Filtering a genlight object
-# no. of individuals deleted = 39
-#Individuals retained = 337
-#List of individuals deleted because of low call rate
-# 908017247001_E_5 908017247001_F_4 908017247002_A_10 908017247002_B_4 908017247002_B_5 908017247002_C_3 908017247002_D_12 908017247002_D_2 908017247002_D_6 908017247002_D_9 908017247002_E_6 908017247002_E_7 908017247002_E_9 908017247002_F_2 908017247002_F_6 908017247002_G_8 908017247002_H_10 908017247002_H_7 908017247002_H_8 908017247003_B_8 908017247003_C_8 908017247003_D_8 908017247003_E_8 908017247003_F_8 908017247003_G_6 908017247003_G_8 908017247003_H_7 908017247004_C_11 908017247004_D_11 908017247004_D_8 908017247004_D_9 908017247004_E_10 908017247004_E_11 908017247004_E_9 908017247004_F_11 908017247004_F_12 908017247004_F_6 908017247004_G_11 908017247004_H_11
-# from populations
-
-#Summary of filtered dataset
-# Call Rate > 0.8
-# No. of loci: 75183
- # No. of individuals: 337
- # No. of populations: 0
-#
-gl2gds(gl_final,outfile="gl2gds.gds")
-#Converting gl object to gds formatted file gl2gds.gds
-
-#Structure of gds file
-
-#The file name: /data/projects/seed/dart_calls/gl2gds.gds
-#The total number of samples: 268
-#The total number of SNPs: 113138
-#SNP genotypes are stored in SNP-major mode (Sample X SNP).
-#The SNP positions are not in ascending order on chromosome 1.
-#File: /data/projects/seed/dart_calls/gl2gds.gds (32.8M)
-#+ [ ] *
-#|--+ https://protect-eu.mimecast.com/s/cfduCj27LTYnmOHWrcoC?domain=sample.id { Str8 268 ZIP_ra(13.7%), 641B }
-#|--+ https://protect-eu.mimecast.com/s/byfzCk59DIkOBwfVgChE?domain=snp.id { Str8 113138 ZIP_ra(37.9%), 637.3K }
-#|--+ https://protect-eu.mimecast.com/s/0diWClOjDH12EMtyg-Gp?domain=snp.rs.id { Int32 113138 ZIP_ra(78.4%), 346.6K }
-#|--+ snp.position { Float64 113138 ZIP_ra(14.9%), 131.5K }
-#|--+ snp.chromosome { Int32 113138 ZIP_ra(0.10%), 481B }
-#|--+ snp.allele { Str8 113138 ZIP_ra(14.4%), 63.6K }
-#|--+ genotype { Bit2 268x113138, 7.2M } *
-#\--+ loc.metrics [ data.frame ] *
-# |--+ AlleleID { Int32,factor 113138 ZIP_ra(68.9%), 304.3K } *
-# |--+ CloneID { Int32 113138 ZIP_ra(78.4%), 346.6K }
-# |--+ ClusterTempIndex { Int32 113138 ZIP_ra(63.6%), 281.1K }
-# |--+ AlleleSequence { Int32,factor 113138 ZIP_ra(68.9%), 304.4K } *
-# |--+ ClusterConsensusSequence { Int32,factor 113138 ZIP_ra(66.2%), 292.5K } *
-# |--+ ClusterSize { Int32 113138 ZIP_ra(7.27%), 32.1K }
-# |--+ AlleleSeqDist { Int32 113138 ZIP_ra(8.49%), 37.5K }
-# |--+ SNP { Int32,factor 113138 ZIP_ra(38.3%), 169.2K } *
-# |--+ SnpPosition { Int32 113138 ZIP_ra(26.0%), 115.1K }
-# |--+ CallRate { Float64 113138 ZIP_ra(2.84%), 25.1K }
-# |--+ OneRatioRef { Float64 113138 ZIP_ra(32.7%), 289.2K }
-# |--+ OneRatioSnp { Float64 113138 ZIP_ra(36.1%), 318.8K }
-# |--+ FreqHomRef { Float64 113138 ZIP_ra(36.6%), 323.6K }
-# |--+ FreqHomSnp { Float64 113138 ZIP_ra(32.6%), 288.4K }
-# |--+ FreqHets { Float64 113138 ZIP_ra(20.0%), 177.2K }
-# |--+ PICRef { Float64 113138 ZIP_ra(29.9%), 264.1K }
-# |--+ PICSnp { Float64 113138 ZIP_ra(33.7%), 297.7K }
-# |--+ AvgPIC { Float64 113138 ZIP_ra(44.0%), 388.6K }
-# |--+ AvgCountRef { Float64 113138 ZIP_ra(55.3%), 489.1K }
-# |--+ AvgCountSnp { Float64 113138 ZIP_ra(36.6%), 323.8K }
-# |--+ RatioAvgCountRefAvgCountSnp { Float64 113138 ZIP_ra(57.6%), 509.2K }
-# |--+ FreqHetsMinusFreqMinHom { Float64 113138 ZIP_ra(31.6%), 279.2K }
-# |--+ AlleleCountsCorrelation { Float64 113138 ZIP_ra(48.2%), 425.8K }
-# |--+ aggregateTagsTotal { Int32 113138 ZIP_ra(0.10%), 481B }
-# |--+ DerivedCorrMinusSeedCorr { Int32 113138 ZIP_ra(0.10%), 478B }
-# |--+ RepRef { Float64 113138 ZIP_ra(2.50%), 22.1K }
-# |--+ RepSNP { Float64 113138 ZIP_ra(2.56%), 22.7K }
-# |--+ RepAvg { Float64 113138 ZIP_ra(0.38%), 3.4K }
-# |--+ PicRepRef { Float64 113138 ZIP_ra(3.02%), 26.7K }
-# |--+ PicRepSNP { Float64 113138 ZIP_ra(3.59%), 31.7K }
-# |--+ TotalPicRepRefTest { Int32 113138 ZIP_ra(9.95%), 44.0K }
-# |--+ TotalPicRepSnpTest { Int32 113138 ZIP_ra(10.2%), 45.2K }
-# |--+ clone { Int32,factor 113138 ZIP_ra(67.8%), 299.5K } *
-# \--+ uid { Int32,factor 113138 ZIP_ra(68.9%), 304.3K } *
-#NULL
-
-#Workaround to convert Dart format to 0-1-2 format
-library("SNPRelate")
-genofile <- snpgdsOpen("./gl2gds.gds")
-#snpgdsGDS2BED(genofile, bed.fn="test", snp.id=snpset)
-#Error in .InitFile(gdsobj, https://protect-eu.mimecast.com/s/cfduCj27LTYnmOHWrcoC?domain=sample.id = https://protect-eu.mimecast.com/s/cfduCj27LTYnmOHWrcoC?domain=sample.id, https://protect-eu.mimecast.com/s/byfzCk59DIkOBwfVgChE?domain=snp.id = https://protect-eu.mimecast.com/s/byfzCk59DIkOBwfVgChE?domain=snp.id) :
-# object 'snpset' not found
-snpgdsGDS2BED(genofile, bed.fn="test")
-
-
diff -r 27b626c1c120 -r ae6b94d10bff dartseq_seeduk_1b.R
--- a/dartseq_seeduk_1b.R Fri Apr 20 11:15:45 2018 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,67 +0,0 @@
-# yum install gdal-devel libcurl-devel libpng-devel mesa-libGL-devel mesa-libGLU-devel proj-nad proj-epsg
-#
-# source("https://bioconductor.org/biocLite.R")
-# biocLite("SNPRelate")
-# biocLite("qvalue")
-#
-# install.packages("dartR")
-# install.packages("amap")
-
-args = commandArgs(trailingOnly=TRUE)
-
-
-library("dartR")
-
-#Read DarT data
-gl <- gl.read.dart(filename=args[1], nas = "-", topskip = 5, lastmetric = "TotalPicRepSnpTest", probar = TRUE)
-gl <- gl.filter.monomorphs(gl)
-
-#Trying to determine if one row or two row format...
-#Found 2 row(s) format. Proceed...
-#Added the following covmetrics:
-#AlleleID CloneID ClusterTempIndex AlleleSequence ClusterConsensusSequence ClusterSize AlleleSeqDist SNP SnpPosition CallRate OneRatioRef OneRatioSnp FreqHomRef FreqHomSnp FreqHets PICRef PICSnp AvgPIC AvgCountRef AvgCountSnp RatioAvgCountRefAvgCountSnp FreqHetsMinusFreqMinHom AlleleCountsCorrelation aggregateTagsTotal DerivedCorrMinusSeedCorr RepRef RepSNP RepAvg PicRepRef PicRepSNP TotalPicRepRefTest TotalPicRepSnpTest .
-#Number of rows per Clone. Should be only 2 s: 2
-# Recognised: 376 individuals and 113138 SNPs in a 2 row format using /data/projects/seed/dart_calls/BBSRC-Panel-DArTSEQ-SNPs.csv
-#Start conversion....
-#Format is 2 rows.
-#Please note conversion of bigger data sets will take some time!
-#Once finished, we recommend to save the object using save(object, file="object.rdata")
-# |======================================================================| 100%
-#>
-gl.report.callrate(gl)
-#Reporting for a genlight object
-#Note: Missing values most commonly arise from restriction site mutation.
-
-# Loci with no missing values = 499 [0.4%]
-# < 5% missing values = 23669 [20.9%]
-# < 10% missing values = 45298 [40%]
-# < 15% missing values = 60678 [53.6%]
-# < 20% missing values = 72478 [64.1%]
-# < 25% missing values = 81629 [72.1%]
-# < 30% missing values = 89227 [78.9%]
-# < 35% missing values = 95969 [84.8%]
-# < 40% missing values = 101973 [90.1%]
-# < 45% missing values = 107590 [95.1%]
-# < 50% missing values = 113138 [100%]
-#[1] "Completed"
-gl.report.callrate(gl,method='ind' )
-#Reporting for a genlight object
-#Note: Missing values most commonly arise from restriction site mutation.
-
-#Individuals no missing values = 0 [0%] across loci
-#Individuals with less than 5% missing values = 1 [0.3%]
-#Individuals with less than 10% missing values = 73 [19.4%]
-#Individuals with less than 15% missing values = 194 [51.6%]
-#Individuals with less than 20% missing values = 268 [71.3%]
-#Individuals with less than 25% missing values = 320 [85.1%]
-#Individuals with less than 30% missing values = 341 [90.7%]
-#Individuals with less than 35% missing values = 352 [93.6%]
-#Individuals with less than 40% missing values = 358 [95.2%]
-#Individuals with less than 45% missing values = 366 [97.3%]
-#Individuals with less than 50% missing values = 371 [98.7%]
-#Individuals with less than 55% missing values = 372 [98.9%]
-#Individuals with less than 60% missing values = 374 [99.5%]
-#Individuals with less than 65% missing values = 375 [99.7%]
-#[1] "Completed"
-#
-#
diff -r 27b626c1c120 -r ae6b94d10bff dartseq_seeduk_1c.R
--- a/dartseq_seeduk_1c.R Fri Apr 20 11:15:45 2018 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,193 +0,0 @@
-# yum install gdal-devel libcurl-devel libpng-devel mesa-libGL-devel mesa-libGLU-devel proj-nad proj-epsg
-#
-# source("https://bioconductor.org/biocLite.R")
-# biocLite("SNPRelate")
-# biocLite("qvalue")
-#
-# install.packages("dartR")
-# install.packages("amap")
-
-args = commandArgs(trailingOnly=TRUE)
-
-
-library("dartR")
-
-#Read DarT data
-gl <- gl.read.dart(filename=args[1], nas = "-", topskip = 5, lastmetric = "TotalPicRepSnpTest", probar = TRUE)
-gl <- gl.filter.monomorphs(gl)
-
-#Trying to determine if one row or two row format...
-#Found 2 row(s) format. Proceed...
-#Added the following covmetrics:
-#AlleleID CloneID ClusterTempIndex AlleleSequence ClusterConsensusSequence ClusterSize AlleleSeqDist SNP SnpPosition CallRate OneRatioRef OneRatioSnp FreqHomRef FreqHomSnp FreqHets PICRef PICSnp AvgPIC AvgCountRef AvgCountSnp RatioAvgCountRefAvgCountSnp FreqHetsMinusFreqMinHom AlleleCountsCorrelation aggregateTagsTotal DerivedCorrMinusSeedCorr RepRef RepSNP RepAvg PicRepRef PicRepSNP TotalPicRepRefTest TotalPicRepSnpTest .
-#Number of rows per Clone. Should be only 2 s: 2
-# Recognised: 376 individuals and 113138 SNPs in a 2 row format using /data/projects/seed/dart_calls/BBSRC-Panel-DArTSEQ-SNPs.csv
-#Start conversion....
-#Format is 2 rows.
-#Please note conversion of bigger data sets will take some time!
-#Once finished, we recommend to save the object using save(object, file="object.rdata")
-# |======================================================================| 100%
-#>
-gl.report.callrate(gl)
-#Reporting for a genlight object
-#Note: Missing values most commonly arise from restriction site mutation.
-
-# Loci with no missing values = 499 [0.4%]
-# < 5% missing values = 23669 [20.9%]
-# < 10% missing values = 45298 [40%]
-# < 15% missing values = 60678 [53.6%]
-# < 20% missing values = 72478 [64.1%]
-# < 25% missing values = 81629 [72.1%]
-# < 30% missing values = 89227 [78.9%]
-# < 35% missing values = 95969 [84.8%]
-# < 40% missing values = 101973 [90.1%]
-# < 45% missing values = 107590 [95.1%]
-# < 50% missing values = 113138 [100%]
-#[1] "Completed"
-gl.report.callrate(gl,method='ind' )
-#Reporting for a genlight object
-#Note: Missing values most commonly arise from restriction site mutation.
-
-#Individuals no missing values = 0 [0%] across loci
-#Individuals with less than 5% missing values = 1 [0.3%]
-#Individuals with less than 10% missing values = 73 [19.4%]
-#Individuals with less than 15% missing values = 194 [51.6%]
-#Individuals with less than 20% missing values = 268 [71.3%]
-#Individuals with less than 25% missing values = 320 [85.1%]
-#Individuals with less than 30% missing values = 341 [90.7%]
-#Individuals with less than 35% missing values = 352 [93.6%]
-#Individuals with less than 40% missing values = 358 [95.2%]
-#Individuals with less than 45% missing values = 366 [97.3%]
-#Individuals with less than 50% missing values = 371 [98.7%]
-#Individuals with less than 55% missing values = 372 [98.9%]
-#Individuals with less than 60% missing values = 374 [99.5%]
-#Individuals with less than 65% missing values = 375 [99.7%]
-#[1] "Completed"
-#
-#
-gl_call_rate <- gl.filter.callrate(gl,method = 'loc', t=as.double(args[2]))
-#Reporting for a genlight object
-#Note: Missing values most commonly arise from restriction site mutation.
-#
-#Initial no. of loci = 113138
-# No. of loci deleted = 31509
-#Summary of filtered dataset
-# Call Rate > 0.75
-# No. of loci: 81629
-# No. of individuals: 376
-# No. of populations: 0
-#
-gl_rep <- gl.filter.repavg(gl_call_rate,t=as.double(args[3]))
-#Reporting for a genlight object
-#Note: RepAvg is a DArT statistic reporting reproducibility averaged across alleles for each locus.
-
-#Initial no. of loci = 81629
-#No. of loci deleted = 6446
-#Summary of filtered dataset
-# Reproducibility >= 0.98
-# No. of loci: 75183
-# No. of individuals: 376
-# No. of populations: 0
-
-gl.report.callrate(gl_rep,method='ind' )
-#Reporting for a genlight object
-#Note: Missing values most commonly arise from restriction site mutation.
-
-#Individuals no missing values = 0 [0%] across loci
-#Individuals with less than 5% missing values = 161 [42.8%]
-#Individuals with less than 10% missing values = 245 [65.2%]
-#Individuals with less than 15% missing values = 301 [80.1%]
-#Individuals with less than 20% missing values = 337 [89.6%]
-#Individuals with less than 25% missing values = 347 [92.3%]
-#Individuals with less than 30% missing values = 358 [95.2%]
-#Individuals with less than 35% missing values = 359 [95.5%]
-#Individuals with less than 40% missing values = 364 [96.8%]
-#Individuals with less than 45% missing values = 372 [98.9%]
-#Individuals with less than 50% missing values = 373 [99.2%]
-#Individuals with less than 55% missing values = 374 [99.5%]
-#Individuals with less than 60% missing values = 375 [99.7%]
-#[1] "Completed"
-
-gl_final <- gl.filter.callrate(gl_rep,method = 'ind', t=as.double(args[4]))
-#Reporting for a genlight object
-#Note: Missing values most commonly arise from restriction site mutation.
-
-#Initial no. of individuals = 376
-#Filtering a genlight object
-# no. of individuals deleted = 39
-#Individuals retained = 337
-#List of individuals deleted because of low call rate
-# 908017247001_E_5 908017247001_F_4 908017247002_A_10 908017247002_B_4 908017247002_B_5 908017247002_C_3 908017247002_D_12 908017247002_D_2 908017247002_D_6 908017247002_D_9 908017247002_E_6 908017247002_E_7 908017247002_E_9 908017247002_F_2 908017247002_F_6 908017247002_G_8 908017247002_H_10 908017247002_H_7 908017247002_H_8 908017247003_B_8 908017247003_C_8 908017247003_D_8 908017247003_E_8 908017247003_F_8 908017247003_G_6 908017247003_G_8 908017247003_H_7 908017247004_C_11 908017247004_D_11 908017247004_D_8 908017247004_D_9 908017247004_E_10 908017247004_E_11 908017247004_E_9 908017247004_F_11 908017247004_F_12 908017247004_F_6 908017247004_G_11 908017247004_H_11
-# from populations
-
-#Summary of filtered dataset
-# Call Rate > 0.8
-# No. of loci: 75183
- # No. of individuals: 337
- # No. of populations: 0
-#
-gl2gds(gl_final,outfile="gl2gds.gds")
-#Converting gl object to gds formatted file gl2gds.gds
-
-#Structure of gds file
-
-#The file name: /data/projects/seed/dart_calls/gl2gds.gds
-#The total number of samples: 268
-#The total number of SNPs: 113138
-#SNP genotypes are stored in SNP-major mode (Sample X SNP).
-#The SNP positions are not in ascending order on chromosome 1.
-#File: /data/projects/seed/dart_calls/gl2gds.gds (32.8M)
-#+ [ ] *
-#|--+ https://protect-eu.mimecast.com/s/cfduCj27LTYnmOHWrcoC?domain=sample.id { Str8 268 ZIP_ra(13.7%), 641B }
-#|--+ https://protect-eu.mimecast.com/s/byfzCk59DIkOBwfVgChE?domain=snp.id { Str8 113138 ZIP_ra(37.9%), 637.3K }
-#|--+ https://protect-eu.mimecast.com/s/0diWClOjDH12EMtyg-Gp?domain=snp.rs.id { Int32 113138 ZIP_ra(78.4%), 346.6K }
-#|--+ snp.position { Float64 113138 ZIP_ra(14.9%), 131.5K }
-#|--+ snp.chromosome { Int32 113138 ZIP_ra(0.10%), 481B }
-#|--+ snp.allele { Str8 113138 ZIP_ra(14.4%), 63.6K }
-#|--+ genotype { Bit2 268x113138, 7.2M } *
-#\--+ loc.metrics [ data.frame ] *
-# |--+ AlleleID { Int32,factor 113138 ZIP_ra(68.9%), 304.3K } *
-# |--+ CloneID { Int32 113138 ZIP_ra(78.4%), 346.6K }
-# |--+ ClusterTempIndex { Int32 113138 ZIP_ra(63.6%), 281.1K }
-# |--+ AlleleSequence { Int32,factor 113138 ZIP_ra(68.9%), 304.4K } *
-# |--+ ClusterConsensusSequence { Int32,factor 113138 ZIP_ra(66.2%), 292.5K } *
-# |--+ ClusterSize { Int32 113138 ZIP_ra(7.27%), 32.1K }
-# |--+ AlleleSeqDist { Int32 113138 ZIP_ra(8.49%), 37.5K }
-# |--+ SNP { Int32,factor 113138 ZIP_ra(38.3%), 169.2K } *
-# |--+ SnpPosition { Int32 113138 ZIP_ra(26.0%), 115.1K }
-# |--+ CallRate { Float64 113138 ZIP_ra(2.84%), 25.1K }
-# |--+ OneRatioRef { Float64 113138 ZIP_ra(32.7%), 289.2K }
-# |--+ OneRatioSnp { Float64 113138 ZIP_ra(36.1%), 318.8K }
-# |--+ FreqHomRef { Float64 113138 ZIP_ra(36.6%), 323.6K }
-# |--+ FreqHomSnp { Float64 113138 ZIP_ra(32.6%), 288.4K }
-# |--+ FreqHets { Float64 113138 ZIP_ra(20.0%), 177.2K }
-# |--+ PICRef { Float64 113138 ZIP_ra(29.9%), 264.1K }
-# |--+ PICSnp { Float64 113138 ZIP_ra(33.7%), 297.7K }
-# |--+ AvgPIC { Float64 113138 ZIP_ra(44.0%), 388.6K }
-# |--+ AvgCountRef { Float64 113138 ZIP_ra(55.3%), 489.1K }
-# |--+ AvgCountSnp { Float64 113138 ZIP_ra(36.6%), 323.8K }
-# |--+ RatioAvgCountRefAvgCountSnp { Float64 113138 ZIP_ra(57.6%), 509.2K }
-# |--+ FreqHetsMinusFreqMinHom { Float64 113138 ZIP_ra(31.6%), 279.2K }
-# |--+ AlleleCountsCorrelation { Float64 113138 ZIP_ra(48.2%), 425.8K }
-# |--+ aggregateTagsTotal { Int32 113138 ZIP_ra(0.10%), 481B }
-# |--+ DerivedCorrMinusSeedCorr { Int32 113138 ZIP_ra(0.10%), 478B }
-# |--+ RepRef { Float64 113138 ZIP_ra(2.50%), 22.1K }
-# |--+ RepSNP { Float64 113138 ZIP_ra(2.56%), 22.7K }
-# |--+ RepAvg { Float64 113138 ZIP_ra(0.38%), 3.4K }
-# |--+ PicRepRef { Float64 113138 ZIP_ra(3.02%), 26.7K }
-# |--+ PicRepSNP { Float64 113138 ZIP_ra(3.59%), 31.7K }
-# |--+ TotalPicRepRefTest { Int32 113138 ZIP_ra(9.95%), 44.0K }
-# |--+ TotalPicRepSnpTest { Int32 113138 ZIP_ra(10.2%), 45.2K }
-# |--+ clone { Int32,factor 113138 ZIP_ra(67.8%), 299.5K } *
-# \--+ uid { Int32,factor 113138 ZIP_ra(68.9%), 304.3K } *
-#NULL
-
-#Workaround to convert Dart format to 0-1-2 format
-library("SNPRelate")
-genofile <- snpgdsOpen("./gl2gds.gds")
-#snpgdsGDS2BED(genofile, bed.fn="test", snp.id=snpset)
-#Error in .InitFile(gdsobj, https://protect-eu.mimecast.com/s/cfduCj27LTYnmOHWrcoC?domain=sample.id = https://protect-eu.mimecast.com/s/cfduCj27LTYnmOHWrcoC?domain=sample.id, https://protect-eu.mimecast.com/s/byfzCk59DIkOBwfVgChE?domain=snp.id = https://protect-eu.mimecast.com/s/byfzCk59DIkOBwfVgChE?domain=snp.id) :
-# object 'snpset' not found
-snpgdsGDS2BED(genofile, bed.fn="test")
-
-
diff -r 27b626c1c120 -r ae6b94d10bff dartseq_seeduk_2.R
--- a/dartseq_seeduk_2.R Fri Apr 20 11:15:45 2018 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,17 +0,0 @@
-args = commandArgs(trailingOnly=TRUE)
-
-library("amap")
-library("labdsv")
-
-data <- read.table("plink.raw")
-distances <- Dist(data[2:338,7:75189], method = "euclidean", nbproc = 144)
-pco_results <- pco(distances,k=10)
-#Variance explained by first three PCOs
-pco_results$eig[1]/sum(pco_results$eig[pco_results$eig>0])
-#[1] 0.2565937
-pco_results$eig[2]/sum(pco_results$eig[pco_results$eig>0])
-#[1] 0.06878127
-pco_results$eig[3]/sum(pco_results$eig[pco_results$eig>0])
-#[1] 0.04340111
-write.csv(pco_results$points,file=args[1])
-write.csv(data[,2],file="PCO_sample_names.csv")
diff -r 27b626c1c120 -r ae6b94d10bff dartseq_seeduk_2a.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/dartseq_seeduk_2a.R Fri Apr 20 12:42:06 2018 -0400
@@ -0,0 +1,193 @@
+# yum install gdal-devel libcurl-devel libpng-devel mesa-libGL-devel mesa-libGLU-devel proj-nad proj-epsg
+#
+# source("https://bioconductor.org/biocLite.R")
+# biocLite("SNPRelate")
+# biocLite("qvalue")
+#
+# install.packages("dartR")
+# install.packages("amap")
+
+args = commandArgs(trailingOnly=TRUE)
+
+
+library("dartR")
+
+#Read DarT data
+gl <- gl.read.dart(filename=args[1], nas = "-", topskip = 5, lastmetric = "TotalPicRepSnpTest", probar = TRUE)
+gl <- gl.filter.monomorphs(gl)
+
+#Trying to determine if one row or two row format...
+#Found 2 row(s) format. Proceed...
+#Added the following covmetrics:
+#AlleleID CloneID ClusterTempIndex AlleleSequence ClusterConsensusSequence ClusterSize AlleleSeqDist SNP SnpPosition CallRate OneRatioRef OneRatioSnp FreqHomRef FreqHomSnp FreqHets PICRef PICSnp AvgPIC AvgCountRef AvgCountSnp RatioAvgCountRefAvgCountSnp FreqHetsMinusFreqMinHom AlleleCountsCorrelation aggregateTagsTotal DerivedCorrMinusSeedCorr RepRef RepSNP RepAvg PicRepRef PicRepSNP TotalPicRepRefTest TotalPicRepSnpTest .
+#Number of rows per Clone. Should be only 2 s: 2
+# Recognised: 376 individuals and 113138 SNPs in a 2 row format using /data/projects/seed/dart_calls/BBSRC-Panel-DArTSEQ-SNPs.csv
+#Start conversion....
+#Format is 2 rows.
+#Please note conversion of bigger data sets will take some time!
+#Once finished, we recommend to save the object using save(object, file="object.rdata")
+# |======================================================================| 100%
+#>
+gl.report.callrate(gl)
+#Reporting for a genlight object
+#Note: Missing values most commonly arise from restriction site mutation.
+
+# Loci with no missing values = 499 [0.4%]
+# < 5% missing values = 23669 [20.9%]
+# < 10% missing values = 45298 [40%]
+# < 15% missing values = 60678 [53.6%]
+# < 20% missing values = 72478 [64.1%]
+# < 25% missing values = 81629 [72.1%]
+# < 30% missing values = 89227 [78.9%]
+# < 35% missing values = 95969 [84.8%]
+# < 40% missing values = 101973 [90.1%]
+# < 45% missing values = 107590 [95.1%]
+# < 50% missing values = 113138 [100%]
+#[1] "Completed"
+gl.report.callrate(gl,method='ind' )
+#Reporting for a genlight object
+#Note: Missing values most commonly arise from restriction site mutation.
+
+#Individuals no missing values = 0 [0%] across loci
+#Individuals with less than 5% missing values = 1 [0.3%]
+#Individuals with less than 10% missing values = 73 [19.4%]
+#Individuals with less than 15% missing values = 194 [51.6%]
+#Individuals with less than 20% missing values = 268 [71.3%]
+#Individuals with less than 25% missing values = 320 [85.1%]
+#Individuals with less than 30% missing values = 341 [90.7%]
+#Individuals with less than 35% missing values = 352 [93.6%]
+#Individuals with less than 40% missing values = 358 [95.2%]
+#Individuals with less than 45% missing values = 366 [97.3%]
+#Individuals with less than 50% missing values = 371 [98.7%]
+#Individuals with less than 55% missing values = 372 [98.9%]
+#Individuals with less than 60% missing values = 374 [99.5%]
+#Individuals with less than 65% missing values = 375 [99.7%]
+#[1] "Completed"
+#
+#
+gl_call_rate <- gl.filter.callrate(gl,method = 'loc', t=as.double(args[2]))
+#Reporting for a genlight object
+#Note: Missing values most commonly arise from restriction site mutation.
+#
+#Initial no. of loci = 113138
+# No. of loci deleted = 31509
+#Summary of filtered dataset
+# Call Rate > 0.75
+# No. of loci: 81629
+# No. of individuals: 376
+# No. of populations: 0
+#
+gl_rep <- gl.filter.repavg(gl_call_rate,t=as.double(args[3]))
+#Reporting for a genlight object
+#Note: RepAvg is a DArT statistic reporting reproducibility averaged across alleles for each locus.
+
+#Initial no. of loci = 81629
+#No. of loci deleted = 6446
+#Summary of filtered dataset
+# Reproducibility >= 0.98
+# No. of loci: 75183
+# No. of individuals: 376
+# No. of populations: 0
+
+gl.report.callrate(gl_rep,method='ind' )
+#Reporting for a genlight object
+#Note: Missing values most commonly arise from restriction site mutation.
+
+#Individuals no missing values = 0 [0%] across loci
+#Individuals with less than 5% missing values = 161 [42.8%]
+#Individuals with less than 10% missing values = 245 [65.2%]
+#Individuals with less than 15% missing values = 301 [80.1%]
+#Individuals with less than 20% missing values = 337 [89.6%]
+#Individuals with less than 25% missing values = 347 [92.3%]
+#Individuals with less than 30% missing values = 358 [95.2%]
+#Individuals with less than 35% missing values = 359 [95.5%]
+#Individuals with less than 40% missing values = 364 [96.8%]
+#Individuals with less than 45% missing values = 372 [98.9%]
+#Individuals with less than 50% missing values = 373 [99.2%]
+#Individuals with less than 55% missing values = 374 [99.5%]
+#Individuals with less than 60% missing values = 375 [99.7%]
+#[1] "Completed"
+
+gl_final <- gl.filter.callrate(gl_rep,method = 'ind', t=as.double(args[4]))
+#Reporting for a genlight object
+#Note: Missing values most commonly arise from restriction site mutation.
+
+#Initial no. of individuals = 376
+#Filtering a genlight object
+# no. of individuals deleted = 39
+#Individuals retained = 337
+#List of individuals deleted because of low call rate
+# 908017247001_E_5 908017247001_F_4 908017247002_A_10 908017247002_B_4 908017247002_B_5 908017247002_C_3 908017247002_D_12 908017247002_D_2 908017247002_D_6 908017247002_D_9 908017247002_E_6 908017247002_E_7 908017247002_E_9 908017247002_F_2 908017247002_F_6 908017247002_G_8 908017247002_H_10 908017247002_H_7 908017247002_H_8 908017247003_B_8 908017247003_C_8 908017247003_D_8 908017247003_E_8 908017247003_F_8 908017247003_G_6 908017247003_G_8 908017247003_H_7 908017247004_C_11 908017247004_D_11 908017247004_D_8 908017247004_D_9 908017247004_E_10 908017247004_E_11 908017247004_E_9 908017247004_F_11 908017247004_F_12 908017247004_F_6 908017247004_G_11 908017247004_H_11
+# from populations
+
+#Summary of filtered dataset
+# Call Rate > 0.8
+# No. of loci: 75183
+ # No. of individuals: 337
+ # No. of populations: 0
+#
+gl2gds(gl_final,outfile="gl2gds.gds")
+#Converting gl object to gds formatted file gl2gds.gds
+
+#Structure of gds file
+
+#The file name: /data/projects/seed/dart_calls/gl2gds.gds
+#The total number of samples: 268
+#The total number of SNPs: 113138
+#SNP genotypes are stored in SNP-major mode (Sample X SNP).
+#The SNP positions are not in ascending order on chromosome 1.
+#File: /data/projects/seed/dart_calls/gl2gds.gds (32.8M)
+#+ [ ] *
+#|--+ https://protect-eu.mimecast.com/s/cfduCj27LTYnmOHWrcoC?domain=sample.id { Str8 268 ZIP_ra(13.7%), 641B }
+#|--+ https://protect-eu.mimecast.com/s/byfzCk59DIkOBwfVgChE?domain=snp.id { Str8 113138 ZIP_ra(37.9%), 637.3K }
+#|--+ https://protect-eu.mimecast.com/s/0diWClOjDH12EMtyg-Gp?domain=snp.rs.id { Int32 113138 ZIP_ra(78.4%), 346.6K }
+#|--+ snp.position { Float64 113138 ZIP_ra(14.9%), 131.5K }
+#|--+ snp.chromosome { Int32 113138 ZIP_ra(0.10%), 481B }
+#|--+ snp.allele { Str8 113138 ZIP_ra(14.4%), 63.6K }
+#|--+ genotype { Bit2 268x113138, 7.2M } *
+#\--+ loc.metrics [ data.frame ] *
+# |--+ AlleleID { Int32,factor 113138 ZIP_ra(68.9%), 304.3K } *
+# |--+ CloneID { Int32 113138 ZIP_ra(78.4%), 346.6K }
+# |--+ ClusterTempIndex { Int32 113138 ZIP_ra(63.6%), 281.1K }
+# |--+ AlleleSequence { Int32,factor 113138 ZIP_ra(68.9%), 304.4K } *
+# |--+ ClusterConsensusSequence { Int32,factor 113138 ZIP_ra(66.2%), 292.5K } *
+# |--+ ClusterSize { Int32 113138 ZIP_ra(7.27%), 32.1K }
+# |--+ AlleleSeqDist { Int32 113138 ZIP_ra(8.49%), 37.5K }
+# |--+ SNP { Int32,factor 113138 ZIP_ra(38.3%), 169.2K } *
+# |--+ SnpPosition { Int32 113138 ZIP_ra(26.0%), 115.1K }
+# |--+ CallRate { Float64 113138 ZIP_ra(2.84%), 25.1K }
+# |--+ OneRatioRef { Float64 113138 ZIP_ra(32.7%), 289.2K }
+# |--+ OneRatioSnp { Float64 113138 ZIP_ra(36.1%), 318.8K }
+# |--+ FreqHomRef { Float64 113138 ZIP_ra(36.6%), 323.6K }
+# |--+ FreqHomSnp { Float64 113138 ZIP_ra(32.6%), 288.4K }
+# |--+ FreqHets { Float64 113138 ZIP_ra(20.0%), 177.2K }
+# |--+ PICRef { Float64 113138 ZIP_ra(29.9%), 264.1K }
+# |--+ PICSnp { Float64 113138 ZIP_ra(33.7%), 297.7K }
+# |--+ AvgPIC { Float64 113138 ZIP_ra(44.0%), 388.6K }
+# |--+ AvgCountRef { Float64 113138 ZIP_ra(55.3%), 489.1K }
+# |--+ AvgCountSnp { Float64 113138 ZIP_ra(36.6%), 323.8K }
+# |--+ RatioAvgCountRefAvgCountSnp { Float64 113138 ZIP_ra(57.6%), 509.2K }
+# |--+ FreqHetsMinusFreqMinHom { Float64 113138 ZIP_ra(31.6%), 279.2K }
+# |--+ AlleleCountsCorrelation { Float64 113138 ZIP_ra(48.2%), 425.8K }
+# |--+ aggregateTagsTotal { Int32 113138 ZIP_ra(0.10%), 481B }
+# |--+ DerivedCorrMinusSeedCorr { Int32 113138 ZIP_ra(0.10%), 478B }
+# |--+ RepRef { Float64 113138 ZIP_ra(2.50%), 22.1K }
+# |--+ RepSNP { Float64 113138 ZIP_ra(2.56%), 22.7K }
+# |--+ RepAvg { Float64 113138 ZIP_ra(0.38%), 3.4K }
+# |--+ PicRepRef { Float64 113138 ZIP_ra(3.02%), 26.7K }
+# |--+ PicRepSNP { Float64 113138 ZIP_ra(3.59%), 31.7K }
+# |--+ TotalPicRepRefTest { Int32 113138 ZIP_ra(9.95%), 44.0K }
+# |--+ TotalPicRepSnpTest { Int32 113138 ZIP_ra(10.2%), 45.2K }
+# |--+ clone { Int32,factor 113138 ZIP_ra(67.8%), 299.5K } *
+# \--+ uid { Int32,factor 113138 ZIP_ra(68.9%), 304.3K } *
+#NULL
+
+#Workaround to convert Dart format to 0-1-2 format
+library("SNPRelate")
+genofile <- snpgdsOpen("./gl2gds.gds")
+#snpgdsGDS2BED(genofile, bed.fn="test", snp.id=snpset)
+#Error in .InitFile(gdsobj, https://protect-eu.mimecast.com/s/cfduCj27LTYnmOHWrcoC?domain=sample.id = https://protect-eu.mimecast.com/s/cfduCj27LTYnmOHWrcoC?domain=sample.id, https://protect-eu.mimecast.com/s/byfzCk59DIkOBwfVgChE?domain=snp.id = https://protect-eu.mimecast.com/s/byfzCk59DIkOBwfVgChE?domain=snp.id) :
+# object 'snpset' not found
+snpgdsGDS2BED(genofile, bed.fn="test")
+
+
diff -r 27b626c1c120 -r ae6b94d10bff dartseq_seeduk_2b.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/dartseq_seeduk_2b.R Fri Apr 20 12:42:06 2018 -0400
@@ -0,0 +1,17 @@
+args = commandArgs(trailingOnly=TRUE)
+
+library("amap")
+library("labdsv")
+
+data <- read.table("plink.raw",header=T)
+distances <- Dist(data[,7:dim(data)[2]], method = "euclidean", nbproc = 144)
+pco_results <- pco(distances,k=10)
+#Variance explained by first three PCOs
+pco_results$eig[1]/sum(pco_results$eig[pco_results$eig>0])
+#[1] 0.2565937
+pco_results$eig[2]/sum(pco_results$eig[pco_results$eig>0])
+#[1] 0.06878127
+pco_results$eig[3]/sum(pco_results$eig[pco_results$eig>0])
+#[1] 0.04340111
+write.csv(pco_results$points,file=args[1])
+
diff -r 27b626c1c120 -r ae6b94d10bff ukseed.xml
--- a/ukseed.xml Fri Apr 20 11:15:45 2018 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,47 +0,0 @@
-
- run a dartseq analysis
-
- bioconductor-snprelate
- plink
- r-amap
- r-dartr
- r-labdsv
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-In **UK-SeeD Data Analysis Infrastructure**, a BBSRC-Newton funded project, we have deployed an advanced computing hardware and software platform for the
-analysis of large genomics datasets for wheat varieties. The platform integrates computing resources and bioinformatics expertise to
-enable crop geneticists to implement sophisticated data analysis algorithms to improve the use of genetic resources for wheat and
-other important crops. The computing platform is distributed across the partners’ sites with hardware deployed at CIMMYT (Mexico) and
-the Earlham Institute (UK).
-
-|LOGOS|
-
-.. |LOGOS| image:: https://ics.hutton.ac.uk/resources/ukseed/logos.png
-
-
-
-
-
\ No newline at end of file
diff -r 27b626c1c120 -r ae6b94d10bff ukseed_stage1.xml
--- a/ukseed_stage1.xml Fri Apr 20 11:15:45 2018 -0400
+++ b/ukseed_stage1.xml Fri Apr 20 12:42:06 2018 -0400
@@ -10,7 +10,7 @@
$output
+ Rscript --vanilla $__tool_directory__/dartseq_seeduk_1.R $input > $output
]]>
diff -r 27b626c1c120 -r ae6b94d10bff ukseed_stage2.xml
--- a/ukseed_stage2.xml Fri Apr 20 11:15:45 2018 -0400
+++ b/ukseed_stage2.xml Fri Apr 20 12:42:06 2018 -0400
@@ -10,11 +10,11 @@