# HG changeset patch # User iuc # Date 1601492739 0 # Node ID e52300a0000f3e0e5839ec4c87df1065c6d5fb40 # Parent 4f98ba28a3f2ceea015548de6db73afd08e978f0 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 94e69abb568077267eb8b15ef624624e2899a750" diff -r 4f98ba28a3f2 -r e52300a0000f macros.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/macros.xml Wed Sep 30 19:05:39 2020 +0000 @@ -0,0 +1,30 @@ + + + 1.0 + 19.09 + + + + + + + + + + + + + + + + @misc{None, + journal = {None}, + author = {1. Stuber T}, + title = {Manuscript in preparation}, + year = {None}, + url = {https://github.com/USDA-VS/vSNP},} + + + + + diff -r 4f98ba28a3f2 -r e52300a0000f test-data/NC_002945v4.fasta --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/NC_002945v4.fasta Wed Sep 30 19:05:39 2020 +0000 @@ -0,0 +1,101 @@ +>NC_002945.4 Mycobacterium bovis AF2122/97 genome assembly, chromosome: Mycobacterium_bovis_AF2122/97 +TTGACCGATGACCCCGGTTCAGGCTTCACCACAGTGTGGAACGCGGTCGTCTCCGAACTTAACGGCGACC +CTAAGGTTGACGACGGACCCAGCAGTGATGCTAATCTCAGCGCTCCGCTGACCCCTCAGCAAAGGGCTTG +GCTCAATCTCGTCCAGCCATTGACCATCGTCGAGGGGTTTGCTCTGTTATCCGTGCCGAGCAGCTTTGTC +CAAAACGAAATCGAGCGCCATCTGCGGGCCCCGATTACCGACGCTCTCAGCCGCCGACTCGGACATCAGA +TCCAACTCGGGGTCCGCATCGCTCCGCCGGCGACCGACGAAGCCGACGACACTACCGTGCCGCCTTCCGA +AAATCCTGCTACCACATCGCCAGACACCACAACCGACAACGACGAGATTGATGACAGCGCTGCGGCACGG +GGCGATAACCAGCACAGTTGGCCAAGTTACTTCACCGAGCGCCCGCGCAATACCGATTCCGCTACCGCTG +GCGTAACCAGCCTTAACCGTCGCTACACCTTTGATACGTTCGTTATCGGCGCCTCCAACCGGTTCGCGCA +CGCCGCCGCCTTGGCGATCGCAGAAGCACCCGCCCGCGCTTACAACCCCCTGTTCATCTGGGGCGAGTCC +GGTCTCGGCAAGACACACCTGCTACACGCGGCAGGCAACTATGCCCAACGGTTGTTCCCGGGAATGCGGG +TCAAATATGTCTCCACCGAGGAATTCACCAACGACTTCATTAACTCGCTCCGCGATGACCGCAAGGTCGC +ATTCAAACGCAGCTACCGCGACGTAGACGTGCTGTTGGTCGACGACATCCAATTCATTGAAGGCAAAGAG +GGTATTCAAGAGGAGTTCTTCCACACCTTCAACACCTTGCACAATGCCAACAAGCAAATCGTCATCTCAT +CTGACCGCCCACCCAAGCAGCTCGCCACCCTCGAGGACCGGCTGAGAACCCGCTTTGAGTGGGGGCTGAT +CACTGACGTACAACCACCCGAGCTGGAGACCCGCATCGCCATCTTGCGCAAGAAAGCACAGATGGAACGG +CTCGCGATCCCCGACGATGTCCTCGAACTCATCGCCAGCAGTATCGAACGCAATATCCGTGAACTCGAGG +GCGCGCTGATCCGGGTCACCGCGTTCGCCTCATTGAACAAAACACCAATCGACAAAGCGCTGGCCGAGAT +TGTGCTTCGCGATCTGATCGCCGACGCCAACACCATGCAAATCAGCGCGGCGACGATCATGGCTGCCACC +GCCGAATACTTCGACACTACCGTCGAAGAGCTTCGCGGGCCCGGCAAGACCCGAGCACTGGCCCAGTCAC +GACAGATTGCGATGTACCTGTGTCGTGAGCTCACCGATCTTTCGTTGCCCAAAATCGGCCAAGCGTTCGG +CCGTGATCACACAACCGTCATGTACGCCCAACGCAAGATCCTGTCCGAGATGGCCGAGCGCCGTGAGGTC +TTTGATCACGTCAAAGAACTCACCACTCGCATCCGTCAGCGCTCCAAGCGCTAGCACGGCGTGTTCTTCC +GACAACGTTCTTAAAAAAACTTCTCTCTCCCAGGTCACACCAGTCACAGAGATTGGCTGTGAGTGTCGCT +GTGCACAAACCGCGCACAGACTCATACAGTCCCGGCGGTTCCGTTCACAACCCACGCCTCATCCCCACCG +ACCCAACACACACCCCACAGTCATCGCCACCGTCATCCACAACTCCGACCGACGTCGACCTGCACCAAGA +CCAGACTGTCCCCAAACTGCACACCCTCTAATACTGTTACCGAGATTTCTTCGTCGTTTGTTCTTGGAAA +GACAGCGCTGGGGATCGTTCGCTGGATACCACCCGCATAACTGGCTCGTCGCGGTGGGTCAGAGGTCAAT +GATGAACTTTCAAGTTGACGTGAGAAGCTCTACGGTTGTTGTTCGACTGCTGTTGCGGCCGTCGTGGCGG +GTCACGCGTCATGGGCGTTCGTCGTTGGCAGTCCCCACGCTAGCGGGGCGCTAGCCACGGGATCGAACTC +ATCGTGAGGTGAAAGGGCGCAATGGACGCGGCTACGACAAGAGTTGGCCTCACCGACTTGACGTTTCGTT +TGCTACGAGAGTCTTTCGCCGATGCGGTGTCGTGGGTGGCTAAAAATCTGCCAGCCAGGCCCGCGGTGCC +GGTGCTCTCCGGCGTGTTGTTGACCGGCTCGGACAACGGTCTGACGATTTCCGGATTCGACTACGAGGTT +TCCGCCGAGGCCCAGGTTGGCGCTGAAATTGTTTCTCCTGGAAGCGTTTTAGTTTCTGGCCGATTGTTGT +CCGATATTACCCGGGCGTTGCCTAACAAGCCCGTAGGCGTTCATGTCGAAGGTAACCGGGTCGCATTGAC +CTGCGGTAACGCCAGGTTTTCGCTACCGACGATGCCAGTCGAGGATTATCCGACGCTGCCGACGCTGCCG +GAAGAGACCGGATTGTTGCCTGCGGAATTATTCGCCGAGGCAATCAGTCAGGTCGCTATCGCCGCCGGCC +GGGACGACACGCTGCCTATGTTGACCGGCATCCGGGTCGAAATCCTCGGTGAGACGGTGGTTTTGGCCGC +TACCGACAGGTTTCGCCTGGCTGTTCGAGAACTGAAGTGGTCGGCGTCGTCGCCAGATATCGAAGCGGCT +GTGCTGGTCCCGGCCAAGACGCTGGCCGAGGCCGCCAAAGCGGGCATCGGCGGCTCTGACGTTCGTTTGT +CGTTGGGTACTGGGCCGGGGGTGGGCAAGGATGGCCTGCTCGGTATCAGTGGGAACGGCAAGCGCAGCAC +CACGCGACTTCTTGATGCCGAGTTCCCGAAGTTTCGGCAGTTGCTACCAACCGAACACACCGCGGTGGCC +ACCATGGACGTGGCCGAGTTGATCGAAGCGATCAAGCTGGTTGCGTTGGTAGCTGATCGGGGCGCGCAGG +TGCGCATGGAGTTCGCTGATGGCAGCGTGCGGCTTTCTGCGGGTGCCGATGATGTTGGACGAGCCGAGGA +AGATCTTGTTGTTGACTATGCCGGTGAACCATTGACGATTGCGTTTAACCCAACCTATCTAACGGACGGT +TTGAGTTCGTTGCGCTCGGAGCGAGTGTCTTTCGGGTTTACGACTGCGGGTAAGCCTGCCTTGCTACGTC +CGGTGTCCGGGGACGATCGCCCTGTGGCGGGTCTGAATGGCAACGGTCCGTTCCCGGCGGTGTCGACGGA +CTATGTCTATCTGTTGATGCCGGTTCGGTTGCCGGGCTGAGCACTTGGCGCCCGGGTAGGTGTACGTCCG +TCATTTGGGGCTGCGTGACTTCCGGTCCTGGGCATGTGTAGATCTGGAATTGCATCCAGGGCGGACGGTT +TTTGTTGGGCCTAACGGTTATGGTAAGACGAATCTTATTGAGGCACTGTGGTATTCGACGACGTTAGGTT +CGCACCGCGTTAGCGCCGATTTGCCGTTGATCCGGGTAGGTACCGATCGTGCGGTGATCTCCACGATCGT +GGTGAACGACGGTAGAGAATGTGCCGTCGACCTCGAGATCGCCACGGGGCGAGTCAACAAAGCGCGATTG +AATCGATCATCGGTCCGAAGTACACGTGATGTGGTCGGAGTGCTTCGAGCTGTGTTGTTTGCCCCTGAGG +ATCTGGGGTTGGTTCGTGGGGATCCCGCTGACCGGCGGCGCTATCTGGATGATCTGGCGATCGTGCGTAG +GCCTGCGATCGCTGCGGTACGAGCCGAATATGAGAGGGTGGTGCGCCAGCGGACGGCGTTATTGAAGTCC +GTACCTGGAGCACGGTATCGGGGTGACCGGGGTGTGTTTGACACTCTTGAGGTATGGGACAGTCGTTTGG +CGGAGCACGGGGCTGAACTGGTGGCCGCCCGCATCGATTTGGTCAACCAGTTGGCACCGGAAGTGAAGAA +GGCATACCAGCTGTTGGCGCCGGAATCGCGATCGGCGTCTATCGGTTATCGGGCCAGCATGGATGTAACC +GGTCCCAGCGAGCAGTCAGATACCGATCGGCAATTGTTAGCAGCTCGGCTGTTGGCGGCGCTGGCGGCCC +GTCGGGATGCCGAACTCGAGCGTGGGGTTTGTCTAGTTGGTCCGCACCGTGACGACCTAATACTGCGACT +AGGCGATCAACCCGCGAAAGGATTTGCTAGCCATGGGGAGGCGTGGTCGTTGGCGGTGGCACTGCGGTTG +GCGGCCTATCAACTGTTACGCGTTGATGGTGGTGAGCCGGTGTTGTTGCTCGACGACGTGTTCGCCGAAC +TGGATGTCATGCGCCGTCGAGCGTTGGCGACGGCGGCCGAGTCCGCCGAACAGGTGTTGGTGACTGCCGC +GGTGCTCGAGGATATTCCCGCCGGCTGGGACGCCAGGCGGGTGCACATCGATGTGCGTGCCGATGACACC +GGATCGATGTCGGTGGTTCTGCCATGACGGGTTCTGTTGACCGGCCCGACCAGAATCGCGGTGAGCGATT +AATGAAGTCACCAGGGTTGGATTTGGTCAGGCGCACCCTGGACGAAGCTCGTGCTGCTGCCCGCGCGCGC +GGACAAGACGCCGGTCGAGGGCGGGTCGCTTCCGTTGCGTCGGGTCGGGTGGCCGGACGGCGACGAAGCT +GGTCGGGTCCGGGGCCCGACATTCGTGATCCACAACCGCTGGGTAAGGCCGCTCGTGAGCTGGCAAAGAA +ACGCGGCTGGTCGGTGCGGGTCGCCGAGGGTATGGTGCTCGGCCAGTGGTCTGCGGTGGTCGGCCACCAG +ATCGCCGAACATGCACGCCCGACTGCGCTAAACGACGGGGTGTTGAGCGTGATTGCGGAGTCGACGGCGT +GGGCGACGCAGTTGAGGATCATGCAGGCCCAGCTTCTGGCCAAGATCGCCGCAGCGGTTGGCAACGATGT +GGTGCGATCGCTAAAGATCACCGGGCCGGCGGCACCATCGTGGCGCAAGGGGCCTCGCCATATTGCCGGT +AGGGGTCCGCGCGACACCTACGGATAACACGTCGATCGGCCCAGAACAAGGCGCTCCGGTCCCGGCCTGA +GAGCCTCGAGGACGAAGCGGATCCGTATGCCGGACGTCGGGACGCACCAGGAAGAAAGATGTCCGACGCA +CGGCGCGGTTAGATGGGTAAAAACGAGGCCAGAAGATCGGCCCTGGCGCCCGATCACGGTACAGTGGTGT +GCGACCCCCTGCGGCGACTCAACCGCATGCACGCAACCCCTGAGGAGAGTATTCGGATCGTGGCTGCCCA +GAAAAAGAAGGCCCAAGACGAATACGGCGCTGCGTCTATCACCATTCTCGAAGGGCTGGAGGCCGTCCGC +AAACGTCCCGGCATGTACATTGGCTCGACCGGTGAGCGCGGTTTACACCATCTCATTTGGGAGGTGGTCG +ACAACGCGGTCGACGAGGCGATGGCCGGTTATGCAACCACAGTGAACGTAGTGCTGCTTGAGGATGGCGG +TGTCGAGGTCGCCGACGACGGCCGCGGCATTCCGGTCGCCACCCACGCCTCCGGCATACCGACCGTCGAC +GTGGTGATGACACAACTACATGCCGGCGGCAAGTTCGACTCGGACGCGTATGCGATATCTGGTGGTCTGC +ACGGCGTCGGCGTGTCGGTGGTTAACGCGCTATCCACCCGGCTCGAAGTCGAGATCAAGCGCGACGGGTA +CGAGTGGTCTCAGGTTTATGAGAAGTCGGAACCCCTGGGCCTCAAGCAAGGGGCGCCGACCAAGAAGACG +GGGTCAACGGTACGGTTCTGGGCCGACCCCGCTGTTTTCGAAACCACGGAATACGACTTCGAAACCGTCG +CCCGCCGGCTGCAAGAGATGGCGTTCCTCAACAAGGGGCTGACCATCAACCTGACCGACGAGAGGGTGAC +CCAAGACGAGGTCGTCGACGAAGTGGTCAGCGACGTCGCCGAGGCGCCGAAGTCGGCAAGTGAACGCGCA +GCCGAATCCACTGCACCGCACAAAGTTAAGAGCCGCACCTTTCACTATCCGGGTGGCCTGGTGGACTTCG +TGAAACACATCAACCGCACCAAGAACGCGATTCATAGCAGCATCGTGGACTTTTCCGGCAAGGGCACCGG +GCACGAGGTGGAGATCGCGATGCAATGGAACGCCGGGTATTCGGAGTCGGTGCACACCTTCGCCAACACC +ATCAACACCCACGAGGGCGGCACCCACGAAGAGGGCTTCCGCAGCGCGCTGACGTCGGTGGTGAACAAGT +ACGCCAAGGACCGCAAGCTACTGAAGGACAAGGACCCCAACCTCACCGGTGACGATATCCGGGAAGGCCT +GGCCGCTGTGATCTCGGTGAAGGTCAGCGAACCGCAGTTCGAGGGCCAGACCAAGACCAAGTTGGGCAAC +ACCGAGGTCAAATCGTTTGTGCAGAAGGTCTGTAATGAACAGCTGACCCACTGGTTTGAAGCCAACCCCA +CCGACTCGAAAGTCGTTGTGAACAAGGCTGTGTCCTCGGCGCAAGCCCGTATCGCGGCACGTAAGGCACG +AGAGTTGGTGCGGCGTAAGAGCGCCACCGACATCGGTGGATTGCCCGGCAAGCTGGCCGATTGCCGTTCC +ACGGATCCGCGCAAGTCCGAACTGTATGTCGTAGAAGGTGACTCGGCCGGCGGTTCTGCAAAAAGCGGTC +GCGATTCGATGTTCCAGGCGATACTTCCGCTGCGCGGCAAGATCATCAATGTGGAGAAAGCGCGCATCGA +CCGGGTGCTAAAGAACACCGAAGTTCAGGCGATCATCACGGCGCTGGGCACCGGGATCCACGACGAGTTC +GATATCGGCAAGCTGCGCTACCACAAGATCGTGCTGATGGCCGACGCCGATGTTGACGGCCAACATATTT +CCACGCTGTTGTTGACGTTGTTGTTCCGGTTCATGCGGCCGCTCATCGAGAACGGGCATGTGTTTTTGGC +ACAACCGCCGCTGTACAAACTCAAGTGGCAGCGCAGTGACCCGGAATTCGCATACTCCGACCGCGAGCGC diff -r 4f98ba28a3f2 -r e52300a0000f test-data/bam_input.bam Binary file test-data/bam_input.bam has changed diff -r 4f98ba28a3f2 -r e52300a0000f test-data/fasta_indexes.loc --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/fasta_indexes.loc Wed Sep 30 19:05:39 2020 +0000 @@ -0,0 +1,1 @@ +89 89 Mycobacterium_AF2122 ${__HERE__}/NC_002945v4.fasta diff -r 4f98ba28a3f2 -r e52300a0000f test-data/output_metrics.tabular --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/output_metrics.tabular Wed Sep 30 19:05:39 2020 +0000 @@ -0,0 +1,2 @@ +# File Number of Good SNPs Average Coverage Genome Coverage + 0 diff -r 4f98ba28a3f2 -r e52300a0000f test-data/output_vcf.vcf --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/output_vcf.vcf Wed Sep 30 19:05:39 2020 +0000 @@ -0,0 +1,100 @@ +##fileformat=VCFv4.2 +##fileDate=20200302 +##source=freeBayes v1.3.1-dirty +##reference=/home/galaxy/galaxy/tool-data/AF2122/seq/AF2122.fa +##contig= +##phasing=none +##commandline="freebayes --region NC_002945.4:0..4349904 --bam b_0.bam --fasta-reference /home/galaxy/galaxy/tool-data/AF2122/seq/AF2122.fa --vcf ./vcf_output/part_NC_002945.4:0..4349904.vcf -u -n 0 --haplotype-length -1 --min-repeat-size 5 --min-repeat-entropy 1 -m 1 -q 0 -R 0 -Y 0 -e 1 -F 0.05 -C 2 -G 1 --min-alternate-qsum 0" +##filter="QUAL > 0" +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT 13-1941-6 +NC_002945.4 1 . N . . . . GT ./. +NC_002945.4 2 . N . . . . GT ./. +NC_002945.4 3 . N . . . . GT ./. +NC_002945.4 4 . N . . . . GT ./. +NC_002945.4 5 . N . . . . GT ./. +NC_002945.4 6 . N . . . . GT ./. +NC_002945.4 7 . N . . . . GT ./. +NC_002945.4 8 . N . . . . GT ./. +NC_002945.4 9 . N . . . . GT ./. +NC_002945.4 10 . N . . . . GT ./. +NC_002945.4 11 . N . . . . GT ./. +NC_002945.4 12 . N . . . . GT ./. +NC_002945.4 13 . N . . . . GT ./. +NC_002945.4 14 . N . . . . GT ./. +NC_002945.4 15 . N . . . . GT ./. +NC_002945.4 16 . N . . . . GT ./. +NC_002945.4 17 . N . . . . GT ./. +NC_002945.4 18 . N . . . . GT ./. +NC_002945.4 19 . N . . . . GT ./. +NC_002945.4 20 . N . . . . GT ./. +NC_002945.4 21 . N . . . . GT ./. +NC_002945.4 22 . N . . . . GT ./. +NC_002945.4 23 . N . . . . GT ./. +NC_002945.4 24 . N . . . . GT ./. +NC_002945.4 25 . N . . . . GT ./. +NC_002945.4 26 . N . . . . GT ./. +NC_002945.4 27 . N . . . . GT ./. +NC_002945.4 28 . N . . . . GT ./. +NC_002945.4 29 . N . . . . GT ./. +NC_002945.4 30 . N . . . . GT ./. +NC_002945.4 31 . N . . . . GT ./. +NC_002945.4 32 . N . . . . GT ./. +NC_002945.4 33 . N . . . . GT ./. +NC_002945.4 34 . N . . . . GT ./. +NC_002945.4 35 . N . . . . GT ./. +NC_002945.4 36 . N . . . . GT ./. +NC_002945.4 37 . N . . . . GT ./. diff -r 4f98ba28a3f2 -r e52300a0000f test-data/vcf_input.vcf --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/vcf_input.vcf Wed Sep 30 19:05:39 2020 +0000 @@ -0,0 +1,64 @@ +##fileformat=VCFv4.2 +##fileDate=20200302 +##source=freeBayes v1.3.1-dirty +##reference=/home/galaxy/galaxy/tool-data/AF2122/seq/AF2122.fa +##contig= +##phasing=none +##commandline="freebayes --region NC_002945.4:0..4349904 --bam b_0.bam --fasta-reference /home/galaxy/galaxy/tool-data/AF2122/seq/AF2122.fa --vcf ./vcf_output/part_NC_002945.4:0..4349904.vcf -u -n 0 --haplotype-length -1 --min-repeat-size 5 --min-repeat-entropy 1 -m 1 -q 0 -R 0 -Y 0 -e 1 -F 0.05 -C 2 -G 1 --min-alternate-qsum 0" +##filter="QUAL > 0" +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT 13-1941-6 +NC_002945.4 2898437 . T G 0.263449 . AB=0;ABP=0;AC=0;AF=0;AN=2;AO=2;CIGAR=1X;DP=2;DPB=2;DPRA=0;EPP=3.0103;EPPR=0;GTI=0;LEN=1;MEANALT=1;MQM=60;MQMR=0;NS=1;NUMALT=1;ODDS=2.77259;PAIRED=1;PAIREDR=0;PAO=0;PQA=0;PQR=0;PRO=0;QA=0;QR=0;RO=0;RPL=2;RPP=7.35324;RPPR=0;RPR=0;RUN=1;SAF=1;SAP=3.0103;SAR=1;SRF=0;SRP=0;SRR=0;TYPE=snp;technology.ILLUMINA=1 GT:DP:AD:RO:QR:AO:QA:GL 0/0:2:0,2:0:0:2:0:0,-0.60206,-8.68589e-09 diff -r 4f98ba28a3f2 -r e52300a0000f tool-data/fasta_indexes.loc.sample --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/fasta_indexes.loc.sample Wed Sep 30 19:05:39 2020 +0000 @@ -0,0 +1,29 @@ +#This is a sample file distributed with Galaxy that enables tools +#to use a directory of Samtools indexed sequences data files. You will need +#to create these data files and then create a fasta_indexes.loc file +#similar to this one (store it in this directory) that points to +#the directories in which those files are stored. The fasta_indexes.loc +#file has this format (white space characters are TAB characters): +# +# +# +#So, for example, if you had hg19 Canonical indexed stored in +# +# /depot/data2/galaxy/hg19/sam/, +# +#then the fasta_indexes.loc entry would look like this: +# +#hg19canon hg19 Human (Homo sapiens): hg19 Canonical /depot/data2/galaxy/hg19/sam/hg19canon.fa +# +#and your /depot/data2/galaxy/hg19/sam/ directory +#would contain hg19canon.fa and hg19canon.fa.fai files. +# +#Your fasta_indexes.loc file should include an entry per line for +#each index set you have stored. The file in the path does actually +#exist, but it should never be directly used. Instead, the name serves +#as a prefix for the index file. For example: +# +#hg18canon hg18 Human (Homo sapiens): hg18 Canonical /depot/data2/galaxy/hg18/sam/hg18canon.fa +#hg18full hg18 Human (Homo sapiens): hg18 Full /depot/data2/galaxy/hg18/sam/hg18full.fa +#hg19canon hg19 Human (Homo sapiens): hg19 Canonical /depot/data2/galaxy/hg19/sam/hg19canon.fa +#hg19full hg19 Human (Homo sapiens): hg19 Full /depot/data2/galaxy/hg19/sam/hg19full.fa diff -r 4f98ba28a3f2 -r e52300a0000f tool_data_table_conf.xml.sample --- a/tool_data_table_conf.xml.sample Fri May 08 16:58:19 2020 +0000 +++ b/tool_data_table_conf.xml.sample Wed Sep 30 19:05:39 2020 +0000 @@ -1,4 +1,10 @@ + + + value, dbkey, name, path + +
+ value, name, path, description diff -r 4f98ba28a3f2 -r e52300a0000f tool_data_table_conf.xml.test --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_data_table_conf.xml.test Wed Sep 30 19:05:39 2020 +0000 @@ -0,0 +1,6 @@ + +
+ value, dbkey, name, path + +
+
diff -r 4f98ba28a3f2 -r e52300a0000f vsnp_add_zero_coverage.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/vsnp_add_zero_coverage.py Wed Sep 30 19:05:39 2020 +0000 @@ -0,0 +1,189 @@ +#!/usr/bin/env python + +import argparse +import multiprocessing +import os +import queue +import re +import shutil + +import pandas +import pysam +from Bio import SeqIO + +INPUT_BAM_DIR = 'input_bam_dir' +INPUT_VCF_DIR = 'input_vcf_dir' +OUTPUT_VCF_DIR = 'output_vcf_dir' +OUTPUT_METRICS_DIR = 'output_metrics_dir' + + +def get_base_file_name(file_path): + base_file_name = os.path.basename(file_path) + if base_file_name.find(".") > 0: + # Eliminate the extension. + return os.path.splitext(base_file_name)[0] + elif base_file_name.endswith("_vcf"): + # The "." character has likely + # changed to an "_" character. + return base_file_name.rstrip("_vcf") + return base_file_name + + +def get_coverage_and_snp_count(task_queue, reference, output_metrics, output_vcf, timeout): + while True: + try: + tup = task_queue.get(block=True, timeout=timeout) + except queue.Empty: + break + bam_file, vcf_file = tup + # Create a coverage dictionary. + coverage_dict = {} + coverage_list = pysam.depth(bam_file, split_lines=True) + for line in coverage_list: + chrom, position, depth = line.split('\t') + coverage_dict["%s-%s" % (chrom, position)] = depth + # Convert it to a data frame. + coverage_df = pandas.DataFrame.from_dict(coverage_dict, orient='index', columns=["depth"]) + # Create a zero coverage dictionary. + zero_dict = {} + for record in SeqIO.parse(reference, "fasta"): + chrom = record.id + total_len = len(record.seq) + for pos in list(range(1, total_len + 1)): + zero_dict["%s-%s" % (str(chrom), str(pos))] = 0 + # Convert it to a data frame with depth_x + # and depth_y columns - index is NaN. + zero_df = pandas.DataFrame.from_dict(zero_dict, orient='index', columns=["depth"]) + coverage_df = zero_df.merge(coverage_df, left_index=True, right_index=True, how='outer') + # depth_x "0" column no longer needed. + coverage_df = coverage_df.drop(columns=['depth_x']) + coverage_df = coverage_df.rename(columns={'depth_y': 'depth'}) + # Covert the NaN to 0 coverage and get some metrics. + coverage_df = coverage_df.fillna(0) + coverage_df['depth'] = coverage_df['depth'].apply(int) + total_length = len(coverage_df) + average_coverage = coverage_df['depth'].mean() + zero_df = coverage_df[coverage_df['depth'] == 0] + total_zero_coverage = len(zero_df) + total_coverage = total_length - total_zero_coverage + genome_coverage = "{:.2%}".format(total_coverage / total_length) + # Process the associated VCF input. + column_names = ["CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT", "Sample"] + vcf_df = pandas.read_csv(vcf_file, sep='\t', header=None, names=column_names, comment='#') + good_snp_count = len(vcf_df[(vcf_df['ALT'].str.len() == 1) & (vcf_df['REF'].str.len() == 1) & (vcf_df['QUAL'] > 150)]) + base_file_name = get_base_file_name(vcf_file) + if total_zero_coverage > 0: + header_file = "%s_header.csv" % base_file_name + with open(header_file, 'w') as outfile: + with open(vcf_file) as infile: + for line in infile: + if re.search('^#', line): + outfile.write("%s" % line) + vcf_df_snp = vcf_df[vcf_df['REF'].str.len() == 1] + vcf_df_snp = vcf_df_snp[vcf_df_snp['ALT'].str.len() == 1] + vcf_df_snp['ABS_VALUE'] = vcf_df_snp['CHROM'].map(str) + "-" + vcf_df_snp['POS'].map(str) + vcf_df_snp = vcf_df_snp.set_index('ABS_VALUE') + cat_df = pandas.concat([vcf_df_snp, zero_df], axis=1, sort=False) + cat_df = cat_df.drop(columns=['CHROM', 'POS', 'depth']) + cat_df[['ID', 'ALT', 'QUAL', 'FILTER', 'INFO']] = cat_df[['ID', 'ALT', 'QUAL', 'FILTER', 'INFO']].fillna('.') + cat_df['REF'] = cat_df['REF'].fillna('N') + cat_df['FORMAT'] = cat_df['FORMAT'].fillna('GT') + cat_df['Sample'] = cat_df['Sample'].fillna('./.') + cat_df['temp'] = cat_df.index.str.rsplit('-', n=1) + cat_df[['CHROM', 'POS']] = pandas.DataFrame(cat_df.temp.values.tolist(), index=cat_df.index) + cat_df = cat_df[['CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT', 'Sample']] + cat_df['POS'] = cat_df['POS'].astype(int) + cat_df = cat_df.sort_values(['CHROM', 'POS']) + body_file = "%s_body.csv" % base_file_name + cat_df.to_csv(body_file, sep='\t', header=False, index=False) + if output_vcf is None: + output_vcf_file = os.path.join(OUTPUT_VCF_DIR, "%s.vcf" % base_file_name) + else: + output_vcf_file = output_vcf + with open(output_vcf_file, "w") as outfile: + for cf in [header_file, body_file]: + with open(cf, "r") as infile: + for line in infile: + outfile.write("%s" % line) + else: + if output_vcf is None: + output_vcf_file = os.path.join(OUTPUT_VCF_DIR, "%s.vcf" % base_file_name) + else: + output_vcf_file = output_vcf + shutil.copyfile(vcf_file, output_vcf_file) + bam_metrics = [base_file_name, "", "%4f" % average_coverage, genome_coverage] + vcf_metrics = [base_file_name, str(good_snp_count), "", ""] + if output_metrics is None: + output_metrics_file = os.path.join(OUTPUT_METRICS_DIR, "%s.tabular" % base_file_name) + else: + output_metrics_file = output_metrics + metrics_columns = ["File", "Number of Good SNPs", "Average Coverage", "Genome Coverage"] + with open(output_metrics_file, "w") as fh: + fh.write("# %s\n" % "\t".join(metrics_columns)) + fh.write("%s\n" % "\t".join(bam_metrics)) + fh.write("%s\n" % "\t".join(vcf_metrics)) + task_queue.task_done() + + +def set_num_cpus(num_files, processes): + num_cpus = int(multiprocessing.cpu_count()) + if num_files < num_cpus and num_files < processes: + return num_files + if num_cpus < processes: + half_cpus = int(num_cpus / 2) + if num_files < half_cpus: + return num_files + return half_cpus + return processes + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + + parser.add_argument('--output_metrics', action='store', dest='output_metrics', required=False, default=None, help='Output metrics text file') + parser.add_argument('--output_vcf', action='store', dest='output_vcf', required=False, default=None, help='Output VCF file') + parser.add_argument('--reference', action='store', dest='reference', help='Reference dataset') + parser.add_argument('--processes', action='store', dest='processes', type=int, help='User-selected number of processes to use for job splitting') + + args = parser.parse_args() + + # The assumption here is that the list of files + # in both INPUT_BAM_DIR and INPUT_VCF_DIR are + # equal in number and named such that they are + # properly matched if the directories contain + # more than 1 file (i.e., hopefully the bam file + # names and vcf file names will be something like + # Mbovis-01D6_* so they can be # sorted and properly + # associated with each other). + bam_files = [] + for file_name in sorted(os.listdir(INPUT_BAM_DIR)): + file_path = os.path.abspath(os.path.join(INPUT_BAM_DIR, file_name)) + bam_files.append(file_path) + vcf_files = [] + for file_name in sorted(os.listdir(INPUT_VCF_DIR)): + file_path = os.path.abspath(os.path.join(INPUT_VCF_DIR, file_name)) + vcf_files.append(file_path) + + multiprocessing.set_start_method('spawn') + queue1 = multiprocessing.JoinableQueue() + num_files = len(bam_files) + cpus = set_num_cpus(num_files, args.processes) + # Set a timeout for get()s in the queue. + timeout = 0.05 + + # Add each associated bam and vcf file pair to the queue. + for i, bam_file in enumerate(bam_files): + vcf_file = vcf_files[i] + queue1.put((bam_file, vcf_file)) + + # Complete the get_coverage_and_snp_count task. + processes = [multiprocessing.Process(target=get_coverage_and_snp_count, args=(queue1, args.reference, args.output_metrics, args.output_vcf, timeout, )) for _ in range(cpus)] + for p in processes: + p.start() + for p in processes: + p.join() + queue1.join() + + if queue1.empty(): + queue1.close() + queue1.join_thread() diff -r 4f98ba28a3f2 -r e52300a0000f vsnp_build_tables.py --- a/vsnp_build_tables.py Fri May 08 16:58:19 2020 +0000 +++ b/vsnp_build_tables.py Wed Sep 30 19:05:39 2020 +0000 @@ -3,10 +3,11 @@ import argparse import multiprocessing import os +import queue +import re + import pandas -import queue import pandas.io.formats.excel -import re from Bio import SeqIO INPUT_JSON_AVG_MQ_DIR = 'input_json_avg_mq_dir' @@ -32,7 +33,7 @@ # Create an annotation file. annotation_file = "%s_annotations.csv" % group with open(annotation_file, "a") as fh: - for index, row in positions.iterrows(): + for _, row in positions.iterrows(): pos = row.position try: aaa = pro.iloc[pro.index.get_loc(int(pos))][['chrom', 'locus', 'product', 'gene']] diff -r 4f98ba28a3f2 -r e52300a0000f vsnp_build_tables.xml --- a/vsnp_build_tables.xml Fri May 08 16:58:19 2020 +0000 +++ b/vsnp_build_tables.xml Wed Sep 30 19:05:39 2020 +0000 @@ -1,5 +1,8 @@ - + + + macros.xml + biopython pandas @@ -52,10 +55,7 @@ ]]> - - - - + @@ -192,15 +192,6 @@ * **Choose the category for the files to be analyzed** - select "Single files" or "Collections of files", then select the appropriate history items (single SNPs json, average MQ json and newick files, or collections of each) based on the selected option. * **Use Genbank file** - Select "yes" to annotate the tables using the information in the Genbank file. Locally cached files, if available, provide the most widely used annotations, but more custom Genbank files can be chosen from the current history. - - - @misc{None, - journal = {None}, - author = {1. Stuber T}, - title = {Manuscript in preparation}, - year = {None}, - url = {https://github.com/USDA-VS/vSNP},} - - +