Mercurial > repos > bcrain-completegenomics > testing1
changeset 15:a006c9df8747 draft
Deleted selected files
author | bcrain-completegenomics |
---|---|
date | Thu, 07 Jun 2012 14:43:19 -0400 |
parents | e2e802a079f0 |
children | 4450c9ca9451 |
files | cgatools_suite/.DS_Store cgatools_suite/._datatypes_conf.xml cgatools_suite/._tool_data_table_conf.xml cgatools_suite/datatypes_conf.xml cgatools_suite/lib/galaxy/datatypes/._completegenomics.py cgatools_suite/lib/galaxy/datatypes/completegenomics.py cgatools_suite/tool-data/._cg_crr_files.loc.sample cgatools_suite/tool-data/cg_crr_files.loc.sample cgatools_suite/tool_data_table_conf.xml cgatools_suite/tools/cgatools/.DS_Store cgatools_suite/tools/cgatools/._calldiff.xml cgatools_suite/tools/cgatools/._join.xml cgatools_suite/tools/cgatools/._junctiondiff.xml cgatools_suite/tools/cgatools/._listtestvariants.xml cgatools_suite/tools/cgatools/._listvariants.xml cgatools_suite/tools/cgatools/._snpdiff.xml cgatools_suite/tools/cgatools/._testing.pl cgatools_suite/tools/cgatools/._testvariants.xml cgatools_suite/tools/cgatools/._varfilter.xml cgatools_suite/tools/cgatools/._varfilter_wrapper.pl cgatools_suite/tools/cgatools/calldiff.xml cgatools_suite/tools/cgatools/join.xml cgatools_suite/tools/cgatools/junctiondiff.xml cgatools_suite/tools/cgatools/listtestvariants.xml cgatools_suite/tools/cgatools/listvariants.xml cgatools_suite/tools/cgatools/snpdiff.xml cgatools_suite/tools/cgatools/testing.pl cgatools_suite/tools/cgatools/testvariants.xml cgatools_suite/tools/cgatools/varfilter.xml cgatools_suite/tools/cgatools/varfilter_wrapper.pl cgatools_suite/tools/cgi_scripts/.DS_Store cgatools_suite/tools/cgi_scripts/._Calculate_TestVariants_Variant_Frequencies.xml cgatools_suite/tools/cgi_scripts/._Calculate_TestVariants_Variant_Frequencies_0_1_0.pl cgatools_suite/tools/cgi_scripts/._List_Unique_Variants.xml cgatools_suite/tools/cgi_scripts/._List_Unique_Variants_2_1_0.pl cgatools_suite/tools/cgi_scripts/Calculate_TestVariants_Variant_Frequencies.xml cgatools_suite/tools/cgi_scripts/Calculate_TestVariants_Variant_Frequencies_0_1_0.pl cgatools_suite/tools/cgi_scripts/List_Unique_Variants.xml cgatools_suite/tools/cgi_scripts/List_Unique_Variants_2_1_0.pl |
diffstat | 39 files changed, 0 insertions(+), 2884 deletions(-) [+] |
line wrap: on
line diff
--- a/cgatools_suite/datatypes_conf.xml Thu Jun 07 14:32:32 2012 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,19 +0,0 @@ -<?xml version="1.0"?> -<datatypes> - <datatype_files> - <datatype_file name="completegenomics.py"/> - </datatype_files> - - <registration> - <!-- - Add the following section to datatypes_conf.xml file in your Galaxy distribution if you are adding Complete Genomics tools manually to your Galaxy instance - --> - <!-- Start Complete Genomics Datatypes --> - <datatype extension="cg_var" type="galaxy.datatypes.tabular:CG_Var" display_in_upload="true" /> - <datatype extension="cg_mastervar" type="galaxy.datatypes.tabular:CG_MasterVar" display_in_upload="true" /> - <datatype extension="cg_gene" type="galaxy.datatypes.tabular:CG_Gene" display_in_upload="true" /> - <!-- End Complete Genomics Datatypes --> - </registration> - <sniffers> - </sniffers> -</datatypes>
--- a/cgatools_suite/lib/galaxy/datatypes/completegenomics.py Thu Jun 07 14:32:32 2012 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,71 +0,0 @@ -""" -Complete Genomics datatypes -Birgit Crain - Complete Genomics, Inc -""" - -import pkg_resources -pkg_resources.require( "bx-python" ) - -import logging -from galaxy.datatypes import data -from galaxy import util -from cgi import escape -from galaxy.datatypes import metadata -from galaxy.datatypes import tabular -from galaxy.datatypes.metadata import MetadataElement -from galaxy.datatypes.tabular import Tabular -import galaxy_utils.sequence.vcf -from galaxy.datatypes.sniff import * - -log = logging.getLogger(__name__) - -class CG_Var( Tabular ): - file_ext = 'cg_var' - def __init__(self, **kwd): - """Initialize CG_Var datatype""" - Tabular.__init__( self, **kwd ) - self.column_names = ['locus', 'ploidy', 'allele', 'chromosome', 'begin', 'end', - 'varType', 'reference', 'alleleSeq', 'varScoreVAF', - 'varScoreEAF', 'varQuality', 'hapLink', 'xRef' - ] - def display_peek( self, dataset ): - """Returns formated html of peek""" - return Tabular.make_html_table( self, dataset, column_names=self.column_names ) - -class CG_MasterVar( Tabular ): - file_ext = 'cg_mastervar' - def __init__(self, **kwd): - """Initialize CG_MasterVar datatype""" - Tabular.__init__( self, **kwd ) - self.column_names = ['locus', 'ploidy', 'chromosome', 'begin', 'end', 'zygosity', - 'varType', 'reference', 'allele1Seq', 'allele2Seq', - 'allele1VarScoreVAF', 'allele2VarScoreVAF', 'allele1VarScoreEAF', - 'allele2VarScoreEAF', 'allele1VarQuality', 'allele2VarQuality', - 'allele1HapLink', 'allele2HapLink', 'allele1XRef', 'allele2XRef', - 'evidenceIntervalId', 'allele1ReadCount', 'allele2ReadCount', - 'referenceAlleleRead', 'totalReadCount', 'allele1Gene', - 'allele2Gene pfam', 'miRBaseId', 'repeatMasker', 'segDupOverlap', - 'relativeCoverageDiploid', 'calledPloidy', - 'relativeCoverageNondiploid', 'calledLevel' - ] - - def display_peek( self, dataset ): - """Returns formated html of peek""" - return Tabular.make_html_table( self, dataset, column_names=self.column_names ) - -class CG_Gene( Tabular ): - file_ext = 'cg_gene' - def __init__(self, **kwd): - """Initialize CG_Gene datatype""" - Tabular.__init__( self, **kwd ) - self.column_names = ['index', 'locus', 'allele', 'chromosome', 'begin', 'end', - 'varType', 'reference', 'call', 'xRef', 'geneId', - 'mrnaAcc', 'proteinAcc', 'symbol', 'orientation', 'component', - 'componentIndex', 'hasCodingRegion', 'impact', 'nucleotidePos', - 'proteinPos', 'annotationRefSequence', 'sampleSequence', - 'genomeRefSequence', 'pfam' - ] - - def display_peek( self, dataset ): - """Returns formated html of peek""" - return Tabular.make_html_table( self, dataset, column_names=self.column_names )
--- a/cgatools_suite/tool-data/cg_crr_files.loc.sample Thu Jun 07 14:32:32 2012 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,11 +0,0 @@ -#This is a sample file distributed with Galaxy that enables tools -#to use .crr reference files. You will need to download or create -#the .crr reference files and then create a cg_crr_files.loc file -#similar to this one (store it in this directory) that points to -#the location of the files. The cg_crr_files.loc -#file has this format (white space characters are TAB characters): -# -#<value> <dbkey> <name> <path> -# -#hg19 hg19 hg19.crr /Users/bcrain/Documents/hg19.crr -
--- a/cgatools_suite/tool_data_table_conf.xml Thu Jun 07 14:32:32 2012 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,12 +0,0 @@ -<tables> - <!-- - Add the following section to tool_data_table_conf.xml file in your Galaxy distribution if you are adding Complete Genomics tools manually to your Galaxy instance - --> - <!-- Start Location of cgatools crr files --> - <table name="cg_crr_files" comment_char="#"> - <columns>value, dbkey, name, path</columns> - <file path="tool-data/cg_crr_files.loc" /> - </table> - <!-- End Location of cgatools crr files --> -</tables> -
--- a/cgatools_suite/tools/cgatools/calldiff.xml Thu Jun 07 14:32:32 2012 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,343 +0,0 @@ -<tool id="cga_calldiff" name="calldiff(beta)" version="0.0.1"> - - <description>compares two Complete Genomics variant files.</description> <!--adds description in toolbar--> - - <requirements> - <requirement type="binary">cgatools</requirement> - </requirements> - - <command> <!--run executable--> - cgatools calldiff --beta - --reference ${crr.fields.path} - --variantsA $data_sources.inputA - --variantsB $data_sources.inputB - $validation - $diploid - --locus-stats-column-count $column - --max-hypothesis-count $hypothesis - --output-prefix cg_ - --reports `echo ${report1} ${report2} ${report3} ${report4} ${report5} ${somatic.report6} | sed 's/ */,/g'` - #if $somatic.report6 == "SomaticOutput" - --genome-rootA $somatic.genomeA - --genome-rootB $somatic.genomeB - --calibration-root $somatic.calibration - #end if - </command> - - <outputs> - <data format="tabular" name="output1" from_work_dir="cg_SuperlocusOutput.tsv" label="${tool.name} on ${on_string}: SuperlocusOutput"> - <filter>(report1 == 'SuperlocusOutput')</filter> - </data> - <data format="tabular" name="output2" from_work_dir="cg_SuperlocusStats.tsv" label="${tool.name} on ${on_string}: SuperlocusStats"> - <filter>(report2 == 'SuperlocusStats')</filter> - </data> - <data format="tabular" name="output3" from_work_dir="cg_LocusOutput.tsv" label="${tool.name} on ${on_string}: LocusOutput"> - <filter>(report3 == 'LocusOutput')</filter> - </data> - <data format="tabular" name="output4" from_work_dir="cg_LocusStats.tsv" label="${tool.name} on ${on_string}: LocusStats"> - <filter>(report4 == 'LocusStats')</filter> - </data> - <data format="tabular" name="output5a" from_work_dir="cg_VariantsA.tsv" label="${tool.name} on ${on_string}: VariantsA"> - <filter>(report5 == 'VariantOutput')</filter> - </data> - <data format="tabular" name="output5b" from_work_dir="cg_VariantsB.tsv" label="${tool.name} on ${on_string}: VariantsB"> - <filter>(report5 == 'VariantOutput')</filter> - </data> - <data format="tabular" name="output6" from_work_dir="cg_SomaticOutput.tsv" label="${tool.name} on ${on_string}: SomaticOutput"> - <filter>(somatic['report6'] == 'SomaticOutput')</filter> - </data> - </outputs> - - <inputs> - <!--form field to select crr file--> - <param name="crr" type="select" label="Genome build"> - <options from_data_table="cg_crr_files" /> - </param> - - <!--conditional to select variant file input--> - <conditional name="data_sources"> - <param name="data_source" type="select" label="Where are the input varfiles?"> - <option value="in" selected="true">imported into Galaxy</option> - <option value="out">located outside Galaxy (available only for local Galaxy instances)</option> - </param> - <when value="in"> - <!--form field to select variant files--> - <param name="inputA" type="data" format="cg_var" label="Dataset A"> - <validator type="unspecified_build" /> - <validator type="dataset_metadata_in_file" filename="cg_crr_files.loc" - metadata_name="dbkey" metadata_column="1" - message="cgatools is not currently available for this build."/> - </param> - <param name="inputB" type="data" format="cg_var" label="Dataset B"> - <validator type="unspecified_build" /> - <validator type="dataset_metadata_in_file" filename="cg_crr_files.loc" - metadata_name="dbkey" metadata_column="1" - message="cgatools is not currently available for this build."/> - </param> - </when> - <when value="out"> - <!--form field to select crr file--> - <param name="inputA" type="text" label="Variant file A (path/file_name)" size="300" help="Variant files can be compressed (gz, bz2)."/> - <param name="inputB" type="text" label="Variant file B (path/file_name)" size="300" help="Variant files can be compressed (gz, bz2)."/> - </when> - </conditional> - - <param name="diploid" type="select" label="Use diploid variant model" help="Uses varScoreEAF instead of varScoreVAF in somatic score computations. Also, uses diploid variant model instead of variable allele mixture model."> - <option value="">no</option> - <option value="--diploid">yes</option> - </param> - - <param name="column" type="integer" label="Number of columns for locus compare classification in the locus stats file (default 15)" value="15"/> - - <param name="hypothesis" type="integer" label="Maximum number of possible phasings to consider for a superlocus (default 32)" value="32"/> - - <param name="validation" type="select" label="Reference cover validation" help="Turns on/off validation that all bases of a chromosome are covered by calls of the variant file."> - <option value="">on</option> - <option value="--no-reference-cover-validation">off</option> - </param> - - <param name="report1" type="select" label="Report SuperlocusOutput"> - <option value="">no</option> - <option value="SuperlocusOutput">yes</option> - </param> - <param name="report2" type="select" label="Report SuperlocusStats"> - <option value="">no</option> - <option value="SuperlocusStats">yes</option> - </param> - <param name="report3" type="select" label="Report LocusOutput"> - <option value="">no</option> - <option value="LocusOutput">yes</option> - </param> - <param name="report4" type="select" label="Report LocusStats"> - <option value="">no</option> - <option value="LocusStats">yes</option> - </param> - <param name="report5" type="select" label="Report VariantOutput" help="Both variant files annotated by comparison results.If the somatic output report is requested, file A is also annotated with the same score ranks as produced in that report."> - <option value="">no</option> - <option value="VariantOutput">yes</option> - </param> - - <conditional name="somatic"> - <param name="report6" type="select" label="Report SomaticOutput" help="This report can only be generated on local Galaxy instances. Report for the list of simple variations that are present only in file 'A', annotated with the score that indicates the probability of the variation being truly somatic. Note: generating this report slows calldiff by 10x-20x."> - <option value="">no</option> - <option value="SomaticOutput">yes</option> - </param> - <when value="SomaticOutput"> - <param name="genomeA" type="text" size="300" label="Directory for genome A (path/dir)" help="The 'A' genome directory, for example /data/GS00118-DNA_A01; this directory is expected to contain ASM/REF and ASM/EVIDENCE subdirectories."/> - <param name="genomeB" type="text" size="300" label="Directory for genome B (path/dir)" help="The 'B' genome directory"/> - <param name="calibration" type="text" size="300" label="Directory calibration data (path/dir)" help="The directory containing calibration data. For example, there should exist a file calibration-root/0.0.0/metrics.tsv. Calibration data can be downloaded from ftp://ftp.completegenomics.com/ScoreCalibrationFiles/var-calibration-v1.tgz"/> - </when> - </conditional> - - </inputs> - - <help> - -**What it does** - -This tool compares two Complete Genomics variant files. - -cgatools: http://sourceforge.net/projects/cgatools/files/ - ------ - -**cgatools Manual**:: - - COMMAND NAME - calldiff - Compares two Complete Genomics variant files. - - DESCRIPTION - Compares two Complete Genomics variant files. Divides the genome up into - superloci of nearby variants, then compares the superloci. Also refines the - comparison to determine per-call or per-locus comparison results. - - Comparison results are usually described by a semi-colon separated string, - one per allele. Each allele's comparison result is one of the following - classifications: - - ref-identical The alleles of the two variant files are identical, and - they are consistent with the reference. - alt-identical The alleles of the two variant files are identical, and - they are inconsistent with the reference. - ref-consistent The alleles of the two variant files are consistent, - and they are consistent with the reference. - alt-consistent The alleles of the two variant files are consistent, - and they are inconsistent with the reference. - onlyA The alleles of the two variant files are inconsistent, - and only file A is inconsistent with the reference. - onlyB The alleles of the two variant files are inconsistent, - and only file B is inconsistent with the reference. - mismatch The alleles of the two variant files are inconsistent, - and they are both inconsistent with the reference. - phase-mismatch The two variant files would be consistent if the - hapLink field had been empty, but they are - inconsistent. - ploidy-mismatch The superlocus did not have uniform ploidy. - - In some contexts, this classification is rolled up into a simplified - classification, which is one of "identical", "consistent", "onlyA", - "onlyB", or "mismatch". - - A good place to start looking at the results is the superlocus-output file. - It has columns defined as follows: - - SuperlocusId An identifier given to the superlocus. - Chromosome The name of the chromosome. - Begin The 0-based offset of the start of the superlocus. - End The 0-based offset of the base one past the end of the - superlocus. - Classification The match classification of the superlocus. - Reference The reference sequence. - AllelesA A semicolon-separated list of the alleles (one per - haplotype) for variant file A, for the phasing with the - best comparison result. - AllelesB A semicolon-separated list of the alleles (one per - haplotype) for variant file B, for the phasing with the - best comparison result. - - The locus-output file contains, for each locus in file A and file B that is - not consistent with the reference, an annotated set of calls for the locus. - The calls are annotated with the following columns: - - SuperlocusId The id of the superlocus containing the locus. - File The variant file (A or B). - LocusClassification The locus classification is determined by the - varType column of the call that is inconsistent - with the reference, concatenated with a - modifier that describes whether the locus is - heterozygous, homozygous, or contains no-calls. - If there is no one variant in the locus (i.e., - it is heterozygous alt-alt), the locus - classification begins with "other". - LocusDiffClassification The match classification for the locus. This is - defined to be the best of the comparison of the - locus to the same region in the other file, or - the comparison of the superlocus. - - The somatic output file contains a list of putative somatic variations of - genome A. The output includes only those loci that can be classified as - snp, del, ins or sub in file A, and are called reference in the file B. - Every locus is annotated with the following columns: - - VarCvgA The totalReadCount from file A for this locus - (computed on the fly if file A is not a - masterVar file). - VarScoreA The varScoreVAF from file A, or varScoreEAF if - the "--diploid" option is used. - RefCvgB The maximum of the uniqueSequenceCoverage - values for the locus in genome B. - RefScoreB Minimum of the reference scores of the locus in - genome B. - SomaticCategory The category used for determining the - calibrated scores and the SomaticRank. - VarScoreACalib The calibrated variant score of file A, under - the model selected by using or not using the - "--diploid" option, and corrected for the count - of heterozygous variants observed in this - genome. See user guide for more information. - VarScoreBCalib The calibrated reference score of file B, under - the model selected by using or not using the - "--diploid" option, and corrected for the count - of heterozygous variants observed in this - genome. See user guide for more information. - SomaticRank The estimated rank of this somatic mutation, - amongst all true somatic mutations within this - SomaticCategory. The value is a number between - 0 and 1; a value of 0.012 means, for example, - that an estimated 1.2% of the true somatic - mutations in this somaticCategory have a - somaticScore less than the somaticScore for - this mutation. See user guide for more - information. - SomaticScore An integer that provides a total order on - quality for all somatic mutations. It is equal - to -10*log10( P(false)/P(true) ), under the - assumption that this genome has a rate of - somatic mutation equal to 1/Mb for - SomaticCategory snp, 1/10Mb for SomaticCategory - ins, 1/10Mb for SomaticCategory del, and 1/20Mb - for SomaticCategory sub. The computation is - based on the assumptions described in the user - guide, and is affected by choice of variant - model selected by using or not using the - "--diploid" option. - SomaticQuality Equal to VQHIGH for all somatic mutations where - SomaticScore >= -10. Otherwise, this column is - empty. - - OPTIONS - -h [ --help ] - Print this help message. - - --reference arg - The input crr file. - - --variantsA arg - The "A" input variant file. - - --variantsB arg - The "B" input variant file. - - --output-prefix arg - The path prefix for all output reports. - - --reports arg (=SuperlocusOutput,SuperlocusStats,LocusOutput,LocusStats) - Comma-separated list of reports to generate. (Beware any reports whose - name begins with "Debug".) A report is one of: - SuperlocusOutput Report for superlocus classification. - SuperlocusStats Report for superlocus classification stats. - LocusOutput Report for locus classification. - LocusStats Report for locus stats. - VariantOutput Both variant files annotated by comparison - results.If the somatic output report is - requested, file A is also annotated with the - same score ranks as produced in that report. - SomaticOutput Report for the list of simple variations that - are present only in file "A", annotated with - the score that indicates the probability of - the variation being truly somatic. Requires - beta, genome-rootA, and genome-rootB options - to be provided as well. Note: generating this - report slows calldiff by 10x-20x. - DebugCallOutput Report for call classification. - DebugSuperlocusOutput Report for debug superlocus information. - DebugSomaticOutput Report for distribution estimates used for - somatic rescoring. Only produced if - SomaticOutput is also turned on. - - --diploid - Uses varScoreEAF instead of varScoreVAF in somatic score computations. - Also, uses diploid variant model instead of variable allele mixture - model. - - --locus-stats-column-count arg (=15) - The number of columns for locus compare classification in the locus - stats file. - - --max-hypothesis-count arg (=32) - The maximum number of possible phasings to consider for a superlocus. - - --no-reference-cover-validation - Turns off validation that all bases of a chromosome are covered by - calls of the variant file. - - --genome-rootA arg - The "A" genome directory, for example /data/GS00118-DNA_A01; this - directory is expected to contain ASM/REF and ASM/EVIDENCE - subdirectories. - - --genome-rootB arg - The "B" genome directory. - - --calibration-root arg - The directory containing calibration data. For example, there should - exist a file calibration-root/0.0.0/metrics.tsv. - - --beta - This flag enables the SomaticOutput report, which is beta - functionality. - - SUPPORTED FORMAT_VERSION - 0.3 or later - </help> -</tool>
--- a/cgatools_suite/tools/cgatools/join.xml Thu Jun 07 14:32:32 2012 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,157 +0,0 @@ -<tool id="cga_join" name="join(beta)" version="0.0.1"> - - <description>two tsv files based on equal fields or overlapping regions.</description> <!--adds description in toolbar--> - - <requirements> - <requirement type="binary">cgatools</requirement> - </requirements> - - <command> <!--run executable--> - cgatools join --beta - --input $input1 - --input $input2 - --output $output - --output-mode $outmode - $dump - --select $col - #for $m in $matched <!--get all matched columns--> - --match ${m.match} - #end for - </command> - - <outputs> - <data format="tabular" name="output" /> - </outputs> - - <inputs> - <!--form field to select input file A--> - <param name="input1" type="data" format="tabular" label="Select first input file (A)"> - <validator type="unspecified_build" /> - <validator type="dataset_metadata_in_file" filename="cg_crr_files.loc" - metadata_name="dbkey" metadata_column="0" - message="cgatools is not currently available for this build."/> - </param> - - <!--form field to select input file B--> - <param name="input2" type="data" format="tabular" label="Select second input file (B)"> - <validator type="unspecified_build" /> - <validator type="dataset_metadata_in_file" filename="cg_crr_files.loc" - metadata_name="dbkey" metadata_column="0" - message="cgatools is not currently available for this build."/> - </param> - - <!--form field to specify columns to match--> - <repeat name="matched" title="Matched column"> - <param name="match" type="text" label="Enter column A:column B"/> - </repeat> - - <!--form field to specify columns to print--> - <param name="col" type="text" value="A.*,B.*" label="Specify columns to print from file A and B in format A.col_name1,A.col_name2,B.col_name1" /> - - <!--form field to select output-mode--> - <param name="outmode" type="select" label="Select output mode"> - <option value="full" selected="true">full (1 line for each match of records in A and B)</option> - <option value="compact">compact (1 line for each record in A, joining multiple records in B by semicolon)</option> - <option value="compact-pct">compact-pct (same as compact, annotated with % overlap)</option> - </param> - - <!--form field to select columns to match--> - <param name="dump" type="select" label="Select records to print"> - <option value="--always-dump" selected="true">print all records of A even if not matched in B</option> - <option value="">print only records of A that are matched in B</option> - </param> - </inputs> - - <help> - -**What it does** - -This tool joins two tab-delimited files based on equal fields or overlapping regions. - -cgatools: http://sourceforge.net/projects/cgatools/files/ - ------ - -**cgatools Manual**:: - - COMMAND NAME - join - Joins two tab-delimited files based on equal fields or overlapping regions. - - DESCRIPTION - Joins two tab-delimited files based on equal fields or overlapping regions. - By default, an output record is produced for each match found between file - A and file B, but output format can be controlled by the --output-mode - parameter. - - OPTIONS - -h [ --help ] - Print this help message. - - --beta - This is a beta command. To run this command, you must pass the --beta - flag. - - --input arg - File name to use as input (may be passed in as arguments at the end of - the command), or omitted for stdin). There must be exactly two input - files to join. If only one file is specified by name, file A is taken - to be stdin and file B is the named file. File B is read fully into - memory, and file A is streamed. File A's columns appear first in the - output. - - --output arg (=STDOUT) - The output file name (may be omitted for stdout). - - --match arg - A match specification, which is a column from A and a column from B - separated by a colon. - - --overlap arg - - -m [ --output-mode ] arg (=full) - Output mode, one of the following: - full Print an output record for each match found between - file A and file B. - compact Print at most one record for each record of file A, - joining the file B values by a semicolon and - suppressing repeated B values and empty B values. - compact-pct Same as compact, but for each distinct B value, - annotate with the percentage of the A record that is - overlapped by B records with that B value. Percentage - is rounded up to nearest integer. - - --overlap-mode arg (=strict) - Overlap mode, one of the following: - strict Range A and B overlap if A.begin < B.end and - B.begin < A.end. - allow-abutting-points Range A and B overlap they meet the strict - requirements, or if A.begin <= B.end and - B.begin <= A.end and either A or B has zero - length. - - --select arg (=A.*,B.*) - Set of fields to select for output. - - -a [ --always-dump ] - Dump every record of A, even if there are no matches with file B. - - --overlap-fraction-A arg (=0) - Minimum fraction of A region overlap for filtering output. - - --boundary-uncertainty-A arg (=0) - Boundary uncertainty for overlap filtering. Specifically, records - failing the following predicate are filtered away: overlap >= - overlap-fraction-A * ( A-range-length - boundary-uncertainty-A ) - - --overlap-fraction-B arg (=0) - Minimum fraction of B region overlap for filtering output. - - --boundary-uncertainty-B arg (=0) - Boundary uncertainty for overlap filtering. Specifically, records - failing the following predicate are filtered away: overlap >= - overlap-fraction-B * ( B-range-length - boundary-uncertainty-B ) - - SUPPORTED FORMAT_VERSION - Any - </help> -</tool>
--- a/cgatools_suite/tools/cgatools/junctiondiff.xml Thu Jun 07 14:32:32 2012 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,88 +0,0 @@ -<tool id="cga_junctiondiff" name="junctiondiff(beta)" version="0.0.1"> - - <description>reports difference between junction calls</description> <!--adds description in toolbar--> - - <requirements> - <requirement type="binary">cgatools</requirement> - </requirements> - - <command> <!--run executable--> - cgatools junctiondiff --beta -h - </command> - - <outputs> - <data format="tabular" name="output" /> - </outputs> - - <inputs> - </inputs> - - <help> - -**What it does** - -This tool reports difference between junction calls of Complete Genomics junctions files - -cgatools: http://sourceforge.net/projects/cgatools/files/ - ------ - -**cgatools Manual**:: - - COMMAND NAME - junctiondiff - Reports difference between junction calls of Complete Genomics junctions files. - - DESCRIPTION - junctiondiff takes two junction files A and B as input and produces the - following output: - - "diff-inputFileName" - the junctions from an input file A that are not - present in input file B. - - "report.txt" - a brief summary report (if --statout is used) - - Two junctions are considered equivalent if: - - they come from different files - - left and right positions of one junction are not more than "--distance" - bases apart from the corresponding positions of another junction - - the junction scores are equal or above the scoreThreshold - - they are on the same strands - - OPTIONS - -h [ --help ] - Print this help message. - - --beta - This is a beta command. To run this command, you must pass the --beta - flag. - - -s [ --reference ] arg - Reference file. - - -a [ --junctionsA ] arg - input junction file A. - - -b [ --junctionsB ] arg - input junction file B. - - -A [ --scoreThresholdA ] arg (=10) - score threshold value for the input file A. - - -B [ --scoreThresholdB ] arg (=0) - score threshold value for the input file B. - - -d [ --distance ] arg (=200) - Max distance between coordinates of potentially compatible junctions. - - -l [ --minlength ] arg (=500) - Minimum deletion junction length to be included into the difference - file. - - -o [ --output-prefix ] arg - The path prefix for all the output reports. - - -S [ --statout ] - (Debug) Report various input file statistics. Experimental feature. - - SUPPORTED FORMAT_VERSION - 1.5 or later - </help> -</tool>
--- a/cgatools_suite/tools/cgatools/listtestvariants.xml Thu Jun 07 14:32:32 2012 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,239 +0,0 @@ -<tool id="cga_listtestvariants" name="listvariants(beta)-testvariants(beta)" version="1.0.1"> -<!-- -This tool creates a GUI for cgatools listvariants and testvariants from Complete Genomics, Inc. -to be run consecutively with the same input files. -written 5-29-2012 by bcrain@completegenomics.com ---> - - <description></description> <!--adds description in toolbar--> - - <requirements> - <requirement type="binary">cgatools</requirement> - </requirements> - - <command> <!--run executable--> - cgatools listvariants - --beta - --reference ${crr.fields.path} - --output $output1 - #if $include_list.listing == "yes" <!--only added when yes--> - --variant-listing $include_list.list - #end if - $longvar - --variants - #if $file_types.data_sources.data_source == "in" - #for $v in $file_types.data_sources.varfiles <!--get each var file--> - ${v.input} - #end for - #else - `cat $file_types.data_sources.varlist` - #end if - ; - - cgatools testvariants - --beta - --reference ${crr.fields.path} - --output $output2 - --input $output1 - --variants - #if $file_types.data_sources.data_source == "in" - #for $v in $file_types.data_sources.varfiles <!--get each var/mastervar file--> - ${v.input} - #end for - #else - `cat $file_types.data_sources.varlist` - #end if - </command> - - <outputs> - <data format="tabular" name="output1" label="listvariants output"/> - <data format="tabular" name="output2" label="testvariants output"/> - </outputs> - - <inputs> - <!--form field to select crr file--> - <param name="crr" type="select" label="Genome build"> - <options from_data_table="cg_crr_files" /> - </param> - - <!--form field to select long variants option--> - <param name="longvar" type="select" label="List long variants?"> - <option value="" selected="true">no</option> - <option value="--list-long-variants">yes</option> - </param> - - <!--form fields to include existing variant list--> - <conditional name="include_list"> - <param name="listing" type="select" label="Include variant listing?"> - <option value="no" selected="true">no</option> - <option value="yes">yes</option> - </param> - <when value="yes"> - <param name="list" type="data" format="tabular" label="Variant listing"> - <validator type="unspecified_build" /> - <validator type="dataset_metadata_in_file" filename="cg_crr_files.loc" - metadata_name="dbkey" metadata_column="1" - message="cgatools is not currently available for this build."/> - </param> - </when> - </conditional> - - <!--conditional to select input file type--> - <conditional name="file_types"> - <param name="file_type" type="select" label="Select the input file type"> - <option value="var" selected="true">var files</option> - <option value="mastervar">mastervar files</option> - </param> - - <when value="var"> - <!--conditional to select variant file input--> - <conditional name="data_sources"> - <param name="data_source" type="select" label="Where are the input var files?"> - <option value="in" selected="true">imported into Galaxy</option> - <option value="out">located outside Galaxy (available only for local Galaxy instances)</option> - </param> - <when value="in"> - <!--form field to select variant files--> - <repeat name="varfiles" title="Variant files"> - <param name="input" type="data" format="cg_var" label="Dataset"> - <validator type="unspecified_build" /> - <validator type="dataset_metadata_in_file" filename="cg_crr_files.loc" - metadata_name="dbkey" metadata_column="1" - message="cgatools is not currently available for this build."/> - </param> - </repeat> - </when> - <when value="out"> - <!--form field to select crr file--> - <param name="varlist" type="text" label="List of variant files (/path/file)" size="200" help="file with list of var files (/path/varfile), var files can be compressed (gz, bz2)."/> - </when> - </conditional> - </when> - - <when value="mastervar"> - <!--conditional to select variant file input--> - <conditional name="data_sources"> - <param name="data_source" type="select" label="Where are the input mastervar files?"> - <option value="in" selected="true">imported into Galaxy</option> - <option value="out">located outside Galaxy (available only for local Galaxy instances)</option> - </param> - <when value="in"> - <!--form field to select variant files--> - <repeat name="varfiles" title="Variant files"> - <param name="input" type="data" format="cg_mastervar" label="Dataset"> - <validator type="unspecified_build" /> - <validator type="dataset_metadata_in_file" filename="cg_crr_files.loc" - metadata_name="dbkey" metadata_column="1" - message="cgatools is not currently available for this build."/> - </param> - </repeat> - </when> - <when value="out"> - <!--form field to select crr file--> - <param name="varlist" type="text" label="List of mastervar files (/path/file)" size="200" help="file with list of mastervar files (/path/varfile), mastervar files can be compressed (gz, bz2)."/> - </when> - </conditional> - </when> - </conditional> - </inputs> - - <help> - -**What it does** - -This tool uses the cgatools testvariants to test variant or mastervar files for the presence of variants. - -cgatools: http://sourceforge.net/projects/cgatools/files/ - ------ - -**cgatools Manual**:: - - COMMAND NAME - listvariants - Lists the variants present in a variant file. - - DESCRIPTION - Lists all called variants present in the specified variant files, in a - format suitable for processing by the testvariants command. The output is a - tab-delimited file consisting of the following columns: - - variantId Sequential id assigned to each variant. - chromosome The chromosome of the variant. - begin 0-based reference offset of the beginning of the variant. - end 0-based reference offset of the end of the variant. - varType The varType as extracted from the variant file. - reference The reference sequence. - alleleSeq The variant allele sequence as extracted from the variant - file. - xRef The xRef as extrated from the variant file. - - OPTIONS - -h [ --help ] - Print this help message. - - --beta - This is a beta command. To run this command, you must pass the --beta - flag. - - --reference arg - The reference crr file. - - --output arg (=STDOUT) - The output file (may be omitted for stdout). - - --variants arg - The input variant files (may be positional args). - - --variant-listing arg - The output of another listvariants run, to be merged in to produce the - output of this run. - - --list-long-variants - In addition to listing short variants, list longer variants as well - (10's of bases) by concatenating nearby calls. - - SUPPORTED FORMAT_VERSION - 0.3 or later - - - - COMMAND NAME - testvariants - Tests variant files for presence of variants. - - DESCRIPTION - Tests variant files for presence of variants. The output is a tab-delimited - file consisting of the columns of the input variants file, plus a column - for each assembly results file that contains a character code for each - allele. The character codes have meaning as follows: - - 0 This allele of this genome is consistent with the reference at this - locus but inconsistent with the variant. - 1 This allele of this genome has the input variant at this locus. - N This allele of this genome has no-calls but is consistent with the - input variant. - - OPTIONS - -h [ --help ] - Print this help message. - - --beta - This is a beta command. To run this command, you must pass the --beta - flag. - - --reference arg - The reference crr file. - - --input arg (=STDIN) - The input variants to test for. - - --output arg (=STDOUT) - The output file (may be omitted for stdout). - - --variants arg - The input variant files (may be passed in as arguments at the end of - the command). - - SUPPORTED FORMAT_VERSION - 0.3 or later - </help> -</tool>
--- a/cgatools_suite/tools/cgatools/listvariants.xml Thu Jun 07 14:32:32 2012 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,177 +0,0 @@ -<tool id="cga_listvariant" name="listvariants(beta)" version="0.0.1"> -<!-- -This tool creates a GUI for cgatools listvariants from Complete Genomics, Inc. -written 5-29-2012 by bcrain@completegenomics.com ---> - - <description>lists all called variants</description> <!--adds description in toolbar--> - - <requirements> - <requirement type="binary">cgatools</requirement> - </requirements> - - <command> <!--run executable--> - cgatools listvariants - --beta - --reference ${crr.fields.path} - --output $output - #if $include_list.listing == "yes" <!--only added when yes--> - --variant-listing $include_list.list - #end if - $longvar - --variants - #if $file_types.data_sources.data_source == "in" - #for $v in $file_types.data_sources.varfiles <!--get each var/mastervar file--> - ${v.input} - #end for - #else - `cat $file_types.data_sources.varlist` - #end if - </command> - - <inputs> - <!--form field to select crr file--> - <param name="crr" type="select" label="Genome build"> - <options from_data_table="cg_crr_files" /> - </param> - - <!--form field to select long variants option--> - <param name="longvar" type="select" label="List long variants?"> - <option value="" selected="true">no</option> - <option value="--list-long-variants">yes</option> - </param> - - <!--form fields to include existing variant list--> - <conditional name="include_list"> - <param name="listing" type="select" label="Include variant listing?"> - <option value="no" selected="true">no</option> - <option value="yes">yes</option> - </param> - <when value="yes"> - <param name="list" type="data" format="tabular" label="Variant listing"/> - </when> - </conditional> - - <!--conditional to select input file type--> - <conditional name="file_types"> - <param name="file_type" type="select" label="Select the input file type"> - <option value="var" selected="true">var files</option> - <option value="mastervar">mastervar files</option> - </param> - - <when value="var"> - <!--conditional to select variant file input--> - <conditional name="data_sources"> - <param name="data_source" type="select" label="Where are the input var files?"> - <option value="in" selected="true">imported into Galaxy</option> - <option value="out">located outside Galaxy (available only for local Galaxy instances)</option> - </param> - <when value="in"> - <!--form field to select variant files--> - <repeat name="varfiles" title="Variant files"> - <param name="input" type="data" format="cg_var" label="Dataset"> - <validator type="unspecified_build" /> - <validator type="dataset_metadata_in_file" filename="cg_crr_files.loc" - metadata_name="dbkey" metadata_column="1" - message="cgatools is not currently available for this build."/> - <!--<validator type="expression" message="Dataset does not match selected build.">$dbkey == $crr.fields.dbkey</validator>--> - </param> - </repeat> - </when> - <when value="out"> - <!--form field to select crr file--> - <param name="varlist" type="text" label="List of variant files (/path/file)" size="200" help="file with list of var files (/path/varfile), var files can be compressed (gz, bz2)."/> - </when> - </conditional> - </when> - - <when value="mastervar"> - <!--conditional to select variant file input--> - <conditional name="data_sources"> - <param name="data_source" type="select" label="Where are the input mastervar files?"> - <option value="in" selected="true">imported into Galaxy</option> - <option value="out">located outside Galaxy (available only for local Galaxy instances)</option> - </param> - <when value="in"> - <!--form field to select variant files--> - <repeat name="varfiles" title="Variant files"> - <param name="input" type="data" format="cg_mastervar" label="Dataset"> - <validator type="unspecified_build" /> - <validator type="dataset_metadata_in_file" filename="cg_crr_files.loc" - metadata_name="dbkey" metadata_column="1" - message="cgatools is not currently available for this build."/> - </param> - </repeat> - </when> - <when value="out"> - <!--form field to select crr file--> - <param name="varlist" type="text" label="List of mastervar files (/path/file)" size="200" help="file with list of mastervar files (/path/varfile), mastervar files can be compressed (gz, bz2)."/> - </when> - </conditional> - </when> - </conditional> - </inputs> - - <outputs> - <data format="tabular" name="output"/> - </outputs> - - <help> - -**What it does** - -This tool uses the cgatools listvariants to list all called variants present in the var or mastervar files. - -cgatools: http://sourceforge.net/projects/cgatools/files/ - ------ - -**cgatools Manual**:: - - COMMAND NAME - listvariants - Lists the variants present in a variant file. - - DESCRIPTION - Lists all called variants present in the specified variant files, in a - format suitable for processing by the testvariants command. The output is a - tab-delimited file consisting of the following columns: - - variantId Sequential id assigned to each variant. - chromosome The chromosome of the variant. - begin 0-based reference offset of the beginning of the variant. - end 0-based reference offset of the end of the variant. - varType The varType as extracted from the variant file. - reference The reference sequence. - alleleSeq The variant allele sequence as extracted from the variant - file. - xRef The xRef as extrated from the variant file. - - OPTIONS - -h [ --help ] - Print this help message. - - --beta - This is a beta command. To run this command, you must pass the --beta - flag. - - --reference arg - The reference crr file. - - --output arg (=STDOUT) - The output file (may be omitted for stdout). - - --variants arg - The input variant files (may be positional args). - - --variant-listing arg - The output of another listvariants run, to be merged in to produce the - output of this run. - - --list-long-variants - In addition to listing short variants, list longer variants as well - (10's of bases) by concatenating nearby calls. - - SUPPORTED FORMAT_VERSION - 0.3 or later - </help> -</tool>
--- a/cgatools_suite/tools/cgatools/snpdiff.xml Thu Jun 07 14:32:32 2012 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,116 +0,0 @@ -<tool id="cga_snpdiff" name="snpdiff" version="0.0.1"> - - <description>compares snp calls to a Complete Genomics variant file.</description> <!--adds description in toolbar--> - - <requirements> - <requirement type="binary">cgatools</requirement> - </requirements> - - <command> <!--run executable--> - cgatools snpdiff --beta -h - </command> - - <outputs> - <data format="tabular" name="output" /> - </outputs> - - <inputs> - </inputs> - - <help> - -**What it does** - -This tool ompares snp calls to a Complete Genomics variant file. - -cgatools: http://sourceforge.net/projects/cgatools/files/ - ------ - -**cgatools Manual**:: - - COMMAND NAME - snpdiff - Compares snp calls to a Complete Genomics variant file. - - DESCRIPTION - Compares the snp calls in the "genotypes" file to the calls in a Complete - Genomics variant file. The genotypes file is a tab-delimited file with at - least the following columns (additional columns may be given): - - Chromosome (Required) The name of the chromosome. - Offset0Based (Required) The 0-based offset in the chromosome. - GenotypesStrand (Optional) The strand of the calls in the Genotypes - column (+ or -, defaults to +). - Genotypes (Optional) The calls, one per allele. The following - calls are recognized: - A,C,G,T A called base. - N A no-call. - - A deleted base. - . A non-snp variation. - - The output is a tab-delimited file consisting of the columns of the - original genotypes file, plus the following additional columns: - - Reference The reference base at the given position. - VariantFile The calls made by the variant file, one per allele. - The character codes are the same as is described for - the Genotypes column. - DiscordantAlleles (Only if Genotypes is present) The number of - Genotypes alleles that are discordant with calls in - the VariantFile. If the VariantFile is described as - haploid at the given position but the Genotypes is - diploid, then each genotype allele is compared - against the haploid call of the VariantFile. - NoCallAlleles (Only if Genotypes is present) The number of - Genotypes alleles that were no-called by the - VariantFile. If the VariantFile is described as - haploid at the given position but the Genotypes is - diploid, then a VariantFile no-call is counted twice. - - The verbose output is a tab-delimited file consisting of the columns of the - original genotypes file, plus the following additional columns: - - Reference The reference base at the given position. - VariantFile The call made by the variant file for one allele (there is - a line in this file for each allele). The character codes - are the same as is described for the Genotypes column. - [CALLS] The rest of the columns are pasted in from the VariantFile, - describing the variant file line used to make the call. - - The stats output is a comma-separated file with several tables describing - the results of the snp comparison, for each diploid genotype. The tables - all describe the comparison result (column headers) versus the genotype - classification (row labels) in different ways. The "Locus classification" - tables have the most detailed match classifications, while the "Locus - concordance" tables roll these match classifications up into "discordance" - and "no-call". A locus is considered discordant if it is discordant for - either allele. A locus is considered no-call if it is concordant for both - alleles but has a no-call on either allele. The "Allele concordance" - describes the comparison result on a per-allele basis. - - OPTIONS - -h [ --help ] - Print this help message. - - --reference arg - The input crr file. - - --variants arg - The input variant file. - - --genotypes arg - The input genotypes file. - - --output-prefix arg - The path prefix for all output reports. - - --reports arg (=Output,Verbose,Stats) - Comma-separated list of reports to generate. A report is one of: - Output The output genotypes file. - Verbose The verbose output file. - Stats The stats output file. - - SUPPORTED FORMAT_VERSION - 0.3 or later - </help> -</tool>
--- a/cgatools_suite/tools/cgatools/testing.pl Thu Jun 07 14:32:32 2012 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,10 +0,0 @@ -#!/usr/bin/perl - -print "$0 @ARGV\n"; -open OUT, ">@ARGV[0]"; -print "test1 ok\ttest1 ok\ntest1 ok\ttest1 ok\n"; -print OUT "test ok\ttest ok\ntest ok\ttest ok\n"; -close OUT; -open OUT, ">somefile"; -print OUT "test2 ok\ttest2 ok\ntest2 ok\ttest2 ok\n"; -close OUT; \ No newline at end of file
--- a/cgatools_suite/tools/cgatools/testvariants.xml Thu Jun 07 14:32:32 2012 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,157 +0,0 @@ -<tool id="cga_testvariants" name="testvariants(beta)" version="0.0.1"> -<!-- -This tool creates a GUI for cgatools testvariants from Complete Genomics, Inc. -written 5-29-2012 by bcrain@completegenomics.com ---> - - <description>test for the presence of variants</description> <!--adds description in toolbar--> - - <requirements> - <requirement type="binary">cgatools</requirement> - </requirements> - - <command> <!--run executable--> - cgatools testvariants - --beta - --reference ${crr.fields.path} - --output $output - --input $listing - --variants - #if $file_types.data_sources.data_source == "in" - #for $v in $file_types.data_sources.varfiles <!--get each var/mastervar file--> - ${v.input} - #end for - #else - `cat $file_types.data_sources.varlist` - #end if - </command> - - <outputs> - <data format="tabular" name="output" /> - </outputs> - - <inputs> - <!--form field to select crr file--> - <param name="crr" type="select" label="Genome build"> - <options from_data_table="cg_crr_files" /> - </param> - - <!--form fields to select variant list--> - <param name="listing" type="data" format="tabular" label="Select variant list"> - <validator type="unspecified_build" /> - <validator type="dataset_metadata_in_file" filename="cg_crr_files.loc" - metadata_name="dbkey" metadata_column="1" - message="cgatools is not currently available for this build."/> - </param> - - <!--conditional to select input file type--> - <conditional name="file_types"> - <param name="file_type" type="select" label="Select the input file type"> - <option value="var" selected="true">var files</option> - <option value="mastervar">mastervar files</option> - </param> - - <when value="var"> - <!--conditional to select variant file input--> - <conditional name="data_sources"> - <param name="data_source" type="select" label="Where are the input var files?"> - <option value="in" selected="true">imported into Galaxy</option> - <option value="out">located outside Galaxy (available only for local Galaxy instances)</option> - </param> - <when value="in"> - <!--form field to select variant files--> - <repeat name="varfiles" title="Variant files"> - <param name="input" type="data" format="cg_var" label="Dataset"> - <validator type="unspecified_build" /> - <validator type="dataset_metadata_in_file" filename="cg_crr_files.loc" - metadata_name="dbkey" metadata_column="1" - message="cgatools is not currently available for this build."/> - </param> - </repeat> - </when> - <when value="out"> - <!--form field to select crr file--> - <param name="varlist" type="text" label="List of variant files (/path/file)" size="200" help="file with list of var files (/path/varfile), var files can be compressed (gz, bz2)."/> - </when> - </conditional> - </when> - - <when value="mastervar"> - <!--conditional to select variant file input--> - <conditional name="data_sources"> - <param name="data_source" type="select" label="Where are the input mastervar files?"> - <option value="in" selected="true">imported into Galaxy</option> - <option value="out">located outside Galaxy (available only for local Galaxy instances)</option> - </param> - <when value="in"> - <!--form field to select variant files--> - <repeat name="varfiles" title="Variant files"> - <param name="input" type="data" format="cg_mastervar" label="Dataset"> - <validator type="unspecified_build" /> - <validator type="dataset_metadata_in_file" filename="cg_crr_files.loc" - metadata_name="dbkey" metadata_column="1" - message="cgatools is not currently available for this build."/> - </param> - </repeat> - </when> - <when value="out"> - <!--form field to select crr file--> - <param name="varlist" type="text" label="List of mastervar files (/path/file)" size="200" help="file with list of mastervar files (/path/varfile), mastervar files can be compressed (gz, bz2)."/> - </when> - </conditional> - </when> - </conditional> - </inputs> - - <help> - -**What it does** - -This tool uses the cgatools testvariants to test variant or mastervar files for the presence of variants. - -cgatools: http://sourceforge.net/projects/cgatools/files/ - ------ - -**cgatools Manual**:: - - COMMAND NAME - testvariants - Tests variant files for presence of variants. - - DESCRIPTION - Tests variant files for presence of variants. The output is a tab-delimited - file consisting of the columns of the input variants file, plus a column - for each assembly results file that contains a character code for each - allele. The character codes have meaning as follows: - - 0 This allele of this genome is consistent with the reference at this - locus but inconsistent with the variant. - 1 This allele of this genome has the input variant at this locus. - N This allele of this genome has no-calls but is consistent with the - input variant. - - OPTIONS - -h [ --help ] - Print this help message. - - --beta - This is a beta command. To run this command, you must pass the --beta - flag. - - --reference arg - The reference crr file. - - --input arg (=STDIN) - The input variants to test for. - - --output arg (=STDOUT) - The output file (may be omitted for stdout). - - --variants arg - The input variant files (may be passed in as arguments at the end of - the command). - - SUPPORTED FORMAT_VERSION - 0.3 or later - </help> -</tool>
--- a/cgatools_suite/tools/cgatools/varfilter.xml Thu Jun 07 14:32:32 2012 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,184 +0,0 @@ -<tool id="cga_varfilter" name="varfilter(beta)" version="0.0.1"> -<!-- -This tool creates a GUI for cgatools varfilter from Complete Genomics, Inc. -The function is called via a Perl script vartools_wrapper.pl, designed to generate the correctly formated filters to append the input file on the command line. -written 6-1-2012 by bcrain@completegenomics.com ---> - - <description>copies input file, applying filters.</description> <!--adds description in toolbar--> - - <requirements> - <requirement type="binary">cgatools</requirement> - </requirements> - - <command interpreter="perl"> - varfilter_wrapper.pl - --reference $crr.fields.path - --output $output - --input $file_types.data_sources.input - #for $f in $filters - --zygosity $f.zygosity - --vartype $f.vartype - --varscorevaf x$f.varscorevaf - --varscoreeaf x$f.varscoreeaf - --varquality $f.varquality - #end for - </command> - - <outputs> - <data format="cg_var" name="output" /> - </outputs> - - <inputs> - <!--form field to select crr file--> - <param name="crr" type="select" label="Genome build"> - <options from_data_table="cg_crr_files" /> - </param> - - <!--conditional to select input file type--> - <conditional name="file_types"> - <param name="file_type" type="select" label="Select the input file type"> - <option value="var" selected="true">var files</option> - <option value="mastervar">mastervar files</option> - </param> - - <when value="var"> - <!--conditional to select variant file input--> - <conditional name="data_sources"> - <param name="data_source" type="select" label="Where is the input var file?"> - <option value="in" selected="true">imported into Galaxy</option> - <option value="out">located outside Galaxy (available only for local Galaxy instances)</option> - </param> - <when value="in"> - <!--form field to select variant files--> - <param name="input" type="data" format="cg_var" label="Var file"> - <validator type="unspecified_build" /> - <validator type="dataset_metadata_in_file" filename="cg_crr_files.loc" - metadata_name="dbkey" metadata_column="1" - message="cgatools is not currently available for this build."/> - </param> - </when> - <when value="out"> - <!--form field to select crr file--> - <param name="input" type="text" label="Var file (/path/file)" size="200" help="var file can be compressed (gz, bz2)."/> - </when> - </conditional> - </when> - - <when value="mastervar"> - <!--conditional to select variant file input--> - <conditional name="data_sources"> - <param name="data_source" type="select" label="Where is the input mastervar file?"> - <option value="in" selected="true">imported into Galaxy</option> - <option value="out">located outside Galaxy (available only for local Galaxy instances)</option> - </param> - <when value="in"> - <!--form field to select variant files--> - <param name="input" type="data" format="cg_mastervar" label="Mastervar file"> - <validator type="unspecified_build" /> - <validator type="dataset_metadata_in_file" filename="cg_crr_files.loc" - metadata_name="dbkey" metadata_column="1" - message="cgatools is not currently available for this build."/> - </param> - </when> - <when value="out"> - <!--form field to select crr file--> - <param name="input" type="text" label="Mastervar file (/path/file)" size="200" help="mastervar file can be compressed (gz, bz2)."/> - </when> - </conditional> - </when> - </conditional> - - <!-- formfields to add filters --> - <repeat name="filters" title="Filter"> - <param name="zygosity" type="select" label="Filter out call (set to no-call) IF locus IS"> - <option value="NA">- all loci -</option> - <option value="hom">homozygous</option> - <option value="het">heterzygous</option> - </param> - - <param name="vartype" type="select" label="AND varType IS"> - <option value="NA">- any varType -</option> - <option value="snp">snp</option> - <option value="ins">ins</option> - <option value="del">del</option> - <option value="sub">sub</option> - <option value="ref">ref</option> - </param> - - <param name="varscorevaf" type="text" label="AND varScoreVAF IS LESS THAN"/> - <param name="varscoreeaf" type="text" label="AND varScoreEAF IS LESS THAN"/> - - <param name="varquality" type="select" label="AND varQuality IS NOT"> - <option value="NA"> </option> - <option value="VQHigh">VQHigh</option> - <option value="VQLOW">VQLOW</option> - </param> - </repeat> - </inputs> - - <help> - -**What it does** - -This tool copies input var file or masterVar file to output, applying specified filters. - -cgatools: http://sourceforge.net/projects/cgatools/files/ - ------ - -**cgatools Manual**:: - - COMMAND NAME - varfilter - Copies input var file or masterVar file to output, applying - specified filters. - - DESCRIPTION - Copies input var file or masterVar file to output, applying specified - filters (which are available to all cgatools commands that read a var file - or masterVar file as input). Filters are specified by appending the filter - specification to the var file name on the command line. For example: - - /path/to/var.tsv.bz2#varQuality!=VQHIGH - - The preceding example filters out any calls marked as VQLOW. The filter - specification follows the "#" sign, and consists of a list of filters to - apply, separated by a comma. Each filter is a colon-separated list of call - selectors. Any scored call that passes all the colon-separated call - selectors for one or more of the comma-separated filters is turned into a - no-call. The following call selectors are available: - - hom Selects only calls in homozygous loci. - het Selects any scored call not selected by the hom selector. - varType=XX Selects calls whose varType is XX. - varScoreVAF<XX Selects calls whose varScoreVAF<XX. - varScoreEAF<XX Selects calls whose varScoreEAF<XX. - varQuality!=XX Selects calls whose varQuality is not XX. - - Here is an example that filters homozygous SNPs with varScoreVAF < 25 and - heterozygous insertions with varScoreEAF < 50: - - - '/path/to/var.tsv.bz2#hom:varType=snp:varScoreVAF<25,het:varType=ins:varScoreEAF<50' - - - OPTIONS - -h [ --help ] - Print this help message. - - --beta - This is a beta command. To run this command, you must pass the --beta flag. - - --reference arg - The reference crr file. - - --input arg - The input var file or masterVar file (typically with filters specified). - - --output arg (=STDOUT) - The output file (may be omitted for stdout). - - SUPPORTED FORMAT_VERSION - 0.3 or later - </help> -</tool>
--- a/cgatools_suite/tools/cgatools/varfilter_wrapper.pl Thu Jun 07 14:32:32 2012 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,56 +0,0 @@ -#!/usr/bin/perl -use strict; -use Getopt::Long; -use vars qw($opt_reference $opt_input $opt_output @opt_zygosity @opt_vartype @opt_varscorevaf @opt_varscoreeaf @opt_varquality); -$| = 1; # set autoflush to screen - -# This is a wrapper for the cgatools varfilter function to run cgatools varfilter in Galaxy. -# The wrapper generates the filter(s) in the correct format to be used with the input file. -# written 6-1-2012 by bcrain@completegenomics.com - - -#print join("\n", @ARGV), "\n"; -&GetOptions("reference=s", "input=s", "output=s", "zygosity=s@", "vartype=s@", "varscorevaf=s@", "varscoreeaf=s@", "varquality=s@"); - -my $append = ''; - -for (my $i = 0; $i <= $#opt_zygosity; $i ++) -{ - my $filter = ''; - unless ($opt_zygosity[$i] eq 'NA') {$filter = $opt_zygosity[$i];} - unless ($opt_vartype[$i] eq 'NA') - { - $filter ne '' and $filter .= ':'; - $filter .= 'varType=' . $opt_vartype[$i]; - } - unless ($opt_varscorevaf[$i] eq 'x') - { - $filter ne '' and $filter .= ':'; - $opt_varscorevaf[$i] =~ s/^x//; - $filter .= 'varScoreVAF<' . $opt_varscorevaf[$i]; - } - unless ($opt_varscoreeaf[$i] eq 'x') - { - $filter ne '' and $filter .= ':'; - $opt_varscoreeaf[$i] =~ s/^x//; - $filter .= 'varScoreEAF<' . $opt_varscoreeaf[$i]; - } - unless ($opt_varquality[$i] eq 'NA') - { - $filter ne '' and $filter .= ':'; - $filter .= 'varQuality!=' . $opt_varquality[$i]; - } - - if ($filter ne '') - { - if ($append eq '') {$append = '#' . $filter;} - else {$append .= ',' . $filter;} - } -} -print "cgatools varfilter ---beta ---reference $opt_reference ---output $opt_output ---input '${opt_input}${append}'\n"; - -`cgatools varfilter --beta --reference $opt_reference --output $opt_output --input '${opt_input}${append}'`; \ No newline at end of file
Binary file cgatools_suite/tools/cgi_scripts/._Calculate_TestVariants_Variant_Frequencies.xml has changed
Binary file cgatools_suite/tools/cgi_scripts/._Calculate_TestVariants_Variant_Frequencies_0_1_0.pl has changed
--- a/cgatools_suite/tools/cgi_scripts/Calculate_TestVariants_Variant_Frequencies.xml Thu Jun 07 14:32:32 2012 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,67 +0,0 @@ -<tool id="pl_calculatefreq" name="Calculate_Variant_Frequencies" version="0.0.1"> - - <description>in cgatools-testvariants file</description> <!--adds description in toolbar--> - - <command interpreter="perl"> - Calculate_TestVariants_Variant_Frequencies_0_1_0.pl - --Input $input - --First_Genome_Field_Nr $first_col - --Last_Genome_Field_Nr $last_col - --Output1 $output1 - --Output2 $output2 - </command> - - <outputs> - <data format="tabular" name="output1" label="TestVariant_Frequencies on "/> - <data format="tabular" name="output2" label="TestVariant_Frequencies_Short on "/> - </outputs> - - <inputs> - <param name="input" type="data" format="tabular" label="TestVariants input file"> - <validator type="unspecified_build" /> - </param> - <param name="first_col" type="text" label="What column number is the first genome"/> - <param name="last_col" type="text" label="What column number is the last genome"/> - </inputs> - - - <help> - -**What it does** - -This tool calculates the allele frequencies for all variants present in the testvariant file. - ------ - -**Instructions**:: - - Calculate the frequencies of variants in a testvariants output file - Two values calculated: - Frequency vs all alleles - Frequency vs called alleles - - Input: testvariants file - Outputs: - All data to *-Freq.tsv, including scores and quals - vars and freqs to *-Freq_Short.tsv - Exceptions to *-Freq_Log - Stats to *-Freq_Stats - - - perl Calculate_TestVariants_Variant_Frequencies_0_0_3.pl \ - --Input input_file \ - --First_Genome_Field_Nr col_nr1 \ - --Last_Genome_Field_Nr col_nr2 - --Output1 output1 \ - --Output2 output_short \ - eg - perl Calculate_TestVariants_Variant_Frequencies_0_0_3.pl \ - --Input /data/Family_Quartet_testvariants.tsv \ - --Output /data/Family_Quartet_testvariants - --First_Genome_Field_Nr 9 \ - --Last_Genome_Field_Nr 11 - --Output1 /data/Family_Quartet_testvariants - --Output2 /data/Family_Quartet_testvariants_short - - </help> -</tool>
--- a/cgatools_suite/tools/cgi_scripts/Calculate_TestVariants_Variant_Frequencies_0_1_0.pl Thu Jun 07 14:32:32 2012 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,271 +0,0 @@ -#!/usr/bin/perl -use strict; -#use feature "say"; -#use File::Basename; -$| = 1; - -# Get_TestVariants_Variant_Frequencies -# Calculate the frequencies of variants in a testvariants output file -# Two values calculated: -# Frequency vs all alleles -# Frequency vs called alleles - -# Input is a testvariants file -# Outputs: -# All data to *-Freq.tsv, including scores and quals -# vars and freqs to *-Freq_Short.tsv -# Exceptions to *-Freq_Log -# Stats to *-Freq_Stats - -# Format: -# perl prog file dir -# ie -# perl Get_TestVariants_Variant_Frequencies \ -# --Input input_file \ -# --First_Genome_Field_Nr col_nr1 \ -# --Last_Genome_Field_Nr col_nr2 -# --Output1 output1 \ -# --Output2 output2 - -# eg -# perl /perl/Get_TestVariants_Variant_Frequencies_0_0_1.pl \ -# --Input /data/Family_Quartet_testvariants.tsv \ -# --First_Genome_Field_Nr 9 \ -# --Last_Genome_Field_Nr 11 -# --Output1 output1 \ -# --Output2 output2 - - -# Rick Tearle 2010-11 - -my $Time -= time; # start time -my $Debug = 0; - -# Parsing and storing input parameters -# Only childfields can be repeated -print "$0 @ARGV\nProcessing input parameters\n"; -my %ExpectedParams = GetExpectedParams (); -my %EnteredParams = GetEnteredParams (); - -# Setting up prog paras from input paras -my $FileIn = $EnteredParams{input}; -unless (-f $FileIn) {die "Testvariants input file $FileIn not found\n";} # requires existing file -#my $FileOut = $EnteredParams{output}; # -#$DirectoryOut =~ s/\/$//; # remove trailing slash if present -#unless (-d $DirectoryOut) {die "Output directory $DirectoryOut not found\n";} # requires existing file -#print "$FileIn\n$DirectoryOut\n"; -#$FileIn =~ /(^.+\/)(.+?)\./; # get filename without path and without extensions - -# my $FileOut1 = $FileOut."-Freq.tsv"; -# my $FileOut2 = $FileOut."-Freq_Short.tsv"; -# my $FileOut3 = $FileOut."-Freq_Stats.tsv"; -# my $FileOut4 = $FileOut."-Freq_Log.tsv"; - -print "\nOpening Input File:\n\t$FileIn\n"; -my $IN = OpenFile ($FileIn); # open the file with correct file format - -#print "\nOpening Output Files:\n\t$FileOut1\n\t$FileOut2\n\t$FileOut3\n\t$FileOut4\n"; #exit; -open my $OUT1, ">", $EnteredParams{output1}; -open my $OUT2, ">", $EnteredParams{output2}; -#open my $OUT3, ">", $EnteredParams{output3}; -#open my $OUT4, ">", $EnteredParams{output4}; - -# Get col header and genomes fields -my $ColHeader = <$IN>; # get col header -chomp $ColHeader; -my @ColHeader = split /\t/, $ColHeader; -my $StartGenomes = $EnteredParams{first_genome_field_nr} - 1; # first column with testvariants data, 1 based -> 0 based -my $StopGenomes = $EnteredParams{last_genome_field_nr} - 1; # first column with testvariants data, 1 based -> 0 based -if ($StartGenomes < 0) {die "No valid entry for First_Genome_Field_Nr, must be 1 or greater\n";} -if ($StopGenomes < 0) {die "No valid entry for Last_Genome_Field_Nr, must be 1 or greater\n";} -if ($StartGenomes > $StopGenomes) {die "Last_Genome_Field_Nr must be greater than or equal to First_Genome_Field_Nr\n";} -if ($StartGenomes > int @ColHeader) {die "First_Genome_Field_Nr > number of fields in column header\n";} -if ($StopGenomes > int @ColHeader) {die "Last_Genome_Field_Nr > number of fields in column header\n";} -my $NrGenomes = $StopGenomes - $StartGenomes + 1; -#print "$StartGenomes\t$StopGenomes\n"; #exit; -#print "First Genome Field:\n\t$ColHeader[$StartGenomes]\n"; -#print "Last Genome Field:\n\t$ColHeader[$StopGenomes]\n\n"; - -# print column headers -print $OUT1 join("\t",@ColHeader),"\tAllFreq\tCalledFreq\n"; -print $OUT2 join("\t",@ColHeader[0..7]),"\tAllFreq\tCalledFreq\n"; -print join("\t",@ColHeader),"\n"; -print "First Genome Field: $ColHeader[$StartGenomes]\n"; -print "Last Genome Field: $ColHeader[$StopGenomes]\n"; -print "Nr Genomes: $NrGenomes\n\n"; - -print "\nProcessing Variants....\n"; -my $VariantCount = 0; # variant locus counter, not used -my %AllFreqCounts; # storing histogram of all freq counts -my %CalledFreqCounts; # storing histogram of called freq counts -my $Warnings; -while (<$IN>) -{ - # testvariants fields: variantId chromosome begin end varType reference alleleSeq xRef GS000000XX1-ASM GS000000XX2-ASM [GS000000XXN-ASM] - my $Line = $_; # save line for output below - chomp $Line; - my @F = split /\t/, $Line; # split in to fields - $VariantCount++; # increment variant counter - my $UseFields = join ("",@F[$StartGenomes..$StopGenomes]); # get genome fields as string, to count 0s and 1s - my $Count1 = () = $UseFields =~ /1/g; # count the number of 1s - my $Count0 = () = $UseFields =~ /0/g; # count the number of 0s - my $CountN = () = $UseFields =~ /N/g; # count the number of Ns - my $NrAlleles = $Count1 + $Count0 + $CountN; # total count - unless ($NrAlleles == $NrGenomes *2 or $NrAlleles == $NrGenomes) # count does not match expected for diploid/haploid locus - { - print "$NrAlleles alleles for variant ",join(" ",@F[0..7]),"\n"; # log warning - #print "Expected $NrGenomes or ",$NrGenomes*2," alleles depending on ploidy of locus\n"; - #if ($Warnings++ > 10) {die "Have found $Warnings exceptions for this file, termnating processing\n";} # terminate if too many warnings - } - my $AllFreq = sprintf("%0.3f",$Count1/$NrAlleles); # calculate freq of 1s vs all alleles - my $CalledFreq = sprintf("%0.3f",0); - if ($Count1+$Count0) {$CalledFreq = sprintf("%0.3f",$Count1/($Count1+$Count0));} # calculate freq of 1s vs called alleles, if there are any - $AllFreqCounts{$AllFreq}++; # increment all freq histogram - $CalledFreqCounts{$CalledFreq}++; # increment called freq histogram - #print "$Line\n$AlleleCount\t$Count1\t$Count0\t$AllFreq\t$CalledFreq\n"; #exit; - print $OUT1 "$Line\t$AllFreq\t$CalledFreq\n"; # output full testvariants plus frequencies for this var - print $OUT2 join("\t",@F[0..7]),"\t$AllFreq\t$CalledFreq\n"; # output just var info plus frequencies for this var - #exit if $VariantCount > 20; -} -close $OUT1; -close $OUT2; - -# Print frequency histograms -print "Nr Variants at each Frequency (All):\nFreq\tCount\n"; # header -foreach my $Freq (sort {$a <=> $b} keys %AllFreqCounts) {print "$Freq\t$AllFreqCounts{$Freq}\n";} - -print "\nNr Variants at each Frequency (Called):\nFreq\tCount\n"; # header -foreach my $Freq (sort {$a <=> $b} keys %CalledFreqCounts) {print "$Freq\t$CalledFreqCounts{$Freq}\n";} - -$Time += time; -print "\ntime $Time\n"; - -########################################################################### -# SUBS # -########################################################################### - -sub GetExpectedParams -{ - my %Hash = - ( - "input" => -1, - "output_dir" => -1, - ); # store parameters and values - return %Hash; -} - -sub GetEnteredParams -{ - # Processing @ARGV - my %Hash; - - my @ARGVs = split /--/, join (" ",@ARGV); # split args on --, into array - #print "Start\n", join ("\n",@ARGVs),"\n",int @ARGVs - 1,"\n\n" if $Debug; - #print "Key\tVal\n" if $Debug; #exit; - for my $n (1..$#ARGVs) # parse each - { - $ARGVs[$n] =~ s/\s+$//; # remove any trailing spaces - my ($Key, $Val) = split / /, $ARGVs[$n], 2; # put first element into key, any other elements into val - $Key = lc $Key; - $Hash{$Key} = $Val; # make a hash entry out of key and val - #print "$Key\t$EnteredParams{$Key}\n" if $Debug; - } - #print int(keys %Hash),"\n" if $Debug; - #foreach my $Arg (keys %Hash) {print "Arg: $Arg\t",$ExpectedParams{$Arg},"\n";} - #print "Arg string:\t",join (" ",@ARGV),"\n" if $Debug; - #exit if $Debug; - return %Hash; # hash now has each -- entry param, with associated values -} - -sub SaveArrayAsString -{ - my $FH = shift; - my $Fields = shift; - #print "$Fields\n"; - print $FH join("\t",@$Fields),"\n"; -} - -sub ConcatenateVariants -{ - my $ArrayIn = shift; # ptr to array - my $StateFieldNr = shift; # field to process - #print int(@$ArrayIn),"\n"; - my @ArrayOut; # array to store records out - my $Nr = -1; - foreach my $Entry (@$ArrayIn) - { - } - return \@ArrayOut; # return ptr to array -} - -sub LoadStateRecord -{ - my $Out = shift; - my $In = shift; - my $StateFieldNr = shift; - - $Out->{State} = $$In[$StateFieldNr]; # get state for new record - $Out->{Chr} = $$In[1]; # get chr - $Out->{Begin} = $$In[2]; # get begin of state range - $Out->{End} = $$In[3]; # get current end of state range - $Out->{Records}++; # record added to new count -} - -sub OpenFile -{ - my $File = shift; - my $FH; - open ($FH, "$File") or die ("$!: can't open file $File"); - return $FH; -} - -sub OpenFileold -{ - my $File = shift; - my $FH; - - if ($File =~ /.bz2$/) - { - open ($FH, "bzcat $File |") or die ("$!: can't open file $File"); - } - elsif ($File =~ /.gz$/) - { - open ($FH, "gunzip -c $File |") or die ("$!: can't open file $File"); - } - elsif ($File =~ /.tsv$/) - { - open ($FH, "cat $File |") or die ("$!: can't open file $File"); - } - else - { - die ("$!: do not recognise file type $File"); - } - return $FH; -} - -sub LoadNewRecord -{ - my $In = shift; - my $Out = shift; - $Out->{Chr} = $In->{Chr}; - $Out->{State} = $In->{State}; - $Out->{Begin} = $In->{Begin}; - $Out->{End} = $In->{End}; - $Out->{Records} = $In->{Records}; -} - -sub NewStateRecord -{ - my $Record = - { - Chr => "", - Begin => -1, - End => -1, - State => "", - Records => 0, - MIEs => 0, - StateErrors => 0, - Length => -1, - }; - return $Record; -}
--- a/cgatools_suite/tools/cgi_scripts/List_Unique_Variants.xml Thu Jun 07 14:32:32 2012 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,323 +0,0 @@ -<tool id="pl_listuniquevariants" name="List_Unique_Variants" version="0.0.1"> - - <description>with annotations from gene or var files</description> <!--adds description in toolbar--> - - <command interpreter="perl"> <!--run executable--> - #if $file_types.file_type =="var2" - List_Unique_Variants_2_1_0.pl --File_Type V --Output_File $output - --Var_Type $file_types.variants - $file_types.scoresVAF - $file_types.scoresEAF - $file_types.varQuality - #if $file_types.data_sources.data_source == "in" - #for $v in $file_types.data_sources.varfiles <!--get each var file--> - --Input_File ${v.input} - #end for - #else - `cat $file_types.data_sources.varlist` - #end if - - #else if $file_types.file_type =="var1" - List_Unique_Variants_2_1_0.pl --File_Type V --Output_File $output - --Var_Type $file_types.variants - $file_types.scores - #if $file_types.data_sources.data_source == "in" - #for $v in $file_types.data_sources.varfiles <!--get each var file--> - --Input_File ${v.input} - #end for - #else - `cat $file_types.data_sources.varlist` - #end if - - #else if $file_types.file_type =="gene" - List_Unique_Variants_2_1_0.pl --File_Type G --Output_File $output - --Var_Type $file_types.variants - --Component $file_types.component - --Impact $file_types.impact - #if $file_types.data_sources.data_source == "in" - #for $g in $file_types.data_sources.genefiles <!--get each var file--> - --Input_File ${g.input} - #end for - #else - `cat $file_types.data_sources.genelist` - #end if - #end if - </command> - - <outputs> - <data format="tabular" name="output" /> - </outputs> - - <inputs> - <conditional name="file_types"> - <!--form field to select file type--> - <param name="file_type" type="select" label="Select the input file type"> - <option value="var2" selected="True">var files, format 2.x</option> - <option value="var1">var files, format 1.x</option> - <option value="gene">gene files</option> - </param> - - <when value="var2"> - <!--form field to select all variant types to annotate--> - <param name="variants" label="Select variant types to include" type="select" multiple="true" > - <!--<validator type="no_options" message="Please select at least one variant type."/>--> - <option value="All" selected="true">All</option> - <option value="snp">snp</option> - <option value="ins">ins</option> - <option value="del">del</option> - <option value="sub">sub</option> - <option value="ref">ref</option> - </param> - - <!--form field to select varScoresVAF--> - <param name="scoresVAF" type="select" label="Include varScoreVAF?"> - <option value="--Scores_VAF yes" selected="true">yes</option> - <option value="--Scores_VAF no">no</option> - </param> - <!--form field to select varScoresEAF--> - <param name="scoresEAF" type="select" label="Include varScoreEAF?"> - <option value="--Scores_EAF yes" selected="true">yes</option> - <option value="--Scores_EAF no">no</option> - </param> - <!--form field to select varQuality--> - <param name="varQuality" type="select" label="Include varQuality?"> - <option value="--Score_Qualities yes" selected="true">yes</option> - <option value="--Score_Qualities no">no</option> - </param> - - <!--conditional to select variant file input--> - <conditional name="data_sources"> - <param name="data_source" type="select" label="Where are the input var files?"> - <option value="in" selected="true">imported into Galaxy</option> - <option value="out">located outside Galaxy (available only for local Galaxy instances)</option> - </param> - <when value="in"> - <!--form field to select variant files--> - <repeat name="varfiles" title="Variant files"> - <param name="input" type="data" format="cg_var" label="Dataset"> - <validator type="unspecified_build" /> - <validator type="dataset_metadata_in_file" filename="cg_crr_files.loc" - metadata_name="dbkey" metadata_column="1" - message="cgatools is not currently available for this build."/> - </param> - </repeat> - </when> - <when value="out"> - <!--form field to select crr file--> - <param name="varlist" type="text" label="List of variant files (/path/file)" size="200" help="file with list of var files (/path/varfile), var files can be compressed (gz, bz2)."/> - </when> - </conditional> - </when> - - <when value="var1"> - <!--form field to select all variant types to annotate--> - <param name="variants" label="Select variant types to include" type="select" multiple="true" > - <!--<validator type="no_options" message="Please select at least one variant type."/>--> - <option value="All" selected="true">All</option> - <option value="snp">snp</option> - <option value="ins">ins</option> - <option value="del">del</option> - <option value="sub">sub</option> - <option value="ref">ref</option> - </param> - - <!--form field to select scores--> - <param name="scores" type="select" label="Include totalScore?"> - <option value="--Scores yes" selected="true">yes</option> - <option value="--Scores no">no</option> - </param> - - <!--conditional to select variant file input--> - <conditional name="data_sources"> - <param name="data_source" type="select" label="Where are the input var files?"> - <option value="in" selected="true">imported into Galaxy</option> - <option value="out">located outside Galaxy (available only for local Galaxy instances)</option> - </param> - <when value="in"> - <!--form field to select variant files--> - <repeat name="varfiles" title="Variant files"> - <param name="input" type="data" format="cg_var" label="Dataset"> - <validator type="unspecified_build" /> - <validator type="dataset_metadata_in_file" filename="cg_crr_files.loc" - metadata_name="dbkey" metadata_column="1" - message="cgatools is not currently available for this build."/> - </param> - </repeat> - </when> - <when value="out"> - <!--form field to select crr file--> - <param name="varlist" type="text" label="List of variant files (/path/file)" size="200" help="file with list of var files (/path/varfile), var files can be compressed (gz, bz2)."/> - </when> - </conditional> - </when> - - <when value="gene"> - <!--form field to select all variant types to annotate--> - <param name="variants" label="Select variant types to include" type="select" multiple="true" > - <!--<validator type="no_options" message="Please select at least one variant type."/>--> - <option value="All" selected="true">All</option> - <option value="snp">snp</option> - <option value="ins">ins</option> - <option value="del">del</option> - <option value="sub">sub</option> - <option value="ref">ref</option> - </param> - - <!--form field to select component in gene file--> - <param name="component" type="select" label="Select component types to include" multiple="true" > - <option value="All" selected="true">All</option> - <option value="CDS">CDS</option> - <option value="INTRON">INTRON</option> - <option value="DONOR">DONOR</option> - <option value="ACCEPTOR">ACCEPTOR</option> - <option value="TSS-UPSTREAM">TSS-UPSTREAM</option> - <option value="SPAN5">SPAN5</option> - <option value="SPAN3">SPAN3</option> - <option value="SPAN">SPAN</option> - <option value="UTR5">UTR5</option> - <option value="UTR3">UTR3</option> - <option value="UTR">UTR</option> - </param> - - <!--form field to select impact in gene file--> - <param name="impact" type="select" label="Select impact types to include" multiple="true" > - <option value="All" selected="true">All</option> - <option value="NO-CHANGE">NO-CHANGE</option> - <option value="SYNONYMOUS">SYNONYMOUS</option> - <option value="MISSENES">MISSENES</option> - <option value="NONSENSE">NONSENSE</option> - <option value="NONSSTOP">NONSSTOP</option> - <option value="DELETE">DELETE</option> - <option value="INSERT">INSERT</option> - <option value="DELETE+">DELETE+</option> - <option value="INSERT+">INSERT+</option> - <option value="FRAMESHIFT">FRAMESHIFT</option> - <option value="MISSTART">MISSTART</option> - <option value="DISRUPT">DISRUPT</option> - <option value="UNKNOWN-VNC">UNKNOWN-VNC</option> - <option value="UNKNOWN-INC">UNKNOWN-INC</option> - <option value="UNKNOWN-TR">UNKNOWN-TR</option> - </param> - - <!--conditional to select gene file input--> - <conditional name="data_sources"> - <param name="data_source" type="select" label="Where are the input gene files?"> - <option value="in" selected="true">imported into Galaxy</option> - <option value="out">located outside Galaxy (available only for local Galaxy instances)</option> - </param> - <when value="in"> - <!--form field to select variant files--> - <repeat name="genefiles" title="Gene files"> - <param name="input" type="data" format="cg_gene" label="Dataset"> - <validator type="unspecified_build" /> - <validator type="dataset_metadata_in_file" filename="cg_crr_files.loc" - metadata_name="dbkey" metadata_column="1" - message="cgatools is not currently available for this build."/> - </param> - </repeat> - </when> - <when value="out"> - <!--form field to select crr file--> - <param name="genelist" type="text" label="List of gene files (/path/file)" size="200" help="file with list of gene files (/path/genefile), gene files can be compressed (gz, bz2)."/> - </when> - </conditional> - </when> - - </conditional> - </inputs> - - - <help> - -**What it does** - -This tool identifies all called variants present in the var or gene files and generates annotated variant list. - ------ - -**Instructions**:: - - List Unique Variants for Pipeline 1.x and 2.x - [Uses header if available, checks for position of xref field if not] - Take one or more var or gene files - Extract a non-redundant set of variants - - For var files: - The fields used to define non-redundant variants are are: - chromosome begin end varType reference alleleSeq xRef - User can nominate class(es) of varType to filter on - Outputs varScoreEAF, varScoreVAF and varQuality as a default but user can turn - them off (separately) - Scores and qualities stored in separate fields, all values for a variant across - a set of genomes. - Values for different genomes separated by ':', for two hom entries for the same - genome by '|' - Output is accepted by testvariants to generate a variant table, all fields kept - in testvariants output - - For gene files: - The fields used to define non-redundant gene variants are: - chromosome begin end varType reference call xRef geneId mrnaAcc proteinAcc symbol - orientation component componentIndex codingRegionKnown impact nucleotidePos - proteinPos annotationRefSequence sampleSequence genomeRefSequence - User can nominate class(es) of varType, component or impact to filter on - All gene entries kept ie multiple entries if multiple transcripts - - NB Now treating xref as a separate component in var recs, as it is not consistent - between X and Y vars - Not fixed for gene recs yet - - perl List_Unique_Variants_2_0_11.pl - --File_Type [V|G] - --Input_File input_file_1 [set of var or gene files] - --Input_File input_file_2 - ... - --Input_File input_file_n - --Output_File filename - --Var_Type [For both file types, 'All' or any value from the varType field, - multiple values allowed, separated by comma] - --Component [Gene file specific,'All' or any value from component field of gene - file, multiple allowed; 'All" is default] - --Impact All [Gene file specific,'All' or any value from impact field of gene - file, multiple allowed; 'All" is default] - --Scores [1.x var file specific, yes|no, yes is default] - --Scores_VAF [2.0 var file specific, yes|no, yes is default] - --Scores_EAF [2.0 var file specific, yes|no, yes is default] - --Score_Qualities [yes|no, yes is default] - eg - perl List_Unique_Variants_2_0_11.pl \ - --File_Type V \ - --Input_File /Yoruban_Trio_1100_37/GS19238-1100-37/GS00028-DNA_A01/ASM/gene-GS19238-1100-37-ASM.tsv.bz2 \ - --Input_File /Yoruban_Trio_1100_37/GS19239-1100-37/GS00028-DNA_B01/ASM/gene-GS19239-1100-37-ASM.tsv.bz2 \ - --Input_File /Yoruban_Trio_1100_37/GS19240-1100-37/GS00028-DNA_C01/ASM/gene-GS19240-1100-37-ASM.tsv.bz2 \ - --Output_File /Users/rtearle/Documents/TBF/YRI_Trio_Protein_Coding.tsv \ - --Var_Type All - --Component All - --Impact All - --Scores_VAF yes \ - --Scores_EAF yes \ - --Score_Qualities yes - - var fields - 1.x locus ploidy haplotype chromosome begin end varType reference alleleSeq - totalScore hapLink xRef - 2.0 locus ploidy allele chromosome begin end varType reference alleleSeq - varScoreVAF varScoreEAF varQuality hapLink xRef - - gene fields - 1.x index locus allele chromosome begin end varType reference call xRef geneId - mrnaAcc proteinAcc symbol orientation component componentIndex - codingRegionKnown impact nucleotidePos proteinPos annotationRefSequence - sampleSequence genomeRefSequence - 2.0 index locus allele chromosome begin end varType reference call xRef geneId - mrnaAcc proteinAcc symbol orientation component componentIndex hasCodingRegion - impact nucleotidePos proteinPos annotationRefSequence sampleSequence - genomeRefSequence pfam - - Parsing and storing input parameters - Only input_file fields can be repeated - input paramaters are case insensitive - - - </help> -</tool>
--- a/cgatools_suite/tools/cgi_scripts/List_Unique_Variants_2_1_0.pl Thu Jun 07 14:32:32 2012 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,583 +0,0 @@ -#!/usr/bin/perl -use strict; -use File::Basename; -$| = 1; - -# List Unique Variants for Pipeline 1.x and 2.x -# [Uses header if available, checks for position of xref field if not] -# Take one or more var or gene files -# Extract a non-redundant set of variants - -# For var files: -# The fields used to define non-redundant variants are are: -# chromosome begin end varType reference alleleSeq xRef -# User can nominate class(es) of varType to filter on -# Outputs varScoreEAF, varScoreVAF and varQuality as a default but user can turn them off (separately) -# Scores and qualities stored in separate fields, all values for a variant across a set of genomes. -# Values for different genomes separated by ':', for two hom entries for the same genome by '|' -# Output is accepted by testvariants to generate a variant table, all fields kept in testvariants output - -# For gene files: -# The fields used to define non-redundant gene variants are: -# chromosome begin end varType reference call xRef geneId mrnaAcc proteinAcc symbol orientation component componentIndex codingRegionKnown impact nucleotidePos proteinPos annotationRefSequence sampleSequence genomeRefSequence -# User can nominate class(es) of varType, component or impact to filter on -# All gene entries kept ie multiple entries if multiple transcripts - -# NB Now treating xref as a separate component in var recs, as it is not consistent between X and Y vars -# Not fixed for gene recs yet - -# perl List_Unique_Variants_2_0_9.pl -# --File_Type [V|G] -# --Input_File input_file_1 [set of var or gene files] -# --Input_File input_file_2 -# ... -# --Input_File input_file_n -# --Output_File filename -# --Var_Type [For both file types, 'All' or any value from the varType field, multiple values allowed, separated by comma] -# --Component [Gene file specific,'All' or any value from component field of gene file, multiple allowed; 'All" is default] -# --Impact All [Gene file specific,'All' or any value from impact field of gene file, multiple allowed; 'All" is default] -# --Scores [1.x var file specific, yes|no, yes is default] -# --Scores_VAF [2.0 var file specific, yes|no, yes is default] -# --Scores_EAF [2.0 var file specific, yes|no, yes is default] -# --Score_Qualities [yes|no, yes is default] -# eg -# perl /Users/rtearle/Documents/Programming/Perl/Scripts/Dev/List_Unique_Variants_2_0_4 \ -# --File_Type V \ -# --Input_File /Yoruban_Trio_1100_37/GS19238-1100-37/GS00028-DNA_A01/ASM/gene-GS19238-1100-37-ASM.tsv.bz2 \ -# --Input_File /Yoruban_Trio_1100_37/GS19239-1100-37/GS00028-DNA_B01/ASM/gene-GS19239-1100-37-ASM.tsv.bz2 \ -# --Input_File /Yoruban_Trio_1100_37/GS19240-1100-37/GS00028-DNA_C01/ASM/gene-GS19240-1100-37-ASM.tsv.bz2 \ -# --Output_File /Users/rtearle/Documents/TBF/YRI_Trio_Protein_Coding.tsv \ -# --Var_Type All -# --Component All -# --Impact All -# --Scores_VAF yes \ -# --Scores_EAF yes \ -# --Score_Qualities yes - -# var fields -# 1.x -# locus ploidy haplotype chromosome begin end varType reference alleleSeq totalScore hapLink xRef -# 2.0 -# locus ploidy allele chromosome begin end varType reference alleleSeq varScoreVAF varScoreEAF varQuality hapLink xRef - -# gene fields -# 1.x index locus allele chromosome begin end varType reference call xRef geneId mrnaAcc proteinAcc symbol orientation -# component componentIndex codingRegionKnown impact nucleotidePos proteinPos annotationRefSequence sampleSequence genomeRefSequence -# 2.0 index locus allele chromosome begin end varType reference call xRef geneId mrnaAcc proteinAcc symbol orientation -# component componentIndex hasCodingRegion impact nucleotidePos proteinPos annotationRefSequence sampleSequence genomeRefSequence pfam - -# Parsing and storing input parameters -# Only input_file fields can be repeated -# input paramaters are case insensitive - -print "$0 @ARGV\nProcessing input parameters\n"; -my $NrParams; -my %ExpectedParams = GetExpectedParams (); # list of expected parms -my %EnteredParams = GetEnteredParams (); # list of entered params - -# Input Files -my $FileType = $EnteredParams{file_type}; -if ($FileType ne "V" and $FileType ne "G") {die "File Type must be 'V' or 'G', not '$FileType'\n";} -my $FilesIn = $EnteredParams{input_file}; # ptr to list of input files -print "File Type: $FileType\nInput Files:\n"; -my $NrInputFiles = int(@$FilesIn); -foreach my $File (@$FilesIn) {print "$File\n";} # requires existing files -foreach my $File (@$FilesIn) {unless (-f $File) {die "Input file $File not found\n";}} # requires existing files -for my $n (0.. $NrInputFiles-2) # look for duplicates in list - {for my $m ($n+1.. $NrInputFiles-1) {if ($$FilesIn[$n] eq $$FilesIn[$m] ) {die "File $$FilesIn[$n] is repeated in input file list\n";}}} - -# Output Dir -#my $DirectoryOut = $EnteredParams{output_dir}; # output dir -#$DirectoryOut =~ s/\/$//; # remove trailing slash if present -#unless (-d $DirectoryOut) {mkdir $DirectoryOut or die "Cannot find/create output directory $DirectoryOut\n";} # uses existing dir or makes a new dir if it can - -# Ouput File -my $FileOut = $EnteredParams{output_file}; # output file -print "FileOut: $FileOut\n"; -$FileOut =~ /(^.+\/)?(.+$)/; # split in to path and filename -my $DirectoryOut = $1; # assign path # NEED MORE TESTING EG EMPTY PATH # -$FileOut = $2; # assign file prefix -print "File: $FileOut\nDir: $DirectoryOut\n"; #exit; -# if (-f $DirectoryOut.$FileOut) # ouput file exists, create a new one based on the name -# { -# print "Output file $FileOut exists, modifying to unique file name "; -# $FileOut =~ /^(.+?)\./; # find name without extensions -# my $Stub = $1; # set stub to name without extensions -# $FileOut =~ /(\..+)?$/; # get extension(s) -# my $Ext = $1; # set ext to extensions -# my $n = 1; # n will increment to find a unique name -# my $Suff = ""; # suff tracks n -# while (-f $DirectoryOut.$Stub.$Suff.$Ext) {$Suff = "-$n"; $n++;} # loop till we have a new unique filename -# $FileOut = $Stub.$Suff.$Ext; # file out now has same name, same extensions, but also -n at the end of the name, making it unique -# print "$FileOut\n"; -# } - -#print "Files\n",join("\n",@$FilesIn),"\n\n"; -#print "Ouput Dir\n$DirectoryOut\n"; -#print "Ouput File\n$FileOut\n"; - -# Extract Header & Column Header -my $IN = OpenFile ($$FilesIn[0]); # open the first file with correct file format -my $Header = GetHeaderAsString ($IN); # get header -unless ($Header) {close $IN; $IN = OpenFile ($$FilesIn[0]);} # if there is no header, close and reopen file, ie start file again -my $ColHeader = <$IN>; # get col header, first remaining line -chomp $ColHeader; - -# Get version if filetype is var - needed because there are new fields in 2.0 and posn of xRef changed -my ($Version, $XrefField); -if ($FileType eq "V") -{ - ($Version, $XrefField) = GetVersion ($Header, $ColHeader); - #print "$Version $XrefField\n"; exit; - unless ($Version) {die "Cannot determine format version of first file in list\nNeed either a native Complete header or a native Complete Column Header with an xRef field\n";} -} - -# Shared input params -my $OutputVarTypes = lc $EnteredParams{var_type} || $ExpectedParams{var_type}; # var types listed in file in lc -$OutputVarTypes =~ s/\,/\|/g; # create regex string -$OutputVarTypes =~ s/\,| //; # remove extraneous commas, spaces - -# Input Params for var file -my ($KeepScoresVAF, $KeepScoresEAF, $KeepQuals, $KeepScores, $VarExtras); -if ($FileType eq "V") -{ - if ($Version == 2) - { - $KeepQuals = lc $EnteredParams{score_quality} || $ExpectedParams{score_quality}; # keep scoresQuality for 2.0 - $KeepQuals = 1 if $KeepQuals eq "yes"; # converting to boolean - $KeepScoresVAF = lc $EnteredParams{scores_vaf} || $ExpectedParams{scores_vaf}; # keep scoresVAF for 2.0 - $KeepScoresVAF = 1 if $KeepScoresVAF eq "yes"; # converting to boolean - $KeepScoresEAF = lc $EnteredParams{scores_eaf} || $ExpectedParams{scores_eaf}; # keep scoresEAF for 2.0 $KeepQuals = lc $EnteredParams{score_qualities} || $ExpectedParams{score_qualities}; # keep scoresQuality for 2.0 - $KeepScoresEAF = 1 if $KeepScoresEAF eq "yes"; # converting to boolean - } - else # Version 1 - { - $KeepScores = lc $EnteredParams{scores} || $ExpectedParams{scores}; # keep scores for 1.x - $KeepScores = 1 if $KeepScores eq "yes"; # converting to boolean - } - $VarExtras = 1 if $KeepScoresVAF or $KeepScoresEAF or $KeepQuals or $KeepScores; # flag to process var file for scores info -} - -# Input Params for gene file -my $OutputComponents = uc $EnteredParams{component} || $ExpectedParams{component}; # components listed in file in uc -my $OutputImpacts = uc $EnteredParams{impact} || $ExpectedParams{impact}; # impacts listed in file in uc - -# Loading chr nrs, setting up var hash -my @ChrNames = ('chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8', 'chr9', 'chr10', - 'chr11', 'chr12', 'chr13', 'chr14', 'chr15', 'chr16', 'chr17', 'chr18', 'chr19', - 'chr20', 'chr21', 'chr22', 'chrX', 'chrY', 'chrM'); # using this array forces the output order of chrs into the correct order -my %Vars; # hash to store var records in an array for each chr -foreach my $Chr (@ChrNames) {$Vars{$Chr} = {};} # print "$Chr\t";} # set up hash of hashes, one for each chr -#print "\n"; #exit; - -# Create ouput col header -if ($FileType eq "V") -{ - # 1.x locus ploidy haplotype chromosome begin end varType reference alleleSeq totalScore hapLink xRef - # 2.0 locus ploidy allele chromosome begin end varType reference alleleSeq varScoreVAF varScoreEAF varQuality hapLink xRef - my @Fields = split "\t", $ColHeader; - $ColHeader = join("\t",@Fields[3..8])."\t".$Fields[$XrefField]; - if ($Version == 2) # 2.0 - { - if ($KeepScoresVAF) {$ColHeader .= "\tvarScoreVAF";} - if ($KeepScoresEAF) {$ColHeader .= "\tvarScoreEAF";} - if ($KeepQuals) {$ColHeader .= "\tvarQuality";} - } - else # 1.x - { - if ($KeepScores) {$ColHeader .= "\ttotalScore";} - } -} -elsif ($FileType eq "G") -{ - # 1.x index locus allele chromosome begin end varType reference call xRef geneId mrnaAcc proteinAcc symbol orientation - # component componentIndex codingRegionKnown impact nucleotidePos proteinPos annotationRefSequence sampleSequence genomeRefSequence - # 2.0 index locus allele chromosome begin end varType reference call xRef geneId mrnaAcc proteinAcc symbol orientation - # component componentIndex hasCodingRegion impact nucleotidePos proteinPos annotationRefSequence sampleSequence genomeRefSequence pfam - my @Fields = split "\t", $ColHeader; - $ColHeader = join("\t",@Fields[3..30]); # 30 much more than needed - $OutputComponents =~ s/\,/\|/g; # create regex string - $OutputComponents =~ s/\,| //; # remove extraneous commas, spaces - $OutputComponents =~ s/\,| //; # remove extraneous commas, spaces - $OutputImpacts =~ s/\,/\|/g; # create regex string - $OutputImpacts =~ s/\,| //; # remove extraneous commas, spaces - #print "OutputVarTypes: $OutputVarTypes\nOutputComponents: $OutputComponents\nOutputImpacts: $OutputImpacts\n"; -} -else -{ - die "FileType $FileType not understood\n"; # redundant -} - -# Set up Processing Subs -if ($FileType eq "G") # use gene subs for 'G' -{ - *ExtractSub2Use = \&ExtractGeneFields; - *AddRecSub2Use = \&AddGeneRec; -} -else # use var subs for 'V' -{ - *ExtractSub2Use = \&ExtractVarFields; - *AddRecSub2Use = \&AddVarRec; -} -#print "Gene: ",\&ExtractGeneFields," ",\&AddGeneRec," Var: ",\&ExtractVarFields," ",\&AddVarRec," Using: ",\&ExtractSub2Use," ",\&AddRecSub2Use,"\n"; exit; - -# Process Files -my $RecCount = 0; # total nr recs that are coding/splicing/spanning -my $FileCount = 0; # keeps track of nr files to use to format eg scores field -my $XRef; -foreach my $File (@$FilesIn) -{ - print "Processing file $File\n"; - $FileCount++; # counting nr files - - # Open file, process header, col header - my $IN = OpenFile ($File); # open the file with correct file format - my $Header = GetHeaderAsString ($IN); # get header - unless ($Header) {close $IN; $IN = OpenFile ($$FilesIn[0]);} # if no header, close and reopen file, ie start file again - my $ColHeader = <$IN>; # get col header, first remaining line - unless ($ColHeader =~ /^>/) {print "Suspect column header for file $File:\n$ColHeader\n;"} - - my $Count = 0; # cnr recs for this file that are coding/splicing/spanning - while (<$IN>) # loop through remainder of file ie data - { - my ($Rec, $Chr, $ScoreVAF, $ScoreEAF, $ScoreQual, $XRef) = ExtractSub2Use ($_, $XrefField); # sub extracts wanted fields in rec as string, chr, other fields optionally - next unless $Rec; - AddRecSub2Use ($Rec, $Vars{$Chr}, $ScoreVAF, $ScoreEAF, $ScoreQual, $FileCount, $XRef) if $Rec; # only process if rec is not empty - $Count++; # increment count of coding/splicing/spanning vars - } - print "Nr matched records for this file: $Count\n"; - $RecCount += $Count; # add this file's count to total count - close $IN; -} -print "Nr matched records across all files:\t $RecCount\n"; # total count - -# Open file out, write col header -print "Sorting and Saving to file $DirectoryOut$FileOut ...\n"; -open my $OUT, ">", $DirectoryOut . $FileOut or die "could not write to $DirectoryOut/$FileOut\n"; -if ($FileType eq "V") -{ - print $OUT "variantId\t"; -} # first col header for var file -else -{ - $ColHeader =~ s/\tcall\t/\talleleSeq\t/; - print $OUT "index\t"; -} # first col header for gene file -print $OUT "$ColHeader\n"; # remainder of col header - -$RecCount = 0; # reuse total count for nr of non-reduntant vars -$FileCount--; # reduce by one, used below to add missing delimiters -foreach my $Chr (@ChrNames) # sort records in each chr array and print with count -{ - foreach my $Rec (sort {SortStringsasArrays ($a, $b)} keys %{$Vars{$Chr}}) # using sub to sort on being, end fields - { - next unless $Rec; - $RecCount++; # increment count of coding/splicing/spanning vars - print $OUT "$RecCount\t$Rec"; # printing rec and count - print $OUT "\t",$Vars{$Chr}->{$Rec}->[4] if $FileType eq "V"; # print xref if var files - - if ($VarExtras) - { - my $FieldDelimiterCount = () = $Vars{$Chr}->{$Rec}->[3] =~ /:/g; - #print "$SpacerCount $Vars{$Chr}->{$Rec}->[1]\n"; - #exit if $TmpCount++ > 10; - my $Addition = ":" x ($FileCount - $FieldDelimiterCount); - if ($Version == 2) - { - print $OUT "\t",$Vars{$Chr}->{$Rec}->[1],$Addition if $KeepScoresVAF; - print $OUT "\t",$Vars{$Chr}->{$Rec}->[2],$Addition if $KeepScoresEAF; - print $OUT "\t",$Vars{$Chr}->{$Rec}->[3],$Addition if $KeepQuals; - } - else - { - print $OUT "\t",$Vars{$Chr}->{$Rec}->[1],$Addition if $KeepScores; - } - #print $OUT "$Rec\t$Vars{$Chr}->{$Rec}\n"; # printing rec and count - } - print $OUT "\n"; - } -} -print "Nr saved records:\t $RecCount\n"; # count of non-redundant vars, c/f all vars abovve - - -########################################################################### -# SUBS # -########################################################################### - -sub GetExpectedParams -{ - my %Hash = # hash to store expected params - ( - "file_type" => -1, - "input_file" => [], - "output_file" => -1, - "var_type" => "all", - "component" => "ALL", - "impact" => "ALL", - "scores" => "yes", - "scores_eaf" => "yes", - "scores_vaf" => "yes", - "score_quality" => "yes", - ); - $NrParams = int keys %Hash; - return %Hash; -} - -sub GetEnteredParams -{ - # Processing @ARGV - my %Hash; - my @ARGVs = split /--/, join (" ",@ARGV); # split args on --, into array - for my $n (1..$#ARGVs) # parse each [nb arg 0 is empty so ignored] - { - $ARGVs[$n] =~ s/\s+$//; # remove any trailing spaces - my ($Key, $Val) = split / /, $ARGVs[$n], 2; # put first element into key, any other elements into val - $Key = lc $Key; # make lower case, ie case insensitive - if ($Key eq "input_file") # multiple entries expected, setting up array - { - push @{$Hash{$Key}}, $Val; # add input to input hash - - } - else - { - $Hash{$Key} = $Val; # make a hash entry out of key and val - } - } - return %Hash; # hash now has each --entry param, with associated values -} - -sub OpenFile -{ - my $File = shift; - my $FH; - open $FH, $File; - return $FH; -} - -sub OpenFileold -{ - my $File = shift; - my $FH; - - if ($File =~ /.bz2$/) - { - open ($FH, "bzcat $File |") or die ("$!: can't open file $File"); - } - elsif ($File =~ /.gz$/) - { - open ($FH, "gunzip -c $File |") or die ("$!: can't open file $File"); - } - elsif ($File =~ /.tsv$/ or $File =~ /.txt$/) - { - open ($FH, "cat $File |") or die ("$!: can't open file $File"); - } - else - { - print ("Do not recognise file type for file $File.\nOpening as text file\n"); - open ($FH, "cat $File |") or die ("$!: can't open file $File"); - } - return $FH; -} - -sub GetHeaderAsString -{ - my $FH = shift; - my $Header = ""; - my $Count = 0; - while (<$FH>) # loop until a line is empty - { - chomp; - if ($_ eq "") # exit when empty line - { - return $Header ; # return ref to array - } - else - { - $Header .= $_; - } - return "" if $Count++ > 50; # too many lines for a header, must be no header, return empty array - } -} - -sub GetVersion -{ - my $Header = shift; - my $ColHeader = shift; - - my $Version = 0; # need to know if it is 1.x or 2.x - my $XrefField = -1; - - if ($FileType eq "V") - { - if ($Header) - { - $Header =~ /#FORMAT_VERSION\t(\d)/; - if ($1 == 1) {$Version = 1; $XrefField = 11;} - elsif ($1 == 2) {$Version = 2; $XrefField = 13;} - else {print "Warning: Format Version not found in Header\n";} # not in header - } - unless ($Version) - { - my @ColHeader = split /\t/, $ColHeader; - for my $n (0..int(@ColHeader)-1) - { - if ($ColHeader eq "xRef") - { - $XrefField = $n; - if ($n == 11) {$Version = 1;} - elsif ($n == 13) {$Version = 2;} - last; - } - } - } - } - return ($Version, $XrefField); -} - -sub ExtractGeneFields # expects a gene file rec, strips out file specific fields, gets chr -{ - my $Rec = shift; - # gene fields - # >index locus allele chromosome begin end varType reference call xRef geneId mrnaAcc - # proteinAcc symbol orientation component componentIndex hasCodingRegion impact - # nucleotidePos proteinPos annotationRefSequence sampleSequence genomeRefSequence pfam - - chomp $Rec; # remove return - my @Fields = split "\t", $Rec; - #print "$Fields[6] $Fields[15] $Fields[18]\t$OutputVarTypes $OutputComponents $OutputImpacts\n"; - unless ($OutputVarTypes eq "all" or $Fields[6] =~ /$OutputVarTypes/) {return ("","");} # nominated vals not found, leave - unless ($OutputComponents eq "ALL" or $Fields[15] =~ /$OutputComponents/) {return ("","");} # nominated vals not found, leave - unless ($OutputImpacts eq "ALL" or $Fields[18] =~ /$OutputImpacts/) {return ("","");} # nominated vals not found, leave - my $Chr = $Fields[3]; # assign chr - $Rec = join("\t",@Fields[3..24]); - #$Rec =~ s/\t$//; # remove trailing tab if there is one - - return ($Rec, $Chr); -} - -sub ExtractVarFields # expects a gene file rec, strips out file specific fields, gets chr -{ - my $Rec = shift; - my $XrefField = shift; - # var fields - # 1.x locus ploidy haplotype chromosome begin end varType reference alleleSeq totalScore hapLink xRef - # 2.0 locus ploidy allele chromosome begin end varType reference alleleSeq varScoreVAF varScoreEAF varQuality hapLink xRef - - chomp $Rec; # remove return - my @Fields = split "\t", $Rec; - #print "$Fields[6] $OutputVarTypes \n"; exit; - unless ($OutputVarTypes eq "all" or $Fields[6] =~ /$OutputVarTypes/) {return ("","");} # nominated vals not found, leave - my $Chr = $Fields[3]; # assign chr - #$Rec = join("\t",@Fields[3..8]); - $Rec = join("\t",@Fields[3..8]); - #$Rec =~ s/\t$//; # remove trailing tab if there is one - - if ($VarExtras) - { - return ($Rec, $Chr, $Fields[9], $Fields[10], $Fields[11], $Fields[$XrefField]) if $Version == 2; - return ($Rec, $Chr, $Fields[9], $Fields[$XrefField]) # $Version == 1; - } - else - { - return ($Rec, $Chr); - } -} - -sub AddVarRec -{ - my $Rec = shift; - my $RecHash = shift; - my $ScoreVAF = shift; - my $ScoreEAF = shift; - my $ScoreQual = shift; - my $FileCount = shift; - my $XRef = shift; - - if ($VarExtras) # need to extract scores information - { - # locus ploidy allele chromosome begin end varType reference alleleSeq varScoreVAF varScoreEAF varQuality hapLink xRef - # Set delimiter - my $Delimiter; - if ($RecHash->{$Rec}) # hash entry for this var already exists - { - if ($RecHash->{$Rec}->[0] == $FileCount) - { - $Delimiter = "|"; # same chr, var is hom, use | - } - else # diff chr, use : - { - my $FieldDelimiterCount = () = $RecHash->{$Rec}->[3] =~ /:/g; # count nr field delims - $Delimiter = ":" x ($FileCount - $FieldDelimiterCount - 1); # delimiters for any processed files, that didnt have this var - } - $RecHash->{$Rec}->[4] = $XRef if length $XRef > length $RecHash->{$Rec}->[4]; # replace xref if new xref is longer - } - else # new var - { - $RecHash->{$Rec} = []; # create array to hold it - $Delimiter = ":" x ($FileCount - 1); # delimiters for prev processed files, that didnt have this var - $RecHash->{$Rec}->[4] = $XRef; # add xref - } - - # Process var - $RecHash->{$Rec}->[0] = $FileCount; - if ($Version == 2) - { - $RecHash->{$Rec}->[1] .= $Delimiter.$ScoreVAF; # add delimiter, varScoreVAF - $RecHash->{$Rec}->[2] .= $Delimiter.$ScoreEAF; # add delimiter, varScoreVAF - $RecHash->{$Rec}->[3] .= $Delimiter.($ScoreQual eq "VQHIGH" ? "H" : "L"); # add delimiter, qual - } - else - { - $RecHash->{$Rec}->[1] .= $Delimiter.$ScoreVAF; # add delimiter, totalScore - } - - } - else # just the rec, no var extras being extrcted - { - $RecHash->{$Rec}++; # hash with rec as key, increment count for this key - $RecHash->{$Rec}->[4] = $XRef if length $XRef > length $RecHash->{$Rec}->[4]; # replace xref if new xref is longer, wasting space here - } -} - -sub AddGeneRec -{ - my $Rec = shift; - my $RecHash = shift; - - $RecHash->{$Rec}++; # hash with rec as key, increment count for this key -} - -sub SortStringsasArrays # sorts based on begin and end of two recs -{ - my $String1 = shift; # first string - my $String2 = shift; # second string - - my @Array1 = split "\t", $String1; # put fields into array - my @Array2 = split "\t", $String2; - - # array[1] is begin, array[2] is end, returning order based on these fields - if ($Array1[1] < $Array2[1]) # begin of 1 < begin of 2 - { - return -1; - } - elsif ($Array1[1] == $Array2[1]) # begin of 1 == begin of 2 - { - if ($Array1[2] < $Array2[2]) # end of 1 < end of 2 - { - return -1; - } - elsif ($Array1[2] == $Array2[2]) # end of 1 == end of 2 - { - return 0; - } - else # end of 1 > end of 2 - { - return 1; - } - } - else # begin of 1 > begin of 2 - { - return 1; - } - -}