+
+
+
diff -r e4eff539a999 -r 96829b1b73ea cgatools_suite/tools/cgatools/.DS_Store
Binary file cgatools_suite/tools/cgatools/.DS_Store has changed
diff -r e4eff539a999 -r 96829b1b73ea cgatools_suite/tools/cgatools/._calldiff.xml
Binary file cgatools_suite/tools/cgatools/._calldiff.xml has changed
diff -r e4eff539a999 -r 96829b1b73ea cgatools_suite/tools/cgatools/._join.xml
Binary file cgatools_suite/tools/cgatools/._join.xml has changed
diff -r e4eff539a999 -r 96829b1b73ea cgatools_suite/tools/cgatools/._junctiondiff.xml
Binary file cgatools_suite/tools/cgatools/._junctiondiff.xml has changed
diff -r e4eff539a999 -r 96829b1b73ea cgatools_suite/tools/cgatools/._listtestvariants.xml
Binary file cgatools_suite/tools/cgatools/._listtestvariants.xml has changed
diff -r e4eff539a999 -r 96829b1b73ea cgatools_suite/tools/cgatools/._listvariants.xml
Binary file cgatools_suite/tools/cgatools/._listvariants.xml has changed
diff -r e4eff539a999 -r 96829b1b73ea cgatools_suite/tools/cgatools/._snpdiff.xml
Binary file cgatools_suite/tools/cgatools/._snpdiff.xml has changed
diff -r e4eff539a999 -r 96829b1b73ea cgatools_suite/tools/cgatools/._testing.pl
Binary file cgatools_suite/tools/cgatools/._testing.pl has changed
diff -r e4eff539a999 -r 96829b1b73ea cgatools_suite/tools/cgatools/._testvariants.xml
Binary file cgatools_suite/tools/cgatools/._testvariants.xml has changed
diff -r e4eff539a999 -r 96829b1b73ea cgatools_suite/tools/cgatools/._varfilter.xml
Binary file cgatools_suite/tools/cgatools/._varfilter.xml has changed
diff -r e4eff539a999 -r 96829b1b73ea cgatools_suite/tools/cgatools/._varfilter_wrapper.pl
Binary file cgatools_suite/tools/cgatools/._varfilter_wrapper.pl has changed
diff -r e4eff539a999 -r 96829b1b73ea cgatools_suite/tools/cgatools/calldiff.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cgatools_suite/tools/cgatools/calldiff.xml Wed Jun 06 16:58:26 2012 -0400
@@ -0,0 +1,343 @@
+
+
+ compares two Complete Genomics variant files.
+
+
+ cgatools
+
+
+
+ cgatools calldiff --beta
+ --reference ${crr.fields.path}
+ --variantsA $data_sources.inputA
+ --variantsB $data_sources.inputB
+ $validation
+ $diploid
+ --locus-stats-column-count $column
+ --max-hypothesis-count $hypothesis
+ --output-prefix cg_
+ --reports `echo ${report1} ${report2} ${report3} ${report4} ${report5} ${somatic.report6} | sed 's/ */,/g'`
+ #if $somatic.report6 == "SomaticOutput"
+ --genome-rootA $somatic.genomeA
+ --genome-rootB $somatic.genomeB
+ --calibration-root $somatic.calibration
+ #end if
+
+
+
+
+ (report1 == 'SuperlocusOutput')
+
+
+ (report2 == 'SuperlocusStats')
+
+
+ (report3 == 'LocusOutput')
+
+
+ (report4 == 'LocusStats')
+
+
+ (report5 == 'VariantOutput')
+
+
+ (report5 == 'VariantOutput')
+
+
+ (somatic['report6'] == 'SomaticOutput')
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+**What it does**
+
+This tool compares two Complete Genomics variant files.
+
+cgatools: http://sourceforge.net/projects/cgatools/files/
+
+-----
+
+**cgatools Manual**::
+
+ COMMAND NAME
+ calldiff - Compares two Complete Genomics variant files.
+
+ DESCRIPTION
+ Compares two Complete Genomics variant files. Divides the genome up into
+ superloci of nearby variants, then compares the superloci. Also refines the
+ comparison to determine per-call or per-locus comparison results.
+
+ Comparison results are usually described by a semi-colon separated string,
+ one per allele. Each allele's comparison result is one of the following
+ classifications:
+
+ ref-identical The alleles of the two variant files are identical, and
+ they are consistent with the reference.
+ alt-identical The alleles of the two variant files are identical, and
+ they are inconsistent with the reference.
+ ref-consistent The alleles of the two variant files are consistent,
+ and they are consistent with the reference.
+ alt-consistent The alleles of the two variant files are consistent,
+ and they are inconsistent with the reference.
+ onlyA The alleles of the two variant files are inconsistent,
+ and only file A is inconsistent with the reference.
+ onlyB The alleles of the two variant files are inconsistent,
+ and only file B is inconsistent with the reference.
+ mismatch The alleles of the two variant files are inconsistent,
+ and they are both inconsistent with the reference.
+ phase-mismatch The two variant files would be consistent if the
+ hapLink field had been empty, but they are
+ inconsistent.
+ ploidy-mismatch The superlocus did not have uniform ploidy.
+
+ In some contexts, this classification is rolled up into a simplified
+ classification, which is one of "identical", "consistent", "onlyA",
+ "onlyB", or "mismatch".
+
+ A good place to start looking at the results is the superlocus-output file.
+ It has columns defined as follows:
+
+ SuperlocusId An identifier given to the superlocus.
+ Chromosome The name of the chromosome.
+ Begin The 0-based offset of the start of the superlocus.
+ End The 0-based offset of the base one past the end of the
+ superlocus.
+ Classification The match classification of the superlocus.
+ Reference The reference sequence.
+ AllelesA A semicolon-separated list of the alleles (one per
+ haplotype) for variant file A, for the phasing with the
+ best comparison result.
+ AllelesB A semicolon-separated list of the alleles (one per
+ haplotype) for variant file B, for the phasing with the
+ best comparison result.
+
+ The locus-output file contains, for each locus in file A and file B that is
+ not consistent with the reference, an annotated set of calls for the locus.
+ The calls are annotated with the following columns:
+
+ SuperlocusId The id of the superlocus containing the locus.
+ File The variant file (A or B).
+ LocusClassification The locus classification is determined by the
+ varType column of the call that is inconsistent
+ with the reference, concatenated with a
+ modifier that describes whether the locus is
+ heterozygous, homozygous, or contains no-calls.
+ If there is no one variant in the locus (i.e.,
+ it is heterozygous alt-alt), the locus
+ classification begins with "other".
+ LocusDiffClassification The match classification for the locus. This is
+ defined to be the best of the comparison of the
+ locus to the same region in the other file, or
+ the comparison of the superlocus.
+
+ The somatic output file contains a list of putative somatic variations of
+ genome A. The output includes only those loci that can be classified as
+ snp, del, ins or sub in file A, and are called reference in the file B.
+ Every locus is annotated with the following columns:
+
+ VarCvgA The totalReadCount from file A for this locus
+ (computed on the fly if file A is not a
+ masterVar file).
+ VarScoreA The varScoreVAF from file A, or varScoreEAF if
+ the "--diploid" option is used.
+ RefCvgB The maximum of the uniqueSequenceCoverage
+ values for the locus in genome B.
+ RefScoreB Minimum of the reference scores of the locus in
+ genome B.
+ SomaticCategory The category used for determining the
+ calibrated scores and the SomaticRank.
+ VarScoreACalib The calibrated variant score of file A, under
+ the model selected by using or not using the
+ "--diploid" option, and corrected for the count
+ of heterozygous variants observed in this
+ genome. See user guide for more information.
+ VarScoreBCalib The calibrated reference score of file B, under
+ the model selected by using or not using the
+ "--diploid" option, and corrected for the count
+ of heterozygous variants observed in this
+ genome. See user guide for more information.
+ SomaticRank The estimated rank of this somatic mutation,
+ amongst all true somatic mutations within this
+ SomaticCategory. The value is a number between
+ 0 and 1; a value of 0.012 means, for example,
+ that an estimated 1.2% of the true somatic
+ mutations in this somaticCategory have a
+ somaticScore less than the somaticScore for
+ this mutation. See user guide for more
+ information.
+ SomaticScore An integer that provides a total order on
+ quality for all somatic mutations. It is equal
+ to -10*log10( P(false)/P(true) ), under the
+ assumption that this genome has a rate of
+ somatic mutation equal to 1/Mb for
+ SomaticCategory snp, 1/10Mb for SomaticCategory
+ ins, 1/10Mb for SomaticCategory del, and 1/20Mb
+ for SomaticCategory sub. The computation is
+ based on the assumptions described in the user
+ guide, and is affected by choice of variant
+ model selected by using or not using the
+ "--diploid" option.
+ SomaticQuality Equal to VQHIGH for all somatic mutations where
+ SomaticScore >= -10. Otherwise, this column is
+ empty.
+
+ OPTIONS
+ -h [ --help ]
+ Print this help message.
+
+ --reference arg
+ The input crr file.
+
+ --variantsA arg
+ The "A" input variant file.
+
+ --variantsB arg
+ The "B" input variant file.
+
+ --output-prefix arg
+ The path prefix for all output reports.
+
+ --reports arg (=SuperlocusOutput,SuperlocusStats,LocusOutput,LocusStats)
+ Comma-separated list of reports to generate. (Beware any reports whose
+ name begins with "Debug".) A report is one of:
+ SuperlocusOutput Report for superlocus classification.
+ SuperlocusStats Report for superlocus classification stats.
+ LocusOutput Report for locus classification.
+ LocusStats Report for locus stats.
+ VariantOutput Both variant files annotated by comparison
+ results.If the somatic output report is
+ requested, file A is also annotated with the
+ same score ranks as produced in that report.
+ SomaticOutput Report for the list of simple variations that
+ are present only in file "A", annotated with
+ the score that indicates the probability of
+ the variation being truly somatic. Requires
+ beta, genome-rootA, and genome-rootB options
+ to be provided as well. Note: generating this
+ report slows calldiff by 10x-20x.
+ DebugCallOutput Report for call classification.
+ DebugSuperlocusOutput Report for debug superlocus information.
+ DebugSomaticOutput Report for distribution estimates used for
+ somatic rescoring. Only produced if
+ SomaticOutput is also turned on.
+
+ --diploid
+ Uses varScoreEAF instead of varScoreVAF in somatic score computations.
+ Also, uses diploid variant model instead of variable allele mixture
+ model.
+
+ --locus-stats-column-count arg (=15)
+ The number of columns for locus compare classification in the locus
+ stats file.
+
+ --max-hypothesis-count arg (=32)
+ The maximum number of possible phasings to consider for a superlocus.
+
+ --no-reference-cover-validation
+ Turns off validation that all bases of a chromosome are covered by
+ calls of the variant file.
+
+ --genome-rootA arg
+ The "A" genome directory, for example /data/GS00118-DNA_A01; this
+ directory is expected to contain ASM/REF and ASM/EVIDENCE
+ subdirectories.
+
+ --genome-rootB arg
+ The "B" genome directory.
+
+ --calibration-root arg
+ The directory containing calibration data. For example, there should
+ exist a file calibration-root/0.0.0/metrics.tsv.
+
+ --beta
+ This flag enables the SomaticOutput report, which is beta
+ functionality.
+
+ SUPPORTED FORMAT_VERSION
+ 0.3 or later
+
+
diff -r e4eff539a999 -r 96829b1b73ea cgatools_suite/tools/cgatools/join.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cgatools_suite/tools/cgatools/join.xml Wed Jun 06 16:58:26 2012 -0400
@@ -0,0 +1,157 @@
+
+
+ two tsv files based on equal fields or overlapping regions.
+
+
+ cgatools
+
+
+
+ cgatools join --beta
+ --input $input1
+ --input $input2
+ --output $output
+ --output-mode $outmode
+ $dump
+ --select $col
+ #for $m in $matched
+ --match ${m.match}
+ #end for
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+**What it does**
+
+This tool joins two tab-delimited files based on equal fields or overlapping regions.
+
+cgatools: http://sourceforge.net/projects/cgatools/files/
+
+-----
+
+**cgatools Manual**::
+
+ COMMAND NAME
+ join - Joins two tab-delimited files based on equal fields or overlapping regions.
+
+ DESCRIPTION
+ Joins two tab-delimited files based on equal fields or overlapping regions.
+ By default, an output record is produced for each match found between file
+ A and file B, but output format can be controlled by the --output-mode
+ parameter.
+
+ OPTIONS
+ -h [ --help ]
+ Print this help message.
+
+ --beta
+ This is a beta command. To run this command, you must pass the --beta
+ flag.
+
+ --input arg
+ File name to use as input (may be passed in as arguments at the end of
+ the command), or omitted for stdin). There must be exactly two input
+ files to join. If only one file is specified by name, file A is taken
+ to be stdin and file B is the named file. File B is read fully into
+ memory, and file A is streamed. File A's columns appear first in the
+ output.
+
+ --output arg (=STDOUT)
+ The output file name (may be omitted for stdout).
+
+ --match arg
+ A match specification, which is a column from A and a column from B
+ separated by a colon.
+
+ --overlap arg
+
+ -m [ --output-mode ] arg (=full)
+ Output mode, one of the following:
+ full Print an output record for each match found between
+ file A and file B.
+ compact Print at most one record for each record of file A,
+ joining the file B values by a semicolon and
+ suppressing repeated B values and empty B values.
+ compact-pct Same as compact, but for each distinct B value,
+ annotate with the percentage of the A record that is
+ overlapped by B records with that B value. Percentage
+ is rounded up to nearest integer.
+
+ --overlap-mode arg (=strict)
+ Overlap mode, one of the following:
+ strict Range A and B overlap if A.begin < B.end and
+ B.begin < A.end.
+ allow-abutting-points Range A and B overlap they meet the strict
+ requirements, or if A.begin <= B.end and
+ B.begin <= A.end and either A or B has zero
+ length.
+
+ --select arg (=A.*,B.*)
+ Set of fields to select for output.
+
+ -a [ --always-dump ]
+ Dump every record of A, even if there are no matches with file B.
+
+ --overlap-fraction-A arg (=0)
+ Minimum fraction of A region overlap for filtering output.
+
+ --boundary-uncertainty-A arg (=0)
+ Boundary uncertainty for overlap filtering. Specifically, records
+ failing the following predicate are filtered away: overlap >=
+ overlap-fraction-A * ( A-range-length - boundary-uncertainty-A )
+
+ --overlap-fraction-B arg (=0)
+ Minimum fraction of B region overlap for filtering output.
+
+ --boundary-uncertainty-B arg (=0)
+ Boundary uncertainty for overlap filtering. Specifically, records
+ failing the following predicate are filtered away: overlap >=
+ overlap-fraction-B * ( B-range-length - boundary-uncertainty-B )
+
+ SUPPORTED FORMAT_VERSION
+ Any
+
+
diff -r e4eff539a999 -r 96829b1b73ea cgatools_suite/tools/cgatools/junctiondiff.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cgatools_suite/tools/cgatools/junctiondiff.xml Wed Jun 06 16:58:26 2012 -0400
@@ -0,0 +1,88 @@
+
+
+ reports difference between junction calls
+
+
+ cgatools
+
+
+
+ cgatools junctiondiff --beta -h
+
+
+
+
+
+
+
+
+
+
+
+**What it does**
+
+This tool reports difference between junction calls of Complete Genomics junctions files
+
+cgatools: http://sourceforge.net/projects/cgatools/files/
+
+-----
+
+**cgatools Manual**::
+
+ COMMAND NAME
+ junctiondiff - Reports difference between junction calls of Complete Genomics junctions files.
+
+ DESCRIPTION
+ junctiondiff takes two junction files A and B as input and produces the
+ following output:
+ - "diff-inputFileName" - the junctions from an input file A that are not
+ present in input file B.
+ - "report.txt" - a brief summary report (if --statout is used)
+
+ Two junctions are considered equivalent if:
+ - they come from different files
+ - left and right positions of one junction are not more than "--distance"
+ bases apart from the corresponding positions of another junction
+ - the junction scores are equal or above the scoreThreshold
+ - they are on the same strands
+
+ OPTIONS
+ -h [ --help ]
+ Print this help message.
+
+ --beta
+ This is a beta command. To run this command, you must pass the --beta
+ flag.
+
+ -s [ --reference ] arg
+ Reference file.
+
+ -a [ --junctionsA ] arg
+ input junction file A.
+
+ -b [ --junctionsB ] arg
+ input junction file B.
+
+ -A [ --scoreThresholdA ] arg (=10)
+ score threshold value for the input file A.
+
+ -B [ --scoreThresholdB ] arg (=0)
+ score threshold value for the input file B.
+
+ -d [ --distance ] arg (=200)
+ Max distance between coordinates of potentially compatible junctions.
+
+ -l [ --minlength ] arg (=500)
+ Minimum deletion junction length to be included into the difference
+ file.
+
+ -o [ --output-prefix ] arg
+ The path prefix for all the output reports.
+
+ -S [ --statout ]
+ (Debug) Report various input file statistics. Experimental feature.
+
+ SUPPORTED FORMAT_VERSION
+ 1.5 or later
+
+
diff -r e4eff539a999 -r 96829b1b73ea cgatools_suite/tools/cgatools/listtestvariants.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cgatools_suite/tools/cgatools/listtestvariants.xml Wed Jun 06 16:58:26 2012 -0400
@@ -0,0 +1,239 @@
+
+
+
+
+
+
+ cgatools
+
+
+
+ cgatools listvariants
+ --beta
+ --reference ${crr.fields.path}
+ --output $output1
+ #if $include_list.listing == "yes"
+ --variant-listing $include_list.list
+ #end if
+ $longvar
+ --variants
+ #if $file_types.data_sources.data_source == "in"
+ #for $v in $file_types.data_sources.varfiles
+ ${v.input}
+ #end for
+ #else
+ `cat $file_types.data_sources.varlist`
+ #end if
+ ;
+
+ cgatools testvariants
+ --beta
+ --reference ${crr.fields.path}
+ --output $output2
+ --input $output1
+ --variants
+ #if $file_types.data_sources.data_source == "in"
+ #for $v in $file_types.data_sources.varfiles
+ ${v.input}
+ #end for
+ #else
+ `cat $file_types.data_sources.varlist`
+ #end if
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+**What it does**
+
+This tool uses the cgatools testvariants to test variant or mastervar files for the presence of variants.
+
+cgatools: http://sourceforge.net/projects/cgatools/files/
+
+-----
+
+**cgatools Manual**::
+
+ COMMAND NAME
+ listvariants - Lists the variants present in a variant file.
+
+ DESCRIPTION
+ Lists all called variants present in the specified variant files, in a
+ format suitable for processing by the testvariants command. The output is a
+ tab-delimited file consisting of the following columns:
+
+ variantId Sequential id assigned to each variant.
+ chromosome The chromosome of the variant.
+ begin 0-based reference offset of the beginning of the variant.
+ end 0-based reference offset of the end of the variant.
+ varType The varType as extracted from the variant file.
+ reference The reference sequence.
+ alleleSeq The variant allele sequence as extracted from the variant
+ file.
+ xRef The xRef as extrated from the variant file.
+
+ OPTIONS
+ -h [ --help ]
+ Print this help message.
+
+ --beta
+ This is a beta command. To run this command, you must pass the --beta
+ flag.
+
+ --reference arg
+ The reference crr file.
+
+ --output arg (=STDOUT)
+ The output file (may be omitted for stdout).
+
+ --variants arg
+ The input variant files (may be positional args).
+
+ --variant-listing arg
+ The output of another listvariants run, to be merged in to produce the
+ output of this run.
+
+ --list-long-variants
+ In addition to listing short variants, list longer variants as well
+ (10's of bases) by concatenating nearby calls.
+
+ SUPPORTED FORMAT_VERSION
+ 0.3 or later
+
+
+
+ COMMAND NAME
+ testvariants - Tests variant files for presence of variants.
+
+ DESCRIPTION
+ Tests variant files for presence of variants. The output is a tab-delimited
+ file consisting of the columns of the input variants file, plus a column
+ for each assembly results file that contains a character code for each
+ allele. The character codes have meaning as follows:
+
+ 0 This allele of this genome is consistent with the reference at this
+ locus but inconsistent with the variant.
+ 1 This allele of this genome has the input variant at this locus.
+ N This allele of this genome has no-calls but is consistent with the
+ input variant.
+
+ OPTIONS
+ -h [ --help ]
+ Print this help message.
+
+ --beta
+ This is a beta command. To run this command, you must pass the --beta
+ flag.
+
+ --reference arg
+ The reference crr file.
+
+ --input arg (=STDIN)
+ The input variants to test for.
+
+ --output arg (=STDOUT)
+ The output file (may be omitted for stdout).
+
+ --variants arg
+ The input variant files (may be passed in as arguments at the end of
+ the command).
+
+ SUPPORTED FORMAT_VERSION
+ 0.3 or later
+
+
diff -r e4eff539a999 -r 96829b1b73ea cgatools_suite/tools/cgatools/listvariants.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cgatools_suite/tools/cgatools/listvariants.xml Wed Jun 06 16:58:26 2012 -0400
@@ -0,0 +1,177 @@
+
+
+
+ lists all called variants
+
+
+ cgatools
+
+
+
+ cgatools listvariants
+ --beta
+ --reference ${crr.fields.path}
+ --output $output
+ #if $include_list.listing == "yes"
+ --variant-listing $include_list.list
+ #end if
+ $longvar
+ --variants
+ #if $file_types.data_sources.data_source == "in"
+ #for $v in $file_types.data_sources.varfiles
+ ${v.input}
+ #end for
+ #else
+ `cat $file_types.data_sources.varlist`
+ #end if
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+**What it does**
+
+This tool uses the cgatools listvariants to list all called variants present in the var or mastervar files.
+
+cgatools: http://sourceforge.net/projects/cgatools/files/
+
+-----
+
+**cgatools Manual**::
+
+ COMMAND NAME
+ listvariants - Lists the variants present in a variant file.
+
+ DESCRIPTION
+ Lists all called variants present in the specified variant files, in a
+ format suitable for processing by the testvariants command. The output is a
+ tab-delimited file consisting of the following columns:
+
+ variantId Sequential id assigned to each variant.
+ chromosome The chromosome of the variant.
+ begin 0-based reference offset of the beginning of the variant.
+ end 0-based reference offset of the end of the variant.
+ varType The varType as extracted from the variant file.
+ reference The reference sequence.
+ alleleSeq The variant allele sequence as extracted from the variant
+ file.
+ xRef The xRef as extrated from the variant file.
+
+ OPTIONS
+ -h [ --help ]
+ Print this help message.
+
+ --beta
+ This is a beta command. To run this command, you must pass the --beta
+ flag.
+
+ --reference arg
+ The reference crr file.
+
+ --output arg (=STDOUT)
+ The output file (may be omitted for stdout).
+
+ --variants arg
+ The input variant files (may be positional args).
+
+ --variant-listing arg
+ The output of another listvariants run, to be merged in to produce the
+ output of this run.
+
+ --list-long-variants
+ In addition to listing short variants, list longer variants as well
+ (10's of bases) by concatenating nearby calls.
+
+ SUPPORTED FORMAT_VERSION
+ 0.3 or later
+
+
diff -r e4eff539a999 -r 96829b1b73ea cgatools_suite/tools/cgatools/snpdiff.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cgatools_suite/tools/cgatools/snpdiff.xml Wed Jun 06 16:58:26 2012 -0400
@@ -0,0 +1,116 @@
+
+
+ compares snp calls to a Complete Genomics variant file.
+
+
+ cgatools
+
+
+
+ cgatools snpdiff --beta -h
+
+
+
+
+
+
+
+
+
+
+
+**What it does**
+
+This tool ompares snp calls to a Complete Genomics variant file.
+
+cgatools: http://sourceforge.net/projects/cgatools/files/
+
+-----
+
+**cgatools Manual**::
+
+ COMMAND NAME
+ snpdiff - Compares snp calls to a Complete Genomics variant file.
+
+ DESCRIPTION
+ Compares the snp calls in the "genotypes" file to the calls in a Complete
+ Genomics variant file. The genotypes file is a tab-delimited file with at
+ least the following columns (additional columns may be given):
+
+ Chromosome (Required) The name of the chromosome.
+ Offset0Based (Required) The 0-based offset in the chromosome.
+ GenotypesStrand (Optional) The strand of the calls in the Genotypes
+ column (+ or -, defaults to +).
+ Genotypes (Optional) The calls, one per allele. The following
+ calls are recognized:
+ A,C,G,T A called base.
+ N A no-call.
+ - A deleted base.
+ . A non-snp variation.
+
+ The output is a tab-delimited file consisting of the columns of the
+ original genotypes file, plus the following additional columns:
+
+ Reference The reference base at the given position.
+ VariantFile The calls made by the variant file, one per allele.
+ The character codes are the same as is described for
+ the Genotypes column.
+ DiscordantAlleles (Only if Genotypes is present) The number of
+ Genotypes alleles that are discordant with calls in
+ the VariantFile. If the VariantFile is described as
+ haploid at the given position but the Genotypes is
+ diploid, then each genotype allele is compared
+ against the haploid call of the VariantFile.
+ NoCallAlleles (Only if Genotypes is present) The number of
+ Genotypes alleles that were no-called by the
+ VariantFile. If the VariantFile is described as
+ haploid at the given position but the Genotypes is
+ diploid, then a VariantFile no-call is counted twice.
+
+ The verbose output is a tab-delimited file consisting of the columns of the
+ original genotypes file, plus the following additional columns:
+
+ Reference The reference base at the given position.
+ VariantFile The call made by the variant file for one allele (there is
+ a line in this file for each allele). The character codes
+ are the same as is described for the Genotypes column.
+ [CALLS] The rest of the columns are pasted in from the VariantFile,
+ describing the variant file line used to make the call.
+
+ The stats output is a comma-separated file with several tables describing
+ the results of the snp comparison, for each diploid genotype. The tables
+ all describe the comparison result (column headers) versus the genotype
+ classification (row labels) in different ways. The "Locus classification"
+ tables have the most detailed match classifications, while the "Locus
+ concordance" tables roll these match classifications up into "discordance"
+ and "no-call". A locus is considered discordant if it is discordant for
+ either allele. A locus is considered no-call if it is concordant for both
+ alleles but has a no-call on either allele. The "Allele concordance"
+ describes the comparison result on a per-allele basis.
+
+ OPTIONS
+ -h [ --help ]
+ Print this help message.
+
+ --reference arg
+ The input crr file.
+
+ --variants arg
+ The input variant file.
+
+ --genotypes arg
+ The input genotypes file.
+
+ --output-prefix arg
+ The path prefix for all output reports.
+
+ --reports arg (=Output,Verbose,Stats)
+ Comma-separated list of reports to generate. A report is one of:
+ Output The output genotypes file.
+ Verbose The verbose output file.
+ Stats The stats output file.
+
+ SUPPORTED FORMAT_VERSION
+ 0.3 or later
+
+
diff -r e4eff539a999 -r 96829b1b73ea cgatools_suite/tools/cgatools/testing.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cgatools_suite/tools/cgatools/testing.pl Wed Jun 06 16:58:26 2012 -0400
@@ -0,0 +1,10 @@
+#!/usr/bin/perl
+
+print "$0 @ARGV\n";
+open OUT, ">@ARGV[0]";
+print "test1 ok\ttest1 ok\ntest1 ok\ttest1 ok\n";
+print OUT "test ok\ttest ok\ntest ok\ttest ok\n";
+close OUT;
+open OUT, ">somefile";
+print OUT "test2 ok\ttest2 ok\ntest2 ok\ttest2 ok\n";
+close OUT;
\ No newline at end of file
diff -r e4eff539a999 -r 96829b1b73ea cgatools_suite/tools/cgatools/testvariants.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cgatools_suite/tools/cgatools/testvariants.xml Wed Jun 06 16:58:26 2012 -0400
@@ -0,0 +1,157 @@
+
+
+
+ test for the presence of variants
+
+
+ cgatools
+
+
+
+ cgatools testvariants
+ --beta
+ --reference ${crr.fields.path}
+ --output $output
+ --input $listing
+ --variants
+ #if $file_types.data_sources.data_source == "in"
+ #for $v in $file_types.data_sources.varfiles
+ ${v.input}
+ #end for
+ #else
+ `cat $file_types.data_sources.varlist`
+ #end if
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+**What it does**
+
+This tool uses the cgatools testvariants to test variant or mastervar files for the presence of variants.
+
+cgatools: http://sourceforge.net/projects/cgatools/files/
+
+-----
+
+**cgatools Manual**::
+
+ COMMAND NAME
+ testvariants - Tests variant files for presence of variants.
+
+ DESCRIPTION
+ Tests variant files for presence of variants. The output is a tab-delimited
+ file consisting of the columns of the input variants file, plus a column
+ for each assembly results file that contains a character code for each
+ allele. The character codes have meaning as follows:
+
+ 0 This allele of this genome is consistent with the reference at this
+ locus but inconsistent with the variant.
+ 1 This allele of this genome has the input variant at this locus.
+ N This allele of this genome has no-calls but is consistent with the
+ input variant.
+
+ OPTIONS
+ -h [ --help ]
+ Print this help message.
+
+ --beta
+ This is a beta command. To run this command, you must pass the --beta
+ flag.
+
+ --reference arg
+ The reference crr file.
+
+ --input arg (=STDIN)
+ The input variants to test for.
+
+ --output arg (=STDOUT)
+ The output file (may be omitted for stdout).
+
+ --variants arg
+ The input variant files (may be passed in as arguments at the end of
+ the command).
+
+ SUPPORTED FORMAT_VERSION
+ 0.3 or later
+
+
diff -r e4eff539a999 -r 96829b1b73ea cgatools_suite/tools/cgatools/varfilter.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cgatools_suite/tools/cgatools/varfilter.xml Wed Jun 06 16:58:26 2012 -0400
@@ -0,0 +1,184 @@
+
+
+
+ copies input file, applying filters.
+
+
+ cgatools
+
+
+
+ varfilter_wrapper.pl
+ --reference $crr.fields.path
+ --output $output
+ --input $file_types.data_sources.input
+ #for $f in $filters
+ --zygosity $f.zygosity
+ --vartype $f.vartype
+ --varscorevaf x$f.varscorevaf
+ --varscoreeaf x$f.varscoreeaf
+ --varquality $f.varquality
+ #end for
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+**What it does**
+
+This tool copies input var file or masterVar file to output, applying specified filters.
+
+cgatools: http://sourceforge.net/projects/cgatools/files/
+
+-----
+
+**cgatools Manual**::
+
+ COMMAND NAME
+ varfilter - Copies input var file or masterVar file to output, applying
+ specified filters.
+
+ DESCRIPTION
+ Copies input var file or masterVar file to output, applying specified
+ filters (which are available to all cgatools commands that read a var file
+ or masterVar file as input). Filters are specified by appending the filter
+ specification to the var file name on the command line. For example:
+
+ /path/to/var.tsv.bz2#varQuality!=VQHIGH
+
+ The preceding example filters out any calls marked as VQLOW. The filter
+ specification follows the "#" sign, and consists of a list of filters to
+ apply, separated by a comma. Each filter is a colon-separated list of call
+ selectors. Any scored call that passes all the colon-separated call
+ selectors for one or more of the comma-separated filters is turned into a
+ no-call. The following call selectors are available:
+
+ hom Selects only calls in homozygous loci.
+ het Selects any scored call not selected by the hom selector.
+ varType=XX Selects calls whose varType is XX.
+ varScoreVAF<XX Selects calls whose varScoreVAF<XX.
+ varScoreEAF<XX Selects calls whose varScoreEAF<XX.
+ varQuality!=XX Selects calls whose varQuality is not XX.
+
+ Here is an example that filters homozygous SNPs with varScoreVAF < 25 and
+ heterozygous insertions with varScoreEAF < 50:
+
+
+ '/path/to/var.tsv.bz2#hom:varType=snp:varScoreVAF<25,het:varType=ins:varScoreEAF<50'
+
+
+ OPTIONS
+ -h [ --help ]
+ Print this help message.
+
+ --beta
+ This is a beta command. To run this command, you must pass the --beta flag.
+
+ --reference arg
+ The reference crr file.
+
+ --input arg
+ The input var file or masterVar file (typically with filters specified).
+
+ --output arg (=STDOUT)
+ The output file (may be omitted for stdout).
+
+ SUPPORTED FORMAT_VERSION
+ 0.3 or later
+
+
diff -r e4eff539a999 -r 96829b1b73ea cgatools_suite/tools/cgatools/varfilter_wrapper.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cgatools_suite/tools/cgatools/varfilter_wrapper.pl Wed Jun 06 16:58:26 2012 -0400
@@ -0,0 +1,56 @@
+#!/usr/bin/perl
+use strict;
+use Getopt::Long;
+use vars qw($opt_reference $opt_input $opt_output @opt_zygosity @opt_vartype @opt_varscorevaf @opt_varscoreeaf @opt_varquality);
+$| = 1; # set autoflush to screen
+
+# This is a wrapper for the cgatools varfilter function to run cgatools varfilter in Galaxy.
+# The wrapper generates the filter(s) in the correct format to be used with the input file.
+# written 6-1-2012 by bcrain@completegenomics.com
+
+
+#print join("\n", @ARGV), "\n";
+&GetOptions("reference=s", "input=s", "output=s", "zygosity=s@", "vartype=s@", "varscorevaf=s@", "varscoreeaf=s@", "varquality=s@");
+
+my $append = '';
+
+for (my $i = 0; $i <= $#opt_zygosity; $i ++)
+{
+ my $filter = '';
+ unless ($opt_zygosity[$i] eq 'NA') {$filter = $opt_zygosity[$i];}
+ unless ($opt_vartype[$i] eq 'NA')
+ {
+ $filter ne '' and $filter .= ':';
+ $filter .= 'varType=' . $opt_vartype[$i];
+ }
+ unless ($opt_varscorevaf[$i] eq 'x')
+ {
+ $filter ne '' and $filter .= ':';
+ $opt_varscorevaf[$i] =~ s/^x//;
+ $filter .= 'varScoreVAF<' . $opt_varscorevaf[$i];
+ }
+ unless ($opt_varscoreeaf[$i] eq 'x')
+ {
+ $filter ne '' and $filter .= ':';
+ $opt_varscoreeaf[$i] =~ s/^x//;
+ $filter .= 'varScoreEAF<' . $opt_varscoreeaf[$i];
+ }
+ unless ($opt_varquality[$i] eq 'NA')
+ {
+ $filter ne '' and $filter .= ':';
+ $filter .= 'varQuality!=' . $opt_varquality[$i];
+ }
+
+ if ($filter ne '')
+ {
+ if ($append eq '') {$append = '#' . $filter;}
+ else {$append .= ',' . $filter;}
+ }
+}
+print "cgatools varfilter
+--beta
+--reference $opt_reference
+--output $opt_output
+--input '${opt_input}${append}'\n";
+
+`cgatools varfilter --beta --reference $opt_reference --output $opt_output --input '${opt_input}${append}'`;
\ No newline at end of file
diff -r e4eff539a999 -r 96829b1b73ea cgatools_suite/tools/cgi_scripts/.DS_Store
Binary file cgatools_suite/tools/cgi_scripts/.DS_Store has changed
diff -r e4eff539a999 -r 96829b1b73ea cgatools_suite/tools/cgi_scripts/._Calculate_TestVariants_Variant_Frequencies.xml
Binary file cgatools_suite/tools/cgi_scripts/._Calculate_TestVariants_Variant_Frequencies.xml has changed
diff -r e4eff539a999 -r 96829b1b73ea cgatools_suite/tools/cgi_scripts/._Calculate_TestVariants_Variant_Frequencies_0_1_0.pl
Binary file cgatools_suite/tools/cgi_scripts/._Calculate_TestVariants_Variant_Frequencies_0_1_0.pl has changed
diff -r e4eff539a999 -r 96829b1b73ea cgatools_suite/tools/cgi_scripts/._List_Unique_Variants.xml
Binary file cgatools_suite/tools/cgi_scripts/._List_Unique_Variants.xml has changed
diff -r e4eff539a999 -r 96829b1b73ea cgatools_suite/tools/cgi_scripts/._List_Unique_Variants_2_1_0.pl
Binary file cgatools_suite/tools/cgi_scripts/._List_Unique_Variants_2_1_0.pl has changed
diff -r e4eff539a999 -r 96829b1b73ea cgatools_suite/tools/cgi_scripts/Calculate_TestVariants_Variant_Frequencies.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cgatools_suite/tools/cgi_scripts/Calculate_TestVariants_Variant_Frequencies.xml Wed Jun 06 16:58:26 2012 -0400
@@ -0,0 +1,67 @@
+
+
+ in cgatools-testvariants file
+
+
+ Calculate_TestVariants_Variant_Frequencies_0_1_0.pl
+ --Input $input
+ --First_Genome_Field_Nr $first_col
+ --Last_Genome_Field_Nr $last_col
+ --Output1 $output1
+ --Output2 $output2
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+**What it does**
+
+This tool calculates the allele frequencies for all variants present in the testvariant file.
+
+-----
+
+**Instructions**::
+
+ Calculate the frequencies of variants in a testvariants output file
+ Two values calculated:
+ Frequency vs all alleles
+ Frequency vs called alleles
+
+ Input: testvariants file
+ Outputs:
+ All data to *-Freq.tsv, including scores and quals
+ vars and freqs to *-Freq_Short.tsv
+ Exceptions to *-Freq_Log
+ Stats to *-Freq_Stats
+
+
+ perl Calculate_TestVariants_Variant_Frequencies_0_0_3.pl \
+ --Input input_file \
+ --First_Genome_Field_Nr col_nr1 \
+ --Last_Genome_Field_Nr col_nr2
+ --Output1 output1 \
+ --Output2 output_short \
+ eg
+ perl Calculate_TestVariants_Variant_Frequencies_0_0_3.pl \
+ --Input /data/Family_Quartet_testvariants.tsv \
+ --Output /data/Family_Quartet_testvariants
+ --First_Genome_Field_Nr 9 \
+ --Last_Genome_Field_Nr 11
+ --Output1 /data/Family_Quartet_testvariants
+ --Output2 /data/Family_Quartet_testvariants_short
+
+
+
diff -r e4eff539a999 -r 96829b1b73ea cgatools_suite/tools/cgi_scripts/Calculate_TestVariants_Variant_Frequencies_0_1_0.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cgatools_suite/tools/cgi_scripts/Calculate_TestVariants_Variant_Frequencies_0_1_0.pl Wed Jun 06 16:58:26 2012 -0400
@@ -0,0 +1,271 @@
+#!/usr/bin/perl
+use strict;
+#use feature "say";
+#use File::Basename;
+$| = 1;
+
+# Get_TestVariants_Variant_Frequencies
+# Calculate the frequencies of variants in a testvariants output file
+# Two values calculated:
+# Frequency vs all alleles
+# Frequency vs called alleles
+
+# Input is a testvariants file
+# Outputs:
+# All data to *-Freq.tsv, including scores and quals
+# vars and freqs to *-Freq_Short.tsv
+# Exceptions to *-Freq_Log
+# Stats to *-Freq_Stats
+
+# Format:
+# perl prog file dir
+# ie
+# perl Get_TestVariants_Variant_Frequencies \
+# --Input input_file \
+# --First_Genome_Field_Nr col_nr1 \
+# --Last_Genome_Field_Nr col_nr2
+# --Output1 output1 \
+# --Output2 output2
+
+# eg
+# perl /perl/Get_TestVariants_Variant_Frequencies_0_0_1.pl \
+# --Input /data/Family_Quartet_testvariants.tsv \
+# --First_Genome_Field_Nr 9 \
+# --Last_Genome_Field_Nr 11
+# --Output1 output1 \
+# --Output2 output2
+
+
+# Rick Tearle 2010-11
+
+my $Time -= time; # start time
+my $Debug = 0;
+
+# Parsing and storing input parameters
+# Only childfields can be repeated
+print "$0 @ARGV\nProcessing input parameters\n";
+my %ExpectedParams = GetExpectedParams ();
+my %EnteredParams = GetEnteredParams ();
+
+# Setting up prog paras from input paras
+my $FileIn = $EnteredParams{input};
+unless (-f $FileIn) {die "Testvariants input file $FileIn not found\n";} # requires existing file
+#my $FileOut = $EnteredParams{output}; #
+#$DirectoryOut =~ s/\/$//; # remove trailing slash if present
+#unless (-d $DirectoryOut) {die "Output directory $DirectoryOut not found\n";} # requires existing file
+#print "$FileIn\n$DirectoryOut\n";
+#$FileIn =~ /(^.+\/)(.+?)\./; # get filename without path and without extensions
+
+# my $FileOut1 = $FileOut."-Freq.tsv";
+# my $FileOut2 = $FileOut."-Freq_Short.tsv";
+# my $FileOut3 = $FileOut."-Freq_Stats.tsv";
+# my $FileOut4 = $FileOut."-Freq_Log.tsv";
+
+print "\nOpening Input File:\n\t$FileIn\n";
+my $IN = OpenFile ($FileIn); # open the file with correct file format
+
+#print "\nOpening Output Files:\n\t$FileOut1\n\t$FileOut2\n\t$FileOut3\n\t$FileOut4\n"; #exit;
+open my $OUT1, ">", $EnteredParams{output1};
+open my $OUT2, ">", $EnteredParams{output2};
+#open my $OUT3, ">", $EnteredParams{output3};
+#open my $OUT4, ">", $EnteredParams{output4};
+
+# Get col header and genomes fields
+my $ColHeader = <$IN>; # get col header
+chomp $ColHeader;
+my @ColHeader = split /\t/, $ColHeader;
+my $StartGenomes = $EnteredParams{first_genome_field_nr} - 1; # first column with testvariants data, 1 based -> 0 based
+my $StopGenomes = $EnteredParams{last_genome_field_nr} - 1; # first column with testvariants data, 1 based -> 0 based
+if ($StartGenomes < 0) {die "No valid entry for First_Genome_Field_Nr, must be 1 or greater\n";}
+if ($StopGenomes < 0) {die "No valid entry for Last_Genome_Field_Nr, must be 1 or greater\n";}
+if ($StartGenomes > $StopGenomes) {die "Last_Genome_Field_Nr must be greater than or equal to First_Genome_Field_Nr\n";}
+if ($StartGenomes > int @ColHeader) {die "First_Genome_Field_Nr > number of fields in column header\n";}
+if ($StopGenomes > int @ColHeader) {die "Last_Genome_Field_Nr > number of fields in column header\n";}
+my $NrGenomes = $StopGenomes - $StartGenomes + 1;
+#print "$StartGenomes\t$StopGenomes\n"; #exit;
+#print "First Genome Field:\n\t$ColHeader[$StartGenomes]\n";
+#print "Last Genome Field:\n\t$ColHeader[$StopGenomes]\n\n";
+
+# print column headers
+print $OUT1 join("\t",@ColHeader),"\tAllFreq\tCalledFreq\n";
+print $OUT2 join("\t",@ColHeader[0..7]),"\tAllFreq\tCalledFreq\n";
+print join("\t",@ColHeader),"\n";
+print "First Genome Field: $ColHeader[$StartGenomes]\n";
+print "Last Genome Field: $ColHeader[$StopGenomes]\n";
+print "Nr Genomes: $NrGenomes\n\n";
+
+print "\nProcessing Variants....\n";
+my $VariantCount = 0; # variant locus counter, not used
+my %AllFreqCounts; # storing histogram of all freq counts
+my %CalledFreqCounts; # storing histogram of called freq counts
+my $Warnings;
+while (<$IN>)
+{
+ # testvariants fields: variantId chromosome begin end varType reference alleleSeq xRef GS000000XX1-ASM GS000000XX2-ASM [GS000000XXN-ASM]
+ my $Line = $_; # save line for output below
+ chomp $Line;
+ my @F = split /\t/, $Line; # split in to fields
+ $VariantCount++; # increment variant counter
+ my $UseFields = join ("",@F[$StartGenomes..$StopGenomes]); # get genome fields as string, to count 0s and 1s
+ my $Count1 = () = $UseFields =~ /1/g; # count the number of 1s
+ my $Count0 = () = $UseFields =~ /0/g; # count the number of 0s
+ my $CountN = () = $UseFields =~ /N/g; # count the number of Ns
+ my $NrAlleles = $Count1 + $Count0 + $CountN; # total count
+ unless ($NrAlleles == $NrGenomes *2 or $NrAlleles == $NrGenomes) # count does not match expected for diploid/haploid locus
+ {
+ print "$NrAlleles alleles for variant ",join(" ",@F[0..7]),"\n"; # log warning
+ #print "Expected $NrGenomes or ",$NrGenomes*2," alleles depending on ploidy of locus\n";
+ #if ($Warnings++ > 10) {die "Have found $Warnings exceptions for this file, termnating processing\n";} # terminate if too many warnings
+ }
+ my $AllFreq = sprintf("%0.3f",$Count1/$NrAlleles); # calculate freq of 1s vs all alleles
+ my $CalledFreq = sprintf("%0.3f",0);
+ if ($Count1+$Count0) {$CalledFreq = sprintf("%0.3f",$Count1/($Count1+$Count0));} # calculate freq of 1s vs called alleles, if there are any
+ $AllFreqCounts{$AllFreq}++; # increment all freq histogram
+ $CalledFreqCounts{$CalledFreq}++; # increment called freq histogram
+ #print "$Line\n$AlleleCount\t$Count1\t$Count0\t$AllFreq\t$CalledFreq\n"; #exit;
+ print $OUT1 "$Line\t$AllFreq\t$CalledFreq\n"; # output full testvariants plus frequencies for this var
+ print $OUT2 join("\t",@F[0..7]),"\t$AllFreq\t$CalledFreq\n"; # output just var info plus frequencies for this var
+ #exit if $VariantCount > 20;
+}
+close $OUT1;
+close $OUT2;
+
+# Print frequency histograms
+print "Nr Variants at each Frequency (All):\nFreq\tCount\n"; # header
+foreach my $Freq (sort {$a <=> $b} keys %AllFreqCounts) {print "$Freq\t$AllFreqCounts{$Freq}\n";}
+
+print "\nNr Variants at each Frequency (Called):\nFreq\tCount\n"; # header
+foreach my $Freq (sort {$a <=> $b} keys %CalledFreqCounts) {print "$Freq\t$CalledFreqCounts{$Freq}\n";}
+
+$Time += time;
+print "\ntime $Time\n";
+
+###########################################################################
+# SUBS #
+###########################################################################
+
+sub GetExpectedParams
+{
+ my %Hash =
+ (
+ "input" => -1,
+ "output_dir" => -1,
+ ); # store parameters and values
+ return %Hash;
+}
+
+sub GetEnteredParams
+{
+ # Processing @ARGV
+ my %Hash;
+
+ my @ARGVs = split /--/, join (" ",@ARGV); # split args on --, into array
+ #print "Start\n", join ("\n",@ARGVs),"\n",int @ARGVs - 1,"\n\n" if $Debug;
+ #print "Key\tVal\n" if $Debug; #exit;
+ for my $n (1..$#ARGVs) # parse each
+ {
+ $ARGVs[$n] =~ s/\s+$//; # remove any trailing spaces
+ my ($Key, $Val) = split / /, $ARGVs[$n], 2; # put first element into key, any other elements into val
+ $Key = lc $Key;
+ $Hash{$Key} = $Val; # make a hash entry out of key and val
+ #print "$Key\t$EnteredParams{$Key}\n" if $Debug;
+ }
+ #print int(keys %Hash),"\n" if $Debug;
+ #foreach my $Arg (keys %Hash) {print "Arg: $Arg\t",$ExpectedParams{$Arg},"\n";}
+ #print "Arg string:\t",join (" ",@ARGV),"\n" if $Debug;
+ #exit if $Debug;
+ return %Hash; # hash now has each -- entry param, with associated values
+}
+
+sub SaveArrayAsString
+{
+ my $FH = shift;
+ my $Fields = shift;
+ #print "$Fields\n";
+ print $FH join("\t",@$Fields),"\n";
+}
+
+sub ConcatenateVariants
+{
+ my $ArrayIn = shift; # ptr to array
+ my $StateFieldNr = shift; # field to process
+ #print int(@$ArrayIn),"\n";
+ my @ArrayOut; # array to store records out
+ my $Nr = -1;
+ foreach my $Entry (@$ArrayIn)
+ {
+ }
+ return \@ArrayOut; # return ptr to array
+}
+
+sub LoadStateRecord
+{
+ my $Out = shift;
+ my $In = shift;
+ my $StateFieldNr = shift;
+
+ $Out->{State} = $$In[$StateFieldNr]; # get state for new record
+ $Out->{Chr} = $$In[1]; # get chr
+ $Out->{Begin} = $$In[2]; # get begin of state range
+ $Out->{End} = $$In[3]; # get current end of state range
+ $Out->{Records}++; # record added to new count
+}
+
+sub OpenFile
+{
+ my $File = shift;
+ my $FH;
+ open ($FH, "$File") or die ("$!: can't open file $File");
+ return $FH;
+}
+
+sub OpenFileold
+{
+ my $File = shift;
+ my $FH;
+
+ if ($File =~ /.bz2$/)
+ {
+ open ($FH, "bzcat $File |") or die ("$!: can't open file $File");
+ }
+ elsif ($File =~ /.gz$/)
+ {
+ open ($FH, "gunzip -c $File |") or die ("$!: can't open file $File");
+ }
+ elsif ($File =~ /.tsv$/)
+ {
+ open ($FH, "cat $File |") or die ("$!: can't open file $File");
+ }
+ else
+ {
+ die ("$!: do not recognise file type $File");
+ }
+ return $FH;
+}
+
+sub LoadNewRecord
+{
+ my $In = shift;
+ my $Out = shift;
+ $Out->{Chr} = $In->{Chr};
+ $Out->{State} = $In->{State};
+ $Out->{Begin} = $In->{Begin};
+ $Out->{End} = $In->{End};
+ $Out->{Records} = $In->{Records};
+}
+
+sub NewStateRecord
+{
+ my $Record =
+ {
+ Chr => "",
+ Begin => -1,
+ End => -1,
+ State => "",
+ Records => 0,
+ MIEs => 0,
+ StateErrors => 0,
+ Length => -1,
+ };
+ return $Record;
+}
diff -r e4eff539a999 -r 96829b1b73ea cgatools_suite/tools/cgi_scripts/List_Unique_Variants.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cgatools_suite/tools/cgi_scripts/List_Unique_Variants.xml Wed Jun 06 16:58:26 2012 -0400
@@ -0,0 +1,323 @@
+
+
+ with annotations from gene or var files
+
+
+ #if $file_types.file_type =="var2"
+ List_Unique_Variants_2_1_0.pl --File_Type V --Output_File $output
+ --Var_Type $file_types.variants
+ $file_types.scoresVAF
+ $file_types.scoresEAF
+ $file_types.varQuality
+ #if $file_types.data_sources.data_source == "in"
+ #for $v in $file_types.data_sources.varfiles
+ --Input_File ${v.input}
+ #end for
+ #else
+ `cat $file_types.data_sources.varlist`
+ #end if
+
+ #else if $file_types.file_type =="var1"
+ List_Unique_Variants_2_1_0.pl --File_Type V --Output_File $output
+ --Var_Type $file_types.variants
+ $file_types.scores
+ #if $file_types.data_sources.data_source == "in"
+ #for $v in $file_types.data_sources.varfiles
+ --Input_File ${v.input}
+ #end for
+ #else
+ `cat $file_types.data_sources.varlist`
+ #end if
+
+ #else if $file_types.file_type =="gene"
+ List_Unique_Variants_2_1_0.pl --File_Type G --Output_File $output
+ --Var_Type $file_types.variants
+ --Component $file_types.component
+ --Impact $file_types.impact
+ #if $file_types.data_sources.data_source == "in"
+ #for $g in $file_types.data_sources.genefiles
+ --Input_File ${g.input}
+ #end for
+ #else
+ `cat $file_types.data_sources.genelist`
+ #end if
+ #end if
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+**What it does**
+
+This tool identifies all called variants present in the var or gene files and generates annotated variant list.
+
+-----
+
+**Instructions**::
+
+ List Unique Variants for Pipeline 1.x and 2.x
+ [Uses header if available, checks for position of xref field if not]
+ Take one or more var or gene files
+ Extract a non-redundant set of variants
+
+ For var files:
+ The fields used to define non-redundant variants are are:
+ chromosome begin end varType reference alleleSeq xRef
+ User can nominate class(es) of varType to filter on
+ Outputs varScoreEAF, varScoreVAF and varQuality as a default but user can turn
+ them off (separately)
+ Scores and qualities stored in separate fields, all values for a variant across
+ a set of genomes.
+ Values for different genomes separated by ':', for two hom entries for the same
+ genome by '|'
+ Output is accepted by testvariants to generate a variant table, all fields kept
+ in testvariants output
+
+ For gene files:
+ The fields used to define non-redundant gene variants are:
+ chromosome begin end varType reference call xRef geneId mrnaAcc proteinAcc symbol
+ orientation component componentIndex codingRegionKnown impact nucleotidePos
+ proteinPos annotationRefSequence sampleSequence genomeRefSequence
+ User can nominate class(es) of varType, component or impact to filter on
+ All gene entries kept ie multiple entries if multiple transcripts
+
+ NB Now treating xref as a separate component in var recs, as it is not consistent
+ between X and Y vars
+ Not fixed for gene recs yet
+
+ perl List_Unique_Variants_2_0_11.pl
+ --File_Type [V|G]
+ --Input_File input_file_1 [set of var or gene files]
+ --Input_File input_file_2
+ ...
+ --Input_File input_file_n
+ --Output_File filename
+ --Var_Type [For both file types, 'All' or any value from the varType field,
+ multiple values allowed, separated by comma]
+ --Component [Gene file specific,'All' or any value from component field of gene
+ file, multiple allowed; 'All" is default]
+ --Impact All [Gene file specific,'All' or any value from impact field of gene
+ file, multiple allowed; 'All" is default]
+ --Scores [1.x var file specific, yes|no, yes is default]
+ --Scores_VAF [2.0 var file specific, yes|no, yes is default]
+ --Scores_EAF [2.0 var file specific, yes|no, yes is default]
+ --Score_Qualities [yes|no, yes is default]
+ eg
+ perl List_Unique_Variants_2_0_11.pl \
+ --File_Type V \
+ --Input_File /Yoruban_Trio_1100_37/GS19238-1100-37/GS00028-DNA_A01/ASM/gene-GS19238-1100-37-ASM.tsv.bz2 \
+ --Input_File /Yoruban_Trio_1100_37/GS19239-1100-37/GS00028-DNA_B01/ASM/gene-GS19239-1100-37-ASM.tsv.bz2 \
+ --Input_File /Yoruban_Trio_1100_37/GS19240-1100-37/GS00028-DNA_C01/ASM/gene-GS19240-1100-37-ASM.tsv.bz2 \
+ --Output_File /Users/rtearle/Documents/TBF/YRI_Trio_Protein_Coding.tsv \
+ --Var_Type All
+ --Component All
+ --Impact All
+ --Scores_VAF yes \
+ --Scores_EAF yes \
+ --Score_Qualities yes
+
+ var fields
+ 1.x locus ploidy haplotype chromosome begin end varType reference alleleSeq
+ totalScore hapLink xRef
+ 2.0 locus ploidy allele chromosome begin end varType reference alleleSeq
+ varScoreVAF varScoreEAF varQuality hapLink xRef
+
+ gene fields
+ 1.x index locus allele chromosome begin end varType reference call xRef geneId
+ mrnaAcc proteinAcc symbol orientation component componentIndex
+ codingRegionKnown impact nucleotidePos proteinPos annotationRefSequence
+ sampleSequence genomeRefSequence
+ 2.0 index locus allele chromosome begin end varType reference call xRef geneId
+ mrnaAcc proteinAcc symbol orientation component componentIndex hasCodingRegion
+ impact nucleotidePos proteinPos annotationRefSequence sampleSequence
+ genomeRefSequence pfam
+
+ Parsing and storing input parameters
+ Only input_file fields can be repeated
+ input paramaters are case insensitive
+
+
+
+
diff -r e4eff539a999 -r 96829b1b73ea cgatools_suite/tools/cgi_scripts/List_Unique_Variants_2_1_0.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cgatools_suite/tools/cgi_scripts/List_Unique_Variants_2_1_0.pl Wed Jun 06 16:58:26 2012 -0400
@@ -0,0 +1,583 @@
+#!/usr/bin/perl
+use strict;
+use File::Basename;
+$| = 1;
+
+# List Unique Variants for Pipeline 1.x and 2.x
+# [Uses header if available, checks for position of xref field if not]
+# Take one or more var or gene files
+# Extract a non-redundant set of variants
+
+# For var files:
+# The fields used to define non-redundant variants are are:
+# chromosome begin end varType reference alleleSeq xRef
+# User can nominate class(es) of varType to filter on
+# Outputs varScoreEAF, varScoreVAF and varQuality as a default but user can turn them off (separately)
+# Scores and qualities stored in separate fields, all values for a variant across a set of genomes.
+# Values for different genomes separated by ':', for two hom entries for the same genome by '|'
+# Output is accepted by testvariants to generate a variant table, all fields kept in testvariants output
+
+# For gene files:
+# The fields used to define non-redundant gene variants are:
+# chromosome begin end varType reference call xRef geneId mrnaAcc proteinAcc symbol orientation component componentIndex codingRegionKnown impact nucleotidePos proteinPos annotationRefSequence sampleSequence genomeRefSequence
+# User can nominate class(es) of varType, component or impact to filter on
+# All gene entries kept ie multiple entries if multiple transcripts
+
+# NB Now treating xref as a separate component in var recs, as it is not consistent between X and Y vars
+# Not fixed for gene recs yet
+
+# perl List_Unique_Variants_2_0_9.pl
+# --File_Type [V|G]
+# --Input_File input_file_1 [set of var or gene files]
+# --Input_File input_file_2
+# ...
+# --Input_File input_file_n
+# --Output_File filename
+# --Var_Type [For both file types, 'All' or any value from the varType field, multiple values allowed, separated by comma]
+# --Component [Gene file specific,'All' or any value from component field of gene file, multiple allowed; 'All" is default]
+# --Impact All [Gene file specific,'All' or any value from impact field of gene file, multiple allowed; 'All" is default]
+# --Scores [1.x var file specific, yes|no, yes is default]
+# --Scores_VAF [2.0 var file specific, yes|no, yes is default]
+# --Scores_EAF [2.0 var file specific, yes|no, yes is default]
+# --Score_Qualities [yes|no, yes is default]
+# eg
+# perl /Users/rtearle/Documents/Programming/Perl/Scripts/Dev/List_Unique_Variants_2_0_4 \
+# --File_Type V \
+# --Input_File /Yoruban_Trio_1100_37/GS19238-1100-37/GS00028-DNA_A01/ASM/gene-GS19238-1100-37-ASM.tsv.bz2 \
+# --Input_File /Yoruban_Trio_1100_37/GS19239-1100-37/GS00028-DNA_B01/ASM/gene-GS19239-1100-37-ASM.tsv.bz2 \
+# --Input_File /Yoruban_Trio_1100_37/GS19240-1100-37/GS00028-DNA_C01/ASM/gene-GS19240-1100-37-ASM.tsv.bz2 \
+# --Output_File /Users/rtearle/Documents/TBF/YRI_Trio_Protein_Coding.tsv \
+# --Var_Type All
+# --Component All
+# --Impact All
+# --Scores_VAF yes \
+# --Scores_EAF yes \
+# --Score_Qualities yes
+
+# var fields
+# 1.x
+# locus ploidy haplotype chromosome begin end varType reference alleleSeq totalScore hapLink xRef
+# 2.0
+# locus ploidy allele chromosome begin end varType reference alleleSeq varScoreVAF varScoreEAF varQuality hapLink xRef
+
+# gene fields
+# 1.x index locus allele chromosome begin end varType reference call xRef geneId mrnaAcc proteinAcc symbol orientation
+# component componentIndex codingRegionKnown impact nucleotidePos proteinPos annotationRefSequence sampleSequence genomeRefSequence
+# 2.0 index locus allele chromosome begin end varType reference call xRef geneId mrnaAcc proteinAcc symbol orientation
+# component componentIndex hasCodingRegion impact nucleotidePos proteinPos annotationRefSequence sampleSequence genomeRefSequence pfam
+
+# Parsing and storing input parameters
+# Only input_file fields can be repeated
+# input paramaters are case insensitive
+
+print "$0 @ARGV\nProcessing input parameters\n";
+my $NrParams;
+my %ExpectedParams = GetExpectedParams (); # list of expected parms
+my %EnteredParams = GetEnteredParams (); # list of entered params
+
+# Input Files
+my $FileType = $EnteredParams{file_type};
+if ($FileType ne "V" and $FileType ne "G") {die "File Type must be 'V' or 'G', not '$FileType'\n";}
+my $FilesIn = $EnteredParams{input_file}; # ptr to list of input files
+print "File Type: $FileType\nInput Files:\n";
+my $NrInputFiles = int(@$FilesIn);
+foreach my $File (@$FilesIn) {print "$File\n";} # requires existing files
+foreach my $File (@$FilesIn) {unless (-f $File) {die "Input file $File not found\n";}} # requires existing files
+for my $n (0.. $NrInputFiles-2) # look for duplicates in list
+ {for my $m ($n+1.. $NrInputFiles-1) {if ($$FilesIn[$n] eq $$FilesIn[$m] ) {die "File $$FilesIn[$n] is repeated in input file list\n";}}}
+
+# Output Dir
+#my $DirectoryOut = $EnteredParams{output_dir}; # output dir
+#$DirectoryOut =~ s/\/$//; # remove trailing slash if present
+#unless (-d $DirectoryOut) {mkdir $DirectoryOut or die "Cannot find/create output directory $DirectoryOut\n";} # uses existing dir or makes a new dir if it can
+
+# Ouput File
+my $FileOut = $EnteredParams{output_file}; # output file
+print "FileOut: $FileOut\n";
+$FileOut =~ /(^.+\/)?(.+$)/; # split in to path and filename
+my $DirectoryOut = $1; # assign path # NEED MORE TESTING EG EMPTY PATH #
+$FileOut = $2; # assign file prefix
+print "File: $FileOut\nDir: $DirectoryOut\n"; #exit;
+# if (-f $DirectoryOut.$FileOut) # ouput file exists, create a new one based on the name
+# {
+# print "Output file $FileOut exists, modifying to unique file name ";
+# $FileOut =~ /^(.+?)\./; # find name without extensions
+# my $Stub = $1; # set stub to name without extensions
+# $FileOut =~ /(\..+)?$/; # get extension(s)
+# my $Ext = $1; # set ext to extensions
+# my $n = 1; # n will increment to find a unique name
+# my $Suff = ""; # suff tracks n
+# while (-f $DirectoryOut.$Stub.$Suff.$Ext) {$Suff = "-$n"; $n++;} # loop till we have a new unique filename
+# $FileOut = $Stub.$Suff.$Ext; # file out now has same name, same extensions, but also -n at the end of the name, making it unique
+# print "$FileOut\n";
+# }
+
+#print "Files\n",join("\n",@$FilesIn),"\n\n";
+#print "Ouput Dir\n$DirectoryOut\n";
+#print "Ouput File\n$FileOut\n";
+
+# Extract Header & Column Header
+my $IN = OpenFile ($$FilesIn[0]); # open the first file with correct file format
+my $Header = GetHeaderAsString ($IN); # get header
+unless ($Header) {close $IN; $IN = OpenFile ($$FilesIn[0]);} # if there is no header, close and reopen file, ie start file again
+my $ColHeader = <$IN>; # get col header, first remaining line
+chomp $ColHeader;
+
+# Get version if filetype is var - needed because there are new fields in 2.0 and posn of xRef changed
+my ($Version, $XrefField);
+if ($FileType eq "V")
+{
+ ($Version, $XrefField) = GetVersion ($Header, $ColHeader);
+ #print "$Version $XrefField\n"; exit;
+ unless ($Version) {die "Cannot determine format version of first file in list\nNeed either a native Complete header or a native Complete Column Header with an xRef field\n";}
+}
+
+# Shared input params
+my $OutputVarTypes = lc $EnteredParams{var_type} || $ExpectedParams{var_type}; # var types listed in file in lc
+$OutputVarTypes =~ s/\,/\|/g; # create regex string
+$OutputVarTypes =~ s/\,| //; # remove extraneous commas, spaces
+
+# Input Params for var file
+my ($KeepScoresVAF, $KeepScoresEAF, $KeepQuals, $KeepScores, $VarExtras);
+if ($FileType eq "V")
+{
+ if ($Version == 2)
+ {
+ $KeepQuals = lc $EnteredParams{score_quality} || $ExpectedParams{score_quality}; # keep scoresQuality for 2.0
+ $KeepQuals = 1 if $KeepQuals eq "yes"; # converting to boolean
+ $KeepScoresVAF = lc $EnteredParams{scores_vaf} || $ExpectedParams{scores_vaf}; # keep scoresVAF for 2.0
+ $KeepScoresVAF = 1 if $KeepScoresVAF eq "yes"; # converting to boolean
+ $KeepScoresEAF = lc $EnteredParams{scores_eaf} || $ExpectedParams{scores_eaf}; # keep scoresEAF for 2.0 $KeepQuals = lc $EnteredParams{score_qualities} || $ExpectedParams{score_qualities}; # keep scoresQuality for 2.0
+ $KeepScoresEAF = 1 if $KeepScoresEAF eq "yes"; # converting to boolean
+ }
+ else # Version 1
+ {
+ $KeepScores = lc $EnteredParams{scores} || $ExpectedParams{scores}; # keep scores for 1.x
+ $KeepScores = 1 if $KeepScores eq "yes"; # converting to boolean
+ }
+ $VarExtras = 1 if $KeepScoresVAF or $KeepScoresEAF or $KeepQuals or $KeepScores; # flag to process var file for scores info
+}
+
+# Input Params for gene file
+my $OutputComponents = uc $EnteredParams{component} || $ExpectedParams{component}; # components listed in file in uc
+my $OutputImpacts = uc $EnteredParams{impact} || $ExpectedParams{impact}; # impacts listed in file in uc
+
+# Loading chr nrs, setting up var hash
+my @ChrNames = ('chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8', 'chr9', 'chr10',
+ 'chr11', 'chr12', 'chr13', 'chr14', 'chr15', 'chr16', 'chr17', 'chr18', 'chr19',
+ 'chr20', 'chr21', 'chr22', 'chrX', 'chrY', 'chrM'); # using this array forces the output order of chrs into the correct order
+my %Vars; # hash to store var records in an array for each chr
+foreach my $Chr (@ChrNames) {$Vars{$Chr} = {};} # print "$Chr\t";} # set up hash of hashes, one for each chr
+#print "\n"; #exit;
+
+# Create ouput col header
+if ($FileType eq "V")
+{
+ # 1.x locus ploidy haplotype chromosome begin end varType reference alleleSeq totalScore hapLink xRef
+ # 2.0 locus ploidy allele chromosome begin end varType reference alleleSeq varScoreVAF varScoreEAF varQuality hapLink xRef
+ my @Fields = split "\t", $ColHeader;
+ $ColHeader = join("\t",@Fields[3..8])."\t".$Fields[$XrefField];
+ if ($Version == 2) # 2.0
+ {
+ if ($KeepScoresVAF) {$ColHeader .= "\tvarScoreVAF";}
+ if ($KeepScoresEAF) {$ColHeader .= "\tvarScoreEAF";}
+ if ($KeepQuals) {$ColHeader .= "\tvarQuality";}
+ }
+ else # 1.x
+ {
+ if ($KeepScores) {$ColHeader .= "\ttotalScore";}
+ }
+}
+elsif ($FileType eq "G")
+{
+ # 1.x index locus allele chromosome begin end varType reference call xRef geneId mrnaAcc proteinAcc symbol orientation
+ # component componentIndex codingRegionKnown impact nucleotidePos proteinPos annotationRefSequence sampleSequence genomeRefSequence
+ # 2.0 index locus allele chromosome begin end varType reference call xRef geneId mrnaAcc proteinAcc symbol orientation
+ # component componentIndex hasCodingRegion impact nucleotidePos proteinPos annotationRefSequence sampleSequence genomeRefSequence pfam
+ my @Fields = split "\t", $ColHeader;
+ $ColHeader = join("\t",@Fields[3..30]); # 30 much more than needed
+ $OutputComponents =~ s/\,/\|/g; # create regex string
+ $OutputComponents =~ s/\,| //; # remove extraneous commas, spaces
+ $OutputComponents =~ s/\,| //; # remove extraneous commas, spaces
+ $OutputImpacts =~ s/\,/\|/g; # create regex string
+ $OutputImpacts =~ s/\,| //; # remove extraneous commas, spaces
+ #print "OutputVarTypes: $OutputVarTypes\nOutputComponents: $OutputComponents\nOutputImpacts: $OutputImpacts\n";
+}
+else
+{
+ die "FileType $FileType not understood\n"; # redundant
+}
+
+# Set up Processing Subs
+if ($FileType eq "G") # use gene subs for 'G'
+{
+ *ExtractSub2Use = \&ExtractGeneFields;
+ *AddRecSub2Use = \&AddGeneRec;
+}
+else # use var subs for 'V'
+{
+ *ExtractSub2Use = \&ExtractVarFields;
+ *AddRecSub2Use = \&AddVarRec;
+}
+#print "Gene: ",\&ExtractGeneFields," ",\&AddGeneRec," Var: ",\&ExtractVarFields," ",\&AddVarRec," Using: ",\&ExtractSub2Use," ",\&AddRecSub2Use,"\n"; exit;
+
+# Process Files
+my $RecCount = 0; # total nr recs that are coding/splicing/spanning
+my $FileCount = 0; # keeps track of nr files to use to format eg scores field
+my $XRef;
+foreach my $File (@$FilesIn)
+{
+ print "Processing file $File\n";
+ $FileCount++; # counting nr files
+
+ # Open file, process header, col header
+ my $IN = OpenFile ($File); # open the file with correct file format
+ my $Header = GetHeaderAsString ($IN); # get header
+ unless ($Header) {close $IN; $IN = OpenFile ($$FilesIn[0]);} # if no header, close and reopen file, ie start file again
+ my $ColHeader = <$IN>; # get col header, first remaining line
+ unless ($ColHeader =~ /^>/) {print "Suspect column header for file $File:\n$ColHeader\n;"}
+
+ my $Count = 0; # cnr recs for this file that are coding/splicing/spanning
+ while (<$IN>) # loop through remainder of file ie data
+ {
+ my ($Rec, $Chr, $ScoreVAF, $ScoreEAF, $ScoreQual, $XRef) = ExtractSub2Use ($_, $XrefField); # sub extracts wanted fields in rec as string, chr, other fields optionally
+ next unless $Rec;
+ AddRecSub2Use ($Rec, $Vars{$Chr}, $ScoreVAF, $ScoreEAF, $ScoreQual, $FileCount, $XRef) if $Rec; # only process if rec is not empty
+ $Count++; # increment count of coding/splicing/spanning vars
+ }
+ print "Nr matched records for this file: $Count\n";
+ $RecCount += $Count; # add this file's count to total count
+ close $IN;
+}
+print "Nr matched records across all files:\t $RecCount\n"; # total count
+
+# Open file out, write col header
+print "Sorting and Saving to file $DirectoryOut$FileOut ...\n";
+open my $OUT, ">", $DirectoryOut . $FileOut or die "could not write to $DirectoryOut/$FileOut\n";
+if ($FileType eq "V")
+{
+ print $OUT "variantId\t";
+} # first col header for var file
+else
+{
+ $ColHeader =~ s/\tcall\t/\talleleSeq\t/;
+ print $OUT "index\t";
+} # first col header for gene file
+print $OUT "$ColHeader\n"; # remainder of col header
+
+$RecCount = 0; # reuse total count for nr of non-reduntant vars
+$FileCount--; # reduce by one, used below to add missing delimiters
+foreach my $Chr (@ChrNames) # sort records in each chr array and print with count
+{
+ foreach my $Rec (sort {SortStringsasArrays ($a, $b)} keys %{$Vars{$Chr}}) # using sub to sort on being, end fields
+ {
+ next unless $Rec;
+ $RecCount++; # increment count of coding/splicing/spanning vars
+ print $OUT "$RecCount\t$Rec"; # printing rec and count
+ print $OUT "\t",$Vars{$Chr}->{$Rec}->[4] if $FileType eq "V"; # print xref if var files
+
+ if ($VarExtras)
+ {
+ my $FieldDelimiterCount = () = $Vars{$Chr}->{$Rec}->[3] =~ /:/g;
+ #print "$SpacerCount $Vars{$Chr}->{$Rec}->[1]\n";
+ #exit if $TmpCount++ > 10;
+ my $Addition = ":" x ($FileCount - $FieldDelimiterCount);
+ if ($Version == 2)
+ {
+ print $OUT "\t",$Vars{$Chr}->{$Rec}->[1],$Addition if $KeepScoresVAF;
+ print $OUT "\t",$Vars{$Chr}->{$Rec}->[2],$Addition if $KeepScoresEAF;
+ print $OUT "\t",$Vars{$Chr}->{$Rec}->[3],$Addition if $KeepQuals;
+ }
+ else
+ {
+ print $OUT "\t",$Vars{$Chr}->{$Rec}->[1],$Addition if $KeepScores;
+ }
+ #print $OUT "$Rec\t$Vars{$Chr}->{$Rec}\n"; # printing rec and count
+ }
+ print $OUT "\n";
+ }
+}
+print "Nr saved records:\t $RecCount\n"; # count of non-redundant vars, c/f all vars abovve
+
+
+###########################################################################
+# SUBS #
+###########################################################################
+
+sub GetExpectedParams
+{
+ my %Hash = # hash to store expected params
+ (
+ "file_type" => -1,
+ "input_file" => [],
+ "output_file" => -1,
+ "var_type" => "all",
+ "component" => "ALL",
+ "impact" => "ALL",
+ "scores" => "yes",
+ "scores_eaf" => "yes",
+ "scores_vaf" => "yes",
+ "score_quality" => "yes",
+ );
+ $NrParams = int keys %Hash;
+ return %Hash;
+}
+
+sub GetEnteredParams
+{
+ # Processing @ARGV
+ my %Hash;
+ my @ARGVs = split /--/, join (" ",@ARGV); # split args on --, into array
+ for my $n (1..$#ARGVs) # parse each [nb arg 0 is empty so ignored]
+ {
+ $ARGVs[$n] =~ s/\s+$//; # remove any trailing spaces
+ my ($Key, $Val) = split / /, $ARGVs[$n], 2; # put first element into key, any other elements into val
+ $Key = lc $Key; # make lower case, ie case insensitive
+ if ($Key eq "input_file") # multiple entries expected, setting up array
+ {
+ push @{$Hash{$Key}}, $Val; # add input to input hash
+
+ }
+ else
+ {
+ $Hash{$Key} = $Val; # make a hash entry out of key and val
+ }
+ }
+ return %Hash; # hash now has each --entry param, with associated values
+}
+
+sub OpenFile
+{
+ my $File = shift;
+ my $FH;
+ open $FH, $File;
+ return $FH;
+}
+
+sub OpenFileold
+{
+ my $File = shift;
+ my $FH;
+
+ if ($File =~ /.bz2$/)
+ {
+ open ($FH, "bzcat $File |") or die ("$!: can't open file $File");
+ }
+ elsif ($File =~ /.gz$/)
+ {
+ open ($FH, "gunzip -c $File |") or die ("$!: can't open file $File");
+ }
+ elsif ($File =~ /.tsv$/ or $File =~ /.txt$/)
+ {
+ open ($FH, "cat $File |") or die ("$!: can't open file $File");
+ }
+ else
+ {
+ print ("Do not recognise file type for file $File.\nOpening as text file\n");
+ open ($FH, "cat $File |") or die ("$!: can't open file $File");
+ }
+ return $FH;
+}
+
+sub GetHeaderAsString
+{
+ my $FH = shift;
+ my $Header = "";
+ my $Count = 0;
+ while (<$FH>) # loop until a line is empty
+ {
+ chomp;
+ if ($_ eq "") # exit when empty line
+ {
+ return $Header ; # return ref to array
+ }
+ else
+ {
+ $Header .= $_;
+ }
+ return "" if $Count++ > 50; # too many lines for a header, must be no header, return empty array
+ }
+}
+
+sub GetVersion
+{
+ my $Header = shift;
+ my $ColHeader = shift;
+
+ my $Version = 0; # need to know if it is 1.x or 2.x
+ my $XrefField = -1;
+
+ if ($FileType eq "V")
+ {
+ if ($Header)
+ {
+ $Header =~ /#FORMAT_VERSION\t(\d)/;
+ if ($1 == 1) {$Version = 1; $XrefField = 11;}
+ elsif ($1 == 2) {$Version = 2; $XrefField = 13;}
+ else {print "Warning: Format Version not found in Header\n";} # not in header
+ }
+ unless ($Version)
+ {
+ my @ColHeader = split /\t/, $ColHeader;
+ for my $n (0..int(@ColHeader)-1)
+ {
+ if ($ColHeader eq "xRef")
+ {
+ $XrefField = $n;
+ if ($n == 11) {$Version = 1;}
+ elsif ($n == 13) {$Version = 2;}
+ last;
+ }
+ }
+ }
+ }
+ return ($Version, $XrefField);
+}
+
+sub ExtractGeneFields # expects a gene file rec, strips out file specific fields, gets chr
+{
+ my $Rec = shift;
+ # gene fields
+ # >index locus allele chromosome begin end varType reference call xRef geneId mrnaAcc
+ # proteinAcc symbol orientation component componentIndex hasCodingRegion impact
+ # nucleotidePos proteinPos annotationRefSequence sampleSequence genomeRefSequence pfam
+
+ chomp $Rec; # remove return
+ my @Fields = split "\t", $Rec;
+ #print "$Fields[6] $Fields[15] $Fields[18]\t$OutputVarTypes $OutputComponents $OutputImpacts\n";
+ unless ($OutputVarTypes eq "all" or $Fields[6] =~ /$OutputVarTypes/) {return ("","");} # nominated vals not found, leave
+ unless ($OutputComponents eq "ALL" or $Fields[15] =~ /$OutputComponents/) {return ("","");} # nominated vals not found, leave
+ unless ($OutputImpacts eq "ALL" or $Fields[18] =~ /$OutputImpacts/) {return ("","");} # nominated vals not found, leave
+ my $Chr = $Fields[3]; # assign chr
+ $Rec = join("\t",@Fields[3..24]);
+ #$Rec =~ s/\t$//; # remove trailing tab if there is one
+
+ return ($Rec, $Chr);
+}
+
+sub ExtractVarFields # expects a gene file rec, strips out file specific fields, gets chr
+{
+ my $Rec = shift;
+ my $XrefField = shift;
+ # var fields
+ # 1.x locus ploidy haplotype chromosome begin end varType reference alleleSeq totalScore hapLink xRef
+ # 2.0 locus ploidy allele chromosome begin end varType reference alleleSeq varScoreVAF varScoreEAF varQuality hapLink xRef
+
+ chomp $Rec; # remove return
+ my @Fields = split "\t", $Rec;
+ #print "$Fields[6] $OutputVarTypes \n"; exit;
+ unless ($OutputVarTypes eq "all" or $Fields[6] =~ /$OutputVarTypes/) {return ("","");} # nominated vals not found, leave
+ my $Chr = $Fields[3]; # assign chr
+ #$Rec = join("\t",@Fields[3..8]);
+ $Rec = join("\t",@Fields[3..8]);
+ #$Rec =~ s/\t$//; # remove trailing tab if there is one
+
+ if ($VarExtras)
+ {
+ return ($Rec, $Chr, $Fields[9], $Fields[10], $Fields[11], $Fields[$XrefField]) if $Version == 2;
+ return ($Rec, $Chr, $Fields[9], $Fields[$XrefField]) # $Version == 1;
+ }
+ else
+ {
+ return ($Rec, $Chr);
+ }
+}
+
+sub AddVarRec
+{
+ my $Rec = shift;
+ my $RecHash = shift;
+ my $ScoreVAF = shift;
+ my $ScoreEAF = shift;
+ my $ScoreQual = shift;
+ my $FileCount = shift;
+ my $XRef = shift;
+
+ if ($VarExtras) # need to extract scores information
+ {
+ # locus ploidy allele chromosome begin end varType reference alleleSeq varScoreVAF varScoreEAF varQuality hapLink xRef
+ # Set delimiter
+ my $Delimiter;
+ if ($RecHash->{$Rec}) # hash entry for this var already exists
+ {
+ if ($RecHash->{$Rec}->[0] == $FileCount)
+ {
+ $Delimiter = "|"; # same chr, var is hom, use |
+ }
+ else # diff chr, use :
+ {
+ my $FieldDelimiterCount = () = $RecHash->{$Rec}->[3] =~ /:/g; # count nr field delims
+ $Delimiter = ":" x ($FileCount - $FieldDelimiterCount - 1); # delimiters for any processed files, that didnt have this var
+ }
+ $RecHash->{$Rec}->[4] = $XRef if length $XRef > length $RecHash->{$Rec}->[4]; # replace xref if new xref is longer
+ }
+ else # new var
+ {
+ $RecHash->{$Rec} = []; # create array to hold it
+ $Delimiter = ":" x ($FileCount - 1); # delimiters for prev processed files, that didnt have this var
+ $RecHash->{$Rec}->[4] = $XRef; # add xref
+ }
+
+ # Process var
+ $RecHash->{$Rec}->[0] = $FileCount;
+ if ($Version == 2)
+ {
+ $RecHash->{$Rec}->[1] .= $Delimiter.$ScoreVAF; # add delimiter, varScoreVAF
+ $RecHash->{$Rec}->[2] .= $Delimiter.$ScoreEAF; # add delimiter, varScoreVAF
+ $RecHash->{$Rec}->[3] .= $Delimiter.($ScoreQual eq "VQHIGH" ? "H" : "L"); # add delimiter, qual
+ }
+ else
+ {
+ $RecHash->{$Rec}->[1] .= $Delimiter.$ScoreVAF; # add delimiter, totalScore
+ }
+
+ }
+ else # just the rec, no var extras being extrcted
+ {
+ $RecHash->{$Rec}++; # hash with rec as key, increment count for this key
+ $RecHash->{$Rec}->[4] = $XRef if length $XRef > length $RecHash->{$Rec}->[4]; # replace xref if new xref is longer, wasting space here
+ }
+}
+
+sub AddGeneRec
+{
+ my $Rec = shift;
+ my $RecHash = shift;
+
+ $RecHash->{$Rec}++; # hash with rec as key, increment count for this key
+}
+
+sub SortStringsasArrays # sorts based on begin and end of two recs
+{
+ my $String1 = shift; # first string
+ my $String2 = shift; # second string
+
+ my @Array1 = split "\t", $String1; # put fields into array
+ my @Array2 = split "\t", $String2;
+
+ # array[1] is begin, array[2] is end, returning order based on these fields
+ if ($Array1[1] < $Array2[1]) # begin of 1 < begin of 2
+ {
+ return -1;
+ }
+ elsif ($Array1[1] == $Array2[1]) # begin of 1 == begin of 2
+ {
+ if ($Array1[2] < $Array2[2]) # end of 1 < end of 2
+ {
+ return -1;
+ }
+ elsif ($Array1[2] == $Array2[2]) # end of 1 == end of 2
+ {
+ return 0;
+ }
+ else # end of 1 > end of 2
+ {
+ return 1;
+ }
+ }
+ else # begin of 1 > begin of 2
+ {
+ return 1;
+ }
+
+}