# HG changeset patch # User bcrain-completegenomics # Date 1339016306 14400 # Node ID 96829b1b73ea0fe54678c16aaf7fea76c2d0b4b2 # Parent e4eff539a999c015e682071e37810ec773ededeb Uploaded diff -r e4eff539a999 -r 96829b1b73ea cgatools_suite/.DS_Store Binary file cgatools_suite/.DS_Store has changed diff -r e4eff539a999 -r 96829b1b73ea cgatools_suite/._datatypes_conf.xml Binary file cgatools_suite/._datatypes_conf.xml has changed diff -r e4eff539a999 -r 96829b1b73ea cgatools_suite/._tool_config.xml Binary file cgatools_suite/._tool_config.xml has changed diff -r e4eff539a999 -r 96829b1b73ea cgatools_suite/._tool_data_table_conf.xml Binary file cgatools_suite/._tool_data_table_conf.xml has changed diff -r e4eff539a999 -r 96829b1b73ea cgatools_suite/datatypes_conf.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cgatools_suite/datatypes_conf.xml Wed Jun 06 16:58:26 2012 -0400 @@ -0,0 +1,19 @@ + + + + + + + + + + + + + + + + + diff -r e4eff539a999 -r 96829b1b73ea cgatools_suite/lib/galaxy/datatypes/._completegenomics.py Binary file cgatools_suite/lib/galaxy/datatypes/._completegenomics.py has changed diff -r e4eff539a999 -r 96829b1b73ea cgatools_suite/lib/galaxy/datatypes/completegenomics.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cgatools_suite/lib/galaxy/datatypes/completegenomics.py Wed Jun 06 16:58:26 2012 -0400 @@ -0,0 +1,71 @@ +""" +Complete Genomics datatypes +Birgit Crain - Complete Genomics, Inc +""" + +import pkg_resources +pkg_resources.require( "bx-python" ) + +import logging +from galaxy.datatypes import data +from galaxy import util +from cgi import escape +from galaxy.datatypes import metadata +from galaxy.datatypes import tabular +from galaxy.datatypes.metadata import MetadataElement +from galaxy.datatypes.tabular import Tabular +import galaxy_utils.sequence.vcf +from galaxy.datatypes.sniff import * + +log = logging.getLogger(__name__) + +class CG_Var( Tabular ): + file_ext = 'cg_var' + def __init__(self, **kwd): + """Initialize CG_Var datatype""" + Tabular.__init__( self, **kwd ) + self.column_names = ['locus', 'ploidy', 'allele', 'chromosome', 'begin', 'end', + 'varType', 'reference', 'alleleSeq', 'varScoreVAF', + 'varScoreEAF', 'varQuality', 'hapLink', 'xRef' + ] + def display_peek( self, dataset ): + """Returns formated html of peek""" + return Tabular.make_html_table( self, dataset, column_names=self.column_names ) + +class CG_MasterVar( Tabular ): + file_ext = 'cg_mastervar' + def __init__(self, **kwd): + """Initialize CG_MasterVar datatype""" + Tabular.__init__( self, **kwd ) + self.column_names = ['locus', 'ploidy', 'chromosome', 'begin', 'end', 'zygosity', + 'varType', 'reference', 'allele1Seq', 'allele2Seq', + 'allele1VarScoreVAF', 'allele2VarScoreVAF', 'allele1VarScoreEAF', + 'allele2VarScoreEAF', 'allele1VarQuality', 'allele2VarQuality', + 'allele1HapLink', 'allele2HapLink', 'allele1XRef', 'allele2XRef', + 'evidenceIntervalId', 'allele1ReadCount', 'allele2ReadCount', + 'referenceAlleleRead', 'totalReadCount', 'allele1Gene', + 'allele2Gene pfam', 'miRBaseId', 'repeatMasker', 'segDupOverlap', + 'relativeCoverageDiploid', 'calledPloidy', + 'relativeCoverageNondiploid', 'calledLevel' + ] + + def display_peek( self, dataset ): + """Returns formated html of peek""" + return Tabular.make_html_table( self, dataset, column_names=self.column_names ) + +class CG_Gene( Tabular ): + file_ext = 'cg_gene' + def __init__(self, **kwd): + """Initialize CG_Gene datatype""" + Tabular.__init__( self, **kwd ) + self.column_names = ['index', 'locus', 'allele', 'chromosome', 'begin', 'end', + 'varType', 'reference', 'call', 'xRef', 'geneId', + 'mrnaAcc', 'proteinAcc', 'symbol', 'orientation', 'component', + 'componentIndex', 'hasCodingRegion', 'impact', 'nucleotidePos', + 'proteinPos', 'annotationRefSequence', 'sampleSequence', + 'genomeRefSequence', 'pfam' + ] + + def display_peek( self, dataset ): + """Returns formated html of peek""" + return Tabular.make_html_table( self, dataset, column_names=self.column_names ) diff -r e4eff539a999 -r 96829b1b73ea cgatools_suite/tool-data/._cg_crr_files.loc.sample Binary file cgatools_suite/tool-data/._cg_crr_files.loc.sample has changed diff -r e4eff539a999 -r 96829b1b73ea cgatools_suite/tool-data/cg_crr_files.loc.sample --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cgatools_suite/tool-data/cg_crr_files.loc.sample Wed Jun 06 16:58:26 2012 -0400 @@ -0,0 +1,11 @@ +#This is a sample file distributed with Galaxy that enables tools +#to use .crr reference files. You will need to download or create +#the .crr reference files and then create a cg_crr_files.loc file +#similar to this one (store it in this directory) that points to +#the location of the files. The cg_crr_files.loc +#file has this format (white space characters are TAB characters): +# +# +# +#hg19 hg19 hg19.crr /Users/bcrain/Documents/hg19.crr + diff -r e4eff539a999 -r 96829b1b73ea cgatools_suite/tool_config.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cgatools_suite/tool_config.xml Wed Jun 06 16:58:26 2012 -0400 @@ -0,0 +1,20 @@ + + + +

+ + + + + + + + + + + + +

+ \ No newline at end of file diff -r e4eff539a999 -r 96829b1b73ea cgatools_suite/tool_data_table_conf.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cgatools_suite/tool_data_table_conf.xml Wed Jun 06 16:58:26 2012 -0400 @@ -0,0 +1,12 @@ + + + + + value, dbkey, name, path + +

+ + + diff -r e4eff539a999 -r 96829b1b73ea cgatools_suite/tools/cgatools/.DS_Store Binary file cgatools_suite/tools/cgatools/.DS_Store has changed diff -r e4eff539a999 -r 96829b1b73ea cgatools_suite/tools/cgatools/._calldiff.xml Binary file cgatools_suite/tools/cgatools/._calldiff.xml has changed diff -r e4eff539a999 -r 96829b1b73ea cgatools_suite/tools/cgatools/._join.xml Binary file cgatools_suite/tools/cgatools/._join.xml has changed diff -r e4eff539a999 -r 96829b1b73ea cgatools_suite/tools/cgatools/._junctiondiff.xml Binary file cgatools_suite/tools/cgatools/._junctiondiff.xml has changed diff -r e4eff539a999 -r 96829b1b73ea cgatools_suite/tools/cgatools/._listtestvariants.xml Binary file cgatools_suite/tools/cgatools/._listtestvariants.xml has changed diff -r e4eff539a999 -r 96829b1b73ea cgatools_suite/tools/cgatools/._listvariants.xml Binary file cgatools_suite/tools/cgatools/._listvariants.xml has changed diff -r e4eff539a999 -r 96829b1b73ea cgatools_suite/tools/cgatools/._snpdiff.xml Binary file cgatools_suite/tools/cgatools/._snpdiff.xml has changed diff -r e4eff539a999 -r 96829b1b73ea cgatools_suite/tools/cgatools/._testing.pl Binary file cgatools_suite/tools/cgatools/._testing.pl has changed diff -r e4eff539a999 -r 96829b1b73ea cgatools_suite/tools/cgatools/._testvariants.xml Binary file cgatools_suite/tools/cgatools/._testvariants.xml has changed diff -r e4eff539a999 -r 96829b1b73ea cgatools_suite/tools/cgatools/._varfilter.xml Binary file cgatools_suite/tools/cgatools/._varfilter.xml has changed diff -r e4eff539a999 -r 96829b1b73ea cgatools_suite/tools/cgatools/._varfilter_wrapper.pl Binary file cgatools_suite/tools/cgatools/._varfilter_wrapper.pl has changed diff -r e4eff539a999 -r 96829b1b73ea cgatools_suite/tools/cgatools/calldiff.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cgatools_suite/tools/cgatools/calldiff.xml Wed Jun 06 16:58:26 2012 -0400 @@ -0,0 +1,343 @@ + + + compares two Complete Genomics variant files. + + + cgatools + + + + cgatools calldiff --beta + --reference ${crr.fields.path} + --variantsA $data_sources.inputA + --variantsB $data_sources.inputB + $validation + $diploid + --locus-stats-column-count $column + --max-hypothesis-count $hypothesis + --output-prefix cg_ + --reports `echo ${report1} ${report2} ${report3} ${report4} ${report5} ${somatic.report6} | sed 's/ */,/g'` + #if $somatic.report6 == "SomaticOutput" + --genome-rootA $somatic.genomeA + --genome-rootB $somatic.genomeB + --calibration-root $somatic.calibration + #end if + + + + + (report1 == 'SuperlocusOutput') + + + (report2 == 'SuperlocusStats') + + + (report3 == 'LocusOutput') + + + (report4 == 'LocusStats') + + + (report5 == 'VariantOutput') + + + (report5 == 'VariantOutput') + + + (somatic['report6'] == 'SomaticOutput') + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +**What it does** + +This tool compares two Complete Genomics variant files. + +cgatools: http://sourceforge.net/projects/cgatools/files/ + +----- + +**cgatools Manual**:: + + COMMAND NAME + calldiff - Compares two Complete Genomics variant files. + + DESCRIPTION + Compares two Complete Genomics variant files. Divides the genome up into + superloci of nearby variants, then compares the superloci. Also refines the + comparison to determine per-call or per-locus comparison results. + + Comparison results are usually described by a semi-colon separated string, + one per allele. Each allele's comparison result is one of the following + classifications: + + ref-identical The alleles of the two variant files are identical, and + they are consistent with the reference. + alt-identical The alleles of the two variant files are identical, and + they are inconsistent with the reference. + ref-consistent The alleles of the two variant files are consistent, + and they are consistent with the reference. + alt-consistent The alleles of the two variant files are consistent, + and they are inconsistent with the reference. + onlyA The alleles of the two variant files are inconsistent, + and only file A is inconsistent with the reference. + onlyB The alleles of the two variant files are inconsistent, + and only file B is inconsistent with the reference. + mismatch The alleles of the two variant files are inconsistent, + and they are both inconsistent with the reference. + phase-mismatch The two variant files would be consistent if the + hapLink field had been empty, but they are + inconsistent. + ploidy-mismatch The superlocus did not have uniform ploidy. + + In some contexts, this classification is rolled up into a simplified + classification, which is one of "identical", "consistent", "onlyA", + "onlyB", or "mismatch". + + A good place to start looking at the results is the superlocus-output file. + It has columns defined as follows: + + SuperlocusId An identifier given to the superlocus. + Chromosome The name of the chromosome. + Begin The 0-based offset of the start of the superlocus. + End The 0-based offset of the base one past the end of the + superlocus. + Classification The match classification of the superlocus. + Reference The reference sequence. + AllelesA A semicolon-separated list of the alleles (one per + haplotype) for variant file A, for the phasing with the + best comparison result. + AllelesB A semicolon-separated list of the alleles (one per + haplotype) for variant file B, for the phasing with the + best comparison result. + + The locus-output file contains, for each locus in file A and file B that is + not consistent with the reference, an annotated set of calls for the locus. + The calls are annotated with the following columns: + + SuperlocusId The id of the superlocus containing the locus. + File The variant file (A or B). + LocusClassification The locus classification is determined by the + varType column of the call that is inconsistent + with the reference, concatenated with a + modifier that describes whether the locus is + heterozygous, homozygous, or contains no-calls. + If there is no one variant in the locus (i.e., + it is heterozygous alt-alt), the locus + classification begins with "other". + LocusDiffClassification The match classification for the locus. This is + defined to be the best of the comparison of the + locus to the same region in the other file, or + the comparison of the superlocus. + + The somatic output file contains a list of putative somatic variations of + genome A. The output includes only those loci that can be classified as + snp, del, ins or sub in file A, and are called reference in the file B. + Every locus is annotated with the following columns: + + VarCvgA The totalReadCount from file A for this locus + (computed on the fly if file A is not a + masterVar file). + VarScoreA The varScoreVAF from file A, or varScoreEAF if + the "--diploid" option is used. + RefCvgB The maximum of the uniqueSequenceCoverage + values for the locus in genome B. + RefScoreB Minimum of the reference scores of the locus in + genome B. + SomaticCategory The category used for determining the + calibrated scores and the SomaticRank. + VarScoreACalib The calibrated variant score of file A, under + the model selected by using or not using the + "--diploid" option, and corrected for the count + of heterozygous variants observed in this + genome. See user guide for more information. + VarScoreBCalib The calibrated reference score of file B, under + the model selected by using or not using the + "--diploid" option, and corrected for the count + of heterozygous variants observed in this + genome. See user guide for more information. + SomaticRank The estimated rank of this somatic mutation, + amongst all true somatic mutations within this + SomaticCategory. The value is a number between + 0 and 1; a value of 0.012 means, for example, + that an estimated 1.2% of the true somatic + mutations in this somaticCategory have a + somaticScore less than the somaticScore for + this mutation. See user guide for more + information. + SomaticScore An integer that provides a total order on + quality for all somatic mutations. It is equal + to -10*log10( P(false)/P(true) ), under the + assumption that this genome has a rate of + somatic mutation equal to 1/Mb for + SomaticCategory snp, 1/10Mb for SomaticCategory + ins, 1/10Mb for SomaticCategory del, and 1/20Mb + for SomaticCategory sub. The computation is + based on the assumptions described in the user + guide, and is affected by choice of variant + model selected by using or not using the + "--diploid" option. + SomaticQuality Equal to VQHIGH for all somatic mutations where + SomaticScore >= -10. Otherwise, this column is + empty. + + OPTIONS + -h [ --help ] + Print this help message. + + --reference arg + The input crr file. + + --variantsA arg + The "A" input variant file. + + --variantsB arg + The "B" input variant file. + + --output-prefix arg + The path prefix for all output reports. + + --reports arg (=SuperlocusOutput,SuperlocusStats,LocusOutput,LocusStats) + Comma-separated list of reports to generate. (Beware any reports whose + name begins with "Debug".) A report is one of: + SuperlocusOutput Report for superlocus classification. + SuperlocusStats Report for superlocus classification stats. + LocusOutput Report for locus classification. + LocusStats Report for locus stats. + VariantOutput Both variant files annotated by comparison + results.If the somatic output report is + requested, file A is also annotated with the + same score ranks as produced in that report. + SomaticOutput Report for the list of simple variations that + are present only in file "A", annotated with + the score that indicates the probability of + the variation being truly somatic. Requires + beta, genome-rootA, and genome-rootB options + to be provided as well. Note: generating this + report slows calldiff by 10x-20x. + DebugCallOutput Report for call classification. + DebugSuperlocusOutput Report for debug superlocus information. + DebugSomaticOutput Report for distribution estimates used for + somatic rescoring. Only produced if + SomaticOutput is also turned on. + + --diploid + Uses varScoreEAF instead of varScoreVAF in somatic score computations. + Also, uses diploid variant model instead of variable allele mixture + model. + + --locus-stats-column-count arg (=15) + The number of columns for locus compare classification in the locus + stats file. + + --max-hypothesis-count arg (=32) + The maximum number of possible phasings to consider for a superlocus. + + --no-reference-cover-validation + Turns off validation that all bases of a chromosome are covered by + calls of the variant file. + + --genome-rootA arg + The "A" genome directory, for example /data/GS00118-DNA_A01; this + directory is expected to contain ASM/REF and ASM/EVIDENCE + subdirectories. + + --genome-rootB arg + The "B" genome directory. + + --calibration-root arg + The directory containing calibration data. For example, there should + exist a file calibration-root/0.0.0/metrics.tsv. + + --beta + This flag enables the SomaticOutput report, which is beta + functionality. + + SUPPORTED FORMAT_VERSION + 0.3 or later + + diff -r e4eff539a999 -r 96829b1b73ea cgatools_suite/tools/cgatools/join.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cgatools_suite/tools/cgatools/join.xml Wed Jun 06 16:58:26 2012 -0400 @@ -0,0 +1,157 @@ + + + two tsv files based on equal fields or overlapping regions. + + + cgatools + + + + cgatools join --beta + --input $input1 + --input $input2 + --output $output + --output-mode $outmode + $dump + --select $col + #for $m in $matched + --match ${m.match} + #end for + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +**What it does** + +This tool joins two tab-delimited files based on equal fields or overlapping regions. + +cgatools: http://sourceforge.net/projects/cgatools/files/ + +----- + +**cgatools Manual**:: + + COMMAND NAME + join - Joins two tab-delimited files based on equal fields or overlapping regions. + + DESCRIPTION + Joins two tab-delimited files based on equal fields or overlapping regions. + By default, an output record is produced for each match found between file + A and file B, but output format can be controlled by the --output-mode + parameter. + + OPTIONS + -h [ --help ] + Print this help message. + + --beta + This is a beta command. To run this command, you must pass the --beta + flag. + + --input arg + File name to use as input (may be passed in as arguments at the end of + the command), or omitted for stdin). There must be exactly two input + files to join. If only one file is specified by name, file A is taken + to be stdin and file B is the named file. File B is read fully into + memory, and file A is streamed. File A's columns appear first in the + output. + + --output arg (=STDOUT) + The output file name (may be omitted for stdout). + + --match arg + A match specification, which is a column from A and a column from B + separated by a colon. + + --overlap arg + + -m [ --output-mode ] arg (=full) + Output mode, one of the following: + full Print an output record for each match found between + file A and file B. + compact Print at most one record for each record of file A, + joining the file B values by a semicolon and + suppressing repeated B values and empty B values. + compact-pct Same as compact, but for each distinct B value, + annotate with the percentage of the A record that is + overlapped by B records with that B value. Percentage + is rounded up to nearest integer. + + --overlap-mode arg (=strict) + Overlap mode, one of the following: + strict Range A and B overlap if A.begin < B.end and + B.begin < A.end. + allow-abutting-points Range A and B overlap they meet the strict + requirements, or if A.begin <= B.end and + B.begin <= A.end and either A or B has zero + length. + + --select arg (=A.*,B.*) + Set of fields to select for output. + + -a [ --always-dump ] + Dump every record of A, even if there are no matches with file B. + + --overlap-fraction-A arg (=0) + Minimum fraction of A region overlap for filtering output. + + --boundary-uncertainty-A arg (=0) + Boundary uncertainty for overlap filtering. Specifically, records + failing the following predicate are filtered away: overlap >= + overlap-fraction-A * ( A-range-length - boundary-uncertainty-A ) + + --overlap-fraction-B arg (=0) + Minimum fraction of B region overlap for filtering output. + + --boundary-uncertainty-B arg (=0) + Boundary uncertainty for overlap filtering. Specifically, records + failing the following predicate are filtered away: overlap >= + overlap-fraction-B * ( B-range-length - boundary-uncertainty-B ) + + SUPPORTED FORMAT_VERSION + Any + + diff -r e4eff539a999 -r 96829b1b73ea cgatools_suite/tools/cgatools/junctiondiff.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cgatools_suite/tools/cgatools/junctiondiff.xml Wed Jun 06 16:58:26 2012 -0400 @@ -0,0 +1,88 @@ + + + reports difference between junction calls + + + cgatools + + + + cgatools junctiondiff --beta -h + + + + + + + + + + + +**What it does** + +This tool reports difference between junction calls of Complete Genomics junctions files + +cgatools: http://sourceforge.net/projects/cgatools/files/ + +----- + +**cgatools Manual**:: + + COMMAND NAME + junctiondiff - Reports difference between junction calls of Complete Genomics junctions files. + + DESCRIPTION + junctiondiff takes two junction files A and B as input and produces the + following output: + - "diff-inputFileName" - the junctions from an input file A that are not + present in input file B. + - "report.txt" - a brief summary report (if --statout is used) + + Two junctions are considered equivalent if: + - they come from different files + - left and right positions of one junction are not more than "--distance" + bases apart from the corresponding positions of another junction + - the junction scores are equal or above the scoreThreshold + - they are on the same strands + + OPTIONS + -h [ --help ] + Print this help message. + + --beta + This is a beta command. To run this command, you must pass the --beta + flag. + + -s [ --reference ] arg + Reference file. + + -a [ --junctionsA ] arg + input junction file A. + + -b [ --junctionsB ] arg + input junction file B. + + -A [ --scoreThresholdA ] arg (=10) + score threshold value for the input file A. + + -B [ --scoreThresholdB ] arg (=0) + score threshold value for the input file B. + + -d [ --distance ] arg (=200) + Max distance between coordinates of potentially compatible junctions. + + -l [ --minlength ] arg (=500) + Minimum deletion junction length to be included into the difference + file. + + -o [ --output-prefix ] arg + The path prefix for all the output reports. + + -S [ --statout ] + (Debug) Report various input file statistics. Experimental feature. + + SUPPORTED FORMAT_VERSION + 1.5 or later + + diff -r e4eff539a999 -r 96829b1b73ea cgatools_suite/tools/cgatools/listtestvariants.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cgatools_suite/tools/cgatools/listtestvariants.xml Wed Jun 06 16:58:26 2012 -0400 @@ -0,0 +1,239 @@ + + + + + + + cgatools + + + + cgatools listvariants + --beta + --reference ${crr.fields.path} + --output $output1 + #if $include_list.listing == "yes" + --variant-listing $include_list.list + #end if + $longvar + --variants + #if $file_types.data_sources.data_source == "in" + #for $v in $file_types.data_sources.varfiles + ${v.input} + #end for + #else + `cat $file_types.data_sources.varlist` + #end if + ; + + cgatools testvariants + --beta + --reference ${crr.fields.path} + --output $output2 + --input $output1 + --variants + #if $file_types.data_sources.data_source == "in" + #for $v in $file_types.data_sources.varfiles + ${v.input} + #end for + #else + `cat $file_types.data_sources.varlist` + #end if + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +**What it does** + +This tool uses the cgatools testvariants to test variant or mastervar files for the presence of variants. + +cgatools: http://sourceforge.net/projects/cgatools/files/ + +----- + +**cgatools Manual**:: + + COMMAND NAME + listvariants - Lists the variants present in a variant file. + + DESCRIPTION + Lists all called variants present in the specified variant files, in a + format suitable for processing by the testvariants command. The output is a + tab-delimited file consisting of the following columns: + + variantId Sequential id assigned to each variant. + chromosome The chromosome of the variant. + begin 0-based reference offset of the beginning of the variant. + end 0-based reference offset of the end of the variant. + varType The varType as extracted from the variant file. + reference The reference sequence. + alleleSeq The variant allele sequence as extracted from the variant + file. + xRef The xRef as extrated from the variant file. + + OPTIONS + -h [ --help ] + Print this help message. + + --beta + This is a beta command. To run this command, you must pass the --beta + flag. + + --reference arg + The reference crr file. + + --output arg (=STDOUT) + The output file (may be omitted for stdout). + + --variants arg + The input variant files (may be positional args). + + --variant-listing arg + The output of another listvariants run, to be merged in to produce the + output of this run. + + --list-long-variants + In addition to listing short variants, list longer variants as well + (10's of bases) by concatenating nearby calls. + + SUPPORTED FORMAT_VERSION + 0.3 or later + + + + COMMAND NAME + testvariants - Tests variant files for presence of variants. + + DESCRIPTION + Tests variant files for presence of variants. The output is a tab-delimited + file consisting of the columns of the input variants file, plus a column + for each assembly results file that contains a character code for each + allele. The character codes have meaning as follows: + + 0 This allele of this genome is consistent with the reference at this + locus but inconsistent with the variant. + 1 This allele of this genome has the input variant at this locus. + N This allele of this genome has no-calls but is consistent with the + input variant. + + OPTIONS + -h [ --help ] + Print this help message. + + --beta + This is a beta command. To run this command, you must pass the --beta + flag. + + --reference arg + The reference crr file. + + --input arg (=STDIN) + The input variants to test for. + + --output arg (=STDOUT) + The output file (may be omitted for stdout). + + --variants arg + The input variant files (may be passed in as arguments at the end of + the command). + + SUPPORTED FORMAT_VERSION + 0.3 or later + + diff -r e4eff539a999 -r 96829b1b73ea cgatools_suite/tools/cgatools/listvariants.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cgatools_suite/tools/cgatools/listvariants.xml Wed Jun 06 16:58:26 2012 -0400 @@ -0,0 +1,177 @@ + + + + lists all called variants + + + cgatools + + + + cgatools listvariants + --beta + --reference ${crr.fields.path} + --output $output + #if $include_list.listing == "yes" + --variant-listing $include_list.list + #end if + $longvar + --variants + #if $file_types.data_sources.data_source == "in" + #for $v in $file_types.data_sources.varfiles + ${v.input} + #end for + #else + `cat $file_types.data_sources.varlist` + #end if + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +**What it does** + +This tool uses the cgatools listvariants to list all called variants present in the var or mastervar files. + +cgatools: http://sourceforge.net/projects/cgatools/files/ + +----- + +**cgatools Manual**:: + + COMMAND NAME + listvariants - Lists the variants present in a variant file. + + DESCRIPTION + Lists all called variants present in the specified variant files, in a + format suitable for processing by the testvariants command. The output is a + tab-delimited file consisting of the following columns: + + variantId Sequential id assigned to each variant. + chromosome The chromosome of the variant. + begin 0-based reference offset of the beginning of the variant. + end 0-based reference offset of the end of the variant. + varType The varType as extracted from the variant file. + reference The reference sequence. + alleleSeq The variant allele sequence as extracted from the variant + file. + xRef The xRef as extrated from the variant file. + + OPTIONS + -h [ --help ] + Print this help message. + + --beta + This is a beta command. To run this command, you must pass the --beta + flag. + + --reference arg + The reference crr file. + + --output arg (=STDOUT) + The output file (may be omitted for stdout). + + --variants arg + The input variant files (may be positional args). + + --variant-listing arg + The output of another listvariants run, to be merged in to produce the + output of this run. + + --list-long-variants + In addition to listing short variants, list longer variants as well + (10's of bases) by concatenating nearby calls. + + SUPPORTED FORMAT_VERSION + 0.3 or later + + diff -r e4eff539a999 -r 96829b1b73ea cgatools_suite/tools/cgatools/snpdiff.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cgatools_suite/tools/cgatools/snpdiff.xml Wed Jun 06 16:58:26 2012 -0400 @@ -0,0 +1,116 @@ + + + compares snp calls to a Complete Genomics variant file. + + + cgatools + + + + cgatools snpdiff --beta -h + + + + + + + + + + + +**What it does** + +This tool ompares snp calls to a Complete Genomics variant file. + +cgatools: http://sourceforge.net/projects/cgatools/files/ + +----- + +**cgatools Manual**:: + + COMMAND NAME + snpdiff - Compares snp calls to a Complete Genomics variant file. + + DESCRIPTION + Compares the snp calls in the "genotypes" file to the calls in a Complete + Genomics variant file. The genotypes file is a tab-delimited file with at + least the following columns (additional columns may be given): + + Chromosome (Required) The name of the chromosome. + Offset0Based (Required) The 0-based offset in the chromosome. + GenotypesStrand (Optional) The strand of the calls in the Genotypes + column (+ or -, defaults to +). + Genotypes (Optional) The calls, one per allele. The following + calls are recognized: + A,C,G,T A called base. + N A no-call. + - A deleted base. + . A non-snp variation. + + The output is a tab-delimited file consisting of the columns of the + original genotypes file, plus the following additional columns: + + Reference The reference base at the given position. + VariantFile The calls made by the variant file, one per allele. + The character codes are the same as is described for + the Genotypes column. + DiscordantAlleles (Only if Genotypes is present) The number of + Genotypes alleles that are discordant with calls in + the VariantFile. If the VariantFile is described as + haploid at the given position but the Genotypes is + diploid, then each genotype allele is compared + against the haploid call of the VariantFile. + NoCallAlleles (Only if Genotypes is present) The number of + Genotypes alleles that were no-called by the + VariantFile. If the VariantFile is described as + haploid at the given position but the Genotypes is + diploid, then a VariantFile no-call is counted twice. + + The verbose output is a tab-delimited file consisting of the columns of the + original genotypes file, plus the following additional columns: + + Reference The reference base at the given position. + VariantFile The call made by the variant file for one allele (there is + a line in this file for each allele). The character codes + are the same as is described for the Genotypes column. + [CALLS] The rest of the columns are pasted in from the VariantFile, + describing the variant file line used to make the call. + + The stats output is a comma-separated file with several tables describing + the results of the snp comparison, for each diploid genotype. The tables + all describe the comparison result (column headers) versus the genotype + classification (row labels) in different ways. The "Locus classification" + tables have the most detailed match classifications, while the "Locus + concordance" tables roll these match classifications up into "discordance" + and "no-call". A locus is considered discordant if it is discordant for + either allele. A locus is considered no-call if it is concordant for both + alleles but has a no-call on either allele. The "Allele concordance" + describes the comparison result on a per-allele basis. + + OPTIONS + -h [ --help ] + Print this help message. + + --reference arg + The input crr file. + + --variants arg + The input variant file. + + --genotypes arg + The input genotypes file. + + --output-prefix arg + The path prefix for all output reports. + + --reports arg (=Output,Verbose,Stats) + Comma-separated list of reports to generate. A report is one of: + Output The output genotypes file. + Verbose The verbose output file. + Stats The stats output file. + + SUPPORTED FORMAT_VERSION + 0.3 or later + + diff -r e4eff539a999 -r 96829b1b73ea cgatools_suite/tools/cgatools/testing.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cgatools_suite/tools/cgatools/testing.pl Wed Jun 06 16:58:26 2012 -0400 @@ -0,0 +1,10 @@ +#!/usr/bin/perl + +print "$0 @ARGV\n"; +open OUT, ">@ARGV[0]"; +print "test1 ok\ttest1 ok\ntest1 ok\ttest1 ok\n"; +print OUT "test ok\ttest ok\ntest ok\ttest ok\n"; +close OUT; +open OUT, ">somefile"; +print OUT "test2 ok\ttest2 ok\ntest2 ok\ttest2 ok\n"; +close OUT; \ No newline at end of file diff -r e4eff539a999 -r 96829b1b73ea cgatools_suite/tools/cgatools/testvariants.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cgatools_suite/tools/cgatools/testvariants.xml Wed Jun 06 16:58:26 2012 -0400 @@ -0,0 +1,157 @@ + + + + test for the presence of variants + + + cgatools + + + + cgatools testvariants + --beta + --reference ${crr.fields.path} + --output $output + --input $listing + --variants + #if $file_types.data_sources.data_source == "in" + #for $v in $file_types.data_sources.varfiles + ${v.input} + #end for + #else + `cat $file_types.data_sources.varlist` + #end if + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +**What it does** + +This tool uses the cgatools testvariants to test variant or mastervar files for the presence of variants. + +cgatools: http://sourceforge.net/projects/cgatools/files/ + +----- + +**cgatools Manual**:: + + COMMAND NAME + testvariants - Tests variant files for presence of variants. + + DESCRIPTION + Tests variant files for presence of variants. The output is a tab-delimited + file consisting of the columns of the input variants file, plus a column + for each assembly results file that contains a character code for each + allele. The character codes have meaning as follows: + + 0 This allele of this genome is consistent with the reference at this + locus but inconsistent with the variant. + 1 This allele of this genome has the input variant at this locus. + N This allele of this genome has no-calls but is consistent with the + input variant. + + OPTIONS + -h [ --help ] + Print this help message. + + --beta + This is a beta command. To run this command, you must pass the --beta + flag. + + --reference arg + The reference crr file. + + --input arg (=STDIN) + The input variants to test for. + + --output arg (=STDOUT) + The output file (may be omitted for stdout). + + --variants arg + The input variant files (may be passed in as arguments at the end of + the command). + + SUPPORTED FORMAT_VERSION + 0.3 or later + + diff -r e4eff539a999 -r 96829b1b73ea cgatools_suite/tools/cgatools/varfilter.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cgatools_suite/tools/cgatools/varfilter.xml Wed Jun 06 16:58:26 2012 -0400 @@ -0,0 +1,184 @@ + + + + copies input file, applying filters. + + + cgatools + + + + varfilter_wrapper.pl + --reference $crr.fields.path + --output $output + --input $file_types.data_sources.input + #for $f in $filters + --zygosity $f.zygosity + --vartype $f.vartype + --varscorevaf x$f.varscorevaf + --varscoreeaf x$f.varscoreeaf + --varquality $f.varquality + #end for + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +**What it does** + +This tool copies input var file or masterVar file to output, applying specified filters. + +cgatools: http://sourceforge.net/projects/cgatools/files/ + +----- + +**cgatools Manual**:: + + COMMAND NAME + varfilter - Copies input var file or masterVar file to output, applying + specified filters. + + DESCRIPTION + Copies input var file or masterVar file to output, applying specified + filters (which are available to all cgatools commands that read a var file + or masterVar file as input). Filters are specified by appending the filter + specification to the var file name on the command line. For example: + + /path/to/var.tsv.bz2#varQuality!=VQHIGH + + The preceding example filters out any calls marked as VQLOW. The filter + specification follows the "#" sign, and consists of a list of filters to + apply, separated by a comma. Each filter is a colon-separated list of call + selectors. Any scored call that passes all the colon-separated call + selectors for one or more of the comma-separated filters is turned into a + no-call. The following call selectors are available: + + hom Selects only calls in homozygous loci. + het Selects any scored call not selected by the hom selector. + varType=XX Selects calls whose varType is XX. + varScoreVAF<XX Selects calls whose varScoreVAF<XX. + varScoreEAF<XX Selects calls whose varScoreEAF<XX. + varQuality!=XX Selects calls whose varQuality is not XX. + + Here is an example that filters homozygous SNPs with varScoreVAF < 25 and + heterozygous insertions with varScoreEAF < 50: + + + '/path/to/var.tsv.bz2#hom:varType=snp:varScoreVAF<25,het:varType=ins:varScoreEAF<50' + + + OPTIONS + -h [ --help ] + Print this help message. + + --beta + This is a beta command. To run this command, you must pass the --beta flag. + + --reference arg + The reference crr file. + + --input arg + The input var file or masterVar file (typically with filters specified). + + --output arg (=STDOUT) + The output file (may be omitted for stdout). + + SUPPORTED FORMAT_VERSION + 0.3 or later + + diff -r e4eff539a999 -r 96829b1b73ea cgatools_suite/tools/cgatools/varfilter_wrapper.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cgatools_suite/tools/cgatools/varfilter_wrapper.pl Wed Jun 06 16:58:26 2012 -0400 @@ -0,0 +1,56 @@ +#!/usr/bin/perl +use strict; +use Getopt::Long; +use vars qw($opt_reference $opt_input $opt_output @opt_zygosity @opt_vartype @opt_varscorevaf @opt_varscoreeaf @opt_varquality); +$| = 1; # set autoflush to screen + +# This is a wrapper for the cgatools varfilter function to run cgatools varfilter in Galaxy. +# The wrapper generates the filter(s) in the correct format to be used with the input file. +# written 6-1-2012 by bcrain@completegenomics.com + + +#print join("\n", @ARGV), "\n"; +&GetOptions("reference=s", "input=s", "output=s", "zygosity=s@", "vartype=s@", "varscorevaf=s@", "varscoreeaf=s@", "varquality=s@"); + +my $append = ''; + +for (my $i = 0; $i <= $#opt_zygosity; $i ++) +{ + my $filter = ''; + unless ($opt_zygosity[$i] eq 'NA') {$filter = $opt_zygosity[$i];} + unless ($opt_vartype[$i] eq 'NA') + { + $filter ne '' and $filter .= ':'; + $filter .= 'varType=' . $opt_vartype[$i]; + } + unless ($opt_varscorevaf[$i] eq 'x') + { + $filter ne '' and $filter .= ':'; + $opt_varscorevaf[$i] =~ s/^x//; + $filter .= 'varScoreVAF<' . $opt_varscorevaf[$i]; + } + unless ($opt_varscoreeaf[$i] eq 'x') + { + $filter ne '' and $filter .= ':'; + $opt_varscoreeaf[$i] =~ s/^x//; + $filter .= 'varScoreEAF<' . $opt_varscoreeaf[$i]; + } + unless ($opt_varquality[$i] eq 'NA') + { + $filter ne '' and $filter .= ':'; + $filter .= 'varQuality!=' . $opt_varquality[$i]; + } + + if ($filter ne '') + { + if ($append eq '') {$append = '#' . $filter;} + else {$append .= ',' . $filter;} + } +} +print "cgatools varfilter +--beta +--reference $opt_reference +--output $opt_output +--input '${opt_input}${append}'\n"; + +`cgatools varfilter --beta --reference $opt_reference --output $opt_output --input '${opt_input}${append}'`; \ No newline at end of file diff -r e4eff539a999 -r 96829b1b73ea cgatools_suite/tools/cgi_scripts/.DS_Store Binary file cgatools_suite/tools/cgi_scripts/.DS_Store has changed diff -r e4eff539a999 -r 96829b1b73ea cgatools_suite/tools/cgi_scripts/._Calculate_TestVariants_Variant_Frequencies.xml Binary file cgatools_suite/tools/cgi_scripts/._Calculate_TestVariants_Variant_Frequencies.xml has changed diff -r e4eff539a999 -r 96829b1b73ea cgatools_suite/tools/cgi_scripts/._Calculate_TestVariants_Variant_Frequencies_0_1_0.pl Binary file cgatools_suite/tools/cgi_scripts/._Calculate_TestVariants_Variant_Frequencies_0_1_0.pl has changed diff -r e4eff539a999 -r 96829b1b73ea cgatools_suite/tools/cgi_scripts/._List_Unique_Variants.xml Binary file cgatools_suite/tools/cgi_scripts/._List_Unique_Variants.xml has changed diff -r e4eff539a999 -r 96829b1b73ea cgatools_suite/tools/cgi_scripts/._List_Unique_Variants_2_1_0.pl Binary file cgatools_suite/tools/cgi_scripts/._List_Unique_Variants_2_1_0.pl has changed diff -r e4eff539a999 -r 96829b1b73ea cgatools_suite/tools/cgi_scripts/Calculate_TestVariants_Variant_Frequencies.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cgatools_suite/tools/cgi_scripts/Calculate_TestVariants_Variant_Frequencies.xml Wed Jun 06 16:58:26 2012 -0400 @@ -0,0 +1,67 @@ + + + in cgatools-testvariants file + + + Calculate_TestVariants_Variant_Frequencies_0_1_0.pl + --Input $input + --First_Genome_Field_Nr $first_col + --Last_Genome_Field_Nr $last_col + --Output1 $output1 + --Output2 $output2 + + + + + + + + + + + + + + + + + + +**What it does** + +This tool calculates the allele frequencies for all variants present in the testvariant file. + +----- + +**Instructions**:: + + Calculate the frequencies of variants in a testvariants output file + Two values calculated: + Frequency vs all alleles + Frequency vs called alleles + + Input: testvariants file + Outputs: + All data to *-Freq.tsv, including scores and quals + vars and freqs to *-Freq_Short.tsv + Exceptions to *-Freq_Log + Stats to *-Freq_Stats + + + perl Calculate_TestVariants_Variant_Frequencies_0_0_3.pl \ + --Input input_file \ + --First_Genome_Field_Nr col_nr1 \ + --Last_Genome_Field_Nr col_nr2 + --Output1 output1 \ + --Output2 output_short \ + eg + perl Calculate_TestVariants_Variant_Frequencies_0_0_3.pl \ + --Input /data/Family_Quartet_testvariants.tsv \ + --Output /data/Family_Quartet_testvariants + --First_Genome_Field_Nr 9 \ + --Last_Genome_Field_Nr 11 + --Output1 /data/Family_Quartet_testvariants + --Output2 /data/Family_Quartet_testvariants_short + + + diff -r e4eff539a999 -r 96829b1b73ea cgatools_suite/tools/cgi_scripts/Calculate_TestVariants_Variant_Frequencies_0_1_0.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cgatools_suite/tools/cgi_scripts/Calculate_TestVariants_Variant_Frequencies_0_1_0.pl Wed Jun 06 16:58:26 2012 -0400 @@ -0,0 +1,271 @@ +#!/usr/bin/perl +use strict; +#use feature "say"; +#use File::Basename; +$| = 1; + +# Get_TestVariants_Variant_Frequencies +# Calculate the frequencies of variants in a testvariants output file +# Two values calculated: +# Frequency vs all alleles +# Frequency vs called alleles + +# Input is a testvariants file +# Outputs: +# All data to *-Freq.tsv, including scores and quals +# vars and freqs to *-Freq_Short.tsv +# Exceptions to *-Freq_Log +# Stats to *-Freq_Stats + +# Format: +# perl prog file dir +# ie +# perl Get_TestVariants_Variant_Frequencies \ +# --Input input_file \ +# --First_Genome_Field_Nr col_nr1 \ +# --Last_Genome_Field_Nr col_nr2 +# --Output1 output1 \ +# --Output2 output2 + +# eg +# perl /perl/Get_TestVariants_Variant_Frequencies_0_0_1.pl \ +# --Input /data/Family_Quartet_testvariants.tsv \ +# --First_Genome_Field_Nr 9 \ +# --Last_Genome_Field_Nr 11 +# --Output1 output1 \ +# --Output2 output2 + + +# Rick Tearle 2010-11 + +my $Time -= time; # start time +my $Debug = 0; + +# Parsing and storing input parameters +# Only childfields can be repeated +print "$0 @ARGV\nProcessing input parameters\n"; +my %ExpectedParams = GetExpectedParams (); +my %EnteredParams = GetEnteredParams (); + +# Setting up prog paras from input paras +my $FileIn = $EnteredParams{input}; +unless (-f $FileIn) {die "Testvariants input file $FileIn not found\n";} # requires existing file +#my $FileOut = $EnteredParams{output}; # +#$DirectoryOut =~ s/\/$//; # remove trailing slash if present +#unless (-d $DirectoryOut) {die "Output directory $DirectoryOut not found\n";} # requires existing file +#print "$FileIn\n$DirectoryOut\n"; +#$FileIn =~ /(^.+\/)(.+?)\./; # get filename without path and without extensions + +# my $FileOut1 = $FileOut."-Freq.tsv"; +# my $FileOut2 = $FileOut."-Freq_Short.tsv"; +# my $FileOut3 = $FileOut."-Freq_Stats.tsv"; +# my $FileOut4 = $FileOut."-Freq_Log.tsv"; + +print "\nOpening Input File:\n\t$FileIn\n"; +my $IN = OpenFile ($FileIn); # open the file with correct file format + +#print "\nOpening Output Files:\n\t$FileOut1\n\t$FileOut2\n\t$FileOut3\n\t$FileOut4\n"; #exit; +open my $OUT1, ">", $EnteredParams{output1}; +open my $OUT2, ">", $EnteredParams{output2}; +#open my $OUT3, ">", $EnteredParams{output3}; +#open my $OUT4, ">", $EnteredParams{output4}; + +# Get col header and genomes fields +my $ColHeader = <$IN>; # get col header +chomp $ColHeader; +my @ColHeader = split /\t/, $ColHeader; +my $StartGenomes = $EnteredParams{first_genome_field_nr} - 1; # first column with testvariants data, 1 based -> 0 based +my $StopGenomes = $EnteredParams{last_genome_field_nr} - 1; # first column with testvariants data, 1 based -> 0 based +if ($StartGenomes < 0) {die "No valid entry for First_Genome_Field_Nr, must be 1 or greater\n";} +if ($StopGenomes < 0) {die "No valid entry for Last_Genome_Field_Nr, must be 1 or greater\n";} +if ($StartGenomes > $StopGenomes) {die "Last_Genome_Field_Nr must be greater than or equal to First_Genome_Field_Nr\n";} +if ($StartGenomes > int @ColHeader) {die "First_Genome_Field_Nr > number of fields in column header\n";} +if ($StopGenomes > int @ColHeader) {die "Last_Genome_Field_Nr > number of fields in column header\n";} +my $NrGenomes = $StopGenomes - $StartGenomes + 1; +#print "$StartGenomes\t$StopGenomes\n"; #exit; +#print "First Genome Field:\n\t$ColHeader[$StartGenomes]\n"; +#print "Last Genome Field:\n\t$ColHeader[$StopGenomes]\n\n"; + +# print column headers +print $OUT1 join("\t",@ColHeader),"\tAllFreq\tCalledFreq\n"; +print $OUT2 join("\t",@ColHeader[0..7]),"\tAllFreq\tCalledFreq\n"; +print join("\t",@ColHeader),"\n"; +print "First Genome Field: $ColHeader[$StartGenomes]\n"; +print "Last Genome Field: $ColHeader[$StopGenomes]\n"; +print "Nr Genomes: $NrGenomes\n\n"; + +print "\nProcessing Variants....\n"; +my $VariantCount = 0; # variant locus counter, not used +my %AllFreqCounts; # storing histogram of all freq counts +my %CalledFreqCounts; # storing histogram of called freq counts +my $Warnings; +while (<$IN>) +{ + # testvariants fields: variantId chromosome begin end varType reference alleleSeq xRef GS000000XX1-ASM GS000000XX2-ASM [GS000000XXN-ASM] + my $Line = $_; # save line for output below + chomp $Line; + my @F = split /\t/, $Line; # split in to fields + $VariantCount++; # increment variant counter + my $UseFields = join ("",@F[$StartGenomes..$StopGenomes]); # get genome fields as string, to count 0s and 1s + my $Count1 = () = $UseFields =~ /1/g; # count the number of 1s + my $Count0 = () = $UseFields =~ /0/g; # count the number of 0s + my $CountN = () = $UseFields =~ /N/g; # count the number of Ns + my $NrAlleles = $Count1 + $Count0 + $CountN; # total count + unless ($NrAlleles == $NrGenomes *2 or $NrAlleles == $NrGenomes) # count does not match expected for diploid/haploid locus + { + print "$NrAlleles alleles for variant ",join(" ",@F[0..7]),"\n"; # log warning + #print "Expected $NrGenomes or ",$NrGenomes*2," alleles depending on ploidy of locus\n"; + #if ($Warnings++ > 10) {die "Have found $Warnings exceptions for this file, termnating processing\n";} # terminate if too many warnings + } + my $AllFreq = sprintf("%0.3f",$Count1/$NrAlleles); # calculate freq of 1s vs all alleles + my $CalledFreq = sprintf("%0.3f",0); + if ($Count1+$Count0) {$CalledFreq = sprintf("%0.3f",$Count1/($Count1+$Count0));} # calculate freq of 1s vs called alleles, if there are any + $AllFreqCounts{$AllFreq}++; # increment all freq histogram + $CalledFreqCounts{$CalledFreq}++; # increment called freq histogram + #print "$Line\n$AlleleCount\t$Count1\t$Count0\t$AllFreq\t$CalledFreq\n"; #exit; + print $OUT1 "$Line\t$AllFreq\t$CalledFreq\n"; # output full testvariants plus frequencies for this var + print $OUT2 join("\t",@F[0..7]),"\t$AllFreq\t$CalledFreq\n"; # output just var info plus frequencies for this var + #exit if $VariantCount > 20; +} +close $OUT1; +close $OUT2; + +# Print frequency histograms +print "Nr Variants at each Frequency (All):\nFreq\tCount\n"; # header +foreach my $Freq (sort {$a <=> $b} keys %AllFreqCounts) {print "$Freq\t$AllFreqCounts{$Freq}\n";} + +print "\nNr Variants at each Frequency (Called):\nFreq\tCount\n"; # header +foreach my $Freq (sort {$a <=> $b} keys %CalledFreqCounts) {print "$Freq\t$CalledFreqCounts{$Freq}\n";} + +$Time += time; +print "\ntime $Time\n"; + +########################################################################### +# SUBS # +########################################################################### + +sub GetExpectedParams +{ + my %Hash = + ( + "input" => -1, + "output_dir" => -1, + ); # store parameters and values + return %Hash; +} + +sub GetEnteredParams +{ + # Processing @ARGV + my %Hash; + + my @ARGVs = split /--/, join (" ",@ARGV); # split args on --, into array + #print "Start\n", join ("\n",@ARGVs),"\n",int @ARGVs - 1,"\n\n" if $Debug; + #print "Key\tVal\n" if $Debug; #exit; + for my $n (1..$#ARGVs) # parse each + { + $ARGVs[$n] =~ s/\s+$//; # remove any trailing spaces + my ($Key, $Val) = split / /, $ARGVs[$n], 2; # put first element into key, any other elements into val + $Key = lc $Key; + $Hash{$Key} = $Val; # make a hash entry out of key and val + #print "$Key\t$EnteredParams{$Key}\n" if $Debug; + } + #print int(keys %Hash),"\n" if $Debug; + #foreach my $Arg (keys %Hash) {print "Arg: $Arg\t",$ExpectedParams{$Arg},"\n";} + #print "Arg string:\t",join (" ",@ARGV),"\n" if $Debug; + #exit if $Debug; + return %Hash; # hash now has each -- entry param, with associated values +} + +sub SaveArrayAsString +{ + my $FH = shift; + my $Fields = shift; + #print "$Fields\n"; + print $FH join("\t",@$Fields),"\n"; +} + +sub ConcatenateVariants +{ + my $ArrayIn = shift; # ptr to array + my $StateFieldNr = shift; # field to process + #print int(@$ArrayIn),"\n"; + my @ArrayOut; # array to store records out + my $Nr = -1; + foreach my $Entry (@$ArrayIn) + { + } + return \@ArrayOut; # return ptr to array +} + +sub LoadStateRecord +{ + my $Out = shift; + my $In = shift; + my $StateFieldNr = shift; + + $Out->{State} = $$In[$StateFieldNr]; # get state for new record + $Out->{Chr} = $$In[1]; # get chr + $Out->{Begin} = $$In[2]; # get begin of state range + $Out->{End} = $$In[3]; # get current end of state range + $Out->{Records}++; # record added to new count +} + +sub OpenFile +{ + my $File = shift; + my $FH; + open ($FH, "$File") or die ("$!: can't open file $File"); + return $FH; +} + +sub OpenFileold +{ + my $File = shift; + my $FH; + + if ($File =~ /.bz2$/) + { + open ($FH, "bzcat $File |") or die ("$!: can't open file $File"); + } + elsif ($File =~ /.gz$/) + { + open ($FH, "gunzip -c $File |") or die ("$!: can't open file $File"); + } + elsif ($File =~ /.tsv$/) + { + open ($FH, "cat $File |") or die ("$!: can't open file $File"); + } + else + { + die ("$!: do not recognise file type $File"); + } + return $FH; +} + +sub LoadNewRecord +{ + my $In = shift; + my $Out = shift; + $Out->{Chr} = $In->{Chr}; + $Out->{State} = $In->{State}; + $Out->{Begin} = $In->{Begin}; + $Out->{End} = $In->{End}; + $Out->{Records} = $In->{Records}; +} + +sub NewStateRecord +{ + my $Record = + { + Chr => "", + Begin => -1, + End => -1, + State => "", + Records => 0, + MIEs => 0, + StateErrors => 0, + Length => -1, + }; + return $Record; +} diff -r e4eff539a999 -r 96829b1b73ea cgatools_suite/tools/cgi_scripts/List_Unique_Variants.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cgatools_suite/tools/cgi_scripts/List_Unique_Variants.xml Wed Jun 06 16:58:26 2012 -0400 @@ -0,0 +1,323 @@ + + + with annotations from gene or var files + + + #if $file_types.file_type =="var2" + List_Unique_Variants_2_1_0.pl --File_Type V --Output_File $output + --Var_Type $file_types.variants + $file_types.scoresVAF + $file_types.scoresEAF + $file_types.varQuality + #if $file_types.data_sources.data_source == "in" + #for $v in $file_types.data_sources.varfiles + --Input_File ${v.input} + #end for + #else + `cat $file_types.data_sources.varlist` + #end if + + #else if $file_types.file_type =="var1" + List_Unique_Variants_2_1_0.pl --File_Type V --Output_File $output + --Var_Type $file_types.variants + $file_types.scores + #if $file_types.data_sources.data_source == "in" + #for $v in $file_types.data_sources.varfiles + --Input_File ${v.input} + #end for + #else + `cat $file_types.data_sources.varlist` + #end if + + #else if $file_types.file_type =="gene" + List_Unique_Variants_2_1_0.pl --File_Type G --Output_File $output + --Var_Type $file_types.variants + --Component $file_types.component + --Impact $file_types.impact + #if $file_types.data_sources.data_source == "in" + #for $g in $file_types.data_sources.genefiles + --Input_File ${g.input} + #end for + #else + `cat $file_types.data_sources.genelist` + #end if + #end if + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +**What it does** + +This tool identifies all called variants present in the var or gene files and generates annotated variant list. + +----- + +**Instructions**:: + + List Unique Variants for Pipeline 1.x and 2.x + [Uses header if available, checks for position of xref field if not] + Take one or more var or gene files + Extract a non-redundant set of variants + + For var files: + The fields used to define non-redundant variants are are: + chromosome begin end varType reference alleleSeq xRef + User can nominate class(es) of varType to filter on + Outputs varScoreEAF, varScoreVAF and varQuality as a default but user can turn + them off (separately) + Scores and qualities stored in separate fields, all values for a variant across + a set of genomes. + Values for different genomes separated by ':', for two hom entries for the same + genome by '|' + Output is accepted by testvariants to generate a variant table, all fields kept + in testvariants output + + For gene files: + The fields used to define non-redundant gene variants are: + chromosome begin end varType reference call xRef geneId mrnaAcc proteinAcc symbol + orientation component componentIndex codingRegionKnown impact nucleotidePos + proteinPos annotationRefSequence sampleSequence genomeRefSequence + User can nominate class(es) of varType, component or impact to filter on + All gene entries kept ie multiple entries if multiple transcripts + + NB Now treating xref as a separate component in var recs, as it is not consistent + between X and Y vars + Not fixed for gene recs yet + + perl List_Unique_Variants_2_0_11.pl + --File_Type [V|G] + --Input_File input_file_1 [set of var or gene files] + --Input_File input_file_2 + ... + --Input_File input_file_n + --Output_File filename + --Var_Type [For both file types, 'All' or any value from the varType field, + multiple values allowed, separated by comma] + --Component [Gene file specific,'All' or any value from component field of gene + file, multiple allowed; 'All" is default] + --Impact All [Gene file specific,'All' or any value from impact field of gene + file, multiple allowed; 'All" is default] + --Scores [1.x var file specific, yes|no, yes is default] + --Scores_VAF [2.0 var file specific, yes|no, yes is default] + --Scores_EAF [2.0 var file specific, yes|no, yes is default] + --Score_Qualities [yes|no, yes is default] + eg + perl List_Unique_Variants_2_0_11.pl \ + --File_Type V \ + --Input_File /Yoruban_Trio_1100_37/GS19238-1100-37/GS00028-DNA_A01/ASM/gene-GS19238-1100-37-ASM.tsv.bz2 \ + --Input_File /Yoruban_Trio_1100_37/GS19239-1100-37/GS00028-DNA_B01/ASM/gene-GS19239-1100-37-ASM.tsv.bz2 \ + --Input_File /Yoruban_Trio_1100_37/GS19240-1100-37/GS00028-DNA_C01/ASM/gene-GS19240-1100-37-ASM.tsv.bz2 \ + --Output_File /Users/rtearle/Documents/TBF/YRI_Trio_Protein_Coding.tsv \ + --Var_Type All + --Component All + --Impact All + --Scores_VAF yes \ + --Scores_EAF yes \ + --Score_Qualities yes + + var fields + 1.x locus ploidy haplotype chromosome begin end varType reference alleleSeq + totalScore hapLink xRef + 2.0 locus ploidy allele chromosome begin end varType reference alleleSeq + varScoreVAF varScoreEAF varQuality hapLink xRef + + gene fields + 1.x index locus allele chromosome begin end varType reference call xRef geneId + mrnaAcc proteinAcc symbol orientation component componentIndex + codingRegionKnown impact nucleotidePos proteinPos annotationRefSequence + sampleSequence genomeRefSequence + 2.0 index locus allele chromosome begin end varType reference call xRef geneId + mrnaAcc proteinAcc symbol orientation component componentIndex hasCodingRegion + impact nucleotidePos proteinPos annotationRefSequence sampleSequence + genomeRefSequence pfam + + Parsing and storing input parameters + Only input_file fields can be repeated + input paramaters are case insensitive + + + + diff -r e4eff539a999 -r 96829b1b73ea cgatools_suite/tools/cgi_scripts/List_Unique_Variants_2_1_0.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cgatools_suite/tools/cgi_scripts/List_Unique_Variants_2_1_0.pl Wed Jun 06 16:58:26 2012 -0400 @@ -0,0 +1,583 @@ +#!/usr/bin/perl +use strict; +use File::Basename; +$| = 1; + +# List Unique Variants for Pipeline 1.x and 2.x +# [Uses header if available, checks for position of xref field if not] +# Take one or more var or gene files +# Extract a non-redundant set of variants + +# For var files: +# The fields used to define non-redundant variants are are: +# chromosome begin end varType reference alleleSeq xRef +# User can nominate class(es) of varType to filter on +# Outputs varScoreEAF, varScoreVAF and varQuality as a default but user can turn them off (separately) +# Scores and qualities stored in separate fields, all values for a variant across a set of genomes. +# Values for different genomes separated by ':', for two hom entries for the same genome by '|' +# Output is accepted by testvariants to generate a variant table, all fields kept in testvariants output + +# For gene files: +# The fields used to define non-redundant gene variants are: +# chromosome begin end varType reference call xRef geneId mrnaAcc proteinAcc symbol orientation component componentIndex codingRegionKnown impact nucleotidePos proteinPos annotationRefSequence sampleSequence genomeRefSequence +# User can nominate class(es) of varType, component or impact to filter on +# All gene entries kept ie multiple entries if multiple transcripts + +# NB Now treating xref as a separate component in var recs, as it is not consistent between X and Y vars +# Not fixed for gene recs yet + +# perl List_Unique_Variants_2_0_9.pl +# --File_Type [V|G] +# --Input_File input_file_1 [set of var or gene files] +# --Input_File input_file_2 +# ... +# --Input_File input_file_n +# --Output_File filename +# --Var_Type [For both file types, 'All' or any value from the varType field, multiple values allowed, separated by comma] +# --Component [Gene file specific,'All' or any value from component field of gene file, multiple allowed; 'All" is default] +# --Impact All [Gene file specific,'All' or any value from impact field of gene file, multiple allowed; 'All" is default] +# --Scores [1.x var file specific, yes|no, yes is default] +# --Scores_VAF [2.0 var file specific, yes|no, yes is default] +# --Scores_EAF [2.0 var file specific, yes|no, yes is default] +# --Score_Qualities [yes|no, yes is default] +# eg +# perl /Users/rtearle/Documents/Programming/Perl/Scripts/Dev/List_Unique_Variants_2_0_4 \ +# --File_Type V \ +# --Input_File /Yoruban_Trio_1100_37/GS19238-1100-37/GS00028-DNA_A01/ASM/gene-GS19238-1100-37-ASM.tsv.bz2 \ +# --Input_File /Yoruban_Trio_1100_37/GS19239-1100-37/GS00028-DNA_B01/ASM/gene-GS19239-1100-37-ASM.tsv.bz2 \ +# --Input_File /Yoruban_Trio_1100_37/GS19240-1100-37/GS00028-DNA_C01/ASM/gene-GS19240-1100-37-ASM.tsv.bz2 \ +# --Output_File /Users/rtearle/Documents/TBF/YRI_Trio_Protein_Coding.tsv \ +# --Var_Type All +# --Component All +# --Impact All +# --Scores_VAF yes \ +# --Scores_EAF yes \ +# --Score_Qualities yes + +# var fields +# 1.x +# locus ploidy haplotype chromosome begin end varType reference alleleSeq totalScore hapLink xRef +# 2.0 +# locus ploidy allele chromosome begin end varType reference alleleSeq varScoreVAF varScoreEAF varQuality hapLink xRef + +# gene fields +# 1.x index locus allele chromosome begin end varType reference call xRef geneId mrnaAcc proteinAcc symbol orientation +# component componentIndex codingRegionKnown impact nucleotidePos proteinPos annotationRefSequence sampleSequence genomeRefSequence +# 2.0 index locus allele chromosome begin end varType reference call xRef geneId mrnaAcc proteinAcc symbol orientation +# component componentIndex hasCodingRegion impact nucleotidePos proteinPos annotationRefSequence sampleSequence genomeRefSequence pfam + +# Parsing and storing input parameters +# Only input_file fields can be repeated +# input paramaters are case insensitive + +print "$0 @ARGV\nProcessing input parameters\n"; +my $NrParams; +my %ExpectedParams = GetExpectedParams (); # list of expected parms +my %EnteredParams = GetEnteredParams (); # list of entered params + +# Input Files +my $FileType = $EnteredParams{file_type}; +if ($FileType ne "V" and $FileType ne "G") {die "File Type must be 'V' or 'G', not '$FileType'\n";} +my $FilesIn = $EnteredParams{input_file}; # ptr to list of input files +print "File Type: $FileType\nInput Files:\n"; +my $NrInputFiles = int(@$FilesIn); +foreach my $File (@$FilesIn) {print "$File\n";} # requires existing files +foreach my $File (@$FilesIn) {unless (-f $File) {die "Input file $File not found\n";}} # requires existing files +for my $n (0.. $NrInputFiles-2) # look for duplicates in list + {for my $m ($n+1.. $NrInputFiles-1) {if ($$FilesIn[$n] eq $$FilesIn[$m] ) {die "File $$FilesIn[$n] is repeated in input file list\n";}}} + +# Output Dir +#my $DirectoryOut = $EnteredParams{output_dir}; # output dir +#$DirectoryOut =~ s/\/$//; # remove trailing slash if present +#unless (-d $DirectoryOut) {mkdir $DirectoryOut or die "Cannot find/create output directory $DirectoryOut\n";} # uses existing dir or makes a new dir if it can + +# Ouput File +my $FileOut = $EnteredParams{output_file}; # output file +print "FileOut: $FileOut\n"; +$FileOut =~ /(^.+\/)?(.+$)/; # split in to path and filename +my $DirectoryOut = $1; # assign path # NEED MORE TESTING EG EMPTY PATH # +$FileOut = $2; # assign file prefix +print "File: $FileOut\nDir: $DirectoryOut\n"; #exit; +# if (-f $DirectoryOut.$FileOut) # ouput file exists, create a new one based on the name +# { +# print "Output file $FileOut exists, modifying to unique file name "; +# $FileOut =~ /^(.+?)\./; # find name without extensions +# my $Stub = $1; # set stub to name without extensions +# $FileOut =~ /(\..+)?$/; # get extension(s) +# my $Ext = $1; # set ext to extensions +# my $n = 1; # n will increment to find a unique name +# my $Suff = ""; # suff tracks n +# while (-f $DirectoryOut.$Stub.$Suff.$Ext) {$Suff = "-$n"; $n++;} # loop till we have a new unique filename +# $FileOut = $Stub.$Suff.$Ext; # file out now has same name, same extensions, but also -n at the end of the name, making it unique +# print "$FileOut\n"; +# } + +#print "Files\n",join("\n",@$FilesIn),"\n\n"; +#print "Ouput Dir\n$DirectoryOut\n"; +#print "Ouput File\n$FileOut\n"; + +# Extract Header & Column Header +my $IN = OpenFile ($$FilesIn[0]); # open the first file with correct file format +my $Header = GetHeaderAsString ($IN); # get header +unless ($Header) {close $IN; $IN = OpenFile ($$FilesIn[0]);} # if there is no header, close and reopen file, ie start file again +my $ColHeader = <$IN>; # get col header, first remaining line +chomp $ColHeader; + +# Get version if filetype is var - needed because there are new fields in 2.0 and posn of xRef changed +my ($Version, $XrefField); +if ($FileType eq "V") +{ + ($Version, $XrefField) = GetVersion ($Header, $ColHeader); + #print "$Version $XrefField\n"; exit; + unless ($Version) {die "Cannot determine format version of first file in list\nNeed either a native Complete header or a native Complete Column Header with an xRef field\n";} +} + +# Shared input params +my $OutputVarTypes = lc $EnteredParams{var_type} || $ExpectedParams{var_type}; # var types listed in file in lc +$OutputVarTypes =~ s/\,/\|/g; # create regex string +$OutputVarTypes =~ s/\,| //; # remove extraneous commas, spaces + +# Input Params for var file +my ($KeepScoresVAF, $KeepScoresEAF, $KeepQuals, $KeepScores, $VarExtras); +if ($FileType eq "V") +{ + if ($Version == 2) + { + $KeepQuals = lc $EnteredParams{score_quality} || $ExpectedParams{score_quality}; # keep scoresQuality for 2.0 + $KeepQuals = 1 if $KeepQuals eq "yes"; # converting to boolean + $KeepScoresVAF = lc $EnteredParams{scores_vaf} || $ExpectedParams{scores_vaf}; # keep scoresVAF for 2.0 + $KeepScoresVAF = 1 if $KeepScoresVAF eq "yes"; # converting to boolean + $KeepScoresEAF = lc $EnteredParams{scores_eaf} || $ExpectedParams{scores_eaf}; # keep scoresEAF for 2.0 $KeepQuals = lc $EnteredParams{score_qualities} || $ExpectedParams{score_qualities}; # keep scoresQuality for 2.0 + $KeepScoresEAF = 1 if $KeepScoresEAF eq "yes"; # converting to boolean + } + else # Version 1 + { + $KeepScores = lc $EnteredParams{scores} || $ExpectedParams{scores}; # keep scores for 1.x + $KeepScores = 1 if $KeepScores eq "yes"; # converting to boolean + } + $VarExtras = 1 if $KeepScoresVAF or $KeepScoresEAF or $KeepQuals or $KeepScores; # flag to process var file for scores info +} + +# Input Params for gene file +my $OutputComponents = uc $EnteredParams{component} || $ExpectedParams{component}; # components listed in file in uc +my $OutputImpacts = uc $EnteredParams{impact} || $ExpectedParams{impact}; # impacts listed in file in uc + +# Loading chr nrs, setting up var hash +my @ChrNames = ('chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8', 'chr9', 'chr10', + 'chr11', 'chr12', 'chr13', 'chr14', 'chr15', 'chr16', 'chr17', 'chr18', 'chr19', + 'chr20', 'chr21', 'chr22', 'chrX', 'chrY', 'chrM'); # using this array forces the output order of chrs into the correct order +my %Vars; # hash to store var records in an array for each chr +foreach my $Chr (@ChrNames) {$Vars{$Chr} = {};} # print "$Chr\t";} # set up hash of hashes, one for each chr +#print "\n"; #exit; + +# Create ouput col header +if ($FileType eq "V") +{ + # 1.x locus ploidy haplotype chromosome begin end varType reference alleleSeq totalScore hapLink xRef + # 2.0 locus ploidy allele chromosome begin end varType reference alleleSeq varScoreVAF varScoreEAF varQuality hapLink xRef + my @Fields = split "\t", $ColHeader; + $ColHeader = join("\t",@Fields[3..8])."\t".$Fields[$XrefField]; + if ($Version == 2) # 2.0 + { + if ($KeepScoresVAF) {$ColHeader .= "\tvarScoreVAF";} + if ($KeepScoresEAF) {$ColHeader .= "\tvarScoreEAF";} + if ($KeepQuals) {$ColHeader .= "\tvarQuality";} + } + else # 1.x + { + if ($KeepScores) {$ColHeader .= "\ttotalScore";} + } +} +elsif ($FileType eq "G") +{ + # 1.x index locus allele chromosome begin end varType reference call xRef geneId mrnaAcc proteinAcc symbol orientation + # component componentIndex codingRegionKnown impact nucleotidePos proteinPos annotationRefSequence sampleSequence genomeRefSequence + # 2.0 index locus allele chromosome begin end varType reference call xRef geneId mrnaAcc proteinAcc symbol orientation + # component componentIndex hasCodingRegion impact nucleotidePos proteinPos annotationRefSequence sampleSequence genomeRefSequence pfam + my @Fields = split "\t", $ColHeader; + $ColHeader = join("\t",@Fields[3..30]); # 30 much more than needed + $OutputComponents =~ s/\,/\|/g; # create regex string + $OutputComponents =~ s/\,| //; # remove extraneous commas, spaces + $OutputComponents =~ s/\,| //; # remove extraneous commas, spaces + $OutputImpacts =~ s/\,/\|/g; # create regex string + $OutputImpacts =~ s/\,| //; # remove extraneous commas, spaces + #print "OutputVarTypes: $OutputVarTypes\nOutputComponents: $OutputComponents\nOutputImpacts: $OutputImpacts\n"; +} +else +{ + die "FileType $FileType not understood\n"; # redundant +} + +# Set up Processing Subs +if ($FileType eq "G") # use gene subs for 'G' +{ + *ExtractSub2Use = \&ExtractGeneFields; + *AddRecSub2Use = \&AddGeneRec; +} +else # use var subs for 'V' +{ + *ExtractSub2Use = \&ExtractVarFields; + *AddRecSub2Use = \&AddVarRec; +} +#print "Gene: ",\&ExtractGeneFields," ",\&AddGeneRec," Var: ",\&ExtractVarFields," ",\&AddVarRec," Using: ",\&ExtractSub2Use," ",\&AddRecSub2Use,"\n"; exit; + +# Process Files +my $RecCount = 0; # total nr recs that are coding/splicing/spanning +my $FileCount = 0; # keeps track of nr files to use to format eg scores field +my $XRef; +foreach my $File (@$FilesIn) +{ + print "Processing file $File\n"; + $FileCount++; # counting nr files + + # Open file, process header, col header + my $IN = OpenFile ($File); # open the file with correct file format + my $Header = GetHeaderAsString ($IN); # get header + unless ($Header) {close $IN; $IN = OpenFile ($$FilesIn[0]);} # if no header, close and reopen file, ie start file again + my $ColHeader = <$IN>; # get col header, first remaining line + unless ($ColHeader =~ /^>/) {print "Suspect column header for file $File:\n$ColHeader\n;"} + + my $Count = 0; # cnr recs for this file that are coding/splicing/spanning + while (<$IN>) # loop through remainder of file ie data + { + my ($Rec, $Chr, $ScoreVAF, $ScoreEAF, $ScoreQual, $XRef) = ExtractSub2Use ($_, $XrefField); # sub extracts wanted fields in rec as string, chr, other fields optionally + next unless $Rec; + AddRecSub2Use ($Rec, $Vars{$Chr}, $ScoreVAF, $ScoreEAF, $ScoreQual, $FileCount, $XRef) if $Rec; # only process if rec is not empty + $Count++; # increment count of coding/splicing/spanning vars + } + print "Nr matched records for this file: $Count\n"; + $RecCount += $Count; # add this file's count to total count + close $IN; +} +print "Nr matched records across all files:\t $RecCount\n"; # total count + +# Open file out, write col header +print "Sorting and Saving to file $DirectoryOut$FileOut ...\n"; +open my $OUT, ">", $DirectoryOut . $FileOut or die "could not write to $DirectoryOut/$FileOut\n"; +if ($FileType eq "V") +{ + print $OUT "variantId\t"; +} # first col header for var file +else +{ + $ColHeader =~ s/\tcall\t/\talleleSeq\t/; + print $OUT "index\t"; +} # first col header for gene file +print $OUT "$ColHeader\n"; # remainder of col header + +$RecCount = 0; # reuse total count for nr of non-reduntant vars +$FileCount--; # reduce by one, used below to add missing delimiters +foreach my $Chr (@ChrNames) # sort records in each chr array and print with count +{ + foreach my $Rec (sort {SortStringsasArrays ($a, $b)} keys %{$Vars{$Chr}}) # using sub to sort on being, end fields + { + next unless $Rec; + $RecCount++; # increment count of coding/splicing/spanning vars + print $OUT "$RecCount\t$Rec"; # printing rec and count + print $OUT "\t",$Vars{$Chr}->{$Rec}->[4] if $FileType eq "V"; # print xref if var files + + if ($VarExtras) + { + my $FieldDelimiterCount = () = $Vars{$Chr}->{$Rec}->[3] =~ /:/g; + #print "$SpacerCount $Vars{$Chr}->{$Rec}->[1]\n"; + #exit if $TmpCount++ > 10; + my $Addition = ":" x ($FileCount - $FieldDelimiterCount); + if ($Version == 2) + { + print $OUT "\t",$Vars{$Chr}->{$Rec}->[1],$Addition if $KeepScoresVAF; + print $OUT "\t",$Vars{$Chr}->{$Rec}->[2],$Addition if $KeepScoresEAF; + print $OUT "\t",$Vars{$Chr}->{$Rec}->[3],$Addition if $KeepQuals; + } + else + { + print $OUT "\t",$Vars{$Chr}->{$Rec}->[1],$Addition if $KeepScores; + } + #print $OUT "$Rec\t$Vars{$Chr}->{$Rec}\n"; # printing rec and count + } + print $OUT "\n"; + } +} +print "Nr saved records:\t $RecCount\n"; # count of non-redundant vars, c/f all vars abovve + + +########################################################################### +# SUBS # +########################################################################### + +sub GetExpectedParams +{ + my %Hash = # hash to store expected params + ( + "file_type" => -1, + "input_file" => [], + "output_file" => -1, + "var_type" => "all", + "component" => "ALL", + "impact" => "ALL", + "scores" => "yes", + "scores_eaf" => "yes", + "scores_vaf" => "yes", + "score_quality" => "yes", + ); + $NrParams = int keys %Hash; + return %Hash; +} + +sub GetEnteredParams +{ + # Processing @ARGV + my %Hash; + my @ARGVs = split /--/, join (" ",@ARGV); # split args on --, into array + for my $n (1..$#ARGVs) # parse each [nb arg 0 is empty so ignored] + { + $ARGVs[$n] =~ s/\s+$//; # remove any trailing spaces + my ($Key, $Val) = split / /, $ARGVs[$n], 2; # put first element into key, any other elements into val + $Key = lc $Key; # make lower case, ie case insensitive + if ($Key eq "input_file") # multiple entries expected, setting up array + { + push @{$Hash{$Key}}, $Val; # add input to input hash + + } + else + { + $Hash{$Key} = $Val; # make a hash entry out of key and val + } + } + return %Hash; # hash now has each --entry param, with associated values +} + +sub OpenFile +{ + my $File = shift; + my $FH; + open $FH, $File; + return $FH; +} + +sub OpenFileold +{ + my $File = shift; + my $FH; + + if ($File =~ /.bz2$/) + { + open ($FH, "bzcat $File |") or die ("$!: can't open file $File"); + } + elsif ($File =~ /.gz$/) + { + open ($FH, "gunzip -c $File |") or die ("$!: can't open file $File"); + } + elsif ($File =~ /.tsv$/ or $File =~ /.txt$/) + { + open ($FH, "cat $File |") or die ("$!: can't open file $File"); + } + else + { + print ("Do not recognise file type for file $File.\nOpening as text file\n"); + open ($FH, "cat $File |") or die ("$!: can't open file $File"); + } + return $FH; +} + +sub GetHeaderAsString +{ + my $FH = shift; + my $Header = ""; + my $Count = 0; + while (<$FH>) # loop until a line is empty + { + chomp; + if ($_ eq "") # exit when empty line + { + return $Header ; # return ref to array + } + else + { + $Header .= $_; + } + return "" if $Count++ > 50; # too many lines for a header, must be no header, return empty array + } +} + +sub GetVersion +{ + my $Header = shift; + my $ColHeader = shift; + + my $Version = 0; # need to know if it is 1.x or 2.x + my $XrefField = -1; + + if ($FileType eq "V") + { + if ($Header) + { + $Header =~ /#FORMAT_VERSION\t(\d)/; + if ($1 == 1) {$Version = 1; $XrefField = 11;} + elsif ($1 == 2) {$Version = 2; $XrefField = 13;} + else {print "Warning: Format Version not found in Header\n";} # not in header + } + unless ($Version) + { + my @ColHeader = split /\t/, $ColHeader; + for my $n (0..int(@ColHeader)-1) + { + if ($ColHeader eq "xRef") + { + $XrefField = $n; + if ($n == 11) {$Version = 1;} + elsif ($n == 13) {$Version = 2;} + last; + } + } + } + } + return ($Version, $XrefField); +} + +sub ExtractGeneFields # expects a gene file rec, strips out file specific fields, gets chr +{ + my $Rec = shift; + # gene fields + # >index locus allele chromosome begin end varType reference call xRef geneId mrnaAcc + # proteinAcc symbol orientation component componentIndex hasCodingRegion impact + # nucleotidePos proteinPos annotationRefSequence sampleSequence genomeRefSequence pfam + + chomp $Rec; # remove return + my @Fields = split "\t", $Rec; + #print "$Fields[6] $Fields[15] $Fields[18]\t$OutputVarTypes $OutputComponents $OutputImpacts\n"; + unless ($OutputVarTypes eq "all" or $Fields[6] =~ /$OutputVarTypes/) {return ("","");} # nominated vals not found, leave + unless ($OutputComponents eq "ALL" or $Fields[15] =~ /$OutputComponents/) {return ("","");} # nominated vals not found, leave + unless ($OutputImpacts eq "ALL" or $Fields[18] =~ /$OutputImpacts/) {return ("","");} # nominated vals not found, leave + my $Chr = $Fields[3]; # assign chr + $Rec = join("\t",@Fields[3..24]); + #$Rec =~ s/\t$//; # remove trailing tab if there is one + + return ($Rec, $Chr); +} + +sub ExtractVarFields # expects a gene file rec, strips out file specific fields, gets chr +{ + my $Rec = shift; + my $XrefField = shift; + # var fields + # 1.x locus ploidy haplotype chromosome begin end varType reference alleleSeq totalScore hapLink xRef + # 2.0 locus ploidy allele chromosome begin end varType reference alleleSeq varScoreVAF varScoreEAF varQuality hapLink xRef + + chomp $Rec; # remove return + my @Fields = split "\t", $Rec; + #print "$Fields[6] $OutputVarTypes \n"; exit; + unless ($OutputVarTypes eq "all" or $Fields[6] =~ /$OutputVarTypes/) {return ("","");} # nominated vals not found, leave + my $Chr = $Fields[3]; # assign chr + #$Rec = join("\t",@Fields[3..8]); + $Rec = join("\t",@Fields[3..8]); + #$Rec =~ s/\t$//; # remove trailing tab if there is one + + if ($VarExtras) + { + return ($Rec, $Chr, $Fields[9], $Fields[10], $Fields[11], $Fields[$XrefField]) if $Version == 2; + return ($Rec, $Chr, $Fields[9], $Fields[$XrefField]) # $Version == 1; + } + else + { + return ($Rec, $Chr); + } +} + +sub AddVarRec +{ + my $Rec = shift; + my $RecHash = shift; + my $ScoreVAF = shift; + my $ScoreEAF = shift; + my $ScoreQual = shift; + my $FileCount = shift; + my $XRef = shift; + + if ($VarExtras) # need to extract scores information + { + # locus ploidy allele chromosome begin end varType reference alleleSeq varScoreVAF varScoreEAF varQuality hapLink xRef + # Set delimiter + my $Delimiter; + if ($RecHash->{$Rec}) # hash entry for this var already exists + { + if ($RecHash->{$Rec}->[0] == $FileCount) + { + $Delimiter = "|"; # same chr, var is hom, use | + } + else # diff chr, use : + { + my $FieldDelimiterCount = () = $RecHash->{$Rec}->[3] =~ /:/g; # count nr field delims + $Delimiter = ":" x ($FileCount - $FieldDelimiterCount - 1); # delimiters for any processed files, that didnt have this var + } + $RecHash->{$Rec}->[4] = $XRef if length $XRef > length $RecHash->{$Rec}->[4]; # replace xref if new xref is longer + } + else # new var + { + $RecHash->{$Rec} = []; # create array to hold it + $Delimiter = ":" x ($FileCount - 1); # delimiters for prev processed files, that didnt have this var + $RecHash->{$Rec}->[4] = $XRef; # add xref + } + + # Process var + $RecHash->{$Rec}->[0] = $FileCount; + if ($Version == 2) + { + $RecHash->{$Rec}->[1] .= $Delimiter.$ScoreVAF; # add delimiter, varScoreVAF + $RecHash->{$Rec}->[2] .= $Delimiter.$ScoreEAF; # add delimiter, varScoreVAF + $RecHash->{$Rec}->[3] .= $Delimiter.($ScoreQual eq "VQHIGH" ? "H" : "L"); # add delimiter, qual + } + else + { + $RecHash->{$Rec}->[1] .= $Delimiter.$ScoreVAF; # add delimiter, totalScore + } + + } + else # just the rec, no var extras being extrcted + { + $RecHash->{$Rec}++; # hash with rec as key, increment count for this key + $RecHash->{$Rec}->[4] = $XRef if length $XRef > length $RecHash->{$Rec}->[4]; # replace xref if new xref is longer, wasting space here + } +} + +sub AddGeneRec +{ + my $Rec = shift; + my $RecHash = shift; + + $RecHash->{$Rec}++; # hash with rec as key, increment count for this key +} + +sub SortStringsasArrays # sorts based on begin and end of two recs +{ + my $String1 = shift; # first string + my $String2 = shift; # second string + + my @Array1 = split "\t", $String1; # put fields into array + my @Array2 = split "\t", $String2; + + # array[1] is begin, array[2] is end, returning order based on these fields + if ($Array1[1] < $Array2[1]) # begin of 1 < begin of 2 + { + return -1; + } + elsif ($Array1[1] == $Array2[1]) # begin of 1 == begin of 2 + { + if ($Array1[2] < $Array2[2]) # end of 1 < end of 2 + { + return -1; + } + elsif ($Array1[2] == $Array2[2]) # end of 1 == end of 2 + { + return 0; + } + else # end of 1 > end of 2 + { + return 1; + } + } + else # begin of 1 > begin of 2 + { + return 1; + } + +}