# HG changeset patch # User bcrain-completegenomics # Date 1339094599 14400 # Node ID a006c9df8747d45438af14daf52ef9e253663785 # Parent e2e802a079f0a8f44ce81182f827a0ec5634d9a8 Deleted selected files diff -r e2e802a079f0 -r a006c9df8747 cgatools_suite/.DS_Store Binary file cgatools_suite/.DS_Store has changed diff -r e2e802a079f0 -r a006c9df8747 cgatools_suite/._datatypes_conf.xml Binary file cgatools_suite/._datatypes_conf.xml has changed diff -r e2e802a079f0 -r a006c9df8747 cgatools_suite/._tool_data_table_conf.xml Binary file cgatools_suite/._tool_data_table_conf.xml has changed diff -r e2e802a079f0 -r a006c9df8747 cgatools_suite/datatypes_conf.xml --- a/cgatools_suite/datatypes_conf.xml Thu Jun 07 14:32:32 2012 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,19 +0,0 @@ - - - - - - - - - - - - - - - - - diff -r e2e802a079f0 -r a006c9df8747 cgatools_suite/lib/galaxy/datatypes/._completegenomics.py Binary file cgatools_suite/lib/galaxy/datatypes/._completegenomics.py has changed diff -r e2e802a079f0 -r a006c9df8747 cgatools_suite/lib/galaxy/datatypes/completegenomics.py --- a/cgatools_suite/lib/galaxy/datatypes/completegenomics.py Thu Jun 07 14:32:32 2012 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,71 +0,0 @@ -""" -Complete Genomics datatypes -Birgit Crain - Complete Genomics, Inc -""" - -import pkg_resources -pkg_resources.require( "bx-python" ) - -import logging -from galaxy.datatypes import data -from galaxy import util -from cgi import escape -from galaxy.datatypes import metadata -from galaxy.datatypes import tabular -from galaxy.datatypes.metadata import MetadataElement -from galaxy.datatypes.tabular import Tabular -import galaxy_utils.sequence.vcf -from galaxy.datatypes.sniff import * - -log = logging.getLogger(__name__) - -class CG_Var( Tabular ): - file_ext = 'cg_var' - def __init__(self, **kwd): - """Initialize CG_Var datatype""" - Tabular.__init__( self, **kwd ) - self.column_names = ['locus', 'ploidy', 'allele', 'chromosome', 'begin', 'end', - 'varType', 'reference', 'alleleSeq', 'varScoreVAF', - 'varScoreEAF', 'varQuality', 'hapLink', 'xRef' - ] - def display_peek( self, dataset ): - """Returns formated html of peek""" - return Tabular.make_html_table( self, dataset, column_names=self.column_names ) - -class CG_MasterVar( Tabular ): - file_ext = 'cg_mastervar' - def __init__(self, **kwd): - """Initialize CG_MasterVar datatype""" - Tabular.__init__( self, **kwd ) - self.column_names = ['locus', 'ploidy', 'chromosome', 'begin', 'end', 'zygosity', - 'varType', 'reference', 'allele1Seq', 'allele2Seq', - 'allele1VarScoreVAF', 'allele2VarScoreVAF', 'allele1VarScoreEAF', - 'allele2VarScoreEAF', 'allele1VarQuality', 'allele2VarQuality', - 'allele1HapLink', 'allele2HapLink', 'allele1XRef', 'allele2XRef', - 'evidenceIntervalId', 'allele1ReadCount', 'allele2ReadCount', - 'referenceAlleleRead', 'totalReadCount', 'allele1Gene', - 'allele2Gene pfam', 'miRBaseId', 'repeatMasker', 'segDupOverlap', - 'relativeCoverageDiploid', 'calledPloidy', - 'relativeCoverageNondiploid', 'calledLevel' - ] - - def display_peek( self, dataset ): - """Returns formated html of peek""" - return Tabular.make_html_table( self, dataset, column_names=self.column_names ) - -class CG_Gene( Tabular ): - file_ext = 'cg_gene' - def __init__(self, **kwd): - """Initialize CG_Gene datatype""" - Tabular.__init__( self, **kwd ) - self.column_names = ['index', 'locus', 'allele', 'chromosome', 'begin', 'end', - 'varType', 'reference', 'call', 'xRef', 'geneId', - 'mrnaAcc', 'proteinAcc', 'symbol', 'orientation', 'component', - 'componentIndex', 'hasCodingRegion', 'impact', 'nucleotidePos', - 'proteinPos', 'annotationRefSequence', 'sampleSequence', - 'genomeRefSequence', 'pfam' - ] - - def display_peek( self, dataset ): - """Returns formated html of peek""" - return Tabular.make_html_table( self, dataset, column_names=self.column_names ) diff -r e2e802a079f0 -r a006c9df8747 cgatools_suite/tool-data/._cg_crr_files.loc.sample Binary file cgatools_suite/tool-data/._cg_crr_files.loc.sample has changed diff -r e2e802a079f0 -r a006c9df8747 cgatools_suite/tool-data/cg_crr_files.loc.sample --- a/cgatools_suite/tool-data/cg_crr_files.loc.sample Thu Jun 07 14:32:32 2012 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,11 +0,0 @@ -#This is a sample file distributed with Galaxy that enables tools -#to use .crr reference files. You will need to download or create -#the .crr reference files and then create a cg_crr_files.loc file -#similar to this one (store it in this directory) that points to -#the location of the files. The cg_crr_files.loc -#file has this format (white space characters are TAB characters): -# -# -# -#hg19 hg19 hg19.crr /Users/bcrain/Documents/hg19.crr - diff -r e2e802a079f0 -r a006c9df8747 cgatools_suite/tool_data_table_conf.xml --- a/cgatools_suite/tool_data_table_conf.xml Thu Jun 07 14:32:32 2012 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,12 +0,0 @@ - - - - - value, dbkey, name, path - -
- -
- diff -r e2e802a079f0 -r a006c9df8747 cgatools_suite/tools/cgatools/.DS_Store Binary file cgatools_suite/tools/cgatools/.DS_Store has changed diff -r e2e802a079f0 -r a006c9df8747 cgatools_suite/tools/cgatools/._calldiff.xml Binary file cgatools_suite/tools/cgatools/._calldiff.xml has changed diff -r e2e802a079f0 -r a006c9df8747 cgatools_suite/tools/cgatools/._join.xml Binary file cgatools_suite/tools/cgatools/._join.xml has changed diff -r e2e802a079f0 -r a006c9df8747 cgatools_suite/tools/cgatools/._junctiondiff.xml Binary file cgatools_suite/tools/cgatools/._junctiondiff.xml has changed diff -r e2e802a079f0 -r a006c9df8747 cgatools_suite/tools/cgatools/._listtestvariants.xml Binary file cgatools_suite/tools/cgatools/._listtestvariants.xml has changed diff -r e2e802a079f0 -r a006c9df8747 cgatools_suite/tools/cgatools/._listvariants.xml Binary file cgatools_suite/tools/cgatools/._listvariants.xml has changed diff -r e2e802a079f0 -r a006c9df8747 cgatools_suite/tools/cgatools/._snpdiff.xml Binary file cgatools_suite/tools/cgatools/._snpdiff.xml has changed diff -r e2e802a079f0 -r a006c9df8747 cgatools_suite/tools/cgatools/._testing.pl Binary file cgatools_suite/tools/cgatools/._testing.pl has changed diff -r e2e802a079f0 -r a006c9df8747 cgatools_suite/tools/cgatools/._testvariants.xml Binary file cgatools_suite/tools/cgatools/._testvariants.xml has changed diff -r e2e802a079f0 -r a006c9df8747 cgatools_suite/tools/cgatools/._varfilter.xml Binary file cgatools_suite/tools/cgatools/._varfilter.xml has changed diff -r e2e802a079f0 -r a006c9df8747 cgatools_suite/tools/cgatools/._varfilter_wrapper.pl Binary file cgatools_suite/tools/cgatools/._varfilter_wrapper.pl has changed diff -r e2e802a079f0 -r a006c9df8747 cgatools_suite/tools/cgatools/calldiff.xml --- a/cgatools_suite/tools/cgatools/calldiff.xml Thu Jun 07 14:32:32 2012 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,343 +0,0 @@ - - - compares two Complete Genomics variant files. - - - cgatools - - - - cgatools calldiff --beta - --reference ${crr.fields.path} - --variantsA $data_sources.inputA - --variantsB $data_sources.inputB - $validation - $diploid - --locus-stats-column-count $column - --max-hypothesis-count $hypothesis - --output-prefix cg_ - --reports `echo ${report1} ${report2} ${report3} ${report4} ${report5} ${somatic.report6} | sed 's/ */,/g'` - #if $somatic.report6 == "SomaticOutput" - --genome-rootA $somatic.genomeA - --genome-rootB $somatic.genomeB - --calibration-root $somatic.calibration - #end if - - - - - (report1 == 'SuperlocusOutput') - - - (report2 == 'SuperlocusStats') - - - (report3 == 'LocusOutput') - - - (report4 == 'LocusStats') - - - (report5 == 'VariantOutput') - - - (report5 == 'VariantOutput') - - - (somatic['report6'] == 'SomaticOutput') - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -This tool compares two Complete Genomics variant files. - -cgatools: http://sourceforge.net/projects/cgatools/files/ - ------ - -**cgatools Manual**:: - - COMMAND NAME - calldiff - Compares two Complete Genomics variant files. - - DESCRIPTION - Compares two Complete Genomics variant files. Divides the genome up into - superloci of nearby variants, then compares the superloci. Also refines the - comparison to determine per-call or per-locus comparison results. - - Comparison results are usually described by a semi-colon separated string, - one per allele. Each allele's comparison result is one of the following - classifications: - - ref-identical The alleles of the two variant files are identical, and - they are consistent with the reference. - alt-identical The alleles of the two variant files are identical, and - they are inconsistent with the reference. - ref-consistent The alleles of the two variant files are consistent, - and they are consistent with the reference. - alt-consistent The alleles of the two variant files are consistent, - and they are inconsistent with the reference. - onlyA The alleles of the two variant files are inconsistent, - and only file A is inconsistent with the reference. - onlyB The alleles of the two variant files are inconsistent, - and only file B is inconsistent with the reference. - mismatch The alleles of the two variant files are inconsistent, - and they are both inconsistent with the reference. - phase-mismatch The two variant files would be consistent if the - hapLink field had been empty, but they are - inconsistent. - ploidy-mismatch The superlocus did not have uniform ploidy. - - In some contexts, this classification is rolled up into a simplified - classification, which is one of "identical", "consistent", "onlyA", - "onlyB", or "mismatch". - - A good place to start looking at the results is the superlocus-output file. - It has columns defined as follows: - - SuperlocusId An identifier given to the superlocus. - Chromosome The name of the chromosome. - Begin The 0-based offset of the start of the superlocus. - End The 0-based offset of the base one past the end of the - superlocus. - Classification The match classification of the superlocus. - Reference The reference sequence. - AllelesA A semicolon-separated list of the alleles (one per - haplotype) for variant file A, for the phasing with the - best comparison result. - AllelesB A semicolon-separated list of the alleles (one per - haplotype) for variant file B, for the phasing with the - best comparison result. - - The locus-output file contains, for each locus in file A and file B that is - not consistent with the reference, an annotated set of calls for the locus. - The calls are annotated with the following columns: - - SuperlocusId The id of the superlocus containing the locus. - File The variant file (A or B). - LocusClassification The locus classification is determined by the - varType column of the call that is inconsistent - with the reference, concatenated with a - modifier that describes whether the locus is - heterozygous, homozygous, or contains no-calls. - If there is no one variant in the locus (i.e., - it is heterozygous alt-alt), the locus - classification begins with "other". - LocusDiffClassification The match classification for the locus. This is - defined to be the best of the comparison of the - locus to the same region in the other file, or - the comparison of the superlocus. - - The somatic output file contains a list of putative somatic variations of - genome A. The output includes only those loci that can be classified as - snp, del, ins or sub in file A, and are called reference in the file B. - Every locus is annotated with the following columns: - - VarCvgA The totalReadCount from file A for this locus - (computed on the fly if file A is not a - masterVar file). - VarScoreA The varScoreVAF from file A, or varScoreEAF if - the "--diploid" option is used. - RefCvgB The maximum of the uniqueSequenceCoverage - values for the locus in genome B. - RefScoreB Minimum of the reference scores of the locus in - genome B. - SomaticCategory The category used for determining the - calibrated scores and the SomaticRank. - VarScoreACalib The calibrated variant score of file A, under - the model selected by using or not using the - "--diploid" option, and corrected for the count - of heterozygous variants observed in this - genome. See user guide for more information. - VarScoreBCalib The calibrated reference score of file B, under - the model selected by using or not using the - "--diploid" option, and corrected for the count - of heterozygous variants observed in this - genome. See user guide for more information. - SomaticRank The estimated rank of this somatic mutation, - amongst all true somatic mutations within this - SomaticCategory. The value is a number between - 0 and 1; a value of 0.012 means, for example, - that an estimated 1.2% of the true somatic - mutations in this somaticCategory have a - somaticScore less than the somaticScore for - this mutation. See user guide for more - information. - SomaticScore An integer that provides a total order on - quality for all somatic mutations. It is equal - to -10*log10( P(false)/P(true) ), under the - assumption that this genome has a rate of - somatic mutation equal to 1/Mb for - SomaticCategory snp, 1/10Mb for SomaticCategory - ins, 1/10Mb for SomaticCategory del, and 1/20Mb - for SomaticCategory sub. The computation is - based on the assumptions described in the user - guide, and is affected by choice of variant - model selected by using or not using the - "--diploid" option. - SomaticQuality Equal to VQHIGH for all somatic mutations where - SomaticScore >= -10. Otherwise, this column is - empty. - - OPTIONS - -h [ --help ] - Print this help message. - - --reference arg - The input crr file. - - --variantsA arg - The "A" input variant file. - - --variantsB arg - The "B" input variant file. - - --output-prefix arg - The path prefix for all output reports. - - --reports arg (=SuperlocusOutput,SuperlocusStats,LocusOutput,LocusStats) - Comma-separated list of reports to generate. (Beware any reports whose - name begins with "Debug".) A report is one of: - SuperlocusOutput Report for superlocus classification. - SuperlocusStats Report for superlocus classification stats. - LocusOutput Report for locus classification. - LocusStats Report for locus stats. - VariantOutput Both variant files annotated by comparison - results.If the somatic output report is - requested, file A is also annotated with the - same score ranks as produced in that report. - SomaticOutput Report for the list of simple variations that - are present only in file "A", annotated with - the score that indicates the probability of - the variation being truly somatic. Requires - beta, genome-rootA, and genome-rootB options - to be provided as well. Note: generating this - report slows calldiff by 10x-20x. - DebugCallOutput Report for call classification. - DebugSuperlocusOutput Report for debug superlocus information. - DebugSomaticOutput Report for distribution estimates used for - somatic rescoring. Only produced if - SomaticOutput is also turned on. - - --diploid - Uses varScoreEAF instead of varScoreVAF in somatic score computations. - Also, uses diploid variant model instead of variable allele mixture - model. - - --locus-stats-column-count arg (=15) - The number of columns for locus compare classification in the locus - stats file. - - --max-hypothesis-count arg (=32) - The maximum number of possible phasings to consider for a superlocus. - - --no-reference-cover-validation - Turns off validation that all bases of a chromosome are covered by - calls of the variant file. - - --genome-rootA arg - The "A" genome directory, for example /data/GS00118-DNA_A01; this - directory is expected to contain ASM/REF and ASM/EVIDENCE - subdirectories. - - --genome-rootB arg - The "B" genome directory. - - --calibration-root arg - The directory containing calibration data. For example, there should - exist a file calibration-root/0.0.0/metrics.tsv. - - --beta - This flag enables the SomaticOutput report, which is beta - functionality. - - SUPPORTED FORMAT_VERSION - 0.3 or later - - diff -r e2e802a079f0 -r a006c9df8747 cgatools_suite/tools/cgatools/join.xml --- a/cgatools_suite/tools/cgatools/join.xml Thu Jun 07 14:32:32 2012 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,157 +0,0 @@ - - - two tsv files based on equal fields or overlapping regions. - - - cgatools - - - - cgatools join --beta - --input $input1 - --input $input2 - --output $output - --output-mode $outmode - $dump - --select $col - #for $m in $matched - --match ${m.match} - #end for - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -This tool joins two tab-delimited files based on equal fields or overlapping regions. - -cgatools: http://sourceforge.net/projects/cgatools/files/ - ------ - -**cgatools Manual**:: - - COMMAND NAME - join - Joins two tab-delimited files based on equal fields or overlapping regions. - - DESCRIPTION - Joins two tab-delimited files based on equal fields or overlapping regions. - By default, an output record is produced for each match found between file - A and file B, but output format can be controlled by the --output-mode - parameter. - - OPTIONS - -h [ --help ] - Print this help message. - - --beta - This is a beta command. To run this command, you must pass the --beta - flag. - - --input arg - File name to use as input (may be passed in as arguments at the end of - the command), or omitted for stdin). There must be exactly two input - files to join. If only one file is specified by name, file A is taken - to be stdin and file B is the named file. File B is read fully into - memory, and file A is streamed. File A's columns appear first in the - output. - - --output arg (=STDOUT) - The output file name (may be omitted for stdout). - - --match arg - A match specification, which is a column from A and a column from B - separated by a colon. - - --overlap arg - - -m [ --output-mode ] arg (=full) - Output mode, one of the following: - full Print an output record for each match found between - file A and file B. - compact Print at most one record for each record of file A, - joining the file B values by a semicolon and - suppressing repeated B values and empty B values. - compact-pct Same as compact, but for each distinct B value, - annotate with the percentage of the A record that is - overlapped by B records with that B value. Percentage - is rounded up to nearest integer. - - --overlap-mode arg (=strict) - Overlap mode, one of the following: - strict Range A and B overlap if A.begin < B.end and - B.begin < A.end. - allow-abutting-points Range A and B overlap they meet the strict - requirements, or if A.begin <= B.end and - B.begin <= A.end and either A or B has zero - length. - - --select arg (=A.*,B.*) - Set of fields to select for output. - - -a [ --always-dump ] - Dump every record of A, even if there are no matches with file B. - - --overlap-fraction-A arg (=0) - Minimum fraction of A region overlap for filtering output. - - --boundary-uncertainty-A arg (=0) - Boundary uncertainty for overlap filtering. Specifically, records - failing the following predicate are filtered away: overlap >= - overlap-fraction-A * ( A-range-length - boundary-uncertainty-A ) - - --overlap-fraction-B arg (=0) - Minimum fraction of B region overlap for filtering output. - - --boundary-uncertainty-B arg (=0) - Boundary uncertainty for overlap filtering. Specifically, records - failing the following predicate are filtered away: overlap >= - overlap-fraction-B * ( B-range-length - boundary-uncertainty-B ) - - SUPPORTED FORMAT_VERSION - Any - - diff -r e2e802a079f0 -r a006c9df8747 cgatools_suite/tools/cgatools/junctiondiff.xml --- a/cgatools_suite/tools/cgatools/junctiondiff.xml Thu Jun 07 14:32:32 2012 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,88 +0,0 @@ - - - reports difference between junction calls - - - cgatools - - - - cgatools junctiondiff --beta -h - - - - - - - - - - - -**What it does** - -This tool reports difference between junction calls of Complete Genomics junctions files - -cgatools: http://sourceforge.net/projects/cgatools/files/ - ------ - -**cgatools Manual**:: - - COMMAND NAME - junctiondiff - Reports difference between junction calls of Complete Genomics junctions files. - - DESCRIPTION - junctiondiff takes two junction files A and B as input and produces the - following output: - - "diff-inputFileName" - the junctions from an input file A that are not - present in input file B. - - "report.txt" - a brief summary report (if --statout is used) - - Two junctions are considered equivalent if: - - they come from different files - - left and right positions of one junction are not more than "--distance" - bases apart from the corresponding positions of another junction - - the junction scores are equal or above the scoreThreshold - - they are on the same strands - - OPTIONS - -h [ --help ] - Print this help message. - - --beta - This is a beta command. To run this command, you must pass the --beta - flag. - - -s [ --reference ] arg - Reference file. - - -a [ --junctionsA ] arg - input junction file A. - - -b [ --junctionsB ] arg - input junction file B. - - -A [ --scoreThresholdA ] arg (=10) - score threshold value for the input file A. - - -B [ --scoreThresholdB ] arg (=0) - score threshold value for the input file B. - - -d [ --distance ] arg (=200) - Max distance between coordinates of potentially compatible junctions. - - -l [ --minlength ] arg (=500) - Minimum deletion junction length to be included into the difference - file. - - -o [ --output-prefix ] arg - The path prefix for all the output reports. - - -S [ --statout ] - (Debug) Report various input file statistics. Experimental feature. - - SUPPORTED FORMAT_VERSION - 1.5 or later - - diff -r e2e802a079f0 -r a006c9df8747 cgatools_suite/tools/cgatools/listtestvariants.xml --- a/cgatools_suite/tools/cgatools/listtestvariants.xml Thu Jun 07 14:32:32 2012 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,239 +0,0 @@ - - - - - - - cgatools - - - - cgatools listvariants - --beta - --reference ${crr.fields.path} - --output $output1 - #if $include_list.listing == "yes" - --variant-listing $include_list.list - #end if - $longvar - --variants - #if $file_types.data_sources.data_source == "in" - #for $v in $file_types.data_sources.varfiles - ${v.input} - #end for - #else - `cat $file_types.data_sources.varlist` - #end if - ; - - cgatools testvariants - --beta - --reference ${crr.fields.path} - --output $output2 - --input $output1 - --variants - #if $file_types.data_sources.data_source == "in" - #for $v in $file_types.data_sources.varfiles - ${v.input} - #end for - #else - `cat $file_types.data_sources.varlist` - #end if - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -This tool uses the cgatools testvariants to test variant or mastervar files for the presence of variants. - -cgatools: http://sourceforge.net/projects/cgatools/files/ - ------ - -**cgatools Manual**:: - - COMMAND NAME - listvariants - Lists the variants present in a variant file. - - DESCRIPTION - Lists all called variants present in the specified variant files, in a - format suitable for processing by the testvariants command. The output is a - tab-delimited file consisting of the following columns: - - variantId Sequential id assigned to each variant. - chromosome The chromosome of the variant. - begin 0-based reference offset of the beginning of the variant. - end 0-based reference offset of the end of the variant. - varType The varType as extracted from the variant file. - reference The reference sequence. - alleleSeq The variant allele sequence as extracted from the variant - file. - xRef The xRef as extrated from the variant file. - - OPTIONS - -h [ --help ] - Print this help message. - - --beta - This is a beta command. To run this command, you must pass the --beta - flag. - - --reference arg - The reference crr file. - - --output arg (=STDOUT) - The output file (may be omitted for stdout). - - --variants arg - The input variant files (may be positional args). - - --variant-listing arg - The output of another listvariants run, to be merged in to produce the - output of this run. - - --list-long-variants - In addition to listing short variants, list longer variants as well - (10's of bases) by concatenating nearby calls. - - SUPPORTED FORMAT_VERSION - 0.3 or later - - - - COMMAND NAME - testvariants - Tests variant files for presence of variants. - - DESCRIPTION - Tests variant files for presence of variants. The output is a tab-delimited - file consisting of the columns of the input variants file, plus a column - for each assembly results file that contains a character code for each - allele. The character codes have meaning as follows: - - 0 This allele of this genome is consistent with the reference at this - locus but inconsistent with the variant. - 1 This allele of this genome has the input variant at this locus. - N This allele of this genome has no-calls but is consistent with the - input variant. - - OPTIONS - -h [ --help ] - Print this help message. - - --beta - This is a beta command. To run this command, you must pass the --beta - flag. - - --reference arg - The reference crr file. - - --input arg (=STDIN) - The input variants to test for. - - --output arg (=STDOUT) - The output file (may be omitted for stdout). - - --variants arg - The input variant files (may be passed in as arguments at the end of - the command). - - SUPPORTED FORMAT_VERSION - 0.3 or later - - diff -r e2e802a079f0 -r a006c9df8747 cgatools_suite/tools/cgatools/listvariants.xml --- a/cgatools_suite/tools/cgatools/listvariants.xml Thu Jun 07 14:32:32 2012 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,177 +0,0 @@ - - - - lists all called variants - - - cgatools - - - - cgatools listvariants - --beta - --reference ${crr.fields.path} - --output $output - #if $include_list.listing == "yes" - --variant-listing $include_list.list - #end if - $longvar - --variants - #if $file_types.data_sources.data_source == "in" - #for $v in $file_types.data_sources.varfiles - ${v.input} - #end for - #else - `cat $file_types.data_sources.varlist` - #end if - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -This tool uses the cgatools listvariants to list all called variants present in the var or mastervar files. - -cgatools: http://sourceforge.net/projects/cgatools/files/ - ------ - -**cgatools Manual**:: - - COMMAND NAME - listvariants - Lists the variants present in a variant file. - - DESCRIPTION - Lists all called variants present in the specified variant files, in a - format suitable for processing by the testvariants command. The output is a - tab-delimited file consisting of the following columns: - - variantId Sequential id assigned to each variant. - chromosome The chromosome of the variant. - begin 0-based reference offset of the beginning of the variant. - end 0-based reference offset of the end of the variant. - varType The varType as extracted from the variant file. - reference The reference sequence. - alleleSeq The variant allele sequence as extracted from the variant - file. - xRef The xRef as extrated from the variant file. - - OPTIONS - -h [ --help ] - Print this help message. - - --beta - This is a beta command. To run this command, you must pass the --beta - flag. - - --reference arg - The reference crr file. - - --output arg (=STDOUT) - The output file (may be omitted for stdout). - - --variants arg - The input variant files (may be positional args). - - --variant-listing arg - The output of another listvariants run, to be merged in to produce the - output of this run. - - --list-long-variants - In addition to listing short variants, list longer variants as well - (10's of bases) by concatenating nearby calls. - - SUPPORTED FORMAT_VERSION - 0.3 or later - - diff -r e2e802a079f0 -r a006c9df8747 cgatools_suite/tools/cgatools/snpdiff.xml --- a/cgatools_suite/tools/cgatools/snpdiff.xml Thu Jun 07 14:32:32 2012 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,116 +0,0 @@ - - - compares snp calls to a Complete Genomics variant file. - - - cgatools - - - - cgatools snpdiff --beta -h - - - - - - - - - - - -**What it does** - -This tool ompares snp calls to a Complete Genomics variant file. - -cgatools: http://sourceforge.net/projects/cgatools/files/ - ------ - -**cgatools Manual**:: - - COMMAND NAME - snpdiff - Compares snp calls to a Complete Genomics variant file. - - DESCRIPTION - Compares the snp calls in the "genotypes" file to the calls in a Complete - Genomics variant file. The genotypes file is a tab-delimited file with at - least the following columns (additional columns may be given): - - Chromosome (Required) The name of the chromosome. - Offset0Based (Required) The 0-based offset in the chromosome. - GenotypesStrand (Optional) The strand of the calls in the Genotypes - column (+ or -, defaults to +). - Genotypes (Optional) The calls, one per allele. The following - calls are recognized: - A,C,G,T A called base. - N A no-call. - - A deleted base. - . A non-snp variation. - - The output is a tab-delimited file consisting of the columns of the - original genotypes file, plus the following additional columns: - - Reference The reference base at the given position. - VariantFile The calls made by the variant file, one per allele. - The character codes are the same as is described for - the Genotypes column. - DiscordantAlleles (Only if Genotypes is present) The number of - Genotypes alleles that are discordant with calls in - the VariantFile. If the VariantFile is described as - haploid at the given position but the Genotypes is - diploid, then each genotype allele is compared - against the haploid call of the VariantFile. - NoCallAlleles (Only if Genotypes is present) The number of - Genotypes alleles that were no-called by the - VariantFile. If the VariantFile is described as - haploid at the given position but the Genotypes is - diploid, then a VariantFile no-call is counted twice. - - The verbose output is a tab-delimited file consisting of the columns of the - original genotypes file, plus the following additional columns: - - Reference The reference base at the given position. - VariantFile The call made by the variant file for one allele (there is - a line in this file for each allele). The character codes - are the same as is described for the Genotypes column. - [CALLS] The rest of the columns are pasted in from the VariantFile, - describing the variant file line used to make the call. - - The stats output is a comma-separated file with several tables describing - the results of the snp comparison, for each diploid genotype. The tables - all describe the comparison result (column headers) versus the genotype - classification (row labels) in different ways. The "Locus classification" - tables have the most detailed match classifications, while the "Locus - concordance" tables roll these match classifications up into "discordance" - and "no-call". A locus is considered discordant if it is discordant for - either allele. A locus is considered no-call if it is concordant for both - alleles but has a no-call on either allele. The "Allele concordance" - describes the comparison result on a per-allele basis. - - OPTIONS - -h [ --help ] - Print this help message. - - --reference arg - The input crr file. - - --variants arg - The input variant file. - - --genotypes arg - The input genotypes file. - - --output-prefix arg - The path prefix for all output reports. - - --reports arg (=Output,Verbose,Stats) - Comma-separated list of reports to generate. A report is one of: - Output The output genotypes file. - Verbose The verbose output file. - Stats The stats output file. - - SUPPORTED FORMAT_VERSION - 0.3 or later - - diff -r e2e802a079f0 -r a006c9df8747 cgatools_suite/tools/cgatools/testing.pl --- a/cgatools_suite/tools/cgatools/testing.pl Thu Jun 07 14:32:32 2012 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,10 +0,0 @@ -#!/usr/bin/perl - -print "$0 @ARGV\n"; -open OUT, ">@ARGV[0]"; -print "test1 ok\ttest1 ok\ntest1 ok\ttest1 ok\n"; -print OUT "test ok\ttest ok\ntest ok\ttest ok\n"; -close OUT; -open OUT, ">somefile"; -print OUT "test2 ok\ttest2 ok\ntest2 ok\ttest2 ok\n"; -close OUT; \ No newline at end of file diff -r e2e802a079f0 -r a006c9df8747 cgatools_suite/tools/cgatools/testvariants.xml --- a/cgatools_suite/tools/cgatools/testvariants.xml Thu Jun 07 14:32:32 2012 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,157 +0,0 @@ - - - - test for the presence of variants - - - cgatools - - - - cgatools testvariants - --beta - --reference ${crr.fields.path} - --output $output - --input $listing - --variants - #if $file_types.data_sources.data_source == "in" - #for $v in $file_types.data_sources.varfiles - ${v.input} - #end for - #else - `cat $file_types.data_sources.varlist` - #end if - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -This tool uses the cgatools testvariants to test variant or mastervar files for the presence of variants. - -cgatools: http://sourceforge.net/projects/cgatools/files/ - ------ - -**cgatools Manual**:: - - COMMAND NAME - testvariants - Tests variant files for presence of variants. - - DESCRIPTION - Tests variant files for presence of variants. The output is a tab-delimited - file consisting of the columns of the input variants file, plus a column - for each assembly results file that contains a character code for each - allele. The character codes have meaning as follows: - - 0 This allele of this genome is consistent with the reference at this - locus but inconsistent with the variant. - 1 This allele of this genome has the input variant at this locus. - N This allele of this genome has no-calls but is consistent with the - input variant. - - OPTIONS - -h [ --help ] - Print this help message. - - --beta - This is a beta command. To run this command, you must pass the --beta - flag. - - --reference arg - The reference crr file. - - --input arg (=STDIN) - The input variants to test for. - - --output arg (=STDOUT) - The output file (may be omitted for stdout). - - --variants arg - The input variant files (may be passed in as arguments at the end of - the command). - - SUPPORTED FORMAT_VERSION - 0.3 or later - - diff -r e2e802a079f0 -r a006c9df8747 cgatools_suite/tools/cgatools/varfilter.xml --- a/cgatools_suite/tools/cgatools/varfilter.xml Thu Jun 07 14:32:32 2012 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,184 +0,0 @@ - - - - copies input file, applying filters. - - - cgatools - - - - varfilter_wrapper.pl - --reference $crr.fields.path - --output $output - --input $file_types.data_sources.input - #for $f in $filters - --zygosity $f.zygosity - --vartype $f.vartype - --varscorevaf x$f.varscorevaf - --varscoreeaf x$f.varscoreeaf - --varquality $f.varquality - #end for - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -This tool copies input var file or masterVar file to output, applying specified filters. - -cgatools: http://sourceforge.net/projects/cgatools/files/ - ------ - -**cgatools Manual**:: - - COMMAND NAME - varfilter - Copies input var file or masterVar file to output, applying - specified filters. - - DESCRIPTION - Copies input var file or masterVar file to output, applying specified - filters (which are available to all cgatools commands that read a var file - or masterVar file as input). Filters are specified by appending the filter - specification to the var file name on the command line. For example: - - /path/to/var.tsv.bz2#varQuality!=VQHIGH - - The preceding example filters out any calls marked as VQLOW. The filter - specification follows the "#" sign, and consists of a list of filters to - apply, separated by a comma. Each filter is a colon-separated list of call - selectors. Any scored call that passes all the colon-separated call - selectors for one or more of the comma-separated filters is turned into a - no-call. The following call selectors are available: - - hom Selects only calls in homozygous loci. - het Selects any scored call not selected by the hom selector. - varType=XX Selects calls whose varType is XX. - varScoreVAF<XX Selects calls whose varScoreVAF<XX. - varScoreEAF<XX Selects calls whose varScoreEAF<XX. - varQuality!=XX Selects calls whose varQuality is not XX. - - Here is an example that filters homozygous SNPs with varScoreVAF < 25 and - heterozygous insertions with varScoreEAF < 50: - - - '/path/to/var.tsv.bz2#hom:varType=snp:varScoreVAF<25,het:varType=ins:varScoreEAF<50' - - - OPTIONS - -h [ --help ] - Print this help message. - - --beta - This is a beta command. To run this command, you must pass the --beta flag. - - --reference arg - The reference crr file. - - --input arg - The input var file or masterVar file (typically with filters specified). - - --output arg (=STDOUT) - The output file (may be omitted for stdout). - - SUPPORTED FORMAT_VERSION - 0.3 or later - - diff -r e2e802a079f0 -r a006c9df8747 cgatools_suite/tools/cgatools/varfilter_wrapper.pl --- a/cgatools_suite/tools/cgatools/varfilter_wrapper.pl Thu Jun 07 14:32:32 2012 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,56 +0,0 @@ -#!/usr/bin/perl -use strict; -use Getopt::Long; -use vars qw($opt_reference $opt_input $opt_output @opt_zygosity @opt_vartype @opt_varscorevaf @opt_varscoreeaf @opt_varquality); -$| = 1; # set autoflush to screen - -# This is a wrapper for the cgatools varfilter function to run cgatools varfilter in Galaxy. -# The wrapper generates the filter(s) in the correct format to be used with the input file. -# written 6-1-2012 by bcrain@completegenomics.com - - -#print join("\n", @ARGV), "\n"; -&GetOptions("reference=s", "input=s", "output=s", "zygosity=s@", "vartype=s@", "varscorevaf=s@", "varscoreeaf=s@", "varquality=s@"); - -my $append = ''; - -for (my $i = 0; $i <= $#opt_zygosity; $i ++) -{ - my $filter = ''; - unless ($opt_zygosity[$i] eq 'NA') {$filter = $opt_zygosity[$i];} - unless ($opt_vartype[$i] eq 'NA') - { - $filter ne '' and $filter .= ':'; - $filter .= 'varType=' . $opt_vartype[$i]; - } - unless ($opt_varscorevaf[$i] eq 'x') - { - $filter ne '' and $filter .= ':'; - $opt_varscorevaf[$i] =~ s/^x//; - $filter .= 'varScoreVAF<' . $opt_varscorevaf[$i]; - } - unless ($opt_varscoreeaf[$i] eq 'x') - { - $filter ne '' and $filter .= ':'; - $opt_varscoreeaf[$i] =~ s/^x//; - $filter .= 'varScoreEAF<' . $opt_varscoreeaf[$i]; - } - unless ($opt_varquality[$i] eq 'NA') - { - $filter ne '' and $filter .= ':'; - $filter .= 'varQuality!=' . $opt_varquality[$i]; - } - - if ($filter ne '') - { - if ($append eq '') {$append = '#' . $filter;} - else {$append .= ',' . $filter;} - } -} -print "cgatools varfilter ---beta ---reference $opt_reference ---output $opt_output ---input '${opt_input}${append}'\n"; - -`cgatools varfilter --beta --reference $opt_reference --output $opt_output --input '${opt_input}${append}'`; \ No newline at end of file diff -r e2e802a079f0 -r a006c9df8747 cgatools_suite/tools/cgi_scripts/.DS_Store Binary file cgatools_suite/tools/cgi_scripts/.DS_Store has changed diff -r e2e802a079f0 -r a006c9df8747 cgatools_suite/tools/cgi_scripts/._Calculate_TestVariants_Variant_Frequencies.xml Binary file cgatools_suite/tools/cgi_scripts/._Calculate_TestVariants_Variant_Frequencies.xml has changed diff -r e2e802a079f0 -r a006c9df8747 cgatools_suite/tools/cgi_scripts/._Calculate_TestVariants_Variant_Frequencies_0_1_0.pl Binary file cgatools_suite/tools/cgi_scripts/._Calculate_TestVariants_Variant_Frequencies_0_1_0.pl has changed diff -r e2e802a079f0 -r a006c9df8747 cgatools_suite/tools/cgi_scripts/._List_Unique_Variants.xml Binary file cgatools_suite/tools/cgi_scripts/._List_Unique_Variants.xml has changed diff -r e2e802a079f0 -r a006c9df8747 cgatools_suite/tools/cgi_scripts/._List_Unique_Variants_2_1_0.pl Binary file cgatools_suite/tools/cgi_scripts/._List_Unique_Variants_2_1_0.pl has changed diff -r e2e802a079f0 -r a006c9df8747 cgatools_suite/tools/cgi_scripts/Calculate_TestVariants_Variant_Frequencies.xml --- a/cgatools_suite/tools/cgi_scripts/Calculate_TestVariants_Variant_Frequencies.xml Thu Jun 07 14:32:32 2012 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,67 +0,0 @@ - - - in cgatools-testvariants file - - - Calculate_TestVariants_Variant_Frequencies_0_1_0.pl - --Input $input - --First_Genome_Field_Nr $first_col - --Last_Genome_Field_Nr $last_col - --Output1 $output1 - --Output2 $output2 - - - - - - - - - - - - - - - - - - -**What it does** - -This tool calculates the allele frequencies for all variants present in the testvariant file. - ------ - -**Instructions**:: - - Calculate the frequencies of variants in a testvariants output file - Two values calculated: - Frequency vs all alleles - Frequency vs called alleles - - Input: testvariants file - Outputs: - All data to *-Freq.tsv, including scores and quals - vars and freqs to *-Freq_Short.tsv - Exceptions to *-Freq_Log - Stats to *-Freq_Stats - - - perl Calculate_TestVariants_Variant_Frequencies_0_0_3.pl \ - --Input input_file \ - --First_Genome_Field_Nr col_nr1 \ - --Last_Genome_Field_Nr col_nr2 - --Output1 output1 \ - --Output2 output_short \ - eg - perl Calculate_TestVariants_Variant_Frequencies_0_0_3.pl \ - --Input /data/Family_Quartet_testvariants.tsv \ - --Output /data/Family_Quartet_testvariants - --First_Genome_Field_Nr 9 \ - --Last_Genome_Field_Nr 11 - --Output1 /data/Family_Quartet_testvariants - --Output2 /data/Family_Quartet_testvariants_short - - - diff -r e2e802a079f0 -r a006c9df8747 cgatools_suite/tools/cgi_scripts/Calculate_TestVariants_Variant_Frequencies_0_1_0.pl --- a/cgatools_suite/tools/cgi_scripts/Calculate_TestVariants_Variant_Frequencies_0_1_0.pl Thu Jun 07 14:32:32 2012 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,271 +0,0 @@ -#!/usr/bin/perl -use strict; -#use feature "say"; -#use File::Basename; -$| = 1; - -# Get_TestVariants_Variant_Frequencies -# Calculate the frequencies of variants in a testvariants output file -# Two values calculated: -# Frequency vs all alleles -# Frequency vs called alleles - -# Input is a testvariants file -# Outputs: -# All data to *-Freq.tsv, including scores and quals -# vars and freqs to *-Freq_Short.tsv -# Exceptions to *-Freq_Log -# Stats to *-Freq_Stats - -# Format: -# perl prog file dir -# ie -# perl Get_TestVariants_Variant_Frequencies \ -# --Input input_file \ -# --First_Genome_Field_Nr col_nr1 \ -# --Last_Genome_Field_Nr col_nr2 -# --Output1 output1 \ -# --Output2 output2 - -# eg -# perl /perl/Get_TestVariants_Variant_Frequencies_0_0_1.pl \ -# --Input /data/Family_Quartet_testvariants.tsv \ -# --First_Genome_Field_Nr 9 \ -# --Last_Genome_Field_Nr 11 -# --Output1 output1 \ -# --Output2 output2 - - -# Rick Tearle 2010-11 - -my $Time -= time; # start time -my $Debug = 0; - -# Parsing and storing input parameters -# Only childfields can be repeated -print "$0 @ARGV\nProcessing input parameters\n"; -my %ExpectedParams = GetExpectedParams (); -my %EnteredParams = GetEnteredParams (); - -# Setting up prog paras from input paras -my $FileIn = $EnteredParams{input}; -unless (-f $FileIn) {die "Testvariants input file $FileIn not found\n";} # requires existing file -#my $FileOut = $EnteredParams{output}; # -#$DirectoryOut =~ s/\/$//; # remove trailing slash if present -#unless (-d $DirectoryOut) {die "Output directory $DirectoryOut not found\n";} # requires existing file -#print "$FileIn\n$DirectoryOut\n"; -#$FileIn =~ /(^.+\/)(.+?)\./; # get filename without path and without extensions - -# my $FileOut1 = $FileOut."-Freq.tsv"; -# my $FileOut2 = $FileOut."-Freq_Short.tsv"; -# my $FileOut3 = $FileOut."-Freq_Stats.tsv"; -# my $FileOut4 = $FileOut."-Freq_Log.tsv"; - -print "\nOpening Input File:\n\t$FileIn\n"; -my $IN = OpenFile ($FileIn); # open the file with correct file format - -#print "\nOpening Output Files:\n\t$FileOut1\n\t$FileOut2\n\t$FileOut3\n\t$FileOut4\n"; #exit; -open my $OUT1, ">", $EnteredParams{output1}; -open my $OUT2, ">", $EnteredParams{output2}; -#open my $OUT3, ">", $EnteredParams{output3}; -#open my $OUT4, ">", $EnteredParams{output4}; - -# Get col header and genomes fields -my $ColHeader = <$IN>; # get col header -chomp $ColHeader; -my @ColHeader = split /\t/, $ColHeader; -my $StartGenomes = $EnteredParams{first_genome_field_nr} - 1; # first column with testvariants data, 1 based -> 0 based -my $StopGenomes = $EnteredParams{last_genome_field_nr} - 1; # first column with testvariants data, 1 based -> 0 based -if ($StartGenomes < 0) {die "No valid entry for First_Genome_Field_Nr, must be 1 or greater\n";} -if ($StopGenomes < 0) {die "No valid entry for Last_Genome_Field_Nr, must be 1 or greater\n";} -if ($StartGenomes > $StopGenomes) {die "Last_Genome_Field_Nr must be greater than or equal to First_Genome_Field_Nr\n";} -if ($StartGenomes > int @ColHeader) {die "First_Genome_Field_Nr > number of fields in column header\n";} -if ($StopGenomes > int @ColHeader) {die "Last_Genome_Field_Nr > number of fields in column header\n";} -my $NrGenomes = $StopGenomes - $StartGenomes + 1; -#print "$StartGenomes\t$StopGenomes\n"; #exit; -#print "First Genome Field:\n\t$ColHeader[$StartGenomes]\n"; -#print "Last Genome Field:\n\t$ColHeader[$StopGenomes]\n\n"; - -# print column headers -print $OUT1 join("\t",@ColHeader),"\tAllFreq\tCalledFreq\n"; -print $OUT2 join("\t",@ColHeader[0..7]),"\tAllFreq\tCalledFreq\n"; -print join("\t",@ColHeader),"\n"; -print "First Genome Field: $ColHeader[$StartGenomes]\n"; -print "Last Genome Field: $ColHeader[$StopGenomes]\n"; -print "Nr Genomes: $NrGenomes\n\n"; - -print "\nProcessing Variants....\n"; -my $VariantCount = 0; # variant locus counter, not used -my %AllFreqCounts; # storing histogram of all freq counts -my %CalledFreqCounts; # storing histogram of called freq counts -my $Warnings; -while (<$IN>) -{ - # testvariants fields: variantId chromosome begin end varType reference alleleSeq xRef GS000000XX1-ASM GS000000XX2-ASM [GS000000XXN-ASM] - my $Line = $_; # save line for output below - chomp $Line; - my @F = split /\t/, $Line; # split in to fields - $VariantCount++; # increment variant counter - my $UseFields = join ("",@F[$StartGenomes..$StopGenomes]); # get genome fields as string, to count 0s and 1s - my $Count1 = () = $UseFields =~ /1/g; # count the number of 1s - my $Count0 = () = $UseFields =~ /0/g; # count the number of 0s - my $CountN = () = $UseFields =~ /N/g; # count the number of Ns - my $NrAlleles = $Count1 + $Count0 + $CountN; # total count - unless ($NrAlleles == $NrGenomes *2 or $NrAlleles == $NrGenomes) # count does not match expected for diploid/haploid locus - { - print "$NrAlleles alleles for variant ",join(" ",@F[0..7]),"\n"; # log warning - #print "Expected $NrGenomes or ",$NrGenomes*2," alleles depending on ploidy of locus\n"; - #if ($Warnings++ > 10) {die "Have found $Warnings exceptions for this file, termnating processing\n";} # terminate if too many warnings - } - my $AllFreq = sprintf("%0.3f",$Count1/$NrAlleles); # calculate freq of 1s vs all alleles - my $CalledFreq = sprintf("%0.3f",0); - if ($Count1+$Count0) {$CalledFreq = sprintf("%0.3f",$Count1/($Count1+$Count0));} # calculate freq of 1s vs called alleles, if there are any - $AllFreqCounts{$AllFreq}++; # increment all freq histogram - $CalledFreqCounts{$CalledFreq}++; # increment called freq histogram - #print "$Line\n$AlleleCount\t$Count1\t$Count0\t$AllFreq\t$CalledFreq\n"; #exit; - print $OUT1 "$Line\t$AllFreq\t$CalledFreq\n"; # output full testvariants plus frequencies for this var - print $OUT2 join("\t",@F[0..7]),"\t$AllFreq\t$CalledFreq\n"; # output just var info plus frequencies for this var - #exit if $VariantCount > 20; -} -close $OUT1; -close $OUT2; - -# Print frequency histograms -print "Nr Variants at each Frequency (All):\nFreq\tCount\n"; # header -foreach my $Freq (sort {$a <=> $b} keys %AllFreqCounts) {print "$Freq\t$AllFreqCounts{$Freq}\n";} - -print "\nNr Variants at each Frequency (Called):\nFreq\tCount\n"; # header -foreach my $Freq (sort {$a <=> $b} keys %CalledFreqCounts) {print "$Freq\t$CalledFreqCounts{$Freq}\n";} - -$Time += time; -print "\ntime $Time\n"; - -########################################################################### -# SUBS # -########################################################################### - -sub GetExpectedParams -{ - my %Hash = - ( - "input" => -1, - "output_dir" => -1, - ); # store parameters and values - return %Hash; -} - -sub GetEnteredParams -{ - # Processing @ARGV - my %Hash; - - my @ARGVs = split /--/, join (" ",@ARGV); # split args on --, into array - #print "Start\n", join ("\n",@ARGVs),"\n",int @ARGVs - 1,"\n\n" if $Debug; - #print "Key\tVal\n" if $Debug; #exit; - for my $n (1..$#ARGVs) # parse each - { - $ARGVs[$n] =~ s/\s+$//; # remove any trailing spaces - my ($Key, $Val) = split / /, $ARGVs[$n], 2; # put first element into key, any other elements into val - $Key = lc $Key; - $Hash{$Key} = $Val; # make a hash entry out of key and val - #print "$Key\t$EnteredParams{$Key}\n" if $Debug; - } - #print int(keys %Hash),"\n" if $Debug; - #foreach my $Arg (keys %Hash) {print "Arg: $Arg\t",$ExpectedParams{$Arg},"\n";} - #print "Arg string:\t",join (" ",@ARGV),"\n" if $Debug; - #exit if $Debug; - return %Hash; # hash now has each -- entry param, with associated values -} - -sub SaveArrayAsString -{ - my $FH = shift; - my $Fields = shift; - #print "$Fields\n"; - print $FH join("\t",@$Fields),"\n"; -} - -sub ConcatenateVariants -{ - my $ArrayIn = shift; # ptr to array - my $StateFieldNr = shift; # field to process - #print int(@$ArrayIn),"\n"; - my @ArrayOut; # array to store records out - my $Nr = -1; - foreach my $Entry (@$ArrayIn) - { - } - return \@ArrayOut; # return ptr to array -} - -sub LoadStateRecord -{ - my $Out = shift; - my $In = shift; - my $StateFieldNr = shift; - - $Out->{State} = $$In[$StateFieldNr]; # get state for new record - $Out->{Chr} = $$In[1]; # get chr - $Out->{Begin} = $$In[2]; # get begin of state range - $Out->{End} = $$In[3]; # get current end of state range - $Out->{Records}++; # record added to new count -} - -sub OpenFile -{ - my $File = shift; - my $FH; - open ($FH, "$File") or die ("$!: can't open file $File"); - return $FH; -} - -sub OpenFileold -{ - my $File = shift; - my $FH; - - if ($File =~ /.bz2$/) - { - open ($FH, "bzcat $File |") or die ("$!: can't open file $File"); - } - elsif ($File =~ /.gz$/) - { - open ($FH, "gunzip -c $File |") or die ("$!: can't open file $File"); - } - elsif ($File =~ /.tsv$/) - { - open ($FH, "cat $File |") or die ("$!: can't open file $File"); - } - else - { - die ("$!: do not recognise file type $File"); - } - return $FH; -} - -sub LoadNewRecord -{ - my $In = shift; - my $Out = shift; - $Out->{Chr} = $In->{Chr}; - $Out->{State} = $In->{State}; - $Out->{Begin} = $In->{Begin}; - $Out->{End} = $In->{End}; - $Out->{Records} = $In->{Records}; -} - -sub NewStateRecord -{ - my $Record = - { - Chr => "", - Begin => -1, - End => -1, - State => "", - Records => 0, - MIEs => 0, - StateErrors => 0, - Length => -1, - }; - return $Record; -} diff -r e2e802a079f0 -r a006c9df8747 cgatools_suite/tools/cgi_scripts/List_Unique_Variants.xml --- a/cgatools_suite/tools/cgi_scripts/List_Unique_Variants.xml Thu Jun 07 14:32:32 2012 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,323 +0,0 @@ - - - with annotations from gene or var files - - - #if $file_types.file_type =="var2" - List_Unique_Variants_2_1_0.pl --File_Type V --Output_File $output - --Var_Type $file_types.variants - $file_types.scoresVAF - $file_types.scoresEAF - $file_types.varQuality - #if $file_types.data_sources.data_source == "in" - #for $v in $file_types.data_sources.varfiles - --Input_File ${v.input} - #end for - #else - `cat $file_types.data_sources.varlist` - #end if - - #else if $file_types.file_type =="var1" - List_Unique_Variants_2_1_0.pl --File_Type V --Output_File $output - --Var_Type $file_types.variants - $file_types.scores - #if $file_types.data_sources.data_source == "in" - #for $v in $file_types.data_sources.varfiles - --Input_File ${v.input} - #end for - #else - `cat $file_types.data_sources.varlist` - #end if - - #else if $file_types.file_type =="gene" - List_Unique_Variants_2_1_0.pl --File_Type G --Output_File $output - --Var_Type $file_types.variants - --Component $file_types.component - --Impact $file_types.impact - #if $file_types.data_sources.data_source == "in" - #for $g in $file_types.data_sources.genefiles - --Input_File ${g.input} - #end for - #else - `cat $file_types.data_sources.genelist` - #end if - #end if - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -This tool identifies all called variants present in the var or gene files and generates annotated variant list. - ------ - -**Instructions**:: - - List Unique Variants for Pipeline 1.x and 2.x - [Uses header if available, checks for position of xref field if not] - Take one or more var or gene files - Extract a non-redundant set of variants - - For var files: - The fields used to define non-redundant variants are are: - chromosome begin end varType reference alleleSeq xRef - User can nominate class(es) of varType to filter on - Outputs varScoreEAF, varScoreVAF and varQuality as a default but user can turn - them off (separately) - Scores and qualities stored in separate fields, all values for a variant across - a set of genomes. - Values for different genomes separated by ':', for two hom entries for the same - genome by '|' - Output is accepted by testvariants to generate a variant table, all fields kept - in testvariants output - - For gene files: - The fields used to define non-redundant gene variants are: - chromosome begin end varType reference call xRef geneId mrnaAcc proteinAcc symbol - orientation component componentIndex codingRegionKnown impact nucleotidePos - proteinPos annotationRefSequence sampleSequence genomeRefSequence - User can nominate class(es) of varType, component or impact to filter on - All gene entries kept ie multiple entries if multiple transcripts - - NB Now treating xref as a separate component in var recs, as it is not consistent - between X and Y vars - Not fixed for gene recs yet - - perl List_Unique_Variants_2_0_11.pl - --File_Type [V|G] - --Input_File input_file_1 [set of var or gene files] - --Input_File input_file_2 - ... - --Input_File input_file_n - --Output_File filename - --Var_Type [For both file types, 'All' or any value from the varType field, - multiple values allowed, separated by comma] - --Component [Gene file specific,'All' or any value from component field of gene - file, multiple allowed; 'All" is default] - --Impact All [Gene file specific,'All' or any value from impact field of gene - file, multiple allowed; 'All" is default] - --Scores [1.x var file specific, yes|no, yes is default] - --Scores_VAF [2.0 var file specific, yes|no, yes is default] - --Scores_EAF [2.0 var file specific, yes|no, yes is default] - --Score_Qualities [yes|no, yes is default] - eg - perl List_Unique_Variants_2_0_11.pl \ - --File_Type V \ - --Input_File /Yoruban_Trio_1100_37/GS19238-1100-37/GS00028-DNA_A01/ASM/gene-GS19238-1100-37-ASM.tsv.bz2 \ - --Input_File /Yoruban_Trio_1100_37/GS19239-1100-37/GS00028-DNA_B01/ASM/gene-GS19239-1100-37-ASM.tsv.bz2 \ - --Input_File /Yoruban_Trio_1100_37/GS19240-1100-37/GS00028-DNA_C01/ASM/gene-GS19240-1100-37-ASM.tsv.bz2 \ - --Output_File /Users/rtearle/Documents/TBF/YRI_Trio_Protein_Coding.tsv \ - --Var_Type All - --Component All - --Impact All - --Scores_VAF yes \ - --Scores_EAF yes \ - --Score_Qualities yes - - var fields - 1.x locus ploidy haplotype chromosome begin end varType reference alleleSeq - totalScore hapLink xRef - 2.0 locus ploidy allele chromosome begin end varType reference alleleSeq - varScoreVAF varScoreEAF varQuality hapLink xRef - - gene fields - 1.x index locus allele chromosome begin end varType reference call xRef geneId - mrnaAcc proteinAcc symbol orientation component componentIndex - codingRegionKnown impact nucleotidePos proteinPos annotationRefSequence - sampleSequence genomeRefSequence - 2.0 index locus allele chromosome begin end varType reference call xRef geneId - mrnaAcc proteinAcc symbol orientation component componentIndex hasCodingRegion - impact nucleotidePos proteinPos annotationRefSequence sampleSequence - genomeRefSequence pfam - - Parsing and storing input parameters - Only input_file fields can be repeated - input paramaters are case insensitive - - - - diff -r e2e802a079f0 -r a006c9df8747 cgatools_suite/tools/cgi_scripts/List_Unique_Variants_2_1_0.pl --- a/cgatools_suite/tools/cgi_scripts/List_Unique_Variants_2_1_0.pl Thu Jun 07 14:32:32 2012 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,583 +0,0 @@ -#!/usr/bin/perl -use strict; -use File::Basename; -$| = 1; - -# List Unique Variants for Pipeline 1.x and 2.x -# [Uses header if available, checks for position of xref field if not] -# Take one or more var or gene files -# Extract a non-redundant set of variants - -# For var files: -# The fields used to define non-redundant variants are are: -# chromosome begin end varType reference alleleSeq xRef -# User can nominate class(es) of varType to filter on -# Outputs varScoreEAF, varScoreVAF and varQuality as a default but user can turn them off (separately) -# Scores and qualities stored in separate fields, all values for a variant across a set of genomes. -# Values for different genomes separated by ':', for two hom entries for the same genome by '|' -# Output is accepted by testvariants to generate a variant table, all fields kept in testvariants output - -# For gene files: -# The fields used to define non-redundant gene variants are: -# chromosome begin end varType reference call xRef geneId mrnaAcc proteinAcc symbol orientation component componentIndex codingRegionKnown impact nucleotidePos proteinPos annotationRefSequence sampleSequence genomeRefSequence -# User can nominate class(es) of varType, component or impact to filter on -# All gene entries kept ie multiple entries if multiple transcripts - -# NB Now treating xref as a separate component in var recs, as it is not consistent between X and Y vars -# Not fixed for gene recs yet - -# perl List_Unique_Variants_2_0_9.pl -# --File_Type [V|G] -# --Input_File input_file_1 [set of var or gene files] -# --Input_File input_file_2 -# ... -# --Input_File input_file_n -# --Output_File filename -# --Var_Type [For both file types, 'All' or any value from the varType field, multiple values allowed, separated by comma] -# --Component [Gene file specific,'All' or any value from component field of gene file, multiple allowed; 'All" is default] -# --Impact All [Gene file specific,'All' or any value from impact field of gene file, multiple allowed; 'All" is default] -# --Scores [1.x var file specific, yes|no, yes is default] -# --Scores_VAF [2.0 var file specific, yes|no, yes is default] -# --Scores_EAF [2.0 var file specific, yes|no, yes is default] -# --Score_Qualities [yes|no, yes is default] -# eg -# perl /Users/rtearle/Documents/Programming/Perl/Scripts/Dev/List_Unique_Variants_2_0_4 \ -# --File_Type V \ -# --Input_File /Yoruban_Trio_1100_37/GS19238-1100-37/GS00028-DNA_A01/ASM/gene-GS19238-1100-37-ASM.tsv.bz2 \ -# --Input_File /Yoruban_Trio_1100_37/GS19239-1100-37/GS00028-DNA_B01/ASM/gene-GS19239-1100-37-ASM.tsv.bz2 \ -# --Input_File /Yoruban_Trio_1100_37/GS19240-1100-37/GS00028-DNA_C01/ASM/gene-GS19240-1100-37-ASM.tsv.bz2 \ -# --Output_File /Users/rtearle/Documents/TBF/YRI_Trio_Protein_Coding.tsv \ -# --Var_Type All -# --Component All -# --Impact All -# --Scores_VAF yes \ -# --Scores_EAF yes \ -# --Score_Qualities yes - -# var fields -# 1.x -# locus ploidy haplotype chromosome begin end varType reference alleleSeq totalScore hapLink xRef -# 2.0 -# locus ploidy allele chromosome begin end varType reference alleleSeq varScoreVAF varScoreEAF varQuality hapLink xRef - -# gene fields -# 1.x index locus allele chromosome begin end varType reference call xRef geneId mrnaAcc proteinAcc symbol orientation -# component componentIndex codingRegionKnown impact nucleotidePos proteinPos annotationRefSequence sampleSequence genomeRefSequence -# 2.0 index locus allele chromosome begin end varType reference call xRef geneId mrnaAcc proteinAcc symbol orientation -# component componentIndex hasCodingRegion impact nucleotidePos proteinPos annotationRefSequence sampleSequence genomeRefSequence pfam - -# Parsing and storing input parameters -# Only input_file fields can be repeated -# input paramaters are case insensitive - -print "$0 @ARGV\nProcessing input parameters\n"; -my $NrParams; -my %ExpectedParams = GetExpectedParams (); # list of expected parms -my %EnteredParams = GetEnteredParams (); # list of entered params - -# Input Files -my $FileType = $EnteredParams{file_type}; -if ($FileType ne "V" and $FileType ne "G") {die "File Type must be 'V' or 'G', not '$FileType'\n";} -my $FilesIn = $EnteredParams{input_file}; # ptr to list of input files -print "File Type: $FileType\nInput Files:\n"; -my $NrInputFiles = int(@$FilesIn); -foreach my $File (@$FilesIn) {print "$File\n";} # requires existing files -foreach my $File (@$FilesIn) {unless (-f $File) {die "Input file $File not found\n";}} # requires existing files -for my $n (0.. $NrInputFiles-2) # look for duplicates in list - {for my $m ($n+1.. $NrInputFiles-1) {if ($$FilesIn[$n] eq $$FilesIn[$m] ) {die "File $$FilesIn[$n] is repeated in input file list\n";}}} - -# Output Dir -#my $DirectoryOut = $EnteredParams{output_dir}; # output dir -#$DirectoryOut =~ s/\/$//; # remove trailing slash if present -#unless (-d $DirectoryOut) {mkdir $DirectoryOut or die "Cannot find/create output directory $DirectoryOut\n";} # uses existing dir or makes a new dir if it can - -# Ouput File -my $FileOut = $EnteredParams{output_file}; # output file -print "FileOut: $FileOut\n"; -$FileOut =~ /(^.+\/)?(.+$)/; # split in to path and filename -my $DirectoryOut = $1; # assign path # NEED MORE TESTING EG EMPTY PATH # -$FileOut = $2; # assign file prefix -print "File: $FileOut\nDir: $DirectoryOut\n"; #exit; -# if (-f $DirectoryOut.$FileOut) # ouput file exists, create a new one based on the name -# { -# print "Output file $FileOut exists, modifying to unique file name "; -# $FileOut =~ /^(.+?)\./; # find name without extensions -# my $Stub = $1; # set stub to name without extensions -# $FileOut =~ /(\..+)?$/; # get extension(s) -# my $Ext = $1; # set ext to extensions -# my $n = 1; # n will increment to find a unique name -# my $Suff = ""; # suff tracks n -# while (-f $DirectoryOut.$Stub.$Suff.$Ext) {$Suff = "-$n"; $n++;} # loop till we have a new unique filename -# $FileOut = $Stub.$Suff.$Ext; # file out now has same name, same extensions, but also -n at the end of the name, making it unique -# print "$FileOut\n"; -# } - -#print "Files\n",join("\n",@$FilesIn),"\n\n"; -#print "Ouput Dir\n$DirectoryOut\n"; -#print "Ouput File\n$FileOut\n"; - -# Extract Header & Column Header -my $IN = OpenFile ($$FilesIn[0]); # open the first file with correct file format -my $Header = GetHeaderAsString ($IN); # get header -unless ($Header) {close $IN; $IN = OpenFile ($$FilesIn[0]);} # if there is no header, close and reopen file, ie start file again -my $ColHeader = <$IN>; # get col header, first remaining line -chomp $ColHeader; - -# Get version if filetype is var - needed because there are new fields in 2.0 and posn of xRef changed -my ($Version, $XrefField); -if ($FileType eq "V") -{ - ($Version, $XrefField) = GetVersion ($Header, $ColHeader); - #print "$Version $XrefField\n"; exit; - unless ($Version) {die "Cannot determine format version of first file in list\nNeed either a native Complete header or a native Complete Column Header with an xRef field\n";} -} - -# Shared input params -my $OutputVarTypes = lc $EnteredParams{var_type} || $ExpectedParams{var_type}; # var types listed in file in lc -$OutputVarTypes =~ s/\,/\|/g; # create regex string -$OutputVarTypes =~ s/\,| //; # remove extraneous commas, spaces - -# Input Params for var file -my ($KeepScoresVAF, $KeepScoresEAF, $KeepQuals, $KeepScores, $VarExtras); -if ($FileType eq "V") -{ - if ($Version == 2) - { - $KeepQuals = lc $EnteredParams{score_quality} || $ExpectedParams{score_quality}; # keep scoresQuality for 2.0 - $KeepQuals = 1 if $KeepQuals eq "yes"; # converting to boolean - $KeepScoresVAF = lc $EnteredParams{scores_vaf} || $ExpectedParams{scores_vaf}; # keep scoresVAF for 2.0 - $KeepScoresVAF = 1 if $KeepScoresVAF eq "yes"; # converting to boolean - $KeepScoresEAF = lc $EnteredParams{scores_eaf} || $ExpectedParams{scores_eaf}; # keep scoresEAF for 2.0 $KeepQuals = lc $EnteredParams{score_qualities} || $ExpectedParams{score_qualities}; # keep scoresQuality for 2.0 - $KeepScoresEAF = 1 if $KeepScoresEAF eq "yes"; # converting to boolean - } - else # Version 1 - { - $KeepScores = lc $EnteredParams{scores} || $ExpectedParams{scores}; # keep scores for 1.x - $KeepScores = 1 if $KeepScores eq "yes"; # converting to boolean - } - $VarExtras = 1 if $KeepScoresVAF or $KeepScoresEAF or $KeepQuals or $KeepScores; # flag to process var file for scores info -} - -# Input Params for gene file -my $OutputComponents = uc $EnteredParams{component} || $ExpectedParams{component}; # components listed in file in uc -my $OutputImpacts = uc $EnteredParams{impact} || $ExpectedParams{impact}; # impacts listed in file in uc - -# Loading chr nrs, setting up var hash -my @ChrNames = ('chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8', 'chr9', 'chr10', - 'chr11', 'chr12', 'chr13', 'chr14', 'chr15', 'chr16', 'chr17', 'chr18', 'chr19', - 'chr20', 'chr21', 'chr22', 'chrX', 'chrY', 'chrM'); # using this array forces the output order of chrs into the correct order -my %Vars; # hash to store var records in an array for each chr -foreach my $Chr (@ChrNames) {$Vars{$Chr} = {};} # print "$Chr\t";} # set up hash of hashes, one for each chr -#print "\n"; #exit; - -# Create ouput col header -if ($FileType eq "V") -{ - # 1.x locus ploidy haplotype chromosome begin end varType reference alleleSeq totalScore hapLink xRef - # 2.0 locus ploidy allele chromosome begin end varType reference alleleSeq varScoreVAF varScoreEAF varQuality hapLink xRef - my @Fields = split "\t", $ColHeader; - $ColHeader = join("\t",@Fields[3..8])."\t".$Fields[$XrefField]; - if ($Version == 2) # 2.0 - { - if ($KeepScoresVAF) {$ColHeader .= "\tvarScoreVAF";} - if ($KeepScoresEAF) {$ColHeader .= "\tvarScoreEAF";} - if ($KeepQuals) {$ColHeader .= "\tvarQuality";} - } - else # 1.x - { - if ($KeepScores) {$ColHeader .= "\ttotalScore";} - } -} -elsif ($FileType eq "G") -{ - # 1.x index locus allele chromosome begin end varType reference call xRef geneId mrnaAcc proteinAcc symbol orientation - # component componentIndex codingRegionKnown impact nucleotidePos proteinPos annotationRefSequence sampleSequence genomeRefSequence - # 2.0 index locus allele chromosome begin end varType reference call xRef geneId mrnaAcc proteinAcc symbol orientation - # component componentIndex hasCodingRegion impact nucleotidePos proteinPos annotationRefSequence sampleSequence genomeRefSequence pfam - my @Fields = split "\t", $ColHeader; - $ColHeader = join("\t",@Fields[3..30]); # 30 much more than needed - $OutputComponents =~ s/\,/\|/g; # create regex string - $OutputComponents =~ s/\,| //; # remove extraneous commas, spaces - $OutputComponents =~ s/\,| //; # remove extraneous commas, spaces - $OutputImpacts =~ s/\,/\|/g; # create regex string - $OutputImpacts =~ s/\,| //; # remove extraneous commas, spaces - #print "OutputVarTypes: $OutputVarTypes\nOutputComponents: $OutputComponents\nOutputImpacts: $OutputImpacts\n"; -} -else -{ - die "FileType $FileType not understood\n"; # redundant -} - -# Set up Processing Subs -if ($FileType eq "G") # use gene subs for 'G' -{ - *ExtractSub2Use = \&ExtractGeneFields; - *AddRecSub2Use = \&AddGeneRec; -} -else # use var subs for 'V' -{ - *ExtractSub2Use = \&ExtractVarFields; - *AddRecSub2Use = \&AddVarRec; -} -#print "Gene: ",\&ExtractGeneFields," ",\&AddGeneRec," Var: ",\&ExtractVarFields," ",\&AddVarRec," Using: ",\&ExtractSub2Use," ",\&AddRecSub2Use,"\n"; exit; - -# Process Files -my $RecCount = 0; # total nr recs that are coding/splicing/spanning -my $FileCount = 0; # keeps track of nr files to use to format eg scores field -my $XRef; -foreach my $File (@$FilesIn) -{ - print "Processing file $File\n"; - $FileCount++; # counting nr files - - # Open file, process header, col header - my $IN = OpenFile ($File); # open the file with correct file format - my $Header = GetHeaderAsString ($IN); # get header - unless ($Header) {close $IN; $IN = OpenFile ($$FilesIn[0]);} # if no header, close and reopen file, ie start file again - my $ColHeader = <$IN>; # get col header, first remaining line - unless ($ColHeader =~ /^>/) {print "Suspect column header for file $File:\n$ColHeader\n;"} - - my $Count = 0; # cnr recs for this file that are coding/splicing/spanning - while (<$IN>) # loop through remainder of file ie data - { - my ($Rec, $Chr, $ScoreVAF, $ScoreEAF, $ScoreQual, $XRef) = ExtractSub2Use ($_, $XrefField); # sub extracts wanted fields in rec as string, chr, other fields optionally - next unless $Rec; - AddRecSub2Use ($Rec, $Vars{$Chr}, $ScoreVAF, $ScoreEAF, $ScoreQual, $FileCount, $XRef) if $Rec; # only process if rec is not empty - $Count++; # increment count of coding/splicing/spanning vars - } - print "Nr matched records for this file: $Count\n"; - $RecCount += $Count; # add this file's count to total count - close $IN; -} -print "Nr matched records across all files:\t $RecCount\n"; # total count - -# Open file out, write col header -print "Sorting and Saving to file $DirectoryOut$FileOut ...\n"; -open my $OUT, ">", $DirectoryOut . $FileOut or die "could not write to $DirectoryOut/$FileOut\n"; -if ($FileType eq "V") -{ - print $OUT "variantId\t"; -} # first col header for var file -else -{ - $ColHeader =~ s/\tcall\t/\talleleSeq\t/; - print $OUT "index\t"; -} # first col header for gene file -print $OUT "$ColHeader\n"; # remainder of col header - -$RecCount = 0; # reuse total count for nr of non-reduntant vars -$FileCount--; # reduce by one, used below to add missing delimiters -foreach my $Chr (@ChrNames) # sort records in each chr array and print with count -{ - foreach my $Rec (sort {SortStringsasArrays ($a, $b)} keys %{$Vars{$Chr}}) # using sub to sort on being, end fields - { - next unless $Rec; - $RecCount++; # increment count of coding/splicing/spanning vars - print $OUT "$RecCount\t$Rec"; # printing rec and count - print $OUT "\t",$Vars{$Chr}->{$Rec}->[4] if $FileType eq "V"; # print xref if var files - - if ($VarExtras) - { - my $FieldDelimiterCount = () = $Vars{$Chr}->{$Rec}->[3] =~ /:/g; - #print "$SpacerCount $Vars{$Chr}->{$Rec}->[1]\n"; - #exit if $TmpCount++ > 10; - my $Addition = ":" x ($FileCount - $FieldDelimiterCount); - if ($Version == 2) - { - print $OUT "\t",$Vars{$Chr}->{$Rec}->[1],$Addition if $KeepScoresVAF; - print $OUT "\t",$Vars{$Chr}->{$Rec}->[2],$Addition if $KeepScoresEAF; - print $OUT "\t",$Vars{$Chr}->{$Rec}->[3],$Addition if $KeepQuals; - } - else - { - print $OUT "\t",$Vars{$Chr}->{$Rec}->[1],$Addition if $KeepScores; - } - #print $OUT "$Rec\t$Vars{$Chr}->{$Rec}\n"; # printing rec and count - } - print $OUT "\n"; - } -} -print "Nr saved records:\t $RecCount\n"; # count of non-redundant vars, c/f all vars abovve - - -########################################################################### -# SUBS # -########################################################################### - -sub GetExpectedParams -{ - my %Hash = # hash to store expected params - ( - "file_type" => -1, - "input_file" => [], - "output_file" => -1, - "var_type" => "all", - "component" => "ALL", - "impact" => "ALL", - "scores" => "yes", - "scores_eaf" => "yes", - "scores_vaf" => "yes", - "score_quality" => "yes", - ); - $NrParams = int keys %Hash; - return %Hash; -} - -sub GetEnteredParams -{ - # Processing @ARGV - my %Hash; - my @ARGVs = split /--/, join (" ",@ARGV); # split args on --, into array - for my $n (1..$#ARGVs) # parse each [nb arg 0 is empty so ignored] - { - $ARGVs[$n] =~ s/\s+$//; # remove any trailing spaces - my ($Key, $Val) = split / /, $ARGVs[$n], 2; # put first element into key, any other elements into val - $Key = lc $Key; # make lower case, ie case insensitive - if ($Key eq "input_file") # multiple entries expected, setting up array - { - push @{$Hash{$Key}}, $Val; # add input to input hash - - } - else - { - $Hash{$Key} = $Val; # make a hash entry out of key and val - } - } - return %Hash; # hash now has each --entry param, with associated values -} - -sub OpenFile -{ - my $File = shift; - my $FH; - open $FH, $File; - return $FH; -} - -sub OpenFileold -{ - my $File = shift; - my $FH; - - if ($File =~ /.bz2$/) - { - open ($FH, "bzcat $File |") or die ("$!: can't open file $File"); - } - elsif ($File =~ /.gz$/) - { - open ($FH, "gunzip -c $File |") or die ("$!: can't open file $File"); - } - elsif ($File =~ /.tsv$/ or $File =~ /.txt$/) - { - open ($FH, "cat $File |") or die ("$!: can't open file $File"); - } - else - { - print ("Do not recognise file type for file $File.\nOpening as text file\n"); - open ($FH, "cat $File |") or die ("$!: can't open file $File"); - } - return $FH; -} - -sub GetHeaderAsString -{ - my $FH = shift; - my $Header = ""; - my $Count = 0; - while (<$FH>) # loop until a line is empty - { - chomp; - if ($_ eq "") # exit when empty line - { - return $Header ; # return ref to array - } - else - { - $Header .= $_; - } - return "" if $Count++ > 50; # too many lines for a header, must be no header, return empty array - } -} - -sub GetVersion -{ - my $Header = shift; - my $ColHeader = shift; - - my $Version = 0; # need to know if it is 1.x or 2.x - my $XrefField = -1; - - if ($FileType eq "V") - { - if ($Header) - { - $Header =~ /#FORMAT_VERSION\t(\d)/; - if ($1 == 1) {$Version = 1; $XrefField = 11;} - elsif ($1 == 2) {$Version = 2; $XrefField = 13;} - else {print "Warning: Format Version not found in Header\n";} # not in header - } - unless ($Version) - { - my @ColHeader = split /\t/, $ColHeader; - for my $n (0..int(@ColHeader)-1) - { - if ($ColHeader eq "xRef") - { - $XrefField = $n; - if ($n == 11) {$Version = 1;} - elsif ($n == 13) {$Version = 2;} - last; - } - } - } - } - return ($Version, $XrefField); -} - -sub ExtractGeneFields # expects a gene file rec, strips out file specific fields, gets chr -{ - my $Rec = shift; - # gene fields - # >index locus allele chromosome begin end varType reference call xRef geneId mrnaAcc - # proteinAcc symbol orientation component componentIndex hasCodingRegion impact - # nucleotidePos proteinPos annotationRefSequence sampleSequence genomeRefSequence pfam - - chomp $Rec; # remove return - my @Fields = split "\t", $Rec; - #print "$Fields[6] $Fields[15] $Fields[18]\t$OutputVarTypes $OutputComponents $OutputImpacts\n"; - unless ($OutputVarTypes eq "all" or $Fields[6] =~ /$OutputVarTypes/) {return ("","");} # nominated vals not found, leave - unless ($OutputComponents eq "ALL" or $Fields[15] =~ /$OutputComponents/) {return ("","");} # nominated vals not found, leave - unless ($OutputImpacts eq "ALL" or $Fields[18] =~ /$OutputImpacts/) {return ("","");} # nominated vals not found, leave - my $Chr = $Fields[3]; # assign chr - $Rec = join("\t",@Fields[3..24]); - #$Rec =~ s/\t$//; # remove trailing tab if there is one - - return ($Rec, $Chr); -} - -sub ExtractVarFields # expects a gene file rec, strips out file specific fields, gets chr -{ - my $Rec = shift; - my $XrefField = shift; - # var fields - # 1.x locus ploidy haplotype chromosome begin end varType reference alleleSeq totalScore hapLink xRef - # 2.0 locus ploidy allele chromosome begin end varType reference alleleSeq varScoreVAF varScoreEAF varQuality hapLink xRef - - chomp $Rec; # remove return - my @Fields = split "\t", $Rec; - #print "$Fields[6] $OutputVarTypes \n"; exit; - unless ($OutputVarTypes eq "all" or $Fields[6] =~ /$OutputVarTypes/) {return ("","");} # nominated vals not found, leave - my $Chr = $Fields[3]; # assign chr - #$Rec = join("\t",@Fields[3..8]); - $Rec = join("\t",@Fields[3..8]); - #$Rec =~ s/\t$//; # remove trailing tab if there is one - - if ($VarExtras) - { - return ($Rec, $Chr, $Fields[9], $Fields[10], $Fields[11], $Fields[$XrefField]) if $Version == 2; - return ($Rec, $Chr, $Fields[9], $Fields[$XrefField]) # $Version == 1; - } - else - { - return ($Rec, $Chr); - } -} - -sub AddVarRec -{ - my $Rec = shift; - my $RecHash = shift; - my $ScoreVAF = shift; - my $ScoreEAF = shift; - my $ScoreQual = shift; - my $FileCount = shift; - my $XRef = shift; - - if ($VarExtras) # need to extract scores information - { - # locus ploidy allele chromosome begin end varType reference alleleSeq varScoreVAF varScoreEAF varQuality hapLink xRef - # Set delimiter - my $Delimiter; - if ($RecHash->{$Rec}) # hash entry for this var already exists - { - if ($RecHash->{$Rec}->[0] == $FileCount) - { - $Delimiter = "|"; # same chr, var is hom, use | - } - else # diff chr, use : - { - my $FieldDelimiterCount = () = $RecHash->{$Rec}->[3] =~ /:/g; # count nr field delims - $Delimiter = ":" x ($FileCount - $FieldDelimiterCount - 1); # delimiters for any processed files, that didnt have this var - } - $RecHash->{$Rec}->[4] = $XRef if length $XRef > length $RecHash->{$Rec}->[4]; # replace xref if new xref is longer - } - else # new var - { - $RecHash->{$Rec} = []; # create array to hold it - $Delimiter = ":" x ($FileCount - 1); # delimiters for prev processed files, that didnt have this var - $RecHash->{$Rec}->[4] = $XRef; # add xref - } - - # Process var - $RecHash->{$Rec}->[0] = $FileCount; - if ($Version == 2) - { - $RecHash->{$Rec}->[1] .= $Delimiter.$ScoreVAF; # add delimiter, varScoreVAF - $RecHash->{$Rec}->[2] .= $Delimiter.$ScoreEAF; # add delimiter, varScoreVAF - $RecHash->{$Rec}->[3] .= $Delimiter.($ScoreQual eq "VQHIGH" ? "H" : "L"); # add delimiter, qual - } - else - { - $RecHash->{$Rec}->[1] .= $Delimiter.$ScoreVAF; # add delimiter, totalScore - } - - } - else # just the rec, no var extras being extrcted - { - $RecHash->{$Rec}++; # hash with rec as key, increment count for this key - $RecHash->{$Rec}->[4] = $XRef if length $XRef > length $RecHash->{$Rec}->[4]; # replace xref if new xref is longer, wasting space here - } -} - -sub AddGeneRec -{ - my $Rec = shift; - my $RecHash = shift; - - $RecHash->{$Rec}++; # hash with rec as key, increment count for this key -} - -sub SortStringsasArrays # sorts based on begin and end of two recs -{ - my $String1 = shift; # first string - my $String2 = shift; # second string - - my @Array1 = split "\t", $String1; # put fields into array - my @Array2 = split "\t", $String2; - - # array[1] is begin, array[2] is end, returning order based on these fields - if ($Array1[1] < $Array2[1]) # begin of 1 < begin of 2 - { - return -1; - } - elsif ($Array1[1] == $Array2[1]) # begin of 1 == begin of 2 - { - if ($Array1[2] < $Array2[2]) # end of 1 < end of 2 - { - return -1; - } - elsif ($Array1[2] == $Array2[2]) # end of 1 == end of 2 - { - return 0; - } - else # end of 1 > end of 2 - { - return 1; - } - } - else # begin of 1 > begin of 2 - { - return 1; - } - -}