Mercurial > repos > bcrain-completegenomics > testing3
diff CG_cgatools/tools/cgatools/join.xml @ 0:ffb2b0244ab2 draft
Uploaded
author | bcrain-completegenomics |
---|---|
date | Tue, 12 Jun 2012 11:54:38 -0400 |
parents | |
children |
line wrap: on
line diff
<!-- Galaxy tool wrapper for "cgatools join" (CG_cgatools/tools/cgatools/join.xml). -->
<tool id="cga_join" name="join(beta)" version="0.0.1">

  <!-- Short description shown next to the tool name in the Galaxy tool panel. -->
  <description>two tsv files based on equal fields or overlapping regions.</description>

  <requirements>
    <requirement type="binary">cgatools</requirement>
  </requirements>

  <!--
    Command line template (Cheetah) that runs the cgatools executable.

    * The template lives in a CDATA section and carries no XML comments, so the
      text handed to the shell never depends on how the parser treats comments.
    * Every substituted value is single-quoted so that user-entered text such as
      the default "A.*,B.*" reaches cgatools verbatim instead of being split on
      whitespace or glob-expanded by the shell.
    * $dump is deliberately left unquoted: one of its two option values is the
      empty string, and quoting it would pass an empty argument to cgatools.
    * The for-loop emits one match option per entry of the "matched" repeat.
  -->
  <command><![CDATA[
    cgatools join --beta
      --input '$input1'
      --input '$input2'
      --output '$output'
      --output-mode '$outmode'
      $dump
      --select '$col'
      #for $m in $matched
        --match '${m.match}'
      #end for
  ]]></command>

  <inputs>
    <!-- Form field to select input file A. The validators require the dataset to
         have a build (dbkey) that is listed in column 0 of cg_crr_files.loc. -->
    <param name="input1" type="data" format="tabular" label="Select first input file (A)">
      <validator type="unspecified_build"/>
      <validator type="dataset_metadata_in_file"
                 filename="cg_crr_files.loc"
                 metadata_name="dbkey"
                 metadata_column="0"
                 message="cgatools is not currently available for this build."/>
    </param>

    <!-- Form field to select input file B; same build validation as file A. -->
    <param name="input2" type="data" format="tabular" label="Select second input file (B)">
      <validator type="unspecified_build"/>
      <validator type="dataset_metadata_in_file"
                 filename="cg_crr_files.loc"
                 metadata_name="dbkey"
                 metadata_column="0"
                 message="cgatools is not currently available for this build."/>
    </param>

    <!-- Repeatable form field: each entry is one "column A:column B" pair that is
         passed to cgatools as a separate match option.
         NOTE(review): the tool description mentions overlapping regions, but the
         command template above never passes any of the overlap options listed in
         the help, so only equal-field joins are reachable from this form. -->
    <repeat name="matched" title="Matched column">
      <param name="match" type="text" label="Enter column A:column B"/>
    </repeat>

    <!-- Form field to specify the columns to print (value of the select option). -->
    <param name="col" type="text" value="A.*,B.*"
           label="Specify columns to print from file A and B in format A.col_name1,A.col_name2,B.col_name1"/>

    <!-- Form field to select the output mode. -->
    <param name="outmode" type="select" label="Select output mode">
      <option value="full" selected="true">full (1 line for each match of records in A and B)</option>
      <option value="compact">compact (1 line for each record in A, joining multiple records in B by semicolon)</option>
      <option value="compact-pct">compact-pct (same as compact, annotated with % overlap)</option>
    </param>

    <!-- Form field to select which records of A are printed. The chosen value is
         inserted into the command line as-is: either the always-dump flag or
         nothing at all. -->
    <param name="dump" type="select" label="Select records to print">
      <option value="--always-dump" selected="true">print all records of A even if not matched in B</option>
      <option value="">print only records of A that are matched in B</option>
    </param>
  </inputs>

  <outputs>
    <data format="tabular" name="output"/>
  </outputs>

  <!-- Help text (reStructuredText). Kept in CDATA so the comparison operators in
       the overlap-mode description do not need entity escaping. -->
  <help><![CDATA[

**What it does**

This tool joins two tab-delimited files based on equal fields or overlapping regions.

cgatools: http://sourceforge.net/projects/cgatools/files/

-----

**cgatools Manual**::

  COMMAND NAME
      join - Joins two tab-delimited files based on equal fields or overlapping regions.

  DESCRIPTION
      Joins two tab-delimited files based on equal fields or overlapping regions.
      By default, an output record is produced for each match found between file
      A and file B, but output format can be controlled by the --output-mode
      parameter.

  OPTIONS
      -h [ --help ]
          Print this help message.

      --beta
          This is a beta command. To run this command, you must pass the --beta
          flag.

      --input arg
          File name to use as input (may be passed in as arguments at the end of
          the command, or omitted for stdin). There must be exactly two input
          files to join. If only one file is specified by name, file A is taken
          to be stdin and file B is the named file. File B is read fully into
          memory, and file A is streamed. File A's columns appear first in the
          output.

      --output arg (=STDOUT)
          The output file name (may be omitted for stdout).

      --match arg
          A match specification, which is a column from A and a column from B
          separated by a colon.

      --overlap arg

      -m [ --output-mode ] arg (=full)
          Output mode, one of the following:
              full          Print an output record for each match found between
                            file A and file B.
              compact       Print at most one record for each record of file A,
                            joining the file B values by a semicolon and
                            suppressing repeated B values and empty B values.
              compact-pct   Same as compact, but for each distinct B value,
                            annotate with the percentage of the A record that is
                            overlapped by B records with that B value. Percentage
                            is rounded up to nearest integer.

      --overlap-mode arg (=strict)
          Overlap mode, one of the following:
              strict                  Range A and B overlap if A.begin < B.end and
                                      B.begin < A.end.
              allow-abutting-points   Range A and B overlap if they meet the strict
                                      requirements, or if A.begin <= B.end and
                                      B.begin <= A.end and either A or B has zero
                                      length.

      --select arg (=A.*,B.*)
          Set of fields to select for output.

      -a [ --always-dump ]
          Dump every record of A, even if there are no matches with file B.

      --overlap-fraction-A arg (=0)
          Minimum fraction of A region overlap for filtering output.

      --boundary-uncertainty-A arg (=0)
          Boundary uncertainty for overlap filtering. Specifically, records
          failing the following predicate are filtered away: overlap >=
          overlap-fraction-A * ( A-range-length - boundary-uncertainty-A )

      --overlap-fraction-B arg (=0)
          Minimum fraction of B region overlap for filtering output.

      --boundary-uncertainty-B arg (=0)
          Boundary uncertainty for overlap filtering. Specifically, records
          failing the following predicate are filtered away: overlap >=
          overlap-fraction-B * ( B-range-length - boundary-uncertainty-B )

  SUPPORTED FORMAT_VERSION
      Any
  ]]></help>
</tool>