view cgatools/tools/cgatools/join.xml @ 0:fe973f1bef41 draft

Uploaded
author bcrain-completegenomics
date Thu, 07 Jun 2012 14:50:45 -0400
parents
children ed3c76be8a41
line wrap: on
line source

<tool id="cga_join" name="join(beta)" version="0.0.1">

  <!-- Short description displayed after the tool name in the toolbar. -->
  <description>two tsv files based on equal fields or overlapping regions.</description>

  <!-- External executable this tool depends on. -->
  <requirements>
    <requirement type="binary">cgatools</requirement>
  </requirements>

  <!--
    Cheetah template for the cgatools join command line.
    Comments inside the template use Cheetah "##" syntax rather than XML
    comments, so the template text does not depend on how the XML parser
    treats comment nodes.
  -->
  <command>
    ## Substituted values are single-quoted so the shell cannot glob-expand
    ## patterns such as A.* or split a value on whitespace.
    cgatools join --beta
    --input '$input1'
    --input '$input2'
    --output '$output'
    --output-mode '$outmode'
    ## The dump selector is either "--always-dump" or empty, so it stays
    ## unquoted (a quoted empty value would become an empty argument).
    $dump
    ## Omit the select option when the field is blank so that cgatools
    ## falls back to its own default instead of failing on a missing argument.
    #if str($col).strip()
    --select '$col'
    #end if
    ## One match option per non-blank "Matched column" entry.
    #for $m in $matched
    #if str($m.match).strip()
    --match '${m.match}'
    #end if
    #end for
  </command>

  <!-- Single tabular result dataset; referenced as $output in the command template. -->
  <outputs>
    <data name="output" format="tabular"/>
  </outputs>
  
  <inputs>
    <!--
      Input file A.
      NOTE(review): both inputs require a build (dbkey) that is listed in
      cg_crr_files.loc, although the command template passes no reference
      option to cgatools join. Confirm this restriction is intended.
    -->
    <param name="input1" type="data" format="tabular" label="Select first input file (A)">
      <validator type="unspecified_build" />
      <validator type="dataset_metadata_in_file" filename="cg_crr_files.loc"
                 metadata_name="dbkey" metadata_column="0"
                 message="cgatools is not currently available for this build."/>
    </param>

    <!-- Input file B; same validators as file A. -->
    <param name="input2" type="data" format="tabular" label="Select second input file (B)">
      <validator type="unspecified_build" />
      <validator type="dataset_metadata_in_file" filename="cg_crr_files.loc"
                 metadata_name="dbkey" metadata_column="0"
                 message="cgatools is not currently available for this build."/>
    </param>

    <!-- Repeatable match specifications; each entry becomes one match option on the command line. -->
    <repeat name="matched" title="Matched column">
      <param name="match" type="text" label="Enter column A:column B">
        <!-- A blank entry would produce a match option without an argument. -->
        <validator type="empty_field" message="Enter a match specification (column A:column B) or remove this entry."/>
      </param>
    </repeat>

    <!-- Columns to include in the output; passed to the select option. -->
    <param name="col" type="text" value="A.*,B.*" label="Specify columns to print from file A and B in format A.col_name1,A.col_name2,B.col_name1" />

    <!-- Output mode; the option values are passed to the output-mode option. -->
    <param name="outmode" type="select" label="Select output mode">
      <option value="full" selected="true">full (1 line for each match of records in A and B)</option>
      <option value="compact">compact (1 line for each record in A, joining multiple records in B by semicolon)</option>
      <option value="compact-pct">compact-pct (same as compact, annotated with % overlap)</option>
    </param>

    <!-- Whether records of A without a match in B are printed; the selected value is inserted verbatim into the command line. -->
    <param name="dump" type="select" label="Select records to print">
      <option value="--always-dump" selected="true">print all records of A even if not matched in B</option>
      <option value="">print only records of A that are matched in B</option>
    </param>
  </inputs>

  <help>
  
**What it does**

This tool joins two tab-delimited files based on equal fields or overlapping regions.

cgatools: http://sourceforge.net/projects/cgatools/files/

-----

**cgatools Manual**::

		COMMAND NAME
		  join - Joins two tab-delimited files based on equal fields or overlapping regions.
		
		DESCRIPTION
		  Joins two tab-delimited files based on equal fields or overlapping regions.
		  By default, an output record is produced for each match found between file 
		  A and file B, but output format can be controlled by the --output-mode 
		  parameter.
		
		OPTIONS
		  -h [ --help ] 
		      Print this help message.
		
		  --beta 
		      This is a beta command. To run this command, you must pass the --beta 
		      flag.
		
		  --input arg
		      File name to use as input (may be passed in as arguments at the end of 
		      the command, or omitted for stdin). There must be exactly two input 
		      files to join. If only one file is specified by name, file A is taken 
		      to be stdin and file B is the named file. File B is read fully into 
		      memory, and file A is streamed. File A's columns appear first in the 
		      output.
		
		  --output arg (=STDOUT)
		      The output file name (may be omitted for stdout).
		
		  --match arg
		      A match specification, which is a column from A and a column from B 
		      separated by a colon.
		
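		  --overlap arg
		      Overlap specification: a range definition for file A and one for 
		      file B, separated by a colon (see the cgatools user guide for the 
		      range-definition syntax).
		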
		  -m [ --output-mode ] arg (=full)
		      Output mode, one of the following:
		        full        Print an output record for each match found between 
		                    file A and file B.
		        compact     Print at most one record for each record of file A, 
		                    joining the file B values by a semicolon and 
		                    suppressing repeated B values and empty B values.
		        compact-pct Same as compact, but for each distinct B value, 
		                    annotate with the percentage of the A record that is 
		                    overlapped by B records with that B value. Percentage 
		                    is rounded up to nearest integer.
		
		  --overlap-mode arg (=strict)
		      Overlap mode, one of the following:
		        strict                Range A and B overlap if A.begin &lt; B.end and 
		                              B.begin &lt; A.end.
		        allow-abutting-points Range A and B overlap if they meet the strict 
		                              requirements, or if A.begin &lt;= B.end and 
		                              B.begin &lt;= A.end and either A or B has zero 
		                              length.

		  --select arg (=A.*,B.*)
		      Set of fields to select for output.
		
		  -a [ --always-dump ] 
		      Dump every record of A, even if there are no matches with file B.
		
		  --overlap-fraction-A arg (=0)
		      Minimum fraction of A region overlap for filtering output.
		
		  --boundary-uncertainty-A arg (=0)
		      Boundary uncertainty for overlap filtering. Specifically, records 
		      failing the following predicate are filtered away: overlap &gt;= 
		      overlap-fraction-A * ( A-range-length - boundary-uncertainty-A )
		
		  --overlap-fraction-B arg (=0)
		      Minimum fraction of B region overlap for filtering output.
		
		  --boundary-uncertainty-B arg (=0)
		      Boundary uncertainty for overlap filtering. Specifically, records 
		      failing the following predicate are filtered away: overlap &gt;= 
		      overlap-fraction-B * ( B-range-length - boundary-uncertainty-B )

		SUPPORTED FORMAT_VERSION
		  Any
  </help>
</tool>