diff CG_cgatools/tools/cgatools/join.xml @ 0:ffb2b0244ab2 draft

Uploaded
author bcrain-completegenomics
date Tue, 12 Jun 2012 11:54:38 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/CG_cgatools/tools/cgatools/join.xml	Tue Jun 12 11:54:38 2012 -0400
@@ -0,0 +1,157 @@
+<tool id="cga_join" name="join(beta)" version="0.0.1">
+
+  <description>two tsv files based on equal fields or overlapping regions.</description> <!-- short description displayed with the tool name -->
+
+  <requirements> <!-- the cgatools executable is declared as a binary requirement -->
+    <requirement type="binary">cgatools</requirement>
+  </requirements>
+
+  <!-- Runs cgatools join. Substituted values are single-quoted so the shell cannot
+       glob-expand (e.g. the default A.*,B.*) or word-split them; $dump stays bare
+       because its empty option must contribute no argument at all. -->
+  <command>
+    cgatools join --beta
+    --input '$input1' --input '$input2'
+    --output '$output' --output-mode '$outmode'
+    $dump --select '$col'
+    #for $m in $matched
+    --match '${m.match}'
+    #end for
+  </command>
+
+  <outputs> <!-- dataset that receives the joined table -->
+    <data name="output" format="tabular"/>
+  </outputs>
+  
+  <inputs>
+    <!-- first file to join (A); its build must be set and listed in cg_crr_files.loc -->
+    <param name="input1" type="data" format="tabular" label="Select first input file (A)">
+      <validator type="unspecified_build"/>
+      <validator type="dataset_metadata_in_file" filename="cg_crr_files.loc"
+                 metadata_name="dbkey" metadata_column="0"
+                 message="cgatools is not currently available for this build."/>
+    </param>
+
+    <!-- second file to join (B); same build validation as for file A -->
+    <param name="input2" type="data" format="tabular" label="Select second input file (B)">
+      <validator type="unspecified_build"/>
+      <validator type="dataset_metadata_in_file" filename="cg_crr_files.loc"
+                 metadata_name="dbkey" metadata_column="0"
+                 message="cgatools is not currently available for this build."/>
+    </param>
+
+    <!-- repeatable match specification; the command emits one match option per entry -->
+    <repeat name="matched" title="Matched column">
+      <param name="match" type="text" label="Enter column A:column B"/>
+    </repeat>
+
+    <!-- comma-separated list of columns to print; the default selects all columns of A and B -->
+    <param name="col" type="text" value="A.*,B.*" label="Specify columns to print from file A and B in format A.col_name1,A.col_name2,B.col_name1" />
+
+    <!-- output mode handed to cgatools join -->
+    <param name="outmode" type="select" label="Select output mode">
+      <option value="full" selected="true">full (1 line for each match of records in A and B)</option>
+      <option value="compact">compact (1 line for each record in A, joining multiple records in B by semicolon)</option>
+      <option value="compact-pct">compact-pct (same as compact, annotated with % overlap)</option>
+    </param>
+
+    <!-- whether records of A without a match in B are still printed (always-dump flag) -->
+    <param name="dump" type="select" label="Select records to print">
+      <option value="--always-dump" selected="true">print all records of A even if not matched in B</option>
+      <option value="">print only records of A that are matched in B</option>
+    </param>
+  </inputs>
+
+  <help>
+  
+**What it does**
+
+This tool joins two tab-delimited files based on equal fields or overlapping regions.
+
+cgatools: http://sourceforge.net/projects/cgatools/files/
+
+-----
+
+**cgatools Manual**::
+
+		COMMAND NAME
+		  join - Joins two tab-delimited files based on equal fields or overlapping regions.
+		
+		DESCRIPTION
+		  Joins two tab-delimited files based on equal fields or overlapping regions.
+		  By default, an output record is produced for each match found between file 
+		  A and file B, but output format can be controlled by the --output-mode 
+		  parameter.
+		
+		OPTIONS
+		  -h [ --help ] 
+		      Print this help message.
+		
+		  --beta 
+		      This is a beta command. To run this command, you must pass the --beta 
+		      flag.
+		
+		  --input arg
+		      File name to use as input (may be passed in as arguments at the end of 
+		      the command, or omitted for stdin). There must be exactly two input 
+		      files to join. If only one file is specified by name, file A is taken 
+		      to be stdin and file B is the named file. File B is read fully into 
+		      memory, and file A is streamed. File A's columns appear first in the 
+		      output.
+		
+		  --output arg (=STDOUT)
+		      The output file name (may be omitted for stdout).
+		
+		  --match arg
+		      A match specification, which is a column from A and a column from B 
+		      separated by a colon.
+		
+		  --overlap arg
+		      Overlap specification: range definitions for A and B, separated by a colon.
+		  -m [ --output-mode ] arg (=full)
+		      Output mode, one of the following:
+		        full        Print an output record for each match found between 
+		                    file A and file B.
+		        compact     Print at most one record for each record of file A, 
+		                    joining the file B values by a semicolon and 
+		                    suppressing repeated B values and empty B values.
+		        compact-pct Same as compact, but for each distinct B value, 
+		                    annotate with the percentage of the A record that is 
+		                    overlapped by B records with that B value. Percentage 
+		                    is rounded up to nearest integer.
+		
+		  --overlap-mode arg (=strict)
+		      Overlap mode, one of the following:
+		        strict                Range A and B overlap if A.begin &lt; B.end and 
+		                              B.begin &lt; A.end.
+		        allow-abutting-points Range A and B overlap if they meet the strict 
+		                              requirements, or if A.begin &lt;= B.end and 
+		                              B.begin &lt;= A.end and either A or B has zero 
+		                              length.
+
+		  --select arg (=A.*,B.*)
+		      Set of fields to select for output.
+		
+		  -a [ --always-dump ] 
+		      Dump every record of A, even if there are no matches with file B.
+		
+		  --overlap-fraction-A arg (=0)
+		      Minimum fraction of A region overlap for filtering output.
+		
+		  --boundary-uncertainty-A arg (=0)
+		      Boundary uncertainty for overlap filtering. Specifically, records 
+		      failing the following predicate are filtered away: overlap &gt;= 
+		      overlap-fraction-A * ( A-range-length - boundary-uncertainty-A )
+		
+		  --overlap-fraction-B arg (=0)
+		      Minimum fraction of B region overlap for filtering output.
+		
+		  --boundary-uncertainty-B arg (=0)
+		      Boundary uncertainty for overlap filtering. Specifically, records 
+		      failing the following predicate are filtered away: overlap &gt;= 
+		      overlap-fraction-B * ( B-range-length - boundary-uncertainty-B )
+
+		SUPPORTED FORMAT_VERSION
+		  Any
+  </help>
+</tool>