Mercurial > repos > jjohnson > find_in_reference

<?xml version="1.0"?>
<tool id="find_in_reference" name="find in reference" version="0.0.1">
  <description>filter peptides that are present in proteins</description>
  <!-- Cheetah command template for find_in_reference.py.
       The input and reference datasets are always passed. The two column
       flags are added only when the "column" conditional is set to "yes".
       Each output flag is added under the same condition as the filter of
       the corresponding dataset in the outputs section, so a flag is only
       passed for a dataset that is actually created. -->
  <command interpreter="python"><![CDATA[find_in_reference.py --input "$input"
  --reference "$reference"
  #if $column.set == 'yes':
    --input_column $column.input_column
    --reference_column $column.reference_column
  #end if
  $case_insensitive
  #if 'novel' in str($outputs) or 'found' not in str($outputs):
    --output "$novel"
  #end if
  #if 'found' in str($outputs):
    --filtered "$found"
  #end if
  ]]></command>
  <inputs>
    <!-- The two tabular datasets that are compared. -->
    <param name="input"
           type="data"
           format="tabular"
           label="Input file to be filtered"
           help="e.g. a peptide fasta converted to tabular"/>
    <param name="reference"
           type="data"
           format="tabular"
           label="reference file to search"
           help="e.g. a protein fasta converted to tabular"/>
    <!-- Column choice: with "no" the command passes no column flags;
         with "yes" both column numbers below are passed. -->
    <conditional name="column">
      <param name="set" type="select" label="select columns to compare">
        <option value="no" selected="true">Use last column of input and reference</option>
        <option value="yes">Choose the column of input and reference to compare</option>
      </param>
      <when value="no"/>
      <when value="yes">
        <param name="input_column"
               type="data_column"
               data_ref="input"
               label="column in input (defaults to last column)"/>
        <param name="reference_column"
               type="data_column"
               data_ref="reference"
               label="column in reference (defaults to last column)"/>
      </when>
    </conditional>
    <!-- Expands to the command-line flag when checked and to an empty string otherwise. -->
    <param name="case_insensitive"
           type="boolean"
           truevalue="--case_insensitive"
           falsevalue=""
           checked="false"
           label="Ignore case when comparing"/>
    <!-- Selects which result datasets are produced; read by the command
         template and by the filters in the outputs section. -->
    <param name="outputs"
           type="select"
           multiple="true"
           display="checkboxes"
           label="Choose outputs">
      <option value="novel" selected="true">lines with no match in reference</option>
      <option value="found">lines with match in reference</option>
    </param>
  </inputs>
  <!-- Job failure is decided from the exit code. Any non-zero exit code marks
       the job as failed: the first range covers negative codes, the second
       covers positive ones. -->
  <stdio>
    <exit_code range=":-1" level="fatal" description="Error" />
    <exit_code range="1:" level="fatal" description="Error" />
  </stdio>
  <outputs>
    <!-- Lines with a match in the reference. Created only when "found" is
         among the selected outputs. Format and metadata are copied from the input. -->
    <data name="found"
          format_source="input"
          metadata_source="input"
          label="${tool.name} on ${on_string}: found">
      <filter>'found' in str(outputs)</filter>
    </data>
    <!-- Lines with no match in the reference. Created when "novel" is selected,
         and also whenever "found" is not selected, so at least one dataset
         is always produced. -->
    <data name="novel"
          format_source="input"
          metadata_source="input"
          label="${tool.name} on ${on_string}: novel">
      <filter>'novel' in str(outputs) or 'found' not in str(outputs)</filter>
    </data>
  </outputs>
  <tests>
    <!-- Runs with the default parameter values, so only the "novel" output
         is produced and compared against the expected file. -->
    <test>
      <param name="input"
             value="human_peptides.tabular"
             ftype="tabular"
             dbkey="hg19"/>
      <param name="reference"
             value="human_proteins.tabular"
             ftype="tabular"
             dbkey="hg19"/>
      <output name="novel" file="novel_peptides.tabular"/>
    </test>
  </tests>
  <help>
**Find in Reference**

Filters lines of a tabular input file by checking if the selected input column value
is a substring of the selected column of any line in the reference file.

This can be used to check whether peptide sequences are present in a set of reference proteins,
as a means of filtering out uninteresting peptide sequences.

For example, with::

  Input
    >pep1	LIL
    >pep2	WTF
    >pep3	ISK

  Reference
    >prot1	RLET
    >prot2	LLIL
    >prot3	LAPSE
    >prot4	RISKY

  The outputs

  Not found in reference
    >pep2	WTF

  Found in reference
    >pep1	LIL
    >pep3	ISK


  </help>
</tool>
author	Jim Johnson <jj@umn.edu>
date	Fri, 17 Jan 2014 14:50:53 -0600
parents	fe0327a3ba81
children	30975b3ff0dc