Mercurial > repos > nick > dunovo

--- a/align_families.xml	Fri Mar 23 21:51:35 2018 -0400
+++ b/align_families.xml	Fri Jun 01 16:51:02 2018 -0400
@@ -1,13 +1,13 @@
 <?xml version="1.0"?>
-<tool id="align_families" name="Du Novo: Align families" version="2.14">
+<tool id="align_families" name="Du Novo: Align families" version="2.15">
   <description>of duplex sequencing reads</description>
   <requirements>
     <requirement type="package" version="7.221">mafft</requirement>
-    <requirement type="package" version="2.14">dunovo</requirement>
+    <requirement type="package" version="2.15">dunovo</requirement>
     <!-- TODO: require Python 2.7 -->
   </requirements>
   <version_command>align-families.py --version</version_command>
-  <command detect_errors="exit_code">align-families.py --aligner $aligner --galaxy $phone --processes \${GALAXY_SLOTS:-1} '$input' &gt; '$output'
+  <command detect_errors="exit_code">align-families.py $check_ids --aligner $aligner --galaxy $phone --processes \${GALAXY_SLOTS:-1} '$input' &gt; '$output'
   </command>
   <inputs>
     <param name="input" type="data" format="tabular" label="Input reads" help="with barcodes, grouped by family"/>
@@ -15,6 +15,7 @@
       <option value="kalign">Kalign2</option>
       <option value="mafft">MAFFT</option>
     </param>
+    <param name="check_ids" type="boolean" truevalue="" falsevalue="--no-check-ids" checked="True" label="Check read names" help="Make sure reads are properly paired up. The job will fail if there is a pair of reads where their ids aren't identical (minus any ending /1 or /2)."/>
     <param name="phone" type="boolean" truevalue="--phone-home" falsevalue="" checked="False" label="Send usage data" help="Report helpful usage data to the developer, to better understand the use cases and performance of the tool. The only data which will be recorded is the name and version of the tool, the size of the input data, the number of processes used, the time and memory taken to process it, the alignment algorithm selected, and the IP address of the machine running it. Also, if the tool fails, it will report the name of the exception thrown and the line of code it occurred in. The names of the input and output datasets are not sent. All the reporting and recording code is available at https://github.com/NickSto/ET."/>
   </inputs>
   <outputs>
--- a/correct_barcodes.xml	Fri Mar 23 21:51:35 2018 -0400
+++ b/correct_barcodes.xml	Fri Jun 01 16:51:02 2018 -0400
@@ -1,16 +1,16 @@
 <?xml version="1.0"?>
-<tool id="correct_barcodes" name="Du Novo: Correct barcodes" version="2.14">
+<tool id="correct_barcodes" name="Du Novo: Correct barcodes" version="2.15">
   <description>of duplex sequencing reads</description>
   <requirements>
     <requirement type="package" version="1.1.2">bowtie</requirement>
     <requirement type="package" version="1.10">networkx</requirement>
-    <requirement type="package" version="2.14">dunovo</requirement>
+    <requirement type="package" version="2.15">dunovo</requirement>
     <!-- TODO: require Python 2.7 -->
   </requirements>
   <version_command>correct.py --version</version_command>
   <command detect_errors="exit_code"><![CDATA[
     baralign.sh -c $advanced.chunkmbs -t \${GALAXY_SLOTS:-1} '$input' refdir correct.sam
-    && correct.py --galaxy $phone --dist $dist --mapq $mapq --pos $pos
+    && correct.py --galaxy $phone $check_ids --dist $dist --mapq $mapq --pos $pos
       '$input' refdir/barcodes.fa correct.sam
       > families.corrected.tsv
     && sort families.corrected.tsv
@@ -22,6 +22,7 @@
     <param name="dist" type="integer" value="3" min="1" label="Maximum differences" help="Only use alignments where the barcodes differ by at most these many errors. Note that raising this beyond 3 probably won't have an effect, because of the inherent limit in bowtie's ability to match up distant barcodes."/>
     <param name="mapq" type="integer" value="20" min="0" label="Minimum mapping quality" help="Only use alignments whose MAPQ is at least this."/>
     <param name="pos" type="integer" value="2" min="0" label="Maximum start offset" help="Ignore alignments where the start positions differ by more than this."/>
+    <param name="check_ids" type="boolean" truevalue="" falsevalue="--no-check-ids" checked="True" label="Check read names" help="Make sure reads are properly paired up. The job will fail if there is a pair of reads where their ids aren't identical (minus any ending /1 or /2)."/>
     <param name="phone" type="boolean" truevalue="--phone-home" falsevalue="" checked="False" label="Send usage data" help="Report helpful usage data to the developer, to better understand the use cases and performance of the tool. The only data which will be recorded is the name and version of the tool, the size of the input data, the time and memory taken to process it, and the IP address of the machine running it. Also, if the tool fails, it will report the name of the exception thrown and the line of code it occurred in. The parameters and input/output dataset names are not sent. All the reporting and recording code is available at https://github.com/NickSto/ET"/>
     <section name="advanced" title="Advanced Options" expanded="false">
       <param name="chunkmbs" type="integer" value="512" min="16" label="bowtie --chunkmbs" help="This is the number of megabytes to give each bowtie thread for storing path descriptors. If you see warnings about &quot;Exhausted best-first chunk memory&quot; in stderr, you need to increase this."/>
--- a/dunovo.xml	Fri Mar 23 21:51:35 2018 -0400
+++ b/dunovo.xml	Fri Jun 01 16:51:02 2018 -0400
@@ -1,13 +1,17 @@
 <?xml version="1.0"?>
-<tool id="dunovo" name="Du Novo: Make consensus reads" version="2.14">
+<tool id="dunovo" name="Du Novo: Make consensus reads" version="2.15">
   <description>from duplex sequencing alignments</description>
   <requirements>
-    <requirement type="package" version="2.14">dunovo</requirement>
+    <requirement type="package" version="2.15">dunovo</requirement>
     <!-- TODO: require Python 2.7 -->
   </requirements>
   <version_command>make-consensi.py --version</version_command>
   <command detect_errors="exit_code">
-    make-consensi.py --galaxy $phone --processes \${GALAXY_SLOTS:-1} --min-reads $min_reads --qual $qual_thres --qual-format $qual_format --cons-thres $cons_thres --min-cons-reads $min_cons_reads '$input' --dcs1 '$dcs1' --dcs2 '$dcs2'
+    make-consensi.py --galaxy $phone --processes \${GALAXY_SLOTS:-1}
+    #if $out_format.type == 'fastq':
+      --fastq-out $out_format.qual
+    #end if
+    --qual $qual_thres --qual-format $qual_format --min-reads $min_reads --cons-thres $cons_thres --min-cons-reads $min_cons_reads '$input' --dcs1 '$dcs1' --dcs2 '$dcs2'
     #if $keep_sscs:
       --sscs1 '$sscs1' --sscs2 '$sscs2'
     #end if
@@ -15,8 +19,17 @@
   <inputs>
     <param name="input" type="data" format="tabular" label="Aligned input reads" />
     <param name="min_reads" type="integer" value="3" min="1" label="Minimum reads for a consensus" help="This many reads are necessary to form a single-strand consensus sequence. Families smaller than this will be skipped."/>
-    <param name="cons_thres" type="float" value="0.5" min="0.5" max="1.0" label="Consensus % threshold" help="The consensus base must be present in more than this fraction of the reads, or &quot;N&quot; will be used."/>
-    <param name="min_cons_reads" type="integer" value="0" min="0" label="Minimum number of reads for a consensus base." help="If no base at the position appears in at least this many reads, &quot;N&quot; will be used."/>
+    <param name="cons_thres" type="float" value="0.5" min="0.5" max="1.0" label="Consensus % threshold" help="The consensus base must be present in more than this fraction of the reads, or &quot;N&quot; will be used instead."/>
+    <param name="min_cons_reads" type="integer" value="0" min="0" label="Minimum number of reads for a consensus base." help="The consensus base must be present in more than this absolute number of reads, or &quot;N&quot; will be used instead."/>
+    <conditional name="out_format">
+      <param name="type" type="select" label="Output format">
+        <option value="fasta" selected="true">FASTA</option>
+        <option value="fastq">FASTQ</option>
+      </param>
+      <when value="fastq">
+        <param name="qual" type="integer" value="40" min="0" max="93" label="Output PHRED score" help="There is currently no way to output a meaningful quality score for consensus bases. You'll have to specify an artificial one, which will be given to every base. A good value is 40, the maximum score normally output by sequencers. This means the bases won't be inadvertently filtered out by some downstream tool."/>
+      </when>
+    </conditional>
     <param name="qual_thres" type="integer" value="25" min="1" label="Minimum base quality" help="Bases with a PHRED score less than this will not be counted in the consensus making."/>
     <param name="qual_format" type="select" label="FASTQ format" help="Solexa should also work for Illumina 1.3+ and 1.5+, and Sanger should work for Illumina 1.8+">
       <option value="sanger" selected="true">Sanger (PHRED 0 = &quot;!&quot;)</option>
@@ -26,13 +39,31 @@
     <param name="phone" type="boolean" truevalue="--phone-home" falsevalue="" checked="False" label="Send usage data" help="Report helpful usage data to the developer, to better understand the use cases and performance of the tool. The only data which will be recorded is the name and version of the tool, the size of the input data, the number of processes used, the time and memory taken to process it, and the IP address of the machine running it. Also, if the tool fails, it will report the name of the exception thrown and the line of code it occurred in. The parameters and input/output dataset names are not sent. All the reporting and recording code is available at https://github.com/NickSto/ET."/>
   </inputs>
   <outputs>
-    <data name="dcs1" format="fasta" label="$tool.name on $on_string (mate 1)"/>
-    <data name="dcs2" format="fasta" label="$tool.name on $on_string (mate 2)"/>
-    <data name="sscs1" format="fasta" label="$tool.name on $on_string (SSCS mate 1)">
+    <data name="dcs1" label="$tool.name on $on_string (mate 1)">
+      <change_format>
+        <when input="out_format.type" value="fasta" format="fasta"/>
+        <when input="out_format.type" value="fastq" format="fastq"/>
+      </change_format>
+    </data>
+    <data name="dcs2" label="$tool.name on $on_string (mate 2)">
+      <change_format>
+        <when input="out_format.type" value="fasta" format="fasta"/>
+        <when input="out_format.type" value="fastq" format="fastq"/>
+      </change_format>
+    </data>
+    <data name="sscs1" label="$tool.name on $on_string (SSCS mate 1)">
       <filter>keep_sscs</filter>
+      <change_format>
+        <when input="out_format.type" value="fasta" format="fasta"/>
+        <when input="out_format.type" value="fastq" format="fastq"/>
+      </change_format>
     </data>
-    <data name="sscs2" format="fasta" label="$tool.name on $on_string (SSCS mate 2)">
+    <data name="sscs2" label="$tool.name on $on_string (SSCS mate 2)">
       <filter>keep_sscs</filter>
+      <change_format>
+        <when input="out_format.type" value="fasta" format="fasta"/>
+        <when input="out_format.type" value="fastq" format="fastq"/>
+      </change_format>
     </data>
   </outputs>
   <tests>
--- a/make_families.xml	Fri Mar 23 21:51:35 2018 -0400
+++ b/make_families.xml	Fri Jun 01 16:51:02 2018 -0400
@@ -1,8 +1,8 @@
 <?xml version="1.0"?>
-<tool id="make_families" name="Du Novo: Make families" version="2.14">
+<tool id="make_families" name="Du Novo: Make families" version="2.15">
   <description>of duplex sequencing reads</description>
   <requirements>
-    <requirement type="package" version="2.14">dunovo</requirement>
+    <requirement type="package" version="2.15">dunovo</requirement>
   </requirements>
   <!-- TODO: Add dependency on coreutils to get paste? -->
   <version_command>make-consensi.py --version</version_command>
--- a/precheck.xml	Fri Mar 23 21:51:35 2018 -0400
+++ b/precheck.xml	Fri Jun 01 16:51:02 2018 -0400
@@ -1,12 +1,12 @@
 <?xml version="1.0"?>
-<tool id="precheck" name="Du Novo: Check input" version="2.14">
+<tool id="precheck" name="Du Novo: Check input" version="2.15">
   <requirements>
-    <requirement type="package" version="2.14">dunovo</requirement>
+    <requirement type="package" version="2.15">dunovo</requirement>
   </requirements>
   <description>for family content</description>
   <command detect_errors="exit_code">
-    precheck.py $validate --tag-length $tag_len --constant-length $const_len --min-reads $min_reads
-      $fastq1 $fastq2 &gt; $output
+    precheck.py $check_ids --tag-length $tag_len --constant-length $const_len --min-reads $min_reads
+      '$fastq1' '$fastq2' &gt; '$output'
   </command>
   <inputs>
     <param name="fastq1" type="data" format="fastq" label="Sequencing reads, mate 1"/>
@@ -14,7 +14,7 @@
     <param name="min_reads" type="integer" value="3" min="0" label="Minimum reads per family" help="Single-strand families with fewer than this many reads will be skipped."/>
     <param name="tag_len" type="integer" value="12" min="0" label="Tag length" help="Length of each random barcode on the ends of the fragments."/>
     <param name="const_len" type="integer" value="5" min="0" label="Invariant sequence length" help="Length of the sequence between the tag and actual sample sequence (the restriction site, normally)."/>
-    <param name="validate" type="boolean" truevalue="--validate" falsevalue="" checked="False" label="Check read names" help="Make sure the names of the reads in each pair is the same. If checked, this will fail if there is a mismatch."/>
+    <param name="check_ids" type="boolean" truevalue="" falsevalue="--no-check-ids" checked="True" label="Check read names" help="Make sure the ids of the reads in each pair are the same. If checked, this will fail if there is a mismatch."/>
   </inputs>
   <outputs>
     <data name="output" format="tabular"/>
--- a/tool_dependencies.xml	Fri Mar 23 21:51:35 2018 -0400
+++ b/tool_dependencies.xml	Fri Jun 01 16:51:02 2018 -0400
@@ -9,21 +9,12 @@
   <package name="networkx" version="1.10">
     <repository changeset_revision="3469a6858fd4" name="package_networkx_1_10" owner="iuc" toolshed="https://testtoolshed.g2.bx.psu.edu" />
   </package>
-  <package name="dunovo" version="2.14">
+  <package name="dunovo" version="2.15">
     <install version="1.0">
       <actions>
         <!-- download the main dunovo package files -->
-        <action sha256sum="e517a12794e7cac31b4ebcad99e6a6c0789027ace7d33f89d06ff2bc961515fc" type="download_by_url">https://github.com/galaxyproject/dunovo/archive/v2.14.tar.gz</action>
+        <action sha256sum="1cf02bcc37cc641a20bdf5e20c2b3b98d4dccd43b1182830cb1a9e7d173d1a78" type="download_by_url">https://github.com/galaxyproject/dunovo/archive/v2.15.tar.gz</action>
         <!-- TODO: Store dunovo version number in variable instead of hardcoding it everywhere. -->
-        <!-- Move precheck.py and its dependencies into the main source directory. -->
-        <action type="move_file">
-          <source>utils/precheck.py</source>
-          <destination>$TMP_WORK_DIR/dunovo-2.14</destination>
-        </action>
-        <action type="move_file">
-          <source>utils/getreads.py</source>
-          <destination>$TMP_WORK_DIR/dunovo-2.14</destination>
-        </action>
         <!-- install submodules -->
         <action type="shell_command">rmdir kalign utillib ET</action>
         <!-- download the utillib submodule -->
@@ -32,15 +23,15 @@
         <action type="shell_command">rm v0.1.0.tar.gz</action>
         <action rename_to="utillib" type="move_file">
           <source>utillib-0.1.0</source>
-          <destination>$TMP_WORK_DIR/dunovo-2.14</destination>
+          <destination>$TMP_WORK_DIR/dunovo-2.15</destination>
         </action>
         <!-- download the kalign submodule -->
-        <action type="download_file">https://github.com/makrutenko/kalign-dunovo/archive/v0.2.0.tar.gz</action>
+        <action type="download_file">https://github.com/makovalab-psu/kalign-dunovo/archive/v0.2.0.tar.gz</action>
         <action type="shell_command">tar -zxvpf v0.2.0.tar.gz</action>
         <action type="shell_command">rm v0.2.0.tar.gz</action>
         <action rename_to="kalign" type="move_file">
           <source>kalign-dunovo-0.2.0</source>
-          <destination>$TMP_WORK_DIR/dunovo-2.14</destination>
+          <destination>$TMP_WORK_DIR/dunovo-2.15</destination>
         </action>
         <!-- download the ET submodule -->
         <action type="download_file">https://github.com/NickSto/ET/archive/v0.2.2.tar.gz</action>
@@ -48,7 +39,28 @@
         <action type="shell_command">rm v0.2.2.tar.gz</action>
         <action rename_to="ET" type="move_file">
           <source>ET-0.2.2</source>
-          <destination>$TMP_WORK_DIR/dunovo-2.14</destination>
+          <destination>$TMP_WORK_DIR/dunovo-2.15</destination>
+        </action>
+        <!-- download the bfx submodule -->
+        <action type="download_file">https://github.com/NickSto/bfx/archive/v0.2.0.tar.gz</action>
+        <action type="shell_command">tar -zxvpf v0.2.0.tar.gz</action>
+        <action type="shell_command">rm v0.2.0.tar.gz</action>
+        <action rename_to="bfx" type="move_file">
+          <source>bfx-0.2.0</source>
+          <destination>$TMP_WORK_DIR/dunovo-2.15</destination>
+        </action>
+        <!-- Move some source files from subdirectories into the main source directory. -->
+        <action type="move_file">
+          <source>utils/precheck.py</source>
+          <destination>$TMP_WORK_DIR/dunovo-2.15</destination>
+        </action>
+        <action type="move_file">
+          <source>bfx/trimmer.py</source>
+          <destination>$TMP_WORK_DIR/dunovo-2.15</destination>
+        </action>
+        <action type="move_file">
+          <source>utils/getreads.py</source>
+          <destination>$TMP_WORK_DIR/dunovo-2.15</destination>
         </action>
         <!-- make and install -->
         <action type="shell_command">make clean</action>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/trimmer.xml	Fri Jun 01 16:51:02 2018 -0400
@@ -0,0 +1,84 @@
+<tool id="sequence_content_trimmer" version="0.1" name="Sequence Content Trimmer">
+  <description>trim reads based on certain bases</description>
+  <command interpreter="python">
+  trimmer.py '$input1'
+  #if $paired.is_paired:
+    '$input2' '$output1' '$output2'
+    #if ('fasta' in $input1.extension and 'fastq' in $input2.extension) or ('fastq' in $input1.extension and 'fasta' in $input2.extension)
+      --error 'Both input files must be either fastq or fasta (no mixing the two).'
+    #end if
+  #end if
+  #if $input1.extension == 'fastq' or $input1.extension == 'fastqsanger' or $input1.extension == 'fastqillumina' or $input1.extension == 'fastqsolexa'
+    -f fastq
+  #elif $input1.extension == 'fasta'
+    -f fasta
+  #else
+    -f $input1.extension
+  #end if
+  -b '$bases' -t $thres -w $win_len $invert
+  #if $min_len.has_min_len:
+    -m $min_len.value
+  #end if
+  #if not $paired.is_paired:
+    &gt; '$output1'
+  #end if
+  </command>
+  <inputs>
+    <conditional name="paired">
+      <param name="is_paired" type="select" label="Paired reads?">
+        <option value="" selected="True">Unpaired</option>
+        <option value="true">Paired</option>
+      </param>
+      <when value="true">
+        <param name="input1" type="data" format="fasta,fastq" label="Input reads (mate 1)"/>
+        <param name="input2" type="data" format="fasta,fastq" label="Input reads (mate 2)"/>
+      </when>
+      <when value="">
+        <param name="input1" type="data" format="fasta,fastq" label="Input reads"/>
+      </when>
+    </conditional>
+    <param name="bases" type="text" value="N" label="Bases to filter on"/>
+    <param name="thres" type="float" value="0.5" min="0" max="1" label="Frequency threshold" help="Trim when the frequency of filter bases (or non-filter bases, if inverting) exceeds this value."/>
+    <param name="win_len" type="integer" value="10" min="1" label="Size of the window"/>
+    <param name="invert" type="boolean" truevalue="--invert" falsevalue="" checked="False" label="Invert filter bases" help="Trim when the frequency of bases NOT in the &quot;filter bases&quot; list exceeds the threshold."/>
+    <conditional name="min_len">
+      <param name="has_min_len" type="boolean" truevalue="true" falsevalue="" checked="False" label="Set a minimum read length"/>
+      <when value="true">
+        <param name="value" type="integer" value="10" min="0" label="Minimum read length" help="Reads trimmed to less than this length will be omitted from the output. Pairs will be preserved: both must exceed this threshold to be kept."/>
+      </when>
+    </conditional>
+  </inputs>
+  <outputs>
+    <data name="output1" format_source="input1"/>
+    <data name="output2" format_source="input2">
+      <filter>paired['is_paired']</filter>
+    </data>
+  </outputs>
+
+  <help>
+
+.. class:: infomark
+
+**What it does**
+
+This tool trims the 3' ends of reads based on the presence of the given bases. For instance, trim when N's are encountered or when the GC content exceeds a certain frequency.
+
+
+.. class:: infomark
+
+**How it works**
+
+This will slide along the read with a window, and trim once the frequency of filter bases exceeds the frequency threshold (unless "Invert filter bases" is enabled, in which case it will trim once non-filter bases exceed the threshold).
+
+The trim point will be just before the first (leftmost) filter base in the final window (the one where the frequency exceeded the threshold).
+
+
+.. class:: infomark
+
+**Input**
+
+The inputs can be in the following formats: fasta, fastq, fastqsanger, fastqillumina, and fastqsolexa. Both must be either a fasta or fastq type (no mixing fastq and fasta).
+
+  </help>
+
+</tool>