Mercurial > repos > nick > dunovo

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/align_families.xml	Sat Feb 18 05:27:09 2017 -0500
@@ -0,0 +1,80 @@
+<?xml version="1.0"?>
+<tool id="align_families" name="Du Novo: Align families" version="0.7">
+  <description>of duplex sequencing reads</description>
+  <requirements>
+    <requirement type="package" version="7.221">mafft</requirement>
+    <requirement type="package" version="0.7">dunovo</requirement>
+    <!-- TODO: require Python 2.7 -->
+  </requirements>
+  <command detect_errors="exit_code"><![CDATA[align_families.py -p \${GALAXY_SLOTS:-1} '$input' > '$output'
+  ]]></command> <!-- -p receives GALAXY_SLOTS, or 1 when that is unset or empty; stdout is redirected into the output dataset -->
+  <inputs> <!-- a single tabular dataset, passed to the command as its positional argument -->
+    <param name="input" format="tabular" type="data" label="Input reads" help="with barcodes, grouped by family"></param>
+  </inputs>
+  <outputs> <!-- one tabular dataset; the command redirects its stdout here -->
+    <data format="tabular" name="output"></data>
+  </outputs>
+  <tests> <!-- two fixture-driven cases: each names an input file and the expected output file -->
+    <test> <!-- smoke fixture -->
+      <param value="smoke.families.tsv" name="input"></param>
+      <output file="smoke.families.aligned.tsv" name="output"></output>
+    </test>
+    <test> <!-- NOTE(review): an expected output called families.sort.tsv looks unusual for an alignment step; confirm the fixture names -->
+      <param value="families.in.tsv" name="input"></param>
+      <output file="families.sort.tsv" name="output"></output>
+    </test>
+  </tests>
+  <citations>
+    <citation type="bibtex">@article{Stoler2016,
+      author = {Stoler, Nicholas and Arbeithuber, Barbara and Guiblet, Wilfried and Makova, Kateryna D and Nekrutenko, Anton},
+      doi = {10.1186/s13059-016-1039-4},
+      issn = {1474-760X},
+      journal = {Genome biology},
+      number = {1},
+      pages = {180},
+      pmid = {27566673},
+      publisher = {Genome Biology},
+      title = {{Streamlined analysis of duplex sequencing data with Du Novo.}},
+      url = {http://www.ncbi.nlm.nih.gov/pubmed/27566673},
+      volume = {17},
+      year = {2016}
+    }</citation>
+  </citations>
+  <help>
+
+**What it does**
+
+This is for processing duplex sequencing data. It does a multiple sequence alignment on each (single-stranded) family of reads.
+
+-----
+
+**Input**
+
+This expects the output format of the "Make families" tool.
+
+-----
+
+**Output**
+
+The output is a tabular file where each line corresponds to a (single) read.
+
+The columns are::
+
+  1: barcode (both tags)
+  2: tag order in barcode ("ab" or "ba")
+  3: read mate ("1" or "2")
+  4: read name
+  5: read sequence, aligned ("-" for gaps)
+  6: read quality scores, aligned (" " for gaps)
+
+-----
+
+**Alignments**
+
+The alignments are done using MAFFT, specifically the command
+::
+
+  $ mafft --nuc --quiet family.fa &gt; family.aligned.fa
+
+    </help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/correct_barcodes.xml	Sat Feb 18 05:27:09 2017 -0500
@@ -0,0 +1,57 @@
+<?xml version="1.0"?>
+<tool id="correct_barcodes" name="Du Novo: Correct barcodes" version="0.7">
+  <description>of duplex sequencing reads</description>
+  <requirements>
+    <requirement type="package" version="2.1.0">bowtie2</requirement>
+    <requirement type="package" version="0.1.18">samtools</requirement>
+    <requirement type="package" version="1.9">networkx</requirement>
+    <requirement type="package" version="0.7">dunovo</requirement>
+    <!-- TODO: require Python 2.7 -->
+  </requirements>
+  <command detect_errors="exit_code"><![CDATA[correct-barcodes.sh -d $dist -m $mapq -p $pos '$input' > '$output'
+  ]]></command> <!-- the three integer params map to -d, -m and -p; stdout is redirected into the output dataset -->
+  <inputs> <!-- dist, mapq and pos are handed to correct-barcodes.sh as -d, -m and -p respectively -->
+    <param name="input" type="data" format="tabular" label="Input reads" help="with barcodes, grouped by family"/>
+    <param name="dist" type="integer" value="1" min="1" label="Maximum edit distance" help="Only use alignments where the barcodes differ by at most this many errors."/>
+    <param name="mapq" type="integer" value="20" min="0" label="Minimum mapping quality" help="Only use alignments whose MAPQ is at least this."/>
+    <param name="pos" type="integer" value="2" min="0" label="Maximum start offset" help="Ignore alignments where the start positions differ by more than this."/>
+  </inputs>
+  <outputs> <!-- one tabular dataset; the command redirects its stdout here -->
+    <data format="tabular" name="output"></data>
+  </outputs>
+  <citations>
+    <citation type="bibtex">@article{Stoler2016,
+      author = {Stoler, Nicholas and Arbeithuber, Barbara and Guiblet, Wilfried and Makova, Kateryna D and Nekrutenko, Anton},
+      doi = {10.1186/s13059-016-1039-4},
+      issn = {1474-760X},
+      journal = {Genome biology},
+      number = {1},
+      pages = {180},
+      pmid = {27566673},
+      publisher = {Genome Biology},
+      title = {{Streamlined analysis of duplex sequencing data with Du Novo.}},
+      url = {http://www.ncbi.nlm.nih.gov/pubmed/27566673},
+      volume = {17},
+      year = {2016}
+    }</citation>
+  </citations>
+  <help>
+
+**What it does**
+
+This is for processing duplex sequencing data. This will correct duplex barcodes and create new, larger families. Errors in barcodes normally prevent them from being recognized as the same as the other barcodes in their family. Correcting these errors allows the original, full families to be reconstructed, saving reads which would otherwise be lost. This tool accomplishes this by doing an all vs. all alignment between the barcodes with bowtie2. This identifies ones which are identical except for a few small differences.
+
+-----
+
+**Input**
+
+This expects the output format of the "Make families" tool.
+
+-----
+
+**Output**
+
+The output format is the same as the input format, ready to be consumed by the "Align families" tool.
+
+    </help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/dunovo.xml	Sat Feb 18 05:27:09 2017 -0500
@@ -0,0 +1,75 @@
+<?xml version="1.0"?>
+<tool id="dunovo" name="Du Novo: Make consensus reads" version="0.7">
+  <description>from duplex sequencing alignments</description>
+  <requirements>
+    <requirement type="package" version="0.7">dunovo</requirement>
+    <!-- TODO: require Python 2.7 -->
+  </requirements>
+  <command detect_errors="exit_code"><![CDATA[dunovo.sh -r $min_reads -q $qual_thres -F $qual_format '$input' '$dcs1' '$dcs2'
+    #if $keep_sscs:
+      '$sscs1' '$sscs2'
+    #end if
+  ]]></command> <!-- the two SSCS output paths are appended only when keep_sscs is set -->
+  <inputs> <!-- min_reads, qual_thres and qual_format become the -r, -q and -F options of dunovo.sh -->
+    <param name="input" format="tabular" type="data" label="Aligned input reads"></param>
+    <param name="min_reads" min="1" value="3" type="integer" label="Minimum reads per family" help="Single-strand families with fewer than this many reads will be skipped."></param>
+    <param name="qual_thres" min="1" value="25" type="integer" label="Minimum base quality" help="Bases with a PHRED score less than this will not be counted in the consensus making."></param>
+    <param name="qual_format" label="FASTQ format" type="select" help="Solexa should also work for Illumina 1.3+ and 1.5+, and Sanger should work for Illumina 1.8+">
+      <option selected="true" value="sanger">Sanger (PHRED 0 = "!")</option>
+      <option value="solexa">Solexa (PHRED 0 = "@")</option>
+    </param>
+    <param name="keep_sscs" falsevalue="" truevalue="true" type="boolean" label="Output single-strand consensus sequences as well"></param>
+  </inputs>
+  <outputs> <!-- dcs1/dcs2 are always declared; sscs1/sscs2 are filtered on the keep_sscs parameter -->
+    <data format="fasta" name="dcs1" label="$tool.name on $on_string (mate 1)"></data>
+    <data format="fasta" name="dcs2" label="$tool.name on $on_string (mate 2)"></data>
+    <data format="fasta" name="sscs1" label="$tool.name on $on_string (SSCS mate 1)">
+      <filter>keep_sscs</filter>
+    </data>
+    <data format="fasta" name="sscs2" label="$tool.name on $on_string (SSCS mate 2)">
+      <filter>keep_sscs</filter>
+    </data>
+  </outputs>
+  <tests> <!-- one case with default parameters; only the two duplex outputs are compared -->
+    <test>
+      <param value="families.msa.tsv" name="input"></param>
+      <output file="families.cons_1.fa" name="dcs1"></output>
+      <output file="families.cons_2.fa" name="dcs2"></output>
+    </test>
+  </tests>
+  <citations>
+    <citation type="bibtex">@article{Stoler2016,
+      author = {Stoler, Nicholas and Arbeithuber, Barbara and Guiblet, Wilfried and Makova, Kateryna D and Nekrutenko, Anton},
+      doi = {10.1186/s13059-016-1039-4},
+      issn = {1474-760X},
+      journal = {Genome biology},
+      number = {1},
+      pages = {180},
+      pmid = {27566673},
+      publisher = {Genome Biology},
+      title = {{Streamlined analysis of duplex sequencing data with Du Novo.}},
+      url = {http://www.ncbi.nlm.nih.gov/pubmed/27566673},
+      volume = {17},
+      year = {2016}
+    }</citation>
+  </citations>
+  <help>
+
+**What it does**
+
+This is for processing duplex sequencing data. It creates single-strand and duplex consensus reads from aligned read families.
+
+-----
+
+**Input**
+
+This expects the output format of the "Align families" tool.
+
+-----
+
+**Output**
+
+This will output final, duplex consensus reads in two FASTA files (first and second reads in the pairs). Optionally, you can save the single-strand consensus reads too, in two additional FASTA files (one per mate).
+
+    </help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/make_families.xml	Sat Feb 18 05:27:09 2017 -0500
@@ -0,0 +1,95 @@
+<?xml version="1.0"?>
+<tool id="make_families" name="Du Novo: Make families" version="0.7">
+  <description>of duplex sequencing reads</description>
+  <requirements>
+    <requirement type="package" version="0.7">dunovo</requirement>
+  </requirements>
+  <!-- TODO: Add dependency on coreutils to get paste? -->
+  <command detect_errors="exit_code"><![CDATA[make-families.sh -t $taglen -i $invariant '$fastq1' '$fastq2' > '$output'
+  ]]></command> <!-- taglen and invariant map to -t and -i; stdout is redirected into the output dataset -->
+  <inputs> <!-- two FASTQ mate files plus the two lengths passed to make-families.sh as -t and -i -->
+    <param name="fastq1" format="fastq" type="data" label="Sequencing reads, mate 1"></param>
+    <param name="fastq2" format="fastq" type="data" label="Sequencing reads, mate 2"></param>
+    <param name="taglen" min="0" value="12" type="integer" label="Tag length" help="length of each random barcode on the ends of the fragments"></param>
+    <param name="invariant" min="0" value="5" type="integer" label="Invariant sequence length" help="length of the sequence between the tag and actual sample sequence (the restriction site, normally)"></param>
+  </inputs>
+  <outputs> <!-- one tabular dataset; the command redirects its stdout here -->
+    <data format="tabular" name="output"></data>
+  </outputs>
+  <tests> <!-- the same pair of smoke FASTQ files, run with two different invariant lengths -->
+    <test> <!-- tag length 5, invariant length 1 -->
+      <param value="smoke_1.fq" name="fastq1"></param>
+      <param value="smoke_2.fq" name="fastq2"></param>
+      <param value="5" name="taglen"></param>
+      <param value="1" name="invariant"></param>
+      <output file="smoke.families.tsv" name="output"></output>
+    </test>
+    <test> <!-- tag length 5, invariant length 0 -->
+      <param value="smoke_1.fq" name="fastq1"></param>
+      <param value="smoke_2.fq" name="fastq2"></param>
+      <param value="5" name="taglen"></param>
+      <param value="0" name="invariant"></param>
+      <output file="smoke.families.i0.tsv" name="output"></output>
+    </test>
+  </tests>
+  <citations>
+    <citation type="bibtex">@article{Stoler2016,
+      author = {Stoler, Nicholas and Arbeithuber, Barbara and Guiblet, Wilfried and Makova, Kateryna D and Nekrutenko, Anton},
+      doi = {10.1186/s13059-016-1039-4},
+      issn = {1474-760X},
+      journal = {Genome biology},
+      number = {1},
+      pages = {180},
+      pmid = {27566673},
+      publisher = {Genome Biology},
+      title = {{Streamlined analysis of duplex sequencing data with Du Novo.}},
+      url = {http://www.ncbi.nlm.nih.gov/pubmed/27566673},
+      volume = {17},
+      year = {2016}
+    }</citation>
+  </citations>
+  <help>
+
+**What it does**
+
+This tool is for processing raw duplex sequencing data, removing the barcodes and grouping by them into families of reads from the same fragment.
+
+-----
+
+**Output**
+
+The output will be a tabular file where each line corresponds to a pair of input reads.
+
+The columns are::
+
+  1: barcode (both tags joined and ordered)
+  2: tag order in barcode ("ab" or "ba")
+  3: read1 name
+  4: read1 sequence (minus the tag and invariant sequences)
+  5: read1 quality scores (minus the same tag and invariant)
+  6: read2 name
+  7: read2 sequence (minus the tag and invariant sequences)
+  8: read2 quality scores (minus the same tag and invariant)
+
+-----
+
+**Barcode creation**
+
+For each pair, the tool will remove the tag at the beginning of each read and create a barcode by concatenating the two tags. The order of the tags is determined by a string comparison so that it will make an identical barcode from pairs of either order. The original tag order will be noted in the second column.
+
+Since pairs from opposite strands will have the same tags, but in the reverse order, this produces the same barcode for reads from the same fragment, regardless of strand. Then a simple sort will group all reads from the same fragment together, separated into strands by the different "order" values.
+
+Examples::
+
+  +---------------+-----------------+
+  |  input tags   |     output      |
+  +-------+-------+-------+---------+
+  | read1 | read2 | order | barcode |
+  +-------+-------+-------+---------+
+  |  ATG  |  CCT  |  ab   | ATGCCT  |
+  +-------+-------+-------+---------+
+  |  CCT  |  ATG  |  ba   | ATGCCT  |
+  +-------+-------+-------+---------+
+
+    </help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_dependencies.xml	Sat Feb 18 05:27:09 2017 -0500
@@ -0,0 +1,30 @@
+<?xml version="1.0"?>
+<tool_dependency>
+  <package version="2.1.0" name="bowtie2"> <!-- listed as a requirement in correct_barcodes.xml -->
+    <repository name="package_bowtie2_2_1_0" owner="devteam" changeset_revision="606d435a57a4" toolshed="https://testtoolshed.g2.bx.psu.edu"/>
+  </package>
+  <package version="0.1.18" name="samtools"> <!-- listed as a requirement in correct_barcodes.xml -->
+    <repository name="package_samtools_0_1_18" owner="iuc" changeset_revision="f499719dad6e" toolshed="https://testtoolshed.g2.bx.psu.edu"/>
+  </package>
+  <package version="7.221" name="mafft"> <!-- listed as a requirement in align_families.xml -->
+    <repository name="mafft" owner="rnateam" changeset_revision="dd4a533a0e3c" toolshed="https://testtoolshed.g2.bx.psu.edu"/>
+  </package>
+  <package version="1.9" name="networkx"> <!-- listed as a requirement in correct_barcodes.xml -->
+    <repository name="package_networkx_1_9" owner="iuc" changeset_revision="2ca57555a756" toolshed="https://testtoolshed.g2.bx.psu.edu"/>
+  </package>
+  <package version="0.7" name="dunovo"> <!-- built from source; the version matches the requirement in all four tool wrappers -->
+    <install version="1.0">
+      <actions> <!-- steps are listed in order: download, build, copy, expose on PATH -->
+        <action type="download_by_url">https://github.com/galaxyproject/dunovo/archive/v0.7.tar.gz</action>
+        <action type="shell_command">make</action>
+        <action type="move_directory_files"> <!-- moves the whole working directory, build products included -->
+          <source_directory>.</source_directory>
+          <destination_directory>$INSTALL_DIR</destination_directory>
+        </action>
+        <action type="set_environment"> <!-- lets the scripts named in the tool commands be found -->
+          <environment_variable name="PATH" action="prepend_to">$INSTALL_DIR</environment_variable>
+        </action>
+      </actions>
+    </install>
+  </package>
+</tool_dependency>