changeset 14:570a7de9f151 draft

read from bam; fix header issue
author rnateam
date Mon, 30 Nov 2015 07:53:36 -0500
parents 258b6f9e19ab
children 11ef9c403c72
files extract_aln_ends.py extract_aln_ends.xml flexbar_named_output.pl flexbar_named_output.xml merge_pcr_duplicates.py
diffstat 5 files changed, 528 insertions(+), 9 deletions(-) [+]
line wrap: on
line diff
--- a/extract_aln_ends.py	Fri Nov 27 04:33:02 2015 -0500
+++ b/extract_aln_ends.py	Mon Nov 30 07:53:36 2015 -0500
@@ -14,7 +14,7 @@
 By default output is written to stdout.
 
 Input:
-* sam file containing alignments (paired-end sequencing)
+* alignments in SAM or BAM format (paired-end sequencing)
 
 Output:
 * bed6 file containing outer coordinates (sorted by read id)
@@ -57,8 +57,8 @@
                                  formatter_class=DefaultsRawDescriptionHelpFormatter)
 # positional arguments
 parser.add_argument(
-    "sam",
-    help="Path to sam file containing alignments.")
+    "infile",
+    help="Path to alignments in SAM or BAM format.")
 # optional arguments
 parser.add_argument(
     "-o", "--outfile",
@@ -75,7 +75,7 @@
 parser.add_argument(
     '--version',
     action='version',
-    version='0.1.0')
+    version='0.2.0')
 
 args = parser.parse_args()
 
@@ -86,7 +86,7 @@
 else:
     logging.basicConfig(format="%(filename)s - %(levelname)s - %(message)s")
 logging.info("Parsed arguments:")
-logging.info("  sam: '{}'".format(args.sam))
+logging.info("  infile: '{}'".format(args.infile))
 if args.outfile:
     logging.info("  outfile: enabled writing to file")
     logging.info("  outfile: '{}'".format(args.outfile))
@@ -103,7 +103,7 @@
 
     # sort by id
     logging.debug("calling samtools sort")
-    pysam.sort(args.sam, "-n", "-o{}".format(fn_sorted), "-T sortprefix")
+    pysam.sort(args.infile, "-n", "-o{}".format(fn_sorted), "-T sortprefix")
 
     # fix mate information
     # also removes secondary and unmapped reads
--- a/extract_aln_ends.xml	Fri Nov 27 04:33:02 2015 -0500
+++ b/extract_aln_ends.xml	Mon Nov 30 07:53:36 2015 -0500
@@ -14,7 +14,7 @@
 
 > $default]]></command>
   <inputs>
-    <param area="false" label="Sam input." name="positional_1" type="data" format="sam"/>
+    <param area="false" label="Alignments in SAM or BAM format." name="positional_1" type="data" format="sam,bam"/>
   </inputs>
   <outputs>
     <data format="bed" hidden="false" name="default"/>
@@ -26,7 +26,7 @@
     </test>
   </tests>
   <help><![CDATA[
-Extract alignment ends from sam file.
+Extract alignment ends from bam file.
 
 The resulting bed file contains the outer coordinates of the alignments. The
 bed name field is set to the read id and the score field is set to the edit
@@ -38,7 +38,7 @@
 
 Input:
 
-* sam file containing alignments (paired-end sequencing)
+* bam file containing alignments (paired-end sequencing)
 
 Output:
 
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/flexbar_named_output.pl	Mon Nov 30 07:53:36 2015 -0500
@@ -0,0 +1,50 @@
+#!/usr/bin/env perl
+
+# Flexbar wrapper for Galaxy tool definition, version 2.5
+# Author: Johannes Roehr
+# modified by Daniel Maticzka to include additional user-choosable output prefix
+
+use warnings;
+use strict;
+
+# this parses the last 4 arguments
+# what is id -> xml $output.id
+# what is folder -> xml $__new_file_path__
+# what is format? -> xml $reads.ext
+my ($outFile, $id, $folder, $format, $output_prefix) = @ARGV[($#ARGV - 4) .. $#ARGV];
+
+# this parses all but the last four arguments
+# contains the call to the flexbar actual
+my $call = join " ", @ARGV[0..($#ARGV - 5)];
+
+# this calls flexbar and
+# prefix for output files will be "FlexbarTargetFile"
+system $call .' --target FlexbarTargetFile > '. $outFile and exit 1;
+
+# now we parse all output files
+foreach(<FlexbarTargetFile*>){
+
+	# determine filetype
+	my $fileType;
+
+	$fileType = $1         if /\.(\w+)$/;
+	$fileType = $format    if /\.\w*fast\w$/;
+	$fileType = 'fasta'    if /\.fasta$/;
+	$fileType = 'csfasta'  if /\.csfasta$/;
+	$fileType = 'tabular'  if /\.lengthdist$/;
+
+	# this is just the filename from the for loop
+	my $file = $_;
+
+	# replace underscores by minus in $_
+	s/_/-/g;
+
+	# set new name for output files
+	my $name = "primary_". $id ."_". $_ ."_visible_". $fileType;
+
+	# rename output file to a pattern recognized by flexbar?
+	rename $file, $name;
+	# best guess: this seems to move the file into a folder
+	# rename behavious is not specified by perl... or implementation differs.
+	rename $name, $folder;
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/flexbar_named_output.xml	Mon Nov 30 07:53:36 2015 -0500
@@ -0,0 +1,466 @@
+
+<!-- Flexbar tool definition for Galaxy, version 2.5 -->
+<!-- Author: Johannes Roehr -->
+<!-- Modified by Daniel Maticzka -->
+<!--     * changed dependency to use separate flexbar package -->
+
+<tool id="flexbar" name="Flexbar" version="2.5" force_history_refresh="True">
+
+    <description>flexible barcode and adapter removal</description>
+
+    <requirements>
+        <requirement type="package" version="2.5">flexbar</requirement>
+    </requirements>
+
+    <version_command>flexbar --version</version_command>
+
+    <command interpreter="perl">
+
+        flexbar.pl flexbar
+
+        --threads \${GALAXY_SLOTS:-1}
+
+        --reads $reads
+
+        #if $cReads2.select == "on":
+            #if $cReads2.reads2.ext == $reads.ext:
+                --reads2 $cReads2.reads2
+            #end if
+        #end if
+
+        #if $reads.ext == "fastqsanger":
+            --format sanger
+        #end if
+        #if $reads.ext == "fastqsolexa":
+            --format solexa
+        #end if
+        #if $reads.ext == "fastqillumina":
+            --format i1.3
+        #end if
+        #if $reads.ext == "csfasta":
+            --color-space
+        #end if
+        #if $reads.ext == "fastqcssanger":
+            --color-space
+        #end if
+
+
+        --max-uncalled $maxUncalled
+        --min-read-length $minReadLen
+
+        #if $trimEnds.select == "on":
+            --pre-trim-left $trimEnds.trimLeft
+            --pre-trim-right $trimEnds.trimRight
+        #end if
+
+        #if $cTrimPhred.select == "on":
+            --pre-trim-phred $cTrimPhred.trimPhred
+        #end if
+
+        #if $cTrimLen.select == "on":
+            --post-trim-length $cTrimLen.trimLen
+        #end if
+
+
+        #if $cBarcodes.select == "on":
+            --barcodes $cBarcodes.barcodes
+
+            #if $cBarcodes.cbReads.select == "yes":
+                --barcode-reads $cBarcodes.cbReads.bReads
+            #end if
+
+            #if $cBarcodes.cbReads.select == "no":
+                $cBarcodes.cbReads.bKeep
+            #end if
+
+            $cBarcodes.bUnassigned
+
+            --barcode-trim-end $cBarcodes.bTrimEnd
+
+            #if $cBarcodes.cbTailLen.select == "yes":
+                --barcode-tail-length $cBarcodes.cbTailLen.bTailLen
+            #end if
+
+            #if $cBarcodes.cbMinOverlap.select == "yes":
+                --barcode-min-overlap $cBarcodes.cbMinOverlap.bMinOverlap
+            #end if
+
+            --barcode-threshold $cBarcodes.bThresh
+
+            #if $cBarcodes.cbAlignScores.select == "yes":
+                --barcode-match    $bMatch
+                --barcode-mismatch $bMismatch
+                --barcode-gap      $bGap
+            #end if
+        #end if
+
+
+        #if $cAdapters.select == "on":
+
+            #if $cAdapters.ccAdapters.select == "data":
+                --adapters $cAdapters.ccAdapters.adaptersData
+            #end if
+
+            #if $cAdapters.ccAdapters.select == "seq":
+                --adapter-seq $cAdapters.ccAdapters.adapterSeq
+            #end if
+
+            --adapter-trim-end $cAdapters.aTrimEnd
+
+            #if $cAdapters.caTailLen.select == "yes":
+                --adapter-tail-length $cAdapters.caTailLen.aTailLen
+            #end if
+
+            $cAdapters.aReadSet
+
+            --adapter-min-overlap $cAdapters.aMinOverlap
+            --adapter-threshold   $cAdapters.aThresh
+
+            #if $cAdapters.caAlignScores.select == "yes":
+                --adapter-match    $aMatch
+                --adapter-mismatch $aMismatch
+                --adapter-gap      $aGap
+            #end if
+        #end if
+
+
+        #if $cOutput.select == "show":
+            $cOutput.fastaOutput
+            $cOutput.lenDist
+            $cOutput.singleReads
+        #end if
+
+        #if $cLogging.select == "show":
+            $cLogging.logLevel
+            $cLogging.numTags
+            $cLogging.remTags
+            $cLogging.rndTags
+        #end if
+
+
+        $output $output.id $__new_file_path__ $reads.ext
+
+    </command>
+
+
+    <inputs>
+
+        <param format="fasta,fastq,fastqsanger,fastqsolexa,fastqillumina,csfasta,fastqcssanger" name="reads" type="data" label="Sequencing reads" optional="false"/>
+
+
+        <conditional name="cReads2">
+            <param name="select" type="select" label="2nd read set (paired)">
+                <option value="off" selected="true">Off</option>
+                <option value="on">On</option>
+            </param>
+            <when value="off">
+            </when>
+            <when value="on">
+                <param format="fasta,fastq,fastqsanger,fastqsolexa,fastqillumina,csfasta,fastqcssanger" name="reads2" type="data" label="Reads 2" optional="false" help="same format as first read set"/>
+            </when>
+        </conditional>
+
+
+        <param name="maxUncalled" size="4" type="integer" value="0"  label="1) Max uncalled" optional="false" help="allowed uncalled bases per read"/>
+
+        <conditional name="trimEnds">
+            <param name="select" type="select" label="2) Trimming of ends">
+                <option value="off" selected="true">Off</option>
+                <option value="on">On</option>
+            </param>
+            <when value="off">
+            </when>
+            <when value="on">
+                <param name="trimLeft" size="4" type="integer" value="0" label="Left" optional="false"/>
+                <param name="trimRight" size="4" type="integer" value="0" label="Right" optional="false" help="trims specified number of bases from read ends"/>
+            </when>
+        </conditional>
+
+        <conditional name="cTrimPhred">
+            <param name="select" type="select" label="3) Phred-trimming">
+                <option value="off" selected="true">Off</option>
+                <option value="on">On</option>
+            </param>
+            <when value="off">
+            </when>
+            <when value="on">
+                <param name="trimPhred" size="4" type="integer" value="10" label="Threshold" optional="false" help="trim right end until specified or higher quality reached"/>
+            </when>
+        </conditional>
+
+
+        <conditional name="cBarcodes">
+            <param name="select" type="select" label="4) Barcode detection">
+                <option value="off" selected="true">Off</option>
+                <option value="on">On</option>
+            </param>
+            <when value="off">
+            </when>
+            <when value="on">
+                <param format="fasta" name="barcodes" type="data" label="Barcodes" optional="false"/>
+
+                <conditional name="cbReads">
+                    <param name="select" type="select" label="Separate barcode reads">
+                        <option value="no" selected="true">No</option>
+                        <option value="yes">Yes</option>
+                    </param>
+                    <when value="yes">
+                        <param format="fasta,fastq,fastqsanger,fastqsolexa,fastqillumina,csfasta,fastqcssanger" name="bReads" type="data" label="Separate barcode reads" optional="false"/>
+                    </when>
+                    <when value="no">
+                        <param name="bKeep" type="select" label="Remove barcodes within reads">
+                            <option value="" selected="true">Yes</option>
+                            <option value="--barcode-keep">No</option>
+                        </param>
+                    </when>
+                </conditional>
+
+                <param name="bUnassigned" type="select" label="Include unassigned reads">
+                    <option value="" selected="true">No</option>
+                    <option value="--barcode-unassigned">Yes</option>
+                </param>
+
+                <param name="bTrimEnd" type="select" label="Trim-end mode" optional="false">
+                    <option value="ANY" selected="true">Any</option>
+                    <option value="RIGHT">Right</option>
+                    <option value="RIGHT_TAIL">Right tail</option>
+                    <option value="LEFT">Left</option>
+                    <option value="LEFT_TAIL">Left tail</option>
+                </param>
+
+                <conditional name="cbTailLen">
+                    <param name="select" type="select" label="Change tail length">
+                        <option value="no" selected="true">No</option>
+                        <option value="yes">Yes</option>
+                    </param>
+                    <when value="no">
+                    </when>
+                    <when value="yes">
+                        <param name="bTailLen" size="4" type="integer" value="10" label="Tail length" optional="false"/>
+                    </when>
+                </conditional>
+
+                <conditional name="cbMinOverlap">
+                    <param name="select" type="select" label="Change min-overlap" help="default: barcode length">
+                        <option value="no" selected="true">No</option>
+                        <option value="yes">Yes</option>
+                    </param>
+                    <when value="no">
+                    </when>
+                    <when value="yes">
+                        <param name="bMinOverlap" size="4" type="integer" value="8" label="Min-overlap" optional="false"/>
+                    </when>
+                </conditional>
+
+                <param name="bThresh" size="4" type="integer" value="1" label="Threshold" optional="false" help="allowed mismatches and indels per 10 bases"/>
+
+                <conditional name="cbAlignScores">
+                    <param name="select" type="select" label="Modify alignment scores">
+                        <option value="no" selected="true">No</option>
+                        <option value="yes">Yes</option>
+                    </param>
+                    <when value="no">
+                    </when>
+                    <when value="yes">
+                        <param name="bMatch"    size="3" type="integer" value="1"  label="Match" optional="false"/>
+                        <param name="bMismatch" size="3" type="integer" value="-1" label="Mismatch" optional="false"/>
+                        <param name="bGap"      size="3" type="integer" value="-9" label="Gap" optional="false"/>
+                    </when>
+                </conditional>
+            </when>
+        </conditional>
+
+
+        <conditional name="cAdapters">
+            <param name="select" type="select" label="5) Adapter removal">
+                <option value="off" selected="true">Off</option>
+                <option value="on">On</option>
+            </param>
+
+            <when value="off">
+            </when>
+            <when value="on">
+                <conditional name="ccAdapters">
+                    <param name="select" type="select" label="Adapter source">
+                        <option value="data" selected="true">Fasta</option>
+                        <option value="seq">Sequence</option>
+                        <!-- <option value="file">File</option> -->
+                    </param>
+                    <when value="data">
+                        <param format="fasta" name="adaptersData" type="data" label="Adapters" optional="false"/>
+                    </when>
+                    <when value="seq">
+                        <param name="adapterSeq" size="40" label="Adapter" type="text" value="AAAAAAAAAAAAAA" optional="false"/>
+                    </when>
+                    <!-- <when value="file">
+                        <param name="adaptersFile" type="file" label="Adapters file" optional="false"/>
+                    </when> -->
+                </conditional>
+
+                <param name="aTrimEnd" type="select" label="Trim-end mode" optional="false">
+                    <option value="ANY">Any</option>
+                    <option value="RIGHT" selected="true">Right</option>
+                    <option value="RIGHT_TAIL">Right tail</option>
+                    <option value="LEFT">Left</option>
+                    <option value="LEFT_TAIL">Left tail</option>
+                </param>
+
+                <conditional name="caTailLen">
+                    <param name="select" type="select" label="Change tail length">
+                        <option value="no" selected="true">No</option>
+                        <option value="yes">Yes</option>
+                    </param>
+                    <when value="no">
+                    </when>
+                    <when value="yes">
+                        <param name="aTailLen" size="4" type="integer" value="10" label="Tail length" optional="false"/>
+                    </when>
+                </conditional>
+
+                <param name="aReadSet" type="select" label="Removal for single read set">
+                    <option value="" selected="true">No</option>
+                    <option value="--adapter-read-set 1">1st</option>
+                    <option value="--adapter-read-set 2">2nd</option>
+                </param>
+
+                <param name="aMinOverlap" size="4" type="integer" value="1" label="Min-overlap" optional="false"/>
+                <param name="aThresh" size="4" type="integer" value="3" label="Threshold" optional="false" help="allowed mismatches and indels per 10 bases"/>
+
+                <conditional name="caAlignScores">
+                    <param name="select" type="select" label="Modify alignment scores">
+                        <option value="no" selected="true">No</option>
+                        <option value="yes">Yes</option>
+                    </param>
+                    <when value="no">
+                    </when>
+                    <when value="yes">
+                        <param name="aMatch"    size="3" type="integer" value="1"  label="Match" optional="false"/>
+                        <param name="aMismatch" size="3" type="integer" value="-1" label="Mismatch" optional="false"/>
+                        <param name="aGap"      size="3" type="integer" value="-7" label="Gap" optional="false"/>
+                    </when>
+                </conditional>
+            </when>
+        </conditional>
+
+
+        <conditional name="cTrimLen">
+            <param name="select" type="select" label="6) Trimming to length">
+                <option value="off" selected="true">Off</option>
+                <option value="on">On</option>
+            </param>
+            <when value="off">
+            </when>
+            <when value="on">
+                <param name="trimLen" size="4" type="integer" value="30" label="Length" optional="false" help="trim reads to certain length from right"/>
+            </when>
+        </conditional>
+
+        <param name="minReadLen"  size="4" type="integer" value="18" label="7) Minimum read length" optional="false" help="shorter reads are discarded"/>
+
+        <conditional name="cOutput">
+            <param name="select" type="select" label="Output selection">
+                <option value="off" selected="true">Off</option>
+                <option value="show">Show</option>
+            </param>
+            <when value="off">
+            </when>
+            <when value="show">
+                <param name="fastaOutput" type="select" label="Fasta output">
+                    <option value="" selected="true">Off</option>
+                    <option value="--fasta-output">Always</option>
+                </param>
+
+                <param name="lenDist" type="select" label="Read length distribution">
+                    <option value="" selected="true">Off</option>
+                    <option value="--length-dist">On</option>
+                </param>
+
+                <param name="singleReads" type="select" label="Single reads">
+                    <option value="" selected="true">Off</option>
+                    <option value="--single-reads">On</option>
+                </param>
+            </when>
+        </conditional>
+
+        <conditional name="cLogging">
+            <param name="select" type="select" label="Logging and tagging options">
+                <option value="off" selected="true">Off</option>
+                <option value="show">Show</option>
+            </param>
+            <when value="off">
+            </when>
+            <when value="show">
+                <param name="logLevel" type="select" label="Alignment logging">
+                    <option value="" selected="true">Off</option>
+                    <option value="--log-level ALL">All</option>
+                    <option value="--log-level MOD">Modified</option>
+                    <option value="--log-level TAB">Tabular</option>
+                </param>
+
+                <param name="numTags" type="select" label="Number tags">
+                    <option value="" selected="true">Off</option>
+                    <option value="--number-tags">On</option>
+                </param>
+
+                <param name="remTags" type="select" label="Removal tags">
+                    <option value="" selected="true">Off</option>
+                    <option value="--removal-tags">On</option>
+                </param>
+
+                <param name="rndTags" type="select" label="Random tags">
+                    <option value="" selected="true">Off</option>
+                    <option value="--random-tags">On</option>
+                </param>
+            </when>
+        </conditional>
+
+      </inputs>
+
+    <stdio>
+        <exit_code range="1:" level="fatal" description="Error!" />
+    </stdio>
+
+    <outputs>
+        <data format="txt" name="output" metadata_source="reads"/>
+    </outputs>
+
+
+    <help>
+
+**Description**
+
+Flexbar preprocesses high-throughput sequencing data efficiently. It demultiplexes barcoded runs and removes adapter sequences. Moreover, trimming and filtering features are provided. Flexbar increases read mapping rates and improves genome and transcriptome assemblies. It supports next-generation sequencing data in fasta/q and csfasta/q format from Illumina, Roche 454, and the SOLiD platform. Flexbar is available on the project_ page.
+
+.. _project: https://github.com/seqan/flexbar
+
+------
+
+**Trim-end modes**
+
+**Any:** longer side of read remains after overlap removal
+
+**Left:** right side remains after removal, align before or at read end
+
+**Right:** left part remains after removal, align after or at read start
+
+**Left tail:** consider first n bases of reads in alignment
+
+**Right tail:** use only last n bases, see tail-length options
+
+------
+
+**Documentation**
+
+Further documentation is available on the `manual`__ wiki page and via the command line help screen.
+
+.. __: https://github.com/seqan/flexbar/wiki
+
+------
+
+**Reference**
+
+Matthias Dodt, Johannes T. Roehr, Rina Ahmed, Christoph Dieterich: Flexbar — flexible barcode and adapter processing for next-generation sequencing platforms. Biology 2012, 1(3):895-905.
+
+    </help>
+
+</tool>
--- a/merge_pcr_duplicates.py	Fri Nov 27 04:33:02 2015 -0500
+++ b/merge_pcr_duplicates.py	Mon Nov 30 07:53:36 2015 -0500
@@ -112,6 +112,9 @@
     sep="\t",
     names=["chrom", "start", "stop", "read_id", "score", "strand"])
 
+# keep id parts up to first whitespace
+alns["read_id"] = alns["read_id"].str.split(' ').str.get(0)
+
 # combine barcode library and alignments
 bcalib = pd.merge(
     bcs, alns,