Mercurial > repos > devteam > fastq_paired_end_joiner

--- a/fastq_paired_end_joiner.py	Mon Dec 14 16:03:18 2015 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,158 +0,0 @@
-"""
-Extended version of Dan Blankenberg's fastq joiner ( adds support for
-recent Illumina headers ).
-"""
-
-import sys, re
-import galaxy_utils.sequence.fastq as fq
-
-
-class IDManager( object ):
-
-    def __init__( self, sep="\t" ):
-        """
-        Recent Illumina FASTQ header format::
-
-          @<COORDS> <FLAGS>
-          COORDS = <Instrument>:<Run #>:<Flowcell ID>:<Lane>:<Tile>:<X>:<Y>
-          FLAGS = <Read>:<Is Filtered>:<Control Number>:<Index Sequence>
-
-        where the whitespace character between <COORDS> and <FLAGS> can be
-        either a space or a tab.
-        """
-        self.sep = sep
-
-    def parse_id( self, identifier ):
-        try:
-            coords, flags = identifier.strip()[1:].split( self.sep, 1 )
-        except ValueError:
-            raise RuntimeError( "bad identifier: %r" % ( identifier, ))
-        return coords.split( ":" ), flags.split( ":" )
-
-    def join_id( self, parsed_id ):
-        coords, flags = parsed_id
-        return "@%s%s%s" % ( ":".join( coords ), self.sep, ":".join( flags ))
-
-    def get_read_number( self, parsed_id ):
-        return int( parsed_id[1][0] )
-
-    def set_read_number( self, parsed_id, n ):
-        parsed_id[1][0] = "%d" % n
-
-    def get_paired_identifier( self, read ):
-        t = self.parse_id( read.identifier )
-        n = self.get_read_number( t )
-        if n == 1:
-            pn = 2
-        elif n == 2:
-            pn = 1
-        else:
-            raise RuntimeError( "Unknown read number '%d'" % n )
-        self.set_read_number( t, pn )
-        return self.join_id( t )
-
-
-class FastqJoiner( fq.fastqJoiner ):
-
-    def __init__( self, format, force_quality_encoding=None, sep="\t", paste="" ):
-        super( FastqJoiner, self ).__init__( format, force_quality_encoding, paste=paste )
-        self.id_manager = IDManager( sep )
-
-    def join( self, read1, read2 ):
-        force_quality_encoding = self.force_quality_encoding
-        if not force_quality_encoding:
-            if read1.is_ascii_encoded():
-                force_quality_encoding = 'ascii'
-            else:
-                force_quality_encoding = 'decimal'
-        read1 = read1.convert_read_to_format( self.format, force_quality_encoding=force_quality_encoding )
-        read2 = read2.convert_read_to_format( self.format, force_quality_encoding=force_quality_encoding )
-        #--
-        t1, t2 = [ self.id_manager.parse_id( r.identifier ) for r in ( read1, read2 ) ]
-        if self.id_manager.get_read_number( t1 ) == 2:
-            if not self.id_manager.get_read_number( t2 ) == 1:
-                raise RuntimeError( "input files are not from mated pairs" )
-            read1, read2 = read2, read1
-            t1, t2 = t2, t1
-        #--
-        rval = fq.FASTQ_FORMATS[self.format]()
-        rval.identifier = read1.identifier
-        rval.description = "+"
-        if len( read1.description ) > 1:
-            rval.description += rval.identifier[1:]
-        if rval.sequence_space == 'color':
-            # convert to nuc space, join, then convert back
-            rval.sequence = rval.convert_base_to_color_space(
-                read1.convert_color_to_base_space( read1.sequence ) +
-                self.paste_sequence +
-                read2.convert_color_to_base_space( read2.sequence )
-                )
-        else:
-            rval.sequence = read1.sequence + self.paste_sequence + read2.sequence
-        if force_quality_encoding == 'ascii':
-            rval.quality = read1.quality + self.paste_ascii_quality + read2.quality
-        else:
-            rval.quality = "%s %s" % (
-                read1.quality.strip(), self.paste_decimal_quality
-                )
-            rval.quality = ("%s %s" % (
-                rval.quality.strip(), read2.quality.strip()
-                )).strip()
-        return rval
-
-    def get_paired_identifier( self, read ):
-        return self.id_manager.get_paired_identifier( read )
-
-
-def sniff_sep( fastq_fn ):
-    header = ""
-    with open( fastq_fn ) as f:
-        while header == "":
-            try:
-                header = f.next().strip()
-            except StopIteration:
-                raise RuntimeError( "%r: empty file" % ( fastq_fn, ) )
-    return re.search( r"\s", header ).group()
-
-def main():
-    #Read command line arguments
-    input1_filename = sys.argv[1]
-    input1_type = sys.argv[2] or 'sanger'
-    input2_filename = sys.argv[3]
-    input2_type = sys.argv[4] or 'sanger'
-    output_filename = sys.argv[5]
-
-    fastq_style = sys.argv[6] or 'old'
-
-    paste = sys.argv[7] or ''
-    #--
-    if input1_type != input2_type:
-        print "WARNING: You are trying to join files of two different types: %s and %s." % ( input1_type, input2_type )
-
-    if fastq_style == 'new':
-        sep = sniff_sep( input1_filename )
-        joiner = FastqJoiner( input1_type, sep=sep, paste=paste )
-    else:
-        joiner = fq.fastqJoiner( input1_type, paste=paste )
-    #--
-    input2 = fq.fastqNamedReader( open( input2_filename, 'rb' ), input2_type )
-    out = fq.fastqWriter( open( output_filename, 'wb' ), format=input1_type )
-    i = None
-    skip_count = 0
-    for i, fastq_read in enumerate( fq.fastqReader( open( input1_filename, 'rb' ), format=input1_type ) ):
-        identifier = joiner.get_paired_identifier( fastq_read )
-        fastq_paired = input2.get( identifier )
-        if fastq_paired is None:
-            skip_count += 1
-        else:
-            out.write( joiner.join( fastq_read, fastq_paired ) )
-    out.close()
-
-    if i is None:
-        print "Your file contains no valid FASTQ reads."
-    else:
-        print input2.has_data()
-        print 'Joined %s of %s read pairs (%.2f%%).' % ( i - skip_count + 1, i + 1, ( i - skip_count + 1 ) / ( i + 1 ) * 100.0 )
-
-if __name__ == "__main__":
-    main()
--- a/fastq_paired_end_joiner.xml	Mon Dec 14 16:03:18 2015 -0500
+++ b/fastq_paired_end_joiner.xml	Sat Sep 30 14:57:26 2017 -0400
@@ -1,29 +1,31 @@
-<tool id="fastq_paired_end_joiner" name="FASTQ joiner" version="2.0.1">
-  <description>on paired end reads</description>
-  <requirements>
-    <requirement type="package" version="1.0.1">galaxy_sequence_utils</requirement>
-  </requirements>
-  <command interpreter="python">fastq_paired_end_joiner.py '$input1_file' '${input1_file.extension[len( 'fastq' ):]}' '$input2_file' '${input2_file.extension[len( 'fastq' ):]}' '$output_file' '$style' '${paste_sequence}'</command>
-  <inputs>
-    <param name="input1_file" type="data" format="fastqsanger,fastqcssanger" label="Left-hand Reads" />
-    <param name="input2_file" type="data" format="fastqsanger,fastqcssanger" label="Right-hand Reads" />
-    <param name="style" type="select" label="FASTQ Header Style">
-      <option value="old" selected="true">old</option>
-      <option value="new">new</option>
-    </param>
-    <param name="paste_sequence" type="text" label="Bases to insert between joined reads" value="" help="Values are in Base-space and quality scores of maximal value will be used"/>
-  </inputs>
-  <outputs>
-    <data name="output_file" format="input" />
-  </outputs>
-  <tests>
-    <test>
-      <param name="input1_file" value="split_pair_reads_1.fastqsanger" ftype="fastqsanger" />
-      <param name="input2_file" value="split_pair_reads_2.fastqsanger" ftype="fastqsanger" />
-      <output name="output_file" file="3.fastqsanger" />
-    </test>
-  </tests>
-  <help>
+<tool id="fastq_paired_end_joiner" name="FASTQ joiner" version="2.0.1.0">
+    <description>on paired end reads</description>
+    <requirements>
+        <requirement type="package" version="1.1.1">galaxy_sequence_utils</requirement>
+    </requirements>
+    <command><![CDATA[
+gx-fastq-paired-end-joiner '$input1_file' '${input1_file.extension[len('fastq'):]}' '$input2_file' '${input2_file.extension[len('fastq'):]}' '$output_file' $style '${paste_sequence}'
+    ]]></command>
+    <inputs>
+        <param name="input1_file" type="data" format="fastqsanger,fastqcssanger,fastqsanger.gz,fastqcssanger.gz,fastqsanger.bz2,fastqcssanger.bz2" label="Left-hand Reads" />
+        <param name="input2_file" type="data" format="fastqsanger,fastqcssanger,fastqsanger.gz,fastqcssanger.gz,fastqsanger.bz2,fastqcssanger.bz2" label="Right-hand Reads" />
+        <param name="style" type="select" label="FASTQ Header Style">
+            <option value="old" selected="true">old</option>
+            <option value="new">new</option>
+        </param>
+        <param name="paste_sequence" type="text" value="" label="Bases to insert between joined reads" help="Values are in Base-space and quality scores of maximal value will be used"/>
+    </inputs>
+    <outputs>
+        <data name="output_file" format_source="input1_file" />
+    </outputs>
+    <tests>
+        <test>
+            <param name="input1_file" value="split_pair_reads_1.fastqsanger" ftype="fastqsanger" />
+            <param name="input2_file" value="split_pair_reads_2.fastqsanger" ftype="fastqsanger" />
+            <output name="output_file" file="3.fastqsanger" ftype="fastqsanger" />
+        </test>
+    </tests>
+    <help><![CDATA[
 **What it does**

 This tool joins paired end FASTQ reads from two separate files into a
@@ -81,16 +83,9 @@

 **Credits**

-This is an extended version (adds support for "new" style FASTQ headers)
-of D. Blankenberg's fastq joiner:
-
-`Blankenberg D, Gordon A, Von Kuster G, Coraor N, Taylor J, Nekrutenko A; Galaxy Team. Manipulation of FASTQ data with Galaxy. Bioinformatics. 2010 Jul 15;26(14):1783-5. &lt;http://www.ncbi.nlm.nih.gov/pubmed/20562416&gt;`_
-
-New style header support added by Simone Leo &lt;simone.leo@crs4.it&gt;
-  </help>
-
-  <citations>
-    <citation type="doi">10.1093/bioinformatics/btq281</citation>
-  </citations>
-
+New style header support added by Simone Leo <simone.leo@crs4.it> .
+    ]]></help>
+    <citations>
+        <citation type="doi">10.1093/bioinformatics/btq281</citation>
+    </citations>
 </tool>
--- a/tool_dependencies.xml	Mon Dec 14 16:03:18 2015 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,6 +0,0 @@
-<?xml version="1.0"?>
-<tool_dependency>
-  <package name="galaxy_sequence_utils" version="1.0.1">
-      <repository changeset_revision="c38bd3fe9da6" name="package_galaxy_sequence_utils_1_0_1" owner="iuc" toolshed="https://testtoolshed.g2.bx.psu.edu" />
-    </package>
-</tool_dependency>