Mercurial > repos > devteam > fastq_paired_end_joiner
changeset 4:bce792b8e239 draft
planemo upload for repository https://github.com/galaxyproject/tools-devteam/tree/master/tool_collections/galaxy_sequence_utils/fastq_paired_end_joiner commit f2582539542b33240234e8ea6093e25d0aee9b6a
author | devteam |
---|---|
date | Sat, 30 Sep 2017 14:57:26 -0400 |
parents | e659bd662045 |
children | ff56900af0c0 |
files | fastq_paired_end_joiner.py fastq_paired_end_joiner.xml tool_dependencies.xml |
diffstat | 3 files changed, 33 insertions(+), 202 deletions(-) [+] |
line wrap: on
line diff
--- a/fastq_paired_end_joiner.py Mon Dec 14 16:03:18 2015 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,158 +0,0 @@ -""" -Extended version of Dan Blankenberg's fastq joiner ( adds support for -recent Illumina headers ). -""" - -import sys, re -import galaxy_utils.sequence.fastq as fq - - -class IDManager( object ): - - def __init__( self, sep="\t" ): - """ - Recent Illumina FASTQ header format:: - - @<COORDS> <FLAGS> - COORDS = <Instrument>:<Run #>:<Flowcell ID>:<Lane>:<Tile>:<X>:<Y> - FLAGS = <Read>:<Is Filtered>:<Control Number>:<Index Sequence> - - where the whitespace character between <COORDS> and <FLAGS> can be - either a space or a tab. - """ - self.sep = sep - - def parse_id( self, identifier ): - try: - coords, flags = identifier.strip()[1:].split( self.sep, 1 ) - except ValueError: - raise RuntimeError( "bad identifier: %r" % ( identifier, )) - return coords.split( ":" ), flags.split( ":" ) - - def join_id( self, parsed_id ): - coords, flags = parsed_id - return "@%s%s%s" % ( ":".join( coords ), self.sep, ":".join( flags )) - - def get_read_number( self, parsed_id ): - return int( parsed_id[1][0] ) - - def set_read_number( self, parsed_id, n ): - parsed_id[1][0] = "%d" % n - - def get_paired_identifier( self, read ): - t = self.parse_id( read.identifier ) - n = self.get_read_number( t ) - if n == 1: - pn = 2 - elif n == 2: - pn = 1 - else: - raise RuntimeError( "Unknown read number '%d'" % n ) - self.set_read_number( t, pn ) - return self.join_id( t ) - - -class FastqJoiner( fq.fastqJoiner ): - - def __init__( self, format, force_quality_encoding=None, sep="\t", paste="" ): - super( FastqJoiner, self ).__init__( format, force_quality_encoding, paste=paste ) - self.id_manager = IDManager( sep ) - - def join( self, read1, read2 ): - force_quality_encoding = self.force_quality_encoding - if not force_quality_encoding: - if read1.is_ascii_encoded(): - force_quality_encoding = 'ascii' - else: - force_quality_encoding = 'decimal' - read1 = read1.convert_read_to_format( self.format, force_quality_encoding=force_quality_encoding ) - read2 = read2.convert_read_to_format( self.format, force_quality_encoding=force_quality_encoding ) - #-- - t1, t2 = [ self.id_manager.parse_id( r.identifier ) for r in ( read1, read2 ) ] - if self.id_manager.get_read_number( t1 ) == 2: - if not self.id_manager.get_read_number( t2 ) == 1: - raise RuntimeError( "input files are not from mated pairs" ) - read1, read2 = read2, read1 - t1, t2 = t2, t1 - #-- - rval = fq.FASTQ_FORMATS[self.format]() - rval.identifier = read1.identifier - rval.description = "+" - if len( read1.description ) > 1: - rval.description += rval.identifier[1:] - if rval.sequence_space == 'color': - # convert to nuc space, join, then convert back - rval.sequence = rval.convert_base_to_color_space( - read1.convert_color_to_base_space( read1.sequence ) + - self.paste_sequence + - read2.convert_color_to_base_space( read2.sequence ) - ) - else: - rval.sequence = read1.sequence + self.paste_sequence + read2.sequence - if force_quality_encoding == 'ascii': - rval.quality = read1.quality + self.paste_ascii_quality + read2.quality - else: - rval.quality = "%s %s" % ( - read1.quality.strip(), self.paste_decimal_quality - ) - rval.quality = ("%s %s" % ( - rval.quality.strip(), read2.quality.strip() - )).strip() - return rval - - def get_paired_identifier( self, read ): - return self.id_manager.get_paired_identifier( read ) - - -def sniff_sep( fastq_fn ): - header = "" - with open( fastq_fn ) as f: - while header == "": - try: - header = f.next().strip() - except StopIteration: - raise RuntimeError( "%r: empty file" % ( fastq_fn, ) ) - return re.search( r"\s", header ).group() - -def main(): - #Read command line arguments - input1_filename = sys.argv[1] - input1_type = sys.argv[2] or 'sanger' - input2_filename = sys.argv[3] - input2_type = sys.argv[4] or 'sanger' - output_filename = sys.argv[5] - - fastq_style = sys.argv[6] or 'old' - - paste = sys.argv[7] or '' - #-- - if input1_type != input2_type: - print "WARNING: You are trying to join files of two different types: %s and %s." % ( input1_type, input2_type ) - - if fastq_style == 'new': - sep = sniff_sep( input1_filename ) - joiner = FastqJoiner( input1_type, sep=sep, paste=paste ) - else: - joiner = fq.fastqJoiner( input1_type, paste=paste ) - #-- - input2 = fq.fastqNamedReader( open( input2_filename, 'rb' ), input2_type ) - out = fq.fastqWriter( open( output_filename, 'wb' ), format=input1_type ) - i = None - skip_count = 0 - for i, fastq_read in enumerate( fq.fastqReader( open( input1_filename, 'rb' ), format=input1_type ) ): - identifier = joiner.get_paired_identifier( fastq_read ) - fastq_paired = input2.get( identifier ) - if fastq_paired is None: - skip_count += 1 - else: - out.write( joiner.join( fastq_read, fastq_paired ) ) - out.close() - - if i is None: - print "Your file contains no valid FASTQ reads." - else: - print input2.has_data() - print 'Joined %s of %s read pairs (%.2f%%).' % ( i - skip_count + 1, i + 1, ( i - skip_count + 1 ) / ( i + 1 ) * 100.0 ) - -if __name__ == "__main__": - main()
--- a/fastq_paired_end_joiner.xml Mon Dec 14 16:03:18 2015 -0500 +++ b/fastq_paired_end_joiner.xml Sat Sep 30 14:57:26 2017 -0400 @@ -1,29 +1,31 @@ -<tool id="fastq_paired_end_joiner" name="FASTQ joiner" version="2.0.1"> - <description>on paired end reads</description> - <requirements> - <requirement type="package" version="1.0.1">galaxy_sequence_utils</requirement> - </requirements> - <command interpreter="python">fastq_paired_end_joiner.py '$input1_file' '${input1_file.extension[len( 'fastq' ):]}' '$input2_file' '${input2_file.extension[len( 'fastq' ):]}' '$output_file' '$style' '${paste_sequence}'</command> - <inputs> - <param name="input1_file" type="data" format="fastqsanger,fastqcssanger" label="Left-hand Reads" /> - <param name="input2_file" type="data" format="fastqsanger,fastqcssanger" label="Right-hand Reads" /> - <param name="style" type="select" label="FASTQ Header Style"> - <option value="old" selected="true">old</option> - <option value="new">new</option> - </param> - <param name="paste_sequence" type="text" label="Bases to insert between joined reads" value="" help="Values are in Base-space and quality scores of maximal value will be used"/> - </inputs> - <outputs> - <data name="output_file" format="input" /> - </outputs> - <tests> - <test> - <param name="input1_file" value="split_pair_reads_1.fastqsanger" ftype="fastqsanger" /> - <param name="input2_file" value="split_pair_reads_2.fastqsanger" ftype="fastqsanger" /> - <output name="output_file" file="3.fastqsanger" /> - </test> - </tests> - <help> +<tool id="fastq_paired_end_joiner" name="FASTQ joiner" version="2.0.1.0"> + <description>on paired end reads</description> + <requirements> + <requirement type="package" version="1.1.1">galaxy_sequence_utils</requirement> + </requirements> + <command><![CDATA[ +gx-fastq-paired-end-joiner '$input1_file' '${input1_file.extension[len('fastq'):]}' '$input2_file' '${input2_file.extension[len('fastq'):]}' '$output_file' $style '${paste_sequence}' + ]]></command> + <inputs> + <param name="input1_file" type="data" format="fastqsanger,fastqcssanger,fastqsanger.gz,fastqcssanger.gz,fastqsanger.bz2,fastqcssanger.bz2" label="Left-hand Reads" /> + <param name="input2_file" type="data" format="fastqsanger,fastqcssanger,fastqsanger.gz,fastqcssanger.gz,fastqsanger.bz2,fastqcssanger.bz2" label="Right-hand Reads" /> + <param name="style" type="select" label="FASTQ Header Style"> + <option value="old" selected="true">old</option> + <option value="new">new</option> + </param> + <param name="paste_sequence" type="text" value="" label="Bases to insert between joined reads" help="Values are in Base-space and quality scores of maximal value will be used"/> + </inputs> + <outputs> + <data name="output_file" format_source="input1_file" /> + </outputs> + <tests> + <test> + <param name="input1_file" value="split_pair_reads_1.fastqsanger" ftype="fastqsanger" /> + <param name="input2_file" value="split_pair_reads_2.fastqsanger" ftype="fastqsanger" /> + <output name="output_file" file="3.fastqsanger" ftype="fastqsanger" /> + </test> + </tests> + <help><![CDATA[ **What it does** This tool joins paired end FASTQ reads from two separate files into a @@ -81,16 +83,9 @@ **Credits** -This is an extended version (adds support for "new" style FASTQ headers) -of D. Blankenberg's fastq joiner: - -`Blankenberg D, Gordon A, Von Kuster G, Coraor N, Taylor J, Nekrutenko A; Galaxy Team. Manipulation of FASTQ data with Galaxy. Bioinformatics. 2010 Jul 15;26(14):1783-5. <http://www.ncbi.nlm.nih.gov/pubmed/20562416>`_ - -New style header support added by Simone Leo <simone.leo@crs4.it> - </help> - - <citations> - <citation type="doi">10.1093/bioinformatics/btq281</citation> - </citations> - +New style header support added by Simone Leo <simone.leo@crs4.it> . + ]]></help> + <citations> + <citation type="doi">10.1093/bioinformatics/btq281</citation> + </citations> </tool>
--- a/tool_dependencies.xml Mon Dec 14 16:03:18 2015 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,6 +0,0 @@ -<?xml version="1.0"?> -<tool_dependency> - <package name="galaxy_sequence_utils" version="1.0.1"> - <repository changeset_revision="c38bd3fe9da6" name="package_galaxy_sequence_utils_1_0_1" owner="iuc" toolshed="https://testtoolshed.g2.bx.psu.edu" /> - </package> -</tool_dependency>