comparison fastq_paired_end_joiner.py @ 1:ce853b881881 draft

Uploaded version 2.0.0 of tool.
author devteam
date Mon, 07 Jul 2014 15:17:40 -0400
parents d86b8db06e05
children e659bd662045
comparison
equal deleted inserted replaced
0:d86b8db06e05 1:ce853b881881
1 #Dan Blankenberg 1 """
2 import sys, os, shutil 2 Extended version of Dan Blankenberg's fastq joiner ( adds support for
3 from galaxy_utils.sequence.fastq import fastqReader, fastqNamedReader, fastqWriter, fastqJoiner 3 recent Illumina headers ).
4 """
5
6 import sys, re
7 import galaxy_utils.sequence.fastq as fq
8
9
10 class IDManager( object ):
11
12 def __init__( self, sep="\t" ):
13 """
14 Recent Illumina FASTQ header format::
15
16 @<COORDS> <FLAGS>
17 COORDS = <Instrument>:<Run #>:<Flowcell ID>:<Lane>:<Tile>:<X>:<Y>
18 FLAGS = <Read>:<Is Filtered>:<Control Number>:<Index Sequence>
19
20 where the whitespace character between <COORDS> and <FLAGS> can be
21 either a space or a tab.
22 """
23 self.sep = sep
24
25 def parse_id( self, identifier ):
26 try:
27 coords, flags = identifier.strip()[1:].split( self.sep, 1 )
28 except ValueError:
29 raise RuntimeError( "bad identifier: %r" % ( identifier, ))
30 return coords.split( ":" ), flags.split( ":" )
31
32 def join_id( self, parsed_id ):
33 coords, flags = parsed_id
34 return "@%s%s%s" % ( ":".join( coords ), self.sep, ":".join( flags ))
35
36 def get_read_number( self, parsed_id ):
37 return int( parsed_id[1][0] )
38
39 def set_read_number( self, parsed_id, n ):
40 parsed_id[1][0] = "%d" % n
41
42 def get_paired_identifier( self, read ):
43 t = self.parse_id( read.identifier )
44 n = self.get_read_number( t )
45 if n == 1:
46 pn = 2
47 elif n == 2:
48 pn = 1
49 else:
50 raise RuntimeError( "Unknown read number '%d'" % n )
51 self.set_read_number( t, pn )
52 return self.join_id( t )
53
54
55 class FastqJoiner( fq.fastqJoiner ):
56
57 def __init__( self, format, force_quality_encoding=None, sep="\t" ):
58 super( FastqJoiner, self ).__init__( format, force_quality_encoding )
59 self.id_manager = IDManager( sep )
60
61 def join( self, read1, read2 ):
62 force_quality_encoding = self.force_quality_encoding
63 if not force_quality_encoding:
64 if read1.is_ascii_encoded():
65 force_quality_encoding = 'ascii'
66 else:
67 force_quality_encoding = 'decimal'
68 read1 = read1.convert_read_to_format( self.format, force_quality_encoding=force_quality_encoding )
69 read2 = read2.convert_read_to_format( self.format, force_quality_encoding=force_quality_encoding )
70 #--
71 t1, t2 = [ self.id_manager.parse_id( r.identifier ) for r in ( read1, read2 ) ]
72 if self.id_manager.get_read_number( t1 ) == 2:
73 if not self.id_manager.get_read_number( t2 ) == 1:
74 raise RuntimeError( "input files are not from mated pairs" )
75 read1, read2 = read2, read1
76 t1, t2 = t2, t1
77 #--
78 rval = fq.FASTQ_FORMATS[self.format]()
79 rval.identifier = read1.identifier
80 rval.description = "+"
81 if len( read1.description ) > 1:
82 rval.description += rval.identifier[1:]
83 if rval.sequence_space == 'color':
84 # convert to nuc space, join, then convert back
85 rval.sequence = rval.convert_base_to_color_space(
86 read1.convert_color_to_base_space( read1.sequence ) +
87 read2.convert_color_to_base_space( read2.sequence )
88 )
89 else:
90 rval.sequence = read1.sequence + read2.sequence
91 if force_quality_encoding == 'ascii':
92 rval.quality = read1.quality + read2.quality
93 else:
94 rval.quality = "%s %s" % (
95 read1.quality.strip(), read2.quality.strip()
96 )
97 return rval
98
99 def get_paired_identifier( self, read ):
100 return self.id_manager.get_paired_identifier( read )
101
102
103 def sniff_sep( fastq_fn ):
104 header = ""
105 with open( fastq_fn ) as f:
106 while header == "":
107 try:
108 header = f.next().strip()
109 except StopIteration:
110 raise RuntimeError( "%r: empty file" % ( fastq_fn, ) )
111 return re.search( r"\s", header ).group()
4 112
5 def main(): 113 def main():
6 #Read command line arguments 114 #Read command line arguments
7 input1_filename = sys.argv[1] 115 input1_filename = sys.argv[1]
8 input1_type = sys.argv[2] or 'sanger' 116 input1_type = sys.argv[2] or 'sanger'
9 input2_filename = sys.argv[3] 117 input2_filename = sys.argv[3]
10 input2_type = sys.argv[4] or 'sanger' 118 input2_type = sys.argv[4] or 'sanger'
11 output_filename = sys.argv[5] 119 output_filename = sys.argv[5]
12 120
121 fastq_style = sys.argv[6] or 'old'
122 #--
13 if input1_type != input2_type: 123 if input1_type != input2_type:
14 print "WARNING: You are trying to join files of two different types: %s and %s." % ( input1_type, input2_type ) 124 print "WARNING: You are trying to join files of two different types: %s and %s." % ( input1_type, input2_type )
15 125
16 input2 = fastqNamedReader( open( input2_filename, 'rb' ), input2_type ) 126 if fastq_style == 'new':
17 joiner = fastqJoiner( input1_type ) 127 sep = sniff_sep( input1_filename )
18 out = fastqWriter( open( output_filename, 'wb' ), format = input1_type ) 128 joiner = FastqJoiner( input1_type, sep=sep )
19 129 else:
130 joiner = fq.fastqJoiner( input1_type )
131 #--
132 input2 = fq.fastqNamedReader( open( input2_filename, 'rb' ), input2_type )
133 out = fq.fastqWriter( open( output_filename, 'wb' ), format=input1_type )
20 i = None 134 i = None
21 skip_count = 0 135 skip_count = 0
22 for i, fastq_read in enumerate( fastqReader( open( input1_filename, 'rb' ), format = input1_type ) ): 136 for i, fastq_read in enumerate( fq.fastqReader( open( input1_filename, 'rb' ), format=input1_type ) ):
23 identifier = joiner.get_paired_identifier( fastq_read ) 137 identifier = joiner.get_paired_identifier( fastq_read )
24 fastq_paired = input2.get( identifier ) 138 fastq_paired = input2.get( identifier )
25 if fastq_paired is None: 139 if fastq_paired is None:
26 skip_count += 1 140 skip_count += 1
27 else: 141 else:
30 144
31 if i is None: 145 if i is None:
32 print "Your file contains no valid FASTQ reads." 146 print "Your file contains no valid FASTQ reads."
33 else: 147 else:
34 print input2.has_data() 148 print input2.has_data()
35 print 'Joined %s of %s read pairs (%.2f%%).' % ( i - skip_count + 1, i + 1, float( i - skip_count + 1 ) / float( i + 1 ) * 100.0 ) 149 print 'Joined %s of %s read pairs (%.2f%%).' % ( i - skip_count + 1, i + 1, ( i - skip_count + 1 ) / ( i + 1 ) * 100.0 )
36 150
37 if __name__ == "__main__": 151 if __name__ == "__main__":
38 main() 152 main()