Mercurial > repos > devteam > fastq_paired_end_joiner
comparison fastq_paired_end_joiner.py @ 1:ce853b881881 draft
Uploaded version 2.0.0 of tool.
author | devteam |
---|---|
date | Mon, 07 Jul 2014 15:17:40 -0400 |
parents | d86b8db06e05 |
children | e659bd662045 |
comparison
equal
deleted
inserted
replaced
0:d86b8db06e05 | 1:ce853b881881 |
---|---|
1 #Dan Blankenberg | 1 """ |
2 import sys, os, shutil | 2 Extended version of Dan Blankenberg's fastq joiner ( adds support for |
3 from galaxy_utils.sequence.fastq import fastqReader, fastqNamedReader, fastqWriter, fastqJoiner | 3 recent Illumina headers ). |
4 """ | |
5 | |
6 import sys, re | |
7 import galaxy_utils.sequence.fastq as fq | |
8 | |
9 | |
10 class IDManager( object ): | |
11 | |
12 def __init__( self, sep="\t" ): | |
13 """ | |
14 Recent Illumina FASTQ header format:: | |
15 | |
16 @<COORDS> <FLAGS> | |
17 COORDS = <Instrument>:<Run #>:<Flowcell ID>:<Lane>:<Tile>:<X>:<Y> | |
18 FLAGS = <Read>:<Is Filtered>:<Control Number>:<Index Sequence> | |
19 | |
20 where the whitespace character between <COORDS> and <FLAGS> can be | |
21 either a space or a tab. | |
22 """ | |
23 self.sep = sep | |
24 | |
25 def parse_id( self, identifier ): | |
26 try: | |
27 coords, flags = identifier.strip()[1:].split( self.sep, 1 ) | |
28 except ValueError: | |
29 raise RuntimeError( "bad identifier: %r" % ( identifier, )) | |
30 return coords.split( ":" ), flags.split( ":" ) | |
31 | |
32 def join_id( self, parsed_id ): | |
33 coords, flags = parsed_id | |
34 return "@%s%s%s" % ( ":".join( coords ), self.sep, ":".join( flags )) | |
35 | |
36 def get_read_number( self, parsed_id ): | |
37 return int( parsed_id[1][0] ) | |
38 | |
39 def set_read_number( self, parsed_id, n ): | |
40 parsed_id[1][0] = "%d" % n | |
41 | |
42 def get_paired_identifier( self, read ): | |
43 t = self.parse_id( read.identifier ) | |
44 n = self.get_read_number( t ) | |
45 if n == 1: | |
46 pn = 2 | |
47 elif n == 2: | |
48 pn = 1 | |
49 else: | |
50 raise RuntimeError( "Unknown read number '%d'" % n ) | |
51 self.set_read_number( t, pn ) | |
52 return self.join_id( t ) | |
53 | |
54 | |
55 class FastqJoiner( fq.fastqJoiner ): | |
56 | |
57 def __init__( self, format, force_quality_encoding=None, sep="\t" ): | |
58 super( FastqJoiner, self ).__init__( format, force_quality_encoding ) | |
59 self.id_manager = IDManager( sep ) | |
60 | |
61 def join( self, read1, read2 ): | |
62 force_quality_encoding = self.force_quality_encoding | |
63 if not force_quality_encoding: | |
64 if read1.is_ascii_encoded(): | |
65 force_quality_encoding = 'ascii' | |
66 else: | |
67 force_quality_encoding = 'decimal' | |
68 read1 = read1.convert_read_to_format( self.format, force_quality_encoding=force_quality_encoding ) | |
69 read2 = read2.convert_read_to_format( self.format, force_quality_encoding=force_quality_encoding ) | |
70 #-- | |
71 t1, t2 = [ self.id_manager.parse_id( r.identifier ) for r in ( read1, read2 ) ] | |
72 if self.id_manager.get_read_number( t1 ) == 2: | |
73 if not self.id_manager.get_read_number( t2 ) == 1: | |
74 raise RuntimeError( "input files are not from mated pairs" ) | |
75 read1, read2 = read2, read1 | |
76 t1, t2 = t2, t1 | |
77 #-- | |
78 rval = fq.FASTQ_FORMATS[self.format]() | |
79 rval.identifier = read1.identifier | |
80 rval.description = "+" | |
81 if len( read1.description ) > 1: | |
82 rval.description += rval.identifier[1:] | |
83 if rval.sequence_space == 'color': | |
84 # convert to nuc space, join, then convert back | |
85 rval.sequence = rval.convert_base_to_color_space( | |
86 read1.convert_color_to_base_space( read1.sequence ) + | |
87 read2.convert_color_to_base_space( read2.sequence ) | |
88 ) | |
89 else: | |
90 rval.sequence = read1.sequence + read2.sequence | |
91 if force_quality_encoding == 'ascii': | |
92 rval.quality = read1.quality + read2.quality | |
93 else: | |
94 rval.quality = "%s %s" % ( | |
95 read1.quality.strip(), read2.quality.strip() | |
96 ) | |
97 return rval | |
98 | |
99 def get_paired_identifier( self, read ): | |
100 return self.id_manager.get_paired_identifier( read ) | |
101 | |
102 | |
103 def sniff_sep( fastq_fn ): | |
104 header = "" | |
105 with open( fastq_fn ) as f: | |
106 while header == "": | |
107 try: | |
108 header = f.next().strip() | |
109 except StopIteration: | |
110 raise RuntimeError( "%r: empty file" % ( fastq_fn, ) ) | |
111 return re.search( r"\s", header ).group() | |
4 | 112 |
5 def main(): | 113 def main(): |
6 #Read command line arguments | 114 #Read command line arguments |
7 input1_filename = sys.argv[1] | 115 input1_filename = sys.argv[1] |
8 input1_type = sys.argv[2] or 'sanger' | 116 input1_type = sys.argv[2] or 'sanger' |
9 input2_filename = sys.argv[3] | 117 input2_filename = sys.argv[3] |
10 input2_type = sys.argv[4] or 'sanger' | 118 input2_type = sys.argv[4] or 'sanger' |
11 output_filename = sys.argv[5] | 119 output_filename = sys.argv[5] |
12 | 120 |
121 fastq_style = sys.argv[6] or 'old' | |
122 #-- | |
13 if input1_type != input2_type: | 123 if input1_type != input2_type: |
14 print "WARNING: You are trying to join files of two different types: %s and %s." % ( input1_type, input2_type ) | 124 print "WARNING: You are trying to join files of two different types: %s and %s." % ( input1_type, input2_type ) |
15 | 125 |
16 input2 = fastqNamedReader( open( input2_filename, 'rb' ), input2_type ) | 126 if fastq_style == 'new': |
17 joiner = fastqJoiner( input1_type ) | 127 sep = sniff_sep( input1_filename ) |
18 out = fastqWriter( open( output_filename, 'wb' ), format = input1_type ) | 128 joiner = FastqJoiner( input1_type, sep=sep ) |
19 | 129 else: |
130 joiner = fq.fastqJoiner( input1_type ) | |
131 #-- | |
132 input2 = fq.fastqNamedReader( open( input2_filename, 'rb' ), input2_type ) | |
133 out = fq.fastqWriter( open( output_filename, 'wb' ), format=input1_type ) | |
20 i = None | 134 i = None |
21 skip_count = 0 | 135 skip_count = 0 |
22 for i, fastq_read in enumerate( fastqReader( open( input1_filename, 'rb' ), format = input1_type ) ): | 136 for i, fastq_read in enumerate( fq.fastqReader( open( input1_filename, 'rb' ), format=input1_type ) ): |
23 identifier = joiner.get_paired_identifier( fastq_read ) | 137 identifier = joiner.get_paired_identifier( fastq_read ) |
24 fastq_paired = input2.get( identifier ) | 138 fastq_paired = input2.get( identifier ) |
25 if fastq_paired is None: | 139 if fastq_paired is None: |
26 skip_count += 1 | 140 skip_count += 1 |
27 else: | 141 else: |
30 | 144 |
31 if i is None: | 145 if i is None: |
32 print "Your file contains no valid FASTQ reads." | 146 print "Your file contains no valid FASTQ reads." |
33 else: | 147 else: |
34 print input2.has_data() | 148 print input2.has_data() |
35 print 'Joined %s of %s read pairs (%.2f%%).' % ( i - skip_count + 1, i + 1, float( i - skip_count + 1 ) / float( i + 1 ) * 100.0 ) | 149 print 'Joined %s of %s read pairs (%.2f%%).' % ( i - skip_count + 1, i + 1, ( i - skip_count + 1 ) / ( i + 1 ) * 100.0 ) |
36 | 150 |
37 if __name__ == "__main__": | 151 if __name__ == "__main__": |
38 main() | 152 main() |