0
|
1 #!/usr/bin/env python
|
|
2 """
|
|
3 Provides wrappers and utilities for working with MAF files and alignments.
|
|
4 """
|
|
5 #Dan Blankenberg
|
|
6 import bx.align.maf
|
|
7 import bx.intervals
|
|
8 import bx.interval_index_file
|
|
9 import sys, os, string, tempfile
|
|
10 import logging
|
|
11 from copy import deepcopy
|
|
12
|
|
13 assert sys.version_info[:2] >= ( 2, 4 )
|
|
14
|
|
15 log = logging.getLogger(__name__)
|
|
16
|
|
17
|
|
18 GAP_CHARS = [ '-' ]
|
|
19 SRC_SPLIT_CHAR = '.'
|
|
20
|
|
21 def src_split( src ):
|
|
22 fields = src.split( SRC_SPLIT_CHAR, 1 )
|
|
23 spec = fields.pop( 0 )
|
|
24 if fields:
|
|
25 chrom = fields.pop( 0 )
|
|
26 else:
|
|
27 chrom = spec
|
|
28 return spec, chrom
|
|
29
|
|
30 def src_merge( spec, chrom, contig = None ):
|
|
31 if None in [ spec, chrom ]:
|
|
32 spec = chrom = spec or chrom
|
|
33 return bx.align.maf.src_merge( spec, chrom, contig )
|
|
34
|
|
35 def get_species_in_block( block ):
|
|
36 species = []
|
|
37 for c in block.components:
|
|
38 spec, chrom = src_split( c.src )
|
|
39 if spec not in species:
|
|
40 species.append( spec )
|
|
41 return species
|
|
42
|
|
43 def tool_fail( msg = "Unknown Error" ):
|
|
44 print >> sys.stderr, "Fatal Error: %s" % msg
|
|
45 sys.exit()
|
|
46
|
|
47 #an object corresponding to a reference layered alignment
|
|
48 class RegionAlignment( object ):
|
|
49
|
|
50 DNA_COMPLEMENT = string.maketrans( "ACGTacgt", "TGCAtgca" )
|
|
51 MAX_SEQUENCE_SIZE = sys.maxint #Maximum length of sequence allowed
|
|
52
|
|
53 def __init__( self, size, species = [] ):
|
|
54 assert size <= self.MAX_SEQUENCE_SIZE, "Maximum length allowed for an individual sequence has been exceeded (%i > %i)." % ( size, self.MAX_SEQUENCE_SIZE )
|
|
55 self.size = size
|
|
56 self.sequences = {}
|
|
57 if not isinstance( species, list ):
|
|
58 species = [species]
|
|
59 for spec in species:
|
|
60 self.add_species( spec )
|
|
61
|
|
62 #add a species to the alignment
|
|
63 def add_species( self, species ):
|
|
64 #make temporary sequence files
|
|
65 self.sequences[species] = tempfile.TemporaryFile()
|
|
66 self.sequences[species].write( "-" * self.size )
|
|
67
|
|
68 #returns the names for species found in alignment, skipping names as requested
|
|
69 def get_species_names( self, skip = [] ):
|
|
70 if not isinstance( skip, list ): skip = [skip]
|
|
71 names = self.sequences.keys()
|
|
72 for name in skip:
|
|
73 try: names.remove( name )
|
|
74 except: pass
|
|
75 return names
|
|
76
|
|
77 #returns the sequence for a species
|
|
78 def get_sequence( self, species ):
|
|
79 self.sequences[species].seek( 0 )
|
|
80 return self.sequences[species].read()
|
|
81
|
|
82 #returns the reverse complement of the sequence for a species
|
|
83 def get_sequence_reverse_complement( self, species ):
|
|
84 complement = [base for base in self.get_sequence( species ).translate( self.DNA_COMPLEMENT )]
|
|
85 complement.reverse()
|
|
86 return "".join( complement )
|
|
87
|
|
88 #sets a position for a species
|
|
89 def set_position( self, index, species, base ):
|
|
90 if len( base ) != 1: raise Exception( "A genomic position can only have a length of 1." )
|
|
91 return self.set_range( index, species, base )
|
|
92 #sets a range for a species
|
|
93 def set_range( self, index, species, bases ):
|
|
94 if index >= self.size or index < 0: raise Exception( "Your index (%i) is out of range (0 - %i)." % ( index, self.size - 1 ) )
|
|
95 if len( bases ) == 0: raise Exception( "A set of genomic positions can only have a positive length." )
|
|
96 if species not in self.sequences.keys(): self.add_species( species )
|
|
97 self.sequences[species].seek( index )
|
|
98 self.sequences[species].write( bases )
|
|
99
|
|
100 #Flush temp file of specified species, or all species
|
|
101 def flush( self, species = None ):
|
|
102 if species is None:
|
|
103 species = self.sequences.keys()
|
|
104 elif not isinstance( species, list ):
|
|
105 species = [species]
|
|
106 for spec in species:
|
|
107 self.sequences[spec].flush()
|
|
108
|
|
109 class GenomicRegionAlignment( RegionAlignment ):
|
|
110
|
|
111 def __init__( self, start, end, species = [] ):
|
|
112 RegionAlignment.__init__( self, end - start, species )
|
|
113 self.start = start
|
|
114 self.end = end
|
|
115
|
|
116 class SplicedAlignment( object ):
|
|
117
|
|
118 DNA_COMPLEMENT = string.maketrans( "ACGTacgt", "TGCAtgca" )
|
|
119
|
|
120 def __init__( self, exon_starts, exon_ends, species = [] ):
|
|
121 if not isinstance( exon_starts, list ):
|
|
122 exon_starts = [exon_starts]
|
|
123 if not isinstance( exon_ends, list ):
|
|
124 exon_ends = [exon_ends]
|
|
125 assert len( exon_starts ) == len( exon_ends ), "The number of starts does not match the number of sizes."
|
|
126 self.exons = []
|
|
127 for i in range( len( exon_starts ) ):
|
|
128 self.exons.append( GenomicRegionAlignment( exon_starts[i], exon_ends[i], species ) )
|
|
129
|
|
130 #returns the names for species found in alignment, skipping names as requested
|
|
131 def get_species_names( self, skip = [] ):
|
|
132 if not isinstance( skip, list ): skip = [skip]
|
|
133 names = []
|
|
134 for exon in self.exons:
|
|
135 for name in exon.get_species_names( skip = skip ):
|
|
136 if name not in names:
|
|
137 names.append( name )
|
|
138 return names
|
|
139
|
|
140 #returns the sequence for a species
|
|
141 def get_sequence( self, species ):
|
|
142 sequence = tempfile.TemporaryFile()
|
|
143 for exon in self.exons:
|
|
144 if species in exon.get_species_names():
|
|
145 sequence.write( exon.get_sequence( species ) )
|
|
146 else:
|
|
147 sequence.write( "-" * exon.size )
|
|
148 sequence.seek( 0 )
|
|
149 return sequence.read()
|
|
150
|
|
151 #returns the reverse complement of the sequence for a species
|
|
152 def get_sequence_reverse_complement( self, species ):
|
|
153 complement = [base for base in self.get_sequence( species ).translate( self.DNA_COMPLEMENT )]
|
|
154 complement.reverse()
|
|
155 return "".join( complement )
|
|
156
|
|
157 #Start and end of coding region
|
|
158 @property
|
|
159 def start( self ):
|
|
160 return self.exons[0].start
|
|
161 @property
|
|
162 def end( self ):
|
|
163 return self.exons[-1].end
|
|
164
|
|
165 #Open a MAF index using a UID
|
|
166 def maf_index_by_uid( maf_uid, index_location_file ):
|
|
167 for line in open( index_location_file ):
|
|
168 try:
|
|
169 #read each line, if not enough fields, go to next line
|
|
170 if line[0:1] == "#" : continue
|
|
171 fields = line.split('\t')
|
|
172 if maf_uid == fields[1]:
|
|
173 try:
|
|
174 maf_files = fields[4].replace( "\n", "" ).replace( "\r", "" ).split( "," )
|
|
175 return bx.align.maf.MultiIndexed( maf_files, keep_open = True, parse_e_rows = False )
|
|
176 except Exception, e:
|
|
177 raise Exception( 'MAF UID (%s) found, but configuration appears to be malformed: %s' % ( maf_uid, e ) )
|
|
178 except:
|
|
179 pass
|
|
180 return None
|
|
181
|
|
182 #return ( index, temp_index_filename ) for user maf, if available, or build one and return it, return None when no tempfile is created
|
|
183 def open_or_build_maf_index( maf_file, index_filename, species = None ):
|
|
184 try:
|
|
185 return ( bx.align.maf.Indexed( maf_file, index_filename = index_filename, keep_open = True, parse_e_rows = False ), None )
|
|
186 except:
|
|
187 return build_maf_index( maf_file, species = species )
|
|
188
|
|
189 def build_maf_index_species_chromosomes( filename, index_species = None ):
|
|
190 species = []
|
|
191 species_chromosomes = {}
|
|
192 indexes = bx.interval_index_file.Indexes()
|
|
193 blocks = 0
|
|
194 try:
|
|
195 maf_reader = bx.align.maf.Reader( open( filename ) )
|
|
196 while True:
|
|
197 pos = maf_reader.file.tell()
|
|
198 block = maf_reader.next()
|
|
199 if block is None:
|
|
200 break
|
|
201 blocks += 1
|
|
202 for c in block.components:
|
|
203 spec = c.src
|
|
204 chrom = None
|
|
205 if "." in spec:
|
|
206 spec, chrom = spec.split( ".", 1 )
|
|
207 if spec not in species:
|
|
208 species.append( spec )
|
|
209 species_chromosomes[spec] = []
|
|
210 if chrom and chrom not in species_chromosomes[spec]:
|
|
211 species_chromosomes[spec].append( chrom )
|
|
212 if index_species is None or spec in index_species:
|
|
213 forward_strand_start = c.forward_strand_start
|
|
214 forward_strand_end = c.forward_strand_end
|
|
215 try:
|
|
216 forward_strand_start = int( forward_strand_start )
|
|
217 forward_strand_end = int( forward_strand_end )
|
|
218 except ValueError:
|
|
219 continue #start and end are not integers, can't add component to index, goto next component
|
|
220 #this likely only occurs when parse_e_rows is True?
|
|
221 #could a species exist as only e rows? should the
|
|
222 if forward_strand_end > forward_strand_start:
|
|
223 #require positive length; i.e. certain lines have start = end = 0 and cannot be indexed
|
|
224 indexes.add( c.src, forward_strand_start, forward_strand_end, pos, max=c.src_size )
|
|
225 except Exception, e:
|
|
226 #most likely a bad MAF
|
|
227 log.debug( 'Building MAF index on %s failed: %s' % ( filename, e ) )
|
|
228 return ( None, [], {}, 0 )
|
|
229 return ( indexes, species, species_chromosomes, blocks )
|
|
230
|
|
231 #builds and returns ( index, index_filename ) for specified maf_file
|
|
232 def build_maf_index( maf_file, species = None ):
|
|
233 indexes, found_species, species_chromosomes, blocks = build_maf_index_species_chromosomes( maf_file, species )
|
|
234 if indexes is not None:
|
|
235 fd, index_filename = tempfile.mkstemp()
|
|
236 out = os.fdopen( fd, 'w' )
|
|
237 indexes.write( out )
|
|
238 out.close()
|
|
239 return ( bx.align.maf.Indexed( maf_file, index_filename = index_filename, keep_open = True, parse_e_rows = False ), index_filename )
|
|
240 return ( None, None )
|
|
241
|
|
242 def component_overlaps_region( c, region ):
|
|
243 if c is None: return False
|
|
244 start, end = c.get_forward_strand_start(), c.get_forward_strand_end()
|
|
245 if region.start >= end or region.end <= start:
|
|
246 return False
|
|
247 return True
|
|
248
|
|
249 def chop_block_by_region( block, src, region, species = None, mincols = 0 ):
|
|
250 # This chopping method was designed to maintain consistency with how start/end padding gaps have been working in Galaxy thus far:
|
|
251 # behavior as seen when forcing blocks to be '+' relative to src sequence (ref) and using block.slice_by_component( ref, slice_start, slice_end )
|
|
252 # whether-or-not this is the 'correct' behavior is questionable, but this will at least maintain consistency
|
|
253 # comments welcome
|
|
254 slice_start = block.text_size #max for the min()
|
|
255 slice_end = 0 #min for the max()
|
|
256 old_score = block.score #save old score for later use
|
|
257 # We no longer assume only one occurance of src per block, so we need to check them all
|
|
258 for c in iter_components_by_src( block, src ):
|
|
259 if component_overlaps_region( c, region ):
|
|
260 if c.text is not None:
|
|
261 rev_strand = False
|
|
262 if c.strand == "-":
|
|
263 #We want our coord_to_col coordinates to be returned from positive stranded component
|
|
264 rev_strand = True
|
|
265 c = c.reverse_complement()
|
|
266 start = max( region.start, c.start )
|
|
267 end = min( region.end, c.end )
|
|
268 start = c.coord_to_col( start )
|
|
269 end = c.coord_to_col( end )
|
|
270 if rev_strand:
|
|
271 #need to orient slice coordinates to the original block direction
|
|
272 slice_len = end - start
|
|
273 end = len( c.text ) - start
|
|
274 start = end - slice_len
|
|
275 slice_start = min( start, slice_start )
|
|
276 slice_end = max( end, slice_end )
|
|
277
|
|
278 if slice_start < slice_end:
|
|
279 block = block.slice( slice_start, slice_end )
|
|
280 if block.text_size > mincols:
|
|
281 # restore old score, may not be accurate, but it is better than 0 for everything?
|
|
282 block.score = old_score
|
|
283 if species is not None:
|
|
284 block = block.limit_to_species( species )
|
|
285 block.remove_all_gap_columns()
|
|
286 return block
|
|
287 return None
|
|
288
|
|
289 def orient_block_by_region( block, src, region, force_strand = None ):
|
|
290 #loop through components matching src,
|
|
291 #make sure each of these components overlap region
|
|
292 #cache strand for each of overlaping regions
|
|
293 #if force_strand / region.strand not in strand cache, reverse complement
|
|
294 ### we could have 2 sequences with same src, overlapping region, on different strands, this would cause no reverse_complementing
|
|
295 strands = [ c.strand for c in iter_components_by_src( block, src ) if component_overlaps_region( c, region ) ]
|
|
296 if strands and ( force_strand is None and region.strand not in strands ) or ( force_strand is not None and force_strand not in strands ):
|
|
297 block = block.reverse_complement()
|
|
298 return block
|
|
299
|
|
300 def get_oriented_chopped_blocks_for_region( index, src, region, species = None, mincols = 0, force_strand = None ):
|
|
301 for block, idx, offset in get_oriented_chopped_blocks_with_index_offset_for_region( index, src, region, species, mincols, force_strand ):
|
|
302 yield block
|
|
303 def get_oriented_chopped_blocks_with_index_offset_for_region( index, src, region, species = None, mincols = 0, force_strand = None ):
|
|
304 for block, idx, offset in get_chopped_blocks_with_index_offset_for_region( index, src, region, species, mincols ):
|
|
305 yield orient_block_by_region( block, src, region, force_strand ), idx, offset
|
|
306
|
|
307 #split a block with multiple occurances of src into one block per src
|
|
308 def iter_blocks_split_by_src( block, src ):
|
|
309 for src_c in iter_components_by_src( block, src ):
|
|
310 new_block = bx.align.Alignment( score=block.score, attributes=deepcopy( block.attributes ) )
|
|
311 new_block.text_size = block.text_size
|
|
312 for c in block.components:
|
|
313 if c == src_c or c.src != src:
|
|
314 new_block.add_component( deepcopy( c ) ) #components have reference to alignment, dont want to loose reference to original alignment block in original components
|
|
315 yield new_block
|
|
316
|
|
317 #split a block into multiple blocks with all combinations of a species appearing only once per block
|
|
318 def iter_blocks_split_by_species( block, species = None ):
|
|
319 def __split_components_by_species( components_by_species, new_block ):
|
|
320 if components_by_species:
|
|
321 #more species with components to add to this block
|
|
322 components_by_species = deepcopy( components_by_species )
|
|
323 spec_comps = components_by_species.pop( 0 )
|
|
324 for c in spec_comps:
|
|
325 newer_block = deepcopy( new_block )
|
|
326 newer_block.add_component( deepcopy( c ) )
|
|
327 for value in __split_components_by_species( components_by_species, newer_block ):
|
|
328 yield value
|
|
329 else:
|
|
330 #no more components to add, yield this block
|
|
331 yield new_block
|
|
332
|
|
333 #divide components by species
|
|
334 spec_dict = {}
|
|
335 if not species:
|
|
336 species = []
|
|
337 for c in block.components:
|
|
338 spec, chrom = src_split( c.src )
|
|
339 if spec not in spec_dict:
|
|
340 spec_dict[ spec ] = []
|
|
341 species.append( spec )
|
|
342 spec_dict[ spec ].append( c )
|
|
343 else:
|
|
344 for spec in species:
|
|
345 spec_dict[ spec ] = []
|
|
346 for c in iter_components_by_src_start( block, spec ):
|
|
347 spec_dict[ spec ].append( c )
|
|
348
|
|
349 empty_block = bx.align.Alignment( score=block.score, attributes=deepcopy( block.attributes ) ) #should we copy attributes?
|
|
350 empty_block.text_size = block.text_size
|
|
351 #call recursive function to split into each combo of spec/blocks
|
|
352 for value in __split_components_by_species( spec_dict.values(), empty_block ):
|
|
353 sort_block_components_by_block( value, block ) #restore original component order
|
|
354 yield value
|
|
355
|
|
356
|
|
357 #generator yielding only chopped and valid blocks for a specified region
|
|
358 def get_chopped_blocks_for_region( index, src, region, species = None, mincols = 0 ):
|
|
359 for block, idx, offset in get_chopped_blocks_with_index_offset_for_region( index, src, region, species, mincols ):
|
|
360 yield block
|
|
361 def get_chopped_blocks_with_index_offset_for_region( index, src, region, species = None, mincols = 0 ):
|
|
362 for block, idx, offset in index.get_as_iterator_with_index_and_offset( src, region.start, region.end ):
|
|
363 block = chop_block_by_region( block, src, region, species, mincols )
|
|
364 if block is not None:
|
|
365 yield block, idx, offset
|
|
366
|
|
367 #returns a filled region alignment for specified regions
|
|
368 def get_region_alignment( index, primary_species, chrom, start, end, strand = '+', species = None, mincols = 0, overwrite_with_gaps = True ):
|
|
369 if species is not None: alignment = RegionAlignment( end - start, species )
|
|
370 else: alignment = RegionAlignment( end - start, primary_species )
|
|
371 return fill_region_alignment( alignment, index, primary_species, chrom, start, end, strand, species, mincols, overwrite_with_gaps )
|
|
372
|
|
373 #reduces a block to only positions exisiting in the src provided
|
|
374 def reduce_block_by_primary_genome( block, species, chromosome, region_start ):
|
|
375 #returns ( startIndex, {species:texts}
|
|
376 #where texts' contents are reduced to only positions existing in the primary genome
|
|
377 src = "%s.%s" % ( species, chromosome )
|
|
378 ref = block.get_component_by_src( src )
|
|
379 start_offset = ref.start - region_start
|
|
380 species_texts = {}
|
|
381 for c in block.components:
|
|
382 species_texts[ c.src.split( '.' )[0] ] = list( c.text )
|
|
383 #remove locations which are gaps in the primary species, starting from the downstream end
|
|
384 for i in range( len( species_texts[ species ] ) - 1, -1, -1 ):
|
|
385 if species_texts[ species ][i] == '-':
|
|
386 for text in species_texts.values():
|
|
387 text.pop( i )
|
|
388 for spec, text in species_texts.items():
|
|
389 species_texts[spec] = ''.join( text )
|
|
390 return ( start_offset, species_texts )
|
|
391
|
|
392 #fills a region alignment
|
|
393 def fill_region_alignment( alignment, index, primary_species, chrom, start, end, strand = '+', species = None, mincols = 0, overwrite_with_gaps = True ):
|
|
394 region = bx.intervals.Interval( start, end )
|
|
395 region.chrom = chrom
|
|
396 region.strand = strand
|
|
397 primary_src = "%s.%s" % ( primary_species, chrom )
|
|
398
|
|
399 #Order blocks overlaping this position by score, lowest first
|
|
400 blocks = []
|
|
401 for block, idx, offset in index.get_as_iterator_with_index_and_offset( primary_src, start, end ):
|
|
402 score = float( block.score )
|
|
403 for i in range( 0, len( blocks ) ):
|
|
404 if score < blocks[i][0]:
|
|
405 blocks.insert( i, ( score, idx, offset ) )
|
|
406 break
|
|
407 else:
|
|
408 blocks.append( ( score, idx, offset ) )
|
|
409
|
|
410 #gap_chars_tuple = tuple( GAP_CHARS )
|
|
411 gap_chars_str = ''.join( GAP_CHARS )
|
|
412 #Loop through ordered blocks and layer by increasing score
|
|
413 for block_dict in blocks:
|
|
414 for block in iter_blocks_split_by_species( block_dict[1].get_at_offset( block_dict[2] ) ): #need to handle each occurance of sequence in block seperately
|
|
415 if component_overlaps_region( block.get_component_by_src( primary_src ), region ):
|
|
416 block = chop_block_by_region( block, primary_src, region, species, mincols ) #chop block
|
|
417 block = orient_block_by_region( block, primary_src, region ) #orient block
|
|
418 start_offset, species_texts = reduce_block_by_primary_genome( block, primary_species, chrom, start )
|
|
419 for spec, text in species_texts.items():
|
|
420 #we should trim gaps from both sides, since these are not positions in this species genome (sequence)
|
|
421 text = text.rstrip( gap_chars_str )
|
|
422 gap_offset = 0
|
|
423 while True in [ text.startswith( gap_char ) for gap_char in GAP_CHARS ]: #python2.4 doesn't accept a tuple for .startswith()
|
|
424 #while text.startswith( gap_chars_tuple ):
|
|
425 gap_offset += 1
|
|
426 text = text[1:]
|
|
427 if not text:
|
|
428 break
|
|
429 if text:
|
|
430 if overwrite_with_gaps:
|
|
431 alignment.set_range( start_offset + gap_offset, spec, text )
|
|
432 else:
|
|
433 for i, char in enumerate( text ):
|
|
434 if char not in GAP_CHARS:
|
|
435 alignment.set_position( start_offset + gap_offset + i, spec, char )
|
|
436 return alignment
|
|
437
|
|
438 #returns a filled spliced region alignment for specified region with start and end lists
|
|
439 def get_spliced_region_alignment( index, primary_species, chrom, starts, ends, strand = '+', species = None, mincols = 0, overwrite_with_gaps = True ):
|
|
440 #create spliced alignment object
|
|
441 if species is not None: alignment = SplicedAlignment( starts, ends, species )
|
|
442 else: alignment = SplicedAlignment( starts, ends, [primary_species] )
|
|
443 for exon in alignment.exons:
|
|
444 fill_region_alignment( exon, index, primary_species, chrom, exon.start, exon.end, strand, species, mincols, overwrite_with_gaps )
|
|
445 return alignment
|
|
446
|
|
447 #loop through string array, only return non-commented lines
|
|
448 def line_enumerator( lines, comment_start = '#' ):
|
|
449 i = 0
|
|
450 for line in lines:
|
|
451 if not line.startswith( comment_start ):
|
|
452 i += 1
|
|
453 yield ( i, line )
|
|
454
|
|
455 #read a GeneBed file, return list of starts, ends, raw fields
|
|
456 def get_starts_ends_fields_from_gene_bed( line ):
|
|
457 #Starts and ends for exons
|
|
458 starts = []
|
|
459 ends = []
|
|
460
|
|
461 fields = line.split()
|
|
462 #Requires atleast 12 BED columns
|
|
463 if len(fields) < 12:
|
|
464 raise Exception( "Not a proper 12 column BED line (%s)." % line )
|
|
465 chrom = fields[0]
|
|
466 tx_start = int( fields[1] )
|
|
467 tx_end = int( fields[2] )
|
|
468 name = fields[3]
|
|
469 strand = fields[5]
|
|
470 if strand != '-': strand='+' #Default strand is +
|
|
471 cds_start = int( fields[6] )
|
|
472 cds_end = int( fields[7] )
|
|
473
|
|
474 #Calculate and store starts and ends of coding exons
|
|
475 region_start, region_end = cds_start, cds_end
|
|
476 exon_starts = map( int, fields[11].rstrip( ',\n' ).split( ',' ) )
|
|
477 exon_starts = map( ( lambda x: x + tx_start ), exon_starts )
|
|
478 exon_ends = map( int, fields[10].rstrip( ',' ).split( ',' ) )
|
|
479 exon_ends = map( ( lambda x, y: x + y ), exon_starts, exon_ends );
|
|
480 for start, end in zip( exon_starts, exon_ends ):
|
|
481 start = max( start, region_start )
|
|
482 end = min( end, region_end )
|
|
483 if start < end:
|
|
484 starts.append( start )
|
|
485 ends.append( end )
|
|
486 return ( starts, ends, fields )
|
|
487
|
|
488 def iter_components_by_src( block, src ):
|
|
489 for c in block.components:
|
|
490 if c.src == src:
|
|
491 yield c
|
|
492
|
|
493 def get_components_by_src( block, src ):
|
|
494 return [ value for value in iter_components_by_src( block, src ) ]
|
|
495
|
|
496 def iter_components_by_src_start( block, src ):
|
|
497 for c in block.components:
|
|
498 if c.src.startswith( src ):
|
|
499 yield c
|
|
500
|
|
501 def get_components_by_src_start( block, src ):
|
|
502 return [ value for value in iter_components_by_src_start( block, src ) ]
|
|
503
|
|
504 def sort_block_components_by_block( block1, block2 ):
|
|
505 #orders the components in block1 by the index of the component in block2
|
|
506 #block1 must be a subset of block2
|
|
507 #occurs in-place
|
|
508 return block1.components.sort( cmp = lambda x, y: block2.components.index( x ) - block2.components.index( y ) )
|
|
509
|
|
510 def get_species_in_maf( maf_filename ):
|
|
511 species = []
|
|
512 for block in bx.align.maf.Reader( open( maf_filename ) ):
|
|
513 for spec in get_species_in_block( block ):
|
|
514 if spec not in species:
|
|
515 species.append( spec )
|
|
516 return species
|
|
517
|
|
518 def parse_species_option( species ):
|
|
519 if species:
|
|
520 species = species.split( ',' )
|
|
521 if 'None' not in species:
|
|
522 return species
|
|
523 return None #provided species was '', None, or had 'None' in it
|
|
524
|
|
525 def remove_temp_index_file( index_filename ):
|
|
526 try: os.unlink( index_filename )
|
|
527 except: pass
|
|
528
|
|
529 #Below are methods to deal with FASTA files
|
|
530
|
|
531 def get_fasta_header( component, attributes = {}, suffix = None ):
|
|
532 header = ">%s(%s):%i-%i|" % ( component.src, component.strand, component.get_forward_strand_start(), component.get_forward_strand_end() )
|
|
533 for key, value in attributes.iteritems():
|
|
534 header = "%s%s=%s|" % ( header, key, value )
|
|
535 if suffix:
|
|
536 header = "%s%s" % ( header, suffix )
|
|
537 else:
|
|
538 header = "%s%s" % ( header, src_split( component.src )[ 0 ] )
|
|
539 return header
|
|
540
|
|
541 def get_attributes_from_fasta_header( header ):
|
|
542 if not header: return {}
|
|
543 attributes = {}
|
|
544 header = header.lstrip( '>' )
|
|
545 header = header.strip()
|
|
546 fields = header.split( '|' )
|
|
547 try:
|
|
548 region = fields[0]
|
|
549 region = region.split( '(', 1 )
|
|
550 temp = region[0].split( '.', 1 )
|
|
551 attributes['species'] = temp[0]
|
|
552 if len( temp ) == 2:
|
|
553 attributes['chrom'] = temp[1]
|
|
554 else:
|
|
555 attributes['chrom'] = temp[0]
|
|
556 region = region[1].split( ')', 1 )
|
|
557 attributes['strand'] = region[0]
|
|
558 region = region[1].lstrip( ':' ).split( '-' )
|
|
559 attributes['start'] = int( region[0] )
|
|
560 attributes['end'] = int( region[1] )
|
|
561 except:
|
|
562 #fields 0 is not a region coordinate
|
|
563 pass
|
|
564 if len( fields ) > 2:
|
|
565 for i in xrange( 1, len( fields ) - 1 ):
|
|
566 prop = fields[i].split( '=', 1 )
|
|
567 if len( prop ) == 2:
|
|
568 attributes[ prop[0] ] = prop[1]
|
|
569 if len( fields ) > 1:
|
|
570 attributes['__suffix__'] = fields[-1]
|
|
571 return attributes
|
|
572
|
|
573 def iter_fasta_alignment( filename ):
|
|
574 class fastaComponent:
|
|
575 def __init__( self, species, text = "" ):
|
|
576 self.species = species
|
|
577 self.text = text
|
|
578 def extend( self, text ):
|
|
579 self.text = self.text + text.replace( '\n', '' ).replace( '\r', '' ).strip()
|
|
580 #yields a list of fastaComponents for a FASTA file
|
|
581 f = open( filename, 'rb' )
|
|
582 components = []
|
|
583 #cur_component = None
|
|
584 while True:
|
|
585 line = f.readline()
|
|
586 if not line:
|
|
587 if components:
|
|
588 yield components
|
|
589 return
|
|
590 line = line.strip()
|
|
591 if not line:
|
|
592 if components:
|
|
593 yield components
|
|
594 components = []
|
|
595 elif line.startswith( '>' ):
|
|
596 attributes = get_attributes_from_fasta_header( line )
|
|
597 components.append( fastaComponent( attributes['species'] ) )
|
|
598 elif components:
|
|
599 components[-1].extend( line )
|
|
600
|