annotate scripts/build_profile_indexes.py @ 0:4414f0739808 draft default tip

Imported from capsule None
author devteam
date Mon, 19 May 2014 10:59:42 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
1 #!/usr/bin/env python
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
2 #Dan Blankenberg
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
3
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
4 VERSION = '1.0.0' # version of this script
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
5
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
6 from optparse import OptionParser
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
7 import os, gzip, struct, time
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
8 from ftplib import FTP #do we want a diff method than using FTP to determine Chrom Names, eg use local copy
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
9
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
10 #import md5 from hashlib; if python2.4 or less, use old md5
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
11 try:
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
12 from hashlib import md5
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
13 except ImportError:
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
14 from md5 import new as md5
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
15
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
16 #import BitSet from bx-python, try using eggs and package resources, fall back to any local installation
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
17 try:
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
18 from galaxy import eggs
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
19 import pkg_resources
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
20 pkg_resources.require( "bx-python" )
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
21 except: pass #Maybe there is a local installation available
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
22 from bx.bitset import BitSet
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
23
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
#Constants describing the binary index layout and I/O sizes
STRUCT_FMT = '<I' #struct format string; recorded in profiler_info.txt
STRUCT_SIZE = struct.calcsize( STRUCT_FMT ) #byte size of one STRUCT_FMT value
DEFAULT_BITSET_SIZE = 300000000 #default for the --bitset_size option
CHUNK_SIZE = 1024 #number of bytes read per call while hashing files

#Column names recognised in .sql table definitions, used to locate the
#chromosome name, start and end columns; earlier names take precedence
alias_spec = {
    'chromCol' : [
        'chrom',
        'CHROMOSOME',
        'CHROM',
        'Chromosome Name',
        'tName',
    ],
    'startCol' : [
        'start',
        'START',
        'chromStart',
        'txStart',
        'Start Position (bp)',
        'tStart',
        'genoStart',
    ],
    'endCol' : [
        'end',
        'END',
        'STOP',
        'chromEnd',
        'txEnd',
        'End Position (bp)',
        'tEnd',
        'genoEnd',
    ],
}

#Field names, in column order, of each record in trackDb.txt.gz
#TODO: these should be parsed directly from trackDb.sql
trackDb_headers = [
    "tableName",
    "shortLabel",
    "type",
    "longLabel",
    "visibility",
    "priority",
    "colorR",
    "colorG",
    "colorB",
    "altColorR",
    "altColorG",
    "altColorB",
    "useScore",
    "private",
    "restrictCount",
    "restrictList",
    "url",
    "html",
    "grp",
    "canPack",
    "settings",
]
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
40
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
def _find_column( headers, candidates ):
    """Return the index in headers of the first candidate name present, or None.

    Candidates are tried in the order given, so an earlier candidate wins even
    when a later one appears earlier in headers.
    """
    for candidate in candidates:
        if candidate in headers:
            return headers.index( candidate )
    return None


def get_columns( filename, aliases=None ):
    """Locate the chromosome, start and end columns of a table from its .sql dump.

    The text between the first 'CREATE TABLE ' and the following ';' is taken
    as the table definition. Every line inside its outermost parentheses counts
    as one entry (this includes non-column lines such as KEY definitions), and
    the first space-separated token of each line, with backquotes removed, is
    used as the entry name.

    filename -- path of the .sql file to read.
    aliases  -- optional dict with 'chromCol', 'startCol' and 'endCol' keys,
                each mapping to a list of accepted column names in priority
                order; defaults to the module-level alias_spec.

    Returns a tuple ( table_name, chrom_col, start_col, end_col ); each column
    value is a zero-based entry index, or None when no alias matched.

    Raises IndexError when the file has no 'CREATE TABLE ' statement or the
    statement has no ' (' after the table name.
    """
    if aliases is None:
        aliases = alias_spec
    #try/finally rather than 'with' so the handle is closed on old interpreters too
    sql_file = open( filename )
    try:
        sql_text = sql_file.read()
    finally:
        sql_file.close()
    statement = sql_text.split( 'CREATE TABLE ' )[1].split( ';' )[0]
    name_and_body = statement.split( ' (', 1 )
    table_name = name_and_body[0].strip().strip( '`' )
    #drop everything from the closing parenthesis of the definition onwards
    definition_lines = name_and_body[1].rsplit( ')', 1 )[0].strip().split( '\n' )
    headers = [ entry.strip().split( ' ' )[0].strip().strip( '`' ) for entry in definition_lines ]
    print( headers )
    chrom_col = _find_column( headers, aliases['chromCol'] )
    start_col = _find_column( headers, aliases['startCol'] )
    end_col = _find_column( headers, aliases['endCol'] )
    return table_name, chrom_col, start_col, end_col
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
76
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
77
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
def _xml_attr( value ):
    """Return value escaped for use inside a double-quoted XML attribute."""
    from xml.sax.saxutils import escape
    return escape( value, { '"': '&quot;' } )


def _load_track_groups( input_dir, file_name='grp.txt.gz' ):
    """Read the tab-separated group file and return { name: { 'desc', 'priority' } }."""
    groups = {}
    #explicit close: GzipFile is not a context manager on older Python 2 releases
    grp_file = gzip.open( os.path.join( input_dir, file_name ) )
    try:
        for line in grp_file:
            fields = line.rstrip( '\n\r' ).split( '\t' )
            groups[ fields[0] ] = { 'desc': fields[1], 'priority': fields[2] }
    finally:
        grp_file.close()
    return groups


def _parse_track_db( input_dir ):
    """Parse trackDb.txt.gz and return { table_name: { header: value } }.

    The 'settings' value of every table is itself a dict built from the
    newline-separated 'name value' entries of the last field.
    Raises AssertionError when a record does not have one field per header.
    """
    tables = {}
    track_db = gzip.open( os.path.join( input_dir, 'trackDb.txt.gz' ) )
    try:
        cur_buf = ''
        for line in track_db:
            line = line.rstrip( '\n\r' )
            line = line.replace( '\\\t', ' ' ) #replace escaped tabs with space
            cur_buf += "%s\n" % line.rstrip( '\\' )
            if line.endswith( '\\' ):
                continue #record is wrapped onto the next line
            fields = cur_buf.split( '\t' )
            cur_buf = '' #reset buffer for the next record
            #raised explicitly so the check is not stripped when running with -O
            if len( fields ) != len( trackDb_headers ):
                raise AssertionError( 'Failed Parsing trackDb.txt.gz; fields: %s' % fields )
            record = dict( zip( trackDb_headers, fields ) )
            settings = {}
            for setting in fields[-1].split( '\n' ):
                setting_fields = setting.split( ' ', 1 )
                #a setting without a value uses its own name as the value
                setting_name = setting_value = setting_fields[ 0 ]
                if len( setting_fields ) > 1:
                    setting_value = setting_fields[ 1 ]
                if setting_name or setting_value:
                    settings[ setting_name ] = setting_value
            record[ 'settings' ] = settings
            tables[ fields[ 0 ] ] = record
    finally:
        track_db.close()
    return tables


def _track_option( tables, track ):
    """Return the self-closing <option/> line for one track table."""
    track_label = tables[ track ][ 'shortLabel' ]
    if "$" in track_label:
        #presumably an unexpanded trackDb variable -- TODO confirm; use the table name instead
        track_label = track
    return """ <option name="%s" value="%s"/>\n""" % ( _xml_attr( track_label ), _xml_attr( track ) )


def create_grouping_xml( input_dir, output_dir, dbkey ):
    """Write <output_dir>/<dbkey>_tables.xml, a nested option list of profiled tables.

    Tables listed in trackDb.txt.gz that have a matching entry in output_dir
    are grouped by their 'grp' field and then by the first word of their
    'parent' setting (the table name itself when there is no parent). A parent
    with more than one table gets its own nested option. Directories in
    output_dir that were not assigned to any group are listed under
    'Uncategorized Tables'.

    input_dir  -- directory containing trackDb.txt.gz and grp.txt.gz.
    output_dir -- directory holding one sub-directory per profiled table; the
                  XML file is written here.
    dbkey      -- genome build identifier used in the file name and filter value.

    All attribute values are XML-escaped. The file is only created once the
    whole document has been generated, so a parse failure leaves no partial file.
    Raises AssertionError when trackDb.txt.gz cannot be parsed.
    """
    output_filename = os.path.join( output_dir, '%s_tables.xml' % dbkey )
    tables = _parse_track_db( input_dir )
    groups = _load_track_groups( input_dir )

    #group -> parent track -> list of table names that have been profiled
    in_groups = {}
    for table_name, values in tables.items():
        if not os.path.exists( os.path.join( output_dir, table_name ) ):
            continue
        #***NAME CHANGE***, 'subTrack' no longer exists as a setting...use 'parent' instead
        #need to split, because could be e.g. 'trackgroup on'
        sub_track = values[ 'settings' ].get( 'parent', table_name ).split( ' ' )[0]
        in_groups.setdefault( values[ 'grp' ], {} ).setdefault( sub_track, [] ).append( table_name )

    assigned_tables = set()
    xml = [ """<filter type="data_meta" data_ref="input1" meta_key="dbkey" value="%s">\n""" % _xml_attr( dbkey ) ]
    xml.append( " <options>\n" )
    for group, sub_tracks_by_parent in sorted( in_groups.items() ):
        #fall back to the group id when grp.txt.gz has no entry for it
        group_desc = groups.get( group, {} ).get( 'desc', group )
        xml.append( """ <option name="%s" value="group-%s">\n""" % ( _xml_attr( group_desc ), _xml_attr( group ) ) )
        #sorted so that the generated file is reproducible between runs
        for sub_name, sub_tracks in sorted( sub_tracks_by_parent.items() ):
            nested = len( sub_tracks ) > 1
            if nested:
                xml.append( """ <option name="%s" value="subtracks-%s">\n""" % ( _xml_attr( sub_name ), _xml_attr( sub_name ) ) )
            for track in sorted( sub_tracks ):
                xml.append( _track_option( tables, track ) )
                assigned_tables.add( track )
            if nested:
                xml.append( " </option>\n" )
        xml.append( " </option>\n" )

    unassigned_tables = sorted( [ table_dir for table_dir in os.listdir( output_dir ) if table_dir not in assigned_tables and os.path.isdir( os.path.join( output_dir, table_dir ) ) ] )
    if unassigned_tables:
        xml.append( """ <option name="Uncategorized Tables" value="group-trackDbUnassigned">\n""" )
        for table_name in unassigned_tables:
            xml.append( """ <option name="%s" value="%s"/>\n""" % ( _xml_attr( table_name ), _xml_attr( table_name ) ) )
        xml.append( " </option>\n" )
    xml.append( " </options>\n" )
    xml.append( """</filter>\n""" )

    out = open( output_filename, 'wb' )
    try:
        out.write( ''.join( xml ) )
    finally:
        out.close()
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
165
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
def _hash_files( paths ):
    """Return the hex MD5 digest of the contents of paths, read in the given order."""
    digest = md5()
    for path in paths:
        hashed_file = open( path, 'rb' )
        try:
            while True:
                hash_chunk = hashed_file.read( CHUNK_SIZE )
                if not hash_chunk:
                    break
                digest.update( hash_chunk )
        finally:
            hashed_file.close()
    return digest.hexdigest()


def _read_dump_time( input_dir ):
    """Return the text after '-- Dump completed on ' in trackDb.sql, or None if absent."""
    prefix = '-- Dump completed on '
    sql_file = open( os.path.join( input_dir, 'trackDb.sql' ) )
    try:
        for line in sql_file:
            line = line.strip()
            if line.startswith( prefix ):
                return line[ len( prefix ): ]
    finally:
        sql_file.close()
    return None


def write_database_dump_info( input_dir, output_dir, dbkey, chrom_lengths, default_bitset_size ):
    """Write <output_dir>/profiler_info.txt describing the dump and the profiling run.

    The file holds tab-separated 'key<TAB>value' lines: dbkey, chromosomes,
    bitset_size, dump_time (only when trackDb.sql has a 'Dump completed on'
    line), dump_hash, profiler_time, profiler_hash, profiler_version,
    profiler_struct_format and profiler_struct_size.

    profiler_hash covers the files inside each directory directly below
    output_dir, directories and files taken in sorted order; files in the
    output root, including profiler_info.txt itself, are not hashed.
    dump_hash covers every file found by walking input_dir, in sorted order.

    input_dir           -- directory of the database dump; must contain trackDb.sql.
    output_dir          -- directory of the profiled tables.
    dbkey               -- genome build identifier.
    chrom_lengths       -- dict mapping chromosome name to length.
    default_bitset_size -- value written as bitset_size.
    """
    #hash of the profiled table directories
    profiled_paths = []
    for table_dir in sorted( os.listdir( output_dir ) ):
        table_path = os.path.join( output_dir, table_dir )
        if os.path.isdir( table_path ):
            for filename in sorted( os.listdir( table_path ) ):
                profiled_paths.append( os.path.join( table_path, filename ) )
    profiled_hash = _hash_files( profiled_paths )

    #hash of the input dump; dirpath from os.walk already starts with input_dir,
    #so joining input_dir again would break every relative input path
    database_paths = []
    for dirpath, dirnames, filenames in sorted( os.walk( input_dir ) ):
        for filename in sorted( filenames ):
            database_paths.append( os.path.join( dirpath, filename ) )
    database_hash = _hash_files( database_paths )

    dump_time = _read_dump_time( input_dir )

    #chromosomes are sorted so that the info file is reproducible between runs
    chromosomes = ','.join( [ '%s=%s' % ( chrom_name, chrom_len ) for chrom_name, chrom_len in sorted( chrom_lengths.items() ) ] )

    #write out info file
    out = open( os.path.join( output_dir, 'profiler_info.txt' ), 'wb' )
    try:
        out.write( 'dbkey\t%s\n' % ( dbkey ) )
        out.write( 'chromosomes\t%s\n' % ( chromosomes ) )
        out.write( 'bitset_size\t%s\n' % ( default_bitset_size ) )
        if dump_time is not None:
            out.write( 'dump_time\t%s\n' % ( dump_time ) )
        out.write( 'dump_hash\t%s\n' % ( database_hash ) )
        out.write( 'profiler_time\t%s\n' % ( time.time() ) )
        out.write( 'profiler_hash\t%s\n' % ( profiled_hash ) )
        out.write( 'profiler_version\t%s\n' % ( VERSION ) )
        out.write( 'profiler_struct_format\t%s\n' % ( STRUCT_FMT ) )
        out.write( 'profiler_struct_size\t%s\n' % ( STRUCT_SIZE ) )
    finally:
        out.close()
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
213
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
214 def __main__():
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
215 usage = "usage: %prog options"
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
216 parser = OptionParser( usage=usage )
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
217 parser.add_option( '-d', '--dbkey', dest='dbkey', default='hg18', help='dbkey to process' )
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
218 parser.add_option( '-i', '--input_dir', dest='input_dir', default=os.path.join( 'golden_path','%s', 'database' ), help='Input Directory' )
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
219 parser.add_option( '-o', '--output_dir', dest='output_dir', default=os.path.join( 'profiled_annotations','%s' ), help='Output Directory' )
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
220 parser.add_option( '-c', '--chromosomes', dest='chromosomes', default='', help='Comma separated list of: ChromName1[=length],ChromName2[=length],...' )
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
221 parser.add_option( '-b', '--bitset_size', dest='bitset_size', default=DEFAULT_BITSET_SIZE, type='int', help='Default BitSet size; overridden by sizes specified in chromInfo.txt.gz or by --chromosomes' )
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
222 parser.add_option( '-f', '--ftp_site', dest='ftp_site', default='hgdownload.cse.ucsc.edu', help='FTP site; used for chromosome info when chromInfo.txt.gz method fails' )
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
223 parser.add_option( '-p', '--ftp_path', dest='ftp_path', default='/goldenPath/%s/chromosomes/', help='FTP Path; used for chromosome info when chromInfo.txt.gz method fails' )
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
224
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
225 ( options, args ) = parser.parse_args()
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
226
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
227 input_dir = options.input_dir
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
228 if '%' in input_dir:
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
229 input_dir = input_dir % options.dbkey
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
230 assert os.path.exists( input_dir ), 'Input directory does not exist'
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
231 output_dir = options.output_dir
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
232 if '%' in output_dir:
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
233 output_dir = output_dir % options.dbkey
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
234 assert not os.path.exists( output_dir ), 'Output directory already exists'
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
235 os.makedirs( output_dir )
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
236 ftp_path = options.ftp_path
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
237 if '%' in ftp_path:
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
238 ftp_path = ftp_path % options.dbkey
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
239
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
240 #Get chromosome names and lengths
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
241 chrom_lengths = {}
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
242 if options.chromosomes:
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
243 for chrom in options.chromosomes.split( ',' ):
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
244 fields = chrom.split( '=' )
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
245 chrom = fields[0]
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
246 if len( fields ) > 1:
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
247 chrom_len = int( fields[1] )
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
248 else:
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
249 chrom_len = options.bitset_size
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
250 chrom_lengths[ chrom ] = chrom_len
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
251 chroms = chrom_lengths.keys()
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
252 print 'Chrom info taken from command line option.'
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
253 else:
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
254 try:
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
255 for line in gzip.open( os.path.join( input_dir, 'chromInfo.txt.gz' ) ):
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
256 fields = line.strip().split( '\t' )
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
257 chrom_lengths[ fields[0] ] = int( fields[ 1 ] )
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
258 chroms = chrom_lengths.keys()
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
259 print 'Chrom info taken from chromInfo.txt.gz.'
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
260 except Exception, e:
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
261 print 'Error loading chrom info from chromInfo.txt.gz, trying FTP method.'
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
262 chrom_lengths = {} #zero out chrom_lengths
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
263 chroms = []
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
264 ftp = FTP( options.ftp_site )
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
265 ftp.login()
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
266 for name in ftp.nlst( ftp_path ):
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
267 if name.endswith( '.fa.gz' ):
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
268 chroms.append( name.split( '/' )[-1][ :-len( '.fa.gz' ) ] )
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
269 ftp.close()
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
270 for chrom in chroms:
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
271 chrom_lengths[ chrom ] = options.bitset_size
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
272 #sort chroms by length of name, decending; necessary for when table names start with chrom name
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
273 chroms = list( reversed( [ chrom for chrom_len, chrom in sorted( [ ( len( chrom ), chrom ) for chrom in chroms ] ) ] ) )
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
274
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
275 #parse tables from local files
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
276 #loop through directory contents, if file ends in '.sql', process table
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
277 for filename in os.listdir( input_dir ):
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
278 if filename.endswith ( '.sql' ):
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
279 base_filename = filename[ 0:-len( '.sql' ) ]
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
280 table_out_dir = os.path.join( output_dir, base_filename )
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
281 #some tables are chromosome-specific, let's strip off the chrom name
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
282 for chrom in chroms:
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
283 if base_filename.startswith( "%s_" % chrom ):
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
284 #found chromosome
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
285 table_out_dir = os.path.join( output_dir, base_filename[len( "%s_" % chrom ):] )
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
286 break
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
287 #create table dir
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
288 if not os.path.exists( table_out_dir ):
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
289 os.mkdir( table_out_dir ) #table dir may already exist in the case of single chrom tables
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
290 print "Created table dir (%s)." % table_out_dir
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
291 else:
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
292 print "Table dir (%s) already exists." % table_out_dir
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
293 #find column assignments
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
294 table_name, chrom_col, start_col, end_col = get_columns( "%s.sql" % os.path.join( input_dir, base_filename ) )
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
295 if chrom_col is None or start_col is None or end_col is None:
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
296 print "Table %s (%s) does not appear to have a chromosome, a start, or a stop." % ( table_name, "%s.sql" % os.path.join( input_dir, base_filename ) )
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
297 if not os.listdir( table_out_dir ):
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
298 print "Removing empty table (%s) directory (%s)." % ( table_name, table_out_dir )
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
299 os.rmdir( table_out_dir )
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
300 continue
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
301 #build bitsets from table
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
302 bitset_dict = {}
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
303 for line in gzip.open( '%s.txt.gz' % os.path.join( input_dir, base_filename ) ):
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
304 fields = line.strip().split( '\t' )
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
305 chrom = fields[ chrom_col ]
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
306 start = int( fields[ start_col ] )
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
307 end = int( fields[ end_col ] )
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
308 if chrom not in bitset_dict:
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
309 bitset_dict[ chrom ] = BitSet( chrom_lengths.get( chrom, options.bitset_size ) )
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
310 bitset_dict[ chrom ].set_range( start, end - start )
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
311 #write bitsets as profiled annotations
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
312 for chrom_name, chrom_bits in bitset_dict.iteritems():
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
313 out = open( os.path.join( table_out_dir, '%s.covered' % chrom_name ), 'wb' )
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
314 end = 0
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
315 total_regions = 0
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
316 total_coverage = 0
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
317 max_size = chrom_lengths.get( chrom_name, options.bitset_size )
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
318 while True:
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
319 start = chrom_bits.next_set( end )
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
320 if start >= max_size:
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
321 break
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
322 end = chrom_bits.next_clear( start )
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
323 out.write( struct.pack( STRUCT_FMT, start ) )
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
324 out.write( struct.pack( STRUCT_FMT, end ) )
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
325 total_regions += 1
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
326 total_coverage += end - start
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
327 if end >= max_size:
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
328 break
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
329 out.close()
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
330 open( os.path.join( table_out_dir, '%s.total_regions' % chrom_name ), 'wb' ).write( str( total_regions ) )
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
331 open( os.path.join( table_out_dir, '%s.total_coverage' % chrom_name ), 'wb' ).write( str( total_coverage ) )
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
332
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
333 #create xml
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
334 create_grouping_xml( input_dir, output_dir, options.dbkey )
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
335 #create database dump info file, for database version control
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
336 write_database_dump_info( input_dir, output_dir, options.dbkey, chrom_lengths, options.bitset_size )
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
337
4414f0739808 Imported from capsule None
devteam
parents:
diff changeset
338 if __name__ == "__main__": __main__() #call __main__() only when this file is executed as a script, not when it is imported