# HG changeset patch
# User blankenberg
# Date 1422302056 18000
# Node ID 0e5299e77334f4e09f90160f340dcd150c53cb70
# Parent d59e1d23b38862177107dca40eeda1a914c860e4
Uploaded

diff -r d59e1d23b388 -r 0e5299e77334 data_manager/data_manager_fetch_genome_all_fasta_dbkeys.py
--- a/data_manager/data_manager_fetch_genome_all_fasta_dbkeys.py Wed Jul 02 00:35:38 2014 -0400
+++ b/data_manager/data_manager_fetch_genome_all_fasta_dbkeys.py Mon Jan 26 14:54:16 2015 -0500
@@ -14,7 +14,7 @@
 import gzip
 import bz2
 
-from galaxy.util.json import from_json_string, to_json_string
+from json import loads, dumps
 
 CHUNK_SIZE = 2**20 #1mb
 
@@ -55,7 +55,7 @@
 
 def _get_stream_readers_for_tar( file_obj, tmp_dir ):
     fasta_tar = tarfile.open( fileobj=file_obj, mode='r:*' )
-    return [ fasta_tar.extractfile( member ) for member in fasta_tar.getmembers() ]
+    return filter( lambda x: x is not None, [ fasta_tar.extractfile( member ) for member in fasta_tar.getmembers() ] )
 
 def _get_stream_readers_for_zip( file_obj, tmp_dir ):
     fasta_zip = zipfile.ZipFile( file_obj, 'r' )
@@ -177,7 +177,6 @@
 
 def download_from_ucsc( data_manager_dict, params, target_directory, dbkey, dbkey_name, sequence_id, sequence_name ):
     UCSC_FTP_SERVER = 'hgdownload.cse.ucsc.edu'
-    UCSC_CHROM_FA_FILENAME = 'chromFa'
     UCSC_DOWNLOAD_PATH = '/goldenPath/%s/bigZips/'
     COMPRESSED_EXTENSIONS = [ ( '.tar.gz', _get_stream_readers_for_tar ), ( '.tar.bz2', _get_stream_readers_for_tar ), ( '.zip', _get_stream_readers_for_zip ), ( '.fa.gz', _get_stream_readers_for_gzip ), ( '.fa.bz2', _get_stream_readers_for_bz2 ) ]
 
@@ -186,6 +185,8 @@
 
     email = 'anonymous@example.com'
     ucsc_dbkey = params['param_dict']['reference_source']['requested_dbkey'] or dbkey
+    UCSC_CHROM_FA_FILENAMES = [ '%s.chromFa' % ucsc_dbkey, 'chromFa' ]
+
     ftp = FTP( UCSC_FTP_SERVER )
     ftp.login( 'anonymous', email )
 
@@ -195,9 +196,13 @@
     ucsc_file_name = None
     get_stream_reader = None
     ext = None
-    for ext, get_stream_reader in COMPRESSED_EXTENSIONS:
-        if "%s%s" % ( UCSC_CHROM_FA_FILENAME, ext ) in path_contents:
-            ucsc_file_name = "%s%s%s" % ( ucsc_path, UCSC_CHROM_FA_FILENAME, ext )
+    ucsc_chrom_fa_filename = None
+    for ucsc_chrom_fa_filename in UCSC_CHROM_FA_FILENAMES:
+        for ext, get_stream_reader in COMPRESSED_EXTENSIONS:
+            if "%s%s" % ( ucsc_chrom_fa_filename, ext ) in path_contents:
+                ucsc_file_name = "%s%s%s" % ( ucsc_path, ucsc_chrom_fa_filename, ext )
+                break
+        if ucsc_file_name:
             break
 
     if not ucsc_file_name:
@@ -205,7 +210,7 @@
 
 
     tmp_dir = tempfile.mkdtemp( prefix='tmp-data-manager-ucsc-' )
-    ucsc_fasta_filename = os.path.join( tmp_dir, "%s%s" % ( UCSC_CHROM_FA_FILENAME, ext ) )
+    ucsc_fasta_filename = os.path.join( tmp_dir, "%s%s" % ( ucsc_chrom_fa_filename, ext ) )
 
     fasta_base_filename = "%s.fa" % sequence_id
     fasta_filename = os.path.join( target_directory, fasta_base_filename )
@@ -237,7 +242,7 @@
 
     requested_identifier = params['param_dict']['reference_source']['requested_identifier']
     url = NCBI_DOWNLOAD_URL % requested_identifier
-    fasta_reader = urllib2.urlopen( url )
+    fasta_readers = urllib2.urlopen( url )
 
     for data_table_name, data_table_entry in _stream_fasta_to_file( fasta_readers, target_directory, dbkey, dbkey_name, sequence_id, sequence_name, params ):
         if data_table_entry:
@@ -246,7 +251,7 @@
 def download_from_url( data_manager_dict, params, target_directory, dbkey, dbkey_name, sequence_id, sequence_name ):
     #TODO: we should automatically do decompression here
     urls = filter( bool, map( lambda x: x.strip(), params['param_dict']['reference_source']['user_url'].split( '\n' ) ) )
-    fasta_reader = [ urllib2.urlopen( url ) for url in urls ]
+    fasta_readers = [ urllib2.urlopen( url ) for url in urls ]
 
     for data_table_name, data_table_entry in _stream_fasta_to_file( fasta_readers, target_directory, dbkey, dbkey_name, sequence_id, sequence_name, params ):
         if data_table_entry:
@@ -256,9 +261,9 @@
     #TODO: allow multiple FASTA input files
     input_filename = params['param_dict']['reference_source']['input_fasta']
     if isinstance( input_filename, list ):
-        fasta_reader = [ open( filename, 'rb' ) for filename in input_filename ]
+        fasta_readers = [ open( filename, 'rb' ) for filename in input_filename ]
     else:
-        fasta_reader = open( input_filename )
+        fasta_readers = open( input_filename )
 
     for data_table_name, data_table_entry in _stream_fasta_to_file( fasta_readers, target_directory, dbkey, dbkey_name, sequence_id, sequence_name, params ):
         if data_table_entry:
@@ -271,10 +276,10 @@
         data_table_entries = _create_symlink( input_filename, target_directory, dbkey, dbkey_name, sequence_id, sequence_name )
     else:
         if isinstance( input_filename, list ):
-            fasta_reader = [ open( filename, 'rb' ) for filename in input_filename ]
+            fasta_readers = [ open( filename, 'rb' ) for filename in input_filename ]
         else:
-            fasta_reader = open( input_filename )
-        data_table_entries = _stream_fasta_to_file( fasta_reader, target_directory, dbkey, dbkey_name, sequence_id, sequence_name, params )
+            fasta_readers = open( input_filename )
+        data_table_entries = _stream_fasta_to_file( fasta_readers, target_directory, dbkey, dbkey_name, sequence_id, sequence_name, params )
     for data_table_name, data_table_entry in data_table_entries:
         if data_table_entry:
             _add_data_table_entry( data_manager_dict, data_table_entry, data_table_name )
@@ -391,7 +396,7 @@
 
     filename = args[0]
 
-    params = from_json_string( open( filename ).read() )
+    params = loads( open( filename ).read() )
     target_directory = params[ 'output_data' ][0]['extra_files_path']
     os.mkdir( target_directory )
     data_manager_dict = {}
@@ -405,6 +410,6 @@
     REFERENCE_SOURCE_TO_DOWNLOAD[ params['param_dict']['reference_source']['reference_source_selector'] ]( data_manager_dict, params, target_directory, dbkey, dbkey_name, sequence_id, sequence_name )
 
     #save info to json file
-    open( filename, 'wb' ).write( to_json_string( data_manager_dict ) )
+    open( filename, 'wb' ).write( dumps( data_manager_dict ) )
 
 if __name__ == "__main__": main()
diff -r d59e1d23b388 -r 0e5299e77334 data_manager/data_manager_fetch_genome_all_fasta_dbkeys.xml
--- a/data_manager/data_manager_fetch_genome_all_fasta_dbkeys.xml Wed Jul 02 00:35:38 2014 -0400
+++ b/data_manager/data_manager_fetch_genome_all_fasta_dbkeys.xml Mon Jan 26 14:54:16 2015 -0500
@@ -2,9 +2,9 @@
 fetching
 data_manager_fetch_genome_all_fasta_dbkeys.py "${out_file}"
     #if str( $dbkey_source.dbkey_source_selector ) == 'existing':
-        --dbkey_description ${ dbkey.get_display_text() }
+        --dbkey_description ${ dbkey_source.dbkey.get_display_text() }
     #else
-        --dbkey_description "${ dbkey_source.dbkey_name or dbkey_source.dbkey }"
+        --dbkey_description "${ dbkey_source.dbkey_name or $dbkey_source.dbkey }"
     #end if
@@ -24,7 +24,6 @@
-
@@ -48,7 +47,7 @@
-
+