Mercurial > repos > bgruening > data_manager_diamond_database_builder
changeset 3:574c3895b0ca draft
Uploaded
author | bgruening |
---|---|
date | Sat, 07 Feb 2015 22:26:05 -0500 |
parents | b9e8963a5eee |
children | 8bb8bec1a084 |
files | data_manager/data_manager_diamond_database_builder.py |
diffstat | 1 files changed, 22 insertions(+), 35 deletions(-) [+] |
line wrap: on
line diff
--- a/data_manager/data_manager_diamond_database_builder.py Sat Feb 07 22:21:26 2015 -0500
+++ b/data_manager/data_manager_diamond_database_builder.py Sat Feb 07 22:26:05 2015 -0500
@@ -26,20 +26,6 @@
sys.exit(1) -def get_id_name( params ): - #TODO: ensure sequence_id is unique and does not already appear in location file - sequence_id = params['param_dict']['sequence_id'] - if not sequence_id: - sequence_id = dbkey - - sequence_name = params['param_dict']['sequence_name'] - if not sequence_name: - sequence_name = fasta_description - if not sequence_name: - sequence_name = dbkey - return sequence_id, sequence_name - - def _get_files_in_ftp_path( ftp, path ): path_contents = [] ftp.retrlines( 'MLSD %s' % ( path ), path_contents.append )
@@ -68,7 +54,7 @@
return [ bz2.BZ2File( file_obj.name, 'rb' ) ] -def download_from_ncbi( data_manager_dict, params, target_directory, sequence_id, sequence_name ): +def download_from_ncbi( data_manager_dict, params, target_directory, database_id, database_name ): NCBI_FTP_SERVER = 'ftp.ncbi.nlm.nih.gov' NCBI_DOWNLOAD_PATH = '/blast/db/FASTA/' COMPRESSED_EXTENSIONS = [ ( '.tar.gz', _get_stream_readers_for_tar ), ( '.tar.bz2', _get_stream_readers_for_tar ), ( '.zip', _get_stream_readers_for_zip ), ( '.fa.gz', _get_stream_readers_for_gzip ), ( '.fa.bz2', _get_stream_readers_for_bz2 ) ]
@@ -83,17 +69,17 @@
get_stream_reader = None ext = None for ext, get_stream_reader in COMPRESSED_EXTENSIONS: - if "%s%s" % ( sequence_name, ext ) in path_contents: - ucsc_file_name = "%s%s%s" % ( UCSC_DOWNLOAD_PATH, sequence_name, ext ) + if "%s%s" % ( database_name, ext ) in path_contents: + ucsc_file_name = "%s%s%s" % ( UCSC_DOWNLOAD_PATH, database_name, ext ) break if not ucsc_file_name: raise Exception( 'Unable to determine filename for UCSC Genome for %s: %s' % ( ucsc_dbkey, path_contents ) ) tmp_dir = tempfile.mkdtemp( prefix='tmp-data-manager-ucsc-' ) - ucsc_fasta_filename = os.path.join( tmp_dir, "%s%s" % ( sequence_name, ext ) ) + ucsc_fasta_filename = os.path.join( tmp_dir, "%s%s" % ( database_name, ext ) ) - fasta_base_filename = "%s.fa" % sequence_id + fasta_base_filename = "%s.fa" % database_id fasta_filename = os.path.join( target_directory, fasta_base_filename ) fasta_writer = open( fasta_filename, 'wb+' )
@@ -109,7 +95,7 @@
fasta_readers = get_stream_reader( tmp_fasta, tmp_extract_dir ) - data_table_entry = _stream_fasta_to_file( fasta_readers, target_directory, sequence_id, sequence_name, params ) + data_table_entry = _stream_fasta_to_file( fasta_readers, target_directory, database_id, database_name, params ) _add_data_table_entry( data_manager_dict, data_table_entry ) for fasta_reader in fasta_readers:
@@ -118,16 +104,16 @@
cleanup_before_exit( tmp_dir ) -def download_from_url( data_manager_dict, params, target_directory, sequence_id, sequence_name ): +def download_from_url( data_manager_dict, params, target_directory, database_id, database_name ): #TODO: we should automatically do decompression here urls = filter( bool, map( lambda x: x.strip(), params['param_dict']['reference_source']['user_url'].split( '\n' ) ) ) fasta_reader = [ urllib2.urlopen( url ) for url in urls ] - data_table_entry = _stream_fasta_to_file( fasta_reader, target_directory, sequence_id, sequence_name, params ) + data_table_entry = _stream_fasta_to_file( fasta_reader, target_directory, database_id, database_name, params ) _add_data_table_entry( data_manager_dict, data_table_entry ) -def download_from_history( data_manager_dict, params, target_directory, sequence_id, sequence_name ): +def download_from_history( data_manager_dict, params, target_directory, database_id, database_name ): #TODO: allow multiple FASTA input files input_filename = params['param_dict']['reference_source']['input_fasta'] if isinstance( input_filename, list ):
@@ -135,21 +121,21 @@
else: fasta_reader = open( input_filename ) - data_table_entry = _stream_fasta_to_file( fasta_reader, target_directory, sequence_id, sequence_name, params ) + data_table_entry = _stream_fasta_to_file( fasta_reader, target_directory, database_id, database_name, params ) _add_data_table_entry( data_manager_dict, data_table_entry ) -def copy_from_directory( data_manager_dict, params, target_directory, sequence_id, sequence_name ): +def copy_from_directory( data_manager_dict, params, target_directory, database_id, database_name ): input_filename = params['param_dict']['reference_source']['fasta_filename'] create_symlink = params['param_dict']['reference_source']['create_symlink'] == 'create_symlink' if create_symlink: - data_table_entry = _create_symlink( input_filename, target_directory, sequence_id, sequence_name ) + data_table_entry = _create_symlink( input_filename, target_directory, database_id, database_name ) else: if isinstance( input_filename, list ): fasta_reader = [ open( filename, 'rb' ) for filename in input_filename ] else: fasta_reader = open( input_filename ) - data_table_entry = _stream_fasta_to_file( fasta_reader, target_directory, sequence_id, sequence_name, params ) + data_table_entry = _stream_fasta_to_file( fasta_reader, target_directory, database_id, database_name, params ) _add_data_table_entry( data_manager_dict, data_table_entry )
@@ -160,8 +146,8 @@
return data_manager_dict -def _stream_fasta_to_file( fasta_stream, target_directory, sequence_id, sequence_name, params, close_stream=True ): - fasta_base_filename = "%s.fa" % sequence_id +def _stream_fasta_to_file( fasta_stream, target_directory, database_id, database_name, params, close_stream=True ): + fasta_base_filename = "%s.fa" % database_id fasta_filename = os.path.join( target_directory, fasta_base_filename ) fasta_writer = open( fasta_filename, 'wb+' )
@@ -194,14 +180,14 @@
fasta_writer.close() - return dict( value=sequence_id, name=sequence_name, path=fasta_base_filename ) + return dict( value=database_id, name=database_name, path=fasta_base_filename ) -def _create_symlink( input_filename, target_directory, sequence_id, sequence_name ): - fasta_base_filename = "%s.fa" % sequence_id +def _create_symlink( input_filename, target_directory, database_id, database_name ): + fasta_base_filename = "%s.fa" % database_id fasta_filename = os.path.join( target_directory, fasta_base_filename ) os.symlink( input_filename, fasta_filename ) - return dict( value=sequence_id, name=sequence_name, path=fasta_base_filename ) + return dict( value=database_id, name=database_name, path=fasta_base_filename ) REFERENCE_SOURCE_TO_DOWNLOAD = dict( ncbi=download_from_ncbi, url=download_from_url, history=download_from_history, directory=copy_from_directory )
@@ -219,10 +205,11 @@
os.mkdir( target_directory ) data_manager_dict = {} - sequence_id, sequence_name = get_id_name( params ) + database_id = params['param_dict']['database_id'] + database_name = params['param_dict']['database_name'] #Fetch the FASTA - REFERENCE_SOURCE_TO_DOWNLOAD[ params['param_dict']['reference_source']['reference_source_selector'] ]( data_manager_dict, params, target_directory, database_id, database_name ) #save info to json file open( filename, 'wb' ).write( to_json_string( data_manager_dict ) )