ctat_genome_resource_libs_data_manager_3: data_manager/add_ctat_resource

annotate data_manager/add_ctat_resource_lib.py @ 6:be2761745400 draft

Uploaded

author	trinity_ctat
date	Fri, 04 May 2018 13:19:47 -0400
parents	7f1257532b6f
children	f22a13378750

rev	line source
0 d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	1 #!/usr/bin/env python
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	2 # ref: https://galaxyproject.org/admin/tools/data-managers/how-to/define/
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	3
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	4 # Rewritten by H.E. Cicada Brokaw Dennis from a source downloaded from the toolshed and
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	5 # other example code on the web.
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	6 # This now allows downloading of a user selected library
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	7 # but only from the CTAT Genome Resource Library website.
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	8 # Ultimately we might want to allow the user to specify any location
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	9 # from which to download.
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	10 # Users can create or download other libraries and use this tool to add them if they don't want
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	11 # to add them by hand.
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	12
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	13 import argparse
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	14 import os
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	15 #import tarfile
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	16 #import urllib
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	17 import subprocess
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	18
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	19 # Comment out the following line when testing without galaxy package.
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	20 from galaxy.util.json import to_json_string
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	21 # The following is not being used, but leaving as info
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	22 # in case we ever want to get input values using json.
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	23 # from galaxy.util.json import from_json_string
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	24
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	25 # datetime.now() is used to create the unique_id
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	26 from datetime import datetime
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	27
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	28 # The FileListParser is used by get_ctat_genome_urls(),
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	29 # which is called by the Data Manager interface (.xml file) to get
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	30 # the filenames that are available online at broadinstitute.org
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	31 # Not sure best way to do it.
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	32 # This object uses HTMLParser to look through the html
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	33 # searching for the filenames within anchor tags.
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	34 import urllib2
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	35 from HTMLParser import HTMLParser
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	36
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	37 _CTAT_ResourceLib_URL = 'https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/'
6 be2761745400 Uploaded trinity_ctat parents: 5 diff changeset	38 _CTAT_Build_dirname = 'ctat_genome_lib_build_dir'
0 d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	39 _CTAT_ResourceLib_DisplayNamePrefix = 'CTAT_GenomeResourceLib_'
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	40 _CTAT_ResourceLib_DefaultGenome = 'Unspecified_Genome'
6 be2761745400 Uploaded trinity_ctat parents: 5 diff changeset	41 _CTAT_HumanFusionLib_FilenamePrefix = 'CTAT_HumanFusionLib'
be2761745400 Uploaded trinity_ctat parents: 5 diff changeset	42 _CTAT_RefGenome_Filename = 'ref_genome.fa'
0 d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	43 _NumBytesNeededForBuild = 64424509440 # 60 Gigabytes. FIX - This might not be correct.
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	44 _Download_TestFile = "write_testfile.txt"
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	45 _DownloadSuccessFile = 'download_succeeded.txt'
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	46
class FileListParser(HTMLParser):
    """Collect the hrefs of tar.gz archives referenced by anchor tags.

    After feed() has been called, self.urls holds the value of every
    <a href="..."> attribute that contains "tar.gz" and does not contain
    "md5". All other tags and attributes are ignored.
    """

    def __init__(self):
        # Have to use direct call to super class rather than using super():
        #     super(FileListParser, self).__init__()
        # because HTMLParser is an "old style" class and its inheritance chain does not include object.
        HTMLParser.__init__(self)
        # A set, so a file that is linked more than once is only recorded once.
        self.urls = set()

    def handle_starttag(self, tag, attrs):
        """Record the href of an anchor tag when it names a tar.gz file.

        tag is the tag name and attrs is the list of (name, value) pairs
        handed over by HTMLParser.
        """
        if tag != "a":
            # Only anchor tags are of interest.
            return
        for name, value in attrs:
            # An attribute written without a value (<a href>) is reported
            # with a value of None, which cannot be searched as a string.
            if name != "href" or value is None:
                continue
            # Does the href have a tar.gz in it?
            if ("tar.gz" in value) and ("md5" not in value):
                self.urls.add(value)
# End of class FileListParser
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	66
def get_ctat_genome_urls():
    """Return the downloadable genome libraries as a list of option tuples.

    Opens _CTAT_ResourceLib_URL, parses the returned HTML with
    FileListParser and builds one (display_name, value, selected) tuple
    per tar.gz link found:
        display_name - the filename portion of the link.
        value        - the full url of the file.
        selected     - True for the first option of the sorted list only.
    Files whose name starts with "Mouse_" are left out.
    The list is sorted, so it is in alphabetical order by filename.
    """
    # open the url and retrieve the urls of the files in the directory.
    resource = urllib2.urlopen(_CTAT_ResourceLib_URL)
    try:
        theHTML = resource.read()
    finally:
        # Do not leave the connection open once the page has been read.
        resource.close()
    filelist_parser = FileListParser()
    filelist_parser.feed(theHTML)

    # The page url may end with a slash. Strip it so that joining a
    # relative filename onto it does not produce a double slash.
    base_url = _CTAT_ResourceLib_URL.rstrip("/")

    entries = []
    for url in filelist_parser.urls:
        # The urls should look like:
        # https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/GRCh37_v19_CTAT_lib_Feb092018.plug-n-play.tar.gz
        # https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/Mouse_M16_CTAT_lib_Feb202018.source_data.tar.gz
        # But in actuality, they are coming in looking like:
        # GRCh37_v19_CTAT_lib_Feb092018.plug-n-play.tar.gz
        # Mouse_M16_CTAT_lib_Feb202018.source_data.tar.gz
        # Handle both situations, or an ftp: url.
        if url.split(":")[0] in ("http", "https", "ftp"):
            full_url_path = url
        else:
            # Assume the path is relative to the page location.
            full_url_path = "{:s}/{:s}".format(base_url, url)
        filename = url.split("/")[-1]

        if filename.split("_")[0] != "Mouse":
            # Take out the mouse genome options for now.
            # The mouse genome option is not handled correctly yet
            entries.append((filename, full_url_path))
    entries.sort()  # So the list will be in alphabetical order.

    # For dynamic options need to return an iterable with contents that are tuples with 3 items.
    # Item one is a string that is the display name put into the option list.
    # Item two is the value that is put into the parameter associated with the option list.
    # Item three is a True or False value, indicating whether the item is selected.
    # The selected flag is assigned after filtering and sorting. The urls come
    # out of a set in arbitrary order, so deciding it any earlier would mark
    # an arbitrary entry, or none at all when that entry was filtered out.
    options = [(filename, full_url_path, position == 0)
               for position, (filename, full_url_path) in enumerate(entries)]
    print("The list being returned as options is:")
    print("{:s}\n".format(str(options)))
    return options
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	102
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	103 # The following was used by the example program to get input parameters through the json.
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	104 # Just leaving here for reference.
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	105 # We are getting all of our parameter values through command line arguments.
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	106 #def get_reference_id_name(params):
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	107 # genome_id = params['param_dict']['genome_id']
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	108 # genome_name = params['param_dict']['genome_name']
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	109 # return genome_id, genome_name
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	110 #
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	111 #def get_url(params):
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	112 # trained_url = params['param_dict']['trained_url']
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	113 # return trained_url
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	114
def print_directory_contents(dir_path, num_levels):
    """Print an "ls -la" listing of dir_path and of its subdirectories.

    dir_path is the directory to list.
    num_levels is how many directory levels to print: 1 lists dir_path
    only, 2 also lists each of its subdirectories, and so on.
    Nothing is printed when num_levels is less than 1.
    A message is printed instead of a listing when dir_path does not exist
    or is not a directory.
    """
    import sys

    if num_levels <= 0:
        return
    if not os.path.isdir(dir_path):
        # There is nothing to list or to descend into.
        print("Path either does not exist, or is not a directory:\n\t{:s}.".format(dir_path))
        return

    print("\nDirectory {:s}:".format(dir_path))
    # Flush so the heading is written before the output of the ls child process.
    sys.stdout.flush()
    # The path is passed as its own argument rather than through a shell
    # command string, so spaces or shell metacharacters in it are harmless.
    # stderr is routed to stdout, as the "2>&1" of the shell command did.
    subprocess.call(["ls", "-la", "--", dir_path], stderr=subprocess.STDOUT)

    if num_levels > 1:
        for filename in os.listdir(dir_path):
            filename_path = "{:s}/{:s}".format(dir_path, filename)
            if os.path.isdir(filename_path):
                print_directory_contents(filename_path, num_levels - 1)
be2761745400 Uploaded trinity_ctat parents: 5 diff changeset	127
def _ensure_writable_directory(directory):
    # Create directory if it does not exist and verify that a file can be written into it.
    if os.path.exists(directory):
        if not os.path.isdir(directory):
            raise ValueError("The destination is not a directory: " +
                             "{:s}".format(directory))
        # else all is good. It is a directory.
    else:
        # We need to create it.
        try:
            os.makedirs(directory)
        except os.error:
            print("ERROR: Trying to create the following directory path:")
            print("\t{:s}".format(directory))
            raise

    # Make sure the directory now exists and we can write to it.
    if not os.path.exists(directory):
        # It should have been created, but if it doesn't exist at this point
        # in the code, something is wrong. Raise an error.
        raise OSError("The destination directory could not be created: " +
                      "{:s}".format(directory))
    test_writing_file = "{:s}/{:s}".format(directory, _Download_TestFile)
    try:
        # The with statement closes the file even if the write fails.
        with open(test_writing_file, "w") as filehandle:
            filehandle.write("Testing writing to this file.")
        os.remove(test_writing_file)
    except IOError:
        print("The destination directory could not be written into: " +
              "{:s}".format(directory))
        raise


def _stream_download_and_extract(source, directory):
    # Pipe the output of curl for source into tar, extracting into directory without storing the tar file.
    curl_command = ["curl", "--silent", source]
    tar_command = ["tar", "-xzf", "-", "-C", directory]
    # Only used for error reporting. The processes are started without a shell,
    # so a url or path containing spaces or shell metacharacters is safe.
    command = "{:s} | {:s}".format(" ".join(curl_command), " ".join(tar_command))
    try:
        curl_process = subprocess.Popen(curl_command, stdout=subprocess.PIPE)
        try:
            tar_process = subprocess.Popen(tar_command, stdin=curl_process.stdout)
        except OSError:
            # tar could not be started, so do not leave curl running.
            curl_process.kill()
            curl_process.wait()
            raise
        finally:
            # Close our copy of the pipe, so that curl is told
            # if tar stops reading before the download is finished.
            curl_process.stdout.close()
        tar_status = tar_process.wait()
        curl_status = curl_process.wait()
        # Check both ends of the pipe. A shell pipeline would only report
        # the exit status of tar and hide a failed download.
        if curl_status != 0:
            raise subprocess.CalledProcessError(curl_status, command)
        if tar_status != 0:
            raise subprocess.CalledProcessError(tar_status, command)
    except (OSError, subprocess.CalledProcessError):
        print("ERROR: Trying to run the following command:\n\t{:s}".format(command))
        raise


def download_from_BroadInst(source, destination, force_download):
    """Download a CTAT genome resource library and extract it into destination.

    Input Parameters
        source is the full URL of the file we want to download.
            It should look something like:
            https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/GRCh37_v19_CTAT_lib_Feb092018.plug-n-play.tar.gz
        destination is the location where the source file will be unarchived.
            Relative paths are expanded using the current working directory, so within Galaxy,
            it is best to send in absolute fully specified path names so you know to where
            the source file is going to be extracted.
        force_download will cause a new download and extraction to occur, even if the destination
            has a file in it indicating that a previous download succeeded.

    Returns the following:
        return (downloaded_directory, download_has_source_data, genome_build_directory, lib_was_downloaded)
        downloaded_directory
            The directory which was created as a subdirectory of the destination directory
            when the download occurred, or if there was no download,
            possibly the same directory as destination, if that is where the data resides.
        download_has_source_data
            Is a boolean indicating whether the source file was "source_data" or was "plug-n-play".
        genome_build_directory
            The directory where the genome resource library is or where it should be built.
            It can be the same as the downloaded directory, but is sometimes a subdirectory of it.
        lib_was_downloaded
            Since it doesn't always do the download, the function returns whether download occurred.

    Raises
        ValueError if destination exists but is not a directory, or if the
            extracted directory cannot be found after the download.
        OSError if destination cannot be created, if there is insufficient
            space on its device, or if curl or tar cannot be started.
        IOError if destination or the download success file cannot be written.
        subprocess.CalledProcessError if the download or the extraction fails.
    """
    lib_was_downloaded = False

    print("In download_from_BroadInst(). The source_url is:\n\t{:s}".format(str(source)))

    # Get the root filename of the Genome Directory.
    src_filename = source.split("/")[-1]
    filename_parts = src_filename.split(".")
    root_genome_dirname = filename_parts[0]
    # If the src_filename indicates it is a source file, as opposed to plug-n-play,
    # then we may need to do some post processing on it.
    # A filename without any "." in it has no type part, so it is not source_data.
    type_of_download = filename_parts[1] if len(filename_parts) > 1 else ""
    download_has_source_data = (type_of_download == "source_data")

    # We want to make sure that destination is absolute fully specified path.
    cannonical_destination = os.path.realpath(destination)
    _ensure_writable_directory(cannonical_destination)

    # Get the list of files in the directory,
    # We use it to check for a previous download or extraction among other things.
    orig_files_in_destdir = set(os.listdir(cannonical_destination))
    # See whether the file has been downloaded already.
    download_success_file_path = "{:s}/{:s}".format(cannonical_destination, _DownloadSuccessFile)
    if ((_DownloadSuccessFile not in orig_files_in_destdir)
            or (root_genome_dirname not in orig_files_in_destdir)
            or force_download):
        # Check whether there is enough space on the device for the library.
        statvfs = os.statvfs(cannonical_destination)
        # f_bavail is the number of free blocks that ordinary users
        # are allowed to use (excl. reserved space).
        num_avail_bytes = statvfs.f_frsize * statvfs.f_bavail
        if (num_avail_bytes < _NumBytesNeededForBuild):
            raise OSError("There is insufficient space ({:s} bytes)".format(str(num_avail_bytes)) +
                          " on the device of the destination directory: " +
                          "{:s}".format(cannonical_destination))

        if (_DownloadSuccessFile in orig_files_in_destdir):
            # Since we are redoing the download,
            # the success file needs to be removed
            # until the download has succeeded.
            os.remove(download_success_file_path)
        # We want to transfer and untar the file without storing the tar file, because that
        # adds all that much more space to the needed amount of free space on the disk.
        _stream_download_and_extract(source, cannonical_destination)
        lib_was_downloaded = True

    # Some code to help us if errors occur.
    print("\n*******************************\nFinished download and extraction.")
    print_directory_contents(cannonical_destination, 2)
    print("*******************************\n")

    newfiles_in_destdir = set(os.listdir(cannonical_destination)) - orig_files_in_destdir
    if (root_genome_dirname not in newfiles_in_destdir):
        # Perhaps it has a different name than what we expected it to be.
        # It will be the file that was not in the directory
        # before we did the download and extraction.
        found_filename = None
        if len(newfiles_in_destdir) == 1:
            # A set cannot be indexed, so take its only member through an iterator.
            found_filename = next(iter(newfiles_in_destdir))
        else:
            for filename in newfiles_in_destdir:
                # In most cases, there will only be one new file, but some OS's might have created
                # other files in the directory.
                # Look for the directory that was downloaded and extracted.
                # The correct file's name should be a substring of the tar file that was downloaded.
                if filename in src_filename:
                    found_filename = filename
        if found_filename is not None:
            root_genome_dirname = found_filename

    downloaded_directory = "{:s}/{:s}".format(cannonical_destination, root_genome_dirname)

    if not os.path.exists(downloaded_directory):
        raise ValueError("ERROR: Could not find the extracted file in the destination directory:" +
                         "\n\t{:s}".format(cannonical_destination))

    try:
        # Create a file to indicate that the download succeeded.
        # Opening for append creates the file without truncating an existing
        # one, and utime() refreshes its timestamps the way touch does.
        with open(download_success_file_path, "a"):
            os.utime(download_success_file_path, None)
    except (IOError, OSError):
        print("The download_success file could not be created: " +
              "{:s}".format(download_success_file_path))
        raise

    # Look for the build directory, or specify the path where it should be placed.
    downloaded_contents = os.listdir(downloaded_directory)
    if len(downloaded_contents) == 1:
        # Then that one file is a subdirectory that should be the downloaded_directory.
        # That is how the plug-n-play directories are structured.
        subdir_filename = downloaded_contents[0]
        genome_build_directory = "{:s}/{:s}".format(downloaded_directory, subdir_filename)
    else:
        # In this case, we have source_data in the directory. The default will be to create
        # the build directory in the downloaded_directory with the default _CTAT_Build_dirname.
        # In this case, this directory will not exist yet until the library is built.
        genome_build_directory = "{:s}/{:s}".format(downloaded_directory, _CTAT_Build_dirname)

    return (downloaded_directory, download_has_source_data, genome_build_directory, lib_was_downloaded)
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	303
def gmap_the_library(genome_build_directory):
    """Run gmap_build on a library so that gmap-fusion can use it.

    genome_build_directory is the directory that holds ref_genome.fa; it is
    also passed to gmap_build as the -D directory for the ref_genome.fa.gmap
    index. It should normally be a fully specified path, though a relative
    path is handed to the command unchanged.

    Raises subprocess.CalledProcessError if gmap_build exits with a
    non-zero status.
    """
    # gmap_build prints messages to stderr even when there is no error,
    # so route stderr to stdout. Otherwise, galaxy thinks an error occurred.
    command = "gmap_build -D {:s}/ -d ref_genome.fa.gmap -k 13 {:s}/ref_genome.fa 2>&1".format(
        genome_build_directory, genome_build_directory)
    try:
        # check_call (not check_output) is used so the command's messages reach
        # our stdout instead of being captured into a variable and thrown away.
        # This matches how build_the_library runs prep_genome_lib.pl.
        subprocess.check_call(command, shell=True)
    except subprocess.CalledProcessError:
        print("ERROR: While trying to run the gmap_build command on the library:\n\t{:s}".format(command))
        raise
    finally:
        # Some output to help us if errors occur. print_directory_contents is
        # defined elsewhere in this file (presumably lists the directory to
        # the given depth -- confirm against its definition).
        print("\n*******************************\nAfter running gmap_build.")
        print_directory_contents(genome_build_directory, 2)
        print("*******************************\n")
0 d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	322
def build_the_library(genome_source_directory, genome_build_directory, build, gmap_build):
    """Build the CTAT Genome Resource Library from source data and/or gmap it.

    genome_source_directory is the location of the source_data needed to
        build the library. Normally it is fully specified, but it may be
        relative. An empty string or None means there is no source data.
    genome_build_directory is the location where the library will be built.
        NOTE: the build is run after changing into the source directory, so
        a relative value is interpreted by the command relative to the
        source directory.
    build specifies whether to run prep_genome_lib.pl even if it was run before.
    gmap_build specifies whether to run gmap_build or not.

    When there is source data and build is true, prep_genome_lib.pl is run
    (with --gmap_build added if gmap_build is true). Otherwise, if gmap_build
    is true, only gmap_the_library() is run. Otherwise nothing is done.

    Raises ValueError if a build is requested but the source directory does
    not exist, and subprocess.CalledProcessError if prep_genome_lib.pl fails.

    Following was the old way to do it. Before FusionFilter 0.5.0.
        prep_genome_lib.pl
               --genome_fa ref_genome.fa
               --gtf ref_annot.gtf
               --blast_pairs blast_pairs.gene_syms.outfmt6.gz
               --fusion_annot_lib fusion_lib.dat.gz
               --output_dir ctat_genome_lib_build_dir
        index_pfam_domain_info.pl
               --pfam_domains PFAM.domtblout.dat.gz
               --genome_lib_dir ctat_genome_lib_build_dir
        gmap_build -D ctat_genome_lib_build_dir -d ref_genome.fa.gmap -k 13 ctat_genome_lib_build_dir/ref_genome.fa
    """
    # A truthiness test treats None (no download happened) like "".
    if genome_source_directory and build:
        if not os.path.exists(genome_source_directory):
            raise ValueError("Cannot build the CTAT Genome Resource Library. " +
                "The source directory does not exist:\n\t{:s}".format(genome_source_directory))
        # Make the path absolute before changing directory, so the listdir
        # and the diagnostic listing below still refer to the same place
        # when a relative path was given.
        genome_source_directory = os.path.abspath(genome_source_directory)
        os.chdir(genome_source_directory)
        # Create the command that builds the Genome Resource Library from the source data.
        # Every option ends with a space so that later options can be appended safely.
        command = "prep_genome_lib.pl --genome_fa ref_genome.fa --gtf ref_annot.gtf " + \
                  "--pfam_db PFAM.domtblout.dat.gz " + \
                  "--output_dir {:s} ".format(genome_build_directory)
        filename_of_HumanFusionLib = None
        # At the time this was written, the filename was CTAT_HumanFusionLib.v0.1.0.dat.gz
        # We only check the prefix, in case other versions are used later.
        # There should be only one in the directory, but if there is more than one,
        # sorting guarantees that the later one, alphabetically, is used.
        for filename in sorted(os.listdir(genome_source_directory)):
            if filename.split(".")[0] == _CTAT_HumanFusionLib_FilenamePrefix:
                filename_of_HumanFusionLib = filename
        if filename_of_HumanFusionLib is not None:
            # The mouse genomes do not have a fusion_annot_lib
            # so only add the following for Human genomes.
            command += "--fusion_annot_lib {:s} ".format(filename_of_HumanFusionLib) + \
                       "--annot_filter_rule AnnotFilterRule.pm "
        if gmap_build:
            command += "--gmap_build "
        # Send stderr of the command to stdout, because some functions may write to stderr,
        # even though no error has occurred. We depend on the returned error code
        # to know if an error occurred.
        command += " 2>&1"
        try:
            subprocess.check_call(command, shell=True)
        except subprocess.CalledProcessError:
            print("ERROR: While trying to run the prep_genome_lib.pl command " +
                "on the CTAT Genome Resource Library:\n\t{:s}".format(command))
            raise
        finally:
            # Some output to help us if errors occur. print_directory_contents
            # is defined elsewhere in this file.
            print("\n*******************************")
            print("Contents of Genome Source Directory {:s}:".format(genome_source_directory))
            print_directory_contents(genome_source_directory, 2)
            print("\nContents of Genome Build Directory {:s}:".format(genome_build_directory))
            print_directory_contents(genome_build_directory, 2)
            print("*******************************\n")
    elif gmap_build:
        gmap_the_library(genome_build_directory)
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	390
def search_for_genome_build_dir(top_dir_path):
    """Return the path of the CTAT Genome Resource Library found at top_dir_path.

    If we do not download the directory, top_dir_path could be the location
    of the genome resource library itself, but we also want to allow the user
    to give the same value for top_dir_path that they do when a build
    happens, so three cases are handled:
        1) top_dir_path is the build directory,
        2) the build directory is inside of top_dir_path,
        3) the build directory is inside a subdirectory of top_dir_path
           (searched two levels down).
    The source_data downloads are built to a directory named _CTAT_Build_dirname,
    and the plug-n-play downloads contain a sub-directory named _CTAT_Build_dirname.

    If no directory with that name is found, a directory that contains
    _CTAT_RefGenome_Filename is accepted instead and a warning is printed,
    because the build process sometimes leaves copies of that file in more
    than one directory.

    Raises ValueError if top_dir_path does not exist or is not a directory,
    if more than one candidate is found, or if no candidate is found.
    """
    genome_build_directory = None
    print_warning = False

    if not os.path.exists(top_dir_path):
        raise ValueError("Cannot find the CTAT Genome Resource Library. " +
            "The given directory does not exist:\n\t{:s}".format(top_dir_path))
    elif not os.path.isdir(top_dir_path):
        raise ValueError("Cannot find the CTAT Genome Resource Library. " +
            "The given directory is not a directory:\n\t{:s}".format(top_dir_path))
    # normpath removes any trailing "/" so that "path/to/build_dir/" is
    # still recognized as the build directory itself.
    if os.path.basename(os.path.normpath(top_dir_path)) == _CTAT_Build_dirname:
        print("Build directory is: {:s}".format(top_dir_path))
        # The top_dir_path is the path to the genome_build_directory.
        genome_build_directory = top_dir_path
    else:
        # Look for it inside of the top_dir_path directory.
        print("Looking inside of: {:s}".format(top_dir_path))
        top_dir_contents = os.listdir(top_dir_path)
        if _CTAT_Build_dirname in top_dir_contents:
            # The genome_build_directory is inside of the top_dir_path directory.
            print("1. Found it.")
            genome_build_directory = os.path.join(top_dir_path, _CTAT_Build_dirname)
        else:
            # Find all subdirectories containing the _CTAT_Build_dirname or the
            # _CTAT_RefGenome_Filename. Look down the directory tree two levels.
            build_dirs_in_subdirs = list()
            subdirs_with_genome_files = list()
            build_dirs_in_sub_subdirs = list()
            sub_subdirs_with_genome_files = list()
            subdir_paths = [os.path.join(top_dir_path, entry) for entry in top_dir_contents]
            for subdir_path in subdir_paths:
                if not os.path.isdir(subdir_path):
                    continue
                subdir_path_contents = os.listdir(subdir_path)
                if _CTAT_Build_dirname in subdir_path_contents:
                    # The genome_build_directory is inside of the subdir_path directory.
                    print("2a, Found one.")
                    build_dirs_in_subdirs.append(os.path.join(subdir_path, _CTAT_Build_dirname))
                if _CTAT_RefGenome_Filename in subdir_path_contents:
                    subdirs_with_genome_files.append(subdir_path)
                # Since we are already looping, loop through all dirs one level deeper as well.
                for sub_entry in subdir_path_contents:
                    sub_subdir_path = os.path.join(subdir_path, sub_entry)
                    if not os.path.isdir(sub_subdir_path):
                        continue
                    sub_subdir_path_contents = os.listdir(sub_subdir_path)
                    if _CTAT_Build_dirname in sub_subdir_path_contents:
                        # The genome_build_directory is inside of the sub_subdir_path directory.
                        print("3a. Found one.")
                        build_dirs_in_sub_subdirs.append(
                            os.path.join(sub_subdir_path, _CTAT_Build_dirname))
                    if _CTAT_RefGenome_Filename in sub_subdir_path_contents:
                        sub_subdirs_with_genome_files.append(sub_subdir_path)
            # Hopefully there is one and only one found build directory.
            # If none are found we check for a directory containing the genome reference file,
            # but the build process sometimes causes more than one directory to have a copy,
            # so finding that file is not a sure thing.
            if (len(build_dirs_in_subdirs) + len(build_dirs_in_sub_subdirs)) > 1:
                print("\n***************************************")
                print("Found multiple CTAT Genome Resource Libraries " +
                    "in the given directory:\n\t{:s}".format(top_dir_path))
                print_directory_contents(top_dir_path, 2)
                print("***************************************\n")
                raise ValueError("Found multiple CTAT Genome Resource Libraries " +
                    "in the given directory:\n\t{:s}".format(top_dir_path))
            elif len(build_dirs_in_subdirs) == 1:
                # The genome_build_directory is inside of a subdirectory.
                print("2b, Found it.")
                genome_build_directory = build_dirs_in_subdirs[0]
            elif len(build_dirs_in_sub_subdirs) == 1:
                # The genome_build_directory is inside of a sub-subdirectory.
                print("3b, Found it.")
                genome_build_directory = build_dirs_in_sub_subdirs[0]
            elif (len(sub_subdirs_with_genome_files) + len(subdirs_with_genome_files)) > 1:
                print("\n***************************************")
                print("Unable to find CTAT Genome Resource Library " +
                    "in the given directory:\n\t{:s}".format(top_dir_path))
                print("And multiple directories contain {:s}".format(_CTAT_RefGenome_Filename))
                print_directory_contents(top_dir_path, 2)
                print("***************************************\n")
                raise ValueError("Unable to find CTAT Genome Resource Library " +
                    "in the given directory:\n\t{:s}".format(top_dir_path))
            elif len(sub_subdirs_with_genome_files) == 1:
                print("3c, Maybe found it.")
                genome_build_directory = sub_subdirs_with_genome_files[0]
                print_warning = True
            elif len(subdirs_with_genome_files) == 1:
                print("2c, Maybe found it.")
                genome_build_directory = subdirs_with_genome_files[0]
                print_warning = True
            elif _CTAT_RefGenome_Filename in top_dir_contents:
                print("1c. Maybe found it.")
                genome_build_directory = top_dir_path
                print_warning = True
            else:
                print("\n***************************************")
                print("Unable to find CTAT Genome Resource Library " +
                    "in the given directory:\n\t{:s}".format(top_dir_path))
                print_directory_contents(top_dir_path, 2)
                print("***************************************\n")
                raise ValueError("Unable to find CTAT Genome Resource Library " +
                    "in the given directory:\n\t{:s}".format(top_dir_path))
    # Check if the CTAT Genome Resource Lib has anything in it (and specifically ref_genome.fa).
    if genome_build_directory is None:
        # Defensive: every branch above either sets the directory or raises.
        print("\n***************************************")
        print("Cannot find the CTAT Genome Resource Library " +
            "in the given directory:\n\t{:s}".format(top_dir_path))
        print_directory_contents(top_dir_path, 2)
        print("***************************************\n")
        raise ValueError("Cannot find the CTAT Genome Resource Library " +
            "in the given directory:\n\t{:s}".format(top_dir_path))
    elif _CTAT_RefGenome_Filename not in os.listdir(genome_build_directory):
        # Only a warning: the directory is still returned to the caller.
        print("\n***************************************")
        print("\nWARNING: Cannot find Genome Reference file {:s} ".format(_CTAT_RefGenome_Filename) +
            "in the genome build directory:\n\t{:s}".format(genome_build_directory))
        print_directory_contents(genome_build_directory, 2)
        print("***************************************\n")
    if print_warning and genome_build_directory:
        print("\n***************************************")
        print("\nWARNING: Cannot find the CTAT Genome Resource Library, " +
            "but found a {:s} file, so set its directory as the library.".format(_CTAT_RefGenome_Filename))
        print("This my not be the correct directory:\n\t{:s}".format(genome_build_directory))
        print_directory_contents(genome_build_directory, 2)
        print("***************************************\n")
    return genome_build_directory
be2761745400 Uploaded trinity_ctat parents: 5 diff changeset	525
0 d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	526 def main():
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	527 #Parse Command Line
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	528 parser = argparse.ArgumentParser()
6 be2761745400 Uploaded trinity_ctat parents: 5 diff changeset	529 parser.add_argument('-s', '--source_url', default='', \
be2761745400 Uploaded trinity_ctat parents: 5 diff changeset	530 help='This is the url of a file with the data. ' + \
be2761745400 Uploaded trinity_ctat parents: 5 diff changeset	531 'They come from https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/.')
be2761745400 Uploaded trinity_ctat parents: 5 diff changeset	532 parser.add_argument('-n', '--display_name', default='', \
0 d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	533 help='Is used as the display name for the entry of this Genome Resource Library in the data table.')
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	534 parser.add_argument('-o', '--output_filename', \
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	535 help='Name of the output file, where the json dictionary will be written.')
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	536 parser.add_argument('-f', '--force_download',
6 be2761745400 Uploaded trinity_ctat parents: 5 diff changeset	537 help='Forces download of the Genome Resource Library, even if previously downloaded.', action='store_true')
0 d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	538 parser.add_argument('-b', '--build',
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	539 help='Forces build/rebuild the Genome Resource Library, even if previously built. ' + \
6 be2761745400 Uploaded trinity_ctat parents: 5 diff changeset	540 'Must have downloaded source_data for this to work.', action='store_true')
0 d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	541 parser.add_argument('-m', '--gmap_build',
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	542 help='Must be selected if you want the library to be gmapped. ' + \
6 be2761745400 Uploaded trinity_ctat parents: 5 diff changeset	543 'Will force gmap_build of the Genome Resource Library, even if previously gmapped.', action='store_true')
be2761745400 Uploaded trinity_ctat parents: 5 diff changeset	544 requiredNamed = parser.add_argument_group('required named arguments')
be2761745400 Uploaded trinity_ctat parents: 5 diff changeset	545 requiredNamed.add_argument('-p', '--destination_path', required=True, \
be2761745400 Uploaded trinity_ctat parents: 5 diff changeset	546 help='Full path of the CTAT Resource Library location or destination, either where it is, or where it will be placed.')
0 d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	547 args = parser.parse_args()
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	548
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	549 # All of the input parameters are written by default to the output file prior to
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	550 # this program being called.
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	551 # But I do not get input values from the json file, but rather from command line.
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	552 # Just leaving the following code as a comment, in case it might be useful to someone later.
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	553 # params = from_json_string(open(filename).read())
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	554 # target_directory = params['output_data'][0]['extra_files_path']
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	555 # os.mkdir(target_directory)
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	556
4 c372930aaba1 Uploaded trinity_ctat parents: 0 diff changeset	557 print "The value of source_url argument is:\n\t{:s}".format(str(args.source_url))
c372930aaba1 Uploaded trinity_ctat parents: 0 diff changeset	558
6 be2761745400 Uploaded trinity_ctat parents: 5 diff changeset	559 # FIX - not sure lib_was_downloaded actually serves a purpose...
0 d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	560 lib_was_downloaded = False
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	561 download_has_source_data = False
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	562 downloaded_directory = None
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	563 genome_build_directory = None
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	564 # FIX - need to make sure we are handling all "possible" combinations of arguments.
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	565 # Probably would be good if we could simplify/remove some of them.
6 be2761745400 Uploaded trinity_ctat parents: 5 diff changeset	566 # But I think the current interface is using them all.
0 d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	567 if (args.source_url != ""):
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	568 downloaded_directory, download_has_source_data, genome_build_directory, lib_was_downloaded = \
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	569 download_from_BroadInst(source=args.source_url, \
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	570 destination=args.destination_path, \
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	571 force_download=args.force_download)
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	572 else:
6 be2761745400 Uploaded trinity_ctat parents: 5 diff changeset	573 genome_build_directory = search_for_genome_build_dir(args.destination_path)
0 d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	574
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	575 print "\nThe location of the CTAT Genome Resource Library is {:s}.\n".format(genome_build_directory)
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	576
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	577 # FIX - We should leave a file indicating build success the same way we do for download success.
6 be2761745400 Uploaded trinity_ctat parents: 5 diff changeset	578 # To take out builds for testing, comment out the next four lines.
0 d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	579 if (download_has_source_data or args.build or args.gmap_build) :
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	580 build_the_library(downloaded_directory, genome_build_directory, args.build, args.gmap_build)
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	581 elif (args.gmap_build):
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	582 gmap_the_library(genome_build_directory)
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	583
6 be2761745400 Uploaded trinity_ctat parents: 5 diff changeset	584 # The following looks to see if the library actually exists after the build,
be2761745400 Uploaded trinity_ctat parents: 5 diff changeset	585 # and raises an error if it cannot find the library files.
be2761745400 Uploaded trinity_ctat parents: 5 diff changeset	586 # The reassignment of genome_build_directory should be superfluous,
be2761745400 Uploaded trinity_ctat parents: 5 diff changeset	587 # unless I made a mistake in the build code.
be2761745400 Uploaded trinity_ctat parents: 5 diff changeset	588 # FIX - need to get the genome name from the directory name, if there was no download.
be2761745400 Uploaded trinity_ctat parents: 5 diff changeset	589 #genome_build_directory, genome_name_from_dirname = search_for_genome_build_dir(genome_build_directory)
be2761745400 Uploaded trinity_ctat parents: 5 diff changeset	590 genome_build_directory = search_for_genome_build_dir(genome_build_directory)
be2761745400 Uploaded trinity_ctat parents: 5 diff changeset	591
be2761745400 Uploaded trinity_ctat parents: 5 diff changeset	592 source_filename_root = None
0 d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	593 if (args.source_url != None) and (args.source_url != ""):
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	594 # Get the name out of the source's filename.
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	595 source_filename_root = args.source_url.split("/")[-1].split(".")[0]
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	596
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	597 # Determine the display_name for the library.
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	598 if (args.display_name is None) or (args.display_name == ""):
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	599 if (source_filename_root != None) and (source_filename_root != ""):
6 be2761745400 Uploaded trinity_ctat parents: 5 diff changeset	600 # Create the display_name from the source_filename_root.
0 d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	601 display_name = _CTAT_ResourceLib_DisplayNamePrefix + source_filename_root
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	602 else:
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	603 display_name = _CTAT_ResourceLib_DisplayNamePrefix + _CTAT_ResourceLib_DefaultGenome
6 be2761745400 Uploaded trinity_ctat parents: 5 diff changeset	604 print "WARNING: We do not have a genome name."
0 d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	605 else:
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	606 display_name = _CTAT_ResourceLib_DisplayNamePrefix + args.display_name
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	607 display_name = display_name.replace(" ","_")
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	608
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	609 # Create a unique_id for the library.
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	610 datetime_stamp = datetime.now().strftime("_%Y_%m_%d_%H_%M_%S_%f")
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	611 if (source_filename_root != None) and (source_filename_root != ""):
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	612 unique_id = source_filename_root + datetime_stamp
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	613 elif (downloaded_directory != None) and (downloaded_directory != ""):
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	614 unique_id = os.path.basename(downloaded_directory).split(".")[0]
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	615 else:
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	616 unique_id = _CTAT_ResourceLib_DefaultGenome + datetime_stamp
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	617
6 be2761745400 Uploaded trinity_ctat parents: 5 diff changeset	618 print "The Genome Resource Library's display_name will be set to: {:s}\n".format(display_name)
0 d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	619 print "Its unique_id will be set to: {:s}\n".format(unique_id)
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	620 print "Its dir_path will be set to: {:s}\n".format(genome_build_directory)
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	621
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	622 data_manager_dict = {}
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	623 data_manager_dict['data_tables'] = {}
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	624 data_manager_dict['data_tables']['ctat_genome_resource_libs'] = []
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	625 data_table_entry = dict(value=unique_id, name=display_name, path=genome_build_directory)
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	626 data_manager_dict['data_tables']['ctat_genome_resource_libs'].append(data_table_entry)
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	627
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	628 # Temporarily the output file's dictionary is written for debugging:
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	629 print "The dictionary for the output file is:\n\t{:s}".format(str(data_manager_dict))
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	630 # Save info to json file. This is used to transfer data from the DataManager tool, to the data manager,
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	631 # which then puts it into the correct .loc file (I think).
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	632 # Comment out the following line when testing without galaxy package.
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	633 open(args.output_filename, 'wb').write(to_json_string(data_manager_dict))
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	634
# Script entry point: call main() only when this file is run directly, not when it is imported.
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	635 if __name__ == "__main__":
d2c51cdc2172 Uploaded trinity_ctat parents: diff changeset	636 main()

Mercurial > repos > trinity_ctat > ctat_genome_resource_libs_data_manager_3

annotate data_manager/add_ctat_resource_lib.py @ 6:be2761745400 draft