15
|
1 #!/usr/bin/env python
|
|
2 # ref: https://galaxyproject.org/admin/tools/data-managers/how-to/define/
|
|
3
|
|
4 # Rewritten by H.E. Cicada Brokaw Dennis from a source downloaded from the toolshed and
|
|
5 # other example code on the web.
|
|
6 # This allows downloading of a centrifuge index, or specification of its disk location.
|
|
# This index is one of the input parameters needed by the ctat_metagenomics tool.
|
|
8 # At the moment only one index is supported by the ctat_metagenomics tool:
|
|
9 # ftp://ftp.ccb.jhu.edu/pub/infphilo/centrifuge/data/p_compressed+h+v.tar.gz
|
|
10
|
|
11 import argparse
|
|
12 import os
|
|
13 #import tarfile
|
|
14 #import urllib
|
|
15 import subprocess
|
|
16
|
|
17 # The following is used to generate a unique_id value
|
|
18 from datetime import *
|
|
19
|
|
20 # Remove the following line when testing without galaxy package:
|
|
21 from galaxy.util.json import to_json_string
|
|
22 # Am not using the following:
|
|
23 # from galaxy.util.json import from_json_string
|
|
24
|
|
25 # The FileListParser is used by get_ctat_genome_filenames(),
|
|
26 # which is called by the Data Manager interface (.xml file) to get
|
|
27 # the filenames that are available online at broadinstitute.org
|
|
28 # Not sure best way to do it.
|
|
29 # This object uses HTMLParser to look through the html
|
|
30 # searching for the filenames within anchor tags.
|
|
31 import urllib2
|
|
32 from HTMLParser import HTMLParser
|
|
33
|
|
# Page that lists the index tarballs available for download.
_CTAT_CentrifugeIndexPage_URL = 'https://ccb.jhu.edu/software/centrifuge/'
# The one index download currently supported by the ctat_metagenomics tool.
_CTAT_CentrifugeDownload_URL = 'ftp://ftp.ccb.jhu.edu/pub/infphilo/centrifuge/data/p_compressed+h+v.tar.gz'
# Name of the Galaxy data table this data manager fills in.
_CTAT_CentrifugeIndexTableName = 'ctat_centrifuge_indexes'
# Directory name that the supported index tarball extracts into.
_CTAT_CentrifugeDir_Name = 'p_compressed+h+v'
# Prefix used when building the display name shown in the data table.
_CTAT_Centrifuge_DisplayNamePrefix = 'CTAT_CentrifugeIndex_'
# Filename extension of Centrifuge index files (index_root_name.#.cf).
_CentrifugeIndexFileExtension = 'cf'
# Free space required on the destination device for the extracted index.
_NumBytesNeededForIndex = 7400130287 # 6.9 GB
#_DownloadFileSize = 5790678746 # 5.4 Gigabytes.
# Scratch file used to verify that the destination directory is writable.
_Download_TestFile = 'write_testfile.txt'
# Marker file created only after a download completes successfully.
_DownloadSuccessFile = 'download_succeeded.txt'
|
|
class FileListParser(HTMLParser):
    """Collect href targets that look like index tarballs from an HTML page.

    After feed()-ing the page's HTML, self.filenames holds every anchor
    href value that contains "tar.gz" but does not contain "md5"
    (the md5 links are checksum companions, not downloads).
    """
    def __init__(self):
        # Direct superclass call rather than super(FileListParser, self):
        # in Python 2, HTMLParser is an "old style" class whose inheritance
        # chain does not include object, so super() cannot be used.
        HTMLParser.__init__(self)
        self.filenames = set()

    def handle_starttag(self, tag, attrs):
        # Only anchor tags can carry the download links we are after.
        if tag != "a":
            return
        for (attr_name, attr_value) in attrs:
            if attr_name != "href":
                continue
            # Keep tarball links, but skip their md5 checksum companions.
            if ("tar.gz" in attr_value) and ("md5" not in attr_value):
                self.filenames.add(attr_value)
# End of class FileListParser
|
|
64
|
|
65 def get_ctat_centrifuge_index_locations():
|
|
66 # For dynamic options need to return an interable with contents that are tuples with 3 items.
|
|
67 # Item one is a string that is the display name put into the option list.
|
|
68 # Item two is the value that is put into the parameter associated with the option list.
|
|
69 # Item three is a True or False value, indicating whether the item is selected.
|
|
70 options = []
|
|
71 # open the url and retrieve the filenames of the files in the directory.
|
|
72 resource = urllib2.urlopen(_CTAT_CentrifugeIndexPage_URL)
|
|
73 theHTML = resource.read()
|
|
74 filelist_parser = FileListParser()
|
|
75 filelist_parser.feed(theHTML)
|
|
76 # This is what was returned on 2018-04-23
|
|
77 # ftp://ftp.ccb.jhu.edu/pub/infphilo/centrifuge/data/p_compressed_2018_4_15.tar.gz
|
|
78 # ftp://ftp.ccb.jhu.edu/pub/infphilo/centrifuge/data/nt_2018_3_3.tar.gz
|
|
79 # ftp://ftp.ccb.jhu.edu/pub/infphilo/centrifuge/data/p_compressed+h+v.tar.gz
|
|
80 # ftp://ftp.ccb.jhu.edu/pub/infphilo/centrifuge/data/p+h+v.tar.gz
|
|
81 # Which could be hard coded:
|
|
82 # vals.append(("p_compressed+h+v", "ftp://ftp.ccb.jhu.edu/pub/infphilo/centrifuge/data/p_compressed+h+v.tar.gz", True))
|
|
83 # vals.append(("p+h+v", "ftp://ftp.ccb.jhu.edu/pub/infphilo/centrifuge/data/p+h+v.tar.gz", False))
|
|
84 # vals.append(("nt_2018_3_3", "ftp://ftp.ccb.jhu.edu/pub/infphilo/centrifuge/data/nt_2018_3_3.tar.gz", False))
|
|
85 # vals.append(("p_compressed_2018_4_15", "ftp://ftp.ccb.jhu.edu/pub/infphilo/centrifuge/data/p_compressed_2018_4_15.tar.gz", False))
|
|
86 # but only returning the one we want, which for now is assumed to be present.
|
|
87 # For now, I am printing the list, just so I can see what was returned,
|
|
88 print "FYI: The URL's that were found on Centrifuge's page are:"
|
|
89 print "\t" + "\n\t".join(filelist_parser.filenames)
|
|
90 # For now instead of sending back the list of found URL's, send back the one URL we want.
|
|
91 # Currently, only one of the options is supported.
|
|
92 vals.append((_CTAT_CentrifugeDir_Name, _CTAT_CentrifugeDownload_URL, True))
|
|
93 print "The items in vals are:"
|
|
94 print str(vals)
|
|
95 return vals
|
|
96
|
|
97 # The following was used by the example program to get input parameters through the json.
|
|
98 # Just leaving here for reference.
|
|
99 # We are getting all of our parameter values through command line arguments.
|
|
100 #def get_reference_id_name(params):
|
|
101 # genome_id = params['param_dict']['genome_id']
|
|
102 # genome_name = params['param_dict']['genome_name']
|
|
103 # return genome_id, genome_name
|
|
104 #
|
|
105 #def get_url(params):
|
|
106 # trained_url = params['param_dict']['trained_url']
|
|
107 # return trained_url
|
|
108
|
|
109 def download_index(src_location, destination, force_download):
|
|
110 # We do not know if the index has been downloaded already.
|
|
111 # This function returns whether or not the index actually gets downloaded.
|
|
112 index_was_downloaded = False
|
|
113 # Get the root filename of the Genome Directory.
|
|
114 # The part after the last '/' and before the first '.'
|
|
115 root_index_dirname = src_location.split("/")[-1].split(".")[0]
|
|
116
|
|
117 # We want to make sure that destination is absolute fully specified path.
|
|
118 cannonical_destination = os.path.realpath(destination)
|
|
119 if cannonical_destination.split("/")[-1] != root_index_dirname:
|
|
120 cannonical_destination += "/" + root_index_dirname
|
|
121 if os.path.exists(cannonical_destination):
|
|
122 if not os.path.isdir(cannonical_destination):
|
|
123 raise ValueError("The destination is not a directory: " + \
|
|
124 "{:s}".format(cannonical_destination))
|
|
125 # else all is good. It is a directory.
|
|
126 else:
|
|
127 # We need to create it.
|
|
128 try:
|
|
129 os.makedirs(cannonical_destination)
|
|
130 except os.error:
|
|
131 print "ERROR: Trying to create the following directory path:"
|
|
132 print "\t{:s}".format(cannonical_destination)
|
|
133 raise
|
|
134
|
|
135 # Make sure the directory now exists and we can write to it.
|
|
136 if not os.path.exists(cannonical_destination):
|
|
137 # It should have been created, but if it doesn't exist at this point
|
|
138 # in the code, something is wrong. Raise an error.
|
|
139 raise OSError("The destination directory could not be created: " + \
|
|
140 "{:s}".format(cannonical_destination))
|
|
141 test_writing_file = "{:s}/{:s}".format(cannonical_destination, _Download_TestFile)
|
|
142 try:
|
|
143 filehandle = open(test_writing_file, "w")
|
|
144 filehandle.write("Testing writing to this file.")
|
|
145 filehandle.close()
|
|
146 os.remove(test_writing_file)
|
|
147 except IOError:
|
|
148 print "The destination directory could not be written into: " + \
|
|
149 "{:s}".format(cannonical_destination)
|
|
150 raise
|
|
151
|
|
152 # Get the list of files in the directory,
|
|
153 # We use it to check for a previous download or extraction among other things.
|
|
154 orig_files_in_destdir = set(os.listdir(cannonical_destination))
|
|
155 # See whether the file has been downloaded already.
|
|
156 download_success_file_path = "{:s}/{:s}".format(cannonical_destination, _DownloadSuccessFile)
|
|
157 if (_DownloadSuccessFile not in orig_files_in_destdir) or force_download:
|
|
158 # Check whether there is enough space on the device for the index.
|
|
159 statvfs = os.statvfs(cannonical_destination)
|
|
160 # fs_size = statvfs.f_frsize * statvfs.f_blocks # Size of filesystem in bytes
|
|
161 # num_free_bytes = statvfs.f_frsize * statvfs.f_bfree # Actual number of free bytes
|
|
162 num_avail_bytes = statvfs.f_frsize * statvfs.f_bavail # Number of free bytes that ordinary users
|
|
163 # are allowed to use (excl. reserved space)
|
|
164 if (num_avail_bytes < _NumBytesNeededForIndex):
|
|
165 raise OSError("There is insufficient space ({:s} bytes)".format(str(num_avail_bytes)) + \
|
|
166 " on the device of the destination directory: " + \
|
|
167 "{:s}".format(cannonical_destination))
|
|
168
|
|
169 #Previous code to download and untar. Not using anymore.
|
|
170 #full_filepath = os.path.join(destination, src_filename)
|
|
171 #
|
|
172 #Download ref: https://dzone.com/articles/how-download-file-python
|
|
173 #f = urllib2.urlopen(ctat_resource_lib_url)
|
|
174 #data = f.read()
|
|
175 #with open(full_filepath, 'wb') as code:
|
|
176 # code.write(data)
|
|
177 #
|
|
178 #Another way to download:
|
|
179 #try:
|
|
180 # urllib.urlretrieve(url=ctat_resource_lib_url, filename=full_filepath)
|
|
181 #
|
|
182 #Then untar the file.
|
|
183 #try:
|
|
184 # tarfile.open(full_filepath, mode='r:*').extractall()
|
|
185
|
|
186 if (_DownloadSuccessFile in orig_files_in_destdir):
|
|
187 # Since we are redoing the download,
|
|
188 # the success file needs to be removed
|
|
189 # until the download has succeeded.
|
|
190 os.remove(download_success_file_path)
|
|
191 # We want to transfer and untar the file without storing the tar file, because that
|
|
192 # adds all that much more space to the needed amount of free space on the disk.
|
|
193 # Use subprocess to pipe the output of curl into tar.
|
|
194 # Make curl silent so progress is not printed to stderr.
|
|
195 command = "curl --silent {:s} | tar -xzf - -C {:s}".format(src_location, cannonical_destination)
|
|
196 try: # to send the command that downloads and extracts the file.
|
|
197 command_output = subprocess.check_output(command, shell=True)
|
|
198 # FIX - not sure check_output is what we want to use. If we want to have an error raised on
|
|
199 # any problem, maybe we should not be checking output.
|
|
200 except subprocess.CalledProcessError:
|
|
201 print "ERROR: Trying to run the following command:\n\t{:s}".format(command)
|
|
202 raise
|
|
203 else:
|
|
204 index_was_downloaded = True
|
|
205
|
|
206 # Some code to help us if errors occur.
|
|
207 print "\n*******************************\nFinished download and extraction."
|
|
208 subprocess.check_call("ls -lad {:s}/*".format(cannonical_destination), shell=True)
|
|
209
|
|
210 files_in_destdir = set(os.listdir(cannonical_destination))
|
|
211 found_filenames = set()
|
|
212 for filename in files_in_destdir:
|
|
213 # There should be three files, but some OS's might have created
|
|
214 # other files in the directory, or maybe the user did.
|
|
215 # Look for the index files.
|
|
216 # The download files' names should start with the root_index_dirname
|
|
217 # print "Is root: {:s} in file: {:s}".format(root_index_dirname, filename)
|
|
218 if root_index_dirname in filename:
|
|
219 found_filenames.add(filename)
|
|
220 # print "The found_filenames are:\n\t{:s}".format(str(found_filenames))
|
|
221 if (len(found_filenames) >= 3):
|
|
222 # FIX - we could md5 the files to make sure they are correct.
|
|
223 # Or at least check their sizes, to see if the download completed ok.
|
|
224 # Also we could check the names of the files.
|
|
225 try:
|
|
226 # Create a file to indicate that the download succeeded.
|
|
227 subprocess.check_call("touch {:s}".format(download_success_file_path), shell=True)
|
|
228 except IOError:
|
|
229 print "The download_success file could not be created: " + \
|
|
230 "{:s}".format(download_success_file_path)
|
|
231 raise
|
|
232 else:
|
|
233 print "After download, the potential index files found are:\n\t{:s}".format(str(found_filenames))
|
|
234 raise ValueError("ERROR: Could not find the extracted index files " + \
|
|
235 "in the destination directory:\n\t{:s}".format(cannonical_destination))
|
|
236
|
|
237 return (cannonical_destination, root_index_dirname, index_was_downloaded)
|
|
238
|
|
239 def main():
|
|
240 #Parse Command Line
|
|
241 # print "At start before parsing arguments."
|
|
242 parser = argparse.ArgumentParser()
|
|
243 parser.add_argument('-d', '--download_location', default="", \
|
|
244 help='This is the download location of the centrifuge index.')
|
|
245 parser.add_argument('-n', '--display_name', default="", \
|
|
246 help='Is used as the selector text for the entry of this Centrifuge Index in the data table.')
|
|
247 parser.add_argument('-p', '--destination_path', \
|
|
248 help='Full path of the Centrifuge Index location or destination, either where it is, or where it will be placed.')
|
|
249 parser.add_argument('-o', '--output_filename', \
|
|
250 help='Name of the output file, where the json dictionary will be written.')
|
|
251 parser.add_argument('-f', '--force_download',
|
|
252 help='Forces download of the Centrifuge Index, even if previously downloaded. ' + \
|
|
253 'Requires download_location to be set in order to work.', action="store_true")
|
|
254 args = parser.parse_args()
|
|
255
|
|
256 # All of the input parameters are written by default to the output file prior to
|
|
257 # this program being called.
|
|
258 # But I do not get input values from the json file, but rather from command line.
|
|
259 # Just leaving the following code as a comment, in case it might be useful to someone later.
|
|
260 # params = from_json_string(open(filename).read())
|
|
261 # target_directory = params['output_data'][0]['extra_files_path']
|
|
262 # os.mkdir(target_directory)
|
|
263
|
|
264 # print "Arguments are parsed."
|
|
265 print "\ndownload_location is {:s}".format(str(args.download_location))
|
|
266 print "display_name is {:s}".format(str(args.display_name))
|
|
267 print "destination_path is {:s}\n".format(str(args.destination_path))
|
|
268 root_index_dirname = None
|
|
269 # FIX - Prob don't need index_was_downloaded. Not doing anything with it.
|
|
270 # But it indicates success downloading the index, so maybe should be checking it.
|
|
271 index_was_downloaded = False
|
|
272 if (args.download_location != ""):
|
|
273 index_directory, root_index_dirname, index_was_downloaded = \
|
|
274 download_index(src_location=args.download_location, \
|
|
275 destination=args.destination_path, \
|
|
276 force_download=args.force_download)
|
|
277 else:
|
|
278 cannonical_destination = os.path.realpath(args.destination_path)
|
|
279 if not os.path.exists(cannonical_destination):
|
|
280 raise ValueError("Cannot find the Centrifuge Index.\n" + \
|
|
281 "The directory does not exist:\n\t{:s}".format(index_directory))
|
|
282 # If args.destination_path is a directory containing
|
|
283 # a subdirectory that contains the index files,
|
|
284 # then we need to set the index_directory to be that subdirectory.
|
|
285 files_in_destination_path = os.listdir(cannonical_destination)
|
|
286 if (len(files_in_destination_path) == 1):
|
|
287 path_to_file = "{:s}/{:s}".format(cannonical_destination, files_in_destination_path[0])
|
|
288 if os.path.isdir(path_to_file):
|
|
289 index_directory = path_to_file
|
|
290 else:
|
|
291 index_directory = cannonical_destination
|
|
292 else:
|
|
293 index_directory = cannonical_destination
|
|
294 # Get the root_index_dirname of the index from the index_directory name.
|
|
295 root_index_dirname = index_directory.split("/")[-1].split(".")[0]
|
|
296
|
|
297 # Check if there is an actual Centrifuge Index file in the index_directory.
|
|
298 print "\nThe location of the Centrifuge Index is {:s}.\n".format(index_directory)
|
|
299 files_in_index_directory = set(os.listdir(index_directory))
|
|
300 index_file_found = False
|
|
301 index_file_path = index_directory
|
|
302 for filename in files_in_index_directory:
|
|
303 # The current index is split into 3 files.
|
|
304 # filenames are in the form: index_root_name.#.cf,
|
|
305 # where # is a numeral (1, 2, or 3)
|
|
306 # indicating the order of the files.
|
|
307 if filename.split(".")[-1] == _CentrifugeIndexFileExtension:
|
|
308 index_file_found = True
|
|
309 # The centrifuge program wants the root name of the files to be final part of the path.
|
|
310 index_file_path = "{:s}/{:s}".format(index_directory, filename.split(".")[0])
|
|
311 if not index_file_found:
|
|
312 raise ValueError("Cannot find any Centrifuge Index files.\n" + \
|
|
313 "The contents of the directory {:s} are:\n\t".format(index_directory) + \
|
|
314 "\n\t".join(files_in_index_directory))
|
|
315
|
|
316 # Set the display_name
|
|
317 if (args.display_name is None) or (args.display_name == ""):
|
|
318 # Use the root_index_dirname.
|
|
319 if (root_index_dirname != None) and (root_index_dirname != ""):
|
|
320 display_name = _CTAT_Centrifuge_DisplayNamePrefix + root_index_dirname
|
|
321 else:
|
|
322 display_name = _CTAT_Centrifuge_DisplayNamePrefix + _CTAT_CentrifugeDir_Name
|
|
323 print "WARNING: Did not set the display name. Using the default: {:s}".format(display_name_value)
|
|
324 else:
|
|
325 display_name = _CTAT_Centrifuge_DisplayNamePrefix + args.display_name
|
|
326 display_name = display_name.replace(" ","_")
|
|
327
|
|
328 # Set the unique_id
|
|
329 datetime_stamp = datetime.now().strftime("_%Y_%m_%d_%H_%M_%S_%f")
|
|
330 if (root_index_dirname != None) and (root_index_dirname != ""):
|
|
331 unique_id = root_index_dirname + datetime_stamp
|
|
332 else:
|
|
333 unique_id = _CTAT_CentrifugeDir_Name + datetime_stamp
|
|
334
|
|
335 print "The Index's display_name will be set to: {:s}\n".format(display_name)
|
|
336 print "Its unique_id will be set to: {:s}\n".format(unique_id)
|
|
337 print "Its dir_path will be set to: {:s}\n".format(index_file_path)
|
|
338
|
|
339 data_manager_dict = {}
|
|
340 data_manager_dict['data_tables'] = {}
|
|
341 data_manager_dict['data_tables'][_CTAT_CentrifugeIndexTableName] = []
|
|
342 data_table_entry = dict(value=unique_id, name=display_name, path=index_file_path)
|
|
343 data_manager_dict['data_tables'][_CTAT_CentrifugeIndexTableName].append(data_table_entry)
|
|
344
|
|
345 # Temporarily the output file's dictionary is written for debugging:
|
|
346 print "The dictionary for the output file is:\n\t{:s}".format(str(data_manager_dict))
|
|
347 # Save info to json file. This is used to transfer data from the DataManager tool, to the data manager,
|
|
348 # which then puts it into the correct .loc file (I think).
|
|
349 # Remove the following line when testing without galaxy package.
|
|
350 open(args.output_filename, 'wb').write(to_json_string(data_manager_dict))
|
|
351
|
|
# Script entry point: run main() only when executed directly, not on import.
if __name__ == "__main__":
    main()
|