annotate data_manager/add_ctat_resource_lib.py @ 5:7f1257532b6f draft

Uploaded
author trinity_ctat
date Tue, 01 May 2018 15:40:08 -0400
parents c372930aaba1
children be2761745400
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
1 #!/usr/bin/env python
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
2 # ref: https://galaxyproject.org/admin/tools/data-managers/how-to/define/
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
3
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
4 # Rewritten by H.E. Cicada Brokaw Dennis from a source downloaded from the toolshed and
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
5 # other example code on the web.
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
6 # This now allows downloading of a user selected library
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
7 # but only from the CTAT Genome Resource Library website.
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
8 # Ultimately we might want to allow the user to specify any location
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
9 # from which to download.
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
10 # Users can create or download other libraries and use this tool to add them if they don't want
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
11 # to add them by hand.
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
12
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
13 import argparse
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
14 import os
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
15 #import tarfile
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
16 #import urllib
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
17 import subprocess
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
18
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
19 # Comment out the following line when testing without galaxy package.
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
20 from galaxy.util.json import to_json_string
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
21 # The following is not being used, but leaving as info
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
22 # in case we ever want to get input values using json.
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
23 # from galaxy.util.json import from_json_string
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
24
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
25 # datetime.now() is used to create the unique_id
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
26 from datetime import datetime
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
27
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
28 # The FileListParser is used by get_ctat_genome_urls(),
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
29 # which is called by the Data Manager interface (.xml file) to get
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
30 # the filenames that are available online at broadinstitute.org
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
31 # Not sure best way to do it.
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
32 # This object uses HTMLParser to look through the html
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
33 # searching for the filenames within anchor tags.
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
34 import urllib2
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
35 from HTMLParser import HTMLParser
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
36
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
# Web page whose anchor tags list the downloadable resource library archives.
_CTAT_ResourceLib_URL = "https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/"
# Name used for the build directory when the download does not contain exactly one entry.
_CTAT_BuildDir_Name = "ctat_genome_lib_build_dir"
# Prefix and fallback genome name available for building display names.
_CTAT_ResourceLib_DisplayNamePrefix = "CTAT_GenomeResourceLib_"
_CTAT_ResourceLib_DefaultGenome = "Unspecified_Genome"
# Free space required on the destination device before a download is attempted.
_NumBytesNeededForBuild = 60 * 1024 ** 3  # 60 Gigabytes. FIX - This might not be correct.
# Scratch file written and removed to prove the destination directory is writable.
_Download_TestFile = "write_testfile.txt"
# Marker file created in the destination directory once a download has succeeded.
_DownloadSuccessFile = "download_succeeded.txt"
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
44
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
class FileListParser(HTMLParser):
    """Collect the href values of ``tar.gz`` archives found in anchor tags.

    Feed the parser an HTML page with ``feed()``; afterwards ``urls`` holds the
    set of href strings that contain "tar.gz" and do not contain "md5"
    (so the checksum files that accompany the archives are left out).
    """

    def __init__(self):
        # Have to use a direct call to the super class rather than using super():
        #     super(FileListParser, self).__init__()
        # because under Python 2 HTMLParser is an "old style" class and its
        # inheritance chain does not include object.
        HTMLParser.__init__(self)
        # href values collected so far; a set, so repeated links are stored once.
        self.urls = set()

    def handle_starttag(self, tag, attrs):
        """Record the href of an anchor tag when it refers to a tar.gz archive.

        tag is the lower-cased tag name and attrs is the list of
        (name, value) pairs supplied by HTMLParser.
        """
        if tag != "a":
            # Only anchor tags carry the filename references we want.
            return
        for name, value in attrs:
            # An attribute written without a value (e.g. <a href>) is reported
            # with value None; skip it instead of failing on the "in" test.
            if name != "href" or not value:
                continue
            if "tar.gz" in value and "md5" not in value:
                self.urls.add(value)
# End of class FileListParser
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
64
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
def get_ctat_genome_urls():
    """Build the option list of resource libraries available for download.

    Reads the page at _CTAT_ResourceLib_URL, collects the archive links found
    by FileListParser and converts them into dynamic-option tuples.

    Returns:
        A list, sorted by display name, of 3-tuples
        (display_name, value, selected) where display_name is the archive's
        file name, value is the absolute URL of the archive, and selected is
        True only for the first entry of the sorted list.
    """
    # Imported locally because only this function needs it. This is the
    # Python 2 module name, matching the urllib2 import this file relies on.
    from urlparse import urljoin

    # Open the url and retrieve the page; always release the connection.
    resource = urllib2.urlopen(_CTAT_ResourceLib_URL)
    try:
        listing_html = resource.read()
    finally:
        resource.close()
    filelist_parser = FileListParser()
    filelist_parser.feed(listing_html)

    # The hrefs may be absolute (http:, https: or ftp:), e.g.
    #     https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/GRCh37_v19_CTAT_lib_Feb092018.plug-n-play.tar.gz
    # but in actuality they have been observed to be relative, e.g.
    #     GRCh37_v19_CTAT_lib_Feb092018.plug-n-play.tar.gz
    # urljoin() returns absolute URLs unchanged and resolves relative ones
    # against the page location without doubling the "/" separator.
    names_and_urls = []
    for url in filelist_parser.urls:
        filename = url.split("/")[-1]
        if filename.split("_")[0] == "Mouse":
            # Take out the mouse genome options for now.
            # The mouse genome option is not handled correctly yet.
            continue
        names_and_urls.append((filename, urljoin(_CTAT_ResourceLib_URL, url)))
    names_and_urls.sort()  # So the list will be in alphabetical order.

    # For dynamic options we need to return an iterable of 3-item tuples:
    #     item one is the display name put into the option list,
    #     item two is the value put into the parameter for that option,
    #     item three is True or False, indicating whether the item is selected.
    # The selection is made after filtering and sorting so that exactly the
    # first displayed entry is the selected one.
    options = [
        (filename, full_url_path, position == 0)
        for position, (filename, full_url_path) in enumerate(names_and_urls)
    ]
    print("The list being returned as options is:")
    print("{:s}\n".format(str(options)))
    return options
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
100
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
101 # The following was used by the example program to get input parameters through the json.
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
102 # Just leaving here for reference.
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
103 # We are getting all of our parameter values through command line arguments.
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
104 #def get_reference_id_name(params):
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
105 # genome_id = params['param_dict']['genome_id']
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
106 # genome_name = params['param_dict']['genome_name']
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
107 # return genome_id, genome_name
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
108 #
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
109 #def get_url(params):
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
110 # trained_url = params['param_dict']['trained_url']
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
111 # return trained_url
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
112
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
def download_from_BroadInst(source, destination, force_download):
    """Download a resource library archive and extract it into destination.

    Input Parameters
        source is the full URL of the file we want to download.
            It should look something like:
            https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/GRCh37_v19_CTAT_lib_Feb092018.plug-n-play.tar.gz
        destination is the location where the source file will be unarchived.
            Relative paths are expanded using the current working directory,
            so within Galaxy it is best to send in an absolute, fully
            specified path so you know where the file will be extracted.
            The directory is created if it does not exist.
        force_download will cause a new download and extraction to occur,
            even if the destination has a file in it indicating that a
            previous download succeeded.

    Returns the tuple
        (downloaded_directory, download_has_source_data,
         genome_build_directory, lib_was_downloaded)
        downloaded_directory
            The subdirectory of destination that holds the extracted data.
        download_has_source_data
            True when the second dot-separated part of the source file name
            is "source_data" (as opposed to e.g. "plug-n-play").
        genome_build_directory
            The single entry inside downloaded_directory when there is exactly
            one, otherwise downloaded_directory/_CTAT_BuildDir_Name.
        lib_was_downloaded
            Since the download is not always done, whether it occurred.

    Raises
        ValueError if destination exists but is not a directory, or if the
            extracted directory cannot be found afterwards.
        OSError if the destination cannot be created, has less than
            _NumBytesNeededForBuild bytes available, or curl/tar cannot be
            started.
        IOError (or OSError) if the destination cannot be written into.
        subprocess.CalledProcessError if the extraction fails.
    """
    lib_was_downloaded = False

    print("In download_from_BroadInst(). The source_url is:\n\t{:s}".format(str(source)))

    # Get the root filename of the Genome Directory.
    src_filename = source.split("/")[-1]
    name_parts = src_filename.split(".")
    root_genome_dirname = name_parts[0]
    # If the src_filename indicates it is a source file, as opposed to
    # plug-n-play, then we may need to do some post processing on it.
    # A name without any "." simply has no download type.
    type_of_download = name_parts[1] if len(name_parts) > 1 else ""
    download_has_source_data = (type_of_download == "source_data")

    # We want to make sure that destination is an absolute fully specified path.
    canonical_destination = os.path.realpath(destination)
    if os.path.exists(canonical_destination):
        if not os.path.isdir(canonical_destination):
            raise ValueError("The destination is not a directory: "
                             "{:s}".format(canonical_destination))
        # else all is good. It is a directory.
    else:
        # We need to create it.
        try:
            os.makedirs(canonical_destination)
        except os.error:
            print("ERROR: Trying to create the following directory path:")
            print("\t{:s}".format(canonical_destination))
            raise

    # Make sure the directory now exists and we can write to it.
    if not os.path.exists(canonical_destination):
        # It should have been created, but if it doesn't exist at this point
        # in the code, something is wrong. Raise an error.
        raise OSError("The destination directory could not be created: "
                      "{:s}".format(canonical_destination))
    test_writing_file = os.path.join(canonical_destination, _Download_TestFile)
    try:
        with open(test_writing_file, "w") as filehandle:
            filehandle.write("Testing writing to this file.")
        os.remove(test_writing_file)
    except (IOError, OSError):
        print("The destination directory could not be written into: "
              "{:s}".format(canonical_destination))
        raise

    # Get the list of files in the directory. We use it to check for a
    # previous download or extraction among other things.
    orig_files_in_destdir = set(os.listdir(canonical_destination))
    # See whether the file has been downloaded already.
    download_success_file_path = os.path.join(canonical_destination, _DownloadSuccessFile)
    if ((_DownloadSuccessFile not in orig_files_in_destdir)
            or (root_genome_dirname not in orig_files_in_destdir)
            or force_download):
        # Check whether there is enough space on the device for the library.
        statvfs = os.statvfs(canonical_destination)
        # f_bavail counts the free blocks that ordinary users are allowed to
        # use (excluding reserved space).
        num_avail_bytes = statvfs.f_frsize * statvfs.f_bavail
        if num_avail_bytes < _NumBytesNeededForBuild:
            raise OSError("There is insufficient space ({:s} bytes)".format(str(num_avail_bytes)) +
                          " on the device of the destination directory: " +
                          "{:s}".format(canonical_destination))

        if _DownloadSuccessFile in orig_files_in_destdir:
            # Since we are redoing the download, the success file needs to
            # be removed until the download has succeeded.
            os.remove(download_success_file_path)
        try:  # to download and extract the file.
            _download_and_extract(source, canonical_destination)
        except subprocess.CalledProcessError as extraction_error:
            print("ERROR: Trying to run the following command:\n\t{:s}".format(
                str(extraction_error.cmd)))
            raise
        else:
            lib_was_downloaded = True

    # Some code to help us if errors occur.
    # NOTE(review): placed at function level (also run when no download was
    # needed); the original indentation should be checked to confirm.
    print("\n*******************************\nFinished download and extraction.")
    _print_debug_listing(canonical_destination)

    newfiles_in_destdir = set(os.listdir(canonical_destination)) - orig_files_in_destdir
    if root_genome_dirname not in newfiles_in_destdir:
        # Perhaps it has a different name than what we expected it to be.
        # It will be the file that was not in the directory
        # before we did the download and extraction.
        found_filename = None
        if len(newfiles_in_destdir) == 1:
            # A set cannot be indexed, so take its only element via iteration.
            found_filename = next(iter(newfiles_in_destdir))
        else:
            # In most cases, there will only be one new file, but some OS's
            # might have created other files in the directory.
            # Look for the directory that was downloaded and extracted.
            # The correct file's name should be a substring of the name of
            # the tar file that was downloaded.
            for filename in newfiles_in_destdir:
                if filename in src_filename:
                    found_filename = filename
        if found_filename is not None:
            root_genome_dirname = found_filename

    downloaded_directory = os.path.join(canonical_destination, root_genome_dirname)

    if not os.path.exists(downloaded_directory):
        raise ValueError("ERROR: Could not find the extracted file in the destination directory:" +
                         "\n\t{:s}".format(canonical_destination))

    try:
        # Create a file to indicate that the download succeeded
        # (same effect as "touch": create it, or refresh its timestamp).
        with open(download_success_file_path, "a"):
            os.utime(download_success_file_path, None)
    except (IOError, OSError):
        print("The download_success file could not be created: "
              "{:s}".format(download_success_file_path))
        raise

    # Look for the build directory, or specify the path where it should be placed.
    downloaded_entries = os.listdir(downloaded_directory)
    if len(downloaded_entries) == 1:
        # Then that one entry is the subdirectory holding the library.
        genome_build_directory = os.path.join(downloaded_directory, downloaded_entries[0])
    else:
        genome_build_directory = os.path.join(downloaded_directory, _CTAT_BuildDir_Name)

    return (downloaded_directory, download_has_source_data, genome_build_directory, lib_was_downloaded)


def _download_and_extract(source, destination_dir):
    """Stream source through curl into tar, extracting into destination_dir.

    The archive is piped straight from curl into tar without storing the tar
    file, because storing it would add that much more to the free space
    needed on the disk. The two programs are started without a shell, so
    source and destination_dir are passed as literal arguments.

    Raises subprocess.CalledProcessError when tar exits with a non-zero
    status, which is the status the former "curl | tar" shell pipeline
    reported.
    """
    curl_command = ["curl", source]
    tar_command = ["tar", "-xzvf", "-", "-C", destination_dir]
    with open(os.devnull, "w") as devnull:
        curl = subprocess.Popen(curl_command, stdout=subprocess.PIPE)
        try:
            # tar's verbose file list was captured and discarded before, so
            # keep it out of the tool's output.
            tar = subprocess.Popen(tar_command, stdin=curl.stdout, stdout=devnull)
        except OSError:
            # tar could not be started; do not leave curl running.
            curl.kill()
            curl.wait()
            raise
        finally:
            # Close our copy of the pipe so curl is stopped if tar exits early.
            curl.stdout.close()
        tar_status = tar.wait()
        curl.wait()
    if tar_status != 0:
        raise subprocess.CalledProcessError(
            tar_status,
            "curl {:s} | tar -xzvf - -C {:s}".format(source, destination_dir))


def _print_debug_listing(directory):
    """Print "ls -lad" of the entries one and two levels below directory.

    This is diagnostic output only, so nothing here is allowed to abort the
    caller: an empty level is skipped and failures are reported, not raised.
    """
    try:
        # Hidden entries are left out, as a shell "*" glob would do.
        first_level = [os.path.join(directory, name)
                       for name in sorted(os.listdir(directory))
                       if not name.startswith(".")]
        second_level = []
        for path in first_level:
            if os.path.isdir(path):
                second_level.extend(os.path.join(path, name)
                                    for name in sorted(os.listdir(path))
                                    if not name.startswith("."))
        for paths in (first_level, second_level):
            if paths:
                subprocess.call(["ls", "-lad"] + paths)
    except OSError as listing_error:
        print("Could not list the contents of {:s}: {:s}".format(directory, str(listing_error)))
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
284
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
def _gmap_debug_ls(path):
    """Print an ``ls -la`` listing of path without ever raising.

    The listing is diagnostic output only, so a failure to produce it is
    reported as a warning rather than being allowed to replace (or create)
    an error for the caller.
    """
    try:
        # An argument list (no shell) keeps paths that contain spaces or
        # shell metacharacters from breaking the listing.
        subprocess.check_call(["ls", "-la", path])
    except (subprocess.CalledProcessError, OSError) as err:
        print("WARNING: Could not list {:s}:\n\t{:s}".format(path, str(err)))


def _gmap_debug_entries(directory, levels):
    """List every entry of directory, descending ``levels`` levels below it."""
    if levels <= 0 or not os.path.isdir(directory):
        return
    try:
        entries = os.listdir(directory)
    except OSError as err:
        print("WARNING: Could not read {:s}:\n\t{:s}".format(directory, str(err)))
        return
    for entry in entries:
        entry_path = "{:s}/{:s}".format(directory, entry)
        print("\nDirectory {:s}:".format(entry_path))
        _gmap_debug_ls(entry_path)
        _gmap_debug_entries(entry_path, levels - 1)


def gmap_the_library(genome_build_directory):
    """Run gmap_build on the library so that gmap-fusion can use it.

    genome_build_directory is the directory holding ref_genome.fa; the gmap
    index (ref_genome.fa.gmap) is written into the same directory. It should
    normally be a fully specified path, though a relative one is accepted.

    Raises subprocess.CalledProcessError if gmap_build exits with a non-zero
    status. Whether or not the command succeeds, a listing of the build
    directory (two levels deep) is printed to help diagnose problems.
    """
    command = "gmap_build -D {:s}/ -d ref_genome.fa.gmap -k 13 {:s}/ref_genome.fa".format(
        genome_build_directory, genome_build_directory)
    try:
        # The captured standard output of gmap_build is not used.
        subprocess.check_output(command, shell=True)
    except subprocess.CalledProcessError:
        print("ERROR: While trying to run the gmap_build command on the library:\n\t{:s}".format(command))
        raise
    finally:
        # Some output to help us if errors occur. It is strictly best-effort:
        # nothing in here may raise, or it would hide the gmap_build error.
        print("\n*******************************\nAfter running gmap_build.")
        if os.path.exists(genome_build_directory):
            print("\nBuild Directory {:s}:".format(genome_build_directory))
            _gmap_debug_ls(genome_build_directory)
            _gmap_debug_entries(genome_build_directory, 2)
        else:
            print("Genome Build Directory does not exist:\n\t{:s}".format(genome_build_directory))
        print("*******************************")
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
316
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
def _build_debug_ls(path):
    """Print an ``ls -la`` listing of path; report failures as a warning only."""
    try:
        # An argument list (no shell) copes with spaces in path names.
        subprocess.check_call(["ls", "-la", path])
    except (subprocess.CalledProcessError, OSError) as err:
        print("WARNING: Could not list {:s}:\n\t{:s}".format(path, str(err)))


def _build_debug_entries(directory, levels):
    """List every entry of directory, descending ``levels`` levels below it."""
    if levels <= 0 or not os.path.isdir(directory):
        return
    try:
        entries = os.listdir(directory)
    except OSError as err:
        print("WARNING: Could not read {:s}:\n\t{:s}".format(directory, str(err)))
        return
    for entry in entries:
        entry_path = "{:s}/{:s}".format(directory, entry)
        print("\nDirectory {:s}:".format(entry_path))
        _build_debug_ls(entry_path)
        _build_debug_entries(entry_path, levels - 1)


def _build_debug_tree(title, directory, levels):
    """Print a titled listing of directory, or say that it does not exist."""
    if os.path.exists(directory):
        print("\n{:s} {:s}:".format(title, directory))
        _build_debug_ls(directory)
        _build_debug_entries(directory, levels)
    else:
        print("Genome {:s} does not exist:\n\t{:s}".format(title, directory))


def build_the_library(genome_source_directory, genome_build_directory, build, gmap_build):
    """Build the CTAT Genome Resource Library and/or gmap it.

    genome_source_directory is the location of the source_data needed to
        build the library. Normally it is fully specified, but it may be
        relative. An empty string or None means there is no source data.
    genome_build_directory is the location where the library will be built.
        It can be relative to the current working directory or absolute.
    build specifies whether to run prep_genome_lib.pl even if it was run before.
    gmap_build specifies whether to run gmap_build or not.

    When there is source data and build is true, prep_genome_lib.pl is run
    from inside the source directory (the process's working directory is
    changed to it), with --gmap_build added if gmap_build is true.
    Otherwise, if gmap_build is true, only gmap_the_library() is run.

    Raises ValueError if a build is requested from a source directory that
    does not exist, and subprocess.CalledProcessError if prep_genome_lib.pl
    exits with a non-zero status.

    The following was the old way to do it, before FusionFilter 0.5.0:
        prep_genome_lib.pl
            --genome_fa ref_genome.fa
            --gtf ref_annot.gtf
            --blast_pairs blast_pairs.gene_syms.outfmt6.gz
            --fusion_annot_lib fusion_lib.dat.gz
            --output_dir ctat_genome_lib_build_dir
        index_pfam_domain_info.pl
            --pfam_domains PFAM.domtblout.dat.gz
            --genome_lib_dir ctat_genome_lib_build_dir
        gmap_build -D ctat_genome_lib_build_dir -d ref_genome.fa.gmap -k 13 ctat_genome_lib_build_dir/ref_genome.fa
    """
    if build and not genome_source_directory:
        print("WARNING: A build was requested, but there is no source data directory. " +
              "Not running prep_genome_lib.pl.")
    if not (genome_source_directory and build):
        # Nothing to (re)build from source; at most index the existing library.
        if gmap_build:
            gmap_the_library(genome_build_directory)
        return

    if not os.path.exists(genome_source_directory):
        raise ValueError("Cannot build the CTAT Genome Resource Library. " +
            "The source directory does not exist:\n\t{:s}".format(genome_source_directory))

    # Resolve both paths before changing directory, so that relative paths
    # keep meaning "relative to the caller's working directory".
    genome_source_directory = os.path.abspath(genome_source_directory)
    genome_build_directory = os.path.abspath(genome_build_directory)
    # prep_genome_lib.pl is given file names relative to the source directory.
    os.chdir(genome_source_directory)
    # FIX - look for a fusion_annot_lib and include it, else omit it.
    command = "prep_genome_lib.pl --genome_fa ref_genome.fa --gtf ref_annot.gtf " + \
              "--fusion_annot_lib CTAT_HumanFusionLib.v0.1.0.dat.gz " + \
              "--annot_filter_rule AnnotFilterRule.pm " + \
              "--pfam_db PFAM.domtblout.dat.gz " + \
              "--output_dir {:s} ".format(genome_build_directory)
    if gmap_build:
        command += "--gmap_build "
    try:
        subprocess.check_call(command, shell=True)
    except subprocess.CalledProcessError:
        print("ERROR: While trying to run the prep_genome_lib.pl command " +
              "on the CTAT Genome Resource Library:\n\t{:s}".format(command))
        raise
    finally:
        # Some output to help us if errors occur. It is strictly best-effort:
        # nothing in here may raise, or it would hide the real build error.
        print("*******************************")
        _build_debug_tree("Source Directory", genome_source_directory, 1)
        _build_debug_tree("Build Directory", genome_build_directory, 2)
        print("*******************************")
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
389
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
def main():
    """Download and/or build a CTAT Genome Resource Library and register it.

    The library is either downloaded from --source_url into
    --destination_path, or taken to already exist at --destination_path.
    It is then optionally built and/or gmapped, and a data table entry
    (value, name, path) for the 'ctat_genome_resource_libs' table is written
    as json to --output_filename.

    Raises ValueError if no download was requested and --destination_path is
    missing or does not exist.
    """
    # Parse Command Line
    parser = argparse.ArgumentParser()
    parser.add_argument('-s', '--source_url', default="",
        help='This is the url of a file with the data. They come from https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/.')
    parser.add_argument('-n', '--display_name', default="",
        help='Is used as the display name for the entry of this Genome Resource Library in the data table.')
    parser.add_argument('-p', '--destination_path',
        help='Full path of the CTAT Resource Library location or destination, either where it is, or where it will be placed.')
    parser.add_argument('-o', '--output_filename',
        help='Name of the output file, where the json dictionary will be written.')
    parser.add_argument('-f', '--force_download',
        help='Forces download of the Genome Resource Library, even if previously downloaded.', action="store_true")
    parser.add_argument('-b', '--build',
        help='Forces build/rebuild the Genome Resource Library, even if previously built. ' +
             'Must have downloaded source_data for this to work.', action="store_true")
    parser.add_argument('-m', '--gmap_build',
        help='Must be selected if you want the library to be gmapped. ' +
             'Will force gmap_build of the Genome Resource Library, even if previously gmapped.', action="store_true")
    args = parser.parse_args()

    # All of the input parameters are written by default to the output file
    # prior to this program being called, but the input values are taken from
    # the command line rather than from that json file. The following is left
    # as a comment, in case it might be useful to someone later.
    # params = from_json_string(open(filename).read())
    # target_directory = params['output_data'][0]['extra_files_path']
    # os.mkdir(target_directory)

    print("The value of source_url argument is:\n\t{:s}".format(str(args.source_url)))

    # FIX - not sure the lib_was_downloaded actually serves a purpose...
    lib_was_downloaded = False
    download_has_source_data = False
    # If we do not download the directory, the destination_path should be the
    # location of the genome resource library.
    downloaded_directory = None
    # FIX - look inside of the args.destination_path to see if the build directory is inside it or is it.
    genome_build_directory = None
    # FIX - need to make sure we are handling all "possible" combinations of arguments.
    # Probably would be good if we could simplify/remove some of them.
    if args.source_url:
        downloaded_directory, download_has_source_data, genome_build_directory, lib_was_downloaded = \
            download_from_BroadInst(source=args.source_url,
                                    destination=args.destination_path,
                                    force_download=args.force_download)
    else:
        genome_build_directory = args.destination_path
        # --destination_path is optional to argparse, so it can be None here.
        if not genome_build_directory or not os.path.exists(genome_build_directory):
            raise ValueError("Cannot find the CTAT Genome Resource Library. " +
                "The directory does not exist:\n\t{:s}".format(str(genome_build_directory)))
        # FIX - Check if there is an actual CTAT Genome Resource Lib there.
        # _CTAT_BuildDir_Name

    print("\nThe location of the CTAT Genome Resource Library is {:s}.\n".format(genome_build_directory))

    # FIX - We should leave a file indicating build success the same way we do for download success.
    if download_has_source_data or args.build or args.gmap_build:
        # build_the_library() also handles the gmap-only case. An empty string
        # (rather than None) tells it that there is no source data directory.
        build_the_library(downloaded_directory or "", genome_build_directory, args.build, args.gmap_build)

    # Get the name out of the source's filename. It stays None when there is
    # no source_url, so that the fallbacks below are used.
    source_filename_root = None
    if args.source_url:
        source_filename_root = args.source_url.split("/")[-1].split(".")[0]

    # Determine the display_name for the library.
    if args.display_name:
        display_name = _CTAT_ResourceLib_DisplayNamePrefix + args.display_name
    elif source_filename_root:
        display_name = _CTAT_ResourceLib_DisplayNamePrefix + source_filename_root
    else:
        display_name = _CTAT_ResourceLib_DisplayNamePrefix + _CTAT_ResourceLib_DefaultGenome
        print("WARNING: We do not have a genome name. Using a default name, that might not be correct.")
    display_name = display_name.replace(" ", "_")
    print("The Genome Name will be set to: {:s}\n".format(display_name))

    # Create a unique_id for the library.
    datetime_stamp = datetime.now().strftime("_%Y_%m_%d_%H_%M_%S_%f")
    if source_filename_root:
        unique_id = source_filename_root + datetime_stamp
    elif downloaded_directory:
        # NOTE(review): unlike the other two cases, no datetime stamp is
        # appended here, so this id is not guaranteed unique - confirm intent.
        unique_id = os.path.basename(downloaded_directory).split(".")[0]
    else:
        unique_id = _CTAT_ResourceLib_DefaultGenome + datetime_stamp

    print("The Resource Lib's display_name will be set to: {:s}\n".format(display_name))
    print("Its unique_id will be set to: {:s}\n".format(unique_id))
    print("Its dir_path will be set to: {:s}\n".format(genome_build_directory))

    data_table_entry = dict(value=unique_id, name=display_name, path=genome_build_directory)
    data_manager_dict = {'data_tables': {'ctat_genome_resource_libs': [data_table_entry]}}

    # Temporarily the output file's dictionary is written for debugging:
    print("The dictionary for the output file is:\n\t{:s}".format(str(data_manager_dict)))
    # Save info to json file. This is used to transfer data from the DataManager tool, to the data manager,
    # which then puts it into the correct .loc file (I think).
    # Comment out the following lines when testing without galaxy package.
    json_text = to_json_string(data_manager_dict)
    if not isinstance(json_text, bytes):
        # The file is opened in binary mode, so text must be encoded first.
        json_text = json_text.encode("utf-8")
    with open(args.output_filename, 'wb') as output_file:
        output_file.write(json_text)
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
496
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
# Run the data manager only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()