annotate data_manager/add_ctat_resource_lib.py @ 9:1717c42112ed draft

Uploaded
author trinity_ctat
date Sat, 23 Jun 2018 16:06:17 -0400
parents b2e6ed40840a
children a7cd51b60f58
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
1 #!/usr/bin/env python
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
2 # ref: https://galaxyproject.org/admin/tools/data-managers/how-to/define/
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
3
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
4 # Rewritten by H.E. Cicada Brokaw Dennis from a source downloaded from the toolshed and
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
5 # other example code on the web.
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
6 # This now allows downloading of a user selected library
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
7 # but only from the CTAT Genome Resource Library website.
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
8 # Ultimately we might want to allow the user to specify any location
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
9 # from which to download.
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
10 # Users can create or download other libraries and use this tool to add them if they don't want
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
11 # to add them by hand.
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
12
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
13 import argparse
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
14 import os
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
15 #import tarfile
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
16 #import urllib
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
17 import subprocess
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
18
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
19 # Comment out the following line when testing without galaxy package.
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
20 from galaxy.util.json import to_json_string
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
21 # The following is not being used, but leaving as info
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
22 # in case we ever want to get input values using json.
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
23 # from galaxy.util.json import from_json_string
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
24
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
25 # datetime.now() is used to create the unique_id
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
26 from datetime import datetime
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
27
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
28 # The FileListParser is used by get_ctat_genome_filenames(),
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
29 # which is called by the Data Manager interface (.xml file) to get
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
30 # the filenames that are available online at broadinstitute.org
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
31 # Not sure best way to do it.
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
32 # This object uses HTMLParser to look through the html
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
33 # searching for the filenames within anchor tags.
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
34 import urllib2
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
35 from HTMLParser import HTMLParser
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
36
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
# Web directories that are scanned for downloadable .tar.gz archives.
_CTAT_ResourceLib_URL = 'https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/'
_CTAT_MutationIndex_URL = 'https://data.broadinstitute.org/Trinity/CTAT/mutation/'

# Names and prefixes used when locating and labelling genome resource libraries.
_CTAT_Build_dirname = 'ctat_genome_lib_build_dir'
_CTAT_ResourceLib_DisplayNamePrefix = 'CTAT_GenomeResourceLib_'
_CTAT_ResourceLib_DefaultGenome = 'Unspecified_Genome'
_CTAT_HumanFusionLib_FilenamePrefix = 'CTAT_HumanFusionLib'
_CTAT_RefGenome_Filename = 'ref_genome.fa'
_CTAT_MouseGenome_Prefix = 'Mouse'
_CTAT_HumanGenome_Prefix = 'GRCh'

# Free-space thresholds, in bytes (binary gigabytes: 1 GiB == 1024 ** 3 bytes).
_NumBytesNeededForBuild = 62 * 1024 ** 3  # 66571993088. FIX - This might not be correct.
_NumBytesNeededForIndexes = 20 * 1024 ** 3  # 21474836480. FIX - This might not be correct.

# Suffix of the scratch file written (and removed) to verify that the
# download destination is writable.
_Download_TestFile = "write_testfile.txt"

# Suffixes/names of the marker files that record a completed step.
_DownloadSuccessFile = 'download_succeeded.txt'
_LibBuiltSuccessFile = 'build_succeeded.txt'
_MutationDownloadSuccessFile = 'mutation_index_download_succeeded.txt'
0
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
52
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
class FileListParser(HTMLParser):
    """Collect the href values of anchor tags that refer to tar.gz files.

    After feed() has been called with the HTML of a page, self.urls is the
    set of href values that contain "tar.gz" and do not contain "md5".
    The values are stored exactly as they appear in the page, so they may
    be bare filenames, relative paths, or full URLs.
    """

    def __init__(self):
        # Call the base class directly rather than through super():
        # under Python 2, HTMLParser is an "old style" class whose
        # inheritance chain does not include object.
        HTMLParser.__init__(self)
        self.urls = set()

    def handle_starttag(self, tag, attrs):
        """Add the href of an anchor tag to self.urls when it names a tar.gz.

        tag is the lower-cased tag name and attrs is the list of
        (name, value) pairs supplied by HTMLParser.
        """
        if tag != "a":
            return
        for name, value in attrs:
            # HTMLParser reports an attribute written without a value
            # (e.g. <a href>) as (name, None); skip it rather than fail
            # on the substring tests below.
            if name != "href" or not value:
                continue
            # Does the href have a tar.gz in it (and is it not an md5 file)?
            if ("tar.gz" in value) and ("md5" not in value):
                self.urls.add(value)
# End of class FileListParser
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
72
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
def get_ctat_genome_urls():
    """Return the selectable genome library archives found at _CTAT_ResourceLib_URL.

    The page at _CTAT_ResourceLib_URL is fetched and scanned with
    FileListParser for tar.gz links.

    Returns a list, sorted by filename, of 3-item tuples as needed for
    dynamic options:
        item one is the display name put into the option list (the filename),
        item two is the value put into the associated parameter (the full URL),
        item three is True for the entry that is selected, otherwise False.
    The first entry of the sorted list is the one marked as selected.

    The returned list is also printed to stdout.
    """
    # urljoin lives in urlparse on Python 2 and in urllib.parse on Python 3.
    try:
        from urlparse import urljoin
    except ImportError:
        from urllib.parse import urljoin

    # Open the url and retrieve the urls of the files in the directory.
    resource = urllib2.urlopen(_CTAT_ResourceLib_URL)
    try:
        theHTML = resource.read()
    finally:
        # Release the connection even if the read fails.
        resource.close()
    filelist_parser = FileListParser()
    filelist_parser.feed(theHTML)

    # The urls should look like:
    #   https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/GRCh37_v19_CTAT_lib_Feb092018.plug-n-play.tar.gz
    #   https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/Mouse_M16_CTAT_lib_Feb202018.source_data.tar.gz
    # But in actuality, they are coming in looking like:
    #   GRCh37_v19_CTAT_lib_Feb092018.plug-n-play.tar.gz
    #   Mouse_M16_CTAT_lib_Feb202018.source_data.tar.gz
    # urljoin handles both situations: a full url (http, https, ftp, ...)
    # is returned as is, and anything else is resolved relative to the
    # page location without doubling the slash that ends the base url.
    # Mouse genomes are included in the list - FIX - still not tested.
    entries = sorted(
        (url.split("/")[-1], urljoin(_CTAT_ResourceLib_URL, url))
        for url in filelist_parser.urls
    )
    # Mark the selected entry only after sorting. The parser keeps the urls
    # in a set, so its iteration order says nothing about display order.
    options = [(filename, full_url_path, index == 0)
               for index, (filename, full_url_path) in enumerate(entries)]
    print("The list being returned as options is:")
    print("{:s}\n".format(str(options)))
    return options
5
7f1257532b6f Uploaded
trinity_ctat
parents: 4
diff changeset
109
8
b2e6ed40840a Uploaded
trinity_ctat
parents: 7
diff changeset
def get_mutation_index_urls():
    """Return the selectable mutation index archives found at _CTAT_MutationIndex_URL.

    The page at _CTAT_MutationIndex_URL is fetched and scanned with
    FileListParser for tar.gz links.

    Returns a list, sorted by filename, of 3-item tuples as needed for
    dynamic options:
        item one is the display name put into the option list (the filename),
        item two is the value put into the associated parameter (the full URL),
        item three is True for the entry that is selected, otherwise False.
    The first entry of the sorted list is the one marked as selected.

    The returned list is also printed to stdout.
    """
    # urljoin lives in urlparse on Python 2 and in urllib.parse on Python 3.
    try:
        from urlparse import urljoin
    except ImportError:
        from urllib.parse import urljoin

    # Open the url and retrieve the urls of the files in the directory.
    resource = urllib2.urlopen(_CTAT_MutationIndex_URL)
    try:
        theHTML = resource.read()
    finally:
        # Release the connection even if the read fails.
        resource.close()
    filelist_parser = FileListParser()
    filelist_parser.feed(theHTML)

    # The urls should look like:
    #   https://data.broadinstitute.org/Trinity/CTAT/mutation/mc7.tar.gz
    #   https://data.broadinstitute.org/Trinity/CTAT/mutation/hg19.tar.gz
    # But in actuality, they are coming in looking like:
    #   hg19.tar.gz
    #   mc7.tar.gz
    # urljoin handles both situations: a full url (http, https, ftp, ...)
    # is returned as is, and anything else is resolved relative to the
    # page location without doubling the slash that ends the base url.
    entries = sorted(
        (url.split("/")[-1], urljoin(_CTAT_MutationIndex_URL, url))
        for url in filelist_parser.urls
    )
    # Mark the selected entry only after sorting. The parser keeps the urls
    # in a set, so its iteration order says nothing about display order.
    options = [(filename, full_url_path, index == 0)
               for index, (filename, full_url_path) in enumerate(entries)]
    print("The list being returned as options is:")
    print("{:s}\n".format(str(options)))
    return options
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
141
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
142 # The following was used by the example program to get input parameters through the json.
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
143 # Just leaving here for reference.
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
144 # We are getting all of our parameter values through command line arguments.
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
145 #def get_reference_id_name(params):
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
146 # genome_id = params['param_dict']['genome_id']
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
147 # genome_name = params['param_dict']['genome_name']
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
148 # return genome_id, genome_name
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
149 #
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
150 #def get_url(params):
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
151 # trained_url = params['param_dict']['trained_url']
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
152 # return trained_url
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
153
8
b2e6ed40840a Uploaded
trinity_ctat
parents: 7
diff changeset
154 # The following procedure is used to help with debugging and for user information.
6
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
def print_directory_contents(dir_path, num_levels):
    """Print an "ls -la" listing of dir_path and, optionally, its subdirectories.

    dir_path is the directory to list.
    num_levels is how many directory levels to list: 1 lists only dir_path,
    2 also lists each directory directly inside it, and so on.
    Nothing is printed when num_levels is less than 1.

    If dir_path does not exist or is not a directory, a single message
    saying so is printed and nothing else is done.

    Returns None.
    """
    import sys  # Only needed here, to flush our output before ls writes.

    if num_levels <= 0:
        return
    if not os.path.isdir(dir_path):
        print("Path either does not exist, or is not a directory:\n\t{:s}.".format(dir_path))
        return

    print("\nDirectory {:s}:".format(dir_path))
    # ls writes straight to the underlying stdout; flush first so the
    # heading cannot appear after the listing it introduces.
    sys.stdout.flush()
    # Pass the arguments as a list (no shell) so that a path containing
    # spaces or shell metacharacters is listed literally. "--" keeps a path
    # that starts with "-" from being read as an option. stderr is merged
    # into stdout, as the former "2>&1" redirection did.
    subprocess.call(["ls", "-la", "--", dir_path], stderr=subprocess.STDOUT)

    if num_levels > 1:
        for filename in os.listdir(dir_path):
            filename_path = os.path.join(dir_path, filename)
            if os.path.isdir(filename_path):
                print_directory_contents(filename_path, num_levels - 1)
6
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
170
0
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
171 def download_from_BroadInst(source, destination, force_download):
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
172 # Input Parameters
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
173 # source is the full URL of the file we want to download.
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
174 # It should look something like:
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
175 # https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/GRCh37_v19_CTAT_lib_Feb092018.plug-n-play.tar.gz
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
176 # destination is the location where the source file will be unarchived.
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
177 # Relative paths are expanded using the current working directory, so within Galaxy,
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
178 # it is best to send in absolute fully specified path names so you know to where
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
179 # the source file going to be extracted.
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
180 # force_download will cause a new download and extraction to occur, even if the destination
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
181 # has a file in it indicating that a previous download succeeded.
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
182 #
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
183 # Returns the following:
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
184 # return (downloaded_directory, download_has_source_data, genome_build_directory, lib_was_downloaded)
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
185 # downloaded_directory
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
186 # The directory which was created as a subdirectory of the destination directory
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
187 # when the download occurred, or if there was no download,
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
188 # possibly the same directory as destination, if that is where the data resides.
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
189 # download_has_source_data
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
190 # Is a boolean indicating whether the source file was "source_data" or was "plug-n-play".
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
191 # genome_build_directory
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
192 # The directory where the genome resource library is or where it should be built.
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
193 # It can be the same as the downloaded directory, but is sometimes a subdirectory of it.
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
194 # lib_was_downloaded
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
195 # Since it doesn't always do the download, the function returns whether download occurred.
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
196 lib_was_downloaded = False
7
f22a13378750 Uploaded
trinity_ctat
parents: 6
diff changeset
197 if len(source.split(":")) == 1:
8
b2e6ed40840a Uploaded
trinity_ctat
parents: 7
diff changeset
198 # Then we were given a source_url without a leading https: or similar.
7
f22a13378750 Uploaded
trinity_ctat
parents: 6
diff changeset
199 # Assume we only were given the filename and that it exists at _CTAT_ResourceLib_URL.
f22a13378750 Uploaded
trinity_ctat
parents: 6
diff changeset
200 source = "{:s}/{:s}".format(_CTAT_ResourceLib_URL, source)
8
b2e6ed40840a Uploaded
trinity_ctat
parents: 7
diff changeset
201 # else we might want to check that it is one of "http", "ftp", "file" or other accepted url starts.
0
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
202
4
c372930aaba1 Uploaded
trinity_ctat
parents: 0
diff changeset
203 print "In download_from_BroadInst(). The source_url is:\n\t{:s}".format(str(source))
c372930aaba1 Uploaded
trinity_ctat
parents: 0
diff changeset
204
0
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
205 # Get the root filename of the Genome Directory.
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
206 src_filename = source.split("/")[-1]
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
207 root_genome_dirname = src_filename.split(".")[0]
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
208 # If the src_filename indicates it is a source file, as opposed to plug-n-play,
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
209 # then we may need to do some post processing on it.
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
210 type_of_download = src_filename.split(".")[1]
7
f22a13378750 Uploaded
trinity_ctat
parents: 6
diff changeset
211 print "The file to be extracted is {:s}".format(src_filename)
f22a13378750 Uploaded
trinity_ctat
parents: 6
diff changeset
212 print "The type of download is {:s}".format(type_of_download)
0
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
213 download_has_source_data = (type_of_download == "source_data")
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
214
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
215 # We want to make sure that destination is absolute fully specified path.
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
216 cannonical_destination = os.path.realpath(destination)
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
217 if os.path.exists(cannonical_destination):
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
218 if not os.path.isdir(cannonical_destination):
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
219 raise ValueError("The destination is not a directory: " + \
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
220 "{:s}".format(cannonical_destination))
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
221 # else all is good. It is a directory.
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
222 else:
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
223 # We need to create it.
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
224 try:
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
225 os.makedirs(cannonical_destination)
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
226 except os.error:
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
227 print "ERROR: Trying to create the following directory path:"
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
228 print "\t{:s}".format(cannonical_destination)
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
229 raise
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
230
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
231 # Make sure the directory now exists and we can write to it.
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
232 if not os.path.exists(cannonical_destination):
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
233 # It should have been created, but if it doesn't exist at this point
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
234 # in the code, something is wrong. Raise an error.
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
235 raise OSError("The destination directory could not be created: " + \
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
236 "{:s}".format(cannonical_destination))
7
f22a13378750 Uploaded
trinity_ctat
parents: 6
diff changeset
237 test_writing_file = "{:s}/{:s}.{:s}".format(cannonical_destination, root_genome_dirname, _Download_TestFile)
0
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
238 try:
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
239 filehandle = open(test_writing_file, "w")
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
240 filehandle.write("Testing writing to this file.")
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
241 filehandle.close()
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
242 os.remove(test_writing_file)
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
243 except IOError:
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
244 print "The destination directory could not be written into: " + \
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
245 "{:s}".format(cannonical_destination)
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
246 raise
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
247
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
248 # Get the list of files in the directory,
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
249 # We use it to check for a previous download or extraction among other things.
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
250 orig_files_in_destdir = set(os.listdir(cannonical_destination))
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
251 # See whether the file has been downloaded already.
8
b2e6ed40840a Uploaded
trinity_ctat
parents: 7
diff changeset
252 # FIX - Try looking one or two directories above, as well as current directory,
b2e6ed40840a Uploaded
trinity_ctat
parents: 7
diff changeset
253 # and maybe one directory below,
b2e6ed40840a Uploaded
trinity_ctat
parents: 7
diff changeset
254 # for the download success file?
b2e6ed40840a Uploaded
trinity_ctat
parents: 7
diff changeset
255 # Not sure about this though...
7
f22a13378750 Uploaded
trinity_ctat
parents: 6
diff changeset
256 download_success_file = "{:s}.{:s}".format(root_genome_dirname, _DownloadSuccessFile)
f22a13378750 Uploaded
trinity_ctat
parents: 6
diff changeset
257 download_success_file_path = "{:s}/{:s}".format(cannonical_destination, download_success_file)
f22a13378750 Uploaded
trinity_ctat
parents: 6
diff changeset
258 if ((download_success_file not in orig_files_in_destdir) \
0
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
259 or (root_genome_dirname not in orig_files_in_destdir) \
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
260 or force_download):
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
261 # Check whether there is enough space on the device for the library.
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
262 statvfs = os.statvfs(cannonical_destination)
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
263 # fs_size = statvfs.f_frsize * statvfs.f_blocks # Size of filesystem in bytes
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
264 # num_free_bytes = statvfs.f_frsize * statvfs.f_bfree # Actual number of free bytes
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
265 num_avail_bytes = statvfs.f_frsize * statvfs.f_bavail # Number of free bytes that ordinary users
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
266 # are allowed to use (excl. reserved space)
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
267 if (num_avail_bytes < _NumBytesNeededForBuild):
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
268 raise OSError("There is insufficient space ({:s} bytes)".format(str(num_avail_bytes)) + \
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
269 " on the device of the destination directory: " + \
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
270 "{:s}".format(cannonical_destination))
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
271
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
272 #Previous code to download and untar. Not using anymore.
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
273 #full_filepath = os.path.join(destination, src_filename)
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
274 #
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
275 #Download ref: https://dzone.com/articles/how-download-file-python
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
276 #f = urllib2.urlopen(source)
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
277 #data = f.read()
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
278 #with open(full_filepath, 'wb') as code:
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
279 # code.write(data)
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
280 #
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
281 #Another way to download:
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
282 #try:
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
283 # urllib.urlretrieve(url=source, filename=full_filepath)
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
284 #
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
285 #Then untar the file.
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
286 #try:
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
287 # tarfile.open(full_filepath, mode='r:*').extractall()
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
288
7
f22a13378750 Uploaded
trinity_ctat
parents: 6
diff changeset
289 if (download_success_file in orig_files_in_destdir):
0
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
290 # Since we are redoing the download,
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
291 # the success file needs to be removed
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
292 # until the download has succeeded.
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
293 os.remove(download_success_file_path)
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
294 # We want to transfer and untar the file without storing the tar file, because that
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
295 # adds all that much more space to the needed amount of free space on the disk.
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
296 # Use subprocess to pipe the output of curl into tar.
6
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
297 command = "curl --silent {:s} | tar -xzf - -C {:s}".format(source, cannonical_destination)
0
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
298 try: # to send the command that downloads and extracts the file.
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
299 command_output = subprocess.check_output(command, shell=True)
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
300 # FIX - not sure check_output is what we want to use. If we want to have an error raised on
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
301 # any problem, maybe we should not be checking output.
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
302 except subprocess.CalledProcessError:
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
303 print "ERROR: Trying to run the following command:\n\t{:s}".format(command)
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
304 raise
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
305 else:
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
306 lib_was_downloaded = True
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
307
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
308 # Some code to help us if errors occur.
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
309 print "\n*******************************\nFinished download and extraction."
6
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
310 print_directory_contents(cannonical_destination, 2)
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
311 print "*******************************\n"
0
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
312
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
313 newfiles_in_destdir = set(os.listdir(cannonical_destination)) - orig_files_in_destdir
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
314 if (root_genome_dirname not in newfiles_in_destdir):
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
315 # Perhaps it has a different name than what we expected it to be.
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
316 # It will be the file that was not in the directory
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
317 # before we did the download and extraction.
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
318 found_filename = None
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
319 if len(newfiles_in_destdir) == 1:
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
320 found_filename = newfiles_in_destdir[0]
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
321 else:
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
322 for filename in newfiles_in_destdir:
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
323 # In most cases, there will only be one new file, but some OS's might have created
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
324 # other files in the directory.
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
325 # Look for the directory that was downloaded and extracted.
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
326 # The correct file's name should be a substring of the tar file that was downloaded.
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
327 if filename in src_filename:
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
328 found_filename = filename
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
329 if found_filename is not None:
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
330 root_genome_dirname = found_filename
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
331
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
332 downloaded_directory = "{:s}/{:s}".format(cannonical_destination, root_genome_dirname)
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
333
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
334 if (os.path.exists(downloaded_directory)):
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
335 try:
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
336 # Create a file to indicate that the download succeeded.
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
337 subprocess.check_call("touch {:s}".format(download_success_file_path), shell=True)
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
338 except IOError:
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
339 print "The download_success file could not be created: " + \
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
340 "{:s}".format(download_success_file_path)
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
341 raise
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
342 # Look for the build directory, or specify the path where it should be placed.
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
343 if len(os.listdir(downloaded_directory)) == 1:
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
344 # Then that one file is a subdirectory that should be the downloaded_directory.
6
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
345 # That is how the plug-n-play directories are structured.
0
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
346 subdir_filename = os.listdir(downloaded_directory)[0]
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
347 genome_build_directory = "{:s}/{:s}".format(downloaded_directory, subdir_filename)
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
348 else:
6
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
349 # In this case, we have source_data in the directory. The default will be to create
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
350 # the build directory in the downloaded_directory with the default _CTAT_Build_dirname.
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
351 # In this case, this directory will not exist yet until the library is built.
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
352 genome_build_directory = "{:s}/{:s}".format(downloaded_directory, _CTAT_Build_dirname)
0
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
353 else:
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
354 raise ValueError("ERROR: Could not find the extracted file in the destination directory:" + \
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
355 "\n\t{:s}".format(cannonical_destination))
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
356
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
357 return (downloaded_directory, download_has_source_data, genome_build_directory, lib_was_downloaded)
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
358
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
def gmap_the_library(genome_build_directory):
    """Run gmap_build on the library so that gmap-fusion can use it.

    genome_build_directory -- directory holding ref_genome.fa. It should normally
        be a fully specified path, though a relative path is passed through to
        the command unchanged.

    Raises subprocess.CalledProcessError if the gmap_build command exits with a
    non-zero status. The directory contents are printed whether or not the
    command succeeded, to help with debugging.
    """
    # The command prints messages out to stderr, even when there is not an error,
    # so route stderr to stdout. Otherwise, galaxy thinks an error occurred.
    # NOTE(review): the directory name is interpolated unquoted into a shell
    # command, so a path containing spaces or shell metacharacters will break it.
    command = "gmap_build -D {:s}/ -d ref_genome.fa.gmap -k 13 {:s}/ref_genome.fa 2>&1".format(
        genome_build_directory, genome_build_directory)
    try:  # to send the gmap_build command.
        subprocess.check_output(command, shell=True)
    except subprocess.CalledProcessError as err:
        print("ERROR: While trying to run the gmap_build command on the library:\n\t{:s}".format(command))
        # check_output captured everything gmap_build wrote (stderr included).
        # Show it, otherwise the reason for the failure is lost.
        if err.output:
            print(err.output)
        raise
    finally:
        # Some code to help us if errors occur.
        print("\n*******************************\nAfter running gmap_build.")
        print_directory_contents(genome_build_directory, 2)
        print("*******************************\n")
0
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
377
8
b2e6ed40840a Uploaded
trinity_ctat
parents: 7
diff changeset
def download_mutation_indexes(source_url, genome_build_directory, force_download):
    """Download the mutation indexes and extract them into genome_build_directory.

    source_url -- location of the gzipped tar file holding the indexes. If it
        contains no ":" (no leading "https:" or similar), it is taken to be a
        filename that exists at _CTAT_MutationIndex_URL.
    genome_build_directory -- directory into which the indexes are extracted.
        It is assumed to be valid. No checks are made to see whether it exists,
        whether we can write to it, etc.
    force_download -- if true, download even when the success file of a
        previous download is present.

    Returns True if the indexes were downloaded and extracted by this call,
    False if the download was skipped because it had succeeded previously.

    Raises OSError if fewer than _NumBytesNeededForIndexes bytes are available
    on the device of genome_build_directory, and subprocess.CalledProcessError
    if the download/extract command or the creation of the success file fails.
    """
    print("\n*****************************************************************")
    print("* The real mutation indexes have not yet been created. Just testing. *")
    print("*****************************************************************\n")
    index_was_downloaded = False
    if len(source_url.split(":")) == 1:
        # Then we were given a source_url without a leading https: or similar.
        # Assume we only were given the filename and that it exists at _CTAT_MutationIndex_URL.
        source_url = "{:s}/{:s}".format(_CTAT_MutationIndex_URL, source_url)

    print("In download_mutation_indexes(). The source_url is:\n\t{:s}".format(str(source_url)))

    # Get the root filename of the index file. It is used to name the success file.
    src_filename = source_url.split("/")[-1]
    root_genome_dirname = src_filename.split(".")[0]
    print("The mutation index file to be downloaded and extracted is {:s}".format(src_filename))

    # Get the list of files in the directory,
    # We use it to check for a previous download or extraction among other things.
    orig_files_in_destdir = set(os.listdir(genome_build_directory))
    # See whether the index file has been downloaded already.
    download_success_file = "{:s}.{:s}".format(root_genome_dirname, _MutationDownloadSuccessFile)
    download_success_file_path = "{:s}/{:s}".format(genome_build_directory, download_success_file)
    if (download_success_file not in orig_files_in_destdir) or force_download:
        # Check whether there is enough space on the device for the indexes.
        statvfs = os.statvfs(genome_build_directory)
        # f_bavail counts the free blocks that ordinary users are allowed to use
        # (it excludes reserved space).
        num_avail_bytes = statvfs.f_frsize * statvfs.f_bavail
        if num_avail_bytes < _NumBytesNeededForIndexes:
            raise OSError("There is insufficient space ({:s} bytes)".format(str(num_avail_bytes)) +
                          " for the indexes on the device of the destination directory: " +
                          "{:s}".format(genome_build_directory))
        if download_success_file in orig_files_in_destdir:
            # Since we are redoing the download,
            # the success file needs to be removed
            # until the download has succeeded.
            os.remove(download_success_file_path)
        # We want to transfer and untar the file without storing the tar file, because that
        # adds all that much more space to the needed amount of free space on the disk.
        # Use subprocess to pipe the output of curl into tar.
        # NOTE(review): the url and directory are interpolated unquoted into a shell
        # command, and the exit status of a shell pipeline is that of tar, not curl.
        command = "curl --silent {:s} | tar -xzf - -C {:s}".format(source_url, genome_build_directory)
        try:  # to send the command that downloads and extracts the file.
            # Only the exit status matters here, so there is no output to capture.
            subprocess.check_call(command, shell=True)
        except subprocess.CalledProcessError:
            print("ERROR: Trying to run the following command:\n\t{:s}".format(command))
            raise
        index_was_downloaded = True
        # Some code to help us if errors occur.
        print("\n*********************************************************")
        print("* Finished download and extraction of Mutation Indexes. *")
        print_directory_contents(genome_build_directory, 2)
        print("*********************************************************\n")
        try:
            # Create a file to indicate that the download succeeded.
            subprocess.check_call(["touch", download_success_file_path])
        except (subprocess.CalledProcessError, OSError):
            # check_call reports a failing touch as CalledProcessError and a
            # command that cannot be started as OSError.
            print("The download_success file could not be created: " +
                  "{:s}".format(download_success_file_path))
            raise
    return index_was_downloaded
b2e6ed40840a Uploaded
trinity_ctat
parents: 7
diff changeset
445
0
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
def build_the_library(genome_source_directory, genome_build_directory, build, gmap_build):
    """ Build the CTAT Genome Resource Library from its source data, when needed.

        genome_source_directory is the location of the source_data needed to build the library.
            Normally it is fully specified, but could be relative.
            If it is the empty string there is no source data: prep_genome_lib.pl is not run
            and no success file is written, but gmap_build is still run if requested.
        genome_build_directory is the location where the library will be built.
            It can be relative to the current working directory or an absolute path.
        build specifies whether to run prep_genome_lib.pl even if it was run before.
        gmap_build specifies whether to run gmap_build or not.

        Raises ValueError if genome_source_directory is given but does not exist, and
        subprocess.CalledProcessError if prep_genome_lib.pl or the creation of the
        success file fails.

        Following was the old way to do it. Before FusionFilter 0.5.0.
        prep_genome_lib.pl \
           --genome_fa ref_genome.fa \
           --gtf ref_annot.gtf \
           --blast_pairs blast_pairs.gene_syms.outfmt6.gz \
           --fusion_annot_lib fusion_lib.dat.gz
           --output_dir ctat_genome_lib_build_dir
        index_pfam_domain_info.pl  \
            --pfam_domains PFAM.domtblout.dat.gz \
            --genome_lib_dir ctat_genome_lib_build_dir
        gmap_build -D ctat_genome_lib_build_dir -d ref_genome.fa.gmap -k 13 ctat_genome_lib_build_dir/ref_genome.fa
    """
    print("Building the CTAT Genome Resource Library from source data at:\n\t{:s}".format(genome_source_directory))

    if genome_source_directory == "":
        # There is no source data, so there is nothing to build from and
        # no source directory in which to record a successful build.
        if gmap_build:
            gmap_the_library(genome_build_directory)
        return
    # Check for the directory before listing it, so that a missing directory
    # is reported with the intended message.
    if not os.path.exists(genome_source_directory):
        raise ValueError("Cannot build the CTAT Genome Resource Library. " +
                         "The source directory does not exist:\n\t{:s}".format(genome_source_directory))

    # Get the root filename of the Genome Directory.
    src_filename = genome_source_directory.split("/")[-1]
    root_genome_dirname = src_filename.split(".")[0]
    # The working directory is changed below, so every later access to the source
    # directory goes through its absolute path. A relative path would otherwise
    # stop resolving after the chdir.
    source_dir_path = os.path.abspath(genome_source_directory)
    # See whether the library has been built already. The success file is written into the source directory.
    files_in_sourcedir = set(os.listdir(source_dir_path))
    build_success_file = "{:s}.{:s}".format(root_genome_dirname, _LibBuiltSuccessFile)
    build_success_file_path = "{:s}/{:s}".format(source_dir_path, build_success_file)

    if (build_success_file not in files_in_sourcedir) or build:
        # prep_genome_lib.pl is given filenames relative to the source directory.
        # NOTE(review): after this chdir a relative genome_build_directory is
        # interpreted relative to the source directory.
        os.chdir(source_dir_path)
        if build_success_file in files_in_sourcedir:
            # Since we are redoing the build,
            # the success file needs to be removed
            # until the build has succeeded.
            os.remove(build_success_file_path)
        # Create the command that builds the Genome Resource Library from the source data.
        command = "prep_genome_lib.pl --genome_fa ref_genome.fa --gtf ref_annot.gtf " + \
                  "--pfam_db PFAM.domtblout.dat.gz " + \
                  "--output_dir {:s} ".format(genome_build_directory)
        filename_of_HumanFusionLib = None
        # At the time this was written, the filename was CTAT_HumanFusionLib.v0.1.0.dat.gz
        # We only check the prefix, in case other versions are used later.
        # I assume there is only one in the directory, but if there are more than one,
        # the later one, alphabetically, will be used. os.listdir returns names in
        # arbitrary order, so sort them to make that true.
        for filename in sorted(os.listdir(source_dir_path)):
            if filename.split(".")[0] == _CTAT_HumanFusionLib_FilenamePrefix:
                filename_of_HumanFusionLib = filename
        if filename_of_HumanFusionLib is not None:
            # The mouse genomes do not have a fusion_annot_lib
            # so only add the following for Human genomes.
            command += "--fusion_annot_lib {:s} ".format(filename_of_HumanFusionLib) + \
                       "--annot_filter_rule AnnotFilterRule.pm "
        if gmap_build:
            command += "--gmap_build "
        # Send stderr of the command to stdout, because some functions may write to stderr,
        # even though no error has occurred. We will depend on error code return in order
        # to know if an error occurred.
        command += " 2>&1"
        try:  # to send the prep_genome_lib command.
            subprocess.check_call(command, shell=True)
        except subprocess.CalledProcessError:
            print("ERROR: While trying to run the prep_genome_lib.pl command " +
                  "on the CTAT Genome Resource Library:\n\t{:s}".format(command))
            raise
        finally:
            # Some code to help us if errors occur.
            print("\n*******************************")
            print("Contents of Genome Source Directory {:s}:".format(genome_source_directory))
            print_directory_contents(source_dir_path, 2)
            print("\nContents of Genome Build Directory {:s}:".format(genome_build_directory))
            print_directory_contents(genome_build_directory, 2)
            print("*******************************\n")
    elif gmap_build:
        # The library was built previously, so only the gmap index is needed.
        gmap_the_library(genome_build_directory)

    try:
        # Create a file to indicate that the build succeeded.
        subprocess.check_call(["touch", build_success_file_path])
    except (subprocess.CalledProcessError, OSError):
        # check_call reports a failing touch as CalledProcessError and a
        # command that cannot be started as OSError.
        print("The build_success file could not be created: " +
              "{:s}".format(build_success_file_path))
        raise
0
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
535
6
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
536 def search_for_genome_build_dir(top_dir_path):
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
537 # If we do not download the directory, the topdir_path could be the
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
538 # location of the genome resource library, but we also want to allow the
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
539 # user to give the same value for top_dir_path that they do when a
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
540 # build happens, so we need to handle all three cases:
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
541 # 1) Is the top_dir_path the build directory,
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
542 # 2) or is it inside of the given directory,
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
543 # 3) or is it inside a subdirectory of the given directory.
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
544 # The source_data downloads are built to a directory named _CTAT_Build_dirname,
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
545 # and the plug-n-play downloads contain a sub-directory named _CTAT_Build_dirname.
7
f22a13378750 Uploaded
trinity_ctat
parents: 6
diff changeset
546 # We also look for the genome name and return that, if we find it in the
f22a13378750 Uploaded
trinity_ctat
parents: 6
diff changeset
547 # directory name of the directory holding the build directory.
f22a13378750 Uploaded
trinity_ctat
parents: 6
diff changeset
548 top_dir_full_path = os.path.realpath(top_dir_path)
6
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
549 genome_build_directory = None
7
f22a13378750 Uploaded
trinity_ctat
parents: 6
diff changeset
550 genome_name_from_dirname = None
6
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
551 print_warning = False
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
552
7
f22a13378750 Uploaded
trinity_ctat
parents: 6
diff changeset
553 if not os.path.exists(top_dir_full_path):
6
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
554 raise ValueError("Cannot find the CTAT Genome Resource Library. " + \
7
f22a13378750 Uploaded
trinity_ctat
parents: 6
diff changeset
555 "The given directory does not exist:\n\t{:s}".format(top_dir_full_path))
f22a13378750 Uploaded
trinity_ctat
parents: 6
diff changeset
556 elif not os.path.isdir(top_dir_full_path):
6
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
557 raise ValueError("Cannot find the CTAT Genome Resource Library. " + \
7
f22a13378750 Uploaded
trinity_ctat
parents: 6
diff changeset
558 "The given directory is not a directory:\n\t{:s}".format(top_dir_full_path))
f22a13378750 Uploaded
trinity_ctat
parents: 6
diff changeset
559 if top_dir_full_path.split("/")[-1] == _CTAT_Build_dirname:
f22a13378750 Uploaded
trinity_ctat
parents: 6
diff changeset
560 print "Build directory is: {:s}".format(top_dir_full_path)
6
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
561 # The top_dir_path is the path to the genome_build_directory.
7
f22a13378750 Uploaded
trinity_ctat
parents: 6
diff changeset
562 genome_build_directory = top_dir_full_path
6
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
563 else:
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
564 # Look for it inside of the top_dir_path directory.
7
f22a13378750 Uploaded
trinity_ctat
parents: 6
diff changeset
565 print "Looking inside of: {:s}".format(top_dir_full_path)
f22a13378750 Uploaded
trinity_ctat
parents: 6
diff changeset
566 top_dir_contents = os.listdir(top_dir_full_path)
6
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
567 if (_CTAT_Build_dirname in top_dir_contents):
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
568 # The genome_build_directory is inside of the top_dir_path directory.
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
569 print "1. Found it."
7
f22a13378750 Uploaded
trinity_ctat
parents: 6
diff changeset
570 genome_build_directory = "{:s}/{:s}".format(top_dir_full_path,_CTAT_Build_dirname)
6
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
571 else:
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
572 # Find all subdirectories containing the _CTAT_Build_dirname or the _CTAT_RefGenome_Filename.
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
573 # Look down the directory tree two levels.
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
574 build_dirs_in_subdirs = list()
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
575 subdirs_with_genome_files = list()
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
576 build_dirs_in_sub_subdirs = list()
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
577 sub_subdirs_with_genome_files = list()
7
f22a13378750 Uploaded
trinity_ctat
parents: 6
diff changeset
578 subdirs = [entry for entry in top_dir_contents if (os.path.isdir("{:s}/{:s}".format(top_dir_full_path,entry)))]
6
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
579 for subdir in subdirs:
7
f22a13378750 Uploaded
trinity_ctat
parents: 6
diff changeset
580 subdir_path = "{:s}/{:s}".format(top_dir_full_path, subdir)
6
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
581 subdir_path_contents = os.listdir(subdir_path)
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
582 # print "Is it one of:\n\t" + "\n\t".join(subdir_path_contents)
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
583 if (_CTAT_Build_dirname in subdir_path_contents):
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
584 # The genome_build_directory is inside of the subdir_path directory.
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
585 print "2a, Found one."
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
586 build_dirs_in_subdirs.append("{:s}/{:s}".format(subdir_path, _CTAT_Build_dirname))
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
587 if (_CTAT_RefGenome_Filename in subdir_path_contents):
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
588 subdirs_with_genome_files.append(subdir_path)
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
589 # Since we are already looping, loop through all dirs one level deeper as well.
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
590 sub_subdirs = [entry for entry in subdir_path_contents if (os.path.isdir("{:s}/{:s}".format(subdir_path,entry)))]
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
591 for sub_subdir in sub_subdirs:
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
592 sub_subdir_path = "{:s}/{:s}".format(subdir_path, sub_subdir)
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
593 sub_subdir_path_contents = os.listdir(sub_subdir_path)
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
594 # print "Is it one of:\n\t" + "\n\t".join(sub_subdir_path_contents)
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
595 if (_CTAT_Build_dirname in sub_subdir_path_contents):
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
596 # The genome_build_directory is inside of the sub_subdir_path directory.
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
597 print "3a. Found one."
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
598 build_dirs_in_sub_subdirs.append("{:s}/{:s}".format(sub_subdir_path, _CTAT_Build_dirname))
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
599 if (_CTAT_RefGenome_Filename in sub_subdir_path_contents):
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
600 sub_subdirs_with_genome_files.append(sub_subdir_path)
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
601 # Hopefully there is one and only one found build directory.
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
602 # If none are found we check for a directory containing the genome reference file,
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
603 # but the build process sometimes causes more than one directory to have a copy,
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
604 # so finding that file is not a sure thing.
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
605 if (len(build_dirs_in_subdirs) + len(build_dirs_in_sub_subdirs)) > 1:
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
606 print "\n***************************************"
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
607 print "Found multiple CTAT Genome Resource Libraries " + \
7
f22a13378750 Uploaded
trinity_ctat
parents: 6
diff changeset
608 "in the given directory:\n\t{:s}".format(top_dir_full_path)
f22a13378750 Uploaded
trinity_ctat
parents: 6
diff changeset
609 print_directory_contents(top_dir_full_path, 2)
6
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
610 print "***************************************\n"
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
611 raise ValueError("Found multiple CTAT Genome Resource Libraries " + \
7
f22a13378750 Uploaded
trinity_ctat
parents: 6
diff changeset
612 "in the given directory:\n\t{:s}".format(top_dir_full_path))
6
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
613 elif len(build_dirs_in_subdirs) == 1:
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
614 # The genome_build_directory is inside of the subdir_path directory.
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
615 print "2b, Found it."
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
616 genome_build_directory = build_dirs_in_subdirs[0]
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
617 elif len(build_dirs_in_sub_subdirs) == 1:
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
618 # The genome_build_directory is inside of the subdir_path directory.
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
619 print "3b, Found it."
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
620 genome_build_directory = build_dirs_in_sub_subdirs[0]
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
621 elif (len(sub_subdirs_with_genome_files) + len(subdirs_with_genome_files)) > 1:
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
622 print "\n***************************************"
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
623 print "Unable to find CTAT Genome Resource Library " + \
7
f22a13378750 Uploaded
trinity_ctat
parents: 6
diff changeset
624 "in the given directory:\n\t{:s}".format(top_dir_full_path)
6
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
625 print "And multiple directories contain {:s}".format(_CTAT_RefGenome_Filename)
7
f22a13378750 Uploaded
trinity_ctat
parents: 6
diff changeset
626 print_directory_contents(top_dir_full_path, 2)
6
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
627 print "***************************************\n"
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
628 raise ValueError("Unable to find CTAT Genome Resource Library " + \
7
f22a13378750 Uploaded
trinity_ctat
parents: 6
diff changeset
629 "in the given directory:\n\t{:s}".format(top_dir_full_path))
6
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
630 elif (len(sub_subdirs_with_genome_files) == 1):
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
631 print "3c, Maybe found it."
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
632 genome_build_directory = sub_subdirs_with_genome_files[0]
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
633 print_warning = True
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
634 elif (len(subdirs_with_genome_files) == 1):
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
635 print "2c, Maybe found it."
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
636 genome_build_directory = subdirs_with_genome_files[0]
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
637 print_warning = True
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
638 elif (_CTAT_RefGenome_Filename in top_dir_contents):
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
639 print "1c. Maybe found it."
7
f22a13378750 Uploaded
trinity_ctat
parents: 6
diff changeset
640 genome_build_directory = top_dir_full_path
6
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
641 print_warning = True
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
642 else:
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
643 print "\n***************************************"
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
644 print "Unable to find CTAT Genome Resource Library " + \
7
f22a13378750 Uploaded
trinity_ctat
parents: 6
diff changeset
645 "in the given directory:\n\t{:s}".format(top_dir_full_path)
f22a13378750 Uploaded
trinity_ctat
parents: 6
diff changeset
646 print_directory_contents(top_dir_full_path, 2)
6
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
647 print "***************************************\n"
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
648 raise ValueError("Unable to find CTAT Genome Resource Library " + \
7
f22a13378750 Uploaded
trinity_ctat
parents: 6
diff changeset
649 "in the given directory:\n\t{:s}".format(top_dir_full_path))
6
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
650 # end else
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
651 # Check if the CTAT Genome Resource Lib has anything in it (and specifically ref_genome.fa).
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
652 if (genome_build_directory is None):
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
653 print "\n***************************************"
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
654 print "Cannot find the CTAT Genome Resource Library " + \
7
f22a13378750 Uploaded
trinity_ctat
parents: 6
diff changeset
655 "in the given directory:\n\t{:s}".format(top_dir_full_path)
f22a13378750 Uploaded
trinity_ctat
parents: 6
diff changeset
656 print_directory_contents(top_dir_full_path, 2)
6
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
657 print "***************************************\n"
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
658 raise ValueError("Cannot find the CTAT Genome Resource Library " + \
7
f22a13378750 Uploaded
trinity_ctat
parents: 6
diff changeset
659 "in the given directory:\n\t{:s}".format(top_dir_full_path))
f22a13378750 Uploaded
trinity_ctat
parents: 6
diff changeset
660 else:
f22a13378750 Uploaded
trinity_ctat
parents: 6
diff changeset
661 if (_CTAT_RefGenome_Filename not in os.listdir(genome_build_directory)):
f22a13378750 Uploaded
trinity_ctat
parents: 6
diff changeset
662 print "\n***************************************"
f22a13378750 Uploaded
trinity_ctat
parents: 6
diff changeset
663 print "\nWARNING: Cannot find Genome Reference file {:s}".format(_CTAT_RefGenome_Filename) + \
f22a13378750 Uploaded
trinity_ctat
parents: 6
diff changeset
664 "in the genome build directory:\n\t{:s}".format(genome_build_directory)
f22a13378750 Uploaded
trinity_ctat
parents: 6
diff changeset
665 print_directory_contents(genome_build_directory, 2)
f22a13378750 Uploaded
trinity_ctat
parents: 6
diff changeset
666 print "***************************************\n"
f22a13378750 Uploaded
trinity_ctat
parents: 6
diff changeset
667 if print_warning and genome_build_directory:
f22a13378750 Uploaded
trinity_ctat
parents: 6
diff changeset
668 print "\n***************************************"
f22a13378750 Uploaded
trinity_ctat
parents: 6
diff changeset
669 print "\nWARNING: Cannot find the CTAT Genome Resource Library," + \
f22a13378750 Uploaded
trinity_ctat
parents: 6
diff changeset
670 "but found a {:s} file, so set its directory as the library.".format(_CTAT_RefGenome_Filename)
f22a13378750 Uploaded
trinity_ctat
parents: 6
diff changeset
671 print "This my not be the correct directory:\n\t{:s}".format(genome_build_directory)
f22a13378750 Uploaded
trinity_ctat
parents: 6
diff changeset
672 print_directory_contents(genome_build_directory, 2)
f22a13378750 Uploaded
trinity_ctat
parents: 6
diff changeset
673 print "***************************************\n"
6
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
674 return genome_build_directory
be2761745400 Uploaded
trinity_ctat
parents: 5
diff changeset
675
7
f22a13378750 Uploaded
trinity_ctat
parents: 6
diff changeset
def find_genome_name_in_path(path):
    """Return the CTAT genome name embedded in a path or URL, if there is one.

    The path is split on "/" and each element is checked to see whether it
    starts with _CTAT_MouseGenome_Prefix or _CTAT_HumanGenome_Prefix.
    The form of the genome name in directory names (if present in the path)
    looks like:
        GRCh37_v19_CTAT_lib_Feb092018
        Mouse_M16_CTAT_lib_Feb202018

    Args:
        path: A "/"-separated path or URL string. May be None or empty.

    Returns:
        The matching path element truncated at its first ".", or None when
        path is None or empty, or when no element starts with a genome prefix.
        If several elements match, the last one in the path is returned.
    """
    genome_name = None
    if (path is not None) and (path != ""):
        # str.startswith accepts a tuple, so both prefixes are tested in one call.
        genome_prefixes = (_CTAT_MouseGenome_Prefix, _CTAT_HumanGenome_Prefix)
        for element in path.split("/"):
            if element.startswith(genome_prefixes):
                # Remove any extension that might be in the filename.
                # The loop deliberately keeps going: a later match overrides
                # an earlier one.
                genome_name = element.split(".")[0]
    return genome_name
f22a13378750 Uploaded
trinity_ctat
parents: 6
diff changeset
689
0
d2c51cdc2172 Uploaded
trinity_ctat
parents:
diff changeset
def main():
    """Command line entry point of the CTAT Genome Resource Library data manager.

    Parses the command line, optionally downloads and/or builds the library,
    optionally builds the gmap indexes and downloads the mutation indexes,
    then writes a json dictionary describing the library (unique id, display
    name and directory path) to the file named by --output_filename.

    Raises:
        Whatever the download/build/search helpers raise. In particular
        search_for_genome_build_dir() raises ValueError when it cannot locate
        the library.
    """
    # Parse Command Line
    parser = argparse.ArgumentParser()
    parser.add_argument('-s', '--source_url', default='',
        help=('This is the url of a file with the data. ' +
              'They come from https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/.'))
    parser.add_argument('-n', '--display_name', default='',
        help='Is used as the display name for the entry of this Genome Resource Library in the data table.')
    parser.add_argument('-o', '--output_filename',
        help='Name of the output file, where the json dictionary will be written.')
    parser.add_argument('-d', '--force_download',
        help='Forces download of the Genome Resource Library, even if previously downloaded.',
        action='store_true')
    parser.add_argument('-b', '--build',
        help=('Forces build/rebuild the Genome Resource Library, even if previously built. ' +
              'Must have downloaded source_data for this to work.'),
        action='store_true')
    parser.add_argument('-g', '--gmap_build',
        help=('Must be selected if you want the library to be gmapped. ' +
              'Will force gmap_build of the Genome Resource Library, even if previously gmapped.'),
        action='store_true')
    # NOTE(review): this option combines default='' with action='store_true',
    # so its value is either '' or True and it can never carry the url that the
    # help text describes. Left unchanged because the calling wrapper and
    # download_mutation_indexes() are not visible here - TODO confirm intent.
    parser.add_argument('-m', '--download_mutation_indexes', default='',
        help=('Set to the url of the mutation indexes for the Library. ' +
              'Will download mutation indexes into the Genome Resource Library.'),
        action='store_true')
    parser.add_argument('-f', '--force_mutation_indexes_download',
        help=('Forces the mutation indexes to download, ' +
              'even if previously downloaded to this Library.'),
        action='store_true')
    requiredNamed = parser.add_argument_group('required named arguments')
    requiredNamed.add_argument('-p', '--destination_path', required=True,
        help='Full path of the CTAT Resource Library location or destination, either where it is, or where it will be placed.')
    args = parser.parse_args()

    # All of the input parameters are written by default to the output file prior to
    # this program being called.
    # But I do not get input values from the json file, but rather from command line.
    # Just leaving the following code as a comment, in case it might be useful to someone later.
    # params = from_json_string(open(filename).read())
    # target_directory = params['output_data'][0]['extra_files_path']
    # os.mkdir(target_directory)

    print("The value of source_url argument is:\n\t{:s}".format(str(args.source_url)))

    # FIX - not sure lib_was_downloaded actually serves a purpose...
    # The original intent was to check whether an attempted download actually succeeded before proceeding,
    # but I believe that in those situations, currently, exceptions are raised.
    # FIX - Need to double check that. Sometimes, although we are told to download, the function
    # could find that the files are already there, successfully downloaded from a prior attempt,
    # and does not re-download them.
    lib_was_downloaded = False
    lib_was_built = False
    download_has_source_data = False
    downloaded_directory = None
    genome_build_directory = None
    # FIX - need to make sure we are handling all "possible" combinations of arguments.
    # Probably would be good if we could simplify/remove some of them.
    # But I think the current interface is using them all.
    if (args.source_url != ""):
        downloaded_directory, download_has_source_data, genome_build_directory, lib_was_downloaded = \
            download_from_BroadInst(source=args.source_url,
                                    destination=args.destination_path,
                                    force_download=args.force_download)
    else:
        if (args.build):
            # Then the user wants to build the directory from the data
            # in the location that was given in destination_path.
            downloaded_directory = args.destination_path
        # With no source url, the library is looked for in destination_path
        # whether or not a build was requested.
        genome_build_directory = search_for_genome_build_dir(args.destination_path)

    print("\nThe location of the downloaded_directory is {:s}.\n".format(str(downloaded_directory)))
    print("\nThe location of the CTAT Genome Resource Library is {:s}.\n".format(genome_build_directory))

    # FIX - We should leave a file indicating build success the same way we do for download success.
    # To take out builds for testing, comment out the lines that do the building.
    # The command that builds the ctat genome library also has an option for building the gmap indexes.
    # That is why the gmap_build value is sent to build_the_library(), but if we are not building the
    # library, the user might still be asking for a gmap_build. That is done after rechecking for the
    # genome_build_directory.
    if (downloaded_directory is not None) and (download_has_source_data or args.build):
        build_the_library(downloaded_directory, genome_build_directory, True, args.gmap_build)
        lib_was_built = True
    elif downloaded_directory is None:
        print("No directory was downloaded and there is no source data, " +
              "so the Resource Library was not built (it may already be built).")
    # The following looks to see if the library actually exists after the build,
    # and raises an error if it cannot find the library files.
    # The reassignment of genome_build_directory should be superfluous,
    # since genome_build_directory should already point to the correct directory,
    # unless I made a mistake in the build code.
    genome_build_directory = search_for_genome_build_dir(genome_build_directory)

    if (args.gmap_build and not lib_was_built):
        # If we did not build the genome resource library
        # the user might still be asking for a gmap_build.
        gmap_the_library(genome_build_directory)

    if (args.download_mutation_indexes != ""):
        download_mutation_indexes(source_url=args.download_mutation_indexes,
                                  genome_build_directory=genome_build_directory,
                                  force_download=args.force_mutation_indexes_download)

    # Need to get the genome name.
    # Try each place the name could appear, in priority order, and stop at the
    # first one that yields a name.
    name_candidates = (args.source_url,
                       genome_build_directory,
                       downloaded_directory,
                       args.destination_path,
                       args.display_name)
    for candidate in name_candidates:
        genome_name = find_genome_name_in_path(candidate)
        if genome_name is not None:
            break
    else:
        # None of the candidates contained a genome name.
        genome_name = _CTAT_ResourceLib_DefaultGenome
        print("WARNING: We could not find a genome name in any of the directory paths.")

    # Determine the display_name for the library.
    if (args.display_name is None) or (args.display_name == ""):
        # Create the display_name from the genome_name.
        display_name = _CTAT_ResourceLib_DisplayNamePrefix + genome_name
    else:
        display_name = _CTAT_ResourceLib_DisplayNamePrefix + args.display_name
    display_name = display_name.replace(" ", "_")

    # Create a unique_id for the library.
    datetime_stamp = datetime.now().strftime("_%Y_%m_%d_%H_%M_%S_%f")
    unique_id = genome_name + "." + datetime_stamp

    print("The Genome Resource Library's display_name will be set to: {:s}\n".format(display_name))
    print("Its unique_id will be set to: {:s}\n".format(unique_id))
    print("Its dir_path will be set to: {:s}\n".format(genome_build_directory))

    data_table_entry = dict(value=unique_id, name=display_name, path=genome_build_directory)
    data_manager_dict = {'data_tables': {'ctat_genome_resource_libs': [data_table_entry]}}

    # Temporarily the output file's dictionary is written for debugging:
    print("The dictionary for the output file is:\n\t{:s}".format(str(data_manager_dict)))
    # Save info to json file. This is used to transfer data from the DataManager tool, to the data manager,
    # which then puts it into the correct .loc file (I think).
    # Comment out the following lines when testing without galaxy package.
    # The with-block makes sure the file is flushed and closed before exit.
    with open(args.output_filename, 'wb') as output_file:
        output_file.write(to_json_string(data_manager_dict))


# Only run when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()