annotate data_manager/add_ctat_resource_lib.py @ 46:f4f48007db67 draft

Uploaded
author trinity_ctat
date Thu, 25 Oct 2018 20:55:44 -0400
parents 3acb7bc809b5
children fb13fae2c873
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
44
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1 #!/usr/bin/env python
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
2 # ref: https://galaxyproject.org/admin/tools/data-managers/how-to/define/
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
3
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
4 # Rewritten by H.E. Cicada Brokaw Dennis from code downloaded from the toolshed and
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
5 # other example code on the web. It has however been extensively modified and augmented.
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
6 # This now allows downloading of a user selected library
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
7 # but only from the CTAT Genome Resource Library website.
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
8 # Ultimately we might want to allow the user to specify any location
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
9 # from which to download.
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
10 # Users can create or download other libraries and use this Data Manager to add them
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
11 # if they don't want to add them by hand.
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
12
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
13 import sys
45
3acb7bc809b5 More print statement fixes.
trinity_ctat
parents: 44
diff changeset
14 # The many calls to sys.stdout.flush() are done in order to get the output to be synchronized.
44
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
15 import argparse
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
16 import os
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
17 import shutil
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
18 import tarfile
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
19 import hashlib
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
20 import urllib
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
21 import urlparse
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
22 import contextlib
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
23 import subprocess
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
24
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
25 # Comment out the following line when testing without galaxy package.
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
26 from galaxy.util.json import to_json_string
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
27 # The following is not being used, but leaving as info
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
28 # in case we ever want to get input values using json.
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
29 # from galaxy.util.json import from_json_string
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
30
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
31 # datetime.now() is used to create the unique_id
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
32 from datetime import datetime
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
33
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
34 # The Data Manager uses a subclass of HTMLParser to look through a web page's html
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
35 # searching for the filenames within anchor tags.
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
36 import urllib2
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
37 from HTMLParser import HTMLParser
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
38
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
# ---- Remote locations that are scanned for downloadable archives ----
_CTAT_ResourceLib_URL = 'https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/'
_CTAT_Mutation_URL = 'https://data.broadinstitute.org/Trinity/CTAT/mutation/'

# ---- Directory names, file names and name prefixes ----
_CTAT_Build_dirname = 'ctat_genome_lib_build_dir'
_CTAT_MutationLibDirname = 'ctat_mutation_lib'
_CTAT_ResourceLib_DisplayNamePrefix = 'CTAT_GenomeResourceLib_'
_CTAT_ResourceLib_DefaultGenome = 'Unspecified_Genome'
_CTAT_HumanFusionLib_FilenamePrefix = 'CTAT_HumanFusionLib'
_CTAT_RefGenome_Filename = 'ref_genome.fa'
_CTAT_MouseGenome_Prefix = 'Mouse'
_CTAT_HumanGenome_Prefix = 'GRCh'
_COSMIC_Mutant_Filename = 'CosmicMutantExport.tsv.gz'
_COSMIC_Coding_Filename = 'CosmicCodingMuts.vcf.gz'

# ---- Disk space requirements, in bytes (1 Gigabyte == 1024 ** 3 bytes) ----
# FIX - The following numbers need to be checked and other numbers for gmap, etc.
# need to be determined. Values for each genome should be determined, so we can
# get more precise values for each genome.

# 10 Gigabytes. FIX - Not checked - Largest archive is currently 2.5GB.
_NumBytesNeededForSourceDataExtraction = 10 * 1024 ** 3
# 45 Gigabytes. Largest archive is currently 28GB and extracts to 43GB.
# Built Human Genome archive (GRCh38_v27_CTAT_lib_Feb092018) with mutation lib is 46GB.
# Fix - check amount with gmap.
_NumBytesNeededForPlugNPlayExtraction = 45 * 1024 ** 3
# 62 Gigabytes. FIX - This might not be correct.
_NumBytesNeededForBuild = 62 * 1024 ** 3
# 4 Gigabytes. Actually need about 3.8GB.
# Once built the downloaded archive could be deleted to reduce the amount used,
# but with the archive there and the Cosmic files and the built
# ctat_mutation_library, 3.8GB is needed. If the archive files are deleted after
# the integration of the library, only 1.8GB would be used at that point.
_NumBytesNeededForMutationResources = 4 * 1024 ** 3

# ---- Small text file names (judging by the names, presumably used as ----
# ---- write-test / step-completed markers; their use is outside this section) ----
_Write_TestFile = 'write_testfile.txt'
_DownloadSuccessFile = 'download_succeeded.txt'
_ExtractionSuccessFile = 'extraction_succeeded.txt'
_LibBuiltSuccessFile = 'build_succeeded.txt'
_GmapSuccessFile = 'gmap_succeeded.txt'
_MutationDownloadSuccessFile = 'mutation_download_succeeded.txt'
_MutationIntegrationSuccessFile = 'mutation_integration_succeeded.txt'

# ---- Library type strings, as they appear in the archive file names ----
_LIBTYPE_SOURCE_DATA = 'source_data'
_LIBTYPE_PLUG_N_PLAY = 'plug-n-play'
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
72
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
class resumable_URL_opener(urllib.FancyURLopener):
    """URL opener for downloads that can be resumed after an interruption.

    It lets a download restart from the point where a partial download was
    interrupted. This class and the code using it were found online:
    http://code.activestate.com/recipes/83208-resuming-download-of-a-file/

    The only reason for the subclass is to override the handler for
    HTTP status 206. That status means a partial file is being sent, which
    is acceptable here, so the handler deliberately does nothing.
    """

    def http_error_206(self, url, fp, errcode, errmsg, headers, data=None):
        # Intentionally a no-op: a partial-content reply is not an error for us.
        return None
# End of class resumable_URL_opener
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
84
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
class FileListParser(HTMLParser):
    """Collect the hrefs of tar.gz links found in an HTML page.

    The FileListParser object is used by get_ctat_genome_urls() and
    get_mutation_resource_urls(), which can be called by the Data Manager
    interface (.xml file) to get the filenames that are available online at
    broadinstitute.org. Apparently creating dynamic option lists this way is
    deprecated, but no other method exists by which the options can be
    obtained dynamically from the web. It is believed to be considered a
    security risk.

    After feed() has been called, self.urls holds the set of href values
    (exactly as written in the page) that contain "tar.gz" and do not
    contain "md5". These are assumed to be the downloadable archives this
    Data Manager is interested in.
    """

    def __init__(self):
        # Have to use a direct call to the super class rather than
        #     super(FileListParser, self).__init__()
        # because HTMLParser is an "old style" class (Python 2) and its
        # inheritance chain does not include object.
        HTMLParser.__init__(self)
        self.urls = set()

    def handle_starttag(self, tag, attrs):
        """Record the href of an anchor tag when it refers to a tar.gz file.

        tag   -- the lower-cased tag name supplied by HTMLParser.
        attrs -- list of (name, value) pairs for the tag's attributes.
        """
        if tag != "a":
            # Only anchor tags carry the file references we want.
            return
        for name, value in attrs:
            # print "Checking: {:s}".format(str((name, value)))
            if name != "href":
                continue
            if value is None:
                # HTMLParser reports a valueless attribute (<a href>) as
                # None; testing "tar.gz" in None would raise a TypeError.
                continue
            # Does the href have a tar.gz in it (and is it not a checksum file)?
            if ("tar.gz" in value) and ("md5" not in value):
                self.urls.add(value)
# End of class FileListParser
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
114
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
def get_ctat_genome_urls():
    """Return the selectable CTAT Genome Resource Library archives.

    Opens _CTAT_ResourceLib_URL and collects the tar.gz links on that page
    with a FileListParser. If the page cannot be retrieved, or is empty, a
    built-in list of known archive names is used instead.

    Returns a list suitable for Galaxy dynamic options: an iterable of
    3-item tuples where
        item one is the display name put into the option list,
        item two is the value put into the parameter associated with the
                 option list (the full url of the archive),
        item three is True or False, indicating whether the item is selected.
    The list is in alphabetical order and its first entry is the selected one.
    """
    # urlopen() reports failures by raising, it does not return None, so the
    # fallback to the default list has to be driven by the exception.
    try:
        with contextlib.closing(urllib2.urlopen(_CTAT_ResourceLib_URL)) as resource:
            theHTML = resource.read()
    except (urllib2.URLError, IOError):
        theHTML = None

    if theHTML:
        filelist_parser = FileListParser()
        filelist_parser.feed(theHTML)
        urls_to_return = filelist_parser.urls
    else:
        # These are the filenames for what was there at least until 2018/10/09.
        urls_to_return = (
            "GRCh37_v19_CTAT_lib_Feb092018.plug-n-play.tar.gz",
            "GRCh37_v19_CTAT_lib_Feb092018.source_data.tar.gz",
            "GRCh38_v27_CTAT_lib_Feb092018.plug-n-play.tar.gz",
            "GRCh38_v27_CTAT_lib_Feb092018.source_data.tar.gz",
            "Mouse_M16_CTAT_lib_Feb202018.plug-n-play.tar.gz",
            "Mouse_M16_CTAT_lib_Feb202018.source_data.tar.gz",
        )

    # The urls should look like:
    # https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/GRCh37_v19_CTAT_lib_Feb092018.plug-n-play.tar.gz
    # https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/Mouse_M16_CTAT_lib_Feb202018.source_data.tar.gz
    # But in actuality, they are coming in looking like:
    # GRCh37_v19_CTAT_lib_Feb092018.plug-n-play.tar.gz
    # Mouse_M16_CTAT_lib_Feb202018.source_data.tar.gz
    # Both situations are handled, as is an ftp: url.
    named_urls = []
    for url in urls_to_return:
        if urlparse.urlparse(url).scheme != "":
            full_url_path = url
        else:
            # Assume the path is relative to the page location. urljoin is
            # used rather than os.path.join because this is a url, not a
            # filesystem path.
            full_url_path = urlparse.urljoin(_CTAT_ResourceLib_URL, url)
        # Mouse genomes should work now (we hope) - FIX - still not tested.
        named_urls.append((os.path.basename(url), full_url_path))

    # Sort first, so the list is in alphabetical order, and only then mark
    # the first entry as selected; the incoming urls have no defined order.
    named_urls.sort()
    options = [(filename, full_url_path, position == 0)
               for position, (filename, full_url_path) in enumerate(named_urls)]

    print("The list being returned as options is:")
    print("{:s}\n".format(str(options)))
    sys.stdout.flush()
    return options
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
173
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
def get_mutation_resource_urls():
    """Return the selectable CTAT mutation resource archives.

    Opens _CTAT_Mutation_URL and collects the tar.gz links on that page with
    a FileListParser. If the page cannot be retrieved, or is empty, a
    built-in list of known archive names is used instead. Only archives
    whose file name starts with "mutation_lib." are offered.

    Returns a list suitable for Galaxy dynamic options: an iterable of
    3-item tuples where
        item one is the display name put into the option list,
        item two is the value put into the parameter associated with the
                 option list (the full url of the archive),
        item three is True or False, indicating whether the item is selected.
    The list is in alphabetical order and its first entry is the selected one.
    """
    # FIX - Rather than letting user choose mutation resource url,
    # download the correct one for the chosen library?
    # Not sure about this.
    # In that case don't provide a pull down interface for this.
    # FIX -

    # urlopen() reports failures by raising, it does not return None, so the
    # fallback to the default list has to be driven by the exception.
    try:
        with contextlib.closing(urllib2.urlopen(_CTAT_Mutation_URL)) as resource:
            theHTML = resource.read()
    except (urllib2.URLError, IOError):
        theHTML = None

    if theHTML:
        filelist_parser = FileListParser()
        filelist_parser.feed(theHTML)
        urls_to_return = filelist_parser.urls
    else:
        # These are the filenames for what was there at least until 2018/10/09.
        urls_to_return = (
            "mutation_lib.hg19.tar.gz",
            "mutation_lib.hg38.tar.gz",
        )

    # The urls should look like:
    # https://data.broadinstitute.org/Trinity/CTAT/mutation/mc7.tar.gz
    # https://data.broadinstitute.org/Trinity/CTAT/mutation/hg19.tar.gz
    # But in actuality, they are coming in looking like:
    # hg19.tar.gz
    # mc7.tar.gz
    #
    # On 2018/10/06, the following tar.gz files were present:
    # mutation_lib.hg19.tar.gz
    # mutation_lib.hg38.tar.gz
    # mc-7.tar.gz
    # ctat_mutation_demo.tar.gz
    #
    # Both situations are handled, as is an ftp: url.
    named_urls = []
    for url in urls_to_return:
        if urlparse.urlparse(url).scheme != "":
            full_url_path = url
        else:
            # Assume the path is relative to the page location. urljoin is
            # used rather than os.path.join because this is a url, not a
            # filesystem path.
            full_url_path = urlparse.urljoin(_CTAT_Mutation_URL, url)
        filename = os.path.basename(url)
        if filename.split(".")[0] == "mutation_lib":
            # As of 2018_10_09, the only ones supported have mutation_lib
            # as the first part of the name.
            named_urls.append((filename, full_url_path))

    # Sort first, so the list is in alphabetical order, and only then mark
    # the first entry as selected. Deciding this while looping over the raw
    # urls could select an arbitrary entry, or none at all when the first
    # url was one of those filtered out.
    named_urls.sort()
    options = [(filename, full_url_path, position == 0)
               for position, (filename, full_url_path) in enumerate(named_urls)]

    print("The list being returned as options is:")
    print("{:s}\n".format(str(options)))
    sys.stdout.flush()
    return options
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
234
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
235 # The following was used by the example program to get input parameters through the json.
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
236 # Just leaving here for reference.
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
237 # We are getting all of our parameter values through command line arguments.
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
238 #def get_reference_id_name(params):
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
239 # genome_id = params['param_dict']['genome_id']
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
240 # genome_name = params['param_dict']['genome_name']
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
241 # return genome_id, genome_name
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
242 #
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
243 #def get_url(params):
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
244 # trained_url = params['param_dict']['trained_url']
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
245 # return trained_url
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
246
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
def print_directory_contents(dir_path, num_levels):
    """Print an "ls -la" listing of dir_path and, optionally, its subdirectories.

    This procedure is used to help with debugging and for user information.

    dir_path   -- path of the directory to list.
    num_levels -- how many directory levels to list. A value of 1 lists only
                  dir_path itself, 2 also lists its immediate subdirectories,
                  and so on. Nothing is printed when it is less than 1.

    If dir_path does not exist or is not a directory, a message saying so is
    printed (once) instead of a listing. Returns None.
    """
    if num_levels <= 0:
        return
    if not os.path.isdir(dir_path):
        print("Path either does not exist, or is not a directory:\n\t{:s}.".format(dir_path))
        sys.stdout.flush()
        return

    print("\nDirectory {:s}:".format(dir_path))
    # Flush before starting ls, so our header is output ahead of the listing.
    sys.stdout.flush()
    try:
        # The path is handed to ls as a separate argument rather than being
        # pasted into a shell command line, so paths containing spaces or
        # shell metacharacters are listed correctly and are never executed.
        # "--" stops a path beginning with "-" being read as an option.
        subprocess.call(["ls", "-la", "--", dir_path], stderr=subprocess.STDOUT)
    except OSError as err:
        # The listing is only informational; do not fail if ls cannot be run.
        print("Unable to run ls on {:s}: {:s}".format(dir_path, str(err)))
        sys.stdout.flush()

    if num_levels > 1:
        for filename in os.listdir(dir_path):
            filename_path = os.path.join(dir_path, filename)
            if os.path.isdir(filename_path):
                print_directory_contents(filename_path, num_levels - 1)
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
266
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
def which(file):
    """Return the path of the first entry on the PATH that contains file.

    This is similar to the linux "which" command and is used to find the
    location of an executable program that is in the PATH. Unlike the real
    command it only checks that the path exists, not that it is executable.

    Returns the joined path, or None when no PATH entry contains file.
    """
    search_dirs = os.environ["PATH"].split(os.pathsep)
    candidates = (os.path.join(directory, file) for directory in search_dirs)
    return next((candidate for candidate in candidates
                 if os.path.exists(candidate)), None)
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
275
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
def size_of_file_at(file_url):
    """Return the size of the file at file_url, as an int.

    The url has to be opened in order to find out how big the file is; the
    size is taken from the Content-Length header of the response, and the
    connection is closed again before returning.
    """
    opener = resumable_URL_opener()
    with contextlib.closing(opener.open(file_url)) as response:
        content_length = response.headers['Content-Length']
    return int(content_length)
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
283
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
def md5sum_for(filename, blocksize=2**20):
    """Return the md5 checksum of a file as a hexadecimal string.

    filename  -- path of the file to read (opened in binary mode).
    blocksize -- number of bytes read per chunk, so the whole file never
                 has to be held in memory at once.
    """
    digest = hashlib.md5()
    with open(filename, "rb") as stream:
        # iter() with a sentinel keeps reading until an empty chunk (EOF).
        for chunk in iter(lambda: stream.read(blocksize), b""):
            digest.update(chunk)
    return digest.hexdigest()
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
296
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
def ctat_library_type(filepath):
    """Return the library type string embedded in an archive's file name.

    The type is the second dot-separated field of the base file name, e.g.
    "plug-n-play" or "source_data". If the filename indicates source_data,
    as opposed to plug-n-play, then the library will have to be built after
    it is downloaded.

    Raises IndexError if the base file name contains no ".".
    """
    _, base_filename = os.path.split(filepath)
    name_fields = base_filename.split(".")
    #print "The file {:s}".format(base_filename)
    #print "is of type {:s}".format(name_fields[1])
    return name_fields[1]
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
306
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
def find_genome_name_in_path(path, raise_error=False):
    # Looks through the elements of path (split on os.sep) for one that starts
    # with one of the known CTAT genome prefixes, and returns that element with
    # everything from its first "." onward removed (i.e. without extensions).
    # The form of the genome name in directory names (if present in the path) looks like:
    #     GRCh37_v19_CTAT_lib_Feb092018
    #     GRCh38_v27_CTAT_lib_Feb092018
    #     Mouse_M16_CTAT_lib_Feb202018
    # If more than one element matches, the last matching element is used.
    # Returns None if path is None or empty, or if no element matches.
    # If raise_error is True, a ValueError naming the path is raised
    # instead of returning None (or an empty name).
    genome_name = None
    if path:
        genome_prefixes = (_CTAT_MouseGenome_Prefix, _CTAT_HumanGenome_Prefix)
        for element in path.split(os.sep):
            if element.startswith(genome_prefixes):
                # Remove any extension that might be in the filename.
                genome_name = element.split(".")[0]
    if raise_error and not genome_name:
        # The original message had no placeholder, so the path was never shown.
        # {!s} is used so that a path of None can be reported as well.
        raise ValueError("Cannnot find genome name in the given filename path:\n\t{!s}".format(path))
    return genome_name
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
324
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
def bytes_needed_to_extract(archive_filepath):
    # Returns the number of bytes assumed to be needed to extract the archive.
    # The value depends only on the library type found in the archive's name:
    # one constant for source_data archives, another for everything else.
    # FIX -- This should be replaced by a series of statements that return the right value for each archive.
    # The numbers used now are estimates for the human genome, and so are big enough for the mouse genome, so ok for now.
    # But now we are also using this for the mutation resource files, so really need to FIX this.
    # FIX --
    if ctat_library_type(archive_filepath) == _LIBTYPE_SOURCE_DATA:
        return _NumBytesNeededForSourceDataExtraction
    # Assume otherwise that it is a plug-n-play archive.
    return _NumBytesNeededForPlugNPlayExtraction
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
336
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
def bytes_needed_to_build(source_data_filepath):
    # Returns the number of bytes assumed to be needed to build a library.
    # The same constant is returned for every input; source_data_filepath
    # is currently not looked at.
    # FIX - This should be replaced by a series of statements that return the right value for each archive.
    # The number used now estimates the largest size needed. Also, it is probably not correct.
    bytes_needed = _NumBytesNeededForBuild
    return bytes_needed
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
341
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
def create_success_file(full_file_path, contents=None):
    # Writes the file that marks some step as having succeeded.
    # full_file_path is the path of the file to write. It is not expected to
    #     exist beforehand; if it does, it is overwritten.
    # contents is optional text to put into the file. When it is None the
    #     file is still created, but left empty.
    # If the file cannot be created or written, a message is printed
    # and the IOError is re-raised to the caller.
    nothing_to_write = contents is None
    try:
        with open(full_file_path, "w") as marker_file:
            if not nothing_to_write:
                marker_file.write(contents)
    except IOError:
        print("The success indication file could not be created: " +
              "{:s}".format(full_file_path))
        sys.stdout.flush()
        raise
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
358
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
def download_file_from_url(file_url, dest_dir, resume_download=True):
    # Some of the code used in this procedure was downloaded and modified for our needs.
    # That code was at: http://code.activestate.com/recipes/83208-resuming-download-of-a-file/
    #
    # Given a file_url, downloads that file to dest_dir.
    # The url must specify a file to download, because the destination filename
    # is taken from the end of the url's path.
    # It is best to fully specify dest_dir. Otherwise dest_dir is interpreted
    # relative to whatever the current working directory is.
    # If resume_download is True (the default) and a partial download exists,
    # a Range header is used to fetch only the missing remainder. If the
    # existing file already has the size of the source file, nothing is fetched.
    # If resume_download is False, any existing download of the file is
    # overwritten by a new download.
    #
    # Returns the full path of the downloaded file.
    # Raises OSError if the device holding dest_dir does not have room for the
    # bytes still to be downloaded, and IOError if the transfer fails or if the
    # downloaded file does not end up the same size as the source file.

    # DOWNLOAD_BLOCK_SIZE = 65536 # 64KB. Old number was 8192 or 8KB.
    DOWNLOAD_BLOCK_SIZE = 1048576 # 1 MB
    download_complete = False
    existing_size = 0
    bytes_read = 0
    response_headers = None
    file_retriever = resumable_URL_opener()
    dest_filename = os.path.basename(file_url)
    dest_fullpath = os.path.join(dest_dir, dest_filename)
    source_filesize = size_of_file_at(file_url)
    print("Downloading {:s}\nSize of the file is {:d}".format(file_url, source_filesize))
    print("Destination file for the download is {:s}".format(dest_fullpath))
    sys.stdout.flush()

    # If the file exists and resume_download is requested, then only download the remainder.
    if resume_download and os.path.exists(dest_fullpath):
        existing_size = os.path.getsize(dest_fullpath)
        print("The destination file exists and is {:d} bytes in size.".format(existing_size))
        if source_filesize == existing_size:
            # The file exists and we already have the whole thing, so don't download again.
            print("The file has already been completely downloaded:\n\t{:s}".format(dest_fullpath))
            download_complete = True
        else:
            header = ("Range", "bytes={:s}-".format(str(existing_size)))
            # header is a tuple, so it is converted with !s rather than
            # being handed to the string-only {:s} format.
            print("Adding header to resume download:\n\t{!s}".format(header))
            file_retriever.addheader(*header)
        # Append, so that the bytes already downloaded are kept.
        open_mode = "ab"
    else:
        if os.path.exists(dest_fullpath):
            print("The destination file exists:\n\t{:s}".format(dest_fullpath))
            print("However a new download has been requested.")
            print("The download will overwrite the existing file.")
        else:
            print("The destination file does not exist yet.")
        existing_size = 0
        open_mode = "wb"
    sys.stdout.flush()

    # When the file is already complete there is nothing to transfer, so
    # neither the destination file nor the url is opened.
    if not download_complete:
        with open(dest_fullpath, open_mode) as output_file:
            try:
                # Check whether there is enough space on the device for the rest of the file to download.
                statvfs = os.statvfs(dest_dir)
                # num_avail_bytes is the number of free bytes that ordinary users
                # are allowed to use (excl. reserved space).
                # Perhaps should subtract some padding amount from num_avail_bytes
                # rather than raising only if there is less than exactly what is needed.
                num_avail_bytes = statvfs.f_frsize * statvfs.f_bavail
                if num_avail_bytes < (source_filesize - existing_size):
                    raise OSError("There is insufficient space ({:s} bytes)".format(str(num_avail_bytes)) +
                                  " on the device of the destination directory for the download: " +
                                  "{:s}".format(dest_dir))

                # closing() guarantees the connection is released even if the
                # transfer fails part way through.
                with contextlib.closing(file_retriever.open(file_url)) as source_file:
                    response_headers = source_file.headers
                    while True:
                        data = source_file.read(DOWNLOAD_BLOCK_SIZE)
                        if not data:
                            # An empty read means the source is exhausted.
                            break
                        output_file.write(data)
                        bytes_read = bytes_read + len(data)
            except IOError:
                print("Error while attempting to download {:s}".format(file_url))
                sys.stdout.flush()
                raise

    # Some output to help us if errors occur.
    if response_headers is not None:
        for k, v in response_headers.items():
            print("{!s} = {!s}".format(k, v))
    print("Downloaded {:s} bytes from {:s}".format(str(bytes_read), str(file_url)))
    dest_filesize = os.path.getsize(dest_fullpath)
    print("{:s} {:s}".format(str(dest_filesize), str(dest_fullpath)))
    sys.stdout.flush()
    if source_filesize != dest_filesize:
        raise IOError("Download error:\n\t" +
                      "The source file\n\t\t{:d}\t{:s}\n\t".format(source_filesize, file_url) +
                      "and the destination file\n\t\t{:d}\t{:s}\n\t".format(dest_filesize, dest_fullpath) +
                      "are different sizes.")
    return dest_fullpath
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
448
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
def ensure_we_can_write_numbytes_to(destination, numbytes):
    # Makes sure that destination is a usable directory with enough room:
    #     - creates the directory (and any missing parents) if it does not exist,
    #     - checks that a file can be written into it, by writing and then
    #       removing a small test file,
    #     - checks that numbytes bytes are available on the directory's device.
    # Raises ValueError if destination exists but is not a directory,
    # OSError if it cannot be created or there is too little space, and
    # IOError if the test file cannot be written.
    #
    # Returns the result of os.path.realpath() applied to destination, so that
    # callers have an absolute, fully specified path to work with.
    target_dir = os.path.realpath(destination)
    if not os.path.exists(target_dir):
        # We need to create it since it does not exist.
        try:
            os.makedirs(target_dir)
        except os.error:
            print("ERROR: Trying to create the following directory path:")
            print("\t{:s}".format(target_dir))
            sys.stdout.flush()
            raise
    elif not os.path.isdir(target_dir):
        raise ValueError("The destination is not a directory: " +
                         "{:s}".format(target_dir))
    # else all is good. It already is a directory.

    # The directory should exist by now. If it does not, something is wrong.
    if not os.path.exists(target_dir):
        raise OSError("The destination directory could not be created: " +
                      "{:s}".format(target_dir))

    # Make sure we can write to the directory.
    probe_filename = "{:s}.{:s}".format(os.path.basename(target_dir), _Write_TestFile)
    probe_filepath = os.path.join(target_dir, probe_filename)
    try:
        with open(probe_filepath, "w") as probe_file:
            probe_file.write("Testing writing to this file.")
        if os.path.exists(probe_filepath):
            os.remove(probe_filepath)
    except IOError:
        print("The destination directory could not be written into:\n\t" +
              "{:s}".format(target_dir))
        sys.stdout.flush()
        raise

    # Check whether there are numbytes available on the directory's device.
    # f_bavail counts the free blocks that ordinary users are allowed to use
    # (excl. reserved space), as opposed to f_bfree, which counts all free blocks.
    fs_stats = os.statvfs(target_dir)
    num_avail_bytes = fs_stats.f_frsize * fs_stats.f_bavail
    if num_avail_bytes < numbytes:
        raise OSError("There is insufficient space ({:s} bytes)".format(str(num_avail_bytes)) +
                      " on the device of the destination directory:\n\t" +
                      "{:s}\n\t{:d} bytes are needed.".format(target_dir, numbytes))

    return target_dir
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
502
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
def download_genome_archive(source_url, destination, force_new_download=False):
    # This function downloads but does not extract the archive at source_url.
    # This function can be called on a file whose download was interrupted, and if force_new_download
    # is False, the download will proceed where it left off.
    # After the download, the md5sum of the downloaded file is compared with the
    # value read from "<source_url>.md5", and a success file is written next to
    # the download. If that success file already exists and force_new_download
    # is False, nothing is downloaded.
    # If download does not succeed, an IOError is raised.
    # The function checks whether there is enough space at the destination for the download.
    # It raises an OSError if not.
    # ValueError can also be raised by this function, for example when
    # source_url does not contain a genome name.

    # Input Parameters
    # source_url is the full URL of the file we want to download.
    #     It should look something like:
    #     https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/GRCh37_v19_CTAT_lib_Feb092018.plug-n-play.tar.gz
    #     If only the filename is given, it is assumed to reside at _CTAT_ResourceLib_URL.
    # destination is the location (directory) where a copy of the source file will be placed.
    #     Relative paths are expanded using the current working directory, so within Galaxy,
    #     it is best to send in absolute fully specified path names so you know to where
    #     the source file is going to be copied.
    # force_new_download if True, will cause a new download to occur, even if the file has been downloaded previously.
    #
    # Returns the canonical path to the file that was downloaded.

    dest_fullpath = None
    url_parts = urlparse.urlparse(source_url)
    source_filename = os.path.basename(url_parts.path)
    if url_parts.scheme == "":
        # Then we were given a source_url without a leading https: or similar.
        # Assume we only were given the filename and that it exists at _CTAT_ResourceLib_URL.
        source_url = urlparse.urljoin(_CTAT_ResourceLib_URL, source_url)
    # FIX - We might want to otherwise check if we have a valid url and/or if we can reach it.

    print("Downloading:\n\t{:s}".format(str(source_url)))
    print("to:\n\t{:s}".format(destination))
    sys.stdout.flush()
    # The next is done so that if the source_url does not have a genome name in it, an error will be raised.
    find_genome_name_in_path(source_url, raise_error=True)
    cannonical_destination = ensure_we_can_write_numbytes_to(destination, size_of_file_at(source_url))

    # Get the list of files in the directory,
    # We use it to check for a previous download.
    orig_files_in_destdir = set(os.listdir(cannonical_destination))
    # See whether the file has been downloaded already.
    download_success_filename = "{:s}.{:s}".format(source_filename, _DownloadSuccessFile)
    download_success_full_file_path = os.path.join(cannonical_destination, download_success_filename)
    previously_downloaded = download_success_filename in orig_files_in_destdir

    if previously_downloaded and not force_new_download:
        print("The download success file exists, so no download is being attempted:")
        print("\t{:s}".format(download_success_full_file_path))
        print("Remove the file or set <Force New Download> if you want a new download to occur.")
        dest_filename = os.path.basename(source_url)
        dest_fullpath = os.path.join(cannonical_destination, dest_filename)
    else:
        if previously_downloaded:
            # Since we are redoing the download,
            # the success file needs to be removed
            # until the download has succeeded.
            os.remove(download_success_full_file_path)
        # The following raises an error if the download fails for some reason.
        dest_fullpath = download_file_from_url(source_url, cannonical_destination,
                                               resume_download=(not force_new_download))
        # Check the md5sum of the downloaded file to ensure the data in the file is correct.
        file_retriever = resumable_URL_opener()
        md5_url = "{:s}.md5".format(source_url)
        print("Checking the md5sum of the downloaded file.")
        try:
            # NOTE(review): if resumable_URL_opener derives from urllib.URLopener,
            # the second argument of open() is POST data rather than a file mode,
            # so "r" would turn this request into a POST. The opener class is not
            # visible here, so the call is left as it was -- confirm and drop "r".
            # closing() releases the connection even if reading fails.
            with contextlib.closing(file_retriever.open(md5_url, "r")) as md5_file:
                md5_lines = md5_file.readlines()
            # The md5 value is the first whitespace-separated field of the first line.
            md5_fields = md5_lines[0].split() if md5_lines else []
            if not md5_fields:
                # Report this as a download problem instead of letting a
                # bare IndexError escape.
                raise IOError("No md5sum could be read from:\n\t{:s}".format(md5_url))
            md5sum_from_web = md5_fields[0]
            md5sum_from_file = md5sum_for(dest_fullpath)
        except IOError:
            print("Error while attempting to check the md5sum for {:s}".format(dest_fullpath))
            sys.stdout.flush()
            raise
        if md5sum_from_web != md5sum_from_file:
            raise IOError("Download error:\n\t" +
                          "The md5 sum for\n\t\t({:s})\n\t".format(dest_fullpath) +
                          "does not match the value read from the web:\n\t\t" +
                          "({:s} != {:s})".format(md5sum_from_file, md5sum_from_web))
        print("Check of md5sum succeeded.")
        create_success_file(download_success_full_file_path,
                            "Download of:\n\t{:s}\n".format(source_url) +
                            "to:\n\t{:s}\nsucceeded.".format(dest_fullpath))
    sys.stdout.flush()

    # Some code to help us if errors occur.
    print("\n*******************************")
    print("* Finished download. *")
    sys.stdout.flush()
    print_directory_contents(cannonical_destination, 1)
    print("*******************************\n")
    sys.stdout.flush()

    return dest_fullpath
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
598
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
def extract_archive(archive_filepath, destination, force_new_extraction=False):
    """Extract a tar archive into ``destination`` unless it was done before.

    The archive is opened with the tarfile module and unpacked into the
    directory returned by ensure_we_can_write_numbytes_to(). A marker file
    named "<archive basename>.<_ExtractionSuccessFile>" in that directory
    signals an earlier successful extraction; when it is present the
    extraction is skipped unless ``force_new_extraction`` is true.

    This function never writes the marker itself. Some of the caller's
    error checking depends on what was extracted, so the caller is
    expected to write the marker once its own checks have passed.

    Returns None.
    """
    dest_dir = ensure_we_can_write_numbytes_to(
        destination, bytes_needed_to_extract(archive_filepath))

    # Name and location of the marker recording a previous successful extraction.
    marker_name = "{:s}.{:s}".format(os.path.basename(archive_filepath), _ExtractionSuccessFile)
    marker_path = os.path.join(dest_dir, marker_name)

    marker_present = marker_name in set(os.listdir(dest_dir))
    if marker_present and not force_new_extraction:
        # Extracted successfully on an earlier run; leave everything alone.
        print("The extraction success file exists, so no new extraction was attempted:")
        print("\t{:s}".format(marker_path))
        print("Remove the success file or set <force new extraction> if you want a new extraction to occur.")
    else:
        if marker_present:
            # A forced re-extraction invalidates the old marker until the
            # new extraction has been verified by the caller.
            os.remove(marker_path)
        with tarfile.open(archive_filepath, mode="r:*") as tar:
            tar.extractall(path=dest_dir)
    sys.stdout.flush()

    # Directory listing to help diagnose problems from the job log.
    print("\n*******************************************************")
    print("* Finished extraction. Destination directory listing. *")
    sys.stdout.flush()
    print_directory_contents(dest_dir, 1)
    print("*******************************************************\n")
    sys.stdout.flush()
    return
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
641
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
def _pick_extracted_dirname(expected_name, new_entries, archive_filename, parent_dir):
    """Choose the name of the directory an archive extraction created.

    expected_name is returned when it is among new_entries. Otherwise a
    single new entry is taken as the answer; with several new entries the
    last one (in sorted order) that is a directory under parent_dir and
    whose name is a substring of archive_filename is chosen. When nothing
    matches, expected_name is returned unchanged.
    """
    if expected_name in new_entries:
        return expected_name
    # Sets cannot be indexed and iterate in arbitrary order, so work on a
    # sorted list to get a well-defined, reproducible answer.
    candidates = sorted(new_entries)
    if len(candidates) == 1:
        return candidates[0]
    chosen = expected_name
    for entry in candidates:
        # Some OS's may drop extra files into the directory; the extracted
        # directory's name should be a substring of the tar file's name.
        if entry in archive_filename and os.path.isdir(os.path.join(parent_dir, entry)):
            chosen = entry
    return chosen


def extract_genome_file(archive_filepath, destination, force_new_extraction=False, keep_archive=False):
    """Extract a CTAT Genome Reference Library archive file.

    archive_filepath      Archive to extract; best given as an absolute path.
    destination           Directory into which the archive is extracted.
    force_new_extraction  Extract even if the archive was extracted before.
    keep_archive          When false, the archive file is removed after a
                          successful extraction.

    Returns the full path of the top level directory created by the
    extraction. Raises ValueError when that directory cannot be found in
    the destination directory.
    """
    print("Extracting:\n\t {:s}".format(str(archive_filepath)))
    print("to:\n\t{:s}".format(destination))
    sys.stdout.flush()
    cannonical_destination = ensure_we_can_write_numbytes_to(destination, bytes_needed_to_extract(archive_filepath))
    # The genome name taken from the archive's path should also be the
    # name of the extracted directory.
    genome_dirname = find_genome_name_in_path(archive_filepath, raise_error=True)

    # Compare the directory listing before and after the extraction to see
    # what the extraction created.
    orig_files_in_destination = set(os.listdir(cannonical_destination))
    extract_archive(archive_filepath, destination, force_new_extraction)
    newfiles_in_destdir = set(os.listdir(cannonical_destination)) - orig_files_in_destination

    # The directory may have a different name than expected; fall back to
    # whatever new sub-directory appeared during the extraction.
    genome_dirname = _pick_extracted_dirname(
        genome_dirname,
        newfiles_in_destdir,
        os.path.basename(archive_filepath),
        cannonical_destination)

    extracted_directory = os.path.join(cannonical_destination, genome_dirname)
    if not os.path.exists(extracted_directory):
        raise ValueError("ERROR: Could not find the extracted directory in the destination directory:" + \
            "\n\t{:s}".format(cannonical_destination))

    # extract_archive() leaves writing the success file to us, so write it
    # now that the extracted directory has been located.
    extraction_success_filename = "{:s}.{:s}".format(os.path.basename(archive_filepath), _ExtractionSuccessFile)
    extraction_success_full_file_path = os.path.join(cannonical_destination, extraction_success_filename)
    create_success_file(extraction_success_full_file_path, \
        "Extraction of:\n\t{:s}\n".format(archive_filepath) + \
        "to:\n\t{:s}\nsucceeded.".format(extracted_directory))

    if not keep_archive:
        # We are done extracting, so remove the archive file, unless it
        # was already removed previously.
        if os.path.exists(archive_filepath):
            print("Removing the archive file:\n\t{:s}".format(archive_filepath))
            sys.stdout.flush()
            os.remove(archive_filepath)
    return extracted_directory
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
703
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
def get_gmap_success_filename(genome_build_directory):
    """Return the file name (not a full path) of the gmap success marker.

    The name is "<genome name>.<_GmapSuccessFile>". The genome name comes
    from find_genome_name_in_path(); when that returns None the basename
    of genome_build_directory is used instead.
    """
    found_name = find_genome_name_in_path(genome_build_directory)
    prefix = os.path.basename(genome_build_directory) if found_name is None else found_name
    return "{:s}.{:s}".format(prefix, _GmapSuccessFile)
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
709
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
def gmap_the_library(genome_build_directory, force_new_gmap=False):
    """Run gmap_build on a genome library, as needed for gmap-fusion.

    genome_build_directory should normally be a fully specified path,
    though a relative one is expected to work as well. A success marker
    file in that directory records a previous successful run; when it is
    present gmap_build is not run again unless force_new_gmap is true.

    gmap_build writes messages to stderr even when nothing went wrong, so
    the command routes stderr to stdout; otherwise galaxy thinks an error
    occurred. A failing command is reported and the
    subprocess.CalledProcessError is re-raised.

    Returns None.
    """
    marker_name = get_gmap_success_filename(genome_build_directory)
    marker_path = os.path.join(genome_build_directory, marker_name)
    marker_present = marker_name in set(os.listdir(genome_build_directory))

    if marker_present and not force_new_gmap:
        print("The gmap success file exists, so no gmap is being attempted:")
        print("\t{:s}".format(marker_path))
        print("Remove the file or set <force new gmap> if you want a new gmap to occur.")
        sys.stdout.flush()
        return

    if marker_present:
        # A forced rerun invalidates the old marker until gmap succeeds again.
        os.remove(marker_path)

    command = "gmap_build -D {:s}/ -d ref_genome.fa.gmap -k 13 {:s}/ref_genome.fa 2>&1".format( \
        genome_build_directory, genome_build_directory)
    try:
        subprocess.check_call(command, shell=True)
    except subprocess.CalledProcessError:
        print("ERROR: While trying to run the gmap_build command on the library:\n\t{:s}".format(command))
        sys.stdout.flush()
        raise
    finally:
        sys.stdout.flush()
        # Directory listing to help diagnose problems, printed on success
        # and on failure alike.
        print("\n*******************************\nAfter running gmap_build.")
        sys.stdout.flush()
        print_directory_contents(genome_build_directory, 2)
        print("*******************************\n")
        sys.stdout.flush()

    # Only reached when gmap_build exited successfully.
    create_success_file(marker_path, \
        "gmap of:\n\t{:s}\nsucceeded.".format(genome_build_directory))
    sys.stdout.flush()
    return
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
756
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
757
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
def build_the_library(genome_source_directory, \
                      genome_build_directory, force_new_build=False, \
                      gmap_build=False, force_gmap_build=False):
    """Build a CTAT Genome Resource Library from source data.

    genome_source_directory  Location of the source data needed to build
                             the library. Normally fully specified, but
                             may be relative to the current directory.
    genome_build_directory   Location where the library will be built;
                             relative or absolute.
    force_new_build          Run prep_genome_lib.pl even if a build
                             success file shows it was run before.
    gmap_build               Whether a gmap build is wanted as well.
    force_gmap_build         Passed to gmap_the_library() when the library
                             was built earlier and only gmap may be needed.

    The build success file is written into the source directory. Note that
    the current working directory is changed to the source directory when
    a build is performed.

    Raises ValueError when the source directory does not exist, and
    re-raises subprocess.CalledProcessError when prep_genome_lib.pl fails.

    Before FusionFilter 0.5.0 the build was done by running
    prep_genome_lib.pl (with --blast_pairs and --fusion_annot_lib),
    index_pfam_domain_info.pl and gmap_build as separate steps.
    """

    if (genome_source_directory is None) or (genome_source_directory == "" ) \
            or not os.path.exists(genome_source_directory):
        raise ValueError("Cannot build the CTAT Genome Resource Library. " + \
            "The source directory does not exist:\n\t{:s}".format(str(genome_source_directory)))
    # The working directory is changed below, so take an absolute form of
    # the source directory now; a relative path would no longer point at
    # the same place after the chdir.
    source_dir = os.path.abspath(genome_source_directory)
    # NOTE(review): cannonical_destination is assumed to be usable after the
    # chdir as well (i.e. absolute) - confirm against
    # ensure_we_can_write_numbytes_to().
    cannonical_destination = ensure_we_can_write_numbytes_to(genome_build_directory, \
        bytes_needed_to_build(genome_source_directory))
    print("Building the CTAT Genome Resource Library from source data at:\n\t{:s}".format(str(genome_source_directory)))
    print("The Destination directory is at:\n\t{:s}".format(str(cannonical_destination)))
    sys.stdout.flush()

    # Get the root filename of the Genome Directory.
    src_filename = os.path.basename(genome_source_directory)
    # See whether the library has been built already.
    files_in_sourcedir = set(os.listdir(source_dir))
    build_success_filename = "{:s}.{:s}".format(src_filename, _LibBuiltSuccessFile)
    build_success_file_path = os.path.join(source_dir, build_success_filename)
    previously_built = build_success_filename in files_in_sourcedir

    if (not previously_built) or force_new_build:
        # The command below names its input files relative to the source directory.
        os.chdir(source_dir)
        if previously_built:
            # Since we are redoing the build, the success file needs to be
            # removed until the build has succeeded.
            os.remove(build_success_file_path)
        # Create the command that builds the Genome Resource Library from the source data.
        command = "prep_genome_lib.pl --genome_fa ref_genome.fa --gtf ref_annot.gtf " + \
                  "--pfam_db PFAM.domtblout.dat.gz " + \
                  "--output_dir {:s} ".format(cannonical_destination)
        # At the time this was written, the filename was CTAT_HumanFusionLib.v0.1.0.dat.gz
        # Only the prefix is checked, in case other versions are used later.
        # There should be only one such file; if there are several, iterating
        # in sorted order makes the alphabetically later one win.
        filename_of_HumanFusionLib = None
        for filename in sorted(os.listdir(source_dir)):
            if filename.split(".")[0] == _CTAT_HumanFusionLib_FilenamePrefix:
                filename_of_HumanFusionLib = filename
        if filename_of_HumanFusionLib is not None:
            # The mouse genomes do not have a fusion_annot_lib
            # so only add the following for Human genomes.
            command += "--fusion_annot_lib {:s} ".format(filename_of_HumanFusionLib) + \
                       "--annot_filter_rule AnnotFilterRule.pm "
        if gmap_build:
            command += "--gmap_build "
        # Send stderr of the command to stdout, because some functions may write
        # to stderr even though no error has occurred. The command's return code
        # is what tells us whether an error occurred.
        command += " 2>&1"
        print("About to run the following command:\n\t{:s}".format(command))
        sys.stdout.flush()
        try:
            subprocess.check_call(command, shell=True)
        except subprocess.CalledProcessError:
            print("ERROR: While trying to run the prep_genome_lib.pl command " + \
                "on the CTAT Genome Resource Library:\n\t{:s}".format(command))
            raise
        finally:
            # Directory listings to help diagnose problems, printed on
            # success and on failure alike.
            print("\n*******************************")
            print("Contents of Genome Source Directory {:s}:".format(genome_source_directory))
            sys.stdout.flush()
            print_directory_contents(source_dir, 2)
            print("\nContents of Genome Build Directory {:s}:".format(cannonical_destination))
            sys.stdout.flush()
            print_directory_contents(cannonical_destination, 2)
            print("*******************************\n")
            sys.stdout.flush()
        # Only reached when the build command exited successfully.
        create_success_file(build_success_file_path, \
            "Build of:\n\t{:s}\n".format(genome_source_directory) + \
            "to:\n\t{:s}\nsucceeded.".format(cannonical_destination))
        if gmap_build:
            # The build command included --gmap_build, so record the gmap success too.
            gmap_success_filename = get_gmap_success_filename(cannonical_destination)
            gmap_success_full_file_path = os.path.join(cannonical_destination, gmap_success_filename)
            create_success_file(gmap_success_full_file_path, \
                "gmap of:\n\t{:s}\nsucceeded.".format(cannonical_destination))
    else:
        print("The build success file exists, so no build is being attempted:")
        print("\t{:s}".format(build_success_file_path))
        print("Remove the file or set <force new build> if you want a new build to occur.")
        # We might still need to do a gmap_build.
        if gmap_build:
            print("Checking if we need to gmap the library.")
            sys.stdout.flush()
            # gmap_the_library creates a gmap success file if it succeeds.
            gmap_the_library(cannonical_destination, force_gmap_build)
            sys.stdout.flush()
    sys.stdout.flush()
    return
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
871 # End of build_the_library()
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
872
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
def find_path_to_mutation_lib_integration():
    """Return the expected path of the ctat-mutation-lib-integration.py script.

    The lookup starts from the location of the ``ctat_mutations`` command
    (as reported by ``which``), goes two directory levels up, and then looks
    in the ``share`` directory found there for an entry whose name contains
    "ctat-mutations". The returned path is
    ``<share>/<entry>/mutation_lib_prep/ctat-mutation-lib-integration.py``.
    The path is only constructed; its existence is not checked here.

    NOTE(review): this layout assumes we are running inside a conda
    environment where ctat_mutations lives in ``<env>/bin`` -- confirm.

    Raises:
        ValueError: if ``which`` yields None or an empty string, or if no
            entry of the share directory contains "ctat-mutations".
    """
    ctat_mutations_exe = which("ctat_mutations")
    if ctat_mutations_exe is None or ctat_mutations_exe == "":
        raise ValueError("Unable to find ctat_mutations, which is required to do mutation resource processing.")

    # <root>/bin/ctat_mutations  ->  <root>/bin  ->  <root>  ->  <root>/share
    bin_dir = os.path.dirname(ctat_mutations_exe)
    share_dir = os.path.join(os.path.dirname(bin_dir), "share")

    candidates = [entry for entry in os.listdir(share_dir) if "ctat-mutations" in entry]
    if not candidates:
        location_hint = "It should be in the share directory:\n\t{:s}.".format(share_dir)
        raise ValueError("Unable to find the home of ctat_mutations.\n" + location_hint)

    # When several entries match, the last one listed wins (listing order is
    # whatever os.listdir returns).
    return os.path.join(share_dir,
                        candidates[-1],
                        "mutation_lib_prep",
                        "ctat-mutation-lib-integration.py")
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
895
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
def find_path_to_picard_home():
    """Return the directory to use as PICARD_HOME.

    Two strategies are tried, in order:

    1. Read the ``ctat_mutations`` file located by ``which`` line by line and
       take the value from any line containing both "export" and
       "PICARD_HOME=" (the last such line wins). Surrounding whitespace and
       double quotes are stripped from the value.
       NOTE(review): this presumes ctat_mutations is a text wrapper script
       that exports PICARD_HOME -- confirm against the installed package.
    2. If that produced nothing, look in ``<two levels up>/share`` for an
       entry whose name contains "picard" (the last one listed wins).

    Raises:
        ValueError: if ctat_mutations cannot be located, or if neither
            strategy yields a non-empty value.
    """
    picard_home = None
    path_to_ctat_mutations = which("ctat_mutations")
    if (path_to_ctat_mutations is None) or (path_to_ctat_mutations == ""):
        raise ValueError("Unable to find ctat_mutations, which is required to do mutation resources processing.")

    # Use a context manager so the file handle is closed even if parsing fails.
    with open(path_to_ctat_mutations, "r") as ctat_mutations_file:
        for line in ctat_mutations_file:
            if ("export" in line) and ("PICARD_HOME=" in line):
                # Split only on the first "=" so a value that itself contains
                # "=" is not truncated. Then strip the trailing newline,
                # optional enclosing double quotes, and any whitespace that
                # was inside those quotes.
                picard_home = line.split("=", 1)[1].strip().strip('"').strip()

    if (picard_home is None) or (picard_home == ""):
        # We didn't find it in the ctat_mutations file. Search for it.
        conda_root_dir = os.path.dirname(os.path.dirname(path_to_ctat_mutations))
        share_dir = os.path.join(conda_root_dir, "share")
        for filename in os.listdir(share_dir):
            if "picard" in filename:
                picard_home = os.path.join(share_dir, filename)
        if (picard_home is None) or (picard_home == ""):
            raise ValueError("Unable to find PICARD_HOME.\n" +
                             "It should be in the share directory:\n\t{:s}.".format(share_dir))
    return picard_home
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
920
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
def download_and_integrate_mutation_resources(source_url, genome_build_directory, cosmic_resources_location=None,
                                              force_new_download=False, force_new_integration=False):
    """Download a mutation resources archive and integrate it into a genome library.

    Parameters:
        source_url: URL of the mutation resources archive to download. If it
            has no scheme (no leading "https:" or similar) it is treated as a
            name relative to _CTAT_Mutation_URL.
        genome_build_directory: directory into which the archive is
            downloaded and against which the integration is run. It is
            assumed that this function is only called with a valid directory.
        cosmic_resources_location: directory holding the two COSMIC files
            (_COSMIC_Mutant_Filename and _COSMIC_Coding_Filename). When None
            or "", the files are expected in the destination directory.
        force_new_download: when True, download again even if the download
            success file already exists.
        force_new_integration: when True, integrate again even if the
            integration success file already exists.

    Raises:
        IOError: if either COSMIC file is missing. Per the original author's
            note, download_file_from_url also raises IOError on failure.
        ValueError: propagated from the PICARD_HOME / integration script
            lookups when ctat_mutations cannot be found.
        subprocess.CalledProcessError: if the integration command fails.

    Background, from
    https://github.com/NCIP/ctat-mutations/tree/master/mutation_lib_prep

    Step 1 (after CTAT Genome Resource Library is built)
        download mutation_lib.hg38.tar.gz into GRCh38_v27_CTAT_lib_Feb092018
        or
        download mutation_lib.hg19.tar.gz into GRCh37_v19_CTAT_lib_Feb092018
        or
        download mc-7.tar.gz into Mouse_M16_CTAT_lib_Feb202018
        (Need to ask about support for mouse, since there is no info about
        Cosmic mouse genome files in the instructions.)

    Step 2: Cosmic files download - User must perform this step prior to
    running this code. We check if the files are present.

        Next download COSMIC resources required in this directory. Depending
        on the version of genome you need you can install either COSMIC's
        hg38 or COSMIC's hg19. You will need to download 2 sets of files:
        COSMIC Mutation Data (CosmicMutantExport.tsv.gz) and COSMIC Coding
        Mutation VCF File (CosmicCodingMuts.vcf.gz). Please note, for download
        to succeed you will need to register and login to their service.

        Open question: is there a way the user can give their credentials
        through the Data Manager interface as a part of specifying Mutation
        parameters, so they can be used programmatically to download the
        files? Or should the interface instead carry instructions for the user
        to download the files and then specify the absolute path to them?

    Step 3: Mutation lib integration

        Once you have downloaded CosmicMutantExport.tsv.gz AND
        CosmicCodingMuts.vcf.gz (hg38 or hg19), proceed with the mutation lib
        integration step which will integrate the mutation resource with
        CTAT_GENOME_LIB (this corresponds to "GRCh37_v19_CTAT_lib_Feb092018"
        or "GRCh38_v27_CTAT_lib_Feb092018" downloaded in Step 1). You will
        find this script in the ctat-mutations repo in the 'src' directory.

        #Keep Picard in PICARD_HOME environmental variable like so
        export PICARD_HOME=/path/to/picard

        #Integrate CTAT mutations lib with CTAT genome library
        python ctat-mutations/mutation_lib_prep/ctat-mutation-lib-integration.py
             --CosmicMutantExport CosmicMutantExport.tsv.gz
             --CosmicCodingMuts CosmicCodingMuts.vcf.gz
             --genome_lib_dir GRCh37_v19_CTAT_lib_Feb092018/ # OR GRCh38_v27_CTAT_lib_Feb092018/

        Now you are all set to run the ctat-mutations pipeline
    """
    print("\n***********************************")
    print("* Integrating Mutation Resources. *")
    print("***********************************\n")
    sys.stdout.flush()
    # It is assumed that this procedure is only called with a valid genome_build_directory.
    url_parts = urlparse.urlparse(source_url)
    source_filename = os.path.basename(url_parts.path)
    if url_parts.scheme == "":
        # Then we were given a source_url without a leading https: or similar.
        # Assume we only were given the filename and that it exists at _CTAT_Mutation_URL.
        source_url = urlparse.urljoin(_CTAT_Mutation_URL, source_url)
    # FIX - We might want to otherwise check if we have a valid url and/or if we can reach it.
    cannonical_destination = ensure_we_can_write_numbytes_to(genome_build_directory,
                                                             _NumBytesNeededForMutationResources)
    print("Download and Integrate a Mutation Resource Archive.")
    print("The source URL is:\n\t{:s}".format(str(source_url)))
    print("The destination is:\n\t{:s}".format(str(cannonical_destination)))
    sys.stdout.flush()
    # Snapshot the directory listing once, up front. It is used to detect a
    # previous download or integration; it is intentionally NOT refreshed
    # after the download step.
    orig_files_in_destdir = set(os.listdir(cannonical_destination))

    # DOWNLOAD SECTION
    # See whether the archive has been downloaded already.
    download_success_file = "{:s}.{:s}".format(source_filename, _MutationDownloadSuccessFile)
    download_success_file_path = os.path.join(cannonical_destination, download_success_file)
    if (download_success_file not in orig_files_in_destdir) or force_new_download:
        # DO THE DOWNLOAD
        if download_success_file in orig_files_in_destdir:
            # Since we are redoing the download,
            # the success file needs to be removed
            # until the download has succeeded.
            os.remove(download_success_file_path)
        # The following raises an IOError if the download fails for some reason.
        # A forced download starts from scratch rather than resuming.
        download_file_from_url(source_url, cannonical_destination,
                               resume_download=(not force_new_download))
        create_success_file(download_success_file_path,
                            "Download of the mutation resource archive:\n\t{:s}\n".format(source_url) +
                            "to:\n\t{:s}\nsucceeded.".format(cannonical_destination))
    else:
        # The success file exists and no new download was requested.
        print("The download success file exists, so no download is being attempted:")
        print("\t{:s}".format(download_success_file_path))
        print("Remove the file or set <new_mutation_download> if you want a new download to occur.")
    sys.stdout.flush()

    # INTEGRATION SECTION
    integration_success_file = "{:s}.{:s}".format(source_filename, _MutationIntegrationSuccessFile)
    integration_success_file_path = os.path.join(cannonical_destination, integration_success_file)
    if (integration_success_file not in orig_files_in_destdir) or force_new_integration:
        # INTEGRATE THE LIBRARY
        if integration_success_file in orig_files_in_destdir:
            # Since we are redoing the integration,
            # the success file needs to be removed
            # until the integration has succeeded.
            os.remove(integration_success_file_path)
        mutation_lib_dirpath = os.path.join(cannonical_destination, _CTAT_MutationLibDirname)
        # If we do not remove the directory, then the old files will exist and a new integration does not occur.
        # Also, with the Cosmic files, when the integrated file is created, if there is a previous one, gzip
        # asks a question of the user, and this program is not prepared to respond to a question from a subprocess:
        #   [bgzip] /path/to/ctat_mutation_lib/cosmic.vcf.gz already exists; do you wish to overwrite (y or n)?
        if os.path.exists(mutation_lib_dirpath):
            shutil.rmtree(mutation_lib_dirpath)
        # Check for Cosmic resources. User has to place these files into the correct location.
        if (cosmic_resources_location is None) or (cosmic_resources_location == ""):
            cosmic_resources_loc_full_path = cannonical_destination
            end_err_msg = "These files must be placed into:\n\t{:s}".format(cosmic_resources_loc_full_path)
        else:
            cosmic_resources_loc_full_path = os.path.realpath(cosmic_resources_location)
            end_err_msg = "This function was told they would be placed into:\n\t{:s}".format(
                cosmic_resources_loc_full_path)
        cosmic_mutant_full_path = os.path.join(cosmic_resources_loc_full_path, _COSMIC_Mutant_Filename)
        cosmic_coding_full_path = os.path.join(cosmic_resources_loc_full_path, _COSMIC_Coding_Filename)
        if not (os.path.exists(cosmic_mutant_full_path) and os.path.exists(cosmic_coding_full_path)):
            raise IOError("Either one or both of Cosmic Resources are missing:\n\t" +
                          "{:s}\nand/or\n\t{:s}\n".format(cosmic_mutant_full_path, cosmic_coding_full_path) +
                          "Unable to integrate mutation resources.\n{:s}".format(end_err_msg))
        # Create the integration command. We also must define PICARD_HOME for the command to work.
        picard_home = find_path_to_picard_home()
        integration_command = find_path_to_mutation_lib_integration()
        # NOTE(review): the command is a shell string, so paths containing
        # spaces or shell metacharacters will break it. It is left as-is
        # because the PICARD_HOME value read from the ctat_mutations script
        # may rely on shell expansion -- confirm before switching to an
        # argument list.
        command = "export PICARD_HOME={:s} && python {:s} ".format(picard_home, integration_command) + \
                  "--CosmicMutantExport {:s} ".format(cosmic_mutant_full_path) + \
                  "--CosmicCodingMuts {:s} ".format(cosmic_coding_full_path) + \
                  "--genome_lib_dir {:s}".format(cannonical_destination)
        try:  # to send the ctat-mutation-lib-integration command.
            subprocess.check_call(command, shell=True)
        except subprocess.CalledProcessError:
            print("ERROR: While trying to integrate the mutation resources:\n\t{:s}".format(command))
            sys.stdout.flush()
            raise
        finally:
            # Some code to help us if errors occur: always show what ended up
            # in the destination, whether or not the command succeeded.
            print("\n*********************************************************")
            print("* After download and integration of Mutation Resources. *")
            sys.stdout.flush()
            print_directory_contents(cannonical_destination, 2)
            print("*********************************************************\n")
            sys.stdout.flush()
        # Only reached when the integration command succeeded.
        create_success_file(integration_success_file_path,
                            "Download and integration of mutation resources:\n\t{:s}\n".format(source_url) +
                            "to:\n\t{:s}\nsucceeded.".format(genome_build_directory))
    else:
        # The success file exists and no new integration was requested.
        print("The mutation resources integration success file exists, so no integration is being attempted:")
        print("\t{:s}".format(integration_success_file_path))
        print("Remove the file or set <new_mutation_integration> if you want a new integration to occur.")
    sys.stdout.flush()
    return
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1068
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1069 def search_for_genome_build_dir(top_dir_path):
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1070 # If we do not download the directory, the topdir_path could be the
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1071 # location of the genome resource library, but we also want to allow the
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1072 # user to give the same value for top_dir_path that they do when a
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1073 # build happens, so we need to handle all three cases:
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1074 # 1) Is the top_dir_path the build directory,
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1075 # 2) or is it inside of the given directory,
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1076 # 3) or is it inside a subdirectory of the given directory.
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1077 # The source_data downloads are built to a directory named _CTAT_Build_dirname,
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1078 # and the plug-n-play downloads contain a sub-directory named _CTAT_Build_dirname.
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1079 # We also look for the genome name and return that, if we find it in the
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1080 # directory name of the directory holding the build directory.
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1081 top_dir_full_path = os.path.realpath(top_dir_path)
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1082 genome_build_directory = None
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1083 genome_name_from_dirname = None
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1084 print_warning = False
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1085
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1086 if not os.path.exists(top_dir_full_path):
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1087 raise ValueError("Cannot find the CTAT Genome Resource Library. " + \
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1088 "The given directory does not exist:\n\t{:s}".format(top_dir_full_path))
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1089 elif not os.path.isdir(top_dir_full_path):
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1090 raise ValueError("Cannot find the CTAT Genome Resource Library. " + \
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1091 "The given directory is not a directory:\n\t{:s}".format(top_dir_full_path))
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1092 if os.path.basename(top_dir_full_path) == _CTAT_Build_dirname:
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1093 print "Build directory is: {:s}".format(top_dir_full_path)
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1094 sys.stdout.flush()
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1095 # The top_dir_path is the path to the genome_build_directory.
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1096 genome_build_directory = top_dir_full_path
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1097 else:
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1098 # Look for it inside of the top_dir_path directory.
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1099 print "Looking inside of: {:s}".format(top_dir_full_path)
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1100 sys.stdout.flush()
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1101 top_dir_contents = os.listdir(top_dir_full_path)
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1102 if (_CTAT_Build_dirname in top_dir_contents):
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1103 # The genome_build_directory is inside of the top_dir_path directory.
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1104 print "1. Found it."
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1105 sys.stdout.flush()
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1106 genome_build_directory = "{:s}/{:s}".format(top_dir_full_path,_CTAT_Build_dirname)
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1107 else:
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1108 # Find all subdirectories containing the _CTAT_Build_dirname or the _CTAT_RefGenome_Filename.
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1109 # Look down the directory tree two levels.
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1110 build_dirs_in_subdirs = list()
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1111 subdirs_with_genome_files = list()
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1112 build_dirs_in_sub_subdirs = list()
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1113 sub_subdirs_with_genome_files = list()
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1114 subdirs = [entry for entry in top_dir_contents if (os.path.isdir("{:s}/{:s}".format(top_dir_full_path,entry)))]
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1115 for subdir in subdirs:
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1116 subdir_path = "{:s}/{:s}".format(top_dir_full_path, subdir)
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1117 subdir_path_contents = os.listdir(subdir_path)
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1118 # print "Is it one of:\n\t" + "\n\t".join(subdir_path_contents)
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1119 if (_CTAT_Build_dirname in subdir_path_contents):
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1120 # The genome_build_directory is inside of the subdir_path directory.
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1121 print "2a, Found one."
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1122 build_dirs_in_subdirs.append("{:s}/{:s}".format(subdir_path, _CTAT_Build_dirname))
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1123 if (_CTAT_RefGenome_Filename in subdir_path_contents):
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1124 subdirs_with_genome_files.append(subdir_path)
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1125 # Since we are already looping, loop through all dirs one level deeper as well.
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1126 sub_subdirs = [entry for entry in subdir_path_contents if (os.path.isdir("{:s}/{:s}".format(subdir_path,entry)))]
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1127 for sub_subdir in sub_subdirs:
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1128 sub_subdir_path = "{:s}/{:s}".format(subdir_path, sub_subdir)
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1129 sub_subdir_path_contents = os.listdir(sub_subdir_path)
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1130 # print "Is it one of:\n\t" + "\n\t".join(sub_subdir_path_contents)
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1131 if (_CTAT_Build_dirname in sub_subdir_path_contents):
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1132 # The genome_build_directory is inside of the sub_subdir_path directory.
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1133 print "3a. Found one."
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1134 build_dirs_in_sub_subdirs.append("{:s}/{:s}".format(sub_subdir_path, _CTAT_Build_dirname))
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1135 if (_CTAT_RefGenome_Filename in sub_subdir_path_contents):
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1136 sub_subdirs_with_genome_files.append(sub_subdir_path)
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1137 # Hopefully there is one and only one found build directory.
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1138 # If none are found we check for a directory containing the genome reference file,
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1139 # but the build process sometimes causes more than one directory to have a copy,
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1140 # so finding that file is not a sure thing.
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1141 if (len(build_dirs_in_subdirs) + len(build_dirs_in_sub_subdirs)) > 1:
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1142 print "\n***************************************"
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1143 print "Found multiple CTAT Genome Resource Libraries " + \
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1144 "in the given directory:\n\t{:s}".format(top_dir_full_path)
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1145 sys.stdout.flush()
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1146 print_directory_contents(top_dir_full_path, 2)
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1147 print "***************************************\n"
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1148 sys.stdout.flush()
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1149 raise ValueError("Found multiple CTAT Genome Resource Libraries " + \
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1150 "in the given directory:\n\t{:s}".format(top_dir_full_path))
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1151 elif len(build_dirs_in_subdirs) == 1:
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1152 # The genome_build_directory is inside of the subdir_path directory.
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1153 print "2b, Found it."
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1154 sys.stdout.flush()
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1155 genome_build_directory = build_dirs_in_subdirs[0]
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1156 elif len(build_dirs_in_sub_subdirs) == 1:
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1157 # The genome_build_directory is inside of the sub_subdir_path directory.
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1158 print "3b, Found it."
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1159 sys.stdout.flush()
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1160 genome_build_directory = build_dirs_in_sub_subdirs[0]
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1161 elif (len(sub_subdirs_with_genome_files) + len(subdirs_with_genome_files)) > 1:
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1162 print "\n***************************************"
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1163 print "Unable to find CTAT Genome Resource Library " + \
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1164 "in the given directory:\n\t{:s}".format(top_dir_full_path)
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1165 print "And multiple directories contain {:s}".format(_CTAT_RefGenome_Filename)
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1166 sys.stdout.flush()
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1167 print_directory_contents(top_dir_full_path, 2)
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1168 print "***************************************\n"
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1169 sys.stdout.flush()
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1170 raise ValueError("Unable to find CTAT Genome Resource Library " + \
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1171 "in the given directory:\n\t{:s}".format(top_dir_full_path))
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1172 elif (len(sub_subdirs_with_genome_files) == 1):
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1173 print "3c, Maybe found it."
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1174 sys.stdout.flush()
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1175 genome_build_directory = sub_subdirs_with_genome_files[0]
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1176 print_warning = True
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1177 elif (len(subdirs_with_genome_files) == 1):
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1178 print "2c, Maybe found it."
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1179 sys.stdout.flush()
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1180 genome_build_directory = subdirs_with_genome_files[0]
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1181 print_warning = True
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1182 elif (_CTAT_RefGenome_Filename in top_dir_contents):
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1183 print "1c. Maybe found it."
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1184 sys.stdout.flush()
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1185 genome_build_directory = top_dir_full_path
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1186 print_warning = True
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1187 else:
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1188 print "\n***************************************"
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1189 print "Unable to find CTAT Genome Resource Library " + \
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1190 "in the given directory:\n\t{:s}".format(top_dir_full_path)
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1191 sys.stdout.flush()
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1192 print_directory_contents(top_dir_full_path, 2)
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1193 print "***************************************\n"
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1194 sys.stdout.flush()
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1195 raise ValueError("Unable to find CTAT Genome Resource Library " + \
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1196 "in the given directory:\n\t{:s}".format(top_dir_full_path))
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1197 # end else
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1198 # Check if the CTAT Genome Resource Lib has anything in it (and specifically ref_genome.fa).
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1199 if (genome_build_directory is None):
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1200 print "\n***************************************"
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1201 print "Cannot find the CTAT Genome Resource Library " + \
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1202 "in the given directory:\n\t{:s}".format(top_dir_full_path)
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1203 sys.stdout.flush()
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1204 print_directory_contents(top_dir_full_path, 2)
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1205 print "***************************************\n"
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1206 sys.stdout.flush()
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1207 raise ValueError("Cannot find the CTAT Genome Resource Library " + \
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1208 "in the given directory:\n\t{:s}".format(top_dir_full_path))
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1209 else:
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1210 if (_CTAT_RefGenome_Filename not in os.listdir(genome_build_directory)):
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1211 print "\n***************************************"
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1212 print "\nWARNING: Cannot find Genome Reference file {:s} ".format(_CTAT_RefGenome_Filename) + \
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1213 "in the genome build directory:\n\t{:s}".format(genome_build_directory)
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1214 sys.stdout.flush()
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1215 print_directory_contents(genome_build_directory, 2)
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1216 print "***************************************\n"
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1217 sys.stdout.flush()
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1218 if print_warning and genome_build_directory:
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1219 print "\n***************************************"
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1220 print "\nWARNING: Cannot find the CTAT Genome Resource Library, " + \
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1221 "but found a {:s} file, so set its directory as the library.".format(_CTAT_RefGenome_Filename)
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1222 print "This my not be the correct directory:\n\t{:s}".format(genome_build_directory)
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1223 sys.stdout.flush()
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1224 print_directory_contents(genome_build_directory, 2)
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1225 print "***************************************\n"
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1226 sys.stdout.flush()
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1227 return genome_build_directory
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1228
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
def build_directory_from_build_location(src_filename, build_location):
    """Derive the genome build directory path from a user-given build location.

    The genome directory name is obtained by calling
    find_genome_name_in_path(src_filename).  The result then depends on the
    last path component of build_location:

      * it equals the genome directory name:
            <build_location>/<_CTAT_Build_dirname>
      * it equals _CTAT_Build_dirname:
            <build_location>  (returned unchanged)
      * anything else:
            <build_location>/<genome directory name>/<_CTAT_Build_dirname>

    Arguments:
        src_filename   -- path passed through to find_genome_name_in_path().
                          NOTE(review): despite the name, the callers visible
                          in main() pass the source data *directory*.
        build_location -- path given by the --build_location argument.

    Returns the build directory path as a string.  Nothing is created on disk.
    """
    genome_dir_name = find_genome_name_in_path(src_filename)
    # os.path.basename() returns '' for a path that ends with a separator
    # (unlike the Unix basename program), so "/x/genome/" would match neither
    # comparison below and would be nested one level too deep.  Compare against
    # the last real component instead.  The returned paths are still built from
    # the build_location exactly as given.
    last_component = os.path.basename(build_location.rstrip(os.sep))
    if last_component == genome_dir_name:
        # build_location is the genome directory; the build dir goes inside it.
        return os.path.join(build_location, _CTAT_Build_dirname)
    if last_component == _CTAT_Build_dirname:
        # build_location already names the build directory itself.
        return build_location
    # build_location is some parent directory; add both levels beneath it.
    return os.path.join(build_location, genome_dir_name, _CTAT_Build_dirname)
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1239
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1240 def main():
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1241 #Parse Command Line. There are three basic ways to use this tool.
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1242 # 1) Download and Build the CTAT Genome Resource Library from an archive.
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1243 # 2) Build the library from source data files that are already downloaded.
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1244 # 3) Specify the location of an already built library.
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1245 # Any of these methods can incorporate or be followed by a gmap build.
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1246 # Any of these methods can be followed by a mutation resources download and/or integration.
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1247 # Choose arguments for only one method.
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1248 # Do not use arguments in a mixed manner. I am not writing code to handle that at this time.
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1249 parser = argparse.ArgumentParser()
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1250 # Arguments for all methods:
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1251 parser.add_argument('-o', '--output_filename', \
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1252 help='Name of the output file, where the json dictionary will be written.')
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1253 parser.add_argument('-y', '--display_name',
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1254 default='', \
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1255 help='Is used as the display name for the entry of this Genome Resource Library in the data table.')
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1256 parser.add_argument('-g', '--gmap_build', \
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1257 help='Will do a gmap_build on the Genome Resource Library, if it has not previously been gmapped.',
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1258 action='store_true')
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1259 parser.add_argument('-f', '--force_gmap_build', \
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1260 help='Will force gmap_build of the Genome Resource Library, even if previously gmapped.',
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1261 action='store_true')
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1262 parser.add_argument('-m', '--download_mutation_resources_url',
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1263 default='', \
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1264 help='Value should be the url of the zipped up mutation resources. ' + \
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1265 'These are located at: https://data.broadinstitute.org/Trinity/CTAT/mutation/.' + \
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1266 'Will download mutation resources and integrate them into the Genome Resource Library.' + \
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1267 'Cosmic resources must previously have beeen downloaded (https://cancer.sanger.ac.uk/cosmic/download).' + \
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1268 'Cosmic resources can be placed directly into the Genome Resource Library ' + \
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1269 'or you can set the --cosmic_resources_location argument.' + \
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1270 'See https://github.com/NCIP/ctat-mutations/tree/no_sciedpiper/mutation_lib_prep for more info. ' + \
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1271 'If a previous download and integration was not completed, ' + \
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1272 'calling with this option set will attempt to finish the integration.')
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1273 parser.add_argument('-l', '--new_mutation_download', \
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1274 help='Forces the mutation resources to be downloaded, ' + \
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1275 'even if previously downloaded into this Genome Resource Library.',
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1276 action='store_true')
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1277 parser.add_argument('-i', '--new_mutation_integration', \
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1278 help='Forces the mutation resources to be integrated, ' + \
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1279 'even if previously integrated into this Genome Resource Library.',
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1280 action='store_true')
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1281 parser.add_argument('-c', '--cosmic_resources_location',
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1282 default='', \
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1283 help='Specify a non-default location where the Cosmic files reside. ' + \
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1284 'Normally they are assumed to reside in the build directory, ' + \
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1285 'but if that directory has not been created yet when this program ' + \
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1286 'is called, you can specify the full path to the directory where they reside.')
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1287 # Method 1) arguments - Download and Build.
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1288 # - One can optionally utilize --build_location argument with this group of arguments.
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1289 download_and_build_args = parser.add_argument_group('Download and Build arguments')
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1290 download_and_build_args.add_argument('-u', '--download_url',
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1291 default='', \
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1292 help='This is the url of an archive file containing the library files. ' + \
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1293 'These are located at https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/. ' + \
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1294 'Works with both source-data and plug-n-play archives.')
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1295 download_and_build_args.add_argument('-d', '--download_location',
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1296 default='', \
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1297 help='Full path of the CTAT Resource Library download location, where the download will be placed. ' + \
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1298 'If the archive file has already had been successfully downloaded, ' + \
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1299 'it will only be downloaded again if --new_archive_download is selected. ' + \
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1300 'If --build_location is not set, then the archive will be built in place at the download_location. ' + \
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1301 'If a previous download and build was started but not completed at this or a specified build_location, ' + \
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1302 'calling with this and the previous option set, but not --new_archive_download, ' + \
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1303 'will attempt to finish the download and build.')
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1304 download_and_build_args.add_argument('-a', '--new_archive_download', \
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1305 help='Forces a new download (and build if needed) of the Genome Resource Library, ' + \
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1306 'even if previously downloaded and built.',
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1307 action='store_true')
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1308 download_and_build_args.add_argument('-k', '--keep_archive', \
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1309 help='The archive will not be deleted after it is extracted.',
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1310 action='store_true')
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1311 # Method 2) arguments - Specify source and build locations.
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1312 specify_source_and_build_args = parser.add_argument_group('Specify Source and Build locations arguments')
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1313 specify_source_and_build_args.add_argument('-s', '--source_location',
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1314 default='', \
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1315 help='Full path to the directory containing CTAT Resource Library source-data files ' + \
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1316 'or the full path to a CTAT Resource Library archive file (.tar.gz). ' + \
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1317 'If the --build_location option is not set, the reference library will be built in the source_location directory.' + \
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1318 'If a previous download and build was started but not completed at this location, ' + \
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1319 'calling with this option set, but not --new_library_build, ' + \
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1320 'will attempt to finish the build.')
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1321 specify_source_and_build_args.add_argument('-r', '--new_library_build', \
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1322 help='Forces build of the CTAT Genome Resource Library, even if previously built. ' + \
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1323 'The --source_location must be a source-data archive or directory, or this is a no-op.',
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1324 action='store_true')
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1325 # Method 3) arguments - Specify the location of a built library.
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1326 built_lib_location_arg = parser.add_argument_group('Specify location of built library arguments')
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1327 built_lib_location_arg.add_argument('-b', '--build_location',
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1328 default='', \
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1329 help='Full path to the location of a built CTAT Genome Resource Library, ' + \
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1330 'either where it is, or where it will be placed.')
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1331
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1332 args = parser.parse_args()
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1333
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1334 # All of the input parameters are written by default to the output file prior to
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1335 # this program being called.
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1336 # But I do not get input values from the json file, but rather from command line.
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1337 # Just leaving the following code as a comment, in case it might be useful to someone later.
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1338 # params = from_json_string(open(filename).read())
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1339 # target_directory = params['output_data'][0]['extra_files_path']
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1340 # os.mkdir(target_directory)
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1341
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1342 lib_was_built = False
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1343 extracted_directory = None
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1344 source_data_directory = None
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1345 genome_build_directory = None
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1346 download_url_is_set = (args.download_url is not None) and (args.download_url != "")
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1347 download_location_is_set = (args.download_location is not None) and (args.download_location != "")
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1348 source_location_is_set = (args.source_location is not None) and (args.source_location != "")
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1349 build_location_is_set = (args.build_location is not None) and (args.build_location != "")
46
f4f48007db67 Uploaded
trinity_ctat
parents: 45
diff changeset
1350
44
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1351 if download_url_is_set:
46
f4f48007db67 Uploaded
trinity_ctat
parents: 45
diff changeset
1352 print "The value of download_url argument is:\n\t{:s}".format(str(args.download_url))
f4f48007db67 Uploaded
trinity_ctat
parents: 45
diff changeset
1353 sys.stdout.flush()
44
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1354 if source_location_is_set:
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1355 raise ValueError("Argument --source_location cannot be used in combination with --download_url.")
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1356 if not download_location_is_set:
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1357 raise ValueError("Argument --download_url requires that --download_location be specified.")
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1358 downloaded_filename_full_path = \
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1359 download_genome_archive(source_url=args.download_url, \
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1360 destination=args.download_location, \
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1361 force_new_download=args.new_archive_download)
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1362 print "\nThe downloaded file is:\n\t{:s}.\n".format(str(downloaded_filename_full_path))
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1363 sys.stdout.flush()
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1364
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1365 if ctat_library_type(downloaded_filename_full_path) == _LIBTYPE_SOURCE_DATA:
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1366 print "It is source data."
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1367 sys.stdout.flush()
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1368 # If it is source_data, extract to download_location (the directory where the download was placed).
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1369 extracted_directory = extract_genome_file(archive_filepath=downloaded_filename_full_path, \
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1370 destination=args.download_location, \
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1371 force_new_extraction=args.new_archive_download, \
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1372 keep_archive=args.keep_archive)
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1373 source_data_directory = extracted_directory
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1374 if build_location_is_set:
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1375 genome_build_directory = build_directory_from_build_location(source_data_directory, args.build_location)
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1376 else:
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1377 # We will build within a subdirectory of the source_data_directory .
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1378 # The name of the build directory will be the default _CTAT_Build_dirname.
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1379 # This _CTAT_Build_dirname directory will not exist until the library is built.
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1380 genome_build_directory = os.path.join(source_data_directory, _CTAT_Build_dirname)
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1381
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1382 elif ctat_library_type(downloaded_filename_full_path) == _LIBTYPE_PLUG_N_PLAY:
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1383 print "It is plug-n-play data."
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1384 sys.stdout.flush()
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1385 if build_location_is_set:
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1386 # Extract to the build location. The library is already built.
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1387 extracted_directory = extract_genome_file(archive_filepath=downloaded_filename_full_path, \
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1388 destination=args.build_location, \
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1389 force_new_extraction=args.new_archive_download, \
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1390 keep_archive=args.keep_archive)
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1391 else:
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1392 # Extract to the download location.
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1393 extracted_directory = extract_genome_file(archive_filepath=downloaded_filename_full_path, \
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1394 destination=args.download_location, \
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1395 force_new_extraction=args.new_archive_download, \
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1396 keep_archive=args.keep_archive)
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1397 # There is no source_data_directory, so its value stays as None.
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1398
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1399 # Look for the build directory. It should be inside the extracted_directory
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1400 if len(os.listdir(extracted_directory)) == 1:
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1401 # Then that one file is a subdirectory that should be the build_directory.
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1402 # That is how the plug-n-play directories are structured.
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1403 subdir_filename = os.listdir(extracted_directory)[0]
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1404 genome_build_directory = os.path.join(extracted_directory, subdir_filename)
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1405 else:
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1406 # We need to search for the build directory, since there is more than one file.
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1407 genome_build_directory = search_for_genome_build_dir(extracted_directory)
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1408 else:
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1409 raise ValueError("Unexpected CTAT Library type. Neither plug-n-play nor source_data:\n\t" + \
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1410 "{:s}".format(downloaded_filename_full_path))
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1411 elif source_location_is_set:
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1412 # Then the user wants to build the directory from the source data.
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1413 source_data_directory = os.path.realpath(args.source_location)
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1414 print "\nThe program is being told that the source data is in:\n\t{:s}.\n".format(str(source_data_directory))
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1415 sys.stdout.flush()
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1416 if build_location_is_set:
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1417 genome_build_directory = build_directory_from_build_location(source_data_directory, args.build_location)
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1418 else:
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1419 # We will build within a subdirectory of the source_data_directory.
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1420 # The name of the build directory will be the default _CTAT_Build_dirname.
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1421 # This _CTAT_Build_dirname directory will not exist until the library is built.
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1422 genome_build_directory = os.path.join(source_data_directory, _CTAT_Build_dirname)
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1423 elif build_location_is_set:
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1424 genome_build_directory = args.build_location
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1425
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1426 if (genome_build_directory is None) or (genome_build_directory == ""):
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1427 raise ValueError("At least one of --download_url, --source_location, or --build_location must be specified.")
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1428
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1429 print "\nThe location where the CTAT Genome Resource Library exists " + \
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1430 "or will be built is {:s}.\n".format(str(genome_build_directory))
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1431 sys.stdout.flush()
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1432
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1433 # To take out builds for testing, comment out the lines that do the building.
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1434 # The command that builds the ctat genome library also has an option for building the gmap indexes.
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1435 # That is why the gmap_build values are sent to build_the_library(), but if we are not building the
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1436 # library, the user might still be asking for a gmap_build. That is done after rechecking for the
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1437 # genome_build_directory.
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1438 if (source_data_directory is not None):
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1439 build_the_library(source_data_directory, \
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1440 genome_build_directory, \
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1441 args.new_library_build, \
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1442 args.gmap_build, \
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1443 args.force_gmap_build)
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1444 lib_was_built = True
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1445
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1446 # The following looks to see if the library actually exists after the build,
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1447 # and raises an error if it cannot find the library files.
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1448 # The reassignment of genome_build_directory can be superfluous,
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1449 # since many times the genome_build_directory will already point to the correct directory.
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1450 # There are cases, however, where a user specifies a location that contains the
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1451 # genome_build_directory rather than being the genome_build_directory itself.
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1452 genome_build_directory = search_for_genome_build_dir(genome_build_directory)
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1453
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1454 if (args.gmap_build and not lib_was_built):
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1455 # If we did not build the genome resource library
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1456 # the user might still be asking for a gmap_build.
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1457 gmap_the_library(genome_build_directory, args.force_gmap_build)
45
3acb7bc809b5 More print statement fixes.
trinity_ctat
parents: 44
diff changeset
1458 sys.stdout.flush()
44
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1459
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1460 if (args.download_mutation_resources_url != ""):
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1461 download_and_integrate_mutation_resources(source_url=args.download_mutation_resources_url, \
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1462 genome_build_directory=genome_build_directory, \
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1463 cosmic_resources_location=args.cosmic_resources_location, \
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1464 force_new_download=args.new_mutation_download, \
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1465 force_new_integration=args.new_mutation_integration)
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1466
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1467 # Need to get the genome name.
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1468 genome_name = find_genome_name_in_path(args.download_url)
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1469 if genome_name is None:
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1470 genome_name = find_genome_name_in_path(genome_build_directory)
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1471 if genome_name is None:
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1472 genome_name = find_genome_name_in_path(extracted_directory)
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1473 if genome_name is None:
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1474 genome_name = find_genome_name_in_path(args.source_location)
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1475 if genome_name is None:
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1476 genome_name = find_genome_name_in_path(args.download_location)
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1477 if genome_name is None:
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1478 genome_name = find_genome_name_in_path(args.display_name)
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1479 if genome_name is None:
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1480 genome_name = _CTAT_ResourceLib_DefaultGenome
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1481 print "WARNING: We could not find a genome name in any of the directory paths."
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1482 sys.stdout.flush()
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1483
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1484 # Determine the display_name for the library.
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1485 if (args.display_name is None) or (args.display_name == ""):
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1486 # Create the display_name from the genome_name.
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1487 display_name = _CTAT_ResourceLib_DisplayNamePrefix + genome_name
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1488 else:
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1489 display_name = _CTAT_ResourceLib_DisplayNamePrefix + args.display_name
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1490 display_name = display_name.replace(" ","_")
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1491
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1492 # Create a unique_id for the library.
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1493 datetime_stamp = datetime.now().strftime("_%Y_%m_%d_%H_%M_%S_%f")
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1494 unique_id = genome_name + "." + datetime_stamp
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1495
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1496 print "The Genome Resource Library's display_name will be set to: {:s}\n".format(display_name)
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1497 print "Its unique_id will be set to: {:s}\n".format(unique_id)
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1498 print "Its dir_path will be set to: {:s}\n".format(genome_build_directory)
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1499 sys.stdout.flush()
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1500
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1501 data_manager_dict = {}
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1502 data_manager_dict['data_tables'] = {}
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1503 data_manager_dict['data_tables']['ctat_genome_resource_libs'] = []
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1504 data_table_entry = dict(value=unique_id, name=display_name, path=genome_build_directory)
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1505 data_manager_dict['data_tables']['ctat_genome_resource_libs'].append(data_table_entry)
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1506
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1507 # Temporarily, the output file's dictionary is printed to stdout for debugging:
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1508 print "The dictionary for the output file is:\n\t{:s}".format(str(data_manager_dict))
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1509 sys.stdout.flush()
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1510 # Save info to json file. This is used to transfer data from the DataManager tool, to the data manager,
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1511 # which then puts it into the correct .loc file (I think).
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1512 # Comment out the following line when testing without galaxy package.
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1513 open(args.output_filename, 'wb').write(to_json_string(data_manager_dict))
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1514
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1515 if __name__ == "__main__":
76f2367996b8 Uploaded
trinity_ctat
parents:
diff changeset
1516 main()