comparison util/subtools.py @ 39:4a69515eed63 draft
planemo upload for repository https://github.com/Yating-L/jbrowse-archive-creator.git commit 91271a6c0d39c923f0d460b2979247baa297286b-dirty
author | yating-l |
---|---|
date | Fri, 13 Apr 2018 18:21:35 -0400 |
parents | d17f629f5486 |
children | 061da5d3a219 |
38:d17f629f5486 | 39:4a69515eed63 |
---|---|
235 if os.path.exists(filename): | 235 if os.path.exists(filename): |
236 return filename | 236 return filename |
237 else: | 237 else: |
238 raise ValueError('Did not find fai file') | 238 raise ValueError('Did not find fai file') |
239 | 239 |
| 240 def generate_indexed_refseq_track(fastaFile, referenceName, outputFolder): |
| 241 faiFile = createFastaIndex(fastaFile) |
| 242 refSeqFile = os.path.join(outputFolder, referenceName) |
| 243 refSeqIndexFile = os.path.join(outputFolder, referenceName+'.fai') |
| 244 shutil.copy(fastaFile, refSeqFile) |
| 245 shutil.copy(faiFile, refSeqIndexFile) |
| 246 |
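The new generate_indexed_refseq_track helper (new lines 240-245) copies the reference FASTA together with its .fai index into the output folder so the assembly can be served as an indexed-FASTA reference track. A minimal self-contained sketch of the same idea, assuming createFastaIndex shells out to samtools faidx (that helper is defined elsewhere in subtools.py and not shown in this hunk):

```python
import os
import shutil
import subprocess

def indexed_refseq_sketch(fasta_file, reference_name, output_folder):
    # Index the FASTA (samtools faidx writes fasta_file + '.fai'), then copy
    # the sequence and its index side by side under the chosen reference name.
    subprocess.check_call(['samtools', 'faidx', fasta_file])
    shutil.copy(fasta_file, os.path.join(output_folder, reference_name))
    shutil.copy(fasta_file + '.fai', os.path.join(output_folder, reference_name + '.fai'))
```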
| 247 def remove_gene_lines(gff3_file, gff3_filtered): |
| 248 with open(gff3_file, 'r') as f: |
| 249 with open(gff3_filtered, 'w') as out: |
| 250 for line in f: |
| 251 if not line.startswith('#'): |
| 252 feature_type = line.split('\t')[2].rstrip() |
| 253 if feature_type == 'transcript' or feature_type == 'mRNA': |
| 254 arr = line.split('\t') |
| 255 # as we remove the gene features, we should also remove the Parent attribute (gene id) from the transcript |
| 256 arr[8] = ';'.join([item for item in arr[8].split(';') if 'Parent=' not in item]).rstrip() |
| 257 line = '\t'.join(arr) + '\n' |
| 258 if feature_type == 'gene': |
| 259 continue |
| 260 out.write(line) |
| 261 |
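remove_gene_lines (new lines 247-260) flattens a gene → transcript GFF3 hierarchy before Tabix indexing: gene rows are dropped, transcript/mRNA rows lose their Parent= attribute so they no longer point at a feature that is gone, and every other row, including comment lines, passes through unchanged. A worked example with hypothetical coordinates and IDs:

```python
# Transcript row before and after filtering: the Parent= attribute (which
# referenced the removed gene row) is stripped from column 9.
before = "chr1\t.\ttranscript\t100\t900\t.\t+\t.\tID=tx1;Parent=gene1;Name=demo\n"
after  = "chr1\t.\ttranscript\t100\t900\t.\t+\t.\tID=tx1;Name=demo\n"

# Gene rows are dropped entirely; exon/CDS rows keep their Parent=<transcript>
# attributes, so the transcript-level hierarchy still resolves in JBrowse.
```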
240 def gff3sort(inputFile, outputFile, precise=False): | 262 def gff3sort(inputFile, outputFile, precise=False): |
241 array_call = ['gff3sort.pl', inputFile, '>', outputFile] | 263 array_call = ['gff3sort.pl', inputFile] |
242 if precise: | 264 if precise: |
243 array_call.append('--precise') | 265 array_call.append('--precise') |
244 p = _handleExceptionAndCheckCall(array_call) | 266 p = _handleExceptionAndCheckCall(array_call, stdout=outputFile) |
245 return p | 267 return p |
246 | 268 |
247 def bedSort(inputFile, outputFile): | 269 def bedSort(inputFile, outputFile): |
248 array_call = ['sort', '-k1,1', '-k2,2n', '-k6,6', inputFile, '>', outputFile] | 270 array_call = ['sort', '-k1,1', '-k2,2n', '-k6,6', inputFile] |
249 p = _handleExceptionAndCheckCall(array_call) | 271 p = _handleExceptionAndCheckCall(array_call, stdout=outputFile) |
250 return p | 272 return p |
251 | 273 |
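The change to gff3sort and bedSort fixes the output redirection: the old argument lists ended in '>', outputFile, but without shell=True subprocess passes those as literal arguments to gff3sort.pl and sort, so nothing is redirected. The revised wrappers instead hand the destination to _handleExceptionAndCheckCall through a stdout keyword; the callers below pass the open NamedTemporaryFile object itself, which is the kind of value subprocess expects for stdout. That helper is not visible in this hunk, but it presumably forwards the keyword roughly like this sketch:

```python
import subprocess

def _check_call_sketch(array_call, stdout=None):
    # Rough stand-in for _handleExceptionAndCheckCall: run the command,
    # optionally redirecting its stdout to an open file object, and raise
    # on a non-zero exit status.
    if stdout is not None:
        return subprocess.check_call(array_call, stdout=stdout)
    return subprocess.check_call(array_call)

# Shell equivalent of `sort -k1,1 -k2,2n -k6,6 in.bed > out.bed`, without a shell:
# with open('out.bed', 'w') as out:
#     _check_call_sketch(['sort', '-k1,1', '-k2,2n', '-k6,6', 'in.bed'], stdout=out)
```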
252 def bgzip(inputFile): | 274 def bgzip(inputFile): |
253 subprocess.call(['bgzip', inputFile]) | 275 subprocess.call(['bgzip', inputFile]) |
254 filename = inputFile + '.gz' | 276 filename = inputFile + '.gz' |
263 if os.path.exists(filename): | 285 if os.path.exists(filename): |
264 return filename | 286 return filename |
265 else: | 287 else: |
266 raise ValueError('Did not find tbi file') | 288 raise ValueError('Did not find tbi file') |
267 | 289 |
268 def generate_tabix_indexed_track(inputFile, dataType, outputFolder): | 290 def generate_tabix_indexed_track(inputFile, dataType, trackName, outputFolder): |
269 if "bed" in dataType: | 291 if "bed" in dataType: |
270 fileType = 'bed' | 292 fileType = 'bed' |
271 sortedFile = tempfile.NamedTemporaryFile(bufsize=0) | 293 sortedFile = tempfile.NamedTemporaryFile(bufsize=0) |
272 bedSort(inputFile, sortedFile.name) | 294 bedSort(inputFile, sortedFile) |
273 elif "gff" in dataType: | 295 elif "gff" in dataType: |
274 fileType = 'gff' | 296 fileType = 'gff' |
| 297 filteredFile = tempfile.NamedTemporaryFile(bufsize=0) |
| 298 remove_gene_lines(inputFile, filteredFile.name) |
275 sortedFile = tempfile.NamedTemporaryFile(bufsize=0) | 299 sortedFile = tempfile.NamedTemporaryFile(bufsize=0) |
276 gff3sort(inputFile, sortedFile.name) | 300 gff3sort(filteredFile.name, sortedFile) |
277 compressedFile = bgzip(sortedFile) | 301 # add .gff3.gz extension to Tabix GFF3 files, in order to enable creating name index with generate-names.pl |
| 302 trackName = trackName + '.gff3.gz' |
| 303 compressedFile = bgzip(sortedFile.name) |
278 tabixFile = createTabix(compressedFile, fileType) | 304 tabixFile = createTabix(compressedFile, fileType) |
279 trackPath = os.path.join(outputFolder, inputFile) | 305 trackPath = os.path.join(outputFolder, trackName) |
280 trackIndexPath = os.path.join(outputFolder, inputFile+'.tbi') | 306 trackIndexPath = os.path.join(outputFolder, trackName+'.tbi') |
281 shutil.copy(compressedFile, trackPath) | 307 shutil.copy(compressedFile, trackPath) |
282 shutil.copy(tabixFile, trackIndexPath) | 308 shutil.copy(tabixFile, trackIndexPath) |
283 | 309 |
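generate_tabix_indexed_track now takes an explicit trackName instead of reusing the input path for the output files, filters gene rows out of GFF3 input, and appends .gff3.gz to the track name so generate-names.pl can build a name index for the track. A hypothetical call (file and folder names are placeholders):

```python
output_folder = 'data/tracks'  # placeholder path
# For GFF3 input this writes myGenes.gff3.gz and myGenes.gff3.gz.tbi into
# output_folder; for BED input the track name is used as-is.
generate_tabix_indexed_track('annotation.gff3', 'gff3', 'myGenes', output_folder)
```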
284 def flatfile_to_json(inputFile, dataType, trackType, trackLabel, outputFolder, options=None, compress=True): | 310 def flatfile_to_json(inputFile, dataType, trackType, trackLabel, outputFolder, options=None, compress=True): |
285 if "bed" in dataType: | 311 if "bed" in dataType: |
347 track_json = json.dumps(track_json) | 373 track_json = json.dumps(track_json) |
348 new_track = subprocess.Popen(['echo', track_json], stdout=subprocess.PIPE) | 374 new_track = subprocess.Popen(['echo', track_json], stdout=subprocess.PIPE) |
349 p = subprocess.call(['add-track-json.pl', trackList], stdin=new_track.stdout) | 375 p = subprocess.call(['add-track-json.pl', trackList], stdin=new_track.stdout) |
350 return p | 376 return p |
351 | 377 |
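The unchanged lines above append a track definition by echoing the JSON and piping it into add-track-json.pl. The same effect can be had without the extra echo process by writing to the Perl script's stdin directly; a sketch (the track dict contents are whatever the caller built):

```python
import json
import subprocess

def add_track_json_sketch(track_dict, track_list_path):
    # Feed the serialized track definition straight to add-track-json.pl,
    # which appends it to the trackList.json given on the command line.
    p = subprocess.Popen(['add-track-json.pl', track_list_path],
                         stdin=subprocess.PIPE)
    p.communicate(json.dumps(track_dict).encode())
    return p.returncode
```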
352 def prepare_refseqs(fasta_file_name, outputFolder): | 378 def prepare_refseqs(fastaFile, outputFolder): |
353 array_call = ['prepare-refseqs.pl', '--fasta', fasta_file_name, '--out', outputFolder] | 379 #array_call = ['prepare-refseqs.pl', '--fasta', fasta_file_name, '--out', outputFolder] |
| 380 createFastaIndex(fastaFile) |
| 381 array_call = ['prepare-refseqs.pl', '--indexed_fasta', fastaFile, '--out', outputFolder] |
354 p = _handleExceptionAndCheckCall(array_call) | 382 p = _handleExceptionAndCheckCall(array_call) |
355 return p | 383 return p |
356 | 384 |
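prepare_refseqs now indexes the FASTA first and passes it to prepare-refseqs.pl with --indexed_fasta instead of --fasta, so JBrowse serves the sequence from the FASTA/.fai pair rather than from pre-generated JSON chunks. The equivalent command sequence, again assuming createFastaIndex wraps samtools faidx:

```python
import subprocess

def prepare_indexed_refseqs_sketch(fasta_file, output_folder):
    # 1) write fasta_file + '.fai' (what createFastaIndex presumably does)
    subprocess.check_call(['samtools', 'faidx', fasta_file])
    # 2) register the indexed FASTA with JBrowse instead of generating seq chunks
    subprocess.check_call(['prepare-refseqs.pl', '--indexed_fasta', fasta_file,
                           '--out', output_folder])
```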
357 def generate_names(outputFolder): | 385 def generate_names(outputFolder, hashBits=4): |
358 array_call = ['generate-names.pl', '-v', '--out', outputFolder] | 386 array_call = ['generate-names.pl', '--hashBits', '4', '-v', '--out', outputFolder] |
359 p = _handleExceptionAndCheckCall(array_call) | 387 p = _handleExceptionAndCheckCall(array_call) |
360 return p | 388 return p |
361 | 389 |
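The new generate_names signature accepts hashBits=4, but the argument list still hard-codes '4', so the parameter currently has no effect. A sketch that actually threads it through (subprocess.check_call stands in for the module's _handleExceptionAndCheckCall):

```python
import subprocess

def generate_names_sketch(output_folder, hash_bits=4):
    # Pass the requested hash size to generate-names.pl instead of a literal '4'.
    array_call = ['generate-names.pl', '--hashBits', str(hash_bits),
                  '-v', '--out', output_folder]
    return subprocess.check_call(array_call)
```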
362 def validateFiles(input_file, chrom_sizes_file_name, file_type, options=None): | 390 def validateFiles(input_file, chrom_sizes_file_name, file_type, options=None): |
363 """ | 391 """ |