comparison util/subtools.py @ 39:4a69515eed63 draft
planemo upload for repository https://github.com/Yating-L/jbrowse-archive-creator.git commit 91271a6c0d39c923f0d460b2979247baa297286b-dirty
| | |
|---|---|
| author | yating-l |
| date | Fri, 13 Apr 2018 18:21:35 -0400 |
| parents | d17f629f5486 |
| children | 061da5d3a219 |
| 38:d17f629f5486 | 39:4a69515eed63 |
|---|---|
| 235 if os.path.exists(filename): | 235 if os.path.exists(filename): |
| 236 return filename | 236 return filename |
| 237 else: | 237 else: |
| 238 raise ValueError('Did not find fai file') | 238 raise ValueError('Did not find fai file') |
| 239 | 239 |
| | 240 def generate_indexed_refseq_track(fastaFile, referenceName, outputFolder): |
| | 241 faiFile = createFastaIndex(fastaFile) |
| | 242 refSeqFile = os.path.join(outputFolder, referenceName) |
| | 243 refSeqIndexFile = os.path.join(outputFolder, referenceName+'.fai') |
| | 244 shutil.copy(fastaFile, refSeqFile) |
| | 245 shutil.copy(faiFile, refSeqIndexFile) |
| | 246 |
| | 247 def remove_gene_lines(gff3_file, gff3_filtered): |
| | 248 with open(gff3_file, 'r') as f: |
| | 249 with open(gff3_filtered, 'w') as out: |
| | 250 for line in f: |
| | 251 if not line.startswith('#'): |
| | 252 feature_type = line.split('\t')[2].rstrip() |
| | 253 if feature_type == 'transcript' or feature_type == 'mRNA': |
| | 254 arr = line.split('\t') |
| | 255 # as we remove the gene features, we should also remove the Parent attribute (gene id) from the transcript |
| | 256 arr[8] = ';'.join([item for item in arr[8].split(';') if 'Parent=' not in item]).rstrip() |
| | 257 line = '\t'.join(arr) + '\n' |
| | 258 if feature_type == 'gene': |
| | 259 continue |
| | 260 out.write(line) |
| | 261 |
| 240 def gff3sort(inputFile, outputFile, precise=False): | 262 def gff3sort(inputFile, outputFile, precise=False): |
| 241 array_call = ['gff3sort.pl', inputFile, '>', outputFile] | 263 array_call = ['gff3sort.pl', inputFile] |
| 242 if precise: | 264 if precise: |
| 243 array_call.append('--precise') | 265 array_call.append('--precise') |
| 244 p = _handleExceptionAndCheckCall(array_call) | 266 p = _handleExceptionAndCheckCall(array_call, stdout=outputFile) |
| 245 return p | 267 return p |
| 246 | 268 |
| 247 def bedSort(inputFile, outputFile): | 269 def bedSort(inputFile, outputFile): |
| 248 array_call = ['sort', '-k1,1', '-k2,2n', '-k6,6', inputFile, '>', outputFile] | 270 array_call = ['sort', '-k1,1', '-k2,2n', '-k6,6', inputFile] |
| 249 p = _handleExceptionAndCheckCall(array_call) | 271 p = _handleExceptionAndCheckCall(array_call, stdout=outputFile) |
| 250 return p | 272 return p |
| 251 | 273 |
| 252 def bgzip(inputFile): | 274 def bgzip(inputFile): |
| 253 subprocess.call(['bgzip', inputFile]) | 275 subprocess.call(['bgzip', inputFile]) |
| 254 filename = inputFile + '.gz' | 276 filename = inputFile + '.gz' |
| 263 if os.path.exists(filename): | 285 if os.path.exists(filename): |
| 264 return filename | 286 return filename |
| 265 else: | 287 else: |
| 266 raise ValueError('Did not find tbi file') | 288 raise ValueError('Did not find tbi file') |
| 267 | 289 |
| 268 def generate_tabix_indexed_track(inputFile, dataType, outputFolder): | 290 def generate_tabix_indexed_track(inputFile, dataType, trackName, outputFolder): |
| 269 if "bed" in dataType: | 291 if "bed" in dataType: |
| 270 fileType = 'bed' | 292 fileType = 'bed' |
| 271 sortedFile = tempfile.NamedTemporaryFile(bufsize=0) | 293 sortedFile = tempfile.NamedTemporaryFile(bufsize=0) |
| 272 bedSort(inputFile, sortedFile.name) | 294 bedSort(inputFile, sortedFile) |
| 273 elif "gff" in dataType: | 295 elif "gff" in dataType: |
| 274 fileType = 'gff' | 296 fileType = 'gff' |
| | 297 filteredFile = tempfile.NamedTemporaryFile(bufsize=0) |
| | 298 remove_gene_lines(inputFile, filteredFile.name) |
| 275 sortedFile = tempfile.NamedTemporaryFile(bufsize=0) | 299 sortedFile = tempfile.NamedTemporaryFile(bufsize=0) |
| 276 gff3sort(inputFile, sortedFile.name) | 300 gff3sort(filteredFile.name, sortedFile) |
| 277 compressedFile = bgzip(sortedFile) | 301 # add .gff3.gz extension to Tabix GFF3 files, in order to enable creating name index with generate-names.pl |
| | 302 trackName = trackName + '.gff3.gz' |
| | 303 compressedFile = bgzip(sortedFile.name) |
| 278 tabixFile = createTabix(compressedFile, fileType) | 304 tabixFile = createTabix(compressedFile, fileType) |
| 279 trackPath = os.path.join(outputFolder, inputFile) | 305 trackPath = os.path.join(outputFolder, trackName) |
| 280 trackIndexPath = os.path.join(outputFolder, inputFile+'.tbi') | 306 trackIndexPath = os.path.join(outputFolder, trackName+'.tbi') |
| 281 shutil.copy(compressedFile, trackPath) | 307 shutil.copy(compressedFile, trackPath) |
| 282 shutil.copy(tabixFile, trackIndexPath) | 308 shutil.copy(tabixFile, trackIndexPath) |
| 283 | 309 |
| 284 def flatfile_to_json(inputFile, dataType, trackType, trackLabel, outputFolder, options=None, compress=True): | 310 def flatfile_to_json(inputFile, dataType, trackType, trackLabel, outputFolder, options=None, compress=True): |
| 285 if "bed" in dataType: | 311 if "bed" in dataType: |
| 347 track_json = json.dumps(track_json) | 373 track_json = json.dumps(track_json) |
| 348 new_track = subprocess.Popen(['echo', track_json], stdout=subprocess.PIPE) | 374 new_track = subprocess.Popen(['echo', track_json], stdout=subprocess.PIPE) |
| 349 p = subprocess.call(['add-track-json.pl', trackList], stdin=new_track.stdout) | 375 p = subprocess.call(['add-track-json.pl', trackList], stdin=new_track.stdout) |
| 350 return p | 376 return p |
| 351 | 377 |
| 352 def prepare_refseqs(fasta_file_name, outputFolder): | 378 def prepare_refseqs(fastaFile, outputFolder): |
| 353 array_call = ['prepare-refseqs.pl', '--fasta', fasta_file_name, '--out', outputFolder] | 379 #array_call = ['prepare-refseqs.pl', '--fasta', fasta_file_name, '--out', outputFolder] |
| | 380 createFastaIndex(fastaFile) |
| | 381 array_call = ['prepare-refseqs.pl', '--indexed_fasta', fastaFile, '--out', outputFolder] |
| 354 p = _handleExceptionAndCheckCall(array_call) | 382 p = _handleExceptionAndCheckCall(array_call) |
| 355 return p | 383 return p |
| 356 | 384 |
| 357 def generate_names(outputFolder): | 385 def generate_names(outputFolder, hashBits=4): |
| 358 array_call = ['generate-names.pl', '-v', '--out', outputFolder] | 386 array_call = ['generate-names.pl', '--hashBits', '4', '-v', '--out', outputFolder] |
| 359 p = _handleExceptionAndCheckCall(array_call) | 387 p = _handleExceptionAndCheckCall(array_call) |
| 360 return p | 388 return p |
| 361 | 389 |
| 362 def validateFiles(input_file, chrom_sizes_file_name, file_type, options=None): | 390 def validateFiles(input_file, chrom_sizes_file_name, file_type, options=None): |
| 363 """ | 391 """ |
