Mercurial > repos > rnateam > splitfasta
comparison split_fasta.py @ 1:87bdbac78136 draft default tip
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/splitfasta commit 31945d5d8c5ebee64ebf29c6ea022fb831f47274"
| author | rnateam |
|---|---|
| date | Mon, 21 Sep 2020 15:41:01 +0000 |
| parents | |
| children | |
comparison
equal
deleted
inserted
replaced
| 0:f6d6b62540f8 | 1:87bdbac78136 |
|---|---|
#!/usr/bin/env python
"""Split a FASTA file into several files inside a new ``splits`` directory.

Usage: split_fasta.py <input_filename> [<num_chunks>]

Without <num_chunks> (or with a value of 0) every record is written to its
own file, named after the record ID.  With a non-zero <num_chunks> the
records are grouped, in input order, into files named ``part1``, ``part2``,
and so on.

The ``splits`` directory is created in the current working directory; the
script fails with the error raised by ``os.mkdir`` if it already exists.
"""

import os
import sys

from Bio import SeqIO

num_chunks = 0
if len(sys.argv) == 3:
    input_filename = sys.argv[1]
    num_chunks = int(sys.argv[2])
elif len(sys.argv) == 2:
    input_filename = sys.argv[1]
else:
    # sys.exit rather than the bare exit(): the latter is only injected by
    # the site module and is not guaranteed to exist.
    sys.exit("Usage: split_fasta.py <input_filename> [<num_chunks>]")

os.mkdir('splits')

if num_chunks != 0:
    # When splitting into chunks we first need to know how many records the
    # input holds; every line whose first non-blank character is '>' is
    # counted as one record.
    with open(input_filename) as input_file:
        record_count = sum(
            1 for line in input_file if line.lstrip().startswith('>')
        )

    records_per_chunk = round(float(record_count) / num_chunks)

count = 1  # 1-based number of the next output file
with open(input_filename) as input_file:
    records = []  # records collected for the output file being built
    for record in SeqIO.parse(input_file, 'fasta'):
        records.append(record)
        if num_chunks == 0:
            # One file per record.
            # NOTE(review): record.id comes straight from the input file and
            # is used verbatim as the file name, so an ID containing a path
            # separator will not resolve to a plain file inside 'splits' --
            # confirm that inputs are trusted or sanitised upstream.
            output_filename = os.path.join('splits', record.id)
        elif count < num_chunks and len(records) >= records_per_chunk:
            # The current chunk is full and it is not the last one; the last
            # chunk is never flushed here so that it absorbs every remaining
            # record.
            output_filename = os.path.join('splits', 'part{}'.format(count))
        else:
            continue
        SeqIO.write(records, output_filename, 'fasta')
        count += 1
        records = []

if records:
    # Only reachable in chunk mode: write whatever was collected for the
    # final chunk.
    output_filename = os.path.join('splits', 'part{}'.format(count))
    SeqIO.write(records, output_filename, 'fasta')
