Mercurial > repos > iuc > progressivemauve
comparison xmfa2gff3.py @ 1:272c6e61dd0c draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/progressivemauve commit 936ca4b4641eae2bdfa93885f35f5a9263f9dae1
| author | iuc |
|---|---|
| date | Fri, 22 May 2015 22:50:24 -0400 |
| parents | |
| children | 71e6dd9198dc |
comparison
equal
deleted
inserted
replaced
| 0:aaa94e13ac60 | 1:272c6e61dd0c |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 import sys | |
| 3 from Bio import SeqIO | |
| 4 from Bio.Seq import Seq | |
| 5 from Bio.SeqRecord import SeqRecord | |
| 6 from Bio.SeqFeature import SeqFeature, FeatureLocation | |
| 7 import argparse | |
| 8 from BCBio import GFF | |
| 9 import logging | |
| 10 logging.basicConfig(level=logging.INFO) | |
| 11 log = logging.getLogger(__name__) | |
| 12 | |
| 13 | |
| 14 def parse_xmfa(xmfa): | |
| 15 """Simple XMFA parser until https://github.com/biopython/biopython/pull/544 | |
| 16 """ | |
| 17 current_lcb = [] | |
| 18 current_seq = {} | |
| 19 for line in xmfa.readlines(): | |
| 20 if line.startswith('#'): | |
| 21 continue | |
| 22 | |
| 23 if line.strip() == '=': | |
| 24 if 'id' in current_seq: | |
| 25 current_lcb.append(current_seq) | |
| 26 current_seq = {} | |
| 27 yield current_lcb | |
| 28 current_lcb = [] | |
| 29 else: | |
| 30 line = line.strip() | |
| 31 if line.startswith('>'): | |
| 32 if 'id' in current_seq: | |
| 33 current_lcb.append(current_seq) | |
| 34 current_seq = {} | |
| 35 data = line.strip().split() | |
| 36 id, loc = data[1].split(':') | |
| 37 start, end = loc.split('-') | |
| 38 current_seq = { | |
| 39 'rid': '_'.join(data[1:]), | |
| 40 'id': id, | |
| 41 'start': int(start), | |
| 42 'end': int(end), | |
| 43 'strand': 1 if data[2] == '+' else -1, | |
| 44 'seq': '' | |
| 45 } | |
| 46 else: | |
| 47 current_seq['seq'] += line.strip() | |
| 48 | |
| 49 | |
| 50 def _percent_identity(a, b): | |
| 51 """Calculate % identity, ignoring gaps in the host sequence | |
| 52 """ | |
| 53 match = 0 | |
| 54 mismatch = 0 | |
| 55 for char_a, char_b in zip(list(a), list(b)): | |
| 56 if char_a == '-': | |
| 57 continue | |
| 58 if char_a == char_b: | |
| 59 match += 1 | |
| 60 else: | |
| 61 mismatch += 1 | |
| 62 | |
| 63 if match + mismatch == 0: | |
| 64 return 0 | |
| 65 return 100 * float(match) / (match + mismatch) | |
| 66 | |
| 67 | |
| 68 def _id_tn_dict(sequences): | |
| 69 """Figure out sequence IDs | |
| 70 """ | |
| 71 label_convert = {} | |
| 72 if sequences is not None: | |
| 73 if len(sequences) == 1: | |
| 74 for i, record in enumerate(SeqIO.parse(sequences[0], 'fasta')): | |
| 75 label_convert[str(i + 1)] = record.id | |
| 76 else: | |
| 77 for i, sequence in enumerate(sequences): | |
| 78 for record in SeqIO.parse(sequence, 'fasta'): | |
| 79 label_convert[str(i + 1)] = record.id | |
| 80 continue | |
| 81 return label_convert | |
| 82 | |
| 83 | |
| 84 def convert_xmfa_to_gff3(xmfa_file, relative_to='1', sequences=None, window_size=1000): | |
| 85 label_convert = _id_tn_dict(sequences) | |
| 86 | |
| 87 lcbs = parse_xmfa(xmfa_file) | |
| 88 | |
| 89 records = [SeqRecord(Seq("A"), id=label_convert.get(relative_to, relative_to))] | |
| 90 for lcb in lcbs: | |
| 91 ids = [seq['id'] for seq in lcb] | |
| 92 | |
| 93 # Doesn't match part of our sequence | |
| 94 if relative_to not in ids: | |
| 95 continue | |
| 96 | |
| 97 # Skip sequences that are JUST our "relative_to" genome | |
| 98 if len(ids) == 1: | |
| 99 continue | |
| 100 | |
| 101 parent = [seq for seq in lcb if seq['id'] == relative_to][0] | |
| 102 others = [seq for seq in lcb if seq['id'] != relative_to] | |
| 103 | |
| 104 for other in others: | |
| 105 other['feature'] = SeqFeature( | |
| 106 FeatureLocation(parent['start'], parent['end'] + 1), | |
| 107 type="match", strand=parent['strand'], | |
| 108 qualifiers={ | |
| 109 "source": "progressiveMauve", | |
| 110 "target": label_convert.get(other['id'], other['id']), | |
| 111 "ID": label_convert.get(other['id'], 'xmfa_' + other['rid']) | |
| 112 } | |
| 113 ) | |
| 114 | |
| 115 for i in range(0, len(lcb[0]['seq']), window_size): | |
| 116 block_seq = parent['seq'][i:i + window_size] | |
| 117 real_window_size = len(block_seq) | |
| 118 real_start = abs(parent['start']) - parent['seq'][0:i].count('-') + i | |
| 119 real_end = real_start + real_window_size - block_seq.count('-') | |
| 120 | |
| 121 if (real_end - real_start) < 10: | |
| 122 continue | |
| 123 | |
| 124 if parent['start'] < 0: | |
| 125 strand = -1 | |
| 126 else: | |
| 127 strand = 1 | |
| 128 | |
| 129 for other in others: | |
| 130 pid = _percent_identity(block_seq, other['seq'][i:i + real_window_size]) | |
| 131 # Ignore 0% identity sequences | |
| 132 if pid == 0: | |
| 133 continue | |
| 134 other['feature'].sub_features.append( | |
| 135 SeqFeature( | |
| 136 FeatureLocation(real_start, real_end), | |
| 137 type="match_part", strand=strand, | |
| 138 qualifiers={ | |
| 139 "source": "progressiveMauve", | |
| 140 'score': pid | |
| 141 } | |
| 142 ) | |
| 143 ) | |
| 144 | |
| 145 for other in others: | |
| 146 records[0].features.append(other['feature']) | |
| 147 return records | |
| 148 | |
| 149 | |
| 150 if __name__ == '__main__': | |
| 151 parser = argparse.ArgumentParser(description='Convert XMFA alignments to gff3', prog='xmfa2gff3') | |
| 152 parser.add_argument('xmfa_file', type=file, help='XMFA File') | |
| 153 parser.add_argument('--window_size', type=int, help='Window size for analysis', default=1000) | |
| 154 parser.add_argument('--relative_to', type=str, help='Index of the parent sequence in the MSA', default='1') | |
| 155 parser.add_argument('--sequences', type=file, nargs='+', | |
| 156 help='Fasta files (in same order) passed to parent for reconstructing proper IDs') | |
| 157 parser.add_argument('--version', action='version', version='%(prog)s 1.0') | |
| 158 | |
| 159 args = parser.parse_args() | |
| 160 | |
| 161 result = convert_xmfa_to_gff3(**vars(args)) | |
| 162 GFF.write(result, sys.stdout) |
