comparison splitStockholm.py @ 6:e31c659be8bc draft

planemo upload for repository https://github.com/eteriSokhoyan/galaxytools/tree/branchForIterations/tools/GraphClust commit 6767a5ffb02052c844e9d862c79912f998f39d8e
author rnateam
date Mon, 20 Nov 2017 05:01:21 -0500
parents
children
comparison
equal deleted inserted replaced
5:f4ad5dceb619 6:e31c659be8bc
1 #!/usr/bin/env python
2
3 ########
4 # This script reads multiple alignments merged in single stockholm file
5 # and splits the alignment blocks according to data.names table
6 # The first sequence of each alignment file assumed to match to names table entries
7 # Author: M. Miladi
8 ########
9 import os
10 import re
11 import sys
12
13 from Bio import AlignIO, SeqIO
14 try:
15 from StringIO import StringIO
16 except ImportError:
17 from io import StringIO
18
19 stk_file = sys.argv[1]
20 print ("Parsing and splitting stk file:{}".format(stk_file))
21 target_f = "alignment_data_split.stk"
22 pattern = re.compile("^>.*$")
23 toWriteID = ""
24
25 count_for_id = 1
26 seq_counter = 0
27 new_id = ""
28
29 seq_id = []
30 seq_string = []
31 orig_id = []
32 name_file = "FASTA/data.names"
33 array_all_chunks = []
34 with open(name_file, 'r') as f:
35 for line in f:
36 if len(line.strip()) == 0:
37 continue
38 seq_id.append(int(line.split()[0]))
39 seq_string.append(line.split()[1])
40 orig_id_srt = line.split()[3]
41 orig_id_srt = orig_id_srt.rsplit('_',1)[0]
42 orig_id.append(orig_id_srt)
43
44
45
46 with open(stk_file) as stk_in:
47 alignments = AlignIO.parse(stk_in, "stockholm")#, alphabet=IUPAC.ambiguous_rna)
48 alignments_dic = {(a[0].id):a for a in alignments}
49
50
51 regx_gaps = '[-.~_]' # valid gap symbols all be converted to "-"
52 str_gaps = '-.~_' # valid gap symbols all be converted to "-"
53
54
55 chunks = []
56 with open(target_f, 'w') as out_stk_handle:
57 for i in range(len(orig_id)):
58
59 #----------------------
60 # We need to map ungapped positions of the chunks to gapped positions of first sequence
61 gap_count = 0
62 ungap_ind = 0
63 dic_gap_counts = dict()
64 cur_alignment = alignments_dic[orig_id[i]]
65 for c in cur_alignment[0].seq:
66 #print ungap_ind
67 if c in str_gaps:
68 gap_count += 1
69 else:
70 dic_gap_counts[ungap_ind] = gap_count
71 ungap_ind += 1
72 ID = str(seq_id[i]) + " " + seq_string[i]
73 chunks = re.findall(r'\d+', seq_string[i])
74 print (ID,chunks)
75
76 index_start, index_end =int(chunks[1])-1, int(chunks[2])-1
77 subalign = cur_alignment[:, index_start + dic_gap_counts[index_start]:
78 index_end+dic_gap_counts[index_end]+1]
79
80 #----------------------
81 # BioPython does not handel the GF ID entry for alignment
82 # So we add entry in the second line manually
83 siotmp = StringIO()
84 AlignIO.write(subalign, siotmp, format="stockholm")
85 stk_lines = siotmp.getvalue().split('\n')
86 out_stk_handle.write('{}\n'.format(stk_lines[0]))
87 out_stk_handle.write('#=GF ID {}\n'.format(ID))
88 out_stk_handle.writelines('\n'.join(stk_lines[1:]))
89 #print out_stk_handle.getvalue()
90
91