Mercurial > repos > rnateam > graphclust_preprocessing
comparison splitStockholm.py @ 6:e31c659be8bc draft
planemo upload for repository https://github.com/eteriSokhoyan/galaxytools/tree/branchForIterations/tools/GraphClust commit 6767a5ffb02052c844e9d862c79912f998f39d8e
author | rnateam |
---|---|
date | Mon, 20 Nov 2017 05:01:21 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
5:f4ad5dceb619 | 6:e31c659be8bc |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 ######## | |
4 # This script reads multiple alignments merged in single stockholm file | |
5 # and splits the alignment blocks according to data.names table | |
6 # The first sequence of each alignment file assumed to match to names table entries | |
7 # Author: M. Miladi | |
8 ######## | |
9 import os | |
10 import re | |
11 import sys | |
12 | |
13 from Bio import AlignIO, SeqIO | |
14 try: | |
15 from StringIO import StringIO | |
16 except ImportError: | |
17 from io import StringIO | |
18 | |
19 stk_file = sys.argv[1] | |
20 print ("Parsing and splitting stk file:{}".format(stk_file)) | |
21 target_f = "alignment_data_split.stk" | |
22 pattern = re.compile("^>.*$") | |
23 toWriteID = "" | |
24 | |
25 count_for_id = 1 | |
26 seq_counter = 0 | |
27 new_id = "" | |
28 | |
29 seq_id = [] | |
30 seq_string = [] | |
31 orig_id = [] | |
32 name_file = "FASTA/data.names" | |
33 array_all_chunks = [] | |
34 with open(name_file, 'r') as f: | |
35 for line in f: | |
36 if len(line.strip()) == 0: | |
37 continue | |
38 seq_id.append(int(line.split()[0])) | |
39 seq_string.append(line.split()[1]) | |
40 orig_id_srt = line.split()[3] | |
41 orig_id_srt = orig_id_srt.rsplit('_',1)[0] | |
42 orig_id.append(orig_id_srt) | |
43 | |
44 | |
45 | |
46 with open(stk_file) as stk_in: | |
47 alignments = AlignIO.parse(stk_in, "stockholm")#, alphabet=IUPAC.ambiguous_rna) | |
48 alignments_dic = {(a[0].id):a for a in alignments} | |
49 | |
50 | |
51 regx_gaps = '[-.~_]' # valid gap symbols all be converted to "-" | |
52 str_gaps = '-.~_' # valid gap symbols all be converted to "-" | |
53 | |
54 | |
55 chunks = [] | |
56 with open(target_f, 'w') as out_stk_handle: | |
57 for i in range(len(orig_id)): | |
58 | |
59 #---------------------- | |
60 # We need to map ungapped positions of the chunks to gapped positions of first sequence | |
61 gap_count = 0 | |
62 ungap_ind = 0 | |
63 dic_gap_counts = dict() | |
64 cur_alignment = alignments_dic[orig_id[i]] | |
65 for c in cur_alignment[0].seq: | |
66 #print ungap_ind | |
67 if c in str_gaps: | |
68 gap_count += 1 | |
69 else: | |
70 dic_gap_counts[ungap_ind] = gap_count | |
71 ungap_ind += 1 | |
72 ID = str(seq_id[i]) + " " + seq_string[i] | |
73 chunks = re.findall(r'\d+', seq_string[i]) | |
74 print (ID,chunks) | |
75 | |
76 index_start, index_end =int(chunks[1])-1, int(chunks[2])-1 | |
77 subalign = cur_alignment[:, index_start + dic_gap_counts[index_start]: | |
78 index_end+dic_gap_counts[index_end]+1] | |
79 | |
80 #---------------------- | |
81 # BioPython does not handel the GF ID entry for alignment | |
82 # So we add entry in the second line manually | |
83 siotmp = StringIO() | |
84 AlignIO.write(subalign, siotmp, format="stockholm") | |
85 stk_lines = siotmp.getvalue().split('\n') | |
86 out_stk_handle.write('{}\n'.format(stk_lines[0])) | |
87 out_stk_handle.write('#=GF ID {}\n'.format(ID)) | |
88 out_stk_handle.writelines('\n'.join(stk_lines[1:])) | |
89 #print out_stk_handle.getvalue() | |
90 | |
91 |