comparison scripts/functions.py @ 10:f62c76aab669 draft default tip

planemo upload for repository https://github.com/abims-sbr/adaptearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1
author lecorguille
date Mon, 24 Sep 2018 04:34:39 -0400
parents f1e24200e5ae
children
comparison
equal deleted inserted replaced
9:04a9ada73cc4 10:f62c76aab669
1 import itertools 1 #!/usr/bin/env python
2 #coding: utf-8
2 3
3 def simplify_fasta_name(fasta_name,LT): 4 import itertools, os
4 for abbreviation in LT:
5 if abbreviation in fasta_name:
6 new_fasta_name = abbreviation
7 5
def dico(fasta_file, path_in):
    """
    Stores a fasta file in a dictionary : key/value -> header/sequence

    The file is read two lines at a time (one header line followed by one
    sequence line). The key is made of the two characters that follow the
    first character of the header line; the value is the sequence line
    without its end-of-line characters.

    Args:
        - fasta_file (String) : the name of fasta file
        - path_in (String) : path to the fasta file

    Return:
        - bash1 (dict) : the dictionary header/sequence
    """
    bash1 = {}

    with open(path_in + '/' + fasta_file, 'r') as F1:
        lines = iter(F1)
        for header in lines:
            # Pull the sequence line paired with this header; None means the
            # file ended right after the header line.
            seq_line = next(lines, None)
            if seq_line is None:
                if not header.strip():
                    # Trailing blank line at the end of the file: nothing to store.
                    break
                # Header without a sequence line: keep it with an empty sequence
                # instead of failing on None.
                seq_line = ''

            fasta_name = header[1:3]
            # Strip only end-of-line characters, so the last residue is kept
            # when the final line has no trailing newline.
            sequence = seq_line.rstrip('\r\n')

            if fasta_name not in bash1:
                bash1[fasta_name] = sequence
            else:
                # Duplicate key: the first sequence is kept, the name is reported.
                print(fasta_name)

    return bash1
29
def write_output(names, sps_list, out_dir, results_dict):
    """ Write results in csv files. There is one file per counted element (one file per amino-acid, one file per indice ...)

    Each file is named '<name>.csv' and is written directly into out_dir.
    The first row is 'Group,' followed by the species header; each following
    row holds one group and its values, ordered by sorted species key.

    Args:
        - names (list) : list with the names of elems
        - sps_list (String or list) : species names, sorted alphabetically. A string
          has its last character removed before being written (presumably a trailing
          comma - confirm against callers); a list/tuple is joined with commas.
        - out_dir (String) : output directory (must already exist)
        - results_dict (dict) : counts values of each element for each input file (keys names : elems from 'names argument')

    """
    if isinstance(sps_list, (list, tuple)):
        species_header = ','.join(str(sp) for sp in sps_list)
    else:
        # String input: keep the original behaviour of dropping the last character.
        species_header = sps_list[0:-1]

    for name in names:
        # Write straight into the output directory; no shell 'mv' is needed, so
        # names containing spaces or shell metacharacters are safe.
        out_path = os.path.join(out_dir, name + '.csv')
        with open(out_path, 'w') as out:
            out.write('Group,' + species_header + '\n')
            for group in results_dict:
                per_species = results_dict[group]
                counts = ','.join(
                    str(per_species[specs][name]) for specs in sorted(per_species)
                )
                out.write(group + ',' + counts + '\n')
50
def fill_with_NaN(what):
    """ Build a dict whose values are all the string 'NaN' ; used when a species is not present in an orthogroup

    Args:
        - what (list of Strings) : the names of the elements studied (nucleotide, amino-acids, indices of thermostability ...)

    Return:
        - (dict) : dictionary with keys=elems of what, values='NaN'
    """
    # Every key maps to the same placeholder string.
    return dict.fromkeys(what, 'NaN')