Mercurial > repos > abims-sbr > mutcount
diff scripts/functions.py @ 10:f62c76aab669 draft default tip
planemo upload for repository https://github.com/abims-sbr/adaptearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1
| author | lecorguille |
|---|---|
| date | Mon, 24 Sep 2018 04:34:39 -0400 |
| parents | f1e24200e5ae |
| children |
line wrap: on
line diff
--- a/scripts/functions.py Tue Jul 03 10:55:46 2018 -0400 +++ b/scripts/functions.py Mon Sep 24 04:34:39 2018 -0400 @@ -1,33 +1,65 @@ -import itertools +#!/usr/bin/env python +#coding: utf-8 + +import itertools, os + +def dico(fasta_file, path_in): + """ + Stores a fasta file in a dictionary : key/value -> header/sequence + + Args: + - fasta_file (String) : the name of fasta file + - path_in (String) : path to the fasta file -def simplify_fasta_name(fasta_name,LT): - for abbreviation in LT: - if abbreviation in fasta_name: - new_fasta_name = abbreviation + Return: + - bash1 (dict) : the dictionary header/sequence + """ + bash1 = {} - return(new_fasta_name) + with open(path_in+'/'+fasta_file, 'r') as F1: + for h,s in itertools.izip_longest(*[F1]*2): + fasta_name = h[1:3] + sequence = s[:-1] + if fasta_name not in bash1.keys(): + bash1[fasta_name] = sequence + else: + print fasta_name + + return bash1 # same length for all (alignment) + +def write_output(names, sps_list, out_dir, results_dict): + """ Write results in csv files. There is one file per counted element (one file per amino-acid, one file per indice ...) 
-## Generates bash, with key = fasta name; value = sequence (WITH GAP, IF ANY, REMOVED IN THIS FUNCTION) -def dico(fasta_file,LT): - #count_fastaName = 0 - bash1 = {} - with open(fasta_file, "r") as file: - for name, query in itertools.izip_longest(*[file]*2): - if not name: - break - if name[0] == ">": - #count_fastaName += 1 - fasta_name = name[1:-1] - sequence = query[:-1] - if fasta_name not in bash1.keys(): - fasta_name = simplify_fasta_name(fasta_name, LT) - bash1[fasta_name] = sequence - else : - print fasta_name + Args: + - names (list) : list with the names of elems + - sps_list (list) : species names, sorted alphabetically + - out_dir (String) : output directory + - results_dict (dict) : vcounts values of each element for each input file (keys names : elems from 'names argument') - kk = bash1.keys() - key0 = kk[0] - seq0 = bash1[key0] - ln_seq = len(seq0) - - return(bash1) \ No newline at end of file + """ + for name in names: + out = open(name+".csv", 'w') + out.write('Group,' + sps_list[0:-1]+'\n') + for group in results_dict.keys(): + count_of_elems = '' + for specs in sorted(results_dict[group].keys()): + count_of_elems += str(results_dict[group][specs][name]) + ',' + out.write(group + ',' + count_of_elems[0:-1] + '\n') + out.close() + os.system('mv %s.csv %s/' %(name, out_dir)) + +def fill_with_NaN(what): + """ Used to create a dict only with NaN values ; used when a species is not present in an orthogroup + + Args: + - what (list of Strings) : the names of the elements studied (nucleotide, amino-acids, indices of thermostability ...) + + Return: + - NaN_values (dict) : dictionary with keys=elems of what, values=NaN + """ + + NaN_values = {} + for elem in what: + NaN_values[elem] = 'NaN' + + return NaN_values
