Mercurial > repos > abims-sbr > cds_search
changeset 11:06a28df198b6 draft default tip
planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1
| author | lecorguille |
|---|---|
| date | Mon, 24 Sep 2018 03:58:34 -0400 |
| parents | 3d00be2d05f3 |
| children | |
| files | CDS_search.xml macros.xml scripts/S01_find_orf_on_multiple_alignment.py scripts/S02_remove_too_short_bit_or_whole_sequence.py scripts/S03_remove_site_with_not_enough_species_represented.py test-data/outputs_ORF_Search_04_Best_ORF_aa/test1/orthogroup_14_with_2_species.fasta test-data/outputs_ORF_Search_04_Best_ORF_aa/test1/orthogroup_6_with_2_species.fasta test-data/outputs_ORF_Search_04_Best_ORF_nuc/test1/orthogroup_14_with_2_species.fasta test-data/outputs_ORF_Search_04_Best_ORF_nuc/test1/orthogroup_6_with_2_species.fasta test-data/outputs_ORF_Search_05_CDS_aa/test1/orthogroup_14_with_2_species.fasta test-data/outputs_ORF_Search_05_CDS_aa/test1/orthogroup_6_with_2_species.fasta test-data/outputs_ORF_Search_05_CDS_nuc/test1/orthogroup_14_with_2_species.fasta test-data/outputs_ORF_Search_05_CDS_nuc/test1/orthogroup_6_with_2_species.fasta test-data/outputs_ORF_Search_08_CDS_without_indel_aa/test1/old_orthogroup_7_with_3_species.fasta test-data/outputs_ORF_Search_08_CDS_without_indel_aa/test1/orthogroup_14_with_2_species.fasta test-data/outputs_ORF_Search_08_CDS_without_indel_aa/test1/orthogroup_1_with_2_species.fasta test-data/outputs_ORF_Search_08_CDS_without_indel_aa/test1/orthogroup_6_with_2_species.fasta test-data/outputs_ORF_Search_08_CDS_without_indel_aa/test1/orthogroup_7_with_3_species.fasta test-data/outputs_ORF_Search_08_CDS_without_indel_nuc/test1/old_orthogroup_7_with_3_species.fasta test-data/outputs_ORF_Search_08_CDS_without_indel_nuc/test1/orthogroup_14_with_2_species.fasta test-data/outputs_ORF_Search_08_CDS_without_indel_nuc/test1/orthogroup_1_with_2_species.fasta test-data/outputs_ORF_Search_08_CDS_without_indel_nuc/test1/orthogroup_6_with_2_species.fasta test-data/outputs_ORF_Search_08_CDS_without_indel_nuc/test1/orthogroup_7_with_3_species.fasta |
| diffstat | 23 files changed, 214 insertions(+), 401 deletions(-) [+] |
line wrap: on
line diff
--- a/CDS_search.xml Fri Jul 06 02:52:15 2018 -0400 +++ b/CDS_search.xml Mon Sep 24 03:58:34 2018 -0400 @@ -1,4 +1,4 @@ -<tool name="CDS_search" id="cds_search" version="2.0.2"> +<tool name="CDS_search" id="cds_search" version="2.1.2"> <description> ORF and CDS search @@ -23,6 +23,7 @@ python $__tool_directory__/scripts/S01_find_orf_on_multiple_alignment.py $__tool_directory__/scripts/code_universel_modified.txt $length.min_length_seq + $nb_species_keep list_files > '$log' && @@ -120,11 +121,10 @@ <tests> - <!-- tests with name "orthogroup_x_witn_y_sequences.fasta" --> <test> <param name="inputs" ftype="fasta" value="inputs/orthogroup_1_with_4_sequences.fasta,inputs/orthogroup_6_with_4_sequences.fasta,inputs/orthogroup_7_with_3_sequences.fasta,inputs/orthogroup_8_with_4_sequences.fasta,inputs/orthogroup_12_with_5_sequences.fasta,inputs/orthogroup_14_with_4_sequences.fasta" /> - <param name="nb_species_keep" value="2" /> - <param name="methionine" value="no" /> + <param name="nb_species_keep" value="3" /> + <param name="methionine" value="non" /> <section name="length"> <param name="min_length_seq" value="50" /> <param name="min_length_subseq" value="15" /> @@ -133,51 +133,34 @@ <param name="out_BESTORF" value="both" /> <param name="out_CDS" value="both" /> <param name="out_CDS_filter" value="both" /> - <output_collection name="output_BESTORF_aa" type="list"> + <output_collection name="output_BESTORF_aa" type="list" count="2"> <element name="orthogroup_1_with_3_species" value="outputs_ORF_Search_04_Best_ORF_aa/test1/orthogroup_1_with_3_species.fasta" /> - <element name="orthogroup_6_with_2_species" value="outputs_ORF_Search_04_Best_ORF_aa/test1/orthogroup_6_with_2_species.fasta" /> <element name="orthogroup_7_with_3_species" value="outputs_ORF_Search_04_Best_ORF_aa/test1/orthogroup_7_with_3_species.fasta" /> - <element name="orthogroup_14_with_2_species" value="outputs_ORF_Search_04_Best_ORF_aa/test1/orthogroup_14_with_2_species.fasta" /> </output_collection> - <output_collection name="output_BESTORF_nuc" type="list"> + <output_collection name="output_BESTORF_nuc" type="list" count="2"> <element name="orthogroup_1_with_3_species" value="outputs_ORF_Search_04_Best_ORF_nuc/test1/orthogroup_1_with_3_species.fasta" /> - <element name="orthogroup_6_with_2_species" value="outputs_ORF_Search_04_Best_ORF_nuc/test1/orthogroup_6_with_2_species.fasta" /> <element name="orthogroup_7_with_3_species" value="outputs_ORF_Search_04_Best_ORF_nuc/test1/orthogroup_7_with_3_species.fasta" /> - <element name="orthogroup_14_with_2_species" value="outputs_ORF_Search_04_Best_ORF_nuc/test1/orthogroup_14_with_2_species.fasta" /> </output_collection> - <output_collection name="output_CDS_aa" type="list"> + <output_collection name="output_CDS_aa" type="list" count="2"> <element name="orthogroup_1_with_3_species" value="outputs_ORF_Search_05_CDS_aa/test1/orthogroup_1_with_3_species.fasta" /> - <element name="orthogroup_6_with_2_species" value="outputs_ORF_Search_05_CDS_aa/test1/orthogroup_6_with_2_species.fasta" /> <element name="orthogroup_7_with_3_species" value="outputs_ORF_Search_05_CDS_aa/test1/orthogroup_7_with_3_species.fasta" /> - <element name="orthogroup_14_with_2_species" value="outputs_ORF_Search_05_CDS_aa/test1/orthogroup_14_with_2_species.fasta" /> </output_collection> - <output_collection name="output_CDS_nuc" type="list"> + <output_collection name="output_CDS_nuc" type="list" count="2"> <element name="orthogroup_1_with_3_species" value="outputs_ORF_Search_05_CDS_nuc/test1/orthogroup_1_with_3_species.fasta" /> - <element name="orthogroup_6_with_2_species" value="outputs_ORF_Search_05_CDS_nuc/test1/orthogroup_6_with_2_species.fasta" /> <element name="orthogroup_7_with_3_species" value="outputs_ORF_Search_05_CDS_nuc/test1/orthogroup_7_with_3_species.fasta" /> - <element name="orthogroup_14_with_2_species" value="outputs_ORF_Search_05_CDS_nuc/test1/orthogroup_14_with_2_species.fasta" /> </output_collection> - - <output_collection name="output_filter_aa" type="list"> - <element name="orthogroup_1_with_2_species" value="outputs_ORF_Search_08_CDS_without_indel_aa/test1/orthogroup_1_with_2_species.fasta" /> - <element name="orthogroup_6_with_2_species" value="outputs_ORF_Search_08_CDS_without_indel_aa/test1/orthogroup_6_with_2_species.fasta" /> + <output_collection name="output_filter_aa" type="list" count="1"> <element name="orthogroup_7_with_3_species" value="outputs_ORF_Search_08_CDS_without_indel_aa/test1/orthogroup_7_with_3_species.fasta" /> - <element name="orthogroup_14_with_2_species" value="outputs_ORF_Search_08_CDS_without_indel_aa/test1/orthogroup_14_with_2_species.fasta" /> </output_collection> - <output_collection name="output_filter_nuc" type="list"> - <element name="orthogroup_1_with_2_species" value="outputs_ORF_Search_08_CDS_without_indel_nuc/test1/orthogroup_1_with_2_species.fasta" /> - <element name="orthogroup_6_with_2_species" value="outputs_ORF_Search_08_CDS_without_indel_nuc/test1/orthogroup_6_with_2_species.fasta" /> + <output_collection name="output_filter_nuc" type="list" count="1"> <element name="orthogroup_7_with_3_species" value="outputs_ORF_Search_08_CDS_without_indel_nuc/test1/orthogroup_7_with_3_species.fasta" /> - <element name="orthogroup_14_with_2_species" value="outputs_ORF_Search_08_CDS_without_indel_nuc/test1/orthogroup_14_with_2_species.fasta" /> </output_collection> - <output_collection name="output_filter_aa" type="list" count="4"/> - <output_collection name="output_filter_nuc" type="list" count="4"/> </test> - + <test> <param name="inputs" ftype="fasta" value="inputs/orthogroup_1_with_4_sequences.fasta,inputs/orthogroup_6_with_4_sequences.fasta,inputs/orthogroup_7_with_3_sequences.fasta,inputs/orthogroup_8_with_4_sequences.fasta,inputs/orthogroup_12_with_5_sequences.fasta,inputs/orthogroup_14_with_4_sequences.fasta" /> <param name="nb_species_keep" value="2" /> - <param name="methionine" value="no" /> + <param name="methionine" value="oui" /> <section name="length"> <param name="min_length_seq" value="50" /> <param name="min_length_subseq" value="15" /> @@ -186,25 +169,26 @@ <param name="out_BESTORF" value="both" /> <param name="out_CDS" value="both" /> <param name="out_CDS_filter" value="both" /> - <output_collection name="output_BESTORF_aa" type="list"> + <output_collection name="output_BESTORF_aa" type="list" count="4"> <element name="orthogroup_1_with_3_species" value="outputs_ORF_Search_04_Best_ORF_aa/test2/orthogroup_1_with_3_species.fasta" /> <element name="orthogroup_6_with_2_species" value="outputs_ORF_Search_04_Best_ORF_aa/test2/orthogroup_6_with_2_species.fasta" /> <element name="orthogroup_7_with_3_species" value="outputs_ORF_Search_04_Best_ORF_aa/test2/orthogroup_7_with_3_species.fasta" /> <element name="orthogroup_14_with_2_species" value="outputs_ORF_Search_04_Best_ORF_aa/test2/orthogroup_14_with_2_species.fasta" /> </output_collection> - <output_collection name="output_BESTORF_nuc" type="list"> + <output_collection name="output_BESTORF_nuc" type="list" count="4"> <element name="orthogroup_1_with_3_species" value="outputs_ORF_Search_04_Best_ORF_nuc/test2/orthogroup_1_with_3_species.fasta" /> <element name="orthogroup_6_with_2_species" value="outputs_ORF_Search_04_Best_ORF_nuc/test2/orthogroup_6_with_2_species.fasta" /> <element name="orthogroup_7_with_3_species" value="outputs_ORF_Search_04_Best_ORF_nuc/test2/orthogroup_7_with_3_species.fasta" /> <element name="orthogroup_14_with_2_species" value="outputs_ORF_Search_04_Best_ORF_nuc/test2/orthogroup_14_with_2_species.fasta" /> </output_collection> - <output_collection name="output_filter_aa" type="list"> - <element name="orthogroup_14_with_2_species" value="outputs_ORF_Search_08_CDS_without_indel_aa/test1/orthogroup_14_with_2_species.fasta" /> + <output_collection name="output_filter_aa" type="list" count="1"> + <element name="orthogroup_14_with_2_species" value="outputs_ORF_Search_08_CDS_without_indel_aa/test2/orthogroup_14_with_2_species.fasta" /> </output_collection> - <output_collection name="output_filter_nuc" type="list"> - <element name="orthogroup_14_with_2_species" value="outputs_ORF_Search_08_CDS_without_indel_nuc/test1/orthogroup_14_with_2_species.fasta" /> + <output_collection name="output_filter_nuc" type="list" count="1"> + <element name="orthogroup_14_with_2_species" value="outputs_ORF_Search_08_CDS_without_indel_nuc/test2/orthogroup_14_with_2_species.fasta" /> </output_collection> </test> + </tests> <help> @@ -315,6 +299,8 @@ </help> - <expand macro="citations" /> + <citations> + + </citations> </tool>
--- a/macros.xml Fri Jul 06 02:52:15 2018 -0400 +++ b/macros.xml Mon Sep 24 03:58:34 2018 -0400 @@ -7,13 +7,14 @@ <token name="@HELP_AUTHORS@"> .. class:: infomark -**Authors** Eric Fontanillas creates the scripts of this pipeline. +**Authors** Eric Fontanillas created the version 1 of this pipeline. Victor Mataigne developped version 2. .. class:: infomark -**Galaxy integration** ABiMS TEAM +**Galaxy integration** Julie Baffard and ABiMS TEAM, Roscoff Marine Station | Contact support.abims@sb-roscoff.fr for any questions or concerns about the Galaxy implementation of this tool. + | Credits : Gildas le Corguillé, Misharl Monsoor ---------------------------------------------------
--- a/scripts/S01_find_orf_on_multiple_alignment.py Fri Jul 06 02:52:15 2018 -0400 +++ b/scripts/S01_find_orf_on_multiple_alignment.py Mon Sep 24 03:58:34 2018 -0400 @@ -1,24 +1,23 @@ -#!/usr/bin/python +#!/usr/bin/env python # coding: utf8 -## Author: Eric Fontanillas -## Modification: 03/09/14 by Julie BAFFARD -## Last modification : 05/03/18 by Victor Mataigne +# Author: Eric Fontanillas +# Modification: 03/09/14 by Julie BAFFARD +# Last modification : 25/07/18 by Victor Mataigne -## Description: Predict potential ORF on the basis of 2 criteria + 1 optional criteria - ## CRITERIA 1 ## Longest part of the alignment of sequence without codon stop "*", tested in the 3 potential ORF - ## CRITERIA 2 ## This longest part should be > 150nc or 50aa - ## CRITERIA 3 ## [OPTIONNAL] A codon start "M" should be present in this longuest part, before the last 50 aa - ## OUTPUTs "05_CDS_aa" & "05_CDS_nuc" => NOT INCLUDE THIS CRITERIA - ## OUTPUTs "06_CDS_with_M_aa" & "06_CDS_with_M_nuc" => INCLUDE THIS CRITERIA +# Description: Predict potential ORF on the basis of 2 criteria + 1 optional criteria + # CRITERIA 1 - Longest part of the alignment of sequence without codon stop "*", tested in the 3 potential ORF + # CRITERIA 2 - This longest part should be > 150nc or 50aa + # CRITERIA 3 - [OPTIONNAL] A codon start "M" should be present in this longuest part, before the last 50 aa + # OUTPUTs "05_CDS_aa" & "05_CDS_nuc" => NOT INCLUDE THIS CRITERIA + # OUTPUTs "06_CDS_with_M_aa" & "06_CDS_with_M_nuc" => INCLUDE THIS CRITERIA -#################################################### -###### DEF 2 : Create bash for genetic code ######## -#################################################### -### KEY = codon -### VALUE = Amino Acid +import string, os, time, re, zipfile, sys, argparse +from dico import dico def code_universel(F1): + """ Creates bash for genetic code (key : codon ; value : amino-acid) """ bash_codeUniversel = {} + with open(F1, "r") as file: for line in file.readlines(): L1 = string.split(line, " ") @@ -31,110 +30,85 @@ key = L1[0] value = L1[2] bash_codeUniversel[key] = value - return(bash_codeUniversel) -########################################################### - -###################################################################################################################### -##### DEF 3 : Test if the sequence is a multiple of 3, and if not correct the sequence to become a multiple of 3 ##### -###################################################################################################################### -### WEAKNESS OF THAT APPROACH = I remove extra base(s) at the end of the sequence ==> I can lost a codon, when I test ORF (as I will decay the ORF) + return(bash_codeUniversel) + def multiple3(seq): - leng = len(seq) - modulo = leng%3 - if modulo == 0: # the results of dividing leng per 3 is an integer - new_seq = seq - elif modulo == 1: # means 1 extra nc (nucleotid) needs to be removed (the remaining of modulo indicate the part which is non-dividable per 3) - new_seq = seq[:-1] # remove the last nc - elif modulo == 2: # means 2 extra nc (nucleotid) needs to be removed (the remaining of modulo indicate the part which is non-dividable per 3) - new_seq = seq[:-2] # remove the 2 last nc - len1 = len(new_seq) - return(new_seq, modulo) -########################################################## + """ Tests if the sequence is a multiple of 3, and if not removes extra-bases + !! Possible to lost a codon, when I test ORF (as I will decay the ORF) """ + + m = len(seq)%3 + if m != 0 : + return seq[:-m], m + else : + return seq, m +def detect_Methionine(seq_aa, Ortho, minimal_cds_length): + """ Detects if methionin in the aa sequence """ -############################# -###### DEF 4 : GET ORF ###### -############################# -##- MULTIPLE SEQUENCE BASED : Based on ALIGNMENT of several sequences -##- CRITERIA1: Get the segment in the alignment with no codon stop + ln = len(seq_aa) + CUTOFF_Last_50aa = ln - minimal_cds_length + # Find all indices of occurances of "M" in a string of aa + list_indices = [pos for pos, char in enumerate(seq_aa) if char == "M"] -###### DEF 4 - Part 1 - ###### -############################## -def simply_get_ORF(seq_dna, bash_codeUniversel): - seq_aa = "" - i = 0 - len1 = len(seq_dna) - while i < len1: - base1 = seq_dna[i] - base1 = string.capitalize(base1) - base2 = seq_dna[i+1] - base2 = string.capitalize(base2) - base3 = seq_dna[i+2] - base3 = string.capitalize(base3) + # If some "M" are present, find whether the first "M" found is not in the 50 last aa (indice < CUTOFF_Last_50aa) ==> in this case: maybenot a CDS + if list_indices != []: + first_M = list_indices[0] + if first_M < CUTOFF_Last_50aa: + Ortho = 1 # means orthologs found - codon = base1+base2+base3 - codon = string.replace(codon, "T", "U") + return(Ortho) + +def ReverseComplement2(seq): + """ Reverse complement DNA sequence """ + seq1 = 'ATCGN-TAGCN-atcgn-tagcn-' + seq_dict = { seq1[i]:seq1[i+6] for i in range(24) if i < 6 or 12<=i<=16 } - if codon in bash_codeUniversel.keys(): - aa = bash_codeUniversel[codon] - seq_aa = seq_aa + aa - else: - seq_aa = seq_aa +"?" ### Take account for gap "-" and "N" - i = i + 3 + return "".join([seq_dict[base] for base in reversed(seq)]) + +def simply_get_ORF(seq_dna, gen_code): + seq_by_codons = [seq_dna.upper().replace('T', 'U')[i:i+3] for i in range(0, len(seq_dna), 3)] + seq_by_aa = [gen_code[codon] if codon in gen_code.keys() else '?' for codon in seq_by_codons] - return(seq_aa) -########################################################## - + return ''.join(seq_by_aa) -###### DEF 4 - Part 2 - ###### -############################## -def find_good_ORF_criteria_3(bash_aligned_nc_seq, bash_codeUniversel): +def find_good_ORF_criteria_3(bash_aligned_nc_seq, bash_codeUniversel, minimal_cds_length, min_spec): + # Multiple sequence based : Based on the alignment of several sequences (orthogroup) + # Criteria 1 : Get the segment in the alignment with no codon stop - ## 1 ## Get the list of aligned aa seq for the 3 ORF: + # 1 - Get the list of aligned aa seq for the 3 ORF: bash_of_aligned_aa_seq_3ORF = {} bash_of_aligned_nuc_seq_3ORF = {} BEST_LONGUEST_SUBSEQUENCE_LIST_POSITION = [] + for fasta_name in bash_aligned_nc_seq.keys(): - ## 1.1. ## Get the raw sequence + # Get sequence, chek if multiple 3, then get 6 orfs sequence_nc = bash_aligned_nc_seq[fasta_name] - - ## 1.2. ## Check whether the sequence is multiple of 3, and correct it if not: - new_sequence_nc, modulo = multiple3(sequence_nc) ### DEF 3 ### - - ## 1.3. ## Get the 3 ORFs (nuc) for each sequence - seq_nuc_ORF1 = new_sequence_nc - seq_nuc_ORF2 = new_sequence_nc[1:-2] - seq_nuc_ORF3 = new_sequence_nc[2:-1] - seq_reversed=ReverseComplement2(seq_nuc_ORF1) - seq_nuc_ORF4=seq_reversed - seq_nuc_ORF5=seq_reversed[1:-2] - seq_nuc_ORF6=seq_reversed[2:-1] + new_sequence_nc, modulo = multiple3(sequence_nc) + new_sequence_rev = ReverseComplement2(new_sequence_nc) + # For each seq of the multialignment => give the 6 ORFs (in nuc) + bash_of_aligned_nuc_seq_3ORF[fasta_name] = [new_sequence_nc, new_sequence_nc[1:-2], new_sequence_nc[2:-1], new_sequence_rev, new_sequence_rev[1:-2], new_sequence_rev[2:-1]] - LIST_6_ORF_nuc = [seq_nuc_ORF1, seq_nuc_ORF2, seq_nuc_ORF3,seq_nuc_ORF4,seq_nuc_ORF5,seq_nuc_ORF6] - bash_of_aligned_nuc_seq_3ORF[fasta_name] = LIST_6_ORF_nuc ### For each seq of the multialignment => give the 6 ORFs (in nuc) + seq_prot_ORF1 = simply_get_ORF(new_sequence_nc, bash_codeUniversel) + seq_prot_ORF2 = simply_get_ORF(new_sequence_nc[1:-2], bash_codeUniversel) + seq_prot_ORF3 = simply_get_ORF(new_sequence_nc[2:-1], bash_codeUniversel) + seq_prot_ORF4 = simply_get_ORF(new_sequence_rev, bash_codeUniversel) + seq_prot_ORF5 = simply_get_ORF(new_sequence_rev[1:-2], bash_codeUniversel) + seq_prot_ORF6 = simply_get_ORF(new_sequence_rev[2:-1], bash_codeUniversel) + + # For each seq of the multialignment => give the 6 ORFs (in aa) + bash_of_aligned_aa_seq_3ORF[fasta_name] = [seq_prot_ORF1, seq_prot_ORF2, seq_prot_ORF3, seq_prot_ORF4, seq_prot_ORF5, seq_prot_ORF6] - ## 1.4. ## Get the 3 ORFs (aa) for each sequence - seq_prot_ORF1 = simply_get_ORF(seq_nuc_ORF1,bash_codeUniversel) ### DEF 4 - Part 1 - ## - seq_prot_ORF2 = simply_get_ORF(seq_nuc_ORF2,bash_codeUniversel) ### DEF 4 - Part 1 - ## - seq_prot_ORF3 = simply_get_ORF(seq_nuc_ORF3,bash_codeUniversel) ### DEF 4 - Part 1 - ## - seq_prot_ORF4 = simply_get_ORF(seq_nuc_ORF4,bash_codeUniversel) ### DEF 4 - Part 1 - ## - seq_prot_ORF5 = simply_get_ORF(seq_nuc_ORF5,bash_codeUniversel) ### DEF 4 - Part 1 - ## - seq_prot_ORF6 = simply_get_ORF(seq_nuc_ORF6,bash_codeUniversel) ### DEF 4 - Part 1 - ## + # 2 - Test for the best ORF (Get the longuest segment in the alignment with no codon stop ... for each ORF ... the longuest should give the ORF) + BEST_MAX = 0 - LIST_6_ORF_aa = [seq_prot_ORF1, seq_prot_ORF2, seq_prot_ORF3,seq_prot_ORF4,seq_prot_ORF5,seq_prot_ORF6] - bash_of_aligned_aa_seq_3ORF[fasta_name] = LIST_6_ORF_aa ### For each seq of the multialignment => give the 6 ORFs (in aa) - - ## 2 ## Test for the best ORF (Get the longuest segment in the alignment with no codon stop ... for each ORF ... the longuest should give the ORF) - BEST_MAX = 0 - for i in [0,1,2,3,4,5]: ### Test the 6 ORFs + for i in [0,1,2,3,4,5]: # Test the 6 ORFs ORF_Aligned_aa = [] ORF_Aligned_nuc = [] - - ## 2.1 ## Get the alignment of sequence for a given ORF - ## Compare the 1rst ORF between all sequence => list them in ORF_Aligned_aa // them do the same for the second ORF, and them the 3rd + # 2.1 - Get the alignment of sequence for a given ORF + # Compare the 1rst ORF between all sequence => list them in ORF_Aligned_aa // them do the same for the second ORF, and them the 3rd for fasta_name in bash_of_aligned_aa_seq_3ORF.keys(): ORFsequence = bash_of_aligned_aa_seq_3ORF[fasta_name][i] aa_length = len(ORFsequence) @@ -145,12 +119,12 @@ for fasta_name in bash_of_aligned_nuc_seq_3ORF.keys(): ORFsequence = bash_of_aligned_nuc_seq_3ORF[fasta_name][i] nuc_length = len(ORFsequence) - ORF_Aligned_nuc.append(ORFsequence) ### List of all sequences in the ORF nb "i" = + ORF_Aligned_nuc.append(ORFsequence) # List of all sequences in the ORF nb "i" = - ## 2.2 ## Get the list of sublist of positions whithout codon stop in the alignment - ## For each ORF, now we have the list of sequences available (i.e. THE ALIGNMENT IN A GIVEN ORF) - ## Next step is to get the longuest subsequence whithout stop - ## We will explore the presence of stop "*" in each column of the alignment, and get the positions of the segments between the positions with "*" + # 2.2 - Get the list of sublist of positions whithout codon stop in the alignment + # For each ORF, now we have the list of sequences available (i.e. THE ALIGNMENT IN A GIVEN ORF) + # Next step is to get the longuest subsequence whithout stop + # We will explore the presence of stop "*" in each column of the alignment, and get the positions of the segments between the positions with "*" MAX_LENGTH = 0 LONGUEST_SEGMENT_UNSTOPPED = "" j = 0 # Start from first position in alignment @@ -162,20 +136,20 @@ column.append(seq[j]) j = j+1 if "*" in column: - List_of_List_subsequences.append(List_positions_subsequence) ## Add previous list of positions - List_positions_subsequence = [] ## Re-initialyse list of positions + List_of_List_subsequences.append(List_positions_subsequence) # Add previous list of positions + List_positions_subsequence = [] # Re-initialyse list of positions else: List_positions_subsequence.append(j) - ## 2.3 ## Among all the sublists (separated by column with codon stop "*"), get the longuest one (BETTER SEGMENT for a given ORF) + # 2.3 - Among all the sublists (separated by column with codon stop "*"), get the longuest one (BETTER SEGMENT for a given ORF) LONGUEST_SUBSEQUENCE_LIST_POSITION = [] MAX=0 for sublist in List_of_List_subsequences: - if len(sublist) > MAX and len(sublist) > MINIMAL_CDS_LENGTH: + if len(sublist) > MAX and len(sublist) > minimal_cds_length: MAX = len(sublist) LONGUEST_SUBSEQUENCE_LIST_POSITION = sublist - ## 2.4. ## Test if the longuest subsequence start exactly at the beginning of the original sequence (i.e. means the ORF maybe truncated) + # 2.4. - Test if the longuest subsequence start exactly at the beginning of the original sequence (i.e. means the ORF maybe truncated) if LONGUEST_SUBSEQUENCE_LIST_POSITION != []: if LONGUEST_SUBSEQUENCE_LIST_POSITION[0] == 0: CDS_maybe_truncated = 1 @@ -185,17 +159,17 @@ CDS_maybe_truncated = 0 - ## 2.5 ## Test if this BETTER SEGMENT for a given ORF, is the better than the one for the other ORF (GET THE BEST ORF) - ## Test whether it is the better ORF + # 2.5 - Test if this BETTER SEGMENT for a given ORF, is the better than the one for the other ORF (GET THE BEST ORF) + # Test whether it is the better ORF if MAX > BEST_MAX: BEST_MAX = MAX BEST_ORF = i+1 BEST_LONGUEST_SUBSEQUENCE_LIST_POSITION = LONGUEST_SUBSEQUENCE_LIST_POSITION - ## 3 ## ONCE we have this better segment (BEST CODING SEGMENT) - ## ==> GET THE STARTING and ENDING POSITIONS (in aa position and in nuc position) - ## And get the INDEX of the best ORF [0, 1, or 2] + # 3 - ONCE we have this better segment (BEST CODING SEGMENT) + # ==> GET THE STARTING and ENDING POSITIONS (in aa position and in nuc position) + # And get the INDEX of the best ORF [0, 1, or 2] if BEST_LONGUEST_SUBSEQUENCE_LIST_POSITION != []: pos_MIN_aa = BEST_LONGUEST_SUBSEQUENCE_LIST_POSITION[0] pos_MIN_aa = pos_MIN_aa - 1 @@ -205,13 +179,13 @@ BESTORF_bash_of_aligned_aa_seq = {} BESTORF_bash_of_aligned_aa_seq_CODING = {} for fasta_name in bash_of_aligned_aa_seq_3ORF.keys(): - index_BEST_ORF = BEST_ORF-1 ### cause list going from 0 to 2 in LIST_3_ORF, while the ORF nb is indexed from 1 to 3 + index_BEST_ORF = BEST_ORF-1 # cause list going from 0 to 2 in LIST_3_ORF, while the ORF nb is indexed from 1 to 3 seq = bash_of_aligned_aa_seq_3ORF[fasta_name][index_BEST_ORF] seq_coding = seq[pos_MIN_aa:pos_MAX_aa] BESTORF_bash_of_aligned_aa_seq[fasta_name] = seq BESTORF_bash_of_aligned_aa_seq_CODING[fasta_name] = seq_coding - ## 4 ## Get the corresponding position (START/END of BEST CODING SEGMENT) for nucleotides alignment + # 4 - Get the corresponding position (START/END of BEST CODING SEGMENT) for nucleotides alignment pos_MIN_nuc = pos_MIN_aa * 3 pos_MAX_nuc = pos_MAX_aa * 3 @@ -223,224 +197,122 @@ BESTORF_bash_aligned_nc_seq[fasta_name] = seq BESTORF_bash_aligned_nc_seq_CODING[fasta_name] = seq_coding - else: ### no CDS found ### + else: # no CDS found BESTORF_bash_aligned_nc_seq = {} BESTORF_bash_aligned_nc_seq_CODING = {} BESTORF_bash_of_aligned_aa_seq = {} BESTORF_bash_of_aligned_aa_seq_CODING ={} - - - ### Check whether their is a "M" or not, and if at least 1 "M" is present, that it is not in the last 50 aa - ########################################################################################################### - + # Check whether their is a "M" or not, and if at least 1 "M" is present, that it is not in the last 50 aa + BESTORF_bash_of_aligned_aa_seq_CDS_with_M = {} BESTORF_bash_of_aligned_nuc_seq_CDS_with_M = {} Ortho = 0 for fasta_name in BESTORF_bash_of_aligned_aa_seq_CODING.keys(): seq_aa = BESTORF_bash_of_aligned_aa_seq_CODING[fasta_name] - Ortho = detect_Methionine(seq_aa, Ortho) ### DEF6 ### + Ortho = detect_Methionine(seq_aa, Ortho, minimal_cds_length) ### DEF6 ### - ## CASE 1: A "M" is present and correctly localized (not in last 50 aa) + # CASE 1: A "M" is present and correctly localized (not in last 50 aa) if Ortho == 1: BESTORF_bash_of_aligned_aa_seq_CDS_with_M = BESTORF_bash_of_aligned_aa_seq_CODING BESTORF_bash_of_aligned_nuc_seq_CDS_with_M = BESTORF_bash_aligned_nc_seq_CODING - ## CASE 2: in case the CDS is truncated, so the "M" is maybe missing: + # CASE 2: in case the CDS is truncated, so the "M" is maybe missing: if Ortho == 0 and CDS_maybe_truncated == 1: BESTORF_bash_of_aligned_aa_seq_CDS_with_M = BESTORF_bash_of_aligned_aa_seq_CODING BESTORF_bash_of_aligned_nuc_seq_CDS_with_M = BESTORF_bash_aligned_nc_seq_CODING - ## CASE 3: CDS not truncated AND no "M" found in good position (i.e. before the last 50 aa): + # CASE 3: CDS not truncated AND no "M" found in good position (i.e. before the last 50 aa): ## => the 2 bash "CDS_with_M" are left empty ("{}") return(BESTORF_bash_aligned_nc_seq, BESTORF_bash_aligned_nc_seq_CODING, BESTORF_bash_of_aligned_nuc_seq_CDS_with_M, BESTORF_bash_of_aligned_aa_seq, BESTORF_bash_of_aligned_aa_seq_CODING, BESTORF_bash_of_aligned_aa_seq_CDS_with_M) -########################################################## - -################################################################################################## -###### DEF 5 : Detect all indices corresponding to all occurance of a substring in a string ###### -################################################################################################## -def allindices(string, sub): - listindex=[] - offset=0 - i = string.find(sub, offset) - while i >= 0: - listindex.append(i) - i = string.find(sub, i + 1) - return listindex -###################################################### - +def write_output_file(results_dict, name_elems, path_out): + if results_dict != {}: + name_elems[3] = str(len(results_dict.keys())) + new_name = "_".join(name_elems) -############################################################ -###### DEF 6 : Detect if methionin in the aa sequence ###### -############################################################ -def detect_Methionine(seq_aa, Ortho): - - ln = len(seq_aa) - nbre = sys.argv[2] - CUTOFF_Last_50aa = ln - MINIMAL_CDS_LENGTH - #Ortho = 0 ## means orthologs not found + out1 = open("%s/%s" %(path_out,new_name), "w") + for fasta_name in results_dict.keys(): + seq = results_dict[fasta_name] + out1.write("%s\n" %fasta_name) + out1.write("%s\n" %seq) + out1.close() - ## Find all indices of occurances of "M" in a string of aa - list_indices = allindices(seq_aa, "M") ### DEF5 ### - - ## If some "M" are present, find whether the first "M" found is not in the 50 last aa (indice < CUTOFF_Last_50aa) ==> in this case: maybenot a CDS - if list_indices != []: - first_M = list_indices[0] - if first_M < CUTOFF_Last_50aa: - Ortho = 1 ## means orthologs found - - return(Ortho) -################################### - +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("codeUniversel", help="File describing the genetic code (code_universel_modified.txt") + parser.add_argument("min_cds_len", help="Minmal length of a CDS (in amino-acids)", type=int) + parser.add_argument("min_spec", help="Minimal number of species per alignment") + parser.add_argument("list_files", help="File with all input files names") + args = parser.parse_args() -############################################################ -###### DEF 7 : Reverse complement DNA sequence ###### -###### Reference: http://crazyhottommy.blogspot.fr/2013/10/python-code-for-getting-reverse.html -############################################################ -def ReverseComplement2(seq): - # too lazy to construct the dictionary manually, use a dict comprehension - seq1 = 'ATCGN-TAGCN-atcgn-tagcn-' - seq_dict = { seq1[i]:seq1[i+6] for i in range(24) if i < 6 or 12<=i<=16 } - return "".join([seq_dict[base] for base in reversed(seq)]) -############################ - - -####################### -##### RUN RUN RUN ##### -####################### -import string, os, time, re, zipfile, sys -from dico import dico - -MINIMAL_CDS_LENGTH = int(sys.argv[2]) ## in aa number - -### Get Universal Code -bash_codeUniversel = code_universel(sys.argv[1]) ### DEF2 ### + minimal_cds_length = int(args.min_cds_len) # in aa number + bash_codeUniversel = code_universel(args.codeUniversel) + minimum_species = int(args.min_spec) + + # Inputs from file containing list of species + list_files = [] + with open(args.list_files, 'r') as f: + for line in f.readlines(): + list_files.append(line.strip('\n')) -## INPUT from file containing list of species -list_files = [] -with open(sys.argv[3], 'r') as f: - for line in f.readlines(): - list_files.append(line.strip('\n')) - -os.mkdir("04_BEST_ORF_nuc") -Path_OUT1 = "04_BEST_ORF_nuc" -os.mkdir("04_BEST_ORF_aa") -Path_OUT2 = "04_BEST_ORF_aa" + # Directories for results + dirs = ["04_BEST_ORF_nuc", "04_BEST_ORF_aa", "05_CDS_nuc", "05_CDS_aa", "06_CDS_with_M_nuc", "06_CDS_with_M_aa"] + for directory in dirs: + os.mkdir(directory) -os.mkdir("05_CDS_nuc") -Path_OUT3 = "05_CDS_nuc" -os.mkdir("05_CDS_aa") -Path_OUT4 = "05_CDS_aa" + count_file_processed, count_file_with_CDS, count_file_without_CDS, count_file_with_CDS_plus_M = 0, 0, 0, 0 + count_file_with_cds_and_enought_species, count_file_with_cds_M_and_enought_species = 0, 0 -os.mkdir("06_CDS_with_M_nuc") -Path_OUT5 = "06_CDS_with_M_nuc" -os.mkdir("06_CDS_with_M_aa") -Path_OUT6 = "06_CDS_with_M_aa" - -### Get the Bash corresponding to an alignment file in fasta format -count_file_processed = 0 -count_file_with_CDS = 0 -count_file_without_CDS = 0 -count_file_with_CDS_plus_M = 0 - -# ! : Currently, files are named "Orthogroup_x_y_sequences.fasta, where x is the number of the orthogroup (not important, juste here to make a distinct name), -# and y is the number of sequences/species in the group. These files are outputs of blastalign, where species can be removed. y is then modified. + # ! : Currently, files are named "Orthogroup_x_y_sequences.fasta, where x is the number of the orthogroup (not important, juste here to make a distinct name), + # and y is the number of sequences/species in the group. These files are outputs of blastalign, where species can be removed. y is then modified. + name_elems = ["orthogroup", "0", "with", "0", "species.fasta"] -name_elems = ["orthogroup", "0", "with", "0", "species.fasta"] - -# by fixing the counter here, there will be some "holes" in the outputs directories (missing numbers), but the groups between directories will correspond -#n0 = 0 - -for file in list_files: - #n0 += 1 + # by fixing the counter here, there will be some "holes" in the outputs directories (missing numbers), but the groups between directories will correspond + #n0 = 0 - count_file_processed = count_file_processed + 1 - nb_gp = file.split('_')[1] # Keep trace of the orthogroup number - fasta_file_path = "./%s" %file - bash_fasta = dico(fasta_file_path) ### DEF 1 ### - BESTORF_nuc, BESTORF_nuc_CODING, BESTORF_nuc_CDS_with_M, BESTORF_aa, BESTORF_aa_CODING, BESTORF_aa_CDS_with_M = find_good_ORF_criteria_3(bash_fasta, bash_codeUniversel) ### DEF 4 - PART 2 - ### - - name_elems[1] = nb_gp + for file in list_files: + #n0 += 1 - ## a ## OUTPUT BESTORF_nuc - if BESTORF_nuc != {}: - name_elems[3] = str(len(BESTORF_nuc.keys())) - new_name = "_".join(name_elems) - count_file_with_CDS = count_file_with_CDS +1 - OUT1 = open("%s/%s" %(Path_OUT1,new_name), "w") - for fasta_name in BESTORF_nuc.keys(): - seq = BESTORF_nuc[fasta_name] - OUT1.write("%s\n" %fasta_name) - OUT1.write("%s\n" %seq) - OUT1.close() - else: - count_file_without_CDS = count_file_without_CDS + 1 - - ## b ## OUTPUT BESTORF_nuc_CODING ===> THE MOST INTERESTING!!! - if BESTORF_aa != {}: - name_elems[3] = str(len(BESTORF_aa.keys())) - new_name = "_".join(name_elems) - OUT2 = open("%s/%s" %(Path_OUT2,new_name), "w") - for fasta_name in BESTORF_aa.keys(): - seq = BESTORF_aa[fasta_name] - OUT2.write("%s\n" %fasta_name) - OUT2.write("%s\n" %seq) - OUT2.close() + count_file_processed = count_file_processed + 1 + nb_gp = file.split('_')[1] # Keep trace of the orthogroup number + fasta_file_path = "./%s" %file + bash_fasta = dico(fasta_file_path) + BESTORF_nuc, BESTORF_nuc_CODING, BESTORF_nuc_CDS_with_M, BESTORF_aa, BESTORF_aa_CODING, BESTORF_aa_CDS_with_M = find_good_ORF_criteria_3(bash_fasta, bash_codeUniversel, minimal_cds_length, minimum_species) + + name_elems[1] = nb_gp - ## c ## OUTPUT BESTORF_aa - if BESTORF_nuc_CODING != {}: - name_elems[3] = str(len(BESTORF_nuc_CODING.keys())) - new_name = "_".join(name_elems) - OUT3 = open("%s/%s" %(Path_OUT3,new_name), "w") - for fasta_name in BESTORF_nuc_CODING.keys(): - seq = BESTORF_nuc_CODING[fasta_name] - OUT3.write("%s\n" %fasta_name) - OUT3.write("%s\n" %seq) - OUT3.close() + # Update counts and write group in corresponding output directory + if BESTORF_nuc != {}: + count_file_with_CDS += 1 + if len(BESTORF_nuc.keys()) >= minimum_species : + count_file_with_cds_and_enought_species += 1 + write_output_file(BESTORF_nuc, name_elems, dirs[0]) # OUTPUT BESTORF_nuc + write_output_file(BESTORF_aa, name_elems, dirs[1]) # The most interesting + else: + count_file_without_CDS += 1 - ## d ## OUTPUT BESTORF_aa_CODING - if BESTORF_aa_CODING != {}: - name_elems[3] = str(len(BESTORF_aa_CODING.keys())) - new_name = "_".join(name_elems) - OUT4 = open("%s/%s" %(Path_OUT4,new_name), "w") - for fasta_name in BESTORF_aa_CODING.keys(): - seq = BESTORF_aa_CODING[fasta_name] - OUT4.write("%s\n" %fasta_name) - OUT4.write("%s\n" %seq) - OUT4.close() + if BESTORF_nuc_CODING != {} and len(BESTORF_nuc_CODING.keys()) >= minimum_species: + write_output_file(BESTORF_nuc_CODING, name_elems, dirs[2]) + write_output_file(BESTORF_aa_CODING, name_elems, dirs[3]) - ## e ## OUTPUT BESTORF_nuc_CDS_with_M - if BESTORF_nuc_CDS_with_M != {}: - name_elems[3] = str(len(BESTORF_nuc_CDS_with_M.keys())) - new_name = "_".join(name_elems) - count_file_with_CDS_plus_M = count_file_with_CDS_plus_M + 1 - OUT5 = open("%s/%s" %(Path_OUT5,new_name), "w") - for fasta_name in BESTORF_nuc_CDS_with_M.keys(): - seq = BESTORF_nuc_CDS_with_M[fasta_name] - OUT5.write("%s\n" %fasta_name) - OUT5.write("%s\n" %seq) - OUT5.close() + if BESTORF_nuc_CDS_with_M != {}: + count_file_with_CDS_plus_M += 1 + if len(BESTORF_nuc_CDS_with_M.keys()) >= minimum_species : + count_file_with_cds_M_and_enought_species += 1 + write_output_file(BESTORF_nuc_CDS_with_M, name_elems, dirs[4]) + write_output_file(BESTORF_aa_CDS_with_M, name_elems, dirs[5]) - ## f ## OUTPUT BESTORF_aa_CDS_with_M - if BESTORF_aa_CDS_with_M != {}: - name_elems[3] = str(len(BESTORF_aa_CDS_with_M.keys())) - new_name = "_".join(name_elems) - OUT6 = open("%s/%s" %(Path_OUT6,new_name), "w") - for fasta_name in BESTORF_aa_CDS_with_M.keys(): - seq = BESTORF_aa_CDS_with_M[fasta_name] - OUT6.write("%s\n" %fasta_name) - OUT6.write("%s\n" %seq) - OUT6.close() + print "*************** CDS detection ***************" + print "\nFiles processed: %d" %count_file_processed + print "\tFiles with CDS: %d" %count_file_with_CDS + print "\tFiles wth CDS and more than %s species: %d" %(minimum_species, count_file_with_cds_and_enought_species) + print "\t\tFiles with CDS plus M (codon start): %d" %count_file_with_CDS_plus_M + print "\t\tFiles with CDS plus M (codon start) and more than %s species: %d" %(minimum_species,count_file_with_cds_M_and_enought_species) + print "\tFiles without CDS: %d \n" %count_file_without_CDS + print "" - #os.system("rm -rf %s" %file) - -## Print -print "*************** CDS detection ***************" -print "\nFiles processed: %d" %count_file_processed -print "\tFiles with CDS: %d" %count_file_with_CDS -print "\t\tFiles with CDS plus M (codon start): %d" %count_file_with_CDS_plus_M -print "\tFiles without CDS: %d \n" %count_file_without_CDS -print "" +if __name__ == '__main__': + main()
--- a/scripts/S02_remove_too_short_bit_or_whole_sequence.py Fri Jul 06 02:52:15 2018 -0400 +++ b/scripts/S02_remove_too_short_bit_or_whole_sequence.py Mon Sep 24 03:58:34 2018 -0400 @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/env python # coding: utf8 ## Author: Eric Fontanillas ## Modification: 03/09/14 by Julie BAFFARD @@ -48,7 +48,6 @@ MAX_LENGTH_SMALL_INDEL = 2 ## in aa MAX_LENGTH_SMALL_INDEL_nuc = 6 ## in nuc MIN_SPECIES_NB = int(sys.argv[1]) -MAX_sp = MIN_SPECIES_NB dicoco = {} dico_dico = {} list_new_file = []
--- a/scripts/S03_remove_site_with_not_enough_species_represented.py Fri Jul 06 02:52:15 2018 -0400 +++ b/scripts/S03_remove_site_with_not_enough_species_represented.py Mon Sep 24 03:58:34 2018 -0400 @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/env python # coding: utf8 ## Author: Eric Fontanillas ## Modification: 03/09/14 by Julie BAFFARD @@ -100,7 +100,6 @@ ### 0 ### PARAMETERS MIN_SPECIES_NB = int(sys.argv[1]) -MAX_sp = MIN_SPECIES_NB MIN_LENGTH_FINAL_ALIGNMENT_NUC = int(sys.argv[2]) n0 = 0 bad = 0
--- a/test-data/outputs_ORF_Search_04_Best_ORF_aa/test1/orthogroup_14_with_2_species.fasta Fri Jul 06 02:52:15 2018 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,4 +0,0 @@ ->Ac6688_1/1_1.000_963 -RVAAMDTCQTRIRLSVYPANSVWPSADHARDIHWGGSALALVLITSGRNSSTTTLPSRSQIFMLGPEAAQSQYLLGLKHSELMTSPPSNVYRCLPSFKSHNIAWPSLPPEAHKDPSGDTVTQFR*PECPKWFVFSLQFVRFQTLTSLSQPQDTMMGF*VFGEKRTHDTHSVCPSS*MVYLHTPRVFHNLMVLSREPDTIWRLSAEKATLSTSFVCPTNLRVV*PVPRSHKRRVPSQDPDRANCPSDDITTSDTKCPCPRKALRGKP*RDSSRVSCHRMSDLSLEADKIISGNCGVVAI*VTHPPCPWRVPRSVICSVMFL ->Ap1491_1/1_1.000_963 -RVAAIDTCHTRIRLSVYPANSVWPSADQARDIH*GGSALALVLITSGRSSSTTTLPSRSQIFMLGPEAAQSQYLLGLKHSELMTSPPSNVYRCLPSFKSHNIAWPSFPPEAHKDPSGDTVTQFR*PECPKWFVFSLQFVRFHTLTSLSQPQDTMMGFWVFGENRTHDTHSVCPSS*MVYLHTPRVFHNLMVLSREPDTI*RLSAEKATLSTSFVCPTNLRVV*PVPRSHKRRVPSQDPDRANCPSDDMTTSDTKCPCPRKALRGKP*RDSSRVSCHRMSDLSLEADKMISGNCGVVAIWVTHPPCPWRVPRSVICSVMFL
--- a/test-data/outputs_ORF_Search_04_Best_ORF_aa/test1/orthogroup_6_with_2_species.fasta Fri Jul 06 02:52:15 2018 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,4 +0,0 @@ ->Ap5072_1/1_1.000_437 -PNRCHCSRS*HFAAS*RRSGIRCS*SVRCGPCRFRGRSERARLD*GRCSSPGRWR*SRRCPSFRLCFCPPARKPAAWRPLCGRHPGLDPSRSGILEGIFSCRRLYQTV*IGP*AVCP*PCSPWPSGPFSAP*RS ->Ac1013_1/1_1.000_525 -PSRCRCSQS*HFAAS*RRSRIRRS*SVRCGPCRWRGRS*RARLDQGRCSSLGKWR*PRRCPSFRLCFCPPARKPAAWLPLCGRHPGLGPSRSSILEGLFSYRHLYQTV*IEP*VVCP*PCSLWP*GLFSAP*RS
--- a/test-data/outputs_ORF_Search_04_Best_ORF_nuc/test1/orthogroup_14_with_2_species.fasta Fri Jul 06 02:52:15 2018 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,4 +0,0 @@ ->Ac6688_1/1_1.000_963 -cgggtagctgccatggacacctgccagacacggatcaggttgtctgtgtatcctgcaaacagcgtctggccatcagctgaccatgccagggatatacactggggtggttcagcgctggcactggtactgatcacttctggacgcaactcatccacaacaaccttgccttccagatcccagatctttatgcttggtccagaggcagcacaaagccaatatctgttggggctgaagcacagtgagttgatgacatcaccaccatccaatgtgtacagatgcttgccttcattcaaatcccacaacattgcctggccatccttaccaccagaagcacacaaagatccatcaggggacacggtgacacagttcaggtaacctgagtgtccaaagtggtttgtttttagtttgcagtttgtcagattccaaaccttaaccagtttgtcccagccacaagacacaatgatgggattctgagtgtttggtgagaagcgaacacatgatacccactctgtgtgtccatcttcctgaatggtgtacttgcatacaccgagagtgttccacaacttgatggtcttgtcacgtgaacctgacacaatctggcggttatcagctgagaaggccacgcttagcacatcttttgtgtgtccgacaaacctacgagttgtctgaccagtgccaagatcccacaaacgaagggttccatcccaggatccagacagagcgaactgtccatctgatgacataacaacgtcagacacgaaatgtccatgtccacgcaaggccttgcgagggaaaccgtagcgcgattcctcacgagtcagctgccacaggatgagcgatttgtctcttgaagccgacaaaataatatcgggaaattgtggcgttgtagcaatttgagttacccatcctccgtgcccttggagggtaccgcgaagcgtcatttgctccgtcatgtttctt ->Ap1491_1/1_1.000_963 -cgggtagctgccatagatacctgccacacacgaatcaggttgtctgtgtatccagcaaacagtgtctggccatcagctgaccaagccagggatatacactgaggtggctcggcactggcactggtgctgatcacttctggacgcagctcatccacaacaaccttgccttccagatcccagatctttatgcttggtccagaagcagcacaaagccagtatctgttggggctgaagcacagtgagttgatgacatcaccaccatccaatgtgtacagatgcttgccttcattcaaatcccataacattgcctggccatcttttccaccagaagcgcacaaagatccatcaggggacacagtgacacagttcagataacctgagtgtccgaagtggtttgtttttagcttgcagtttgtcagattccacaccttaaccagtttgtcccagccacaggacacaatgatgggattctgggtgtttggtgagaatcgaacacatgatacccactctgtgtgcccatcttcctgaatggtatacttgcacaccccaagagtgttccacaacttgatggtcttgtcacgtgaacctgacacaatctgacggttatcagctgagaaagccacacttagcacgtccttcgtgtgtccaacaaacctacgagttgtctgaccagtgccaagatcccacaaacgaagggttccatcccaggatccagacagggcgaactgtccatctgatgacatgacgacgtcagacacgaagtgtccatgtccgcgcaaggccttgcgagggaagccgtaacgcgattcctcgcgagtcagctgccacagaatgagcgatttgtctctagaagccgacaaaatgatatcaggaaattgtggcgttgtagcaatttgggttacccatcctccgtgcccttggagggtaccgcgaagcgtcatttgctccgtcatgtttctt
--- a/test-data/outputs_ORF_Search_04_Best_ORF_nuc/test1/orthogroup_6_with_2_species.fasta Fri Jul 06 02:52:15 2018 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,4 +0,0 @@ ->Ap5072_1/1_1.000_437 -cccaatcgctgtcactgctctcgaagttgacattttgctgcttcatgaagaaggtcaggaattcgttgtagttgatccgtccgctgtggtccttgtcgatttcgtggacgatctgaaagagctcgtctggattgaggaaggtgttcatctcccggaagatggcgttgatctcgccgatgtccaagcttccgtctttgtttttgtccgcccgctcgaaagcctgccgcatggcggcctctttgtggtagacatccgggacttgacccatcgcgatcaggtattcttgaagggatattttcttgtcgccgtctttatcaaactgtttgaattgggccttgagctgtttgcccttgaccttgtagcccttggccttcagggcctttttcagctccttgaaggtca ->Ac1013_1/1_1.000_525 -cccagtcgctgtcgctgctctcaaagttgacattttgctgcttcatgaagaaggtcaagaattcgtcgtagttgatccgtccgctgtggtccttgtcgatggcgtggacgatcttgaagagctcgtctggatcaaggaaggtgttcatctcttggaaaatggcgttaacctcgccgatgtccaagcttccgtctttgtttttgtccgcccgctcgaaagcctgccgcatggctgcctctttgtggtaggcatccgggacttggcccatcgcgatcaagtattcttgaagggttattttcttatcgccatctttatcaaactgtttgaattgagccttgagttgtttgcccttgaccttgtagcctttggccttgagggcttttttcagctccttgaaggtca
--- a/test-data/outputs_ORF_Search_05_CDS_aa/test1/orthogroup_14_with_2_species.fasta Fri Jul 06 02:52:15 2018 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,4 +0,0 @@ ->Ac6688_1/1_1.000_963 -GGSALALVLITSGRNSSTTTLPSRSQIFMLGPEAAQSQYLLGLKHSELMTSPPSNVYRCLPSFKSHNIAWPSLPPEAHKDPSGDTVTQFR ->Ap1491_1/1_1.000_963 -GGSALALVLITSGRSSSTTTLPSRSQIFMLGPEAAQSQYLLGLKHSELMTSPPSNVYRCLPSFKSHNIAWPSFPPEAHKDPSGDTVTQFR
--- a/test-data/outputs_ORF_Search_05_CDS_aa/test1/orthogroup_6_with_2_species.fasta Fri Jul 06 02:52:15 2018 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,4 +0,0 @@ ->Ap5072_1/1_1.000_437 -SRRCPSFRLCFCPPARKPAAWRPLCGRHPGLDPSRSGILEGIFSCRRLYQTV ->Ac1013_1/1_1.000_525 -PRRCPSFRLCFCPPARKPAAWLPLCGRHPGLGPSRSSILEGLFSYRHLYQTV
--- a/test-data/outputs_ORF_Search_05_CDS_nuc/test1/orthogroup_14_with_2_species.fasta Fri Jul 06 02:52:15 2018 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,4 +0,0 @@ ->Ac6688_1/1_1.000_963 -ggtggttcagcgctggcactggtactgatcacttctggacgcaactcatccacaacaaccttgccttccagatcccagatctttatgcttggtccagaggcagcacaaagccaatatctgttggggctgaagcacagtgagttgatgacatcaccaccatccaatgtgtacagatgcttgccttcattcaaatcccacaacattgcctggccatccttaccaccagaagcacacaaagatccatcaggggacacggtgacacagttcagg ->Ap1491_1/1_1.000_963 -ggtggctcggcactggcactggtgctgatcacttctggacgcagctcatccacaacaaccttgccttccagatcccagatctttatgcttggtccagaagcagcacaaagccagtatctgttggggctgaagcacagtgagttgatgacatcaccaccatccaatgtgtacagatgcttgccttcattcaaatcccataacattgcctggccatcttttccaccagaagcgcacaaagatccatcaggggacacagtgacacagttcaga
--- a/test-data/outputs_ORF_Search_05_CDS_nuc/test1/orthogroup_6_with_2_species.fasta Fri Jul 06 02:52:15 2018 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,4 +0,0 @@ ->Ap5072_1/1_1.000_437 -tctcgccgatgtccaagcttccgtctttgtttttgtccgcccgctcgaaagcctgccgcatggcggcctctttgtggtagacatccgggacttgacccatcgcgatcaggtattcttgaagggatattttcttgtcgccgtctttatcaaactgtt ->Ac1013_1/1_1.000_525 -cctcgccgatgtccaagcttccgtctttgtttttgtccgcccgctcgaaagcctgccgcatggctgcctctttgtggtaggcatccgggacttggcccatcgcgatcaagtattcttgaagggttattttcttatcgccatctttatcaaactgtt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/outputs_ORF_Search_08_CDS_without_indel_aa/test1/old_orthogroup_7_with_3_species.fasta Mon Sep 24 03:58:34 2018 -0400 @@ -0,0 +1,6 @@ +>Am3527_1/1_1.000_270 +-------------------VSVVDVKWHVRSRYSKLLPFLALSDHTCGKTT---------------- +>Ap5050_1/1_1.000_243 +EIWHSQARTKKEDEPRHQFVSVVHVERHVRSGKTPLLPFFTPSNHTCGKSTSKLIKTKSSLQHFHSF +>Ac2173_1/1_1.000_330 +EIWHSQARTKKEDEPSHQLVSVVHIERHVRSGKTPLLPFFTSPNHTCGKSTSKLVKTKSSLQHFQRF
--- a/test-data/outputs_ORF_Search_08_CDS_without_indel_aa/test1/orthogroup_14_with_2_species.fasta Fri Jul 06 02:52:15 2018 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,4 +0,0 @@ ->Ac6688_1/1_1.000_963 -GGSALALVLITSGRNSSTTTLPSRSQIFMLGPEAAQSQYLLGLKHSELMTSPPSNVYRCLPSFKSHNIAWPSLPPEAHKDPSGDTVTQFR ->Ap1491_1/1_1.000_963 -GGSALALVLITSGRSSSTTTLPSRSQIFMLGPEAAQSQYLLGLKHSELMTSPPSNVYRCLPSFKSHNIAWPSFPPEAHKDPSGDTVTQFR
--- a/test-data/outputs_ORF_Search_08_CDS_without_indel_aa/test1/orthogroup_1_with_2_species.fasta Fri Jul 06 02:52:15 2018 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,4 +0,0 @@ ->Ap2303_1/1_1.000_424 -FNLRYKIIWICVHFSTVYCLFIWIHSLLLDNFLINRISEFHFSVKWVLLVILN ->Ac3644_1/1_1.000_1626 -FNLRYKIIWICVHFSTVNCLFIWIHSLLLNNFLINSISEFHFSVKWVLLVIFN
--- a/test-data/outputs_ORF_Search_08_CDS_without_indel_aa/test1/orthogroup_6_with_2_species.fasta Fri Jul 06 02:52:15 2018 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,4 +0,0 @@ ->Ap5072_1/1_1.000_437 -SRRCPSFRLCFCPPARKPAAWRPLCGRHPGLDPSRSGILEGIFSCRRLYQTV ->Ac1013_1/1_1.000_525 -PRRCPSFRLCFCPPARKPAAWLPLCGRHPGLGPSRSSILEGLFSYRHLYQTV
--- a/test-data/outputs_ORF_Search_08_CDS_without_indel_aa/test1/orthogroup_7_with_3_species.fasta Fri Jul 06 02:52:15 2018 -0400 +++ b/test-data/outputs_ORF_Search_08_CDS_without_indel_aa/test1/orthogroup_7_with_3_species.fasta Mon Sep 24 03:58:34 2018 -0400 @@ -1,6 +1,6 @@ >Am3527_1/1_1.000_270 --------------------VSVVDVKWHVRSRYSKLLPFLALSDHTCGKTT---------------- +VSVVDVKWHVRSRYSKLLPFLALSDHTCGKTT >Ap5050_1/1_1.000_243 -EIWHSQARTKKEDEPRHQFVSVVHVERHVRSGKTPLLPFFTPSNHTCGKSTSKLIKTKSSLQHFHSF +VSVVHVERHVRSGKTPLLPFFTPSNHTCGKST >Ac2173_1/1_1.000_330 -EIWHSQARTKKEDEPSHQLVSVVHIERHVRSGKTPLLPFFTSPNHTCGKSTSKLVKTKSSLQHFQRF +VSVVHIERHVRSGKTPLLPFFTSPNHTCGKST
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/outputs_ORF_Search_08_CDS_without_indel_nuc/test1/old_orthogroup_7_with_3_species.fasta Mon Sep 24 03:58:34 2018 -0400 @@ -0,0 +1,6 @@ +>Am3527_1/1_1.000_270 +---------------------------------------------------------gtatctgttgttgatgtcaaatggcatgttcgatccaggtactccaagctgttgccattcttggctttgtctgaccacacttgtggaaaaacgacg------------------------------------------------ +>Ap5050_1/1_1.000_243 +gagatatggcactcccaagccagaaccaaaaaagaggatgaaccacgccatcaatttgtatcggttgttcatgtcgaaaggcatgttagatccgggaagacccctctgttgccattcttcactccgtcgaaccacacttgtggaaaatcgacgagtaagttgattaagacgaagagctctctgcaacattttcacagcttc +>Ac2173_1/1_1.000_330 +gagatatggcactcccaagccagaaccaaaaaagaggatgaaccaagccatcaacttgtatcggttgttcatatcgaaaggcatgttagatccgggaagacccctttgttgccattcttcacttcgccgaaccacacttgtggaaaatcgacgagtaagctggttaagacgaagagctctctgcaacattttcaacgcttc
--- a/test-data/outputs_ORF_Search_08_CDS_without_indel_nuc/test1/orthogroup_14_with_2_species.fasta Fri Jul 06 02:52:15 2018 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,4 +0,0 @@ ->Ac6688_1/1_1.000_963 -ggtggttcagcgctggcactggtactgatcacttctggacgcaactcatccacaacaaccttgccttccagatcccagatctttatgcttggtccagaggcagcacaaagccaatatctgttggggctgaagcacagtgagttgatgacatcaccaccatccaatgtgtacagatgcttgccttcattcaaatcccacaacattgcctggccatccttaccaccagaagcacacaaagatccatcaggggacacggtgacacagttcagg ->Ap1491_1/1_1.000_963 -ggtggctcggcactggcactggtgctgatcacttctggacgcagctcatccacaacaaccttgccttccagatcccagatctttatgcttggtccagaagcagcacaaagccagtatctgttggggctgaagcacagtgagttgatgacatcaccaccatccaatgtgtacagatgcttgccttcattcaaatcccataacattgcctggccatcttttccaccagaagcgcacaaagatccatcaggggacacagtgacacagttcaga
--- a/test-data/outputs_ORF_Search_08_CDS_without_indel_nuc/test1/orthogroup_1_with_2_species.fasta Fri Jul 06 02:52:15 2018 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,4 +0,0 @@ ->Ap2303_1/1_1.000_424 -tttaatcttagatacaaaatcatttggatttgtgtacatttctccactgtatattgcctgttcatctggatccattccttgttgctggacaacttcctcattaaccgcatatctgagtttcatttttctgtaaagtgggttcttcttgtaatacttaac ->Ac3644_1/1_1.000_1626 -tttaatcttagatacaaaatcatttggatttgtgtacatttctccactgtaaattgcctgttcatctggatccattccttgttgttgaacaacttcctcattaacagcatatctgagtttcatttttctgtaaagtgggttcttcttgtaatatttaac
--- a/test-data/outputs_ORF_Search_08_CDS_without_indel_nuc/test1/orthogroup_6_with_2_species.fasta Fri Jul 06 02:52:15 2018 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,4 +0,0 @@ ->Ap5072_1/1_1.000_437 -tctcgccgatgtccaagcttccgtctttgtttttgtccgcccgctcgaaagcctgccgcatggcggcctctttgtggtagacatccgggacttgacccatcgcgatcaggtattcttgaagggatattttcttgtcgccgtctttatcaaactgtt ->Ac1013_1/1_1.000_525 -cctcgccgatgtccaagcttccgtctttgtttttgtccgcccgctcgaaagcctgccgcatggctgcctctttgtggtaggcatccgggacttggcccatcgcgatcaagtattcttgaagggttattttcttatcgccatctttatcaaactgtt
--- a/test-data/outputs_ORF_Search_08_CDS_without_indel_nuc/test1/orthogroup_7_with_3_species.fasta Fri Jul 06 02:52:15 2018 -0400 +++ b/test-data/outputs_ORF_Search_08_CDS_without_indel_nuc/test1/orthogroup_7_with_3_species.fasta Mon Sep 24 03:58:34 2018 -0400 @@ -1,6 +1,6 @@ >Am3527_1/1_1.000_270 ----------------------------------------------------------gtatctgttgttgatgtcaaatggcatgttcgatccaggtactccaagctgttgccattcttggctttgtctgaccacacttgtggaaaaacgacg------------------------------------------------ +gtatctgttgttgatgtcaaatggcatgttcgatccaggtactccaagctgttgccattcttggctttgtctgaccacacttgtggaaaaacgacg >Ap5050_1/1_1.000_243 -gagatatggcactcccaagccagaaccaaaaaagaggatgaaccacgccatcaatttgtatcggttgttcatgtcgaaaggcatgttagatccgggaagacccctctgttgccattcttcactccgtcgaaccacacttgtggaaaatcgacgagtaagttgattaagacgaagagctctctgcaacattttcacagcttc +gtatcggttgttcatgtcgaaaggcatgttagatccgggaagacccctctgttgccattcttcactccgtcgaaccacacttgtggaaaatcgacg >Ac2173_1/1_1.000_330 -gagatatggcactcccaagccagaaccaaaaaagaggatgaaccaagccatcaacttgtatcggttgttcatatcgaaaggcatgttagatccgggaagacccctttgttgccattcttcacttcgccgaaccacacttgtggaaaatcgacgagtaagctggttaagacgaagagctctctgcaacattttcaacgcttc +gtatcggttgttcatatcgaaaggcatgttagatccgggaagacccctttgttgccattcttcacttcgccgaaccacacttgtggaaaatcgacg
