Mercurial > repos > abims-sbr > pairwise
comparison scripts/S06_post_processing_of_pairwise.py @ 4:6709645eff5d draft
planemo upload for repository https://github.com/abims-sbr/adaptsearch commit cf1b9c905931ca2ca25faa4844d45c908756472f
| author | abims-sbr |
|---|---|
| date | Wed, 17 Jan 2018 08:53:53 -0500 |
| parents | c8af52875b0f |
| children | |
comparison
equal
deleted
inserted
replaced
| 3:5f68b2fc02c1 | 4:6709645eff5d |
|---|---|
| 1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
| 2 ## AUTHOR: Eric Fontanillas | 2 ## AUTHOR: Eric Fontanillas |
| 3 ## LAST VERSION: 14/08/14 by Julie BAFFARD | 3 ## LAST VERSION: 14/08/14 by Julie BAFFARD |
| 4 | 4 |
| 5 MINIMUM_LENGTH = 1 #bp | 5 MINIMUM_LENGTH = 1 |
| 6 | |
| 7 | |
| 8 ############################ | |
| 9 ##### DEF1 : Get Pairs ##### | |
| 10 ############################ | |
| 11 def get_pairs(fasta_file_path): | |
| 12 F2 = open(fasta_file_path, "r") | |
| 13 list_pairwises = [] | |
| 14 while 1: | |
| 15 next2 = F2.readline() | |
| 16 if not next2: | |
| 17 break | |
| 18 if next2[0] == ">": | |
| 19 fasta_name_query = next2[:-1] | |
| 20 next3 = F2.readline() | |
| 21 fasta_seq_query = next3[:-1] | |
| 22 next3 = F2.readline() ## jump one empty line (if any after the sequence) | |
| 23 fasta_name_match = next3[:-1] | |
| 24 next3 = F2.readline() | |
| 25 fasta_seq_match = next3[:-1] | |
| 26 pairwise = [fasta_name_query,fasta_seq_query,fasta_name_match,fasta_seq_match] | |
| 27 | |
| 28 ## ADD pairwise with condition | |
| 29 list_pairwises.append(pairwise) | |
| 30 F2.close() | |
| 31 | |
| 32 return(list_pairwises) | |
| 33 ############################################## | |
| 34 | |
| 35 ################################# | |
| 36 ##### DEF2 : Extract length ##### | |
| 37 ################################# | |
| 38 def extract_length(length_string): # format length string = 57...902 | |
| 39 l3 = string.split(length_string, "...") | |
| 40 n1 = string.atoi(l3[0]) | |
| 41 n2 = string.atoi(l3[1]) | |
| 42 length = n2-n1 | |
| 43 | |
| 44 return(length) | |
| 45 ############################################## | |
| 46 | |
| 47 | |
| 48 #################################### | |
| 49 ##### DEF3 : Remove Redondancy ##### | |
| 50 #################################### | |
| 51 def filter_redondancy_and_length(list_paireu, MIN_LENGTH): | |
| 52 | |
| 53 bash1 = {} | |
| 54 list_pairout = [] | |
| 55 | |
| 56 for pair in list_paireu: | |
| 57 query_name = pair[0] | |
| 58 query_seq = pair[1] | |
| 59 match_name = pair[2] | |
| 60 match_seq = pair[3] | |
| 61 | |
| 62 l1 = string.split(query_name, "||") | |
| 63 short_query_name = l1[0][1:] | |
| 64 length_matched = extract_length(l1[1]) ### DEF2 ### | |
| 65 l2 = string.split(match_name, "||") | |
| 66 short_match_name = l2[0][1:] | |
| 67 binom = "%s_%s" %(short_query_name, short_match_name) | |
| 68 | |
| 69 ## TEST FOR REDONDANCY | |
| 70 ## REDONDANCY OF BINOME!!!! => MATCHE BETWEEN THE SAME 2 CONTIGS, BUT AT DIFFERENT POSITIONS ON THE CONTIG | |
| 71 ## REDONDANCY NOT REMOVED HERE: | |
| 72 ## 1/ Several "TERA" match with one "APN" (Counted in script "09_formatMatch_getBackNucleotides.py") | |
| 73 ## 2/ Several "APN" match with one "TERA" (Counted | |
| 74 if binom not in bash1.keys(): | |
| 75 bash1[binom] = [query_name, query_seq, match_name, match_seq, length_matched] | |
| 76 else: | |
| 77 old_length = bash1[binom][-1] | |
| 78 if length_matched > old_length: | |
| 79 bash1[binom] = [query_name, query_seq, match_name, match_seq, length_matched] | |
| 80 | |
| 81 | |
| 82 for bino in bash1.keys(): | |
| 83 length = bash1[bino][-1] | |
| 84 if length > MIN_LENGTH: | |
| 85 list_pairout.append(bash1[bino]) | |
| 86 | |
| 87 return(list_pairout) | |
| 88 ############################################## | |
| 89 | |
| 90 | 6 |
| 91 ####################### | 7 ####################### |
| 92 ##### RUN RUN RUN ##### | 8 ##### RUN RUN RUN ##### |
| 93 ####################### | 9 ####################### |
| 94 import string, os, time, re, sys | 10 import string, os, time, re, sys |
| 11 from functions import get_pairs, extract_length, filter_redondancy | |
| 95 | 12 |
| 96 ## 1 ## INPUT/OUTPUT | 13 ## 1 ## INPUT/OUTPUT |
| 97 SHORT_FILE = sys.argv[1] #short-name-query_short-name-db | 14 SHORT_FILE = sys.argv[1] #short-name-query_short-name-db |
| 98 | 15 |
| 99 F_IN = "%s/06_PairwiseMatch_%s.fasta" %(SHORT_FILE, SHORT_FILE) | 16 F_IN = "%s/06_PairwiseMatch_%s.fasta" %(SHORT_FILE, SHORT_FILE) |
| 104 F_OUT2 = "%s/09_onlyMatch_filtered_%s.fasta" %(SHORT_FILE, SHORT_FILE) | 21 F_OUT2 = "%s/09_onlyMatch_filtered_%s.fasta" %(SHORT_FILE, SHORT_FILE) |
| 105 File_OUT2 = open(F_OUT2, "w") | 22 File_OUT2 = open(F_OUT2, "w") |
| 106 | 23 |
| 107 ## 2 ## RUN | 24 ## 2 ## RUN |
| 108 list_pairwises = get_pairs(F_IN) ### DEF1 ### | 25 list_pairwises = get_pairs(F_IN) ### DEF1 ### |
| 109 list_pairwises_filtered1 = filter_redondancy_and_length(list_pairwises, MINIMUM_LENGTH) ### DEF3 ### | 26 list_pairwises_filtered1 = filter_redondancy(list_pairwises, MINIMUM_LENGTH) ### DEF3 ### |
| 110 | 27 |
| 111 i = 0 | 28 i = 0 |
| 112 for pair in list_pairwises_filtered1: | 29 for pair in list_pairwises_filtered1: |
| 113 i = i+1 | 30 i = i+1 |
| 114 | 31 |
