Mercurial > repos > matthias > fasta_regex_finder
changeset 0:0c5613c6a863 draft default tip
planemo upload for repository https://github.com/bernt-matthias/mb-galaxy-tools.git commit 98943baecfa613d91dbef112fce8c6189f0431db
author | matthias |
---|---|
date | Mon, 18 Dec 2017 05:16:59 -0500 |
parents | |
children | |
files | fastaregexfinder.py fastaregexfinder.xml test-data/TestSeqGroup-G4-sub.bed test-data/TestSeqGroup-G4.bed test-data/TestSeqGroup-G4.fasta test-data/test-1.bed test-data/test-2.bed test-data/test-3.bed test-data/test-4.bed test-data/test.fas |
diffstat | 10 files changed, 488 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fastaregexfinder.py Mon Dec 18 05:16:59 2017 -0500 @@ -0,0 +1,254 @@ +#!/usr/bin/env python + +import re +import sys +import string +import argparse +import operator + +VERSION='0.1.1' + +parser = argparse.ArgumentParser(description=""" + +DESCRIPTION + + Search a fasta file for matches to a regex and return a bed file with the + coordinates of the match and the matched sequence itself. + + Output bed file has columns: + 1. Name of fasta sequence (e.g. chromosome) + 2. Start of the match + 3. End of the match + 4. ID of the match + 5. Length of the match + 6. Strand + 7. Matched sequence as it appears on the forward strand + + For matches on the reverse strand it is reported the start and end position on the + forward strand and the matched string on the forward strand (so the G4 'GGGAGGGT' + present on the reverse strand is reported as ACCCTCCC). + + Note: Fasta sequences (chroms) are read in memory one at a time along with the + matches for that chromosome. + The order of the output is: chroms as they are found in the inut fasta, matches + sorted within chroms by positions. + +EXAMPLE: + ## Test data: + echo '>mychr' > /tmp/mychr.fa + echo 'ACTGnACTGnACTGnTGAC' >> /tmp/mychr.fa + + fastaRegexFinder.py -f /tmp/mychr.fa -r 'ACTG' + mychr 0 4 mychr_0_4_for 4 + ACTG + mychr 5 9 mychr_5_9_for 4 + ACTG + mychr 10 14 mychr_10_14_for 4 + ACTG + + fastaRegexFinder.py -f /tmp/mychr.fa -r 'ACTG' --maxstr 3 + mychr 0 4 mychr_0_4_for 4 + ACT[3,4] + mychr 5 9 mychr_5_9_for 4 + ACT[3,4] + mychr 10 14 mychr_10_14_for 4 + ACT[3,4] + + less /tmp/mychr.fa | fastaRegexFinder.py -f - -r 'A\w\wGn' + mychr 0 5 mychr_0_5_for 5 + ACTGn + mychr 5 10 mychr_5_10_for 5 + ACTGn + mychr 10 15 mychr_10_15_for 5 + ACTGn + +DOWNLOAD + fastaRegexFinder.py is hosted at https://github.com/dariober/bioinformatics-cafe/tree/master/fastaRegexFinder + + """, formatter_class= argparse.RawTextHelpFormatter) + +parser.add_argument('--fasta', '-f', + type= str, + help='''Input fasta file to search. Use '-' to read the file from stdin. + + ''', + required= True) + +parser.add_argument('--regex', '-r', + type= str, + help='''Regex to be searched in the fasta input. +Matches to the reverse complement will have - strand. +The default regex is '([gG]{3,}\w{1,7}){3,}[gG]{3,}' which searches +for G-quadruplexes. + ''', + default= '([gG]{3,}\w{1,7}){3,}[gG]{3,}') + +parser.add_argument('--matchcase', '-m', + action= 'store_true', + help='''Match case while searching for matches. Default is +to ignore case (I.e. 'ACTG' will match 'actg'). + ''') + +parser.add_argument('--noreverse', + action= 'store_true', + help='''Do not search the reverse complement of the input fasta. +Use this flag to search protein sequences. + ''') + +parser.add_argument('--maxstr', + type= int, + required= False, + default= 10000, + help='''Maximum length of the match to report in the 7th column of the output. +Default is to report up to 10000nt. +Truncated matches are reported as <ACTG...ACTG>[<maxstr>,<tot length>] + ''') + +parser.add_argument('--seqnames', '-s', + type= str, + nargs= '+', + default= [None], + required= False, + help='''List of fasta sequences in --fasta to +search. E.g. use --seqnames chr1 chr2 chrM to search only these crhomosomes. +Default is to search all the sequences in input. + ''') +parser.add_argument('--quiet', '-q', + action= 'store_true', + help='''Do not print progress report (i.e. sequence names as they are scanned). + ''') + + + +parser.add_argument('--version', '-v', action='version', version='%(prog)s ' + VERSION) + + +args = parser.parse_args() + +" --------------------------[ Check and parse arguments ]---------------------- " + +if args.matchcase: + flag= 0 +else: + flag= re.IGNORECASE + +" ------------------------------[ Functions ]--------------------------------- " + +def sort_table(table, cols): + """ Code to sort list of lists + see http://www.saltycrane.com/blog/2007/12/how-to-sort-table-by-columns-in-python/ + + sort a table by multiple columns + table: a list of lists (or tuple of tuples) where each inner list + represents a row + cols: a list (or tuple) specifying the column numbers to sort by + e.g. (1,0) would sort by column 1, then by column 0 + """ + for col in reversed(cols): + table = sorted(table, key=operator.itemgetter(col)) + return(table) + +def trimMatch(x, n): + """ Trim the string x to be at most length n. Trimmed matches will be reported + with the syntax ACTG[a,b] where Ns are the beginning of x, a is the length of + the trimmed strng (e.g 4 here) and b is the full length of the match + EXAMPLE: + trimMatch('ACTGNNNN', 4) + >>>'ACTG[4,8]' + trimMatch('ACTGNNNN', 8) + >>>'ACTGNNNN' + """ + if len(x) > n and n is not None: + m= x[0:n] + '[' + str(n) + ',' + str(len(x)) + ']' + else: + m= x + return(m) + +def revcomp(x): + """Reverse complement string x. Ambiguity codes are handled and case conserved. + + Test + x= 'ACGTRYSWKMBDHVNacgtryswkmbdhvn' + revcomp(x) + """ + compdict= {'A':'T', + 'C':'G', + 'G':'C', + 'T':'A', + 'R':'Y', + 'Y':'R', + 'S':'W', + 'W':'S', + 'K':'M', + 'M':'K', + 'B':'V', + 'D':'H', + 'H':'D', + 'V':'B', + 'N':'N', + 'a':'t', + 'c':'g', + 'g':'c', + 't':'a', + 'r':'y', + 'y':'r', + 's':'w', + 'w':'s', + 'k':'m', + 'm':'k', + 'b':'v', + 'd':'h', + 'h':'d', + 'v':'b', + 'n':'n'} + xrc= [] + for n in x: + xrc.append(compdict[n]) + xrc= ''.join(xrc)[::-1] + return(xrc) +# ----------------------------------------------------------------------------- + +psq_re_f= re.compile(args.regex, flags= flag) +## psq_re_r= re.compile(regexrev) + +if args.fasta != '-': + ref_seq_fh= open(args.fasta) +else: + ref_seq_fh= sys.stdin + +ref_seq=[] +line= (ref_seq_fh.readline()).strip() +chr= re.sub('^>', '', line) +line= (ref_seq_fh.readline()).strip() +gquad_list= [] +while True: + if not args.quiet: + sys.stderr.write('Processing %s\n' %(chr)) + while line.startswith('>') is False: + ref_seq.append(line) + line= (ref_seq_fh.readline()).strip() + if line == '': + break + ref_seq= ''.join(ref_seq) + if args.seqnames == [None] or chr in args.seqnames: + for m in re.finditer(psq_re_f, ref_seq): + matchstr= trimMatch(m.group(0), args.maxstr) + quad_id= str(chr) + '_' + str(m.start()) + '_' + str(m.end()) + '_for' + gquad_list.append([chr, m.start(), m.end(), quad_id, len(m.group(0)), '+', matchstr]) + if args.noreverse is False: + ref_seq= revcomp(ref_seq) + seqlen= len(ref_seq) + for m in re.finditer(psq_re_f, ref_seq): + matchstr= trimMatch(revcomp(m.group(0)), args.maxstr) + mstart= seqlen - m.end() + mend= seqlen - m.start() + quad_id= str(chr) + '_' + str(mstart) + '_' + str(mend) + '_rev' + gquad_list.append([chr, mstart, mend, quad_id, len(m.group(0)), '-', matchstr]) + gquad_sorted= sort_table(gquad_list, (1,2,3)) + gquad_list= [] + for xline in gquad_sorted: + xline= '\t'.join([str(x) for x in xline]) + print(xline) + chr= re.sub('^>', '', line) + ref_seq= [] + line= (ref_seq_fh.readline()).strip() + if line == '': + break + +#gquad_sorted= sort_table(gquad_list, (0,1,2,3)) +# +#for line in gquad_sorted: +# line= '\t'.join([str(x) for x in line]) +# print(line) +sys.exit()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fastaregexfinder.xml Mon Dec 18 05:16:59 2017 -0500 @@ -0,0 +1,161 @@ +<tool id="fasta_regex_finder" name="fasta_regex_finder" version="0.1.0"> + <description> + Search in fasta for regexp match + </description> + <requirements> + </requirements> + <version_command>python $__tool_directory__/fastaregexfinder.py --version</version_command> + <command detect_errors="exit_code"><![CDATA[ +python $__tool_directory__/fastaRegexFinder.py +--fasta "$input" +--regex "$regex" +#if $settings.advanced == "advanced" + $settings.matchcase + $settings.noreverse + --maxstr $settings.maxstr + #if $settings.seqnames != "" + --seqnames $settings.seqnames + #end if +#end if +--quiet +> $output + ]]></command> + <inputs> + <param type="data" name="input" format="fasta" /> + <param name="regex" size="30" type="text" value="([gG]{3,}\w{1,7}){3,}[gG]{3,}" label="Regular expression" help="(--regex)"> + <sanitizer> + <valid initial="string.printable"> + <remove value="'"/> + </valid> + <mapping initial="none"> + <add source="'" target="__sq__"/> + </mapping> + </sanitizer> + </param> + <conditional name="settings"> + <param name="advanced" type="select" label="Specify advanced parameters"> + <option value="simple" selected="true">No, use program defaults.</option> + <option value="advanced">Yes, see full parameter list.</option> + </param> + <when value="simple"> + </when> + <when value="advanced"> + <param name="matchcase" type="boolean" label="Match case" truevalue="--matchcase" falsevalue="" help="(--matchcase)" /> + <param name="noreverse" type="boolean" label="Do not search the reverse complement" truevalue="--noreverse" falsevalue="" help="(--noreverse)" /> + <param name="maxstr" type="integer" label="Maximum length of the match to report" value="10000" min="1" help="(--maxstr)" /> + <param name="seqnames" size="30" type="text" value="" label="Space separated list of fasta sequences to search" help="--seqnames"/> + </when> + </conditional> + </inputs> + <outputs> + <data name="output" format="bed" from_work_dir="TestSeqGroup-G4.bed" /> + </outputs> + <tests> + <test> + <param name="input" value="TestSeqGroup-G4.fasta"/> + <output name="output" file="TestSeqGroup-G4.bed"/> + </test> + <test> + <param name="input" value="test.fas"/> + <param name="regex" value="ACTG"/> + <output name="output" file="test-1.bed"/> + </test> + <test> + <param name="input" value="test.fas"/> + <param name="regex" value="ACTG"/> + <param name="advanced" value="advanced"/> + <param name="matchcase" value="--matchcase"/> + <output name="output" file="test-2.bed"/> + </test> + <test> + <param name="input" value="test.fas"/> + <param name="regex" value="ACTG"/> + <param name="advanced" value="advanced"/> + <param name="noreverse" value="--noreverse"/> + <output name="output" file="test-3.bed"/> + </test> + <test> + <param name="input" value="test.fas"/> + <param name="regex" value="ACTG"/> + <param name="advanced" value="advanced"/> + <param name="maxstr" value="3"/> + <output name="output" file="test-4.bed"/> + </test> + <test> + <param name="input" value="TestSeqGroup-G4.fasta"/> + <param name="advanced" value="advanced"/> + <param name="seqnames" value="HJ24-Shp2_oncogenicProtein2 HJ24-Shp2_oncogenicProtein"/> + <output name="output" file="TestSeqGroup-G4-sub.bed"/> + </test> +</tests> + <help><![CDATA[ +DESCRIPTION + +Search a fasta file for matches to a regular expression and return a bed file with the +coordinates of the match and the matched sequence itself. + +Output bed file has columns: + +1. Name of fasta sequence (e.g. chromosome) +2. Start of the match +3. End of the match +4. ID of the match +5. Length of the match +6. Strand +7. Matched sequence as it appears on the forward strand + +For matches on the reverse strand it is reported the start and end position on the +forward strand and the matched string on the forward strand (so the G4 'GGGAGGGT' +present on the reverse strand is reported as ACCCTCCC). + + +Note: Fasta sequences (chroms) are read in memory one at a time along with the +matches for that chromosome. +The order of the output is: chroms as they are found in the inut fasta, matches +sorted within chroms by positions. + +ARGUMENTS: + +- regex Regex to be searched in the fasta input. Matches to the reverse complement will have - strand. The default regex is '([gG]{3,}\w{1,7}){3,}[gG]{3,}' which searches for G-quadruplexes. +- matchcase Match case while searching for matches. Default is to ignore case (I.e. 'ACTG' will match 'actg'). +- noreverse Do not search the reverse complement of the input fasta. Use this flag to search protein sequences. +- maxstr Maximum length of the match to report in the 7th column of the output. Default is to report up to 10000nt. Truncated matches are reported as <ACTG...ACTG>[<maxstr>,<tot length>] +- seqnames List of fasta sequences in the input to search. E.g. use --seqnames chr1 chr2 chrM to search only these crhomosomes. Default is to search all the sequences in input. + +EXAMPLE: + +Test data:: +>mychr +ACTGnACTGnACTGnTGAC + +Example1 regex=ACTG:: + + mychr 0 4 mychr_0_4_for 4 + ACTG + mychr 5 9 mychr_5_9_for 4 + ACTG + mychr 10 14 mychr_10_14_for 4 + ACTG + +Example2 regex=ACTG maxstr=3:: + + mychr 0 4 mychr_0_4_for 4 + ACT[3,4] + mychr 5 9 mychr_5_9_for 4 + ACT[3,4] + mychr 10 14 mychr_10_14_for 4 + ACT[3,4] + +Example3 regex=A\w\wG:: + + mychr 0 5 mychr_0_5_for 5 + ACTGn + mychr 5 10 mychr_5_10_for 5 + ACTGn + mychr 10 15 mychr_10_15_for 5 + ACTGn + + ]]></help> + <citations> + <citation type="bibtex"> +@misc{githubfastaRegexFinder, + author = {Dario Beraldi}, + year = {2017}, + title = {fastaRegexFinder}, + publisher = {GitHub}, + journal = {GitHub repository}, + url = {https://github.com/dariober/bioinformatics-cafe/tree/master/fastaRegexFinder}, +}</citation> + </citations> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/TestSeqGroup-G4-sub.bed Mon Dec 18 05:16:59 2017 -0500 @@ -0,0 +1,2 @@ +HJ24-Shp2_oncogenicProtein 17 58 HJ24-Shp2_oncogenicProtein_17_58_for 41 + GGGGGTTTTGGTGGGGGGGGCTGGGTTGTCTTGGGGGTGGG +HJ24-Shp2_oncogenicProtein2 17 58 HJ24-Shp2_oncogenicProtein2_17_58_for 41 + GGGGGTTTTGGTGGGGGGGGCTGGGTTGTCTTGGGGGTGGG
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/TestSeqGroup-G4.bed Mon Dec 18 05:16:59 2017 -0500 @@ -0,0 +1,3 @@ +HJ24-Shp2_oncogenicProtein 17 58 HJ24-Shp2_oncogenicProtein_17_58_for 41 + GGGGGTTTTGGTGGGGGGGGCTGGGTTGTCTTGGGGGTGGG +HJ24-Shp2_oncogenicProtein2 17 58 HJ24-Shp2_oncogenicProtein2_17_58_for 41 + GGGGGTTTTGGTGGGGGGGGCTGGGTTGTCTTGGGGGTGGG +HJ24-Shp2_oncogenicProtein3 17 58 HJ24-Shp2_oncogenicProtein3_17_58_for 41 + GGGGGTTTTGGTGGGGGGGGCTGGGTTGTCTTGGGGGTGGG
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/TestSeqGroup-G4.fasta Mon Dec 18 05:16:59 2017 -0500 @@ -0,0 +1,56 @@ +>PA#2/8 +ATACCAGCTTATTCAATTAGCAACATGAGGGGGATAGAGGGGGTGGGTTCTCTCGGCTACAATCGTAATCAGTTAG +>PA#14/89 +ATACCAGCTTATTCAATTGGGCACCACGGGAGTCGGCCACATTTGGAGTTGTTTTTGCACAATCGTAATCAGTTAG +>PA-C10 +ATACCAGCTTATTCAATTGCGCACCACGGGAGTTGGCCACATTTGGAGTTGTTTTTGCACAATCGTAATCAGTTAG +>PA#4/22 +ATACCAGCTTATTCAATTGCAGTACTGATGAGTGTAGCCGTATGATTATCGTTTGTGGACAATCGTAATCAGTTAG +>PA#4/34 +ATACCAGCTTATTCAATTCCCCAACGAGTCGATATGTAGCCCACACTCTGATTCGTCCACAATCGTAATCAGTTAG +>PA#2/11 +ATACCAGCTTATTCAATTGGAGACGACAAACTATTACGTACTACGGCATGCACTTGGTACAATCGTAATCAGTTAG +>PA-C8 +ATACCAGCTTATTCAATTAGGCCAGATGAGGGGTGCCCATGGCGGGGTGGCTGCTCCAACAATCGTAATCAGTTAG +>PA#14/82 +ATACCAGCTTATTCAATTCCACAACCGAACTCGTAAGACGTATGTAGCCGCCAACTGTACAATCGTAATCAGTTAG +>PA#2/3 +ATACCAGCTTATTCAATTCGACAAGTGGGCATTACGATTCTAGCCCTGATTATGTTCCACAATCGTAATCAGTTAG +>PA-C9 +ATACCAGCTTATTCAATTACCGAGGAGATAACGTTGTAGCCGTCCATCATCTGATTCGACAATCGTAATCAGTTAG +>PA#2/6 +ATACCAGCTTATTCAATTACCGATCACTAGCCGACTAATTGGTTTCCGATCGCAGTCCACAATCGTAATCAGTTAG +>PA-C11 +ATACCAGCTTATTCAATTCGATGGAGCTGATGATTGTTGCCGATCTGACTGTTGTTCCACAATCGTAATCAGTTAG +>PA-C13 +ATACCAGCTTATTCAATTCCCCTAACGTTACTGGATGTAGTCCGACTAACTTATGCGTACAATCGTAATCAGTTAG +>PA-C42 +ATACCAGCTTATTCAATTGCAGATTACGCCTTGTAGCCCGCACTGATCTCGATGTTTGGACAATCGTAATCAGTTAG +>PA-C16 +ATACCAGCTTATTCAATTCCCACGAGTGTAGCCGATTCTTCTGTACTCTTGTCCTCGTACAATCGTAATCAGTTAG +>PA-C15 +ATACCAGCTTATTCAATTACGTGTTGTAGCCGACCCCTGTTGATTGTTTTCCTGTACCACAATCGTAATCAGTTAG +>EA#14.3 +ATACCAGCTTATTCAATTTGAGGCGGGTGGGTGGGTTGAATACGCTGATTACCCCATCGGAGAACGTTAAGGCGCTTCAGATAGTAAGTGCAATCT +>EA#9.4 +ATACCAGCTTATTCAATTGCTGCGAGGTGGGTGGGTGGGAGCAATTGATCCTCGCTTAGCTTCTACGGTGGGCTATCTAGATAGTAAGTGCAATCT +>EA#5.10 +ATACCAGCTTATTCAATTCACCACACCTGCACCCCTGACTTCCCACTTATATCTACTACTCCGTCTCAAGCCCGTTTGAGATAGTAAGTGCAATCT +>EA#14.5 +ATACCAGCTTATTCAATTCCGAGTTTGGGTGGGAGTGGTGGGTTCGGAATTGTTAGTTATTTGGGTTTATGCGAGGTGAGATAGTAAGTGCAATCT +>EA#14.8 +ATACCAGCTTATTCAATTGACGGGGTGTTGTCGTATGCTGTAGAAGCCGTAATTTTTTTTGTTTTCCCTGCCCACCTAGATAGTAAGTGCAATCT +>EA#14.4 +ATACCAGCTTATTCAATTCCACAGGTTGTATGGGGAATAAGGTGGGTGCGCGAGATAGTAAGTGCAATCT +>EA#11.5 +ATACCAGCTTATTCAATTCCCACACCCTAACCGTAGAGCTAAGCTTTTCTTACTACTGACAGTGCTTTACCGTTTGCAAGATAGTAAGTGCAATCT +>HJ24-Shp2_oncogenicProtein +AGCGTCGAATACCACACGGGGGTTTTGGTGGGGGGGGCTGGGTTGTCTTGGGGGTGGGCTAATGGAGCTCGTGGTCAT +>HJ24-Shp2_oncogenicProtein2 +AGCGTCGAATACCACACGGGGGTTTTGGTGGGGGGGGCTGGGTTGTCTTGGGGGTGGGCTAATGGAGCTCGTGGTCAT +>HJ24-Shp2_oncogenicProtein3 +AGCGTCGAATACCACACGGGGGTTTTGGTGGGGGGGGCTGGGTTGTCTTGGGGGTGGGCTAATGGAGCTCGTGGTCAT +>RT6-HIVRT +ATCCGCCTGATTAGCGATACTCAGGCGTTAGGGAAGGGCGTCGAAAGCAGGGTGGGACTTGAGCAAAATCACCTGCAGGGG +>AptG4-HumanRNaseH1 +CGGTCGCTCCGTGTGGCTTGGGTTGGGTGTGGCAGTGAC
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test-1.bed Mon Dec 18 05:16:59 2017 -0500 @@ -0,0 +1,3 @@ +mychr 0 4 mychr_0_4_for 4 + ACTG +mychr 5 9 mychr_5_9_for 4 + actg +mychr 10 14 mychr_10_14_rev 4 - CAGT
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test-2.bed Mon Dec 18 05:16:59 2017 -0500 @@ -0,0 +1,2 @@ +mychr 0 4 mychr_0_4_for 4 + ACTG +mychr 10 14 mychr_10_14_rev 4 - CAGT
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test-3.bed Mon Dec 18 05:16:59 2017 -0500 @@ -0,0 +1,2 @@ +mychr 0 4 mychr_0_4_for 4 + ACTG +mychr 5 9 mychr_5_9_for 4 + actg