Mercurial > repos > bgruening > rrna
changeset 0:1a12c379df0c draft default tip
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/rRNA commit 1973f3035c10db80883d80847ea254289f5cce2a-dirty
author | bgruening |
---|---|
date | Thu, 17 Sep 2015 16:50:41 -0400 |
parents | |
children | |
files | fasta.py rRNA_prediction.xml rna_hmm3.py |
diffstat | 3 files changed, 485 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fasta.py Thu Sep 17 16:50:41 2015 -0400 @@ -0,0 +1,180 @@ + +# Copyright (C) 2003, 2004, 2006 by Thomas Mailund <mailund@birc.au.dk> +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or (at +# your option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, +# USA. + +""" +A parser for FASTA files. + +Copyright (C) 2003, 2004, 2006 by Thomas Mailund <mailund@birc.au.dk> +""" + +class MalformedInput: + "Exception raised when the input file does not look like a fasta file." + pass + +class FastaRecord: + "Wrapper around a fasta record." + def __init__(self, header, sequence): + "Create a record with the given header and sequence." + self.header = header + self.sequence = sequence + + def __str__(self): + result = ['>'+self.header] + for i in xrange(0,len(self.sequence),60): + result.append(self.sequence[i:i+60]) + return '\n'.join(result) + + +def _fasta_itr_from_file(file): + "Provide an iteration through the fasta records in file." + + h = file.readline().strip() + if h[0] != '>': + raise MalformedInput() + h = h[1:] + + seq = [] + for line in file: + line = line.strip() # remove newline + if not len(line): + continue + if line[0] == '>': + yield FastaRecord(h,''.join(seq)) + + h = line[1:] + seq = [] + continue + + seq += [line] + + yield FastaRecord(h,''.join(seq)) + + +def _fasta_itr_from_name(fname): + "Provide an iteration through the fasta records in the file named fname. " + f = open(fname) + for rec in _fasta_itr_from_file(f): + yield rec + f.close() + + +def _fasta_itr(src): + """Provide an iteration through the fasta records in file `src'. + + Here `src' can be either a file object or the name of a file. + """ + if type(src) == str: + return _fasta_itr_from_name(src) + elif type(src) == file: + return _fasta_itr_from_file(src) + else: + raise TypeError + +def fasta_get_by_name(itr,name): + "Return the record in itr with the given name." + x = name.strip() + for rec in itr: + if rec.header.strip() == x: + return rec + return None + +class fasta_itr: + "An iterator through a sequence of fasta records." + def __init__(self,src): + "Create an iterator through the records in src." + self.__itr = _fasta_itr(src) + + def __iter__(self): + return self + def next(self): + return self.__itr.next() + + def __getitem__(self,name): + return fasta_get_by_name(iter(self),name) + +class fasta_slice: + """Provide an iteration through the fasta records in file `src', from + index `start' to index `stop'. + + Here `src' can be either a file object or the name of a file. + """ + def __init__(self, src, start, stop): + """Provide an iteration through the fasta records in file `src', from + index `start' to index `stop'. + + Here `src' can be either a file object or the name of a file. + """ + self.__itr = _fasta_itr(src) + self.__current = 0 + self.__start = start + self.__stop = stop + + def __iter__(self): + return self + + def next(self): + while self.__current < self.__start: + # skip past first records until we get to `start' + self.__itr.next() + self.__current += 1 + + if self.__current >= self.__stop: + # stop after `stop' + raise StopIteration + + self.__current += 1 + return self.__itr.next() + + def __getitem__(self,name): + return fasta_get_by_name(iter(self),name) + +def get_sequence(src,name): + "Return the record in src with the given name." + return fasta_itr(src)[name] + + +# TESTING... +if __name__ == '__main__': + import sys + if len(sys.argv) != 2: + print "wrong programmer error" + sys.exit(2) + + print 'iterating through all sequences in input file' + for rec in fasta_itr(sys.argv[1]): + print rec + print + + #print 'input sequences (terminated with ^D)' + #for rec in fasta_itr(sys.stdin): + # print rec + #print + + print 'iterating through input, from the second sequence' + for rec in fasta_slice(sys.argv[1], 1, 3): + print rec + print + + print 'the sequence for "bar"' + print fasta_itr(sys.argv[1])["bar"] + print fasta_slice(sys.argv[1],0,3)["bar"] + print get_sequence(sys.argv[1],"bar") + print + + +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/rRNA_prediction.xml Thu Sep 17 16:50:41 2015 -0400 @@ -0,0 +1,96 @@ +<tool id="meta_rna" name="rRNA" version="0.1"> + <description>Identification of ribosomal RNA genes in metagenomic fragments</description> + <requirements> + <requirement type="binary">hmmsearch3.0</requirement> + </requirements> + <command interpreter='python'> +<![CDATA[ + rna_hmm3.py + --cpu "\${GALAXY_SLOTS:-4}" + -i $input + -L /home/galaxy/lib/meta_rna/HMM3 + -k $kingdom + -e $evalue + + #if(str($isMaskFile)=='yes') + --mask $MaskSequenceFile + #end if + #if(str($isSequenceFile)=='yes') + --seq $SequenceFile + #end if + + --gff $GFFFile + + #please set export PATH=/path_to_hmm3_search + #and adjust the -L parameter + +]]> + </command> + <inputs> + <param name="input" type="data" format="fasta" label="Genome Sequence"/> + <param name="evalue" type="text" value="0.01" label="E-value" help="e-value cut-off for hmmsearch, default 0.01"/> + + <param name="kingdom" type="select" display="checkboxes" multiple="true" label="Kingdom" help="If more than one is used, the program will select best kingdom for the sequence according to evalue"> + <option value="arc" selected="True">Archaeal</option> + <option value="bac" selected="True">Bacterial</option> + <option value="euk" selected="True">Eukaryotic</option> + </param> + + <param name="isMaskFile" type="select" label="Mask fasta sequence as additional output?"> + <option value="yes">yes</option> + <option value="no" selected="True">no</option> + </param> + <param name="isSequenceFile" type="select" label="Fasta sequence as additional output?"> + <option value="yes">yes</option> + <option value="no" selected="True">no</option> + </param> + + </inputs> + <outputs> + <data format="gff" name="GFFFile" label="rRNA coordinate file"/> + <data format="fasta" name="MaskSequenceFile" label="rRNA sequences"> + <filter>isMaskFile == "yes"</filter> + </data> + <data format="fasta" name="SequenceFile" label="rRNA sequences"> + <filter>isSequenceFile == "yes"</filter> + </data> + </outputs> + <help> +<![CDATA[ + +**What it does** + +Identification of ribosomal RNA genes in metagenomic fragments. + + +**Input** + +Nucleotide sequence in FASTA format. + + +**Example** + +Suppose you have the following DNA formatted sequences:: + + >seq1 + CACGTGGAATCCCGTTTGAAGATAGGAGGACCATCTCCTAAGGCTAAATACTACTTGGTG + ACCGATAGTGAACCAGTACAGTGATGGAAAGGTGAAAAGAACCCCGGGAGGGGAGTGAAA + GAGAACCTGAAACTGTGTGCTTACAATTAGTCAGAGCCCGTTAATGGGTGATGGCATGCC + TTTTGTAGAATGAACCGGCGAGTTATGTTACATAGCAAGGTTAAGGATGAAGGTCCGGAG + CCGAAGCGAAAGCGAGTCTGAATAGGGCGCTTAAGTTGTGTGATGTAGACCCGAAACTGG + GTGATCTAGCCATGAGCAGGTTGAAGTAAGGGTAGTACCTTATGGAGGACCGAACCGCCG + CCTGTTGAAAAAGGCTCGGATGACTTGTGGCTAGGGGAGA + + +Running this tool will produce this:: + ##seq_name method feature start end evalue strand gene + seq1 rna_hmm3 rRNA 1 400 1.3e-120 + Bacterial:23S_rRNA + +]]> + + </help> + <citations> + <citation type="doi">10.1093/bioinformatics/btp161</citation> + </citations> +</tool> +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/rna_hmm3.py Thu Sep 17 16:50:41 2015 -0400 @@ -0,0 +1,209 @@ +#! /usr/bin/env python +import os +import re +import sys +import string +import optparse +import fasta +import math +import tempfile + +def format(seq, N=60): + nseg = int(math.ceil(len(seq)/(N+0.0))) + return '\n'.join([seq[i*N:(i+1)*N] for i in range(nseg)]) +# write into fasta format file + +parser = optparse.OptionParser(version="%prog ") + +parser.add_option("-i", "--input", dest="input_fasta",action="store", + help="name of input file in fasta format") +parser.add_option("-L", "--LibHmm", dest="hmm_path",action="store", + default="HMM3",help="path of hmm database") +parser.add_option("--gff", dest="out_gff",action="store", + help="name of output gff file") +parser.add_option("--seq", dest="out_seq",action="store", + help="name of output sequence file") +parser.add_option("--mask", dest="out_mask",action="store", + help="name of output mask file") +parser.add_option("-k", "--kingdoms", dest="kingdoms",action="store", + default="arc,bac,euk",help="kingdom used") +parser.add_option("-m", "--moltypes", dest="moltypes",action="store", + default="lsu,ssu,tsu",help="molecule type detected") +parser.add_option("-e","--Evalue", dest="evalue",action="store",type="float", + default=0.01,help="evalue cut-off for hmmsearch") +parser.add_option("--cpu", dest="cpu",action="store",type="int", + default=4,help="number of cpus hmmsearch should use") + + +try: + (options, args) = parser.parse_args() +except: + parser.print_help() + sys.exit(1) + +if options.input_fasta is None or options.hmm_path is None: + parser.print_help() + sys.exit(1) + +#print "%s"% os.path.abspath(options.hmm_path) +#os.environ["HMMERDB"] += ":"+os.path.abspath(options.hmm_path) +#print os.environ["HMMERDB"] +fname = os.path.abspath(options.input_fasta) + +tr = string.maketrans("gatcryswkmbdhvnGATCRYSWKMBDHVN","ctagyrswmkvhdbnCTAGYRSWMKVHDBN") + + +def rev_record(record): + return ">"+record.header+"|rev\n"+format(record.sequence[::-1].translate(tr)) + + +records = [rec for rec in fasta.fasta_itr(fname)] +headers = [[rec.header,len(rec.sequence)] for rec in records] + + +temp_fasta = tempfile.NamedTemporaryFile(delete=False) +ff = open(temp_fasta.name,'w') +for (i, rec) in enumerate(records): + ff.write('>s'+str(i)+'\n'+format(rec.sequence)+'\n') + ff.write('>s'+str(i)+'|rev\n'+format(rec.sequence[::-1].translate(tr))+'\n') +ff.close() +#sys.exit(1) +# a temporary fasta file, use s(int) to easy the parsing + +def parse_hmmsearch(kingdom, moltype, src): +# function to parse hmmsearch output + resu = [] + data = open(src).readlines() + inds = [-1]+[i for (i,x) in enumerate(data[2]) if x==" "] + inds = [(inds[j]+1,inds[j+1]) for j in range(len(inds)-1)] + data = [line for line in data if line[0] != "#"] + for line in data: + if not len(line.strip()): + continue + [read, acc, tlen, qname, qaccr, qlen, seq_evalue, seq_score, seq_bias, \ + seq_num, seq_of, dom_cEvalue, dom_iEvalue, dom_score, dom_bias, \ + hmm_start, hmm_end, dom_start, dom_end, env_start, env_end] = \ + line.split()[:21] +# [line[x[0]:x[1]].strip() for x in inds[:21]] + if string.atof(dom_iEvalue) < options.evalue: +# resu.append("\t".join([read, acc, tlen, qname, qaccr, \ +# qlen, seq_evalue, seq_score, seq_bias, seq_num, seq_of, \ +# dom_cEvalue, dom_iEvalue, dom_score, dom_bias, hmm_start, \ +# hmm_end, dom_start, dom_end, env_start, env_end])) + resu.append("\t".join([qname, dom_start, dom_end, read, dom_iEvalue])) + +# print resu[0] +# print resu[-1] + return resu + + + +hmm_resu = [] +for kingdom in options.kingdoms.split(','): + for moltype in options.moltypes.split(','): + #print kingdom, moltype + #hmm_out_fname = "%s.%s_%s.out" % (out_fname, kingdom, moltype) + temp_hmm_out = tempfile.NamedTemporaryFile(delete=False) + #dom_out_fname = "%s.%s_%s.dom" % (out_fname, kingdom, moltype) + temp_dom_out = tempfile.NamedTemporaryFile(delete=False) + #cmd = '/home/thumper6/camera-annotation/sitao/hmmer3.0/hmmer-3.0-linux-intel-x86_64/binaries/hmmsearch --cpu 1 -o %s --domtblout %s -E %g %s/%s_%s.hmm %s' % \ + cmd = 'hmmsearch --cpu %s -o %s --domtblout %s -E %g %s/%s_%s.hmm %s' % \ + (options.cpu, temp_hmm_out.name, temp_dom_out.name, \ + options.evalue,os.path.abspath(options.hmm_path),kingdom,moltype,temp_fasta.name) +# hmm_resu += parse_hmmsearch(os.popen(cmd)) + os.system(cmd) + hmm_resu += parse_hmmsearch(kingdom, moltype, temp_dom_out.name) + os.remove(temp_hmm_out.name) + os.remove(temp_dom_out.name) + +dict_read2kingdom = {} +for line in hmm_resu: + [feature_type, r_start, r_end, read, evalue] = line.strip().split('\t') + read = read.split('|')[0] + evalue = string.atof(evalue) + kingdom = feature_type.split('_')[0] + if read in dict_read2kingdom: + if evalue < dict_read2kingdom[read][1]: + dict_read2kingdom[read] = [kingdom, evalue] + else: + dict_read2kingdom[read] = [kingdom, evalue] + +header = ['##seq_name','method','feature','start','end','evalue','strand','gene'] +ff = open(options.out_gff,"w") +dict_rRNA = {'arc_lsu':'Archaeal:23S_rRNA','arc_ssu':'Archaeal:16S_rRNA','arc_tsu':'Archaeal:5S_rRNA', + 'bac_lsu':'Bacterial:23S_rRNA','bac_ssu':'Bacterial:16S_rRNA','bac_tsu':'Bacterial:5S_rRNA', + 'euk_lsu':'Eukaryotic:28S_rRNA','euk_ssu':'Eukaryotic18S_rRNA','euk_tsu':'Eukaryotic:8S_rRNA'} + +ff.write('\t'.join(header)+'\n') +for line in hmm_resu: +# [kingdom, moltype, read, acc, tlen, qname, qaccr, \ +# qlen, seq_evalue, seq_score, seq_bias, seq_num, seq_of, \ +# dom_cEvalue, dom_iEvalue, dom_score, dom_bias, hmm_start, \ +# hmm_end, dom_start, dom_end, env_start, env_end] = line.strip().split('\t') + [feature_type, r_start, r_end, read, evalue] = line.strip().split('\t') + if dict_read2kingdom[read.split('|')[0]][0] != feature_type.split('_')[0]: + continue + feature_type = dict_rRNA[feature_type] + if read.endswith('|rev'): + strand = '-' + tmp = map(string.atoi,[r_start,r_end]) + pos = string.atoi(read[1:-4]) + header = headers[pos][0] + L = headers[pos][1] + [r_end,r_start] = [str(L+1-x) for x in tmp] + else: + strand = '+' + pos = string.atoi(read[1:]) + header = headers[pos][0] + ff.write('\t'.join([header.split()[0], 'rna_hmm3','rRNA',r_start,r_end,evalue,strand,feature_type])+'\n') +ff.close() + + +os.remove(temp_fasta.name) + +#postprocessing +rRNA_dict={} + +if options.out_gff: + for line in open(options.out_gff).readlines()[1:]: + line=line.strip().split() + #print '%s %s'% (line[0],line[1]) + if rRNA_dict.get(line[0],None) is None: + rRNA_dict[line[0]] = [] + #else: + rRNA_dict[line[0]].append( (string.atoi(line[3]),string.atoi(line[4]),line[6],line[7])) + +if options.out_mask: + f_mask = open(options.out_mask,"w") + +if options.out_seq: + f_seq = open(options.out_seq,"w") +for rec in fasta.fasta_itr( options.input_fasta ): + header = rec.header.split()[0] + seq = rec.sequence[:] + tno = 1 + for (start,end,strand,rRNA_type) in rRNA_dict.get(header,[]): + seq = seq[:(start-1)]+'N'*(end-start+1)+seq[end:] + + #print "%s %d"% (header,tno) + if options.out_seq: + f_seq.write(">%s.%d /start=%d /end=%d /strand=%s %s\n" % (header,tno,start,end,strand,rRNA_type)) + if strand=="+": # forward strand + f_seq.write(format(rec.sequence[(start-1):end])+"\n") + else: + f_seq.write(format(rec.sequence[(start-1):end][::-1].translate(tr))+"\n") + tno = tno+1 + + #f_mask.write(">"+header+"\n") + #next line by liwz + if options.out_mask: + f_mask.write(">"+rec.header+"\n") + f_mask.write(format(seq)+"\n") + +if options.out_mask: + f_mask.close() +if options.out_seq: + f_seq.close() + + +