Mercurial > repos > greg > fasta_extract
changeset 0:bc3f2a5c7b53 draft
Uploaded
author | greg |
---|---|
date | Sun, 10 Jan 2016 13:03:12 -0500 |
parents | |
children | 3fb7f36c2c8a |
files | fasta_extract.py fasta_extract.xml fasta_extract_macros.xml fasta_extract_utils.py tool-data/all_fasta.loc.sample tool_data_table_conf.xml.sample tool_dependencies.xml |
diffstat | 7 files changed, 558 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fasta_extract.py Sun Jan 10 13:03:12 2016 -0500 @@ -0,0 +1,84 @@ +import argparse +import csv +import os +import sys + +from fasta_extract_utils import Fasta + + +def reverse_complement(bases): + complements = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'} + return ''.join(complements[b.upper()] for b in reversed(bases)) + + +def get_output_path(hid, subtract_from_start, add_to_end, extend_existing, consider_strand, orphan=False): + attrs = 'u%dd%d' % (subtract_from_start, add_to_end) + if extend_existing: + attrs += 'x' + if consider_strand: + attrs += '_s' + if orphan: + attrs += '_orphan' + format = 'gff' + else: + format = 'fasta' + return os.path.join('output_dir', 'fasta_extract-%s_on_data_%d.%s' % (attrs, hid, format)) + + +def stop_err(msg): + sys.stderr.write(msg) + sys.exit(1) + + +parser = argparse.ArgumentParser() +parser.add_argument('--genome_file', dest='genome_file', help='Reference genome fasta index file.') +parser.add_argument('--subtract_from_start', dest='subtract_from_start', type=int, help='Distance to subtract from start.') +parser.add_argument('--add_to_end', dest='add_to_end', type=int, help='Distance to add to end.') +parser.add_argument('--extend_existing', dest='extend_existing', help='Extend existing start/end rather or from computed midpoint.') +parser.add_argument('--strand', dest='strand', help='Consider strandedness: reverse complement extracted sequence on reverse strand.') +args = parser.parse_args() + +fasta = Fasta(args.genome_file) + +for (input_filename, hid) in args.inputs: + extend_existing = args.extend_existing == 'existing' + consider_strand = args.strand == 'yes' + reader = csv.reader(open(input_filename, 'rU'), delimiter='\t') + fasta_output_path = get_output_path(args.hid, + args.subtract_from_start, + args.add_to_end, + extend_existing, + consider_strand) + output = open(fasta_output_path, 'wb') + gff_output_path = get_output_path(args.hid, + args.subtract_from_start, + args.add_to_end, + extend_existing, + consider_strand, + orphan=True) + orphan_writer = csv.writer(open(gff_output_path, 'wb'), delimiter='\t') + for row in reader: + if len(row) != 9 or row[0].startswith('#'): + continue + try: + cname = row[0] + start = int(row[3]) + end = int(row[4]) + strand = row[6] + if extend_existing: + start -= args.subtract_from_start + end += args.add_to_end + else: + midpoint = (start + end) // 2 + start = midpoint - args.subtract_from_start + end = midpoint + args.add_to_end + if 1 <= start and end <= len(fasta[cname]): + output.write('>%s:%s-%s_%s\n' % (cname, start, end, strand)) + bases = fasta[cname][start-1:end] + if consider_strand and strand == '-': + bases = reverse_complement(bases) + output.write('%s\n' % bases) + else: + orphan_writer.writerow(row) + except Exception, e: + stop_err(str(e))
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fasta_extract.xml Sun Jan 10 13:03:12 2016 -0500 @@ -0,0 +1,71 @@ +<tool id="fasta_extract" name="Extract fasta sequences" version="1.0.0"> + <description>using coordinates from assembled genomes</description> + <macros> + <import>fasta_extract_macros.xml</import> + </macros> + <expand macro="requirements" /> + <command> + <![CDATA[ + mkdir -p output_dir && + python $__tool_directory__/fasta_extract.py + #for $i in $inputs: + --input "$i" "${i.hid}" + #end for + #if str($reference_genome_cond.reference_genome_source) == "cached": + #set genome_file = $reference_genome_cond.reference_genome.fields.path + #else: + #set genome_file = $reference_genome_cond.reference_genome + #end if + --genome_file "$genome_file" + --subtract_from_start $subtract_from_start + --add_to_end $add_to_end + --extend_existing $extend_existing + --strand $strand + ]]> + </command> + <inputs> + <param name="inputs" type="data" format="gff" multiple="True" label="Fetch sequences for intervals in"> + <validator type="unspecified_build" /> + </param> + <conditional name="reference_genome_cond"> + <param name="reference_genome_source" type="select" label="Choose the source for the reference genome"> + <option value="cached">Locally Cached</option> + <option value="history">From History</option> + </param> + <when value="cached"> + <param name="reference_genome" type="select" label="Using reference genome"> + <options from_data_table="all_fasta" /> + <validator type="no_options" message="A built-in reference genome is not available for the build associated with the selected input file"/> + </param> + </when> + <when value="history"> + <param name="reference_genome" type="data" format="fasta" label="Using reference genome"/> + </when> + </conditional> + <param name="subtract_from_start" type="integer" value="50" min="0" label="Distance to subtract from start" /> + <param name="add_to_end" type="integer" value="50" min="0" label="Distance to add to end" /> + <param name="extend_existing" type="select" label="Extend existing start and end coordinates or extend from computed midpoint?"> + <option value="midpoint" selected="True">Extend from computed midpoint</option> + <option value="existing">Extend from existing start and end coordinates</option> + </param> + <param name="strand" type="select" label="Extract reverse complement sequence on reverse strand?"> + <option value="no" selected="True">No</option> + <option value="yes">Yes</option> + </param> + </inputs> + <outputs> + <collection name="fasta_extract_output" type="list" label="Extract fasta sequences ${on_string}"> + <discover_datasets pattern="(?P<designation>.*)" directory="output_dir" ext="fasta" visible="false" /> + </collection> + </outputs> + <tests> + <test> + </test> + </tests> + <help> + +**What it does** + + </help> + <expand macro="citations" /> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fasta_extract_macros.xml Sun Jan 10 13:03:12 2016 -0500 @@ -0,0 +1,29 @@ +<?xml version='1.0' encoding='UTF-8'?> +<macros> + <token name="@WRAPPER_VERSION@">1.0</token> + <xml name="requirements"> + <requirements> + <requirement type="package" version="2.3.0">anaconda</requirement> + </requirements> + </xml> + <xml name="stdio"> + <stdio> + <exit_code range="1:"/> + <exit_code range=":-1"/> + <regex match="Error:"/> + <regex match="Exception:"/> + </stdio> + </xml> + <xml name="citations"> + <citations> + <citation type="bibtex"> + @unpublished{None, + author = {None}, + title = {None}, + year = {None}, + eprint = {None}, + url = {http://www.huck.psu.edu/content/research/independent-centers-excellence/center-for-eukaryotic-gene-regulation} + }</citation> + </citations> + </xml> +</macros>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fasta_extract_utils.py Sun Jan 10 13:03:12 2016 -0500 @@ -0,0 +1,343 @@ +import cPickle +import numpy +import os +import string +import sys + +COMPLEMENT = lambda s: s.translate(string.maketrans('ATCGatcgNnXx', 'TAGCtagcNnXx')) +MAGIC = "@flattened@" + + +def ext_is_flat(ext): + fh = open(ext) + t = fh.read(len(MAGIC)) + fh.close() + return MAGIC == t + + +def is_up_to_date(a, b): + return os.path.exists(a) and os.stat(a).st_mtime >= os.stat(b).st_mtime + + +class FastaRecord(object): + __slots__ = ('fh', 'start', 'stop') + ext = ".flat" + idx = ".gdx" + + def __init__(self, fh, start, stop): + self.fh = fh + self.stop = stop + self.start = start + + @property + def __array_interface__(self): + return {'shape': (len(self), ), + 'typestr': '|S1', + 'version': 3, + 'data': buffer(self)} + + def __getitem__(self, islice): + fh = self.fh + fh.seek(self.start) + if isinstance(islice, (int, long)): + if islice < 0: + if -islice > self.stop - self.start: + raise IndexError + fh.seek(self.stop + islice) + else: + fh.seek(self.start + islice) + return fh.read(1) + if islice.start in (0, None) and islice.stop in (None, sys.maxint): + if islice.step in (1, None): + return fh.read(self.stop - self.start) + return fh.read(self.stop - self.start)[::islice.step] + istart, istop = self._adjust_slice(islice) + if istart is None: + return "" + l = istop - istart + if l == 0: + return "" + fh.seek(istart) + if islice.step in (1, None): + return fh.read(l) + return fh.read(l)[::islice.step] + + def __len__(self): + return self.stop - self.start + + def __repr__(self): + return "%s('%s', %i..%i)" % (self.__class__.__name__, + self.fh.name, + self.start, + self.stop) + + def __str__(self): + return self[:] + + def _adjust_slice(self, islice): + if islice.start is not None and islice.start < 0: + istart = self.stop + islice.start + else: + if islice.start is None: + istart = self.start + else: + istart = self.start + islice.start + if islice.stop is not None and islice.stop < 0: + istop = self.stop + islice.stop + else: + istop = islice.stop is None and self.stop or (self.start + islice.stop) + # This will give empty string. + if istart > self.stop: + return self.stop, self.stop + if istart < self.start: + istart = self.start + if istop < self.start: + istop = self.start + elif istop > self.stop: + istop = self.stop + return istart, istop + + @classmethod + def copy_inplace(klass, flat_name, fasta_name): + """ + Overwrite the .fasta file with the .fasta.flat file and save + something in the .flat as a place-holder. + """ + os.rename(flat_name, fasta_name) + # Still need the flattened file to show it's current. + flatfh = open(fasta_name + klass.ext, 'wb') + flatfh.write(MAGIC) + flatfh.close() + + @classmethod + def is_current(klass, fasta_name): + utd = is_up_to_date(fasta_name + klass.idx, fasta_name) + if not utd: + return False + return is_up_to_date(fasta_name + klass.ext, fasta_name) + + @classmethod + def modify_flat(klass, flat_file): + return open(flat_file, 'rb') + + @classmethod + def prepare(klass, fasta_obj, seqinfo_generator, flatten_inplace): + """ + Returns the __getitem__'able index. and the thing from which to get the seqs. + """ + f = fasta_obj.fasta_name + if klass.is_current(f): + fh = open(f + klass.idx, 'rb') + idx = cPickle.load(fh) + fh.close() + if flatten_inplace or ext_is_flat(f + klass.ext): + flat = klass.modify_flat(f) + else: + flat = klass.modify_flat(f + klass.ext) + if flatten_inplace and not ext_is_flat(f + klass.ext): + del flat + else: + return idx, flat + idx = {} + flatfh = open(f + klass.ext, 'wb') + for i, (seqid, seq) in enumerate(seqinfo_generator): + if flatten_inplace: + if i == 0: + flatfh.write('>%s\n' % seqid) + else: + flatfh.write('\n>%s\n' % seqid) + start = flatfh.tell() + flatfh.write(seq) + stop = flatfh.tell() + idx[seqid] = (start, stop) + flatfh.close() + if flatten_inplace: + klass.copy_inplace(flatfh.name, f) + fh = open(f + klass.idx, 'wb') + cPickle.dump(idx, fh, -1) + fh.close() + return idx, klass.modify_flat(f) + fh = open(f + klass.idx, 'wb') + cPickle.dump(idx, fh, -1) + fh.close() + return idx, klass.modify_flat(f + klass.ext) + + +class NpyFastaRecord(FastaRecord): + __slots__ = ('start', 'stop', 'mm', 'as_string') + + def __init__(self, mm, start, stop, as_string=True): + self.mm = mm + self.start = start + self.stop = stop + self.as_string = as_string + + def __repr__(self): + return "%s(%i..%i)" % (self.__class__.__name__, self.start, self.stop) + + @classmethod + def modify_flat(klass, flat_file): + mm = numpy.memmap(flat_file, dtype="S1", mode="r") + return mm + + def getdata(self, islice): + if isinstance(islice, (int, long)): + if islice >= 0: + islice += self.start + else: + islice += self.stop + if islice < 0: + raise IndexError + return self.mm[islice] + start, stop = self._adjust_slice(islice) + return self.mm[start:stop:islice.step] + + def __getitem__(self, islice): + d = self.getdata(islice) + return d.tostring() if self.as_string else d + + @property + def __array_interface__(self): + return {'shape': (len(self), ), + 'typestr': '|S1', + 'version': 3, + 'data': self[:]} + + +class DuplicateHeaderException(Exception): + + def __init__(self, header): + Exception.__init__(self, 'headers must be unique: %s is duplicated' % header) + + +class Fasta(dict): + + def __init__(self, fasta_name, record_class=NpyFastaRecord, flatten_inplace=False, key_fn=None): + self.fasta_name = fasta_name + self.record_class = record_class + self.key_fn = key_fn + self.index, self.prepared = self.record_class.prepare(self, + self.gen_seqs_with_headers(key_fn), + flatten_inplace) + self.chr = {} + + def __contains__(self, key): + return key in self.index + + def __getitem__(self, i): + # Implements the lazy loading + if self.key_fn is not None: + i = self.key_fn(i) + if i in self.chr: + return self.chr[i] + c = self.index[i] + self.chr[i] = self.record_class(self.prepared, c[0], c[1]) + return self.chr[i] + + def __len__(self): + return len(self.index) + + @classmethod + def as_kmers(klass, seq, k, overlap=0): + kmax = len(seq) + assert overlap < k, ('overlap must be < kmer length') + i = 0 + while i < kmax: + yield i, seq[i:i + k] + i += k - overlap + + def gen_seqs_with_headers(self, key_fn=None): + """ + Remove all newlines from the sequence in a fasta file + and generate starts, stops to be used by the record class. + """ + fh = open(self.fasta_name, 'r') + # Do the flattening (remove newlines) + # check of unique-ness of headers. + seen_headers = {} + header = None + seqs = None + for line in fh: + line = line.rstrip() + if not line: + continue + if line[0] == ">": + if seqs is not None: + if header in seen_headers: + raise DuplicateHeaderException(header) + seen_headers[header] = None + yield header, "".join(seqs) + header = line[1:].strip() + if key_fn is not None: + header = key_fn(header) + seqs = [] + else: + seqs.append(line) + if seqs != []: + if header in seen_headers: + raise DuplicateHeaderException(header) + yield header, "".join(seqs) + fh.close() + + def iterkeys(self): + for k in self.index.iterkeys(): + yield k + + def iteritems(self): + for k in self.keys(): + yield k, self[k] + + def keys(self): + return self.index.keys() + + def sequence(self, f, asstring=True, auto_rc=True, exon_keys=None, one_based=True): + """ + Take a feature and use the start/stop or exon_keys to return the sequence from the + associated fasta file. By default, just return the full sequence between start + and stop, but if exon_keys is set to an iterable, search for those keys and use the + first to create a sequence and return the concatenated result. + + Note that sequence is 2 characters shorter than the entire feature, to account for + the introns at base-pairs 12 and 16. + + Also note, this function looks for an item with key of 'rnas'. If one is not found, + it continues on to 'exons'. If it doesn't find any of the exon keys it will fall + back on the start, stop of the feature: + + f: a feature, a feature can have exons. + asstring: if true, return the sequence as a string, if false, return as a numpy array + auto_rc : if True and the strand of the feature == -1, returnthe reverse complement of the sequence + one_based: if true, query is using 1 based closed intervals, if false semi-open zero based intervals + """ + assert 'chr' in f and f['chr'] in self, (f, f['chr'], self.keys()) + fasta = self[f['chr']] + sequence = None + if exon_keys is not None: + sequence = self._seq_from_keys(f, fasta, exon_keys, one_based=one_based) + if sequence is None: + start = f['start'] - int(one_based) + sequence = fasta[start: f['stop']] + if auto_rc and f.get('strand') in (-1, '-1', '-'): + sequence = COMPLEMENT(sequence)[::-1] + if asstring: + return sequence + return numpy.array(sequence, dtype='c') + + def _seq_from_keys(self, f, fasta, exon_keys, base='locations', one_based=True): + """ + f: a feature dict + fasta: a Fasta object + exon_keys: an iterable of keys, to look for start/stop arrays to get sequence. + base: if base ('locations') exists, look there fore the exon_keys, not in the base + of the object. + """ + fbase = f.get(base, f) + for ek in exon_keys: + if ek not in fbase: + continue + locs = fbase[ek] + seq = "" + for start, stop in locs: + seq += fasta[start - int(one_based):stop] + return seq + return None
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/all_fasta.loc.sample Sun Jan 10 13:03:12 2016 -0500 @@ -0,0 +1,18 @@ +#This file lists the locations and dbkeys of all the fasta files +#under the "genome" directory (a directory that contains a directory +#for each build). The script extract_fasta.py will generate the file +#all_fasta.loc. This file has the format (white space characters are +#TAB characters): +# +#<unique_build_id> <dbkey> <display_name> <file_path> +# +#So, all_fasta.loc could look something like this: +# +#apiMel3 apiMel3 Honeybee (Apis mellifera): apiMel3 /path/to/genome/apiMel3/apiMel3.fa +#hg19canon hg19 Human (Homo sapiens): hg19 Canonical /path/to/genome/hg19/hg19canon.fa +#hg19full hg19 Human (Homo sapiens): hg19 Full /path/to/genome/hg19/hg19full.fa +# +#Your all_fasta.loc file should contain an entry for each individual +#fasta file. So there will be multiple fasta files for each build, +#such as with hg19 above. +#
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_data_table_conf.xml.sample Sun Jan 10 13:03:12 2016 -0500 @@ -0,0 +1,7 @@ +<tables> + <!-- Locations of all fasta files under genome directory --> + <table name="all_fasta" comment_char="#"> + <columns>value, dbkey, name, path</columns> + <file path="tool-data/all_fasta.loc" /> + </table> +</tables>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_dependencies.xml Sun Jan 10 13:03:12 2016 -0500 @@ -0,0 +1,6 @@ +<?xml version="1.0"?> +<tool_dependency> + <package name="anaconda" version="2.3.0"> + <repository changeset_revision="d3f29b11da06" name="package_anaconda_2_3_0" owner="iuc" toolshed="https://testtoolshed.g2.bx.psu.edu" /> + </package> +</tool_dependency>