# HG changeset patch
# User greg
# Date 1452448992 18000
# Node ID bc3f2a5c7b5310440d36855ea2c5e3d54ac32e92
Uploaded
diff -r 000000000000 -r bc3f2a5c7b53 fasta_extract.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/fasta_extract.py Sun Jan 10 13:03:12 2016 -0500
@@ -0,0 +1,84 @@
+import argparse
+import csv
+import os
+import sys
+
+from fasta_extract_utils import Fasta
+
+
+def reverse_complement(bases):
+    # Unknown bases (e.g. N) pass through unchanged rather than raising KeyError.
+    complements = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}
+    return ''.join(complements.get(b.upper(), b.upper()) for b in reversed(bases))
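+# Illustrative check: reverse_complement('ATCC') returns 'GGAT'.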
+
+
+def get_output_path(hid, subtract_from_start, add_to_end, extend_existing, consider_strand, orphan=False):
+    attrs = 'u%dd%d' % (subtract_from_start, add_to_end)
+    if extend_existing:
+        attrs += 'x'
+    if consider_strand:
+        attrs += '_s'
+    if orphan:
+        attrs += '_orphan'
+        format = 'gff'
+    else:
+        format = 'fasta'
+    return os.path.join('output_dir', 'fasta_extract-%s_on_data_%d.%s' % (attrs, hid, format))
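+# Illustrative examples of the naming scheme above (history id 1 is hypothetical):
+#   get_output_path(1, 50, 50, False, True) -> 'output_dir/fasta_extract-u50d50_s_on_data_1.fasta'
+#   get_output_path(1, 50, 50, True, False, orphan=True) -> 'output_dir/fasta_extract-u50d50x_orphan_on_data_1.gff'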
+
+
+def stop_err(msg):
+    sys.stderr.write(msg)
+    sys.exit(1)
+
+
+parser = argparse.ArgumentParser()
+parser.add_argument('--genome_file', dest='genome_file', help='Reference genome fasta index file.')
+parser.add_argument('--subtract_from_start', dest='subtract_from_start', type=int, help='Distance to subtract from start.')
+parser.add_argument('--add_to_end', dest='add_to_end', type=int, help='Distance to add to end.')
+parser.add_argument('--extend_existing', dest='extend_existing', help='Extend the existing start/end rather than extending from the computed midpoint.')
+parser.add_argument('--strand', dest='strand', help='Consider strandedness: reverse complement the extracted sequence on the reverse strand.')
+parser.add_argument('--inputs', dest='inputs', nargs=2, action='append', help='Pair of input GFF dataset path and its history id; may be repeated.')
+args = parser.parse_args()
+
+fasta = Fasta(args.genome_file)
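+# Instantiating Fasta flattens the genome and writes .flat/.gdx index files next
+# to it on first use; subsequent runs reuse the cached index.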
+
+for (input_filename, hid) in args.inputs:
+    hid = int(hid)
+    extend_existing = args.extend_existing == 'existing'
+    consider_strand = args.strand == 'yes'
+    reader = csv.reader(open(input_filename, 'rU'), delimiter='\t')
+    fasta_output_path = get_output_path(hid,
+                                        args.subtract_from_start,
+                                        args.add_to_end,
+                                        extend_existing,
+                                        consider_strand)
+    output = open(fasta_output_path, 'wb')
+    gff_output_path = get_output_path(hid,
+                                      args.subtract_from_start,
+                                      args.add_to_end,
+                                      extend_existing,
+                                      consider_strand,
+                                      orphan=True)
+    orphan_writer = csv.writer(open(gff_output_path, 'wb'), delimiter='\t')
+    for row in reader:
+        if len(row) != 9 or row[0].startswith('#'):
+            continue
+        try:
+            cname = row[0]
+            start = int(row[3])
+            end = int(row[4])
+            strand = row[6]
+            if extend_existing:
+                start -= args.subtract_from_start
+                end += args.add_to_end
+            else:
+                midpoint = (start + end) // 2
+                start = midpoint - args.subtract_from_start
+                end = midpoint + args.add_to_end
+            if 1 <= start and end <= len(fasta[cname]):
+                output.write('>%s:%s-%s_%s\n' % (cname, start, end, strand))
+                bases = fasta[cname][start-1:end]
+                if consider_strand and strand == '-':
+                    bases = reverse_complement(bases)
+                output.write('%s\n' % bases)
+            else:
+                orphan_writer.writerow(row)
+        except Exception as e:
+            stop_err(str(e))
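+# Example invocation (paths and values are illustrative):
+#   python fasta_extract.py --genome_file genome.fa --subtract_from_start 50 \
+#       --add_to_end 50 --extend_existing midpoint --strand yes --inputs peaks.gff 1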
diff -r 000000000000 -r bc3f2a5c7b53 fasta_extract.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/fasta_extract.xml Sun Jan 10 13:03:12 2016 -0500
@@ -0,0 +1,71 @@
+
diff -r 000000000000 -r bc3f2a5c7b53 fasta_extract_macros.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/fasta_extract_macros.xml Sun Jan 10 13:03:12 2016 -0500
@@ -0,0 +1,29 @@
+<?xml version="1.0"?>
+<macros>
+    <token name="@VERSION@">1.0</token>
+    <xml name="requirements">
+        <requirements>
+            <requirement type="package">anaconda</requirement>
+        </requirements>
+    </xml>
+    <xml name="citations">
+        <citations>
+            <citation type="bibtex">
+                @unpublished{None,
+                    author = {None},
+                    title = {None},
+                    year = {None},
+                    eprint = {None},
+                    url = {http://www.huck.psu.edu/content/research/independent-centers-excellence/center-for-eukaryotic-gene-regulation}
+                }
+            </citation>
+        </citations>
+    </xml>
+</macros>
diff -r 000000000000 -r bc3f2a5c7b53 fasta_extract_utils.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/fasta_extract_utils.py Sun Jan 10 13:03:12 2016 -0500
@@ -0,0 +1,343 @@
+import cPickle
+import numpy
+import os
+import string
+import sys
+
+COMPLEMENT = lambda s: s.translate(string.maketrans('ATCGatcgNnXx', 'TAGCtagcNnXx'))
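+# Illustrative check: COMPLEMENT('ATCGn') returns 'TAGCn'; characters outside the
+# translation table pass through unchanged.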
+MAGIC = "@flattened@"
+
+
+def ext_is_flat(ext):
+    fh = open(ext)
+    t = fh.read(len(MAGIC))
+    fh.close()
+    return MAGIC == t
+
+
+def is_up_to_date(a, b):
+    return os.path.exists(a) and os.stat(a).st_mtime >= os.stat(b).st_mtime
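+# e.g. is_up_to_date('genome.fa.gdx', 'genome.fa') (names illustrative) is True
+# when the index file exists and is at least as new as the fasta it indexes.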
+
+
+class FastaRecord(object):
+    __slots__ = ('fh', 'start', 'stop')
+    ext = ".flat"
+    idx = ".gdx"
+
+    def __init__(self, fh, start, stop):
+        self.fh = fh
+        self.stop = stop
+        self.start = start
+
+    @property
+    def __array_interface__(self):
+        return {'shape': (len(self), ),
+                'typestr': '|S1',
+                'version': 3,
+                'data': buffer(self)}
+
+    def __getitem__(self, islice):
+        fh = self.fh
+        fh.seek(self.start)
+        if isinstance(islice, (int, long)):
+            if islice < 0:
+                if -islice > self.stop - self.start:
+                    raise IndexError
+                fh.seek(self.stop + islice)
+            else:
+                fh.seek(self.start + islice)
+            return fh.read(1)
+        if islice.start in (0, None) and islice.stop in (None, sys.maxint):
+            if islice.step in (1, None):
+                return fh.read(self.stop - self.start)
+            return fh.read(self.stop - self.start)[::islice.step]
+        istart, istop = self._adjust_slice(islice)
+        if istart is None:
+            return ""
+        l = istop - istart
+        if l == 0:
+            return ""
+        fh.seek(istart)
+        if islice.step in (1, None):
+            return fh.read(l)
+        return fh.read(l)[::islice.step]
+
+    def __len__(self):
+        return self.stop - self.start
+
+    def __repr__(self):
+        return "%s('%s', %i..%i)" % (self.__class__.__name__,
+                                     self.fh.name,
+                                     self.start,
+                                     self.stop)
+
+    def __str__(self):
+        return self[:]
+
+    def _adjust_slice(self, islice):
+        if islice.start is not None and islice.start < 0:
+            istart = self.stop + islice.start
+        else:
+            if islice.start is None:
+                istart = self.start
+            else:
+                istart = self.start + islice.start
+        if islice.stop is not None and islice.stop < 0:
+            istop = self.stop + islice.stop
+        else:
+            istop = self.stop if islice.stop is None else self.start + islice.stop
+        # This will give empty string.
+        if istart > self.stop:
+            return self.stop, self.stop
+        if istart < self.start:
+            istart = self.start
+        if istop < self.start:
+            istop = self.start
+        elif istop > self.stop:
+            istop = self.stop
+        return istart, istop
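+    # Illustrative: for a record occupying file offsets 100..200, the slice [-10:]
+    # is adjusted to (istart, istop) == (190, 200).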
+
+    @classmethod
+    def copy_inplace(klass, flat_name, fasta_name):
+        """
+        Overwrite the .fasta file with the .fasta.flat file and save
+        something in the .flat as a place-holder.
+        """
+        os.rename(flat_name, fasta_name)
+        # Still need the flattened file to show it's current.
+        flatfh = open(fasta_name + klass.ext, 'wb')
+        flatfh.write(MAGIC)
+        flatfh.close()
+
+    @classmethod
+    def is_current(klass, fasta_name):
+        utd = is_up_to_date(fasta_name + klass.idx, fasta_name)
+        if not utd:
+            return False
+        return is_up_to_date(fasta_name + klass.ext, fasta_name)
+
+    @classmethod
+    def modify_flat(klass, flat_file):
+        return open(flat_file, 'rb')
+
+    @classmethod
+    def prepare(klass, fasta_obj, seqinfo_generator, flatten_inplace):
+        """
+        Return the __getitem__'able index and the object from which to read the sequences.
+        """
+        f = fasta_obj.fasta_name
+        if klass.is_current(f):
+            fh = open(f + klass.idx, 'rb')
+            idx = cPickle.load(fh)
+            fh.close()
+            if flatten_inplace or ext_is_flat(f + klass.ext):
+                flat = klass.modify_flat(f)
+            else:
+                flat = klass.modify_flat(f + klass.ext)
+            if flatten_inplace and not ext_is_flat(f + klass.ext):
+                del flat
+            else:
+                return idx, flat
+        idx = {}
+        flatfh = open(f + klass.ext, 'wb')
+        for i, (seqid, seq) in enumerate(seqinfo_generator):
+            if flatten_inplace:
+                if i == 0:
+                    flatfh.write('>%s\n' % seqid)
+                else:
+                    flatfh.write('\n>%s\n' % seqid)
+            start = flatfh.tell()
+            flatfh.write(seq)
+            stop = flatfh.tell()
+            idx[seqid] = (start, stop)
+        flatfh.close()
+        if flatten_inplace:
+            klass.copy_inplace(flatfh.name, f)
+            fh = open(f + klass.idx, 'wb')
+            cPickle.dump(idx, fh, -1)
+            fh.close()
+            return idx, klass.modify_flat(f)
+        fh = open(f + klass.idx, 'wb')
+        cPickle.dump(idx, fh, -1)
+        fh.close()
+        return idx, klass.modify_flat(f + klass.ext)
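+    # The pickled .gdx index maps each sequence id to a (start, stop) pair of
+    # byte offsets into the flattened file, e.g. {'chr1': (6, 1006)}
+    # (offsets illustrative).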
+
+
+class NpyFastaRecord(FastaRecord):
+    __slots__ = ('start', 'stop', 'mm', 'as_string')
+
+    def __init__(self, mm, start, stop, as_string=True):
+        self.mm = mm
+        self.start = start
+        self.stop = stop
+        self.as_string = as_string
+
+    def __repr__(self):
+        return "%s(%i..%i)" % (self.__class__.__name__, self.start, self.stop)
+
+    @classmethod
+    def modify_flat(klass, flat_file):
+        mm = numpy.memmap(flat_file, dtype="S1", mode="r")
+        return mm
+
+    def getdata(self, islice):
+        if isinstance(islice, (int, long)):
+            if islice >= 0:
+                islice += self.start
+            else:
+                islice += self.stop
+            if islice < 0:
+                raise IndexError
+            return self.mm[islice]
+        start, stop = self._adjust_slice(islice)
+        return self.mm[start:stop:islice.step]
+
+    def __getitem__(self, islice):
+        d = self.getdata(islice)
+        return d.tostring() if self.as_string else d
+
+    @property
+    def __array_interface__(self):
+        return {'shape': (len(self), ),
+                'typestr': '|S1',
+                'version': 3,
+                'data': self[:]}
+
+
+class DuplicateHeaderException(Exception):
+
+    def __init__(self, header):
+        Exception.__init__(self, 'headers must be unique: %s is duplicated' % header)
+
+
+class Fasta(dict):
+
+    def __init__(self, fasta_name, record_class=NpyFastaRecord, flatten_inplace=False, key_fn=None):
+        self.fasta_name = fasta_name
+        self.record_class = record_class
+        self.key_fn = key_fn
+        self.index, self.prepared = self.record_class.prepare(self,
+                                                              self.gen_seqs_with_headers(key_fn),
+                                                              flatten_inplace)
+        self.chr = {}
+
+    def __contains__(self, key):
+        return key in self.index
+
+    def __getitem__(self, i):
+        # Implements the lazy loading
+        if self.key_fn is not None:
+            i = self.key_fn(i)
+        if i in self.chr:
+            return self.chr[i]
+        c = self.index[i]
+        self.chr[i] = self.record_class(self.prepared, c[0], c[1])
+        return self.chr[i]
+
+    def __len__(self):
+        return len(self.index)
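+    # Usage sketch (file and sequence names illustrative):
+    #   Fasta('genome.fa')['chr1'][:10] reads the first ten bases directly from
+    #   disk; records are created lazily and cached in self.chr.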
+
+    @classmethod
+    def as_kmers(klass, seq, k, overlap=0):
+        kmax = len(seq)
+        assert overlap < k, ('overlap must be < kmer length')
+        i = 0
+        while i < kmax:
+            yield i, seq[i:i + k]
+            i += k - overlap
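+    # Illustrative: list(Fasta.as_kmers('ACGTACGT', 4)) yields (0, 'ACGT') and
+    # (4, 'ACGT'); with overlap=1 the window advances by k - overlap = 3 bases.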
+
+    def gen_seqs_with_headers(self, key_fn=None):
+        """
+        Remove all newlines from each sequence in the fasta file and yield
+        (header, sequence) pairs for the record class to index.
+        """
+        fh = open(self.fasta_name, 'r')
+        # Flatten the sequences (remove newlines) and
+        # check the uniqueness of the headers.
+        seen_headers = {}
+        header = None
+        seqs = None
+        for line in fh:
+            line = line.rstrip()
+            if not line:
+                continue
+            if line[0] == ">":
+                if seqs is not None:
+                    if header in seen_headers:
+                        raise DuplicateHeaderException(header)
+                    seen_headers[header] = None
+                    yield header, "".join(seqs)
+                header = line[1:].strip()
+                if key_fn is not None:
+                    header = key_fn(header)
+                seqs = []
+            else:
+                seqs.append(line)
+        if seqs:
+            if header in seen_headers:
+                raise DuplicateHeaderException(header)
+            yield header, "".join(seqs)
+        fh.close()
+
+    def iterkeys(self):
+        for k in self.index.iterkeys():
+            yield k
+
+    def iteritems(self):
+        for k in self.keys():
+            yield k, self[k]
+
+    def keys(self):
+        return self.index.keys()
+
+    def sequence(self, f, asstring=True, auto_rc=True, exon_keys=None, one_based=True):
+        """
+        Take a feature and use the start/stop or exon_keys to return the sequence
+        from the associated fasta file. By default, return the full sequence between
+        start and stop, but if exon_keys is set to an iterable, search for those
+        keys and use the first one found to create the concatenated sequence.
+
+        For example, with exon_keys=('rnas', 'exons') the function looks for an
+        item with key 'rnas' first, then 'exons'; if it finds neither, it falls
+        back on the start/stop of the feature.
+
+        f: a feature dict; a feature can have exons.
+        asstring: if True, return the sequence as a string; if False, return a
+                  numpy array.
+        auto_rc: if True and the strand of the feature is -1 ('-'), return the
+                 reverse complement of the sequence.
+        one_based: if True, the query uses 1-based closed intervals; if False,
+                   zero-based half-open intervals.
+        """
+        assert 'chr' in f and f['chr'] in self, (f, f['chr'], self.keys())
+        fasta = self[f['chr']]
+        sequence = None
+        if exon_keys is not None:
+            sequence = self._seq_from_keys(f, fasta, exon_keys, one_based=one_based)
+        if sequence is None:
+            start = f['start'] - int(one_based)
+            sequence = fasta[start: f['stop']]
+        if auto_rc and f.get('strand') in (-1, '-1', '-'):
+            sequence = COMPLEMENT(sequence)[::-1]
+        if asstring:
+            return sequence
+        return numpy.array(sequence, dtype='c')
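+    # Hypothetical usage (assumes the fasta contains a sequence named 'chr1'):
+    #   f = {'chr': 'chr1', 'start': 9, 'stop': 12, 'strand': '-'}
+    #   fasta.sequence(f)  # reverse complement of 1-based bases 9..12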
+
+    def _seq_from_keys(self, f, fasta, exon_keys, base='locations', one_based=True):
+        """
+        f: a feature dict
+        fasta: a Fasta object
+        exon_keys: an iterable of keys, to look for start/stop arrays to get sequence.
+        base: if base ('locations') exists, look there for the exon_keys, not at
+              the top level of the feature dict.
+        """
+        fbase = f.get(base, f)
+        for ek in exon_keys:
+            if ek not in fbase:
+                continue
+            locs = fbase[ek]
+            seq = ""
+            for start, stop in locs:
+                seq += fasta[start - int(one_based):stop]
+            return seq
+        return None
diff -r 000000000000 -r bc3f2a5c7b53 tool-data/all_fasta.loc.sample
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/all_fasta.loc.sample Sun Jan 10 13:03:12 2016 -0500
@@ -0,0 +1,18 @@
+#This file lists the locations and dbkeys of all the fasta files
+#under the "genome" directory (a directory that contains a directory
+#for each build). The script extract_fasta.py will generate the file
+#all_fasta.loc. This file has the format (white space characters are
+#TAB characters):
+#
+#<unique_build_id>	<dbkey>	<display_name>	<file_path>
+#
+#So, all_fasta.loc could look something like this:
+#
+#apiMel3	apiMel3	Honeybee (Apis mellifera): apiMel3	/path/to/genome/apiMel3/apiMel3.fa
+#hg19canon	hg19	Human (Homo sapiens): hg19 Canonical	/path/to/genome/hg19/hg19canon.fa
+#hg19full	hg19	Human (Homo sapiens): hg19 Full	/path/to/genome/hg19/hg19full.fa
+#
+#Your all_fasta.loc file should contain an entry for each individual
+#fasta file. So there will be multiple fasta files for each build,
+#such as with hg19 above.
+#
diff -r 000000000000 -r bc3f2a5c7b53 tool_data_table_conf.xml.sample
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_data_table_conf.xml.sample Sun Jan 10 13:03:12 2016 -0500
@@ -0,0 +1,7 @@
+<tables>
+    <!-- Locations of all fasta files under genome directory -->
+    <table name="all_fasta" comment_char="#">
+        <columns>value, dbkey, name, path</columns>
+        <file path="tool-data/all_fasta.loc" />
+    </table>
+</tables>
diff -r 000000000000 -r bc3f2a5c7b53 tool_dependencies.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_dependencies.xml Sun Jan 10 13:03:12 2016 -0500
@@ -0,0 +1,6 @@
+
+
+
+
+
+