Mercurial > repos > alenail > chipsequtil_old
changeset 2:f59f5348d281 draft
Uploaded
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chipsequtil-master/.gitignore Mon Mar 07 16:18:10 2016 -0500 @@ -0,0 +1,5 @@ +*.swp +build +src/chipsequtil/org_settings.cfg +dist +*.pyc
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chipsequtil-master/MANIFEST.in Mon Mar 07 16:18:10 2016 -0500 @@ -0,0 +1,5 @@ +include README.txt +include org_settings.cfg.sample +include setup.* +recursive-include scripts *.py +recursive-include src *.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chipsequtil-master/README.txt Mon Mar 07 16:18:10 2016 -0500 @@ -0,0 +1,29 @@ +Installation +============ + +Before installing, make a copy of *org_settings.cfg.sample* to *org_settings.cfg* : + + $> cp org_settings.cfg.sample org_settings.cfg + +In the new *org_settings.cfg*, create/edit the paths and categories desired for +your system as appropriate. When you have configured the file to your +satisfaction, copy it into the root source directory: + + $> cp org_settings.cfg src/chipsequtil/ + +You can then install the package with: + + $> python setup.py install + + +If you'd like to install the package to a non-system directory (e.g., if you +don't have permission to install system-wide packages), you can provide the +*--prefix=PATH* argument to the install command: + + $> python setup.py install --prefix=/path/to/dir + +Remember to add */path/to/dir* to your PYTHONPATH environment variable if it +is not already there. If you wish to add more system-wide paths/organisms to +org_settings.cfg, either edit the file in the source directory as above and +reinstall (good way) or edit the file in the directory where the package is +installed (less good way).
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chipsequtil-master/docs/Makefile Mon Mar 07 16:18:10 2016 -0500 @@ -0,0 +1,89 @@ +# Makefile for Sphinx documentation +# + +# You can set these variables from the command line. +SPHINXOPTS = +SPHINXBUILD = sphinx-build +PAPER = +BUILDDIR = build + +# Internal variables. +PAPEROPT_a4 = -D latex_paper_size=a4 +PAPEROPT_letter = -D latex_paper_size=letter +ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source + +.PHONY: help clean html dirhtml pickle json htmlhelp qthelp latex changes linkcheck doctest + +help: + @echo "Please use \`make <target>' where <target> is one of" + @echo " html to make standalone HTML files" + @echo " dirhtml to make HTML files named index.html in directories" + @echo " pickle to make pickle files" + @echo " json to make JSON files" + @echo " htmlhelp to make HTML files and a HTML help project" + @echo " qthelp to make HTML files and a qthelp project" + @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" + @echo " changes to make an overview of all changed/added/deprecated items" + @echo " linkcheck to check all external links for integrity" + @echo " doctest to run all doctests embedded in the documentation (if enabled)" + +clean: + -rm -rf $(BUILDDIR)/* + +html: + $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html + @echo + @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." + +dirhtml: + $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml + @echo + @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." + +pickle: + $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle + @echo + @echo "Build finished; now you can process the pickle files." + +json: + $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json + @echo + @echo "Build finished; now you can process the JSON files." 
+ +htmlhelp: + $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp + @echo + @echo "Build finished; now you can run HTML Help Workshop with the" \ + ".hhp project file in $(BUILDDIR)/htmlhelp." + +qthelp: + $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp + @echo + @echo "Build finished; now you can run "qcollectiongenerator" with the" \ + ".qhcp project file in $(BUILDDIR)/qthelp, like this:" + @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/ChIPSeqUtil.qhcp" + @echo "To view the help file:" + @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/ChIPSeqUtil.qhc" + +latex: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo + @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." + @echo "Run \`make all-pdf' or \`make all-ps' in that directory to" \ + "run these through (pdf)latex." + +changes: + $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes + @echo + @echo "The overview file is in $(BUILDDIR)/changes." + +linkcheck: + $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck + @echo + @echo "Link check complete; look for any errors in the above output " \ + "or in $(BUILDDIR)/linkcheck/output.txt." + +doctest: + $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest + @echo "Testing of doctests in the sources finished, look at the " \ + "results in $(BUILDDIR)/doctest/output.txt."
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chipsequtil-master/docs/get_script_help.py Mon Mar 07 16:18:10 2016 -0500 @@ -0,0 +1,99 @@ +#!/usr/bin/env python + +import glob +import signal +import time +from subprocess import Popen, PIPE +from textwrap import TextWrapper + +class Alarm(Exception): + pass + +def alarm_handler(signum, frame): + raise Alarm + +signal.signal(signal.SIGALRM, alarm_handler) + +scripts = [#'../scripts/build_chipseq_infosite.py', + '../scripts/chipseq_pipeline.py', + #'../scripts/combine_gerald_stats.py', + #'../scripts/compare_microarray_binding.py', + '../scripts/create_pipeline_script.py', + '../scripts/extract_promoters.py', + '../scripts/filter_bed_by_position_count.py', + '../scripts/filter_macs_peaks.py', + '../scripts/filter_gps_peaks.py', + '../scripts/filter_mapped_known_genes.py', + #'../scripts/generate_stats_doc.py', + '../scripts/gerald_stats.py', + '../scripts/gerald_to_bed.py', + #'../scripts/integrate_macs_ucsc.py', + '../scripts/join_mapped_known_genes.py', + '../scripts/map_intervals.py', + '../scripts/map_peaks_to_genes.py', + '../scripts/map_peaks_to_known_genes.py', + '../scripts/motif_scan.py', + '../scripts/nibFrag.py', + '../scripts/org_settings.py', + '../scripts/peaks_to_fasta.py', + '../scripts/plot_pos_vs_neg_peaks.py', + '../scripts/plot_peak_loc_dist.py', + #'../scripts/probeset_to_known_gene.py', + '../scripts/rejection_sample_fasta.py', + '../scripts/sort_bed.py', + #'../scripts/split_file.py', + #'../scripts/split_qsub.py', + #'../scripts/THEME.sh', + #'../scripts/wait_for_qsub.py', + '../scripts/wait_for_jobid.py', + '../scripts/wqsub.py', + '../scripts/wqsub_drmaa.py', + ] + +if __name__ == '__main__' : + + tw = TextWrapper(initial_indent=" ",subsequent_indent=" ") + script_help_out = '' + refs = '' + for script in scripts : + cmd = 'python %s -h'%script + p = Popen(cmd,shell=True,stdout=PIPE,stderr=PIPE) + + stdout, stderr = None, None + signal.alarm(3) # 3 seconds + try: + stdout, stderr = 
p.communicate() + signal.alarm(0) # reset the alarm + except Alarm: + pass + + script_str = script.replace('../scripts/','') + + + refs += ' - :ref:`%(script_str)s <%(script_str)s>`\n'%{'script_str':script_str} + script_help_out += '.. _%s:\n\n'%script_str + script_help_out += '%s::\n\n'%script_str + if stderr is None : + script_help_out += tw.fill('empty docstring\n') + else : + script_help_out += '\n'.join([' '+x for x in stdout.split('\n')]) + script_help_out += '\n'.join([' '+x for x in stderr.split('\n')]) + script_help_out += '\n\n' + script_help_out += ':ref:`top <top>`\n\n' + + rst_str = """\ +Illumina pipeline script reference +================================== + +The following is the output of the scripts provided by this package when invoked +on the command line with *-h*. + +.. _top: + +Scripts: +%(refs)s + +%(script_help_out)s +"""%{'refs':refs,'script_help_out':script_help_out} + + print rst_str
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chipsequtil-master/docs/source/conf.py Mon Mar 07 16:18:10 2016 -0500 @@ -0,0 +1,198 @@ +# -*- coding: utf-8 -*- +# +# ChIPSeqUtil documentation build configuration file, created by +# sphinx-quickstart on Mon Oct 31 13:12:52 2011. +# +# This file is execfile()d with the current directory set to its containing dir. +# +# Note that not all possible configuration values are present in this +# autogenerated file. +# +# All configuration values have a default; values that are commented out +# serve to show the default. + +import sys, os + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +#sys.path.append(os.path.abspath('.')) + +# -- General configuration ----------------------------------------------------- + +# Add any Sphinx extension module names here, as strings. They can be extensions +# coming with Sphinx (named 'sphinx.ext.*') or your custom ones. +extensions = ['sphinx.ext.autodoc', 'sphinx.ext.intersphinx', 'sphinx.ext.todo', 'sphinx.ext.coverage', 'sphinx.ext.pngmath'] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# The suffix of source filenames. +source_suffix = '.rst' + +# The encoding of source files. +#source_encoding = 'utf-8' + +# The master toctree document. +master_doc = 'index' + +# General information about the project. +project = u'ChIPSeqUtil' +copyright = u'2011, Adam Labadorf' + +# The version info for the project you're documenting, acts as replacement for +# |version| and |release|, also used in various other places throughout the +# built documents. +# +# The short X.Y version. +version = '1.5' +# The full version, including alpha/beta/rc tags. +release = '1.5' + +# The language for content autogenerated by Sphinx. 
Refer to documentation +# for a list of supported languages. +#language = None + +# There are two options for replacing |today|: either, you set today to some +# non-false value, then it is used: +#today = '' +# Else, today_fmt is used as the format for a strftime call. +#today_fmt = '%B %d, %Y' + +# List of documents that shouldn't be included in the build. +#unused_docs = [] + +# List of directories, relative to source directory, that shouldn't be searched +# for source files. +exclude_trees = [] + +# The reST default role (used for this markup: `text`) to use for all documents. +#default_role = None + +# If true, '()' will be appended to :func: etc. cross-reference text. +#add_function_parentheses = True + +# If true, the current module name will be prepended to all description +# unit titles (such as .. function::). +#add_module_names = True + +# If true, sectionauthor and moduleauthor directives will be shown in the +# output. They are ignored by default. +#show_authors = False + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'sphinx' + +# A list of ignored prefixes for module index sorting. +#modindex_common_prefix = [] + + +# -- Options for HTML output --------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. Major themes that come with +# Sphinx are currently 'default' and 'sphinxdoc'. +html_theme = 'default' + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +#html_theme_options = {} + +# Add any paths that contain custom themes here, relative to this directory. +#html_theme_path = [] + +# The name for this set of Sphinx documents. If None, it defaults to +# "<project> v<release> documentation". +#html_title = None + +# A shorter title for the navigation bar. Default is the same as html_title. 
+#html_short_title = None + +# The name of an image file (relative to this directory) to place at the top +# of the sidebar. +#html_logo = None + +# The name of an image file (within the static path) to use as favicon of the +# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 +# pixels large. +#html_favicon = None + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] + +# If not '', a 'Last updated on:' timestamp is inserted at every page bottom, +# using the given strftime format. +#html_last_updated_fmt = '%b %d, %Y' + +# If true, SmartyPants will be used to convert quotes and dashes to +# typographically correct entities. +#html_use_smartypants = True + +# Custom sidebar templates, maps document names to template names. +#html_sidebars = {} + +# Additional templates that should be rendered to pages, maps page names to +# template names. +#html_additional_pages = {} + +# If false, no module index is generated. +#html_use_modindex = True + +# If false, no index is generated. +#html_use_index = True + +# If true, the index is split into individual pages for each letter. +#html_split_index = False + +# If true, links to the reST sources are added to the pages. +#html_show_sourcelink = True + +# If true, an OpenSearch description file will be output, and all pages will +# contain a <link> tag referring to it. The value of this option must be the +# base URL from which the finished HTML is served. +#html_use_opensearch = '' + +# If nonempty, this is the file name suffix for HTML files (e.g. ".xhtml"). +#html_file_suffix = '' + +# Output file base name for HTML help builder. +htmlhelp_basename = 'ChIPSeqUtildoc' + + +# -- Options for LaTeX output -------------------------------------------------- + +# The paper size ('letter' or 'a4'). 
+#latex_paper_size = 'letter' + +# The font size ('10pt', '11pt' or '12pt'). +#latex_font_size = '10pt' + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, author, documentclass [howto/manual]). +latex_documents = [ + ('index', 'ChIPSeqUtil.tex', u'ChIPSeqUtil Documentation', + u'Adam Labadorf', 'manual'), +] + +# The name of an image file (relative to this directory) to place at the top of +# the title page. +#latex_logo = None + +# For "manual" documents, if this is true, then toplevel headings are parts, +# not chapters. +#latex_use_parts = False + +# Additional stuff for the LaTeX preamble. +#latex_preamble = '' + +# Documents to append as an appendix to all manuals. +#latex_appendices = [] + +# If false, no module index is generated. +#latex_use_modindex = True + + +# Example configuration for intersphinx: refer to the Python standard library. +intersphinx_mapping = {'http://docs.python.org/': None}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chipsequtil-master/docs/source/index.rst Mon Mar 07 16:18:10 2016 -0500 @@ -0,0 +1,51 @@ +.. ChIPSeqUtil documentation master file, created by + sphinx-quickstart on Mon Oct 31 13:12:52 2011. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +Welcome to ChIPSeqUtil's documentation! +======================================= + +ChIPSeqUtil is a python module and accompanying set of scripts used in the +analysis of ChIPSeq short read data. It is designed as a 'push-button' solution +that is easy for non-linux-experts to use but is flexible and extensible enough +to accomodate special cases when they inevitably arise. The default pipeline +performs the following analysis steps: + +1. runs a peak caller (MACS by default) +2. optionally creates and stages bigwig files for viewing on UCSC Genome Browser +3. filters peaks based on confidence criteria (e.g. p-value) +4. maps peaks to genes using UCSC knownGene annotations +5. performs hypothesis-based motif analysis using TRANSFAC motifs +6. builds a web page consolidating results + +ChIPSeqUtil has the following dependencies: + + - MACS (or some other peaks caller) + - TAMO + - reStUtil + - pypeline + - bx python + +.. note:: add links to these bullets + +ChIPSeqUtil has only been tested on ubuntu-based linux distributions and no +certification is made for other OSes. That being said, some/all of it may +still work. + +Contents: + +.. toctree:: + :maxdepth: 2 + + quick_start + script_reference + module_reference + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chipsequtil-master/docs/source/module_reference.rst Mon Mar 07 16:18:10 2016 -0500 @@ -0,0 +1,11 @@ + +Module Reference +================ + +The module documentation of the chipsequtil python package is here. + +.. toctree:: + + module_src/chipsequtil + module_src/nib + module_src/seq
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chipsequtil-master/docs/source/module_src/chipsequtil.rst Mon Mar 07 16:18:10 2016 -0500 @@ -0,0 +1,27 @@ + +chipsequtil +=========== + +Contents +-------- + +.. toctree:: + + file_wrappers + org_settings + + +.. automodule:: chipsequtil + :members: + :undoc-members: + +Miscellaneous Functions +----------------------- + +.. autofunction:: get_file_parts +.. autofunction:: parse_number +.. autofunction:: gerald_to_bed +.. autofunction:: reverse_complement +.. autofunction:: get_gc_content +.. autofunction:: get_gc_content_distribution +.. autofunction:: get_size_distribution
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chipsequtil-master/docs/source/module_src/file_wrappers.rst Mon Mar 07 16:18:10 2016 -0500 @@ -0,0 +1,28 @@ + +File Wrappers +============= + +.. module:: chipsequtil + +.. autoclass:: SmartFileIter + :members: + +SmartFileIter-based classes +--------------------------- + +.. autoclass:: BEDFile +.. autoclass:: GPSFile +.. autoclass:: MACSFile +.. autoclass:: KnownGeneFile + +Other wrappers +-------------- + +Not all of the file wrappers in this package have been converted to SmartFileIters +yet, these work but are less robust. + +.. autoclass:: AffyBiocFile +.. autoclass:: GERALDOutput + :members: +.. autoclass:: RefGeneFile +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chipsequtil-master/docs/source/module_src/motiftools.rst Mon Mar 07 16:18:10 2016 -0500 @@ -0,0 +1,56 @@ + +Motif Classes and Functions +=========================== + +This module is essentially a copy of TAMO.MotifTools, moved into chipsequtil +for strategic sheep purposes. + +.. automodule:: chipsequtil.motiftools + +The Motif Class +--------------- + +.. autoclass:: Motif + :members: + +Functions +--------- + +.. .. autofunction:: revcomplement +.. autofunction:: Motif_from_ll +.. autofunction:: Motif_from_counts +.. autofunction:: Motif_from_text +.. autofunction:: copy +.. .. autofunction:: minwindowdiff +.. .. autofunction:: minaligndiff +.. autofunction:: diff +.. autofunction:: maskdiff +.. autofunction:: infomaskdiff +.. autofunction:: diverge +.. autofunction:: bestseqs +.. autofunction:: seqs2fasta +.. autofunction:: top_nmers +.. autofunction:: m_matches +.. autofunction:: compare_seqs +.. autofunction:: shuffle_bases +.. autofunction:: random_diff_avestd +.. autofunction:: random_motif +.. autofunction:: toDict +.. autofunction:: toDictVect +.. autofunction:: submotif +.. autofunction:: shuffledP +.. autofunction:: revcompmotif +.. autofunction:: sum +.. autofunction:: giflogo +.. autofunction:: seqlogo +.. autofunction:: merge +.. autofunction:: avestd +.. autofunction:: load +.. autofunction:: save_motifs +.. autofunction:: print_motif +.. autofunction:: print_motifs +.. autofunction:: nlog10 +.. autofunction:: txt2motifs +.. autofunction:: pickletxt2motifs +.. autofunction:: sortby +.. .. autoclass:: MotifToolsException
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chipsequtil-master/docs/source/module_src/nib.rst Mon Mar 07 16:18:10 2016 -0500 @@ -0,0 +1,36 @@ + +.. module:: chipsequtil.nib + +nibFrag API +=========== + +These functions and classes are a native python implementation of Jim Kent's nibFrag +utility and file format. The scripts and classes read *.nib* files and can +extract sequences from them as fast or faster than the standalone tools, and +also make sequence data accessible and efficient from within python scripts. +There is no provided utility to create *.nib* files, the original source scripts +must be used and are not provided in this distribution. They might be found on +`Jim Kent's homepage <http://users.soe.ucsc.edu/~kent/>`_. + + +The NibDB Class +--------------- + +.. autoclass:: NibDB + :members: + +Functions +--------- + +Most of these functions should not be used directly, rather they are called +by the NibDB class and implement the gritty details of reading *.nib* files. +Use the NibDB class instead unless you know what you're doing. + + +.. autofunction:: get_nib +.. autofunction:: get_nib_batch +.. autofunction:: get_nib_seq +.. autofunction:: get_nib_header +.. autofunction:: get_nib_header_batch +.. autofunction:: validate_nib_file +.. autofunction:: get_nib_seq_batch
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chipsequtil-master/docs/source/module_src/org_settings.rst Mon Mar 07 16:18:10 2016 -0500 @@ -0,0 +1,91 @@ + +The `org_settings` System +========================= + +Many scripts in this package require a number of different source files that all +correspond to a single reference genome (*e.g.* mm9). The `org_settings` set of +functions and *org_settings.py* script consolidates sets of paths/variables that +correspond to different references to be bundled together in a customizable, +accessible way. The bundles are configured as a package-wide settings on install +and alternatively by a user-specific configuration file. The format of the file +follows the conventions in `configparser`_. + +.. _configparser: http://docs.python.org/library/configparser.html + +Reference genomes are specified in a configuration file as follows:: + + [mm9] + description=UCSC mm9 (July '07 build) with full TRANSFAC hypothesis set + genome=mm9 + genome_dir=/nfs/genomes/mouse_gp_jul_07 + genome_size=2107000000 + ucsc_chrom_sizes=%(genome_dir)s/%(genome)s.chrom.sizes + annotation_path=%(genome_dir)s/anno/refFlat-%(genome)s.txt + refgene_anno_path=%(genome_dir)s/anno/refFlat-%(genome)s.txt + known_gene_anno_path=%(genome_dir)s/anno/knownGene-%(genome)s.txt + known_gene_xref_path=%(genome_dir)s/anno/kgXref-%(genome)s.txt + affy_to_known_path=%(genome_dir)s/anno/knownToMOE43-%(genome)s.txt + theme_hypotheses=/nfs/vendata/cwng/TRANSFAC/2010_transfac_vert_all_filtic9.tamo + theme_markov=/nfs/data/cwng/chipseq/hypotheses/Mouse.markov + +This will make **mm9** available as an organism reference to the `org_settings` +functions. The *ucsc_chrom_sizes*, *annotation_path*, *refgene_anno_path*, +*known_gene_anno_path*, *known_gene_xref_path*, and *affy_to_known_path* are +files downloaded from http://hgdownload.cse.ucsc.edu/downloads.html organims +annotation databases. 
The fields in the above example are all required for the +package to work properly - however, any additional variables may be added as +desired. + +API Functions +------------- + +.. module:: chipsequtil + +.. autofunction:: get_org_settings +.. autofunction:: get_all_settings +.. autofunction:: get_global_settings +.. autofunction:: get_local_settings +.. autofunction:: check_org_settings + +The *org_settings.py* script +---------------------------- + +The script *org_settings.py* is a command line interface into the `org_settings` +system. It has the following usage:: + + $> org_settings.py -h + Usage: org_settings.py [options] [<org key> [<org setting>]] + + Tool for retrieving sets of organism-specific settings and paths. Original + paths are set at install time, and can be overridden in the file ~/.org + settings.cfg. Allows output of settings in a variety of shell environment + syntaxes. The tool attempts to guess which shell environment is being used by + examining the SHELL environment variable unless explicitly set. When run + without an argument, returns a listing of all settings available. + + Options: + -h, --help show this help message and exit + -s SYNTAX, --syntax=SYNTAX + syntax flavor of output to produce + [default: %auto] + -l, --list print all available settings for + human consumption + $> org_settings.py -s bash mm9 genome_dir + /nfs/genomes/mouse_gp_jul_07 + $> + +If you use bash as your shell, you can use shell expansion to conveniently build +commands such as the following:: + + $> map_peaks_to_known_genes.py $(org_settings.py mm9 known_gene_anno_path) \ + $(org_settings.py mm9 known_gene_xref_path) macs_peaks.xls + +Installing +---------- + +The file *org_settings.cfg* exists in the root directory of the source distribution. +This file should be modified and then copied into the *src/chipsequtil/* directory +before installation for org settings that should be available on the system as a +whole. 
Alternatively, users may create the file *.org_settings.cfg* in their home +directories and add sections like the one above so they may customize their own +sets of variables.
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chipsequtil-master/docs/source/module_src/seq.rst Mon Mar 07 16:18:10 2016 -0500 @@ -0,0 +1,34 @@ + +.. module:: chipsequtil.seq + +Sequence data functions and classes +=================================== + +This module has simple methods for reading in FASTA and FASTQ formatted files. +*fasta_itr* and *fastq_itr* should be used when it is unnecessary or undesired +to have all sequences loaded into memory. *FASTAFile* and *FASTQFile* classes +store all sequence information in memory, but allow efficient dictionary-style +random access to sequences and quality scores as well as repeated whole-file +iteration. + +Functions +--------- + +.. autofunction:: fasta_itr +.. autofunction:: fasta_to_dict +.. autofunction:: write_fasta_to_file + +.. autofunction:: fastq_itr +.. autofunction:: fastq_to_dict +.. autofunction:: write_fastq_to_file + +Classes +------- + +.. autoclass:: FASTAFile + :members: + +.. autoclass:: FASTQFile + :members: + +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chipsequtil-master/docs/source/module_src/util.rst Mon Mar 07 16:18:10 2016 -0500 @@ -0,0 +1,4 @@ + +Utility functions and classes +============================= +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chipsequtil-master/docs/source/quick_start.rst Mon Mar 07 16:18:10 2016 -0500 @@ -0,0 +1,5 @@ + +Quick Start Documentation +========================= + +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chipsequtil-master/docs/source/script_reference.rst Mon Mar 07 16:18:10 2016 -0500 @@ -0,0 +1,892 @@ +Illumina pipeline script reference +================================== + +The following is the output of the scripts provided by this package when invoked +on the command line with *-h*. + +.. _top: + +Scripts: + - :ref:`chipseq_pipeline.py <chipseq_pipeline.py>` + - :ref:`create_pipeline_script.py <create_pipeline_script.py>` + - :ref:`extract_promoters.py <extract_promoters.py>` + - :ref:`filter_bed_by_position_count.py <filter_bed_by_position_count.py>` + - :ref:`filter_macs_peaks.py <filter_macs_peaks.py>` + - :ref:`filter_gps_peaks.py <filter_gps_peaks.py>` + - :ref:`filter_mapped_known_genes.py <filter_mapped_known_genes.py>` + - :ref:`gerald_stats.py <gerald_stats.py>` + - :ref:`gerald_to_bed.py <gerald_to_bed.py>` + - :ref:`join_mapped_known_genes.py <join_mapped_known_genes.py>` + - :ref:`map_intervals.py <map_intervals.py>` + - :ref:`map_peaks_to_genes.py <map_peaks_to_genes.py>` + - :ref:`map_peaks_to_known_genes.py <map_peaks_to_known_genes.py>` + - :ref:`motif_scan.py <motif_scan.py>` + - :ref:`nibFrag.py <nibFrag.py>` + - :ref:`org_settings.py <org_settings.py>` + - :ref:`peaks_to_fasta.py <peaks_to_fasta.py>` + - :ref:`plot_pos_vs_neg_peaks.py <plot_pos_vs_neg_peaks.py>` + - :ref:`plot_peak_loc_dist.py <plot_peak_loc_dist.py>` + - :ref:`rejection_sample_fasta.py <rejection_sample_fasta.py>` + - :ref:`sort_bed.py <sort_bed.py>` + - :ref:`wait_for_jobid.py <wait_for_jobid.py>` + - :ref:`wqsub.py <wqsub.py>` + - :ref:`wqsub_drmaa.py <wqsub_drmaa.py>` + + +.. 
_chipseq_pipeline.py: + +chipseq_pipeline.py:: + + Usage: chipseq_pipeline.py [options] <organism> <experiment alignment filename> [<control alignment filename>] + + 1st generation ChIPSeq analysis pipeline: + + - runs MACS to find peaks and sorts peaks by p-value + - sorts peaks by pvalue and isolates top *n* + - maps peaks to genes + - extracts fasta files for gene peaks in experiments + - constructs background sequences matching foreground distribution + - runs THEME.py on input sequences w/ refinement + - builds an infosite with stats from this analysis + + Control input file is optional. *organism* argument is passed to the + *org_settings.py* command to specify organism specific parameters, ensure + that the following commands return valid paths: + + If running MACS: + - org_settings.py <organism> genome_size + - org_settings.py <organism> genome_dir + - org_settings.py <organsim> refgene_anno_path + + If running THEME: + - org_settings.py <organism> theme_hypotheses + - org_settings.py <organism> theme_markov + + + + Options: + -h, --help show this help message and exit + --auto run all steps non-interactively (for batch mode, e.g.) 
+ --steplist=STEPLIST with --auto, run specific steps + --exp-name=EXP_NAME name for the experiment/pipeline, used for convenience + [default: current directory name] + --bed-args=BED_ARGS double quote wrapped arguments for gerald_to_bed.py + [default: --stdout --chromo-strip=.fa] + --macs-exec=MACS_EXEC + the executable to use for MACS, if not an absolute + path it needs to be on your shell environment path + [default: macs14] + --macs-args=MACS_ARGS + double quote wrapped arguments for macs, only changing + --mfold, --tsize, --bw, and --pvalue recommended + [default: --pvalue=1e-5] + --map-args=MAP_ARGS double quote wrapped arguments for mapping peaks to + genes [default: --tss --upstream-window=10000 + --downstream-window=10000] + --filter-peaks-args=FILTER_PEAKS_ARGS + double quote wrapped arguments for + filter_macs_peaks.py [default: --sort-by=pvalue + --top=1000 -f 'tags>20'] + --filter-neg-peaks-args=FILTER_NEG_PEAKS_ARGS + double quote wrapped arguments for + filter_macs_peaks.py applied to negative peaks + [default: -f 'tags>20'] + --peaks-to-fa-args=PEAKS_TO_FA_ARGS + double quote wrapped arguments for peaks_to_fasta.py + [default: --fixed-peak-width=200] + --bg-exec=BG_EXEC the executable to use for generating background + sequences for THEME, if not an absolute path it needs + to be on your shell environment path [default: + rejection_sample_fasta.py] + --bg-args=BG_ARGS double quote wrapped arguments for background sequence + generation utility [default: --num-seq=2.1x] + --theme-args=THEME_ARGS + double quote wrapped arguments for THEME.py [default: + --beta=0.7 --cv=5 --trials=25] + --motif-pval-cutoff=MOTIF_PVAL + the p-value cutoff for sending non-refined enrichmed + motifs to THEME for refinement + --parallelize parallelize portions of the pipeline using qsub, only + works from SGE execution hosts + --ucsc perform tasks for automated integration with UCSC + genome browser [default:False] + --build-infosite-args=INFOSITE_ARGS + arguments to pass 
to build_chipseq_infosite.py + [default: None] + + UCSC Integration Options (with --ucsc): + --stage-dir=STAGE_DIR + root directory where UCSC integration files should be + made available [default: ./] + --stage-url=STAGE_URL + URL where UCSC integration files will be made + available over the web [default: http://localhost/] + + Note: it is advised to leave the --*-args arguments unchanged + unless you really know what you're doing. + + +:ref:`top <top>` + +.. _create_pipeline_script.py: + +create_pipeline_script.py:: + + This is an interactive script that creates an executable script to use + for ChIPSeq analyses. When prompted for experiment and control files, + tab completion is available a la bash or tcsh shells. Press Ctrl-C at + any time to quit. + Usage: create_pipeline_script.py + + Script for creating a custom run script for ChIPSeq/DNAse hypersensitivity + experiments. User is asked for paths and settings required for ChIPSeq + analysis using the *chipseq_pipeline.py* utility and produces an executable + run script with helpful information on how to run it. Also creates a JSON + formatted file containing all the parameters for this pipeline run. + + Options: + -h, --help show this help message and exit + + Note: this script only works in Unix-style environments + + ================= ChIPSeq Experiment Pipeline Script Generator ================= + + +:ref:`top <top>` + +.. _extract_promoters.py: + +extract_promoters.py:: + + Usage: extract_promoters.py [options] <organism> + + Extract the promoter sequences in FASTA format from all genes + or a list of genes specified in an input file. Gene annotation is RefGene + corresponding to the organism passed in, paths returned by: + + $> org_settings.py <organism> refgene_anno_path + $> org_settings.py <organism> genome_dir + + must be valid. 
+ + Options: + -h, --help show this help message and exit + -u UPSTREAM, --upstream=UPSTREAM + upstream window from TSS to extract [default: 3000] + -d DOWNSTREAM, --downstream=DOWNSTREAM + downstream window from TSS to extract [default: 1000] + -l GENE_LIST, --gene-list=GENE_LIST + file containing a list of gene identifiers to extract, + one per line [default: none] + -t GENE_TYPE, --gene-type=GENE_TYPE + type of gene identifier in gene list, choose from + ['symbol', 'refgene'] [default: symbol] + -o OUTPUT, --output=OUTPUT + file to write fasta records to [default: stdout] + + +:ref:`top <top>` + +.. _filter_bed_by_position_count.py: + +filter_bed_by_position_count.py:: + + Usage: filter_bed_by_position_count.py [options] <bed file> + + Analyze BED file and filter out alignments above some threshold that align to + a single genomic position. + + Options: + -h, --help show this help message and exit + -n MAX_COUNT, --max-count=MAX_COUNT + max tag count at a given position, filter above + [default: 5] + --output=OUTPUT write output to file + + Note: only works if BED file is sorted! + + +:ref:`top <top>` + +.. _filter_macs_peaks.py: + +filter_macs_peaks.py:: + + Usage: filter_macs_peaks.py [options] <MACS peak file> + + Filter MACS peaks by supplied criteria. Available filter features are: + + length + tags + pvalue + fold_enrichment + fdr + + Filters are provided as expressions using the [-f |--filter] option, e.g. the + command + + filter_macs_peaks.py -f "tags>100" --filter="pvalue<=1e-9" + --filter="100<length<=200" <MACS peak file> + + finds only peaks with more than 100 tags, a pvalue of less than 1e9, and a + length between 100, exclusive, and 200, inclusive. Any number of filters may + be provided, and only peaks that match *all* filters pass. User is warned if + filters result in zero results. Only inequality operators are valid. + Invoking with no filter arguments returns all peaks. To sort, use the --sort- + by option, e.g. 
+ + filter_macs_peaks.py -f "pvalue<=1e-9" --sort-by=pvalue <MACS peak file> + + sorts peaks with a pvalue smaller than 1e-9 with the smallest pvalue peaks. + All fields are sorted ascending by default. Output is prepended with comments + describing what the file contains, i.e. which filters are applied, how many + records there are, etc. + + Note: MACS -10*log10(pvalue) values are converted to normal pvalues + + + Options: + -h, --help show this help message and exit + -f FILTERS, --filter=FILTERS + add filter expression + --sort-by=SORT_BY comma delimited list of features to sort by, filtered + peaks are not sorted by default, if provided peaks are + sorted ascending by default + --sort-dir=SORT_DIR direction to sort [default: ASCEND] + --top=TOP accepts an integer, output at most this many peaks + [default: all] + --output=OUTPUT filename to output filtered peaks to [default: stdout] + --encode-filters write out records to a file <MACS peaks + file>_<filters>.xls (incompatible with --output + option) + --summary only print out summary information for the filter + --no-header do not print out header or metadata info + --shuffle shuffle order of filtered records, useful for + selecting random peaks + --print-encoded-fn print out the filename that would be created by + --encode-filters + + +:ref:`top <top>` + +.. _filter_gps_peaks.py: + +filter_gps_peaks.py:: + + Usage: filter_gps_peaks.py [options] <GPS peak file> + + Filter GPS peaks by supplied criteria. Available filter features are: + + IP + Control + Fold + qvalue + pvalue + IPvsEMP + IPvsCTR + + Filters are provided as expressions using the [-f |--filter] option, e.g. the + command + + filter_gps_peaks.py -f "IP>100" --filter="pvalue<=1e-9" <GPS peak file> + + finds only peaks with more than 100 tags and a pvalue of less than 1e9. Any + number of filters may be provided, and only peaks that match *all* filters + pass. User is warned if filters result in zero results. Only inequality + operators are valid. 
Invoking with no filter arguments returns all peaks. To + sort, use the --sort-by option, e.g. + + filter_gps_peaks.py -f "pvalue<=1e-9" --sort-by=pvalue <GPS peak file> + + sorts peaks with a pvalue smaller than 1e-9 with the smallest pvalue peaks. + All fields are sorted ascending by default. Output is prepended with comments + describing what the file contains, i.e. which filters are applied, how many + records there are, etc. + + Note: GPS P_-log10 and Q_-log10 values are converted to normal pvalues and + qvalues + + + Options: + -h, --help show this help message and exit + -f FILTERS, --filter=FILTERS + add filter expression + --sort-by=SORT_BY comma delimited list of features to sort by, filtered + peaks are not sorted by default, if provided peaks are + sorted ascending by default + --sort-dir=SORT_DIR direction to sort [default: ASCEND] + --top=TOP accepts an integer, output at most this many peaks + [default: all] + --output=OUTPUT filename to output filtered peaks to [default: stdout] + --encode-filters write out records to a file <GPS peaks + file>_<filters>.xls (incompatible with --output + option) + --summary only print out summary information for the filter + --no-header do not print out header or metadata info + --shuffle shuffle order of filtered records, useful for + selecting random peaks + --print-encoded-fn print out the filename that would be created by + --encode-filters + + +:ref:`top <top>` + +.. _filter_mapped_known_genes.py: + +filter_mapped_known_genes.py:: + + Usage: filter_mapped_known_genes.py [options] <mapped known genes file> + + Filter columns and rows from *join_mapped_known_genes.py* output which was + invoked with *--binary-plus* and *--field-types* flags. Specify full column + names for either binding or expression data with the *--bind-cols* and + *--affy-cols* arguments, respectively. The special fieldname *MAPPED* from + *join_mapped_known_genes.py* is used to determine whether a file contains a + mapping for each gene. 
To filter genes by their associated binding or + expression data, specify *--bind-filter* or *--affy-filter* as follows: + + - *any* - report gene if at least one input file maps to the gene + - *all* - report gene if every input file maps to the gene + - *absent* - report gene if no input file maps to the gene + - *none* - do not filter genes at all (default) + + Results of binding and expression filters are 'and'ed together, e.g.: + + --bind-filter=all --affy-filter=absent + + returns only genes for which all binding files and none of the expression + files map. + + + Options: + -h, --help show this help message and exit + --bind-cols=BIND_COLS + comma delimited list of binding data column names to + include, [default: all] + --affy-cols=AFFY_COLS + comma delimited list of expression data column names + to include, [default: all] + --bind-filter=BIND_FILT + gene set to include based on binding data [default: + none] + --affy-filter=AFFY_FILT + gene set to include based on expression data [default: + none] + --output=OUTPUT write output to file + + Note: when specifying column names, be sure to escape characters like + (,),&,*,etc... that shells interpret with a \, e.g. --bind- + cols=-10\*log10\(pvalue\) + + +:ref:`top <top>` + +.. _gerald_stats.py: + +gerald_stats.py:: + + Usage: gerald_stats.py [options] <filename> [<filename>...] + + Outputs various stats about the GERALD formatted file(s) input. If multiple + files are provided statistics are aggregated according to the specified output + format. Output formats available via --format=X : + + # *python* - print an eval()'able python dictionary w/ counts + # *rst* - print statistics in a reStructured text table (default) + # *tab* - print statistics in a tab delimited form w/ header names + + Except for *python* format, each input file has its own output line. *python* + summarizes all alignments. 
+ + + Options: + -h, --help show this help message and exit + --output=OUTPUT write output to file [default: stdout] + --format=FORMAT format to print out stats [default: rst] + + +:ref:`top <top>` + +.. _gerald_to_bed.py: + +gerald_to_bed.py:: + + Usage: gerald_to_bed.py [options] <GERALD file> [<GERALD file>...] + + Convert the GERALD alignment formatted files into BED format. Input file + named <path>/<filename>.<ext> is translated into <path>/<filename>.bed unless + --output or --stdout is specified, in which case formatted lines are written + to file or standard output, respectively. If multiple input files are + supplied with the --output or --stdout option all formatted lines are + concatenated together. Formatting only occurs for GERALD input lines that have + a valid Match Position field (i.e. successfully aligned somewhere). + + Options: + -h, --help show this help message and exit + --output=OUTPUT write all records to file + --stdout write out all formatted lines to stdout + --min-fields only format the first three fields + --pass-only only format lines with Y in the Pass Filtering field + --chromo-strip=CHROMO_STRIP + pattern to remove from chromo field in BED output + (e.g. --chromo-strip=.fa to remve .fa from chrX.fa) + [default: .fa] + + +:ref:`top <top>` + +.. _join_mapped_known_genes.py: + +join_mapped_known_genes.py:: + + Usage: join_mapped_known_genes.py -b <mapped DNA binding file>|-a <mapped microarray file> [-b <mapped DNA binding file> ...] [-a <mapped microarray file> ...] + + Join all files on the first column, concatenating records with matching + entries onto one line per entry. Understands DNA binding data as mapped with + *map_peaks_to_known_genes.py* utility microarray data as mapped by + *probeset_to_known_genes.py* utility, passed to program using *-b* and *-a* + options respectively. If a file contains more than one mapping to a gene + additional columns are added. At least one file of either type is required. 
+ Field names are written as <filename>.<original field name>.<map number> + + Options: + -h, --help show this help message and exit + -a AFFY_FILE, --affy-file=AFFY_FILE + add a mapped microarray file + -b BIND_FILE, --bind-file=BIND_FILE + add a mapped DNA binding file (e.g. MACS, BED) + -m MACS_FILE, --macs-file=MACS_FILE + DEPRECATED: use -b instead, add a mapped default MACS + formatted peaks (*.xls) file + --output=OUTPUT file to output joined records to [default: stdout] + --first-only only output the first mapping to a gene from each file + --binary output only one column per file with a 0 or 1 to + indicate whether a mapping exists in that file + --binary-plus output one column per file with a 0 or 1 to indicate + whether a mapping exists in that file in addition to + all other columns + --field-types prepend BIND or AFFY to the beginning of all + appropriate columns + + Note: microarray files should have been created by bioconductor, and all files + should have a row of fieldnames as the first line + + +:ref:`top <top>` + +.. _map_intervals.py: + +map_intervals.py:: + + Usage: map_intervals.py [options] <from> <to> + + Find records in <to> interval file that map to records in <from> interval + file. Files should be tab delimited and are expected to have a chromosome + column, a start column, and an end column. The indices of these columns can + be specified on the command line but by default are the first three columns, + respectively. Prints out to stdout by default one new line separated row per + row in <from> with a line from <to> where there is a mapping. If no mapping is + found (e.g. when specifying a maximum margin to search within) the word None + is printed. By default only prints nearest record, with ties settled by + smallest line number in <to>. 
+ + Options: + -h, --help show this help message and exit + -w WINDOW, --window=WINDOW + window as <int upstream> <int downstream> to search + for intervals [default: (1000000000.0, 1000000000.0)] + -f FROM_IND, --from=FROM_IND + coordinates of chromosome, start, stop in <from> file + -i, --skip-from-header + <from> has a header that should be skipped + -t TO_IND, --to=TO_IND + coordinates of chromosome, start, stop in <to> file + -j, --skip-to-header <to> has a header that should be skipped + + +:ref:`top <top>` + +.. _map_peaks_to_genes.py: + +map_peaks_to_genes.py:: + + Usage: map_peaks_to_genes.py [options] <refGene file> <peaks file> + + Map the peaks in <peaks file> to genes in <refGene file>. <refGene file> is + format is as specified in + http://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/refGene.sql. <peaks + file> format is as produced by MACS. + + Options: + -h, --help show this help message and exit + --upstream-window=UPST_WIN + window width in base pairs to consider promoter region + [default: 5500] + --downstream-window=DNST_WIN + window width in base pairs to consider downstream + region [default: 2500] + --map-output=PEAK_OUTPUT + filename to output mapped peaks in BED format to + [default: stdout] + --stats-output=STATS_OUTPUT + filename to output summary stats in conversion + [default: stderr] + --peaks-format=PEAKS_FMT + format of peaks input file [default: MACS] + + +:ref:`top <top>` + +.. _map_peaks_to_known_genes.py: + +map_peaks_to_known_genes.py:: + + Usage: map_peaks_to_known_genes.py [options] <knownGene file> <knownGene xRef file> <peaks file> + + + Map the peaks in <peaks file> to genes in <knownGene file>. <knownGene file> + isformat is as specified in + http://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/knownGene.sql.<peaks + file> format is as produced by MACS. If *auto* is chosen (default) file + extension is examined for *.xls* for default MACS format or *.bed* for BED + format. 
If the --detail option is provided, the following extra fields are
Only BED formatting is accepted at the moment. All + sequences are concatenated together in FASTA format. To retrieve the entire + sequence, use END as the end argument. + + Options: + -h, --help show this help message and exit + --no-header only output sequence (no fasta header) + --wrap-width=WRAP_WIDTH + wrap output sequence at this number of bases, 0 + indicates no wrap (sequence ends up on single line) + [default: 50] + --batch run in batch mode, interpret arguments after nib file + as queries + --batch-format=BATCH_FORMAT + format to interpret batch files [default: BED] + + Original nibFrag options: + --masked use lower case characters for bases meant to be masked + out + --hardMasked use upper case for non masked-out and 'N' characters + for masked-out bases + --upper use upper case characters for all bases + --name=NAME Use given name after '>' in output sequence + --dbHeader=DBHEADER + Add full database info to the header, with or without + -name option + --tbaHeader=TBAHEADER + Format header for compatibility with tba, takes + database name as argument + + Note: When specifying --name optionin batch mode, also specify --dbHeader to + ensure unique FASTA headers. + + +:ref:`top <top>` + +.. _org_settings.py: + +org_settings.py:: + + Usage: org_settings.py [options] [<org key> [<org setting>]] + + Tool for retrieving sets of organism-specific settings and paths. Original + paths are set at install time, and can be overridden in the file ~/.org + settings.cfg. Allows output of settings in a variety of shell environment + syntaxes. The tool attempts to guess which shell environment is being used by + examining the SHELL environment variable unless explicitly set. When run + without an argument, returns a listing of all settings available. 
+ + Options: + -h, --help show this help message and exit + -s SYNTAX, --syntax=SYNTAX + syntax flavor of output to produce + [default: %auto] + -l, --list print all available settings for + human consumption + + +:ref:`top <top>` + +.. _peaks_to_fasta.py: + +peaks_to_fasta.py:: + + Usage: peaks_to_fasta.py [options] <organism> <peak file> [<peak file> ...] + + Extract sequences for peaks in provided peak file(s). Can interpret MACS or + BED output, determined automatically by .xls or .bed extensions respectively + (force explicit format with --peak-format option). Outputs fasta sequences + for the peaks in all files extracted from the reference genome specified by + the output of *org_settings.py <organism> genome_dir* to stdout by + default.Chromosome names in peak files must match nib filenames without + extension (e.g. peak line: chr1 0 100 searches *genome_dir*/chr1.nib). Fasta + records have the following format: + + ><chromosome>:<start>-<end>;fn=<name of file>:<line number>;db_fn=<db + filename>;fmt=<format>;<source alignment info> + <sequence...> + + <db filename> is the filename where the sequence was extracted, <format> is + the format of the input file (MACS or BED), and <source alignment info> + contains all the fields from the originating alignment according to the source + format. + + Options: + -h, --help show this help message and exit + --min-header only store <chromosome>:<start>-<end> in header + --peak-format=PEAK_FORMAT + peak file format, 'auto' determines format by + extension, choices: MACS, BED, auto [default: auto] + --output=OUTPUT filename to output fasta records to [default: stdout] + --fixed-peak-width=FIXED_PEAK_WIDTH + return a fixed number of bases flanking peak summit + (*summit* field in MACS, (end-start)/2 in BED), + ignoring start/stop coords [default: None] + --wrap-width=WRAP_WIDTH + wrap fasta sequences to specified width. -1 indicates + no wrap [default: 70] + + +:ref:`top <top>` + +.. 
_plot_pos_vs_neg_peaks.py: + +plot_pos_vs_neg_peaks.py:: + + Usage: plot_pos_vs_neg_peaks.py [options] <pos peaks fn> <neg peaks fn> + + Options: + -h, --help show this help message and exit + -o OUT_FN, --output=OUT_FN + filename of output image + + +:ref:`top <top>` + +.. _plot_peak_loc_dist.py: + +plot_peak_loc_dist.py:: + + Usage: plot_peak_loc_dist.py [options] <peaks fn> <gene list fn> + + Produce a pie chart of the locations of peaks in different bins (promoter, + gene, exon, intron, etc.) and, optionally, save the different records to their + own files for subsequent analysis. Also produce a histogram of distance from + feature values in mapping file. Peaks file is expected to be as output by + MACS, or alternately as a BED file but then the -b plot is not available. + Gene list file is expected to be in the format as output by + peaks_to_known_genes.py script. + + Options: + -h, --help show this help message and exit + -b BAR_FN, --bar-fn=BAR_FN + filename for pvalue stacked bar chart + -g GENE_PIE_FN, --gene-pie-fn=GENE_PIE_FN + filename for pie chart image + -p PEAK_PIE_FN, --peak-pie-fn=PEAK_PIE_FN + filename for pie chart image + -f DIST_FN, --dist-fn=DIST_FN + filename for distance from feature image + -s, --save write out files containing peaks for each category + -d OUT_DIR, --output-dir=OUT_DIR + output files created by --save option to this + directory + --no-plot dont show (but save) the figure produced + --peaks-format=PEAK_FMT + format of peaks file, either MACS or BED [default: + MACS] + + +:ref:`top <top>` + +.. _rejection_sample_fasta.py: + +rejection_sample_fasta.py:: + + Usage: rejection_sample_fasta.py [options] <organism> <fasta file> [<fasta file> ... ] + + Use rejection sampling to generate a set of background/random + sequences matching the distance to nearest transcription start site, sequence + length, and GC content distributions of the input fasta file(s). 
Generated + sequences are genomic sequences sampled based on these distributions. All + sequences + from all files are used to generate the background sequences. The following + command must output a path to a nib genomic sequence directory and refGene + annotation, respectively : + + $> org_settings.py <organism> genome_dir + $> org_settings.py <organism> refgene_anno_path + + Utility prints out generated fasta records to stdout by default. Input + sequences + from chr20 are mapped to chrX, chr21 are mapped to chrY, and sequences from + chrM + are not used. + + + Options: + -h, --help show this help message and exit + -n NUM_SEQS, --num-seqs=NUM_SEQS + number of sequences to generate, either absolute + number or factor of # input sequences, e.g. 2.5x for + 2.5 times the # of input sequences [default: 1x] + --output=OUTPUT file to output fasta records to [default: stdout] + --bed also produce a BED formatted file representing sampled + sequences + --bed-output=BED_OUTPUT + with --bed, file to output BED records to [default: + output.bed] + -v, --verbose print out debug information + + +:ref:`top <top>` + +.. _sort_bed.py: + +sort_bed.py:: + + Usage: sort_bed.py [options] <BED file> [<BED file> <BED file>...] + + Sort the BED formatted files first by chromosome (field 1) and then by start + coordinate (field 2). Lines from all files submitted are concatenated and + sorted in the final output. + + Options: + -h, --help show this help message and exit + --output=OUTPUT filename to write the sorted BED lines [default: stdout] + + +:ref:`top <top>` + +.. _wait_for_jobid.py: + +wait_for_jobid.py:: + + Usage: wait_for_jobid.py [options] <job id> [<job id>...] + + Poll qstat and wait until all <job id>s are finished + + Options: + -h, --help show this help message and exit + + +:ref:`top <top>` + +.. _wqsub.py: + +wqsub.py:: + + Usage: [wqsub.py] [options] command + + Wrap the specified command into a qsub script and submit it for execution. 
+ Script captures both stdout and stderr to the current directory. By default, + all of the user's environment variables are put into the script (compatible + with SGE only ATM). + + Options: + -h, --help show this help message and exit + --wqsub-name=WQSUB_NAME + job name to submit as <--wqsub-name>_<first non- + whitespace chars in command> [default: wqsub] + --wqsub-ext=WQSUB_EXT + file extension to use for stdout files + --wqsub-keep-script do not delete qsub script generated after job + submission + --wqsub-no-env do not include any local environment variables in the + script + --wqsub-no-submit create script but do not submit job (useful for + generating scripts) + --wqsub-drm=DRM the DRM to generate scripts for [default: SGE] + --wqsub-drm-arg=DRM_ARGS + arguments to pass as parameters in the job script + specific to the DRM, use multiple option flags to + specify multiple parameters + --wqsub-wait poll the DRM and do not return control until job is + finished (only works for TORQUE) + + Note: this script only works in Unix-style environments. + + +:ref:`top <top>` + +.. _wqsub_drmaa.py: + +wqsub_drmaa.py:: + + Traceback (most recent call last): + File "../scripts/wqsub_drmaa.py", line 9, in <module> + import drmaa + ImportError: No module named drmaa + + +:ref:`top <top>` + + +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chipsequtil-master/examples/mapping/map_to_known_gene.sh Mon Mar 07 16:18:10 2016 -0500 @@ -0,0 +1,44 @@ +#!/bin/bash + +# Usage: map_peaks_to_known_genes.py [options] <knownGene file> <knownGene xRef file> <peaks file> +# +# +# Map the peaks in <peaks file> to genes in <knownGene file>. <knownGene file> +# is +# format is as specified in +# http://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/knownGene.sql. +# <peaks file> format is as produced by MACS. If *auto* is chosen (default) +# file extension is examined for *.xls* for default MACS format or *.bed* for +# BED format. If the --detailoption is provided, the following extra fields are +# appended to each row: +# +# peak loc, dist from feature, score, map type, map subtype +# +# +# Options: +# -h, --help show this help message and exit +# --upstream-window=UPST_WIN +# window width in base pairs to consider promoter region +# [default: 5500] +# --downstream-window=DNST_WIN +# window width in base pairs to consider downstream +# region [default: 2500] +# --tss calculate downstream window from transcription start +# site instead of transcription end site +# --map-output=PEAK_OUTPUT +# filename to output mapped peaks to [default: stdout] +# --stats-output=STATS_OUTPUT +# filename to output summary stats in conversion +# [default: stderr] +b# --peaks-format=PEAKS_FMT +# format of peaks input file [default: auto] +# --detail add extra fields to output, see description + +ORG=mm9 +KG_FN=$(org_settings.py $ORG known_gene_anno_path) +XREF_FN=$(org_settings.py $ORG known_gene_xref_path) +OPTS="--detail --tss --upstream-window=10000 --downstream-window=10000" +PEAKS_FN=test_peaks.xls + +echo map_peaks_to_known_genes.py $OPTS $KG_FN $XREF_FN $PEAKS_FN +map_peaks_to_known_genes.py $OPTS $KG_FN $XREF_FN $PEAKS_FN \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chipsequtil-master/examples/mapping/test_peaks.xls Mon Mar 07 16:18:10 2016 -0500 @@ -0,0 +1,21 @@ +# genes: +# uc007aet.1 chr1 - 3195984 3205713 3195984 3195984 2 3195984,3203519, 3197398,3205713, uc007aet.1 +# uc008wgw.1 chr5 + 3522764 3525260 3522764 3522764 1 3522764, 3525260, uc008wgw.1 +# +# chr5 3522663 3522664 1 0 1 0 0 1 - promoter +# chr5 3522863 3522864 1 0 1 0 0 1 - in gene +# chr5 3532563 3532564 1 0 1 0 0 1 - in downsteam +# chr1 3205814 3205815 1 0 1 0 0 1 - promoter +# chr1 3205614 3205615 1 0 1 0 0 1 - in gene +# chr1 3195913 3195914 1 0 1 0 0 1 - in downstream +# chr1 319588 319588 1 0 1 0 0 1 - unmapped +# +# chr1 is - strand, chr5 + strand, assumes 10k window around TSS +chr start end length summit tags -10*log10(pvalue) fold_enrichment FDR(%) +chr5 3522663 3522664 1 0 1 0 0 1 +chr5 3522863 3522864 1 0 1 0 0 1 +chr5 3532564 3532565 1 0 1 0 0 1 +chr1 3205814 3205815 1 0 1 0 0 1 +chr1 3205614 3205615 1 0 1 0 0 1 +chr1 3195913 3195914 1 0 1 0 0 1 +chr1 319588 319588 1 0 1 0 0 1
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chipsequtil-master/examples/nib/shuffled_peaks.bed Mon Mar 07 16:18:10 2016 -0500 @@ -0,0 +1,1000 @@ +chr19 29505473 29505892 MACS_peak_4348 103.85 +chr5 23950711 23951266 MACS_peak_6268 83.33 +chr1 75303135 75303785 MACS_peak_206 88.17 +chr3 105611391 105612033 MACS_peak_5420 56.03 +chr4 140654843 140655635 MACS_peak_6105 178.49 +chr2 37590398 37590707 MACS_peak_4677 75.45 +chr1 107761995 107762362 MACS_peak_312 96.07 +chr3 153387629 153388143 MACS_peak_5657 52.58 +chr11 88165911 88166520 MACS_peak_1474 62.73 +chr11 109512132 109512551 MACS_peak_1616 128.82 +chr18 57085271 57085755 MACS_peak_4115 107.73 +chr13 96661232 96661599 MACS_peak_2313 62.56 +chr3 95164133 95164494 MACS_peak_5342 93.42 +chr3 107434353 107434982 MACS_peak_5438 65.35 +chr11 6525702 6526208 MACS_peak_1057 56.89 +chr17 71137869 71138311 MACS_peak_3922 65.19 +chr5 120915880 120916171 MACS_peak_6566 100.90 +chr14 115241544 115242039 MACS_peak_2840 66.36 +chr3 115548096 115548809 MACS_peak_5466 146.81 +chr3 143368788 143369115 MACS_peak_5597 63.16 +chr12 73861752 73862246 MACS_peak_1870 80.18 +chr4 83619188 83619568 MACS_peak_5815 52.20 +chr7 80763465 80763988 MACS_peak_7410 71.38 +chr11 78816343 78817112 MACS_peak_1360 53.58 +chr10 80160393 80161035 MACS_peak_822 294.44 +chr13 32893584 32894176 MACS_peak_2117 81.21 +chr10 78218410 78218726 MACS_peak_790 64.14 +chr11 58907018 58907334 MACS_peak_1205 98.43 +chr3 104162680 104163086 MACS_peak_5410 55.17 +chr6 39156271 39156786 MACS_peak_6854 61.68 +chr18 85020575 85021002 MACS_peak_4215 74.27 +chr6 72166566 72167067 MACS_peak_6931 69.03 +chr17 56748737 56749331 MACS_peak_3884 106.79 +chr2 57090575 57091032 MACS_peak_4713 76.38 +chr6 52662598 52663126 MACS_peak_6888 97.50 +chr5 88982859 88983700 MACS_peak_6425 295.50 +chr5 134967688 134968192 MACS_peak_6645 72.85 +chr17 29089160 29089657 MACS_peak_3724 82.93 +chr8 123062177 123062589 MACS_peak_8088 58.85 +chr11 85534180 85534673 MACS_peak_1423 87.33 
+chr15 66990142 66990609 MACS_peak_3114 118.53 +chr8 106966580 106967082 MACS_peak_7997 113.60 +chr11 106888391 106889001 MACS_peak_1583 69.90 +chr19 11848049 11848520 MACS_peak_4306 51.63 +chr15 8584865 8585230 MACS_peak_2922 62.73 +chr17 87913100 87913467 MACS_peak_3983 114.07 +chr13 34254496 34254848 MACS_peak_2122 67.47 +chr1 59914119 59914399 MACS_peak_135 57.79 +chr4 140629745 140629986 MACS_peak_6102 81.30 +chr2 180446822 180447260 MACS_peak_5086 99.29 +chr2 29804429 29804860 MACS_peak_4600 65.92 +chr12 32992278 32992842 MACS_peak_1783 84.01 +chr14 99698259 99698564 MACS_peak_2803 84.57 +chr19 3832712 3833378 MACS_peak_4224 118.71 +chr15 100536597 100537082 MACS_peak_3300 154.87 +chr7 109390646 109391459 MACS_peak_7527 161.60 +chr7 151692825 151693219 MACS_peak_7719 66.56 +chr14 52639405 52639860 MACS_peak_2557 52.74 +chr1 158257693 158258023 MACS_peak_461 64.88 +chr12 76836098 76836626 MACS_peak_1878 62.73 +chr1 182998458 182998880 MACS_peak_570 52.74 +chr2 51797359 51797797 MACS_peak_4703 65.46 +chr8 96707068 96707513 MACS_peak_7960 104.94 +chr3 28143185 28143670 MACS_peak_5131 101.35 +chr6 88889418 88889830 MACS_peak_7010 52.74 +chr2 131937255 131937594 MACS_peak_4912 72.89 +chr7 25688982 25689460 MACS_peak_7246 62.73 +chr19 46938054 46938331 MACS_peak_4436 92.02 +chr7 138515654 138516191 MACS_peak_7671 84.15 +chr14 29767339 29767710 MACS_peak_2466 51.44 +chr15 86002731 86003183 MACS_peak_3240 72.40 +chr15 103088442 103089223 MACS_peak_3322 883.55 +chr19 33127653 33128234 MACS_peak_4366 116.11 +chr5 135450040 135450529 MACS_peak_6650 101.01 +chr15 51080445 51080929 MACS_peak_3050 62.73 +chr9 124009677 124010094 MACS_peak_8582 65.09 +chr1 107856029 107856432 MACS_peak_313 52.07 +chr10 107555226 107555677 MACS_peak_929 79.40 +chr7 55762430 55762866 MACS_peak_7364 91.92 +chr12 96882121 96882495 MACS_peak_1959 70.03 +chr3 68480776 68481485 MACS_peak_5235 78.55 +chr1 89537056 89537406 MACS_peak_259 53.07 +chr14 27335329 27335792 MACS_peak_2450 52.74 +chr17 
56949680 56949993 MACS_peak_3889 81.91 +chr5 118928141 118928605 MACS_peak_6556 117.21 +chr8 84911554 84912100 MACS_peak_7907 71.83 +chr8 129108351 129108844 MACS_peak_8142 54.79 +chr3 78877870 78878229 MACS_peak_5251 71.22 +chr19 18650375 18650861 MACS_peak_4324 62.73 +chr6 87942729 87943305 MACS_peak_6992 81.55 +chr12 92821124 92821370 MACS_peak_1955 69.53 +chr11 18065187 18065398 MACS_peak_1077 97.88 +chr17 84515588 84516165 MACS_peak_3966 458.83 +chr9 92169110 92169873 MACS_peak_8447 108.56 +chr14 14920422 14920757 MACS_peak_2398 123.05 +chr9 34798448 34798810 MACS_peak_8223 70.98 +chr3 94306130 94306466 MACS_peak_5319 95.95 +chr5 115790919 115791717 MACS_peak_6543 254.81 +chr11 68780920 68781624 MACS_peak_1249 96.66 +chr1 55084208 55084643 MACS_peak_101 56.99 +chr11 115938781 115939242 MACS_peak_1655 106.79 +chr7 134851363 134852112 MACS_peak_7651 388.87 +chr2 25413082 25413751 MACS_peak_4557 108.33 +chr9 70760521 70761198 MACS_peak_8400 125.82 +chr1 132526233 132526605 MACS_peak_367 51.37 +chr12 77462231 77462609 MACS_peak_1880 61.61 +chr2 131322118 131322495 MACS_peak_4905 173.69 +chr12 8886534 8886943 MACS_peak_1732 62.46 +chr1 134921392 134922134 MACS_peak_388 97.78 +chr12 50546587 50546853 MACS_peak_1811 72.51 +chr16 44347497 44348102 MACS_peak_3445 67.73 +chr16 91448123 91448772 MACS_peak_3510 110.35 +chr8 96932624 96932949 MACS_peak_7968 67.46 +chr9 50409776 50410148 MACS_peak_8274 68.39 +chr15 39018860 39019403 MACS_peak_3023 96.24 +chrX 7548382 7548918 MACS_peak_8587 182.65 +chr1 36568547 36568801 MACS_peak_47 57.79 +chr3 133241295 133241605 MACS_peak_5543 56.48 +chr3 36470919 36471238 MACS_peak_5148 54.12 +chr5 137974253 137974619 MACS_peak_6683 59.42 +chr4 107278613 107279232 MACS_peak_5866 117.82 +chr8 3621220 3621676 MACS_peak_7722 76.85 +chr11 68792865 68793384 MACS_peak_1250 61.41 +chr11 107283838 107284259 MACS_peak_1593 62.31 +chr17 36162344 36162790 MACS_peak_3801 77.75 +chr2 119176647 119177021 MACS_peak_4841 59.32 +chr14 75947689 75947989 
MACS_peak_2746 115.64 +chr2 32837666 32838081 MACS_peak_4650 56.37 +chr5 21772275 21772751 MACS_peak_6260 88.64 +chr4 88181586 88181956 MACS_peak_5819 83.97 +chr17 46210576 46211375 MACS_peak_3824 152.68 +chr8 113290700 113290975 MACS_peak_8033 68.02 +chr14 100246709 100247166 MACS_peak_2804 114.56 +chr18 21097256 21097529 MACS_peak_4028 188.09 +chr15 58175270 58175626 MACS_peak_3078 52.59 +chr9 61513942 61514355 MACS_peak_8334 216.69 +chr10 92184761 92185425 MACS_peak_881 113.68 +chr2 125450541 125451011 MACS_peak_4863 84.01 +chr7 120579702 120580147 MACS_peak_7571 84.01 +chr17 28313728 28314505 MACS_peak_3710 147.88 +chr17 85092137 85092578 MACS_peak_3972 60.05 +chr7 52391580 52392059 MACS_peak_7336 71.31 +chr4 106607491 106607860 MACS_peak_5861 62.73 +chr15 76531134 76532498 MACS_peak_3158 205.05 +chr12 86815403 86815709 MACS_peak_1937 55.62 +chr8 97381250 97381634 MACS_peak_7975 67.56 +chr2 18892130 18892531 MACS_peak_4517 53.98 +chr13 93362690 93363352 MACS_peak_2290 156.01 +chr4 134276344 134276744 MACS_peak_6023 66.29 +chr5 136189308 136189833 MACS_peak_6660 92.87 +chr13 54712548 54712992 MACS_peak_2192 70.78 +chr3 95116459 95117202 MACS_peak_5338 276.81 +chr15 55668280 55668565 MACS_peak_3068 57.79 +chr7 86508145 86508581 MACS_peak_7430 65.59 +chr13 64134767 64135424 MACS_peak_2229 84.01 +chr14 75405717 75405947 MACS_peak_2740 56.48 +chr2 34655577 34655906 MACS_peak_4662 86.22 +chr2 178420601 178420979 MACS_peak_5071 60.99 +chr7 80675775 80676079 MACS_peak_7406 57.04 +chr6 120314001 120314656 MACS_peak_7092 155.03 +chr11 103889450 103889863 MACS_peak_1547 105.51 +chr1 75209595 75210147 MACS_peak_201 195.59 +chr4 136209837 136210242 MACS_peak_6063 91.62 +chr19 38298472 38299109 MACS_peak_4384 52.09 +chr3 146318049 146318677 MACS_peak_5622 65.43 +chr8 97525645 97526124 MACS_peak_7981 83.75 +chr6 42299260 42299977 MACS_peak_6864 156.01 +chr13 95746101 95746664 MACS_peak_2305 118.67 +chr5 68262648 68262928 MACS_peak_6374 76.86 +chr9 4309901 4310202 
MACS_peak_8156 57.32 +chr2 130455636 130455898 MACS_peak_4896 68.02 +chr7 133920084 133920580 MACS_peak_7627 94.25 +chr3 144712794 144713309 MACS_peak_5603 333.24 +chr4 41492809 41493178 MACS_peak_5745 61.67 +chr6 83725731 83726256 MACS_peak_6965 72.37 +chr14 123928421 123928771 MACS_peak_2892 53.07 +chr11 94409579 94409974 MACS_peak_1489 68.44 +chr2 165618765 165619347 MACS_peak_5039 77.35 +chr1 97210080 97210414 MACS_peak_302 73.84 +chr19 31412009 31412328 MACS_peak_4353 67.09 +chr7 146028031 146028398 MACS_peak_7696 57.49 +chr14 98617003 98617302 MACS_peak_2799 57.51 +chr19 44406048 44406439 MACS_peak_4413 66.95 +chr14 26681413 26681976 MACS_peak_2449 117.79 +chr2 128037989 128038430 MACS_peak_4878 52.74 +chr17 61434287 61434641 MACS_peak_3905 62.86 +chr15 36390225 36390517 MACS_peak_2989 66.98 +chr14 27398759 27399655 MACS_peak_2452 361.52 +chr11 116115836 116116290 MACS_peak_1661 77.03 +chr15 36579667 36580306 MACS_peak_2996 51.96 +chr1 57568835 57569128 MACS_peak_112 60.79 +chr15 67474872 67475357 MACS_peak_3123 158.15 +chr10 19428365 19428826 MACS_peak_632 89.84 +chr14 113392921 113393120 MACS_peak_2836 66.98 +chr15 38448807 38449350 MACS_peak_3019 59.81 +chr14 20991935 20992435 MACS_peak_2406 75.91 +chr6 134006321 134006678 MACS_peak_7142 71.38 +chr12 112127235 112127724 MACS_peak_2013 80.19 +chr14 76244671 76245541 MACS_peak_2752 107.12 +chr11 104164505 104164874 MACS_peak_1549 69.13 +chr7 134536698 134537132 MACS_peak_7646 78.65 +chr1 137867871 137868260 MACS_peak_415 143.30 +chr18 34665859 34666370 MACS_peak_4058 61.96 +chr1 129101475 129101945 MACS_peak_348 77.88 +chr11 72295448 72295925 MACS_peak_1293 156.01 +chr17 24591995 24592521 MACS_peak_3651 114.96 +chr15 3945339 3946408 MACS_peak_2896 271.22 +chr8 122250900 122251332 MACS_peak_8064 71.78 +chr11 115938158 115938571 MACS_peak_1654 58.78 +chr9 114597610 114598135 MACS_peak_8535 92.87 +chr6 43207256 43207620 MACS_peak_6869 70.82 +chr3 152935129 152935658 MACS_peak_5650 60.73 +chr3 94655429 94656010 
MACS_peak_5324 210.22 +chr9 57368841 57369352 MACS_peak_8313 53.61 +chr4 3157974 3158349 MACS_peak_5679 52.74 +chr11 107211666 107212176 MACS_peak_1586 89.67 +chr15 42269449 42270170 MACS_peak_3035 131.90 +chr9 70682529 70683032 MACS_peak_8396 186.94 +chr8 27125446 27126059 MACS_peak_7778 102.67 +chr9 20896025 20896479 MACS_peak_8195 67.10 +chr15 75551370 75551790 MACS_peak_3136 66.67 +chr15 55028995 55029425 MACS_peak_3064 90.94 +chr16 18308240 18308586 MACS_peak_3350 58.96 +chr3 93353745 93354375 MACS_peak_5318 103.23 +chr16 23107242 23107924 MACS_peak_3367 113.70 +chr18 36486603 36487009 MACS_peak_4080 53.46 +chr18 5390330 5390807 MACS_peak_4001 113.10 +chr17 56428661 56429186 MACS_peak_3882 118.67 +chr2 18860310 18861083 MACS_peak_4512 84.24 +chr7 97888242 97888576 MACS_peak_7477 57.36 +chr3 21810071 21810487 MACS_peak_5121 118.67 +chr17 78181904 78182525 MACS_peak_3946 77.05 +chr14 56197450 56198063 MACS_peak_2598 129.98 +chr9 99140804 99141128 MACS_peak_8467 58.22 +chr10 92623323 92623821 MACS_peak_885 100.15 +chr4 140616351 140617131 MACS_peak_6099 80.20 +chr10 61142776 61143539 MACS_peak_744 80.20 +chr7 104485058 104485742 MACS_peak_7488 317.41 +chr11 115939476 115940007 MACS_peak_1656 92.32 +chr10 94580987 94581311 MACS_peak_903 56.69 +chr15 76157364 76157952 MACS_peak_3152 125.64 +chr13 14155415 14155855 MACS_peak_2065 52.91 +chr15 67066485 67066934 MACS_peak_3117 84.01 +chr7 29227640 29228147 MACS_peak_7277 73.17 +chr13 6514405 6514820 MACS_peak_2047 104.32 +chr4 140542557 140543005 MACS_peak_6097 144.88 +chr5 111937855 111938599 MACS_peak_6514 128.49 +chr16 44018427 44018767 MACS_peak_3442 64.02 +chr1 133421664 133422047 MACS_peak_377 82.81 +chrX 166419443 166419942 MACS_peak_8678 54.41 +chr15 93105701 93105937 MACS_peak_3251 154.40 +chr1 108780375 108780748 MACS_peak_320 57.56 +chr11 84636850 84637366 MACS_peak_1410 80.20 +chr17 24995915 24996584 MACS_peak_3656 135.78 +chr14 58033892 58034211 MACS_peak_2613 58.66 +chr13 29847874 29848368 MACS_peak_2108 
158.86 +chr1 13520675 13521060 MACS_peak_11 108.01 +chr2 156137538 156137972 MACS_peak_4990 78.65 +chr8 87550632 87550994 MACS_peak_7941 66.66 +chr3 151768385 151768678 MACS_peak_5634 56.48 +chr3 108012888 108013451 MACS_peak_5443 78.80 +chr13 44597050 44597814 MACS_peak_2154 202.82 +chr2 31917741 31918033 MACS_peak_4624 91.45 +chr3 132521750 132522383 MACS_peak_5537 143.48 +chr12 4879663 4880069 MACS_peak_1724 78.35 +chr6 91628640 91629356 MACS_peak_7022 67.31 +chr3 81433756 81434158 MACS_peak_5257 67.93 +chr7 54138715 54139193 MACS_peak_7359 128.73 +chr5 137102584 137103013 MACS_peak_6672 81.24 +chr8 59967224 59967628 MACS_peak_7830 62.73 +chr14 73689765 73690147 MACS_peak_2729 76.26 +chr11 117671467 117671893 MACS_peak_1678 66.26 +chr1 133214967 133215419 MACS_peak_376 174.60 +chr15 72853276 72853636 MACS_peak_3127 52.74 +chr11 109334214 109334929 MACS_peak_1611 51.65 +chrX 45266253 45266899 MACS_peak_8617 128.35 +chr2 131877465 131877919 MACS_peak_4910 132.70 +chr9 20779965 20780304 MACS_peak_8192 60.28 +chr3 90068955 90069393 MACS_peak_5310 78.47 +chr5 76187734 76188295 MACS_peak_6404 65.35 +chr11 104180396 104181197 MACS_peak_1550 84.55 +chr9 43839155 43839734 MACS_peak_8243 170.66 +chr15 85812555 85813334 MACS_peak_3239 124.98 +chr16 30691946 30692407 MACS_peak_3394 226.47 +chr2 110401236 110401587 MACS_peak_4817 63.11 +chr5 125914300 125914711 MACS_peak_6623 65.51 +chr2 166483417 166483707 MACS_peak_5047 52.10 +chr8 60131046 60131454 MACS_peak_7833 52.74 +chr1 153024254 153024901 MACS_peak_439 66.68 +chr6 135133407 135133856 MACS_peak_7151 66.57 +chr7 82993032 82993383 MACS_peak_7423 52.99 +chr12 36728733 36729377 MACS_peak_1795 106.79 +chr19 54161870 54162283 MACS_peak_4440 93.96 +chr13 21366775 21367132 MACS_peak_2077 58.81 +chr7 140828409 140828831 MACS_peak_7684 68.81 +chr7 52771782 52772179 MACS_peak_7344 57.63 +chr11 57258571 57259095 MACS_peak_1191 98.14 +chr10 19855329 19855821 MACS_peak_636 125.55 +chr9 48594723 48595281 MACS_peak_8268 79.19 +chr4 
41278073 41278681 MACS_peak_5744 81.51 +chr18 44988493 44988911 MACS_peak_4095 69.91 +chr1 74438395 74439179 MACS_peak_195 162.82 +chr3 108830511 108830918 MACS_peak_5453 62.62 +chr13 96427044 96427529 MACS_peak_2310 152.26 +chr1 142384049 142384472 MACS_peak_423 79.51 +chr1 179064649 179064883 MACS_peak_543 74.87 +chr3 105490131 105490555 MACS_peak_5418 63.69 +chr2 90508129 90508418 MACS_peak_4780 76.86 +chr15 81846602 81847127 MACS_peak_3217 117.96 +chr18 3270592 3271094 MACS_peak_3989 195.43 +chr1 108606863 108607335 MACS_peak_318 95.47 +chr13 75935312 75935640 MACS_peak_2250 63.30 +chr16 30789953 30790403 MACS_peak_3396 148.02 +chr10 111409491 111409958 MACS_peak_950 131.28 +chr9 40880928 40881362 MACS_peak_8236 65.72 +chr8 123191898 123192493 MACS_peak_8089 118.67 +chr12 86713029 86713482 MACS_peak_1934 95.23 +chr18 65281748 65282564 MACS_peak_4150 161.32 +chr9 37296593 37297143 MACS_peak_8230 129.00 +chr18 75530251 75530647 MACS_peak_4189 68.37 +chr14 64162422 64162897 MACS_peak_2650 62.73 +chr10 82222485 82222777 MACS_peak_854 129.73 +chr10 51248911 51249493 MACS_peak_714 104.02 +chr19 45612299 45612910 MACS_peak_4419 89.38 +chr16 59515986 59516330 MACS_peak_3480 72.46 +chr1 37364506 37364915 MACS_peak_55 77.39 +chr9 107436160 107436580 MACS_peak_8495 73.91 +chr6 123239085 123239498 MACS_peak_7098 80.30 +chr8 24145434 24145873 MACS_peak_7767 65.39 +chr17 59064066 59064738 MACS_peak_3903 193.40 +chr18 81626532 81626968 MACS_peak_4206 53.17 +chr8 72498191 72498470 MACS_peak_7850 76.86 +chr2 127033717 127034197 MACS_peak_4869 122.43 +chr3 153427354 153428352 MACS_peak_5658 173.73 +chr13 95777240 95777685 MACS_peak_2306 62.73 +chr6 90654616 90655084 MACS_peak_7016 74.57 +chr6 115545743 115546136 MACS_peak_7072 52.74 +chr7 52392685 52393201 MACS_peak_7337 105.13 +chr1 174445177 174445620 MACS_peak_534 65.13 +chr5 139853354 139853932 MACS_peak_6702 77.65 +chr17 44266175 44266552 MACS_peak_3809 97.39 +chr9 78919711 78920097 MACS_peak_8424 50.34 +chr2 120210305 
120210628 MACS_peak_4846 69.96 +chr8 97679869 97680475 MACS_peak_7985 130.84 +chr14 70029196 70029514 MACS_peak_2696 58.75 +chr11 97574402 97574747 MACS_peak_1511 53.47 +chr2 56968627 56969614 MACS_peak_4711 285.57 +chr7 26472954 26473335 MACS_peak_7258 62.73 +chr1 146985918 146986334 MACS_peak_432 62.66 +chr6 30276109 30276518 MACS_peak_6816 73.17 +chr18 4969715 4970163 MACS_peak_3999 62.73 +chr6 85298851 85299333 MACS_peak_6971 130.45 +chr18 62318702 62319054 MACS_peak_4130 55.87 +chr7 97493416 97493783 MACS_peak_7473 95.23 +chr5 84728325 84728797 MACS_peak_6420 65.95 +chr15 96290510 96290960 MACS_peak_3260 75.41 +chr5 64493902 64494502 MACS_peak_6348 155.47 +chr12 70683782 70684144 MACS_peak_1854 74.38 +chr7 28259485 28260176 MACS_peak_7269 157.45 +chr3 102072769 102073100 MACS_peak_5391 64.80 +chr3 121177634 121178278 MACS_peak_5487 124.60 +chr3 141995570 141995959 MACS_peak_5587 74.70 +chr10 12681163 12681522 MACS_peak_617 57.44 +chr7 35770301 35770804 MACS_peak_7310 130.48 +chr3 107901318 107901701 MACS_peak_5442 68.87 +chr4 155406985 155407313 MACS_peak_6229 63.11 +chr14 46277533 46277983 MACS_peak_2523 63.09 +chr7 142790268 142790503 MACS_peak_7693 89.37 +chr9 66360249 66360570 MACS_peak_8377 57.12 +chr15 95621015 95621459 MACS_peak_3254 77.44 +chr4 71861086 71862075 MACS_peak_5807 206.03 +chr11 121065722 121066055 MACS_peak_1707 99.74 +chr19 9041528 9042024 MACS_peak_4289 107.21 +chr8 98477882 98478259 MACS_peak_7992 69.80 +chr18 75722207 75722491 MACS_peak_4193 91.45 +chr15 57812241 57812965 MACS_peak_3074 52.05 +chr3 58917608 58918452 MACS_peak_5209 152.48 +chr4 41660450 41660822 MACS_peak_5747 57.64 +chr11 11641587 11641928 MACS_peak_1066 55.63 +chr8 50911172 50911534 MACS_peak_7818 70.98 +chr11 120209562 120209886 MACS_peak_1697 66.59 +chr14 66971802 66972207 MACS_peak_2681 62.73 +chr3 98621426 98621709 MACS_peak_5379 72.51 +chr12 49775350 49775758 MACS_peak_1809 67.51 +chr12 17040311 17040756 MACS_peak_1752 86.89 +chr14 70465516 70466231 
MACS_peak_2709 158.40 +chr4 106926454 106926892 MACS_peak_5863 65.46 +chr11 5221117 5221579 MACS_peak_1043 129.84 +chr11 51762768 51763314 MACS_peak_1170 137.47 +chr12 73948553 73949012 MACS_peak_1872 142.64 +chr15 12123626 12124218 MACS_peak_2933 94.43 +chr15 12246914 12247416 MACS_peak_2937 294.48 +chr2 7924537 7924842 MACS_peak_4478 63.31 +chr16 56916814 56917191 MACS_peak_3470 51.00 +chr14 57190198 57191173 MACS_peak_2608 120.19 +chr5 138011402 138012367 MACS_peak_6684 707.79 +chr1 36153980 36154800 MACS_peak_40 119.59 +chr9 105397273 105397630 MACS_peak_8483 90.84 +chr4 148542288 148542494 MACS_peak_6147 99.30 +chr7 134234472 134235313 MACS_peak_7633 215.08 +chr1 187186557 187186854 MACS_peak_584 84.01 +chr2 156703464 156703925 MACS_peak_5000 135.84 +chr2 45507624 45507896 MACS_peak_4694 66.03 +chr2 25110687 25111472 MACS_peak_4543 265.66 +chr13 23494534 23495087 MACS_peak_2082 74.85 +chr2 118738734 118739174 MACS_peak_4833 58.77 +chrX 11733021 11733752 MACS_peak_8601 84.09 +chr3 153560124 153560559 MACS_peak_5664 53.23 +chr8 97479035 97479520 MACS_peak_7976 156.01 +chr9 114662010 114662635 MACS_peak_8538 65.64 +chr18 56618529 56618905 MACS_peak_4110 56.89 +chr17 34057391 34058028 MACS_peak_3762 56.35 +chr1 99519858 99520254 MACS_peak_306 57.70 +chr4 136194817 136195184 MACS_peak_6060 98.40 +chr7 16611238 16611688 MACS_peak_7204 52.74 +chr1 60215214 60215684 MACS_peak_140 88.89 +chr6 149257575 149258040 MACS_peak_7180 65.32 +chr4 8159311 8159627 MACS_peak_5687 55.94 +chr14 45660604 45661144 MACS_peak_2518 98.84 +chr11 84024342 84024705 MACS_peak_1402 50.69 +chr11 108110784 108111439 MACS_peak_1606 82.28 +chr7 87590346 87590812 MACS_peak_7448 54.95 +chr9 35018443 35018749 MACS_peak_8226 74.08 +chr7 61764305 61764697 MACS_peak_7375 62.07 +chr3 137620670 137621228 MACS_peak_5569 110.82 +chr8 89147603 89148183 MACS_peak_7945 106.99 +chr10 80982282 80982979 MACS_peak_848 148.39 +chr2 113012940 113013326 MACS_peak_4821 56.59 +chr16 93767743 93768080 MACS_peak_3559 
109.86 +chr2 4483390 4484698 MACS_peak_4459 128.33 +chr6 128792917 128793800 MACS_peak_7130 123.88 +chr5 148241425 148242026 MACS_peak_6759 97.14 +chr4 34829946 34830380 MACS_peak_5730 65.72 +chr3 37558222 37559100 MACS_peak_5161 173.11 +chr2 90894346 90894793 MACS_peak_4781 72.76 +chr8 107486121 107486440 MACS_peak_7999 81.28 +chr7 140064742 140065140 MACS_peak_7681 73.17 +chr12 30367083 30367515 MACS_peak_1770 73.89 +chrX 11711607 11711970 MACS_peak_8600 84.60 +chr15 5058192 5058833 MACS_peak_2901 89.38 +chr7 104727397 104728070 MACS_peak_7489 70.84 +chr6 133055524 133055892 MACS_peak_7138 62.47 +chr3 95558657 95559026 MACS_peak_5348 79.54 +chr17 35326947 35327306 MACS_peak_3781 66.90 +chr14 52816486 52817050 MACS_peak_2560 64.66 +chr1 87632880 87633301 MACS_peak_233 58.17 +chr9 57495286 57495888 MACS_peak_8318 128.82 +chr11 87571803 87572391 MACS_peak_1454 59.68 +chr4 101511482 101511839 MACS_peak_5847 71.38 +chr15 12251825 12252367 MACS_peak_2938 54.29 +chr8 24276703 24277308 MACS_peak_7770 91.11 +chr6 117981548 117981940 MACS_peak_7086 68.66 +chr7 118300107 118300551 MACS_peak_7564 60.79 +chr5 77553172 77553619 MACS_peak_6415 77.65 +chr7 133428410 133429279 MACS_peak_7615 176.96 +chr5 54386367 54386928 MACS_peak_6343 135.99 +chr2 157967843 157968322 MACS_peak_5015 52.74 +chr1 13579885 13580466 MACS_peak_13 106.79 +chr17 47825794 47826338 MACS_peak_3846 103.13 +chr15 96115848 96116105 MACS_peak_3259 89.28 +chr6 8018474 8018781 MACS_peak_6782 61.94 +chr1 58769938 58770557 MACS_peak_122 106.56 +chr18 13100063 13100479 MACS_peak_4020 101.54 +chr1 95462306 95462739 MACS_peak_289 61.51 +chr13 8456217 8456514 MACS_peak_2052 121.95 +chr8 87426937 87427392 MACS_peak_7932 147.42 +chr3 69488182 69488508 MACS_peak_5241 55.06 +chr5 108495385 108495819 MACS_peak_6502 53.36 +chr7 26391500 26391968 MACS_peak_7253 62.73 +chr14 122222542 122222864 MACS_peak_2879 60.02 +chr7 16880847 16881143 MACS_peak_7213 72.15 +chr10 84379493 84379935 MACS_peak_862 65.19 +chr1 93218296 
93218729 MACS_peak_276 67.88 +chr7 134005243 134005813 MACS_peak_7631 156.01 +chr9 25059978 25060419 MACS_peak_8215 65.26 +chr2 4802272 4802882 MACS_peak_4463 90.71 +chr9 114640488 114640918 MACS_peak_8537 63.14 +chr1 155044510 155044840 MACS_peak_453 52.74 +chr2 181598797 181599317 MACS_peak_5098 52.05 +chr16 30227325 30227906 MACS_peak_3389 144.56 +chr2 33582864 33583285 MACS_peak_4654 61.53 +chr2 38920882 38921337 MACS_peak_4683 62.69 +chr12 8639627 8640095 MACS_peak_1730 89.27 +chr1 193244835 193245254 MACS_peak_592 63.53 +chr19 28042093 28042496 MACS_peak_4342 185.71 +chr18 67399653 67399896 MACS_peak_4155 104.20 +chr15 81702453 81702935 MACS_peak_3214 70.29 +chr2 4354267 4354521 MACS_peak_4454 66.03 +chr17 71599086 71599527 MACS_peak_3929 65.92 +chr11 115016216 115016976 MACS_peak_1629 260.88 +chr13 49402730 49403235 MACS_peak_2170 96.40 +chr1 173607566 173608216 MACS_peak_521 161.65 +chr4 149943597 149944833 MACS_peak_6170 714.32 +chr2 30033180 30033596 MACS_peak_4605 60.37 +chr12 73775435 73775826 MACS_peak_1868 193.90 +chr19 6686904 6687279 MACS_peak_4266 106.79 +chr13 94372068 94372377 MACS_peak_2298 52.57 +chr3 134875358 134875676 MACS_peak_5549 74.76 +chr14 35123318 35123817 MACS_peak_2496 91.32 +chr4 134064080 134064462 MACS_peak_6020 65.66 +chr7 38451614 38451982 MACS_peak_7318 51.67 +chr2 59721277 59721585 MACS_peak_4718 52.74 +chr4 148521433 148521717 MACS_peak_6146 50.43 +chr6 29651055 29651524 MACS_peak_6813 99.40 +chr2 25283862 25284454 MACS_peak_4553 118.30 +chr1 180335685 180336069 MACS_peak_548 70.43 +chr15 9000420 9001374 MACS_peak_2924 50.16 +chr17 76783407 76783741 MACS_peak_3945 73.33 +chr10 79377042 79377510 MACS_peak_798 105.25 +chr4 137813129 137813637 MACS_peak_6069 79.35 +chr19 23347935 23348633 MACS_peak_4335 168.98 +chr2 77014459 77014721 MACS_peak_4764 57.79 +chr17 27725137 27725648 MACS_peak_3703 81.61 +chr3 84271282 84271895 MACS_peak_5266 107.11 +chr4 149036130 149036714 MACS_peak_6156 127.11 +chr17 36157226 36157966 
MACS_peak_3800 58.06 +chr9 113925463 113925818 MACS_peak_8532 139.16 +chr18 62455027 62455412 MACS_peak_4131 69.19 +chr2 143717397 143717821 MACS_peak_4930 57.72 +chr14 70058939 70059262 MACS_peak_2698 78.56 +chr9 8004492 8005091 MACS_peak_8167 146.39 +chr2 22750741 22751369 MACS_peak_4526 107.68 +chr11 113663893 113664272 MACS_peak_1623 118.67 +chr11 60643876 60644376 MACS_peak_1224 75.91 +chr13 55463887 55464601 MACS_peak_2197 89.38 +chr3 138158153 138158835 MACS_peak_5576 101.13 +chr9 61779725 61780181 MACS_peak_8335 52.74 +chr5 141092685 141093127 MACS_peak_6724 60.64 +chr4 151560621 151560894 MACS_peak_6192 80.71 +chr12 71087816 71088552 MACS_peak_1856 105.21 +chr3 136623971 136624307 MACS_peak_5565 54.21 +chr18 64675715 64676128 MACS_peak_4137 67.15 +chr5 93521864 93522250 MACS_peak_6451 50.34 +chr14 27666233 27666572 MACS_peak_2457 95.10 +chr17 65649466 65649790 MACS_peak_3914 74.21 +chr3 96961630 96962284 MACS_peak_5365 62.03 +chr19 46681813 46682242 MACS_peak_4433 64.27 +chr5 33677654 33678040 MACS_peak_6302 133.38 +chr1 155123197 155123551 MACS_peak_454 52.75 +chr11 104222718 104223628 MACS_peak_1551 135.09 +chr12 40834638 40835084 MACS_peak_1801 104.85 +chr5 140797328 140797751 MACS_peak_6714 136.27 +chr8 124636207 124636603 MACS_peak_8095 55.87 +chr1 33776550 33777146 MACS_peak_29 54.49 +chr2 127277423 127277785 MACS_peak_4871 62.22 +chr16 11144052 11144357 MACS_peak_3337 75.98 +chr2 71759141 71759569 MACS_peak_4740 61.84 +chr5 144654264 144654609 MACS_peak_6750 202.40 +chr6 136416896 136417766 MACS_peak_7155 107.68 +chr19 61160284 61160710 MACS_peak_4445 79.27 +chr5 135513632 135514247 MACS_peak_6652 52.16 +chr10 69559457 69559926 MACS_peak_764 75.43 +chr19 34625289 34625732 MACS_peak_4369 58.58 +chr3 129778582 129778971 MACS_peak_5530 52.74 +chr3 40549079 40549989 MACS_peak_5170 139.15 +chr12 63655639 63655947 MACS_peak_1841 75.70 +chr12 88027775 88028206 MACS_peak_1939 57.42 +chr4 149930560 149930906 MACS_peak_6169 51.58 +chr7 26175003 26175297 
MACS_peak_7251 193.27 +chr3 137631502 137632466 MACS_peak_5570 270.50 +chr7 75095358 75096309 MACS_peak_7396 276.27 +chr13 112597147 112598260 MACS_peak_2361 120.74 +chr8 73397210 73397892 MACS_peak_7870 88.07 +chr10 57870391 57870790 MACS_peak_723 61.56 +chr12 21379875 21380271 MACS_peak_1759 71.11 +chr4 149229209 149229627 MACS_peak_6162 68.37 +chr11 79454167 79454630 MACS_peak_1371 103.29 +chr2 118577801 118578303 MACS_peak_4831 83.83 +chr12 90031052 90031356 MACS_peak_1953 67.25 +chr3 89221936 89222334 MACS_peak_5302 55.52 +chr11 49015967 49017374 MACS_peak_1149 140.92 +chr5 101854756 101855187 MACS_peak_6476 57.25 +chr14 55045118 55046069 MACS_peak_2572 278.29 +chr8 122360636 122360974 MACS_peak_8068 72.98 +chr6 29559590 29559996 MACS_peak_6810 66.51 +chr8 37675573 37676057 MACS_peak_7806 52.74 +chr7 135604640 135605584 MACS_peak_7660 140.03 +chr7 75215546 75215889 MACS_peak_7400 73.17 +chr11 6387328 6387780 MACS_peak_1054 66.34 +chr6 97171581 97172040 MACS_peak_7033 76.80 +chr2 71652110 71652536 MACS_peak_4738 52.74 +chr14 70205001 70205574 MACS_peak_2702 90.17 +chr7 4636478 4636845 MACS_peak_7189 84.01 +chr1 163697037 163697580 MACS_peak_480 152.14 +chr14 69905127 69905516 MACS_peak_2694 60.69 +chr4 105905243 105905631 MACS_peak_5853 53.13 +chr19 43763805 43764296 MACS_peak_4405 84.01 +chr15 98863988 98864259 MACS_peak_3288 109.33 +chr8 28268378 28268863 MACS_peak_7783 143.30 +chr5 50210130 50210512 MACS_peak_6329 203.50 +chr1 49424163 49424526 MACS_peak_75 70.90 +chr11 114416815 114417130 MACS_peak_1628 75.04 +chr2 29967973 29968359 MACS_peak_4603 84.01 +chr11 87275081 87275514 MACS_peak_1443 92.18 +chr9 72510503 72510911 MACS_peak_8403 63.21 +chr18 32996570 32997045 MACS_peak_4049 106.79 +chr7 108812030 108812396 MACS_peak_7522 58.27 +chr11 61377499 61378145 MACS_peak_1228 59.79 +chr5 141051472 141051938 MACS_peak_6718 69.03 +chr13 36416595 36416984 MACS_peak_2127 55.95 +chr9 14446069 14446592 MACS_peak_8182 98.84 +chr10 117850777 117851031 MACS_peak_971 
57.79 +chr8 126502767 126503316 MACS_peak_8125 69.46 +chr6 66891898 66892295 MACS_peak_6917 59.59 +chr4 122959709 122960251 MACS_peak_5935 121.49 +chr12 60308039 60308451 MACS_peak_1839 80.38 +chr5 137108320 137108562 MACS_peak_6673 91.45 +chr4 129373773 129374378 MACS_peak_5972 131.90 +chr2 45268392 45268789 MACS_peak_4692 56.57 +chr5 141120758 141121124 MACS_peak_6726 71.31 +chr16 30453372 30453956 MACS_peak_3392 106.68 +chrX 71542249 71542578 MACS_peak_8628 64.97 +chr12 72743380 72743811 MACS_peak_1864 61.64 +chrX 108755267 108755697 MACS_peak_8648 65.99 +chr9 45983547 45983831 MACS_peak_8258 57.79 +chr14 63049340 63049683 MACS_peak_2644 61.79 +chr7 105719591 105719912 MACS_peak_7498 88.72 +chr7 65987933 65988294 MACS_peak_7377 74.48 +chr7 26496882 26497392 MACS_peak_7260 73.32 +chr3 157588086 157588412 MACS_peak_5676 86.48 +chr5 66089157 66089851 MACS_peak_6367 87.07 +chr1 63823189 63823575 MACS_peak_148 56.59 +chr19 8872798 8873377 MACS_peak_4283 89.38 +chr2 179759459 179759977 MACS_peak_5073 51.39 +chr6 128611850 128612175 MACS_peak_7127 74.12 +chr6 125049277 125049748 MACS_peak_7109 130.85 +chr14 58645884 58646276 MACS_peak_2621 52.85 +chr7 20080932 20081328 MACS_peak_7239 66.58 +chr2 131917466 131917870 MACS_peak_4911 59.48 +chr5 3152015 3152483 MACS_peak_6238 160.90 +chr2 132512500 132512844 MACS_peak_4916 59.86 +chrX 99352299 99352715 MACS_peak_8645 66.94 +chr18 55059820 55060479 MACS_peak_4108 118.53 +chr3 40456923 40457382 MACS_peak_5169 255.48 +chr11 57331929 57332212 MACS_peak_1192 57.79 +chr9 65389306 65389659 MACS_peak_8364 62.73 +chr6 30252722 30253129 MACS_peak_6815 70.78 +chr9 74844269 74844678 MACS_peak_8409 69.94 +chr3 79787772 79788143 MACS_peak_5255 89.40 +chr5 97259867 97260133 MACS_peak_6460 68.02 +chr7 147392845 147393149 MACS_peak_7705 66.02 +chrX 71516418 71516883 MACS_peak_8627 52.74 +chr4 135841295 135841647 MACS_peak_6056 62.73 +chr17 34781424 34781705 MACS_peak_3772 57.79 +chr6 108654497 108654889 MACS_peak_7052 82.04 +chr1 88337836 
88338284 MACS_peak_244 100.57 +chr16 18876401 18877082 MACS_peak_3353 122.18 +chr15 86033062 86033641 MACS_peak_3242 60.29 +chr11 16851380 16851985 MACS_peak_1071 71.31 +chr7 125272857 125273444 MACS_peak_7585 163.78 +chr12 53738815 53739179 MACS_peak_1825 70.07 +chr2 156665349 156666095 MACS_peak_4998 100.62 +chr7 133942356 133942843 MACS_peak_7630 98.04 +chr9 90020990 90021384 MACS_peak_8443 52.74 +chr11 83658247 83658585 MACS_peak_1398 60.37 +chr14 52103248 52103624 MACS_peak_2554 55.37 +chr18 36446981 36447399 MACS_peak_4078 89.45 +chr14 22367170 22367534 MACS_peak_2429 72.40 +chr15 53498017 53498814 MACS_peak_3061 116.56 +chr11 87256810 87257164 MACS_peak_1441 111.94 +chr9 122859679 122860394 MACS_peak_8568 109.13 +chr1 23930853 23931253 MACS_peak_23 249.39 +chr12 70598412 70598748 MACS_peak_1852 89.26 +chr13 51943389 51943842 MACS_peak_2182 104.20 +chr19 29138427 29138708 MACS_peak_4343 57.79 +chr8 81885020 81885368 MACS_peak_7895 87.82 +chr11 106277303 106277812 MACS_peak_1574 61.29 +chr14 119583365 119583707 MACS_peak_2862 52.74 +chr6 32801035 32801303 MACS_peak_6834 51.52 +chr10 94394483 94395062 MACS_peak_900 88.16 +chr3 37565697 37565925 MACS_peak_5162 90.55 +chr3 145588349 145588654 MACS_peak_5611 80.31 +chr19 23061529 23061970 MACS_peak_4334 65.26 +chr17 26989228 26989502 MACS_peak_3680 76.86 +chr1 95970905 95971720 MACS_peak_298 160.55 +chr4 108520352 108520992 MACS_peak_5880 118.67 +chr3 26391575 26392055 MACS_peak_5126 88.33 +chr3 8919741 8920292 MACS_peak_5104 77.70 +chr1 29104970 29105334 MACS_peak_25 70.82 +chr16 58637925 58638495 MACS_peak_3475 107.78 +chr15 57966348 57966985 MACS_peak_3075 149.78 +chr13 115022343 115022626 MACS_peak_2368 64.17 +chr11 67905507 67905890 MACS_peak_1243 200.30 +chr17 29330165 29330593 MACS_peak_3730 142.62 +chr11 119161198 119161769 MACS_peak_1689 135.03 +chr4 140249323 140249820 MACS_peak_6089 56.52 +chr1 35926096 35926623 MACS_peak_38 55.35 +chr1 59412217 59412591 MACS_peak_129 51.22 +chr2 181414705 181415126 
MACS_peak_5096 57.92 +chr17 57418275 57418714 MACS_peak_3896 78.38 +chr8 87246451 87247128 MACS_peak_7927 93.62 +chr12 81913169 81913458 MACS_peak_1914 52.00 +chr9 88275002 88275236 MACS_peak_8441 69.53 +chr11 103078799 103079762 MACS_peak_1540 129.73 +chr7 148141747 148142194 MACS_peak_7708 52.74 +chr19 41338432 41338860 MACS_peak_4389 51.90 +chr16 91538765 91539078 MACS_peak_3515 52.29 +chr7 132761686 132762056 MACS_peak_7609 66.03 +chr5 138070239 138070549 MACS_peak_6688 62.84 +chr1 174294816 174295355 MACS_peak_528 124.08 +chr19 41912152 41912595 MACS_peak_4394 51.03 +chr3 96217894 96218423 MACS_peak_5355 67.16 +chr8 11393666 11393996 MACS_peak_7740 77.69 +chr15 37172600 37172975 MACS_peak_3007 91.99 +chr1 173611130 173611397 MACS_peak_522 76.86 +chr1 133022808 133023128 MACS_peak_372 72.77 +chr1 88454389 88454942 MACS_peak_252 84.11 +chr5 34856205 34856831 MACS_peak_6311 57.06 +chr7 71082000 71082779 MACS_peak_7382 80.20 +chr14 63736378 63736637 MACS_peak_2648 59.25 +chr19 32843677 32843920 MACS_peak_4364 63.61 +chr3 138702613 138702924 MACS_peak_5578 62.02 +chr17 86566107 86566474 MACS_peak_3976 84.24 +chr8 96910090 96910444 MACS_peak_7966 52.74 +chr13 112430419 112430951 MACS_peak_2359 81.28 +chr10 42013834 42014255 MACS_peak_700 66.60 +chr11 31517779 31518060 MACS_peak_1118 68.02 +chr18 5101351 5101714 MACS_peak_4000 52.05 +chr9 62724326 62725109 MACS_peak_8338 201.21 +chr9 99083674 99084236 MACS_peak_8465 132.54 +chr4 134827884 134829500 MACS_peak_6036 231.64 +chr17 13498739 13499070 MACS_peak_3624 73.59 +chr2 103006169 103006579 MACS_peak_4802 67.36 +chr15 6925244 6925735 MACS_peak_2914 54.92 +chr7 53078238 53078607 MACS_peak_7352 51.59 +chr2 90910384 90910774 MACS_peak_4782 68.81 +chr14 60870155 60870663 MACS_peak_2627 98.84 +chr2 118798450 118798941 MACS_peak_4834 74.57 +chr11 100870661 100871158 MACS_peak_1523 172.41 +chr11 87562630 87563498 MACS_peak_1450 403.76 +chr1 88154721 88155315 MACS_peak_239 86.42 +chr11 83112056 83112724 MACS_peak_1391 71.38 
+chr12 101425557 101426166 MACS_peak_1973 147.38 +chr6 85401368 85402272 MACS_peak_6974 228.57 +chr11 78966191 78966722 MACS_peak_1368 117.55 +chr3 129236434 129236906 MACS_peak_5523 68.23 +chr9 109777897 109778345 MACS_peak_8516 77.56 +chr3 88426615 88427427 MACS_peak_5292 205.76 +chr1 46004702 46005146 MACS_peak_69 84.01 +chr5 76126811 76127048 MACS_peak_6401 66.98 +chr10 59405079 59405490 MACS_peak_730 58.60 +chr1 9690569 9690998 MACS_peak_3 64.27 +chr11 88281205 88281634 MACS_peak_1478 51.83 +chr10 21199165 21199653 MACS_peak_652 194.61 +chr1 173433353 173434146 MACS_peak_518 161.22 +chr12 35731430 35731910 MACS_peak_1792 101.79 +chr15 38446130 38446559 MACS_peak_3017 57.57 +chr4 144679039 144679338 MACS_peak_6130 76.57 +chr10 92865497 92865758 MACS_peak_887 57.79 +chr14 121027316 121027735 MACS_peak_2865 55.30 +chr3 96530843 96531751 MACS_peak_5362 201.73 +chr16 91406386 91406908 MACS_peak_3506 89.38 +chr5 67336216 67336546 MACS_peak_6372 54.72 +chr3 89746156 89746412 MACS_peak_5305 60.79 +chr14 106991035 106991399 MACS_peak_2826 58.26 +chr1 36186077 36186421 MACS_peak_41 72.46 +chr14 66211596 66212092 MACS_peak_2670 58.08 +chr2 127911067 127911519 MACS_peak_4875 63.04 +chr8 73335210 73335509 MACS_peak_7867 62.14 +chr17 24291509 24291795 MACS_peak_3646 66.03 +chr16 92938013 92938490 MACS_peak_3549 62.73 +chr11 3279575 3280176 MACS_peak_1024 89.38 +chr6 32447109 32447460 MACS_peak_6833 67.55 +chr1 133724229 133724871 MACS_peak_379 70.44 +chr3 138152249 138152833 MACS_peak_5575 98.84 +chr1 38420121 38420414 MACS_peak_60 72.51 +chr14 55224814 55225200 MACS_peak_2579 50.34 +chr4 140624561 140624920 MACS_peak_6101 83.78 +chr2 106328336 106328622 MACS_peak_4811 57.79 +chr11 114335454 114335791 MACS_peak_1626 77.00 +chr1 133850783 133851479 MACS_peak_381 101.13 +chr15 101084784 101085109 MACS_peak_3307 66.49 +chr1 121422851 121423230 MACS_peak_328 58.94 +chr5 50093335 50093768 MACS_peak_6328 78.73 +chr17 44569507 44569864 MACS_peak_3811 119.82 +chr9 40965392 40966162 
MACS_peak_8238 135.71 +chr18 57409148 57409560 MACS_peak_4116 95.69 +chr11 106227571 106228161 MACS_peak_1573 68.64 +chr12 106264328 106264856 MACS_peak_1994 115.56 +chr11 51694649 51695006 MACS_peak_1167 50.61 +chr14 73304152 73304540 MACS_peak_2723 56.45 +chr13 38249483 38249806 MACS_peak_2137 53.73 +chr17 23939899 23940349 MACS_peak_3642 104.48 +chr8 13353101 13353525 MACS_peak_7749 169.06 +chr6 134203272 134203641 MACS_peak_7145 68.62 +chr13 3869743 3870092 MACS_peak_2043 57.73 +chr14 71173919 71174385 MACS_peak_2717 65.91 +chr15 8711544 8712011 MACS_peak_2923 52.74 +chr14 60883642 60884282 MACS_peak_2628 73.32 +chr6 100238263 100238692 MACS_peak_7041 79.31 +chr18 43246353 43246775 MACS_peak_4094 54.45 +chr3 32427840 32428279 MACS_peak_5142 65.39 +chr4 114176976 114177339 MACS_peak_5890 69.10 +chr15 24413374 24413825 MACS_peak_2953 72.47 +chr17 24388206 24388689 MACS_peak_3648 52.74 +chr2 31983332 31984086 MACS_peak_4626 138.76 +chr1 82784012 82784449 MACS_peak_226 85.75 +chr11 115527669 115527986 MACS_peak_1642 74.85 +chr4 133958319 133958921 MACS_peak_6017 111.16 +chr3 33698778 33699287 MACS_peak_5144 72.46 +chr14 122276884 122277969 MACS_peak_2882 101.55 +chr12 87310571 87310976 MACS_peak_1938 95.23 +chr13 58545231 58545547 MACS_peak_2208 75.30 +chr4 151382308 151383335 MACS_peak_6186 211.80 +chr4 107730838 107731131 MACS_peak_5875 76.86 +chr7 127973750 127974191 MACS_peak_7591 56.60 +chr13 51831495 51831924 MACS_peak_2177 53.62 +chr6 113256331 113257237 MACS_peak_7059 493.29 +chr18 75366936 75367315 MACS_peak_4186 69.64 +chr8 83893580 83893897 MACS_peak_7903 64.52 +chr6 82852344 82853038 MACS_peak_6953 269.15 +chr5 123271183 123271619 MACS_peak_6591 123.24 +chr14 47344721 47345104 MACS_peak_2524 52.77 +chr3 152575073 152575476 MACS_peak_5646 52.74 +chr3 145596911 145597202 MACS_peak_5612 83.77 +chr9 63985221 63985609 MACS_peak_8354 62.36 +chr1 58851809 58852216 MACS_peak_124 60.72 +chr4 119140346 119140886 MACS_peak_5928 82.34 +chr17 24486224 24486632 
MACS_peak_3649 78.65 +chr6 34614205 34614765 MACS_peak_6840 62.73 +chr17 50210060 50210574 MACS_peak_3861 61.75 +chr18 31945760 31946081 MACS_peak_4043 57.12 +chr17 23680231 23680767 MACS_peak_3638 53.90 +chr15 38129140 38129536 MACS_peak_3012 59.66 +chr8 113782043 113782424 MACS_peak_8040 69.49 +chr18 36388485 36388844 MACS_peak_4075 60.49 +chr14 35176361 35177236 MACS_peak_2498 90.81 +chr15 58986223 58986533 MACS_peak_3088 76.07 +chr15 38230489 38231164 MACS_peak_3015 108.56 +chr2 26207239 26207961 MACS_peak_4567 93.92 +chr17 31700930 31701353 MACS_peak_3750 79.51 +chr14 69764756 69765167 MACS_peak_2692 66.11 +chr1 82630866 82631590 MACS_peak_222 118.15 +chr13 63463080 63463547 MACS_peak_2222 137.87 +chr9 88333602 88334003 MACS_peak_8442 170.03 +chr1 108414102 108414881 MACS_peak_317 194.11 +chr17 71202093 71202486 MACS_peak_3924 71.93 +chr2 4397930 4398339 MACS_peak_4456 96.07 +chr19 6313436 6313784 MACS_peak_4261 67.80 +chr17 47430218 47430545 MACS_peak_3836 57.96 +chr1 88383641 88384502 MACS_peak_245 82.34 +chr15 99307421 99307832 MACS_peak_3292 54.82 +chr10 87525443 87525844 MACS_peak_870 52.21 +chr1 137560338 137560774 MACS_peak_409 84.01 +chr2 98177089 98177336 MACS_peak_4795 66.43 +chr6 146904425 146904852 MACS_peak_7174 57.52 +chr3 88489494 88489890 MACS_peak_5293 71.68 +chrX 160870684 160870964 MACS_peak_8672 76.86 +chr5 96978763 96979127 MACS_peak_6459 66.50 +chr11 117515601 117516136 MACS_peak_1677 87.38 +chr8 129497368 129498236 MACS_peak_8150 121.81 +chr9 44134590 44135169 MACS_peak_8247 90.62 +chr3 157699307 157699805 MACS_peak_5678 169.27 +chr1 184472944 184473657 MACS_peak_582 98.66 +chr2 165780325 165780779 MACS_peak_5042 95.23 +chr12 52792839 52793385 MACS_peak_1814 54.49 +chr13 23855588 23856068 MACS_peak_2092 126.79 +chr18 83456033 83456433 MACS_peak_4211 68.08 +chr10 14425071 14425602 MACS_peak_620 94.14 +chr13 41582171 41582615 MACS_peak_2143 65.06 +chr10 94786191 94786440 MACS_peak_905 106.55 +chr8 109816960 109817536 MACS_peak_8026 135.33 
+chr12 81878967 81879611 MACS_peak_1912 119.33 +chr7 26059620 26059929 MACS_peak_7250 118.92 +chr4 62176380 62176995 MACS_peak_5799 54.70 +chr18 53436462 53436882 MACS_peak_4104 79.74 +chr11 51502212 51502759 MACS_peak_1164 56.31 +chr19 37281848 37282416 MACS_peak_4379 68.11 +chr16 92710525 92711133 MACS_peak_3535 77.20 +chr3 120874131 120875192 MACS_peak_5483 293.69 +chr13 17696479 17697123 MACS_peak_2068 170.31 +chr19 46077680 46078113 MACS_peak_4423 65.79 +chr5 143682333 143682913 MACS_peak_6736 222.86 +chr1 154568166 154568685 MACS_peak_448 95.23 +chr9 81108308 81108891 MACS_peak_8430 145.29 +chr12 44270399 44270849 MACS_peak_1806 72.54 +chr5 106128761 106129096 MACS_peak_6489 54.49 +chr4 120903868 120904161 MACS_peak_5932 76.86 +chr3 68460731 68461147 MACS_peak_5234 137.12 +chr3 58329271 58329978 MACS_peak_5204 131.98 +chr4 151462216 151462538 MACS_peak_6188 74.40 +chr17 34059082 34059521 MACS_peak_3763 62.02 +chr15 76554091 76554392 MACS_peak_3159 72.02 +chr2 117013160 117013597 MACS_peak_4828 61.25 +chr6 8719505 8720279 MACS_peak_6785 237.86 +chr6 102485635 102486092 MACS_peak_7043 67.00 +chr18 65959921 65960294 MACS_peak_4153 65.80 +chr11 3308916 3309259 MACS_peak_1025 175.63 +chr11 44333429 44333838 MACS_peak_1139 94.32 +chr1 9933866 9934292 MACS_peak_4 151.02 +chr2 49701652 49702023 MACS_peak_4700 86.63 +chr5 147072457 147073005 MACS_peak_6754 70.79 +chr14 21993252 21993662 MACS_peak_2423 117.33 +chr14 76186318 76187158 MACS_peak_2750 90.56 +chr3 139131096 139131484 MACS_peak_5580 50.20 +chr7 26466797 26467478 MACS_peak_7255 106.79 +chr1 123464347 123464745 MACS_peak_335 70.93 +chr7 132615669 132615970 MACS_peak_7605 72.43 +chr10 86169149 86169550 MACS_peak_867 103.08 +chr13 49863349 49863865 MACS_peak_2174 74.46 +chr7 86508733 86509023 MACS_peak_7431 62.85 +chr11 77378344 77378754 MACS_peak_1326 82.92 +chr6 86634390 86634683 MACS_peak_6980 103.74 +chr10 116871908 116872390 MACS_peak_961 57.59 +chr14 41750272 41750682 MACS_peak_2513 95.75 +chr4 151469965 
151470437 MACS_peak_6189 78.61 +chr10 60875630 60876040 MACS_peak_740 67.36 +chr6 91572361 91572723 MACS_peak_7021 70.91 +chr6 128795347 128795755 MACS_peak_7131 94.41 +chr18 58701674 58702163 MACS_peak_4120 87.63 +chr5 110698548 110699046 MACS_peak_6511 143.30 +chrX 93325546 93325822 MACS_peak_8637 91.45 +chr7 134432899 134433190 MACS_peak_7644 106.55 +chr12 29320679 29321102 MACS_peak_1769 54.01 +chr10 20124963 20125579 MACS_peak_639 217.11 +chr3 106416991 106417459 MACS_peak_5432 62.73 +chr10 33739040 33739699 MACS_peak_678 218.19 +chr8 74878095 74878556 MACS_peak_7883 94.87 +chr11 83382803 83383271 MACS_peak_1395 68.88 +chr3 58862746 58863399 MACS_peak_5208 157.00 +chr15 77685670 77686204 MACS_peak_3162 62.73 +chr14 86615475 86615825 MACS_peak_2782 87.58 +chr6 124867137 124867557 MACS_peak_7105 62.38 +chr13 55514580 55515153 MACS_peak_2199 117.94 +chr5 102254699 102255015 MACS_peak_6477 60.56 +chr2 25923796 25924257 MACS_peak_4561 56.91 +chr3 152061055 152061395 MACS_peak_5639 51.22 +chr6 86334798 86335187 MACS_peak_6976 60.17 +chr17 53706687 53707067 MACS_peak_3872 69.57 +chr7 18595991 18596427 MACS_peak_7225 65.59 +chr2 166445552 166446686 MACS_peak_5044 237.97 +chrX 12567874 12568307 MACS_peak_8607 92.18 +chr9 57187655 57188087 MACS_peak_8309 150.82 +chr3 96553930 96554350 MACS_peak_5363 58.24 +chr11 49794755 49795218 MACS_peak_1156 97.10 +chr12 86453801 86454179 MACS_peak_1931 69.72 +chr11 113544835 113545435 MACS_peak_1622 189.84 +chr14 14820991 14821374 MACS_peak_2395 50.82 +chr11 115936845 115937272 MACS_peak_1653 53.75 +chr9 66358765 66359165 MACS_peak_8375 66.29 +chr3 14944703 14945194 MACS_peak_5112 88.33 +chr4 147514611 147514949 MACS_peak_6139 51.38 +chr2 154429029 154429495 MACS_peak_4972 62.73 +chr17 21082204 21082596 MACS_peak_3632 82.04 +chr17 14304400 14304838 MACS_peak_3625 65.46 +chr5 65288018 65288287 MACS_peak_6358 72.51 +chr2 26518695 26519103 MACS_peak_4576 63.21 +chr1 9272967 9273590 MACS_peak_2 182.17 +chr9 79658076 79658784 
MACS_peak_8427 64.31 +chr12 100872379 100872633 MACS_peak_1969 57.15 +chr2 128684574 128684999 MACS_peak_4881 74.42 +chr2 154558527 154559018 MACS_peak_4975 69.69 +chr19 61171106 61171535 MACS_peak_4446 66.05 +chr6 88431783 88432112 MACS_peak_7003 73.77 +chr3 136303454 136303877 MACS_peak_5562 56.77 +chr12 71328424 71329097 MACS_peak_1859 114.45 +chr6 72059443 72059831 MACS_peak_6930 67.17 +chr7 26514419 26514743 MACS_peak_7262 83.43 +chr12 88174557 88175940 MACS_peak_1945 236.16 +chr17 29530544 29530849 MACS_peak_3735 56.94 +chr10 80895647 80896054 MACS_peak_845 153.53 +chr7 82925765 82926126 MACS_peak_7421 129.84 +chr13 105607134 105607556 MACS_peak_2347 50.79 +chr10 19311634 19312299 MACS_peak_631 73.24 +chr11 113667257 113667748 MACS_peak_1624 426.44 +chr17 29301427 29301965 MACS_peak_3729 91.69 +chr3 28680019 28680490 MACS_peak_5135 89.03 +chr12 70548827 70549327 MACS_peak_1849 143.57 +chr9 57758663 57758985 MACS_peak_8322 57.05 +chr12 55963541 55963840 MACS_peak_1831 67.73 +chr10 80814336 80815092 MACS_peak_838 73.53 +chr9 106106639 106107165 MACS_peak_8487 92.77 +chr18 31948500 31948917 MACS_peak_4044 115.28 +chr3 97817136 97817796 MACS_peak_5372 108.56 +chr5 115608262 115608636 MACS_peak_6540 61.95 +chr1 92677004 92677359 MACS_peak_270 56.79 +chr4 154536407 154536875 MACS_peak_6204 179.88 +chr11 78866726 78867449 MACS_peak_1362 160.60 +chr1 58502152 58502677 MACS_peak_120 107.68 +chr1 78654241 78654789 MACS_peak_210 256.08 +chr17 91266249 91266654 MACS_peak_3987 67.72 +chr10 80392804 80393085 MACS_peak_829 60.79 +chr9 123168920 123169217 MACS_peak_8575 89.95 +chr18 64648305 64648906 MACS_peak_4135 50.43 +chr16 45025492 45026112 MACS_peak_3450 84.01 +chr11 120459679 120460020 MACS_peak_1701 71.31 +chr13 19606884 19607473 MACS_peak_2072 72.34 +chr7 134324277 134324565 MACS_peak_7637 91.45 +chr12 32908423 32908906 MACS_peak_1781 89.00 +chr15 24842100 24842543 MACS_peak_2955 75.93 +chr18 6489913 6490350 MACS_peak_4007 97.14 +chr14 122247568 122247901 
MACS_peak_2881 54.94 +chr10 98856446 98856735 MACS_peak_921 64.17 +chr1 182741146 182741385 MACS_peak_566 91.45 +chr4 45045503 45045718 MACS_peak_5770 81.78 +chr1 95531960 95532820 MACS_peak_290 140.79 +chr2 152643374 152643779 MACS_peak_4954 153.80 +chr10 79317823 79318310 MACS_peak_795 114.99 +chr5 106146989 106147692 MACS_peak_6490 267.90 +chr4 34634278 34634904 MACS_peak_5728 116.61
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chipsequtil-master/examples/nib/test_batch_fasta.py Mon Mar 07 16:18:10 2016 -0500 @@ -0,0 +1,9 @@ +from chipsequtil import get_org_settings, BEDFile +from chipsequtil.nib import NibDB +from pprint import pprint + +genome_dir = get_org_settings('mm9')['genome_dir'] +db = NibDB(nib_dirs=[genome_dir]) +fasta_headers,seqs = db.get_fasta_from_bed('shuffled_peaks.bed') + +pprint(seqs[:10])
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chipsequtil-master/examples/nib/test_nib_db.py Mon Mar 07 16:18:10 2016 -0500 @@ -0,0 +1,47 @@ +from chipsequtil import get_org_settings, BEDFile +from chipsequtil.nib import NibDB +from pprint import pprint + +# see `org_settings.py -h` for more info on get_org_settings(<organism>) function +genome_dir = get_org_settings('mm9')['genome_dir'] + +# NibDB is an interface to a collection of nib files, typically corresponding +# to chromosomes of a genome + +# example with only one nib file +print 'NibDB with a single nib file' +db = NibDB(nib_fns=[genome_dir+'/chr1.nib']) + +print 'NibDB info:' +pprint(dict(db.db_info)) + +# get a fasta record for some sequence +print 'Example fasta record: chr1:1e8-1e8+100' +print db.get_fasta('chr1',1e8,1e8+100) + +# get just the sequence +print 'Same example, only sequence:' +print db.get_seq('chr1',1e8,1e8+100) +print + + +# example with a directory of nib files +print 'NibDB with a directory of nib files' +db = NibDB(nib_dirs=[genome_dir]) + +# get a fasta record for some sequence +print 'Example fasta record: chr1:1e8-1e8+100' +print db.get_fasta('chr1',1e8,1e8+100) + +print 'Example fasta record: chr1:1e8-1e8+100' +print db.get_fasta('chr2',1e8,1e8+100) + +print 'Example fasta record: chr1:1e8-1e8+100' +print db.get_fasta('chrX',1e8,1e8+100) + + +# example of fetching all sequences from a bed file +fasta_headers,seqs = db.get_fasta_from_bed('shuffled_peaks.bed') + +print 'Num. peaks:',len(open('shuffled_peaks.bed').readlines()) +pprint(seqs[:10])
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chipsequtil-master/examples/seq/test_chipsequtil_seq.py Mon Mar 07 16:18:10 2016 -0500 @@ -0,0 +1,19 @@ +from StringIO import StringIO +from chipsequtil.seq import FASTAFile, FASTQFile + +fasta_str = StringIO(">seq1\nACATAGGGAT\n>seq2\nTTATNTAGATA\n") +fasta_f = FASTAFile(fasta_str) +print fasta_f.headers + +print "[r for r in fasta_f]", [r for r in fasta_f] +print "fasta_f['seq1']", fasta_f['seq1'] +print "fasta_f.headers", fasta_f.headers +print "fasta_f.sequences", fasta_f.sequences + +fastq_str = StringIO("@seq1\nACATAGGGAT\n+seq2\nY^_cccQYJQ\n@seq2\nTTATNTAGATA\n+seq2\nY^_cJcQQJQ") +fastq_f = FASTQFile(fastq_str) +print "[r for r in fastq_f]", [r for r in fastq_f] +print "fastq_f['seq1']", fastq_f['seq1'] +print "fastq_f.headers", fastq_f.headers +print "fastq_f.sequences", fastq_f.sequences +print "fastq_f.quals", fastq_f.quals
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chipsequtil-master/ez_setup.py Mon Mar 07 16:18:10 2016 -0500 @@ -0,0 +1,284 @@ +#!python +"""Bootstrap setuptools installation + +If you want to use setuptools in your package's setup.py, just include this +file in the same directory with it, and add this to the top of your setup.py:: + + from ez_setup import use_setuptools + use_setuptools() + +If you want to require a specific version of setuptools, set a download +mirror, or use an alternate download directory, you can do so by supplying +the appropriate options to ``use_setuptools()``. + +This file can also be run as a script to install or upgrade setuptools. +""" +import sys +DEFAULT_VERSION = "0.6c11" +DEFAULT_URL = "http://pypi.python.org/packages/%s/s/setuptools/" % sys.version[:3] + +md5_data = { + 'setuptools-0.6b1-py2.3.egg': '8822caf901250d848b996b7f25c6e6ca', + 'setuptools-0.6b1-py2.4.egg': 'b79a8a403e4502fbb85ee3f1941735cb', + 'setuptools-0.6b2-py2.3.egg': '5657759d8a6d8fc44070a9d07272d99b', + 'setuptools-0.6b2-py2.4.egg': '4996a8d169d2be661fa32a6e52e4f82a', + 'setuptools-0.6b3-py2.3.egg': 'bb31c0fc7399a63579975cad9f5a0618', + 'setuptools-0.6b3-py2.4.egg': '38a8c6b3d6ecd22247f179f7da669fac', + 'setuptools-0.6b4-py2.3.egg': '62045a24ed4e1ebc77fe039aa4e6f7e5', + 'setuptools-0.6b4-py2.4.egg': '4cb2a185d228dacffb2d17f103b3b1c4', + 'setuptools-0.6c1-py2.3.egg': 'b3f2b5539d65cb7f74ad79127f1a908c', + 'setuptools-0.6c1-py2.4.egg': 'b45adeda0667d2d2ffe14009364f2a4b', + 'setuptools-0.6c10-py2.3.egg': 'ce1e2ab5d3a0256456d9fc13800a7090', + 'setuptools-0.6c10-py2.4.egg': '57d6d9d6e9b80772c59a53a8433a5dd4', + 'setuptools-0.6c10-py2.5.egg': 'de46ac8b1c97c895572e5e8596aeb8c7', + 'setuptools-0.6c10-py2.6.egg': '58ea40aef06da02ce641495523a0b7f5', + 'setuptools-0.6c11-py2.3.egg': '2baeac6e13d414a9d28e7ba5b5a596de', + 'setuptools-0.6c11-py2.4.egg': 'bd639f9b0eac4c42497034dec2ec0c2b', + 'setuptools-0.6c11-py2.5.egg': '64c94f3bf7a72a13ec83e0b24f2749b2', + 
'setuptools-0.6c11-py2.6.egg': 'bfa92100bd772d5a213eedd356d64086', + 'setuptools-0.6c2-py2.3.egg': 'f0064bf6aa2b7d0f3ba0b43f20817c27', + 'setuptools-0.6c2-py2.4.egg': '616192eec35f47e8ea16cd6a122b7277', + 'setuptools-0.6c3-py2.3.egg': 'f181fa125dfe85a259c9cd6f1d7b78fa', + 'setuptools-0.6c3-py2.4.egg': 'e0ed74682c998bfb73bf803a50e7b71e', + 'setuptools-0.6c3-py2.5.egg': 'abef16fdd61955514841c7c6bd98965e', + 'setuptools-0.6c4-py2.3.egg': 'b0b9131acab32022bfac7f44c5d7971f', + 'setuptools-0.6c4-py2.4.egg': '2a1f9656d4fbf3c97bf946c0a124e6e2', + 'setuptools-0.6c4-py2.5.egg': '8f5a052e32cdb9c72bcf4b5526f28afc', + 'setuptools-0.6c5-py2.3.egg': 'ee9fd80965da04f2f3e6b3576e9d8167', + 'setuptools-0.6c5-py2.4.egg': 'afe2adf1c01701ee841761f5bcd8aa64', + 'setuptools-0.6c5-py2.5.egg': 'a8d3f61494ccaa8714dfed37bccd3d5d', + 'setuptools-0.6c6-py2.3.egg': '35686b78116a668847237b69d549ec20', + 'setuptools-0.6c6-py2.4.egg': '3c56af57be3225019260a644430065ab', + 'setuptools-0.6c6-py2.5.egg': 'b2f8a7520709a5b34f80946de5f02f53', + 'setuptools-0.6c7-py2.3.egg': '209fdf9adc3a615e5115b725658e13e2', + 'setuptools-0.6c7-py2.4.egg': '5a8f954807d46a0fb67cf1f26c55a82e', + 'setuptools-0.6c7-py2.5.egg': '45d2ad28f9750e7434111fde831e8372', + 'setuptools-0.6c8-py2.3.egg': '50759d29b349db8cfd807ba8303f1902', + 'setuptools-0.6c8-py2.4.egg': 'cba38d74f7d483c06e9daa6070cce6de', + 'setuptools-0.6c8-py2.5.egg': '1721747ee329dc150590a58b3e1ac95b', + 'setuptools-0.6c9-py2.3.egg': 'a83c4020414807b496e4cfbe08507c03', + 'setuptools-0.6c9-py2.4.egg': '260a2be2e5388d66bdaee06abec6342a', + 'setuptools-0.6c9-py2.5.egg': 'fe67c3e5a17b12c0e7c541b7ea43a8e6', + 'setuptools-0.6c9-py2.6.egg': 'ca37b1ff16fa2ede6e19383e7b59245a', +} + +import sys, os +try: from hashlib import md5 +except ImportError: from md5 import md5 + +def _validate_md5(egg_name, data): + if egg_name in md5_data: + digest = md5(data).hexdigest() + if digest != md5_data[egg_name]: + print >>sys.stderr, ( + "md5 validation of %s failed! 
(Possible download problem?)" + % egg_name + ) + sys.exit(2) + return data + +def use_setuptools( + version=DEFAULT_VERSION, download_base=DEFAULT_URL, to_dir=os.curdir, + download_delay=15 +): + """Automatically find/download setuptools and make it available on sys.path + + `version` should be a valid setuptools version number that is available + as an egg for download under the `download_base` URL (which should end with + a '/'). `to_dir` is the directory where setuptools will be downloaded, if + it is not already available. If `download_delay` is specified, it should + be the number of seconds that will be paused before initiating a download, + should one be required. If an older version of setuptools is installed, + this routine will print a message to ``sys.stderr`` and raise SystemExit in + an attempt to abort the calling script. + """ + was_imported = 'pkg_resources' in sys.modules or 'setuptools' in sys.modules + def do_download(): + egg = download_setuptools(version, download_base, to_dir, download_delay) + sys.path.insert(0, egg) + import setuptools; setuptools.bootstrap_install_from = egg + try: + import pkg_resources + except ImportError: + return do_download() + try: + pkg_resources.require("setuptools>="+version); return + except pkg_resources.VersionConflict, e: + if was_imported: + print >>sys.stderr, ( + "The required version of setuptools (>=%s) is not available, and\n" + "can't be installed while this script is running. Please install\n" + " a more recent version first, using 'easy_install -U setuptools'." 
+ "\n\n(Currently using %r)" + ) % (version, e.args[0]) + sys.exit(2) + else: + del pkg_resources, sys.modules['pkg_resources'] # reload ok + return do_download() + except pkg_resources.DistributionNotFound: + return do_download() + +def download_setuptools( + version=DEFAULT_VERSION, download_base=DEFAULT_URL, to_dir=os.curdir, + delay = 15 +): + """Download setuptools from a specified location and return its filename + + `version` should be a valid setuptools version number that is available + as an egg for download under the `download_base` URL (which should end + with a '/'). `to_dir` is the directory where the egg will be downloaded. + `delay` is the number of seconds to pause before an actual download attempt. + """ + import urllib2, shutil + egg_name = "setuptools-%s-py%s.egg" % (version,sys.version[:3]) + url = download_base + egg_name + saveto = os.path.join(to_dir, egg_name) + src = dst = None + if not os.path.exists(saveto): # Avoid repeated downloads + try: + from distutils import log + if delay: + log.warn(""" +--------------------------------------------------------------------------- +This script requires setuptools version %s to run (even to display +help). I will attempt to download it for you (from +%s), but +you may need to enable firewall access for this script first. +I will start the download in %d seconds. + +(Note: if this machine does not have network access, please obtain the file + + %s + +and place it in this directory before rerunning this script.) +---------------------------------------------------------------------------""", + version, download_base, delay, url + ); from time import sleep; sleep(delay) + log.warn("Downloading %s", url) + src = urllib2.urlopen(url) + # Read/write all in one block, so we don't create a corrupt file + # if the download is interrupted. 
+ data = _validate_md5(egg_name, src.read()) + dst = open(saveto,"wb"); dst.write(data) + finally: + if src: src.close() + if dst: dst.close() + return os.path.realpath(saveto) + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +def main(argv, version=DEFAULT_VERSION): + """Install or upgrade setuptools and EasyInstall""" + try: + import setuptools + except ImportError: + egg = None + try: + egg = download_setuptools(version, delay=0) + sys.path.insert(0,egg) + from setuptools.command.easy_install import main + return main(list(argv)+[egg]) # we're done here + finally: + if egg and os.path.exists(egg): + os.unlink(egg) + else: + if setuptools.__version__ == '0.0.1': + print >>sys.stderr, ( + "You have an obsolete version of setuptools installed. Please\n" + "remove it from your system entirely before rerunning this script." + ) + sys.exit(2) + + req = "setuptools>="+version + import pkg_resources + try: + pkg_resources.require(req) + except pkg_resources.VersionConflict: + try: + from setuptools.command.easy_install import main + except ImportError: + from easy_install import main + main(list(argv)+[download_setuptools(delay=0)]) + sys.exit(0) # try to force an exit + else: + if argv: + from setuptools.command.easy_install import main + main(argv) + else: + print "Setuptools version",version,"or greater has been installed." + print '(Run "ez_setup.py -U setuptools" to reinstall or upgrade.)' + +def update_md5(filenames): + """Update our built-in md5 registry""" + + import re + + for name in filenames: + base = os.path.basename(name) + f = open(name,'rb') + md5_data[base] = md5(f.read()).hexdigest() + f.close() + + data = [" %r: %r,\n" % it for it in md5_data.items()] + data.sort() + repl = "".join(data) + + import inspect + srcfile = inspect.getsourcefile(sys.modules[__name__]) + f = open(srcfile, 'rb'); src = f.read(); f.close() + + match = re.search("\nmd5_data = {\n([^}]+)}", src) + if not match: + print >>sys.stderr, "Internal error!" 
+ sys.exit(2) + + src = src[:match.start(1)] + repl + src[match.end(1):] + f = open(srcfile,'w') + f.write(src) + f.close() + + +if __name__=='__main__': + if len(sys.argv)>2 and sys.argv[1]=='--md5update': + update_md5(sys.argv[2:]) + else: + main(sys.argv[1:]) + + + + + +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chipsequtil-master/install.sh Mon Mar 07 16:18:10 2016 -0500 @@ -0,0 +1,31 @@ +#!/bin/bash + +# this script installs chipsequtils into /usr/local on the cluster nodes +# since the cluster nodes do not have root write access to the network +# volumes from nodes other than node 9, sudo ... doesn't work with +# setuptools because it writes egg_info to the source directory and I +# can't figure out how to get it to write to a local directory. +# +# this script copies the entire chipsequtil source tree to /tmp on the +# local machine, runs sudo ./setup.py install --prefix=/usr/local, and, +# on success, deletes the temporary source directory +# +# it _must_ be run from the source directory + +TMPDIR="/tmp/chipsequtil_tmp_$(date +%F)" +if [ ! -d $TMPDIR ]; then + echo "temporary source dir $TMPDIR does not exist, creating" + mkdir $TMPDIR +fi + +cd ../ +echo "copying source tree to $TMPDIR" +cp -vr -t $TMPDIR chipsequtil/{setup.*,ez_setup.py,src,scripts,setuptools*} +cd $TMPDIR +echo "cd'ed to $PWD, installing" +sudo ./setup.py install --prefix=/usr/local +if [ $? -eq 0 ]; then + echo "install successful, removing $TMPDIR" + cd + sudo rm -r $TMPDIR +fi
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chipsequtil-master/org_settings.cfg Mon Mar 07 16:18:10 2016 -0500 @@ -0,0 +1,120 @@ +# This file is used by org_settings.py to return sets of paths/settings like +# genomic sequence files, genome sizes, etc. It is formatted according to +# Python's ConfigParser.CongifParser specification: +# +# http://docs.python.org/library/configparser.html +# +# Before installation, add any system-specific settings to the categories below, +# where categories correspond to organism/genome names, creating new category +# headings where desired. +# +# User-specific organisms and settings may be specified in: +# +# os.path.expanduser('~/.org_settings.cfg') +# +# with the same format. Settings in user configuration files override system-wide +# settings. +# +# A minimal organism configuration requires at least genome_dir and genome_size, +# other settings may be required for different tools (e.g. theme_* for THEME.py) +# +# field values can contain no spaces if they are to be exported to the command line +# (i.e. 
with org_settings.py) + +[hg18] +description=UCSC hg18 (March '06 build) with full TRANSFAC hypothesis set +genome=hg18 +genome_dir=/nfs/genomes/human_gp_mar_06 +genome_size=2700000000 +ucsc_chrom_sizes=%(genome_dir)s/%(genome)s.chrom.sizes +annotation_path=%(genome_dir)s/anno/refFlat-hg18-2010-08-17.txt +refgene_anno_path=%(genome_dir)s/anno/refFlat-hg18-2010-08-17.txt +known_gene_anno_path=%(genome_dir)s/anno/knownGene-hg18-2010-08-17.txt +known_gene_xref_path=%(genome_dir)s/anno/kgXref-hg18-2010-08-17.txt +theme_hypotheses=/nfs/vendata/cwng/TRANSFAC/2010_transfac_vert_all_filtic9.tamo +theme_markov=%(genome_dir)s/hg18_promoters_3000_1000.markov + +[hg18clust] +description=UCSC hg18 (March '06 build) with clustered TRANSFAC hypothesis set +genome=hg18 +genome=hg18 +genome_dir=/nfs/genomes/human_gp_mar_06 +genome_size=2700000000 +ucsc_chrom_sizes=%(genome_dir)s/%(genome)s.chrom.sizes +annotation_path=%(genome_dir)s/anno/refFlat-hg18-2010-08-17.txt +refgene_anno_path=%(genome_dir)s/anno/refFlat-hg18-2010-08-17.txt +known_gene_anno_path=%(genome_dir)s/anno/knownGene-hg18-2010-08-17.txt +known_gene_xref_path=%(genome_dir)s/anno/kgXref-hg18-2010-08-17.txt +theme_hypotheses=/nfs/vendata/cwng/TRANSFAC/2010_transfac_vert_all_filtic9_sym_clus4.tamo +theme_markov=%(genome_dir)s/hg18_promoters_3000_1000.markov +weeder_freqfiles_path=%(genome_dir)s/weeder + +[hg19] +description=UCSC hg19 (Feb '09 build) with full TRANSFAC hypothesis set +genome=hg19 +genome_dir=/nfs/genomes/human_gp_feb_09 +genome_size=2700000000 +ucsc_chrom_sizes=%(genome_dir)s/%(genome)s.chrom.sizes +annotation_path=%(genome_dir)s/anno/refFlat-%(genome)s-2011-01-04.txt +refgene_anno_path=%(genome_dir)s/anno/refFlat-%(genome)s-2011-01-04.txt +known_gene_anno_path=%(genome_dir)s/anno/knownGene-%(genome)s-2011-01-04.txt +known_gene_xref_path=%(genome_dir)s/anno/kgXref-%(genome)s-2011-01-04.txt +theme_hypotheses=/nfs/vendata/cwng/TRANSFAC/2010_transfac_vert_all_filtic9.tamo 
+theme_markov=%(genome_dir)s/hg19_promoters_3000_1000.markov + +[hg19clust] +description=UCSC hg19 (Feb '09 build) with clustered TRANSFAC hypothesis set +genome=hg19 +genome_dir=/nfs/genomes/human_gp_feb_09 +genome_size=2700000000 +ucsc_chrom_sizes=%(genome_dir)s/%(genome)s.chrom.sizes +annotation_path=%(genome_dir)s/anno/refFlat-%(genome)s-2011-01-04.txt +refgene_anno_path=%(genome_dir)s/anno/refFlat-%(genome)s-2011-01-04.txt +known_gene_anno_path=%(genome_dir)s/anno/knownGene-%(genome)s-2011-01-04.txt +known_gene_xref_path=%(genome_dir)s/anno/kgXref-%(genome)s-2011-01-04.txt +theme_hypotheses=/nfs/vendata/cwng/TRANSFAC/2010_transfac_vert_all_filtic9_sym_clus4.tamo +theme_markov=%(genome_dir)s/hg19_promoters_3000_1000.markov + +[mm9] +description=UCSC mm9 (July '07 build) with full TRANSFAC hypothesis set +genome=mm9 +genome_dir=/nfs/genomes/mouse_gp_jul_07 +genome_size=2107000000 +ucsc_chrom_sizes=%(genome_dir)s/%(genome)s.chrom.sizes +annotation_path=%(genome_dir)s/anno/refFlat-%(genome)s.txt +refgene_anno_path=%(genome_dir)s/anno/refFlat-%(genome)s.txt +known_gene_anno_path=%(genome_dir)s/anno/knownGene-%(genome)s.txt +known_gene_xref_path=%(genome_dir)s/anno/kgXref-%(genome)s.txt +affy_to_known_path=%(genome_dir)s/anno/knownToMOE43-%(genome)s.txt +theme_hypotheses=/nfs/vendata/cwng/TRANSFAC/2010_transfac_vert_all_filtic9.tamo +theme_markov=/nfs/data/cwng/chipseq/hypotheses/Mouse.markov + +[mm9clust] +description=UCSC mm9 (July '07 build) with clustered TRANSFAC hypothesis set +genome=mm9 +genome_dir=/nfs/genomes/mouse_gp_jul_07 +genome_size=2107000000 +ucsc_chrom_sizes=%(genome_dir)s/%(genome)s.chrom.sizes +annotation_path=%(genome_dir)s/anno/refFlat-%(genome)s.txt +refgene_anno_path=%(genome_dir)s/anno/refFlat-%(genome)s.txt +known_gene_anno_path=%(genome_dir)s/anno/knownGene-%(genome)s.txt +known_gene_xref_path=%(genome_dir)s/anno/kgXref-%(genome)s.txt +affy_to_known_path=%(genome_dir)s/anno/knownToMOE430-%(genome)s.txt 
+theme_hypotheses=/nfs/vendata/cwng/TRANSFAC/2010_transfac_vert_all_filtic9_sym_clus4.tamo +theme_markov=/nfs/data/cwng/chipseq/hypotheses/Mouse.markov + +[mm8] +description=UCSC mm8 (March '07 build) with full TRANSFAC hypothesis set +genome=mm8 +genome_dir=/nfs/genomes/mouse_gp_mar_06 +genome_size=2107000000 +ucsc_chrom_sizes=%(genome_dir)s/%(genome)s.chrom.sizes +refgene_anno_path=%(genome_dir)s/anno/refFlat-2010-08-26.txt +annotation_path=%(refgene_anno_path)s +known_gene_anno_path=%(genome_dir)s/anno/knownGene-2010-08-26.txt +known_gene_xref_path=%(genome_dir)s/anno/kgXref-2010-08-26.txt +affy_to_known_path=%(genome_dir)s/anno/knownToMOE430-2010-08-26.txt +theme_hypotheses=/nfs/vendata/cwng/TRANSFAC/2010_transfac_vert_all_filtic9.tamo +theme_markov=/nfs/data/cwng/chipseq/hypotheses/Mouse.markov + +# others...
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chipsequtil-master/org_settings.cfg.sample Mon Mar 07 16:18:10 2016 -0500 @@ -0,0 +1,39 @@ +# This file is used by org_settings.py to return sets of paths/settings like +# genomic sequence files, genome sizes, etc. It is formatted according to +# Python's ConfigParser.CongifParser specification: +# +# http://docs.python.org/library/configparser.html +# +# Before installation, add any system-specific settings to the categories below, +# where categories correspond to organism/genome names, creating new category +# headings where desired. +# +# User-specific organisms and settings may be specified in: +# +# os.path.expanduser('~/.org_settings.cfg') +# +# with the same format. Settings in user configuration files override system-wide +# settings. +# +# A minimal organism configuration requires at least genome_dir and genome_size, +# other settings may be required for different tools (e.g. theme_* for THEME.py) +# +# field values can contain no spaces if they are to be exported to the command line +# (i.e. with org_settings.py) + +[human] +description= +genome_dir= +genome_size= +annotation_path= +theme_hyp= +theme_markov= +# others... + +[mouse] +genome_dir=/nfs/genomes/mouse_gp_jul_07 +genome_size=2107000000 +annotation_path=%(genome_dir)s/anno/refFlat.txt +theme_hyp=/nfs/vendata/cwng/motifs/TRANSFAC_vert_filt9_clus4_trunc.tamo +theme_markov=/nfs/data/cwng/chipseq/hypotheses/Mouse.markov +# others...
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chipsequtil-master/scripts/THEME.sh Mon Mar 07 16:18:10 2016 -0500 @@ -0,0 +1,177 @@ +#!/bin/bash + +THEME_EXE=/nfs/data/cwng/archive/cvEM.64/THEME_edit.py + +OPT_SPEC=' +{ +"NAME": "THEME.sh", +"DESC": "Run old THEME version", +"ARGS": ["FG_FASTA","BG_FASTA","HYP_FN","MARKOV"], +"OPTS": { + "CV":{"LONG":"--cv","DEFAULT":5,"TYPE":"int","HELP":"number of cross validation folds [default:%default]"}, + "NOREFINE":{"LONG":"--no-refine","ACTION":"store_true","HELP":"do not run with refinement"}, + "BETA":{"LONG":"--beta","DEFAULT":0.7,"TYPE":"float","HELP":"beta parameter to use [default:%default]"}, + "DELTA":{"LONG":"--delta","DEFAULT":0.001,"TYPE":"float","HELP":"delta parameter to use [default:%default]"}, + "RANDOMIZE":{"LONG":"--randomization","ACTION":"store_true","HELP":"run randomization"}, + "MOTIF_FN":{"LONG":"--motif-file","DEFAULT":"dummy.out","HELP":"filename to write motif results to [default:%default]"}, + "OUTPUT_FN":{"LONG":"--output-filename","DEFAULT":"dummy.txt","HELP":"filename to write motif results to [default:%default]"}, + "RANDOM_FN":{"LONG":"--random-output","DEFAULT":"random.txt","HELP":"filename to write motif results to [default:%default]"}, + "DUMP":{"LONG":"--dump","ACTION":"store_true","HELP":"dump categtories to file"}, + "REM_COM":{"LONG":"--remove-common","ACTION":"store_true","HELP":"remove common sequences from analysis"}, + "NOPARALLEL":{"LONG":"--no-parallelize","ACTION":"store_true","HELP":"do not use wqsub.py for parallelization"}, + "INTERACTIVE":{"LONG":"--interactive","ACTION":"store_true","HELP":"run the script interactively"}, + "HYP_INDS":{"LONG":"--hyp-indices","DEFAULT":"ALL","HELP":"0-based indices of hypotheses to run [default: %default]"}, + "VERBOSE":{"SHORT":"-v","LONG":"--verbose","ACTION":"store_true","HELP":"print out the commands that are being run"}, + "TRIALS":{"LONG":"--trials","HELP":"this option is here only for backwards compatibility with THEME.py"} + } 
+}' +OUTPUT=$(echo $OPT_SPEC | getopts.py --shell=bash -- $@) +GETOPTS_RET=$? +if [ $GETOPTS_RET -ne 0 ]; then + exit 1 +fi +$OUTPUT + +INTERACTIVE_FLAG="--auto" +if [ $INTERACTIVE != "None" ]; then + INTERACTIVE_FLAG= +fi + +eval "$(steplist.py $INTERACTIVE_FLAG -t "Run THEME" THEME "Wait for jobs" "Combine results")" + +# run THEME +OUTDIR=THEME_data +test \! -e $OUTDIR && mkdir $OUTDIR + +WQSUB_EXE="wqsub.py" +if [ $NOPARALLEL != "None" ]; then + WQSUB_EXE= +fi + +RANDOMIZE_FLAG= +if [ $RANDOMIZE != "None" ]; then + RANDOMIZE_FLAG="-randomization" +fi + +RC= +if [ $RC ]; then + RC='-rc' +fi + +if [ $HYP_INDS != "ALL" ]; then + HYP_INDS=$(parse_steplist.py $HYP_INDS) + HYP_INDS_STATUS=$? + if [ $HYP_INDS_STATUS != 0 ]; then + echo "Incorrectly formatted argument to --hyp-indices option, aborting" + exit $HYP_INDS_STATUS + fi +else + NUM_HYPS=`grep -c '^Source' $HYP_FN` + NUM_HYPS=$(($NUM_HYPS-1)) + HYP_INDS=$(seq 0 $NUM_HYPS) +fi + +JOBIDS= +next_step && \ +for i in $HYP_INDS +do + + WQSUB= + REDIRECT= + if [ ! -z $WQSUB_EXE ]; then + WQSUB="$WQSUB_EXE --wqsub-name=THEME_$i" + fi + + OUTPRE=$OUTDIR/$i + + CMD="$WQSUB python $THEME_EXE $FG_FASTA $BG_FASTA $i \ + -fse $HYP_FN -markov $MARKOV -cv $CV -beta $BETA \ + -delta $DELTA -motif_file $OUTPRE.tamo -out_file $OUTPRE.txt \ + $RC" + JOBID=$($WQSUB $CMD) + JOBIDS="$JOBID $JOBIDS" + if [ $VERBOSE != "None" ]; then + echo $WQSUB $CMD + fi + + if [ $RANDOMIZE != "None" ]; then + + WQSUB="$WQSUB_EXE --wqsub-name=THEME_rand_$i" + + CMD="$WQSUB python $THEME_EXE $FG_FASTA $BG_FASTA $i \ + -fse $HYP_FN -markov $MARKOV -cv $CV -beta $BETA \ + -delta $DELTA -out_file ${OUTPRE}_rand_output.txt \ + -random_file ${OUTPRE}_rand.txt $RC -randomization" + + JOBID=$($WQSUB $CMD) + JOBIDS="$JOBID $JOBIDS" + + if [ $VERBOSE != "None" ]; then + echo $WQSUB $CMD -randomization + fi + fi + +done + + +# wait for jobs +next_step && wait_for_jobid.py $JOBIDS + +# compile results +next_step +DO_COMPILE=$? 
+if [ $DO_COMPILE == 0 ]; then + + rm -f $MOTIF_FN && touch $MOTIF_FN + ( + cd $OUTDIR + ls *.tamo | sort -n | xargs -n1 -I{} -t cat {} >> ../$MOTIF_FN + ) + + if [ $NOPARALLEL == "None" ]; then + mv -f *.{err,out} THEME_data + fi + + if [ $RANDOMIZE != "None" ]; then + rm -f $RANDOM_FN && touch $RANDOM_FN + ( + cd $OUTDIR + for ind in $HYP_INDS + do + out_fn="${ind}_rand.txt" + echo "Consolidating $out_fn" + python >> ../$RANDOM_FN << EOF +import re +import sys + +from TAMO.MotifTools import load + +ind = re.match('(\d+)',"$out_fn").group(1) + +motif = load("$HYP_FN")[int(ind)] + +src = motif.source.split() +if len(src) == 0 : + print 'Got weird motif source: %s\n'%src +src = src[0]+'_%s'%ind + +mot_str = str(motif) + +cverrs = [] +for l in open("$out_fn") : + m = re.match("trial: \d+ mean test error: (\d+\.\d+)$",l) + if m is not None : + cverrs.append(float(m.group(1))) + +print "\t".join([src,mot_str,str(sum(cverrs)/len(cverrs)),repr(cverrs)]) +sys.stdout.flush() + +EOF + done + + ) + + compile_THEME_results.py $MOTIF_FN $RANDOM_FN --output=$OUTPUT_FN + + fi +fi
#!/usr/bin/env python
"""Build a ChIP-seq 'infosite': collect MACS peak statistics, render
histograms and chromosome distributions with matplotlib, and assemble a
reStructuredText/HTML report staged to a web directory.

Python 2 script (print statements, dict.has_key, iterator .next()).
"""

import getpass
import glob
import json
import matplotlib
# AGG backend: headless rendering, no display required
matplotlib.use('AGG')
import matplotlib.pyplot as mp
import os
import re
import shutil
import sys

from collections import defaultdict
from csv import reader, writer, DictReader
from math import log
from optparse import OptionParser
from subprocess import call

from chipsequtil import MACSFile, get_org_settings
from reStUtil import *

usage = '%prog [options] [<peak filename> <peak filename> ...]'
parser = OptionParser(usage=usage)
parser.add_option('-d','--dir',dest='dir',default='.',help='Source directory [default: %default]')
parser.add_option('-n','--name',dest='name',help='Experiment name [default: current directory name]')
parser.add_option('--skip-motif-scan',dest='skip_motif_scan',action='store_true',help="skip motif_scan.py, but still build motifs into document (assumes motif_scan.py was previously run)")
parser.add_option('--skip-motif-stuff',dest='skip_motif_stuff',action='store_true',help="motif stuff takes a long time, manually skip it if no motif results are available or you don't care about them")

# NOTE(review): this dict literal is a bare expression statement -- it is
# evaluated and discarded at import time.  It appears to serve only as an
# example of the *params.json structure; consider making it a comment or
# assigning it to a named constant.
{
    "experiment path": "/nfs/antdata/analysis/100809_P/100809_St7-10ul-p300_degenhar_Fraenkel_L2_mm9_sorted.bed",
    "analysis path": "/net/ventral/nfs/people/labadorf/analysis/100809_P_St7_10ul",
    "stage url": "http://fraenkel.mit.edu/stage/labadorf",
    "peak files": {
        "100809_P_St7_10ul_mfold10,30_pval1e-5": {
            "total tags in control": 9331149,
            "total tags in treatment": 10064908,
            "Range for calculating regional lambda": "1000 bps and 10000 bps",
            "tag size": 35,
            "name": "100809_P_St7_10ul_mfold10,30_pval1e-5",
            "model fold": "10,30",
            "format": "BED",
            "tags after filtering in treatment": 5099883,
            "band width": 150,
            "Redundant rate in control": 0.40999999999999998,
            "Redundant rate in treatment": 0.48999999999999999,
            "effective genome size": 2110000000.0,
            "d": 145,
            "maximum duplicate tags at the same position in control": 1,
            "control file": "cntrl_6-3_sorted_filterbed.txt",
            "MACS version": "1.4.0beta",
            "ChIP-seq file": "exp_100809_St7-10ul-p300_degenhar_Fraenkel_L2_mm9_sorted.bed",
            "tags after filtering in control": 5481613,
            "maximum duplicate tags at the same position in treatment": 2,
            "pvalue cutoff": 1.0000000000000001e-05
        }
    },
    "format": "BED",
    "FDR filter": "none",
    "experiment name": "100809_P_St7_10ul",
    "mapping type": "TSS",
    "pipeline args": {
        "--filter-peaks-args": "--sort-by=pvalue --top=200",
        "--macs-args": "--mfold=10,30 --tsize=35 --bw=150 --format=BED --pvalue=1e-5",
        "--map-args": "--tss --upstream-window=10000 --downstream-window=10000"
    },
    "org": "mm9",
    "control path": "/nfs/antdata/analysis/090828_42JVC/6-3/6-3_sorted_filterbed.txt",
    "mapping window": [
        "10000",
        "10000"
    ],
    "peaks used by THEME": "200",
    "stage_dir": "/nfs/antdata/web_stage/labadorf"
}

if __name__ == '__main__' :

    opts, args = parser.parse_args(sys.argv[1:])

    exp_dir = os.path.abspath(opts.dir)
    exp_name = opts.name if opts.name is not None else os.path.basename(exp_dir)

    # 1. find the param JSON file
    param_json_fn = glob.glob('*params.json')
    if len(param_json_fn) == 0 :
        # no pipeline params available -- synthesize the minimum set of keys
        # the rest of the script needs, based on the current user
        sys.stderr.write('Could not find parameter file, building one as best I can\n')
        curr_user = getpass.getuser()
        json_d = {'analysis path':os.getcwd(),
                  'stage url':'http://fraenkel.mit.edu/stage/'+curr_user,
                  'stage dir':'/nfs/antdata/web_stage/'+curr_user
                 }
    else :
        if len(param_json_fn) > 1 :
            sys.stderr.write('Found more than one parameter file, picking the first one: %s\n'%','.join(param_json_fn))
        param_json_fn = param_json_fn[0]
        json_d = json.load(open(param_json_fn))

    # 2. make a new directory to save all the stuff
    infosite_dir_name = exp_name+'_infosite'
    infosite_path = os.path.join(os.getcwd(),infosite_dir_name)
    if not os.path.exists(infosite_path) :
        os.mkdir(infosite_path)

    infosite_img_path = os.path.join(infosite_path,'images')
    if not os.path.exists(infosite_img_path) :
        os.mkdir(infosite_img_path)

    # 3. setup web staging directory
    # NOTE(review): this reads json_d['stage dir'] but the example params
    # above use the key "stage_dir" -- verify against the pipeline's
    # params writer, a mismatch here would raise KeyError.
    stage_dir_path = os.path.join(json_d['stage dir'],infosite_dir_name)
    if not os.path.exists(stage_dir_path) :
        os.symlink(infosite_path,stage_dir_path)

    # 4. get the peaks files stats, don't want negative peaks
    if len(args) == 0 :
        peaks_fns = glob.glob('*_peaks.xls')
        peaks_fns = filter(lambda x: 'negative' not in x,peaks_fns)
    else :
        peaks_fns = args
    # NOTE(review): analysis_sets is never used below
    analysis_sets = []
    peak_json = json_d['peak files'] = {}

    # analyze all the peak files
    for peak_fn in peaks_fns :
        print 'processing:',peak_fn
        macs_f = MACSFile(peak_fn)
        peak_json[peak_fn] = macs_f.file_info

        # positive peaks: accumulate per-peak metrics and a per-chromosome
        # peak count for the plots below
        peak_stats = defaultdict(list)
        num_peaks = 0
        pos_chr_dist = defaultdict(int)
        for peak in macs_f :
            pos_chr_dist[peak['chr']] += 1
            peak_stats['length'].append(peak['length'])
            peak_stats['tags'].append(peak['tags'])
            peak_stats['pvalue'].append(peak['-10*log10(pvalue)'])
            peak_stats['fold_enrichment'].append(peak['fold_enrichment'])
            peak_stats['fdr'].append(peak['FDR(%)'])
            num_peaks += 1

        peak_json[peak_fn]['positive peaks'] = num_peaks
        peak_json[peak_fn]['reads under peaks'] = sum(peak_stats['tags'])

        # extract paired peaks info out of output.txt
        output_fn = peak_json[peak_fn]['name']+'_output.txt'
        output_regexes = ('#2 number of (paired peaks): (\d+)',)
        for l in open(output_fn) :
            for regex in output_regexes :
                m = re.search(regex,l)
                if m is not None :
                    peak_json[peak_fn][m.group(1)] = int(m.group(2))

        # do the negative peaks
        # negative peak file is now filtered
        neg_peak_fns = glob.glob(peak_json[peak_fn]['name']+'_negative_peaks_*.xls')
        #TODO - do check for file exists
        if neg_peak_fns :
            # accumulate the same per-peak metrics for the negative
            # (control-over-treatment) peaks as for the positive peaks
            neg_peak_fn = neg_peak_fns[0]
            neg_peak_f = MACSFile(neg_peak_fn)

            neg_peak_stats = defaultdict(list)
            num_peaks = 0
            neg_chr_dist = defaultdict(int)
            for peak in neg_peak_f :
                neg_chr_dist[peak['chr']] += 1
                neg_peak_stats['length'].append(peak['length'])
                neg_peak_stats['tags'].append(peak['tags'])
                neg_peak_stats['pvalue'].append(peak['-10*log10(pvalue)'])
                neg_peak_stats['fold_enrichment'].append(peak['fold_enrichment'])
                neg_peak_stats['fdr'].append(peak['FDR(%)'])
                num_peaks += 1

            peak_json[peak_fn]['negative peaks'] = num_peaks
            # NOTE(review): this sums peak_stats['tags'] (the POSITIVE
            # peaks), which looks like a copy-paste slip -- presumably
            # neg_peak_stats['tags'] was intended; confirm before fixing.
            peak_json[peak_fn]['reads under negative peaks'] = sum(peak_stats['tags'])
        else :
            # NOTE(review): in this branch neg_peak_stats and neg_chr_dist
            # are never bound, yet both are referenced unconditionally in
            # the histogram/chromosome plots below -- a missing negative
            # peak file will raise NameError there.
            peak_json[peak_fn]['negative peaks'] = 'NA'
            peak_json[peak_fn]['reads under negative peaks'] = 'NA'

        # save the track lines
        ucsc_track_fn = peak_json[peak_fn]['name']+'_MACS_wiggle_tracks.txt'
        if os.path.exists(ucsc_track_fn) :
            peak_json[peak_fn]['ucsc tracks'] = open(ucsc_track_fn).readlines()

        font = {'size':'9'}
        mp.rc('font',**font)

        figsize = (3.5,3.5)
        subplots_sizes = {'top':0.8,'left':0.15,'right':0.95}
        hist_labels = ('+ peaks','- peaks')
        # create histograms for each of the attributes; each plot is saved
        # under the infosite images dir and its staged URL recorded in the
        # peak's JSON record for use in the reSt document
        len_hist_name = macs_f.file_info['name']+'_length.png'
        len_hist_fn = os.path.join(infosite_img_path,len_hist_name)
        len_hist_url = json_d['stage url']+'/'+infosite_dir_name+'/images/'+len_hist_name
        peak_json[peak_fn]['length distribution url'] = len_hist_url
        mp.figure(figsize=figsize)
        mp.subplots_adjust(**subplots_sizes)
        mp.hist((peak_stats['length'],neg_peak_stats['length']),label=hist_labels,bins=20,log=True)
        mp.title('%s\npeak length distribution'%macs_f.file_info['name'])
        mp.xlabel('peak length')
        mp.ylabel('# peaks')
        mp.legend()
        mp.savefig(len_hist_fn)
        mp.clf()

        tags_hist_name = macs_f.file_info['name']+'_tags.png'
        tags_hist_fn = os.path.join(infosite_img_path,tags_hist_name)
        tags_hist_url = json_d['stage url']+'/'+infosite_dir_name+'/images/'+tags_hist_name
        peak_json[peak_fn]['tag distribution url'] = tags_hist_url
        mp.figure(figsize=figsize)
        mp.subplots_adjust(**subplots_sizes)
        mp.hist((peak_stats['tags'],neg_peak_stats['tags']),label=hist_labels,bins=20,log=True)
        mp.title('%s\npeak tag count distribution'%macs_f.file_info['name'])
        mp.xlabel('# tags')
        mp.ylabel('# peaks')
        mp.legend()
        mp.savefig(tags_hist_fn)
        mp.clf()

        pval_hist_name = macs_f.file_info['name']+'_pval.png'
        pval_hist_fn = os.path.join(infosite_img_path,pval_hist_name)
        pval_hist_url = json_d['stage url']+'/'+infosite_dir_name+'/images/'+pval_hist_name
        peak_json[peak_fn]['pvalue distribution url'] = pval_hist_url
        mp.figure(figsize=figsize)
        mp.subplots_adjust(**subplots_sizes)
        mp.hist((peak_stats['pvalue'],neg_peak_stats['pvalue']),label=hist_labels,bins=20,log=True)
        # NOTE(review): 'p-valuek' below is a typo in the rendered plot
        # title (cannot be changed in a doc-only pass)
        mp.title('%s\npeak -10*log10(p-valuek) distribution'%macs_f.file_info['name'])
        mp.xlabel('-10*log10(p-value)')
        mp.ylabel('# peaks')
        mp.legend()
        mp.savefig(pval_hist_fn)
        mp.clf()

        fold_hist_name = macs_f.file_info['name']+'_fold.png'
        fold_hist_fn = os.path.join(infosite_img_path,fold_hist_name)
        fold_hist_url = json_d['stage url']+'/'+infosite_dir_name+'/images/'+fold_hist_name
        peak_json[peak_fn]['fold distribution url'] = fold_hist_url
        mp.figure(figsize=figsize)
        mp.subplots_adjust(**subplots_sizes)
        mp.hist((peak_stats['fold_enrichment'],neg_peak_stats['fold_enrichment']),label=hist_labels,bins=20,log=True)
        mp.title('%s\npeak fold enrichment distribution'%macs_f.file_info['name'])
        mp.xlabel('fold enrichment')
        mp.ylabel('# peaks')
        mp.legend()
        mp.savefig(fold_hist_fn)
        mp.clf()

        # FDR histogram: positive peaks only (MACS reports FDR only for
        # positive peaks)
        fdr_hist_name = macs_f.file_info['name']+'_fdr.png'
        fdr_hist_fn = os.path.join(infosite_img_path,fdr_hist_name)
        fdr_hist_url = json_d['stage url']+'/'+infosite_dir_name+'/images/'+fdr_hist_name
        peak_json[peak_fn]['fdr distribution url'] = fdr_hist_url
        mp.figure(figsize=figsize)
        mp.subplots_adjust(**subplots_sizes)
        mp.hist(peak_stats['fdr'],label=hist_labels[0],bins=20,log=True)
        mp.title('%s\npeak fdr distribution'%macs_f.file_info['name'])
        mp.xlabel('fdr')
        mp.ylabel('# peaks')
        mp.legend()
        mp.savefig(fdr_hist_fn)
        mp.clf()

        # per-chromosome peak count bar chart; chromosome list comes from
        # the organism's UCSC chrom.sizes file when an org is configured,
        # otherwise from the chromosomes actually observed in the peaks
        chr_dist_name = macs_f.file_info['name']+'_chr_dist.png'
        chr_dist_fn = os.path.join(infosite_img_path,chr_dist_name)
        chr_dist_url = json_d['stage url']+'/'+infosite_dir_name+'/images/'+chr_dist_name
        peak_json[peak_fn]['chr distribution url'] = chr_dist_url
        chromos = []
        if json_d.has_key('org') :
            chr_sizes_fn = get_org_settings(json_d['org'])['ucsc_chrom_sizes']
            chromos = [r[0] for r in reader(open(chr_sizes_fn),delimiter='\t')]
        else :
            chromos = list(set(pos_chr_dist.keys()).union(neg_chr_dist.keys()))
        # "standard" chromosomes are chr1..chrN, chrM, chrX, chrY; anything
        # else (random contigs etc.) is lumped into an 'Other' bucket below
        standard_chromos = filter(lambda x: re.search('^chr[0-9MXY]+$',x) is not None,chromos)

        # hack chrM, chrX and chrY so they sort right
        if 'chrM' in standard_chromos :
            standard_chromos[standard_chromos.index('chrM')] = 'chr100'
        if 'chrX' in standard_chromos :
            standard_chromos[standard_chromos.index('chrX')] = 'chr101'
        if 'chrY' in standard_chromos :
            standard_chromos[standard_chromos.index('chrY')] = 'chr102'

        standard_chromos.sort(key=lambda x: int(x.replace('chr','')))

        # unhack chrM, chrX and chrY so they display right
        if 'chr100' in standard_chromos :
            standard_chromos[standard_chromos.index('chr100')] = 'chrM'
        if 'chr101' in standard_chromos :
            standard_chromos[standard_chromos.index('chr101')] = 'chrX'
        if 'chr102' in standard_chromos :
            standard_chromos[standard_chromos.index('chr102')] = 'chrY'

        other_chromos = filter(lambda x: re.search('^chr[0-9MXY]+$',x) is None,chromos)

        pos_plot_chr_dist = defaultdict(int)
        neg_plot_chr_dist = defaultdict(int)
        for chrom in standard_chromos :
            pos_plot_chr_dist[chrom] += pos_chr_dist.get(chrom,0)
            neg_plot_chr_dist[chrom] += neg_chr_dist.get(chrom,0)
        for chrom in other_chromos :
            pos_plot_chr_dist['Other'] += pos_chr_dist.get(chrom,0)
            neg_plot_chr_dist['Other'] += neg_chr_dist.get(chrom,0)
            # NOTE(review): this append appears to run once per
            # non-standard chromosome, adding 'Other' to chromos multiple
            # times -- confirm intended placement (outside the loop?)
            chromos.append('Other')
        mp.figure(figsize=figsize)
        mp.subplots_adjust(bottom=0.18,**subplots_sizes)
        # paired bars: positive peaks (blue) next to negative peaks (green)
        mp.bar(range(len(chromos)),
               [pos_plot_chr_dist[k] for k in chromos],
               width=0.45,
               color='b',
               label='Positive'
              )
        mp.bar([x+0.45 for x in range(len(chromos))],
               [neg_plot_chr_dist[k] for k in chromos],
               width=0.45,
               color='g',
               label='Negative'
              )
        mp.xticks([x+0.45 for x in range(len(chromos))],chromos,rotation=90)
        mp.title('%s\nPeaks by chromosome'%macs_f.file_info['name'])
        mp.xlabel('Chromosome')
        mp.ylabel('# peaks')
        mp.legend()
        mp.savefig(chr_dist_fn)
        mp.clf()

        # pos vs neg peaks
        pos_v_neg_name = '%s_pos_v_neg.png'%macs_f.file_info['name']
        pos_v_neg_fn = os.path.join(infosite_img_path,pos_v_neg_name)
        pos_v_neg_url = json_d['stage url']+'/'+infosite_dir_name+'/images/'+pos_v_neg_name
        peak_json[peak_fn]['pos v neg url'] = pos_v_neg_url
        # delegate the scatter plot to an external pipeline script
        cmd = 'plot_pos_vs_neg_peaks.py --output=%s %s %s'%(pos_v_neg_fn,peak_fn, neg_peak_fn)
        sys.stderr.write(cmd+'\n')
        r = call(cmd,shell=True)

        # motif stuff
        if opts.skip_motif_scan or opts.skip_motif_stuff :
            sys.stderr.write('Obediently skipping motif stuff\n')
        else :
            # not exactly sure the best way to find the filtered macs file yet,
            # just take the .xls file with the longest filename?
            filtered_peak_fns = glob.glob('%s_peaks_*'%macs_f.file_info['name'])
            filtered_peak_fns.sort(key=lambda x: len(x),reverse=True)
            filtered_peak_fn = filtered_peak_fns[0]

            motif_results_fns = glob.glob('%s_motifs_beta*_cv*[0-9].tamo'%macs_f.file_info['name'])
            motif_results_fn = motif_results_fns[0]
            #TODO - do check for file exists

            # motif_scan.py <org> <peak fn> <TAMO motif fn>
            fixed_peak_width = ''
            if json_d['fixed peak width'] != 'none' :
                fixed_peak_width = '--fixed-peak-width=%s'%json_d['fixed peak width']

            cmd = 'motif_scan.py %s --dir=%s/images/ %s %s %s'
            cmd = cmd%(fixed_peak_width,infosite_dir_name,json_d['org'],filtered_peak_fn,motif_results_fn)
            sys.stderr.write(cmd+'\n')
            call(cmd,shell=True)

            # pot_peaks_vs_motifs.py <peaks fn> <seq score fn> <bg score fn>


    # 5. build reSt document
    reSt_fn = exp_name+'_info.rst'
    reSt_path = os.path.join(infosite_path,reSt_fn)
    reSt_html_name = exp_name+'_info.html'
    reSt_html_path = os.path.join(infosite_path,reSt_html_name)
    reSt_url = json_d['stage url'] + '/' + infosite_dir_name + '/' + reSt_html_name
    doc = ReStDocument(reSt_path)
    doc.add(ReStSection("Infopage for %s"%exp_name))

    # basic experiment stats table
    # ident: fall back to 'unknown' for missing/falsy values
    ident = lambda x: x or 'unknown'
    stat_key_labels_fmts = [
        ('org','Organism',ident),
        ('analysis path','Analysis Path',ident),
        ('experiment path','Experiment Path',ident),
        ('control path','Control Path',ident),
        ('format','Read Format',ident),
        ('FDR filter','FDR filter',ident),
        ('mapping type','Gene Mapping Type',ident),
        ('mapping window','Gene Mapping Window',lambda x: x and '-%s,%s'%tuple(x)),
        ('peaks used by THEME','Peaks used by THEME',ident)
    ]
    stat_rows = [('**%s**'%label, fmt(json_d.get(key))) for key,label,fmt in stat_key_labels_fmts]
    doc.add(ReStSimpleTable(None,stat_rows))

    doc.add(ReStSection('MACS Peak File Stats',level=2))

    # go through peak files
    peak_recs = json_d['peak files']
    # fl_str: 2-sig-fig float formatting; NOTE(review) it yields None for
    # missing and 0/0.0 values because of the `x and ...` short-circuit
    fl_str = lambda x: x and '%.2g'%float(x)
    stat_key_labels_fmts = [
        ('paired peaks','*paired peaks*',ident),
        ('positive peaks','*positive peaks*',ident),
        ('negative peaks','*negative peaks*',ident),
        ('reads under peaks','*reads under positive peaks*',ident),
        ('total tags in treatment','*Treatment Tags*',ident),
        ('tags after filtering in treatment','after filtering',ident),
        ('Redundant rate in treatment','redunancy rate',fl_str),
        ('maximum duplicate tags at the same position in treatment','max dup. tags',ident),
        ('total tags in control','*Control Tags*',ident),
        ('tags after filtering in control','after filtering',ident),
        ('Redundant rate in control','redunancy rate',fl_str),
        ('maximum duplicate tags at the same position in control','max dup. tags',ident),
        ('peak tag count filter','*Minimum peak tag count*',ident),
        ('d','*MACS d*',ident),
        ('band width','*band width*',ident),
        ('MACS version','*MACS version*',ident),
        # NOTE(review): this formatter raises TypeError if the
        # 'pvalue cutoff' key is missing (peak_stats.get -> None)
        ('pvalue cutoff','*p-value cutoff*',lambda x: '1e%d'%int(log(x,10))),
    ]

    for peak_fn,peak_stats in peak_recs.items() :

        # add the new section and stats table
        doc.add(ReStSection(peak_fn,level=3))
        stat_rows = [('*%s*'%label, fmt(peak_stats.get(key))) for key,label,fmt in stat_key_labels_fmts]
        doc.add(ReStSimpleTable(None,stat_rows))

        # link to the peaks file
        peak_infosite_name = os.path.join(infosite_dir_name,peak_fn)
        peak_infosite_path = os.path.abspath(peak_infosite_name)
        peak_infosite_url = json_d['stage url'] + '/' + peak_infosite_name
        call('cp %s %s'%(peak_fn,os.path.join(infosite_dir_name,peak_fn)),shell=True)
        doc.add(ReStSimpleTable(None,[('**MACS Peaks File**','`%s`_'%peak_infosite_url)]))
        doc.add(ReStHyperlink(peak_infosite_url,url=peak_infosite_url))

        # UCSC track info
        if peak_stats.has_key('ucsc tracks') :
            ucsc_tbl = ReStSimpleTable(('**UCSC Genome Browser Track Lines**',),
                                       [[x] for x in peak_stats['ucsc tracks']])
            doc.add(ucsc_tbl)
        else :
            doc.add(ReStSimpleTable(None,[['UCSC integration was not enabled for this experiment']]))

        # peak quality plots
        img_tbl1 = ReStSimpleTable(None, [
            [
                ReStImage(peak_stats['pos v neg url'],options={'width':'600px','align':'center'}),
            ]
          ]
        )
        doc.add(img_tbl1)

        img_tbl2 = ReStSimpleTable(None, [
            [
                ReStImage(peak_stats['length distribution url'],options={'width':'250px','align':'center'}),
                ReStImage(peak_stats['tag distribution url'],options={'width':'250px','align':'center'}),
                ReStImage(peak_stats['pvalue distribution url'],options={'width':'250px','align':'center'})
            ],
            [
                ReStImage(peak_stats['fold distribution url'],options={'width':'250px','align':'center'}),
                ReStImage(peak_stats['fdr distribution url'],options={'width':'250px','align':'center'}),
                ReStImage(peak_stats['chr distribution url'],options={'width':'250px','align':'center'})
            ]
          ]
        )
        doc.add(img_tbl2)

        # gene info
        gene_fn = peak_stats['name']+'_genes.txt'
        gene_link = os.path.join(infosite_dir_name,gene_fn)
        if not os.path.exists(gene_link) :
            shutil.copyfile(gene_fn,gene_link)
        gene_url = json_d['stage url']+'/'+gene_link

        # gather other gene mapping stats
        # knownGeneID
        # geneSymbol
        # chr
        # start
        # end
        # length
        # summit
        # tags
        # -10*log10(pvalue)
        # fold_enrichment
        # FDR(%)
        # peak
        # loc
        # dist
        # from
        # feature
        # score
        # map
        # type
        # map
        # subtype

        # collect unique mapped gene IDs/symbols and each symbol's best
        # (maximum) -10*log10(p-value) across its peaks
        gene_reader = DictReader(open(gene_fn),delimiter='\t')
        gene_stats = defaultdict(set)
        gene_pvals = defaultdict(float)
        for rec in gene_reader :
            gene_stats['num knownGenes'].add(rec['knownGeneID'])
            gene_stats['num geneSymbols'].add(rec['geneSymbol'])
            gene_pvals[rec['geneSymbol']] = max(gene_pvals[rec['geneSymbol']],float(rec['-10*log10(pvalue)']))
        gene_pvals = gene_pvals.items()
        gene_pvals.sort(key=lambda x: x[1],reverse=True)
        for k,v in gene_pvals[:20]:
            print k,v
        gene_mapping_data = [('**# knownGenes mapped**',len(gene_stats['num knownGenes'])),
                             ('**# gene symbols mapped**',len(gene_stats['num geneSymbols'])),
                             ('**Top 10 gene symbols**',','.join([x[0] for x in gene_pvals[:10]])),
                             ('**All gene mappings**','`%s`_'%gene_url)
                            ]

        # plots from plot_peak_loc_dist.py
        gene_pie_name = exp_name+'_gene_map.png'
        peak_pie_name = exp_name+'_peak_map.png'
        hist_name = exp_name+'_peak_dist.png'
        pval_bar_name = exp_name+'_pval_bar.png'
        peak_loc_d = {'out_dir':infosite_path,
                      'gene_pie_fn':os.path.join(infosite_path,'images',gene_pie_name),
                      'peak_pie_fn':os.path.join(infosite_path,'images',peak_pie_name),
                      'pval_bar_fn':os.path.join(infosite_path,'images',pval_bar_name),
                      'hist_fn':os.path.join(infosite_path,'images',hist_name),
                      'peak_fn':peak_fn,
                      'gene_name':gene_fn
                     }
        cmd = 'plot_peak_loc_dist.py --save -d %(out_dir)s -g %(gene_pie_fn)s ' \
              '-p %(peak_pie_fn)s -f %(hist_fn)s -b %(pval_bar_fn)s ' \
              '%(peak_fn)s %(gene_name)s'
        sys.stderr.write(cmd%peak_loc_d+'\n')
        call(cmd%peak_loc_d,shell=True)
        peak_stats['gene map url'] = json_d['stage url']+'/'+infosite_dir_name+'/images/'+gene_pie_name
        peak_stats['peak map url'] = json_d['stage url']+'/'+infosite_dir_name+'/images/'+peak_pie_name
        peak_stats['pval bar url'] = json_d['stage url']+'/'+infosite_dir_name+'/images/'+pval_bar_name
        peak_stats['dist url'] = json_d['stage url']+'/'+infosite_dir_name+'/images/'+hist_name

        # make links to the different peaks files
        feature_patts = ('promoter.txt','gene_exon.txt','gene_intron.txt','after.txt','intergenic.xls')
        feature_data = []
        feature_urls = []

        for patt in feature_patts :
            feature_fn = '%s_*_%s'%(peak_stats['name'],patt)
            feature_path = glob.glob(os.path.join(infosite_dir_name,feature_fn))
            if len(feature_path) == 0 :
                sys.stderr.write('Warning: %s could not be found, skipping feature type\n'%os.path.join(infosite_dir_name,feature_fn))
                continue
            feature_path = feature_path[0]
            feature_url = json_d['stage url']+'/'+feature_path

            # create UCSC formatted versions of the files
            # (collapse the chr/start/end columns into a single
            # "chr:start-end" locus column that UCSC accepts)
            if patt.endswith('.txt') : # these have gene columns
                feature_type = patt.replace('.txt','')
                # NOTE(review): ucsc_feature_fn is derived from feature_fn,
                # which still contains the glob '*' -- the output filename
                # will contain a literal '*'; likely feature_path's
                # basename was intended.  Confirm before fixing.
                ucsc_feature_fn = feature_fn.replace('.txt','_ucsc.txt')
                st,en = 2,4
            elif patt.endswith('.xls') :
                feature_type = patt.replace('.xls','')
                ucsc_feature_fn = feature_fn.replace('.xls','_ucsc.xls')
                st,en = 0,2

            ucsc_feature_path = os.path.join(infosite_dir_name,ucsc_feature_fn)
            ucsc_feature_f = open(ucsc_feature_path,'w')
            ucsc_feature_writer = writer(ucsc_feature_f,delimiter='\t')
            for l in reader(open(feature_path),delimiter='\t') :
                rec = l[0:st] + \
                      ['%s:%s-%s'%tuple(l[st:en+1])] + \
                      l[en+1:]
                ucsc_feature_writer.writerow(rec)
            ucsc_feature_f.close()

            ucsc_feature_url = json_d['stage url']+'/'+ucsc_feature_path

            feature_data.append(('**%s peaks**'%feature_type,'`%s`_ `UCSC %s`_'%(feature_url,feature_type)))
            feature_urls.append(ReStHyperlink(feature_url,url=feature_url))
            feature_urls.append(ReStHyperlink('UCSC %s'%feature_type,url=ucsc_feature_url))

        gene_mapping_data.extend(feature_data)
        feat_tbl = ReStSimpleTable(('**Gene mapping data**',''),gene_mapping_data)
        doc.add(feat_tbl)
        doc.add(ReStHyperlink(gene_url,url=gene_url))
        for url in feature_urls :
            doc.add(url)

        img_tbl3 = ReStSimpleTable(None, [
            [
                ReStImage(peak_stats['gene map url'],options={'align':'center'}),
                ReStImage(peak_stats['peak map url'],options={'align':'center'})
            ],
            [
                ReStImage(peak_stats['pval bar url'],options={'align':'center'}),
                ReStImage(peak_stats['dist url'],options={'align':'center'})
            ]
          ]
        )
        doc.add(img_tbl3)

        # now put some motif stuff up there


        if opts.skip_motif_stuff :
            sys.stderr.write('Obediently skipping even more motif stuff\n')
        else :
            # THEME refines all motifs, display the top 30

            # for now, just list a table of the top 30 significant, unrefined motifs
            doc.add(ReStSection('%s Top 30 Refined Motif Results'%peak_stats['name'],level=3))
            motif_results_fns = glob.glob('%s_motifs_beta*_cv*[0-9].txt'%macs_f.file_info['name'])
            #catRun_mfold10,30_pval1e-5_motifs_beta0.0_cv5.txt
            #TODO - do check for file exists

            motif_results_fn = motif_results_fns[0]

            motif_reader = reader(open(motif_results_fn),delimiter='\t')

            # first row is the column header; remaining rows are formatted
            # per-column and collected for the top-30 and full tables
            motif_header = motif_reader.next()
            motif_data = []
            top_n = 30
            motif_fmts = (ident,ident,int,fl_str,fl_str,fl_str,fl_str,fl_str,fl_str)
            motif_plot_urls = []
            for rec in motif_reader :
                motif_data.append([f(x) for f,x in zip(motif_fmts,rec)])
                """
                if rec[2] in motif_sig_inds_d.keys() :
                    from_id = motif_sig_inds_d[rec[2]]
                    try :
                        old_id_fn = glob.glob(infosite_dir_name+'/images/*_%d_peakmot.png'%from_id)[0]
                        new_id_fn = old_id_fn.replace('_%d_'%from_id,'_%s_'%rec[2])
                        os.rename(old_id_fn,new_id_fn)
                    except :
                        sys.stderr.write("Couldn't rename file for pattern %s, just " \
                                         "assuming its there\n"%(infosite_dir_name+'/images/*_%d_peakmot.png'%from_id))
                """
                # NOTE(review): [0] raises IndexError when no peakmot image
                # exists for this motif index -- TODO confirm motif_scan.py
                # always produces one per listed motif
                new_id_fn = glob.glob(infosite_dir_name+'/images/*_%s_peakmot.png'%rec[2])[0]
                motif_plot_urls.append(json_d['stage url']+'/'+new_id_fn)

            doc.add(ReStSimpleTable(['**%s**'%x for x in motif_header],motif_data[:top_n]))

            # create another file with the full table
            motif_results_base, motif_results_ext = os.path.splitext(motif_results_fn)
            motif_doc_fn = motif_results_base+'.rst'
            motif_doc_path = os.path.join(infosite_path,motif_doc_fn)
            motif_doc_html_fn = motif_results_base+'.html'
            motif_doc_html_path = os.path.join(infosite_path,motif_doc_html_fn)
            motif_doc_url = json_d['stage url']+'/'+infosite_dir_name+'/'+motif_doc_html_fn
            motif_doc = ReStDocument(motif_doc_path)
            motif_doc.add(ReStSection('%s Full Motif Results'%peak_stats['name']))
            motif_doc.add('`Back to main infopage`_')
            motif_doc.add(ReStSimpleTable(['**%s**'%x for x in motif_header],motif_data))
            motif_doc.add('`Back to main infopage`_')
            motif_doc.add(ReStHyperlink('Back to main infopage',url=reSt_url))
            motif_doc.write()
            motif_doc.close()
            rst2html_call = 'rst2html.py --stylesheet-path=/nfs/antdata/web_stage/css/lsr.css ' \
                            '%s %s'%(motif_doc_path,motif_doc_html_path)
            sys.stderr.write(rst2html_call+'\n')
            r = call(rst2html_call,shell=True)
            doc.add('`All refined motifs`_')
            doc.add(ReStHyperlink('All refined motifs',url=motif_doc_url))

            # individual motif plots, arranged three to a table row
            plt_tbl = []
            for i,url in enumerate(motif_plot_urls[:30]) :
                if i%3 == 0 :
                    plt_tbl.append([])
                plt_tbl[-1].append(ReStImage(url))

            doc.add(ReStSimpleTable(('**Peak strength vs refined motif strength**','(based on top 2000 peak sequences by pvalue)',''),plt_tbl))

    # NOTE(review): indentation level of the two lines below (inside vs
    # after the per-peak-file loop) was lost in the diff rendering --
    # placed after the loop here; verify against the original file.
    doc.write()
    doc.close()

    # 6. convert reSt to PDF and HTML
    rst2html_call = 'rst2html.py --stylesheet-path=/nfs/antdata/web_stage/css/lsr.css ' \
                    '%s %s'%(reSt_path,reSt_html_path)
    sys.stderr.write(rst2html_call+'\n')
    r = call(rst2html_call,shell=True)

    pdf_name = exp_name+'_info.pdf'
    pdf_path = os.path.join(infosite_path,pdf_name)
    r = call('rst2pdf %s -o %s'%(reSt_path,pdf_path),shell=True)

    # 7. write out url to infosite
    print json_d['stage url']+'/'+infosite_dir_name+'/'+reSt_html_name
    open(infosite_dir_name+'_url.txt','w').write(json_d['stage url']+'/'+infosite_dir_name+'/'+reSt_html_name+'\n')
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chipsequtil-master/scripts/chipseq_pipeline.py Mon Mar 07 16:18:10 2016 -0500 @@ -0,0 +1,331 @@ +#!/usr/bin/env python + +import os +from subprocess import Popen, PIPE +import string +import sys +from optparse import OptionParser, OptionGroup, SUPPRESS_HELP + +from pypeline import Pypeline, ProcessPypeStep as PPS, PythonPypeStep as PyPS, parse_steplist +from chipsequtil import get_file_parts, get_org_settings +from chipsequtil.util import MultiLineHelpFormatter +from TAMO import MotifTools +from TAMO.MD.THEME import parser as theme_parser + +usage = "%prog [options] <organism> <experiment alignment filename> [<control alignment filename>]" +description = """1st generation ChIPSeq analysis pipeline: + + - runs MACS to find peaks and sorts peaks by p-value + - sorts peaks by pvalue and isolates top *n* + - maps peaks to genes + - extracts fasta files for gene peaks in experiments + - constructs background sequences matching foreground distribution + - runs THEME.py on input sequences w/ refinement + - builds an infosite with stats from this analysis + +Control input file is optional. 
*organism* argument is passed to the +*org_settings.py* command to specify organism specific parameters, ensure +that the following commands return valid paths: + +If running MACS: + - org_settings.py <organism> genome_size + - org_settings.py <organism> genome_dir + - org_settings.py <organsim> refgene_anno_path + +If running THEME: + - org_settings.py <organism> theme_hypotheses + - org_settings.py <organism> theme_markov + +""" + +epilog = """Note: it is advised to leave the --*-args arguments unchanged +unless you really know what you're doing.""" + +parser = OptionParser(usage=usage,description=description,epilog=epilog,formatter=MultiLineHelpFormatter()) +parser.add_option('--auto',dest='auto',action='store_true',help='run all steps non-interactively (for batch mode, e.g.)') +parser.add_option('--steplist',dest='steplist',default='',help='with --auto, run specific steps') +parser.add_option('--exp-name',dest='exp_name',default=os.path.basename(os.getcwd()),help='name for the experiment/pipeline, used for convenience [default: current directory name]') +parser.add_option('--bed-args',dest='bed_args',default='--stdout --chromo-strip=.fa',help='double quote wrapped arguments for gerald_to_bed.py [default: %default]') +#parser.add_option('--stats-args',dest='stats_args',default='',help='double quote wrapped arguments for gerald_stats.py [default: %default]') +parser.add_option('--macs-exec',dest='macs_exec',default='macs14',help='the executable to use for MACS, if not an absolute path it needs to be on your shell environment path [default: %default]') +parser.add_option('--macs-args',dest='macs_args',default='--pvalue=1e-5',help='double quote wrapped arguments for macs, only changing --mfold, --tsize, --bw, and --pvalue recommended [default: %default]') +parser.add_option('--map-args',dest='map_args',default='--tss --upstream-window=10000 --downstream-window=10000',help='double quote wrapped arguments for mapping peaks to genes [default: %default]') 
+parser.add_option('--filter-peaks-args',dest='filter_peaks_args',default="--sort-by=pvalue --top=1000 -f 'tags>20'",help='double quote wrapped arguments for filter_macs_peaks.py [default: %default]') +parser.add_option('--filter-neg-peaks-args',dest='filter_neg_peaks_args',default="-f 'tags>20'",help='double quote wrapped arguments for filter_macs_peaks.py applied to negative peaks [default: %default]') +parser.add_option('--peaks-to-fa-args',dest='peaks_to_fa_args',default='--fixed-peak-width=200',help='double quote wrapped arguments for peaks_to_fasta.py [default: %default]') +parser.add_option('--bg-exec',dest='bg_exec',default='rejection_sample_fasta.py',help='the executable to use for generating background sequences for THEME, if not an absolute path it needs to be on your shell environment path [default: %default]') +parser.add_option('--bg-args',dest='bg_args',default='--num-seq=2.1x',help='double quote wrapped arguments for background sequence generation utility [default: %default]') +parser.add_option('--theme-args',dest='theme_args',default='--beta=0.7 --cv=5 --trials=25',help='double quote wrapped arguments for THEME.py [default: %default]') +parser.add_option('--motif-pval-cutoff',dest='motif_pval',type='float',default=1e-5,help='the p-value cutoff for sending non-refined enrichmed motifs to THEME for refinement') +parser.add_option('--parallelize',dest='parallelize',action='store_true',help='parallelize portions of the pipeline using qsub, only works from SGE execution hosts') +parser.add_option('--ucsc',dest='ucsc',action='store_true',default=False,help='perform tasks for automated integration with UCSC genome browser [default:%default]') +parser.add_option('--build-infosite-args',dest='infosite_args',default='',help='arguments to pass to build_chipseq_infosite.py [default: None]') + +ucsc_group = OptionGroup(parser,"UCSC Integration Options (with --ucsc)") +ucsc_group.add_option('--stage-dir',dest='stage_dir',default='./',help='root directory where 
UCSC integration files should be made available [default: %default]') +ucsc_group.add_option('--stage-url',dest='stage_url',default='http://localhost/',help='URL where UCSC integration files will be made available over the web [default: %default]') +parser.add_option_group(ucsc_group) + +#parallel_group = OptionGroup(parser,"Parallelization Options (with --parallelize)",description="These options are relevant to parallelization of the pipeline, functionality is in beta status until further notice") +#parallel_group.add_option('--split-args',dest='split_args',default='--type=count --arg=16',help='double quote wrapped arguments for split_file.py [default: %default]') +#parallel_group.add_option('--qsub-args',dest='qsub_args',default='--die-on-err',help='double quote wrapped arguments for split_qsub.py [default: %default]') +#parser.add_option_group(parallel_group) + +parser.add_option('--print-args',dest='print_args',action='store_true',help=SUPPRESS_HELP) # secret ninja option + + +if __name__ == '__main__' : + + # parse command line arguments + opts, args = parser.parse_args(sys.argv[1:]) + + # stick it up here, so when we print out args it's updated + if opts.ucsc and opts.macs_args.find('--wig') == -1 : + opts.macs_args += " --wig" + + # just print out all options as passed in for script generating purposes + if opts.print_args : + opts_strs = [] + all_opts = [] + all_opts.extend(parser.option_list) + all_opts.extend(*[x.option_list for x in parser.option_groups]) + for opt in all_opts : + opt_str = opt.get_opt_string() + if opt_str in ['--help','--print-args'] : + pass + elif opt_str == '--steplist' and not opts.auto : + pass + #elif opt_str in ['--stage-dir','--stage-url'] and not opts.ucsc : + # pass + #elif opt_str in ['--split-args','--qsub-args'] and not opts.parallelize : + # pass + elif opt.action == 'store' : + arg = str(getattr(opts,opt.dest)) + if arg.count(' ') > 0 or arg.find(' -') != -1 or arg.startswith('-') or arg.find('--') != -1 : + 
opts_strs.append(' %s="%s"'%(opt.get_opt_string(),str(getattr(opts,opt.dest)))) + else : + opts_strs.append(' %s=%s'%(opt.get_opt_string(),str(getattr(opts,opt.dest)))) + elif opt.action == 'store_true' and getattr(opts,opt.dest) : + opts_strs.append(' %s'%opt.get_opt_string()) + opts_strs.append(' $@') + sys.stdout.write(' \\\n'.join(opts_strs)+'\n') + sys.exit(0) + + if len(args) < 2 : + parser.error('Must provide two non-option arguments') + + # filenames and paths + organism, experiment_fn = args[0:2] + control_fn = None + if len(args) > 2 : + control_fn = args[2] + + org_settings = get_org_settings(organism) + refgene_fn = org_settings['refgene_anno_path'] + kg_ref = org_settings['known_gene_anno_path'] + kg_xref = org_settings['known_gene_xref_path'] + + exp_fpath,exp_fname,exp_fbase,exp_fext = get_file_parts(experiment_fn) + exp_wrk_dir = os.path.abspath('.exp_%s_%s'%(exp_fbase,opts.exp_name)) + + if control_fn : + cnt_fpath,cnt_fname,cnt_fbase,cnt_fext = get_file_parts(control_fn) + cnt_wrk_dir = os.path.abspath('.cnt_%s_%s'%(cnt_fbase,opts.exp_name)) + + # the pipeline + #log_fn = os.path.join(opts.exp_name+'_pipeline.log') + pipeline = Pypeline('Analysis pipeline for %s'%opts.exp_name) + + steps = [] + + #if opts.parallelize : + # # split up files + # calls = ["mkdir %s"%exp_wrk_dir, + # "split_file.py %s --outdir=%s %s"%(opts.split_args,exp_wrk_dir,experiment_fn),] + # if control_fn : + # calls.extend(["mkdir %s"%cnt_wrk_dir, + # "split_file.py %s --outdir=%s %s"%(opts.split_args,cnt_wrk_dir,control_fn), + # ]) + # steps.append(PPS('Split files',calls,env=os.environ)) + + ############################################################################ + # run macs + ############################################################################ + cnt_flag = '' + if control_fn : + cnt_flag = '-c %s'%control_fn + + # parse macs_args so we can extract mfold and pvalue...in a rather silly way + macs_mfold = [x for x in opts.macs_args.split(' ') if 'mfold' in x] + 
macs_mfold = macs_mfold[0].split('=',1)[1] if len(macs_mfold) >= 1 else 'DEF' + + macs_pvalue = [x for x in opts.macs_args.split(' ') if 'pvalue' in x] + macs_pvalue = macs_pvalue[0].split('=',1)[1] if len(macs_pvalue) >= 1 else 'DEF' + macs_name = opts.exp_name+'_mfold%s_pval%s'%(macs_mfold,macs_pvalue) + + macs_peaks_fn = macs_name+'_peaks.xls' + macs_neg_peaks_fn = macs_name+'_negative_peaks.xls' + macs_screen_output_fn = macs_name+'_output.txt' + + macs_d = {'exp_fn':experiment_fn, + 'cnt_flag':cnt_flag, + 'name':macs_name, + 'macs_exec':opts.macs_exec, + 'macs_args':opts.macs_args, + 'macs_out':macs_screen_output_fn, + 'gsize':org_settings['genome_size'], + } + calls = ["%(macs_exec)s --gsize=%(gsize)s -t %(exp_fn)s %(cnt_flag)s --name=%(name)s %(macs_args)s 2>&1 | tee %(macs_out)s"%macs_d] + steps.append(PPS('Run MACS',calls,env=os.environ)) + + + ############################################################################ + # process and stage wiggle files + ############################################################################ + if opts.ucsc : + wiggle_dir = macs_name+'_MACS_wiggle' + ucsc_d = {'org':organism, + 'stage_dir':opts.stage_dir, + 'stage_url':opts.stage_url, + 'macs_dir':wiggle_dir, + } + + calls = ["integrate_macs_ucsc.py --auto %(org)s %(stage_dir)s %(stage_url)s %(macs_dir)s"%ucsc_d] + steps.append(PPS("UCSC Integration",calls)) + + + ############################################################################ + # map peaks to genes + ############################################################################ + map_fn = "%s_genes.txt"%macs_name + map_stats_fn = "%s_genes_stats.xls"%macs_name + map_d = {'kg_ref':kg_ref, + 'kg_xref':kg_xref, + 'peaks_fn':macs_peaks_fn, + 'bed_peaks_fn':macs_name+'_peaks.bed', + 'map_fn':map_fn, + 'map_stats_fn':map_stats_fn, + 'map_args':opts.map_args + } + # make sure peak files don't have .fa at the end of their chromosomes + calls = ["sed -i 's/\.fa//g' %(peaks_fn)s %(bed_peaks_fn)s"%map_d] + c = 
"map_peaks_to_known_genes.py %(map_args)s --map-output=%(map_fn)s " + \ + "--detail --stats-output=%(map_stats_fn)s %(kg_ref)s %(kg_xref)s " + \ + "%(peaks_fn)s" + calls.append(c%map_d) + steps.append(PPS('Map peaks to genes',calls,env=os.environ)) + + + ############################################################################ + # filter macs peaks + ############################################################################ + filtered_d = {'filter_peaks_args':opts.filter_peaks_args, + 'filter_neg_peaks_args':opts.filter_neg_peaks_args, + 'peaks_fn':macs_peaks_fn, + 'neg_peaks_fn':macs_neg_peaks_fn + } + c = "filter_macs_peaks.py --print-encoded-fn --encode-filters " \ + "%(filter_peaks_args)s %(peaks_fn)s" + filtered_peaks_fn = Popen(c%filtered_d,shell=True,stdout=PIPE).communicate()[0].strip() + filtered_neg_peaks_fn = macs_name + '_negative_peak_filt.xls' + calls = ["filter_macs_peaks.py --encode-filters %(filter_peaks_args)s %(peaks_fn)s"%filtered_d] + if control_fn is not None : + calls.append("filter_macs_peaks.py --encode-filters %(filter_neg_peaks_args)s %(neg_peaks_fn)s"%filtered_d) + steps.append(PPS('Filter MACS peaks',calls,env=os.environ)) + + + ############################################################################ + # THEME + ############################################################################ + # extract foreground and generate background sequences + fg_fn = filtered_peaks_fn.replace('.xls','.fa') + fg_d = {'opts':opts.peaks_to_fa_args, + 'organism':organism, + 'fg_fn':fg_fn, + 'peaks_fn':filtered_peaks_fn} + calls = ["peaks_to_fasta.py %(opts)s --output=%(fg_fn)s %(organism)s %(peaks_fn)s"%fg_d] + steps.append(PPS('Peaks to Fasta',calls,env=os.environ)) + + bg_fn = "%s_bg.fa"%macs_name + bg_d = {'opts':opts.bg_args, + 'organism':organism, + 'fg_fn':fg_fn, + 'bg_fn':bg_fn} + calls = ["rejection_sample_fasta.py %(opts)s --output=%(bg_fn)s %(organism)s %(fg_fn)s"%bg_d] + steps.append(PPS('Generate Background Sequences',calls,env=os.environ)) 
+ + # run THEME on fg + theme_opts, theme_args = theme_parser.parse_args(opts.theme_args.split(' ')) + hyp_fn = org_settings['theme_hypotheses'] + markov_fn = org_settings['theme_markov'] + + # run THEME w/ randomization by running each motif individuall + # this is because TAMO.MD has a memory leak + raw_motif_fn = '%s_motifs_beta%s_cv%s.tamo'%(macs_name,theme_opts.beta,theme_opts.cv) + random_cv_fn = '%s_motifs_beta%s_cv%s_rand.txt'%(macs_name,theme_opts.beta,theme_opts.cv) + + # new old THEME call + #Usage: THEME.sh [options] <FG_FASTA> <BG_FASTA> <HYP_FN> <MARKOV> + # + #Run old THEME version + # + #Options: + # -h, --help show this help message and exit + # --hyp-indices=HYP_INDS + # 0-based indices of hypotheses to run [default: ALL] + # --no-refine do not run with refinement + # --no-parallelize do not use wqsub.py for parallelization + # -v, --verbose print out the commands that are being run + # --dump dump categtories to file + # --output-filename=OUTPUT_FN + # filename to write motif results to [default:dummy.txt] + # --random-output=RANDOM_FN + # filename to write motif results to + # [default:random.txt] + # --motif-file=MOTIF_FN + # filename to write motif results to [default:dummy.out] + # --beta=BETA beta parameter to use [default:0.7] + # --delta=DELTA delta parameter to use [default:0.001] + # --remove-common remove common sequences from analysis + # --randomization run randomization + # --cv=CV number of cross validation folds [default:5] + # --interactive run the script interactively + + motif_fn = '%s_motifs_beta%s_cv%s.txt'%(macs_name,theme_opts.beta,theme_opts.cv) + theme_d = {'opts':opts.theme_args, + 'fg_fn':fg_fn, + 'bg_fn':bg_fn, + 'hyp':hyp_fn, + 'markov':markov_fn, + 'tamo_motif_fn':raw_motif_fn, + 'random_fn':random_cv_fn, + 'motif_fn':motif_fn + } + + theme_call = "THEME.sh %(opts)s %(fg_fn)s %(bg_fn)s %(hyp)s %(markov)s " \ + "--motif-file=%(tamo_motif_fn)s " \ + "--random-output=%(random_fn)s " \ + "--output-filename=%(motif_fn)s " 
\ + "--randomization" + + calls = [theme_call%theme_d] + steps.append(PPS('Run THEME',calls,env=os.environ)) + + # build infosite + calls = ['build_chipseq_infosite.py %s'%opts.infosite_args] + steps.append(PPS('Build infosite',calls,env=os.environ)) + + # cleanup + rm_str = "rm -f %(d)s/*.out %(d)s/*.err %(d)s/*.script %(d)s/*.stats %(d)s/*.bed" + calls = [rm_str%{'d':exp_wrk_dir}] + + if control_fn : + calls.append(rm_str%{'d':cnt_wrk_dir}) + #steps.append(PPS('Clean up',calls,env=os.environ)) + + pipeline.add_steps(steps) + if opts.auto and opts.steplist is not None : + steplist = parse_steplist(opts.steplist,pipeline) + else : + steplist = None + pipeline.run(interactive=not opts.auto,steplist=steplist)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chipsequtil-master/scripts/chipseq_pipeline_wo_ctrl.py Mon Mar 07 16:18:10 2016 -0500 @@ -0,0 +1,172 @@ +#!/usr/bin/env python + +import os +import sys +from optparse import OptionParser, OptionGroup + +from pypeline import Pypeline, ProcessPypeStep as PPS +from chipsequtil import get_file_parts, get_org_settings +from chipsequtil.util import MultiLineHelpFormatter + +usage = "%prog [options] <organism> <experiment GERALD alignment filename> [<control GERALD alignment filename>]" +description = """1st generation ChIPSeq analysis pipeline: + + - converts Illumina GERALD alignment files to BED format + - calculates statistics on input alignments + - runs MACS to find peaks + - maps peaks to genes + - extracts fasta files for gene peaks in experiments + - constructs background sequences matching foreground distribution + - runs THEME.py on input sequences + - runs THEME.py randomization + - creates documentation on entire pipeline run + +Control input file is optional. 
*organism* argument is passed to the +*org_settings.py* command to specify organism specific parameters, ensure +that the following commands return valid paths: + +If running MACS: + - org_settings.py <organism> genome_size + - org_settings.py <organism> genome_dir + - org_settings.py <organism> annotation_path + +If running THEME: + - org_settings.py <organism> theme_hypotheses + - org_settings.py <organism> theme_markov + +""" + +epilog = """Note: it is advised to leave the --*-args arguments unchanged +unless you really know what you're doing.""" + +parser = OptionParser(usage=usage,description=description,epilog=epilog,formatter=MultiLineHelpFormatter()) +parser.add_option('--auto',dest='auto',action='store_true',help='run all steps non-interactively (for batch mode, e.g.)') +parser.add_option('--exp-name',dest='exp_name',default=os.path.basename(os.getcwd()),help='name for the experiment/pipeline, used for convenience [default: current directory name]') +parser.add_option('--split-args',dest='split_args',default='--type=count --arg=16',help='double quote wrapped arguments for split_file.py [default: %default]') +parser.add_option('--bed-args',dest='bed_args',default='--stdout --chromo-strip=.fa',help='double quote wrapped arguments for gerald_to_bed.py [default: %default]') +parser.add_option('--stats-args',dest='stats_args',default='',help='double quote wrapped arguments for gerald_stats.py [default: %default]') +parser.add_option('--qsub-args',dest='qsub_args',default='--die-on-err',help='double quote wrapped arguments for split_qsub.py [default: %default]') +parser.add_option('--macs-args',dest='macs_args',default='--mfold=10 --tsize=35 --bw=150 --pvalue=1e-5',help='double quote wrapped arguments for macs, only changing --mfold, --tsize, --bw, and --pvalue recommended [default: %default]') +parser.add_option('--pk-to-fa-args',dest='pk_to_fa_args',default='--bg-type=rej_samp',help='double quote wrapped arguments for peaks_to_fasta.py [default: %default]') 
+parser.add_option('--theme-args',dest='theme_args',default='--beta=0.7 --cv=5',help='double quote wrapped arguments for THEME.py [default: %default]') + + +if __name__ == '__main__' : + + # parse command line arguments + opts, args = parser.parse_args(sys.argv[1:]) + + if len(args) < 2 : + parser.error('Must provide two non-option arguments') + + # filenames and paths + organism, experiment_fn = args[0:2] + control_fn = None + if len(args) > 2 : + control_fn = args[2] + + org_settings = get_org_settings(organism) + refseq_fn = org_settings['annotation_path'] + + exp_fpath,exp_fname,exp_fbase,exp_fext = get_file_parts(experiment_fn) + exp_wrk_dir = os.path.abspath('.exp_%s_%s'%(exp_fbase,opts.exp_name)) + + if control_fn : + cnt_fpath,cnt_fname,cnt_fbase,cnt_fext = get_file_parts(control_fn) + cnt_wrk_dir = os.path.abspath('.cnt_%s_%s'%(cnt_fbase,opts.exp_name)) + + # the pipeline + pipeline = Pypeline() + + steps = [] + + # split up files + calls = ["mkdir %s"%exp_wrk_dir, + "split_file.py %s --outdir=%s %s"%(opts.split_args,exp_wrk_dir,experiment_fn),] + if control_fn : + calls.extend(["mkdir %s"%cnt_wrk_dir, + "split_file.py %s --outdir=%s %s"%(opts.split_args,cnt_wrk_dir,control_fn), + ]) + steps.append(PPS('Split files',calls,env=os.environ)) + + # convert to BED format + exp_bed_fn = "%s_exp.bed"%exp_fbase + calls = ["split_qsub.py %s --ext=.bed gerald_to_bed.py --util-args=\"%s\" %s/*.[0-9][0-9][0-9][0-9]"%(opts.qsub_args,opts.bed_args,exp_wrk_dir), + "wait_for_qsub.py", + "cat %s/*.bed > %s"%(exp_wrk_dir,exp_bed_fn), + ] + + if control_fn : + cnt_bed_fn = "%s_cnt.bed"%cnt_fbase + calls.extend(["split_qsub.py %s --ext=.bed gerald_to_bed.py --util-args=\"%s\" %s/*.[0-9][0-9][0-9][0-9]"%(opts.qsub_args,opts.bed_args,cnt_wrk_dir), + "wait_for_qsub.py", + "cat %s/*.bed > %s"%(cnt_wrk_dir,cnt_bed_fn), + ]) + + steps.append(PPS('Convert GERALD to BED format',calls,env=os.environ)) + + #steps.append(PPS('Helloooooooo nurse','echo Helloooooooo nurse')) + 
# generate alignment statistics + exp_stats_fn = '%s_stats.txt'%exp_fbase + calls = ["split_qsub.py %s --ext=.stats gerald_stats.py --util-args=\"%s\" %s/*.[0-9][0-9][0-9][0-9]"%(opts.qsub_args,opts.stats_args,exp_wrk_dir), + "wait_for_qsub.py", + "combine_gerald_stats.py %s/*.stats > %s"%(exp_wrk_dir,exp_stats_fn), + ] + + if control_fn : + cnt_stats_fn = '%s_stats.txt'%cnt_fbase + calls.extend(["split_qsub.py %s --ext=.stats gerald_stats.py --util-args=\"%s\" %s/*.[0-9][0-9][0-9][0-9]"%(opts.qsub_args,opts.stats_args,cnt_wrk_dir), + "wait_for_qsub.py", + "combine_gerald_stats.py %s/*.stats > %s"%(cnt_wrk_dir,cnt_stats_fn), + ]) + steps.append(PPS('Calculate alignment statistics',calls,env=os.environ)) + + # run macs + cnt_flag = '' + if control_fn : + cnt_flag = '-c %s'%cnt_bed_fn + + macs_d = {'exp_fn':exp_bed_fn, + 'cnt_flag':cnt_flag, + 'name':opts.exp_name, + 'macs_args':opts.macs_args, + 'gsize':org_settings['genome_size'], + } + calls = ["macs --gsize=%(gsize)s -t %(exp_fn)s %(cnt_flag)s --name=%(name)s --format=BED %(macs_args)s"%macs_d] + steps.append(PPS('Run MACS',calls,env=os.environ)) + + # map peaks to genes + peaks_fn = "%s_peaks.bed"%opts.exp_name + map_fn = "%s_genes.txt"%opts.exp_name + map_stats_fn = "%s_genes_stats.txt"%opts.exp_name + calls = ["map_peaks_to_genes.py --peaks-format=BED %(refGene_fn)s %(peaks_fn)s --map-output=%(map_fn)s --stats-output=%(map_stats_fn)s"%{'refGene_fn':refseq_fn,'peaks_fn':peaks_fn,'map_fn':map_fn,'map_stats_fn':map_stats_fn}] + steps.append(PPS('Map peaks to genes',calls,env=os.environ)) + + # THEME + # extract foreground and generate background sequences + fg_fn = "%s_peaks.fa"%opts.exp_name + bg_fn = "%s_bg.fa"%opts.exp_name + nib_dir = org_settings['genome_dir'] + calls = ["peaks_to_fasta.py %(opts)s --output=%(fg_fn)s --bg-fn=%(bg_fn)s %(organism)s %(peaks_fn)s"%{'opts':opts.pk_to_fa_args,'organism':organism,'fg_fn':fg_fn,'bg_fn':bg_fn,'peaks_fn':peaks_fn}] + steps.append(PPS('Peaks to 
Fasta',calls,env=os.environ)) + + # run THEME on fg + motif_fn = '%s_motifs.txt'%opts.exp_name + hyp_fn = org_settings['theme_hypotheses'] + markov_fn = org_settings['theme_markov'] + calls = ["THEME.py %(opts)s --motif-file=%(motif_fn)s %(fg_fn)s %(bg_fn)s %(hyp)s %(markov)s"%{'opts':opts.theme_args,'motif_fn':motif_fn,'fg_fn':fg_fn,'bg_fn':bg_fn,'hyp':hyp_fn,'markov':markov_fn}] + steps.append(PPS('Run THEME on foreground',calls,env=os.environ)) + + # run THEME randomization + random_motif_fn = '%s_motifs_rand.txt'%opts.exp_name + calls = ["THEME.py %(opts)s --randomization --motif-file=%(motif_fn)s %(fg_fn)s %(bg_fn)s %(hyp)s %(markov)s"%{'opts':opts.theme_args,'motif_fn':random_motif_fn,'fg_fn':fg_fn,'bg_fn':bg_fn,'hyp':hyp_fn,'markov':markov_fn}] + steps.append(PPS('Run THEME randomization',calls,env=os.environ)) + + # cleanup + rm_str = "rm -f %(d)s/*.out %(d)s/*.err %(d)s/*.script %(d)s/*.stats %(d)s/*.bed" + calls = [rm_str%{'d':exp_wrk_dir}] + if control_fn : calls.append(rm_str%{'d':cnt_wrk_dir}) + steps.append(PPS('Clean up',calls,env=os.environ)) + + pipeline.add_steps(steps) + pipeline.run(interactive=not opts.auto)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chipsequtil-master/scripts/combine_gerald_stats.py Mon Mar 07 16:18:10 2016 -0500 @@ -0,0 +1,27 @@ +#!/usr/bin/env python + +import sys, re, os +from optparse import OptionParser +from collections import defaultdict as dd + +parser = OptionParser() + +if __name__ == '__main__' : + + opts, args = parser.parse_args(sys.argv[1:]) + + all_stats = dd(int) + for fn in args : + d = eval(open(fn).read()) # WARNING: eval() executes the file's contents; only run on trusted gerald_stats.py output + for k,v in d.items() : + all_stats[k] += v + all_stats['tot. aligns'] += v + + keys = all_stats.keys() + keys.sort() + keys.remove('tot. aligns') + + for k in keys : + print k,':',all_stats[k],'(%.4f)'%(float(all_stats[k])/all_stats['tot. aligns']) + + print 'tot. aligns',':',all_stats['tot. aligns']
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chipsequtil-master/scripts/compare_microarray_binding.py Mon Mar 07 16:18:10 2016 -0500 @@ -0,0 +1,81 @@ +#!/usr/bin/env python + +import sys + +from csv import reader, writer +from collections import defaultdict as dd +from optparse import OptionParser +from subprocess import Popen, PIPE + +from chipsequtil import MACSOutput, BEDOutput, AffyBiocFile + +usage = '%prog -m <mapped MACS peaks file>|-b <mapped BED peaks file>|-a <mapped microarray file> [-m <MACS peaks file> ...] [-b <mapped BED peaks file> ...] [-a <mapped microarray file> ...]' +description = """Join all files on the first column, concatenating records with \ +matching entries onto one line per entry. Understands MACS peaks data as mapped \ +with *map_peaks_to_known_genes.py* utility microarray data as mapped by \ +*probeset_to_known_genes.py* utility, passed to program using *-m* and *-a* options \ +respectively. Output is a file where genes with binding data (MACS, BED files) have \ +column with a 1, 0 otherwise, and genes with microarray expression values have logFC \ +and adjusted p-value colums for each microarray file input. Internally, uses \ +*join_mapped_known_genes.py* with --binary-plus option to perform mapping and parses \ +output. 
MACS fields are listed first, followed by BED fields, followed by microarray \ +fields.""" + +epilog="Note: microarray files should have been created by bioconductor, and all files should have a row of fieldnames as the first line" +parser = OptionParser(usage=usage,description=description,epilog=epilog) +parser.add_option('-a','--affy-file',dest='affy_file',action='append',default=[],help='add a mapped microarray file') +parser.add_option('-b','--bed-file',dest='bed_file',action='append',default=[],help='add a mapped BED formatted peaks (*.bed) file') +parser.add_option('-m','--macs-file',dest='macs_file',action='append',default=[],help='add a mapped default MACS formatted peaks (*.xls) file') +parser.add_option('--output',dest='output',default=None,help='file to output joined records to [default: stdout]') + +if __name__ == '__main__' : + + opts,args = parser.parse_args(sys.argv[1:]) + + if len(args) > 0 : + parser.error('There were non-option command line arguments passed, all files should have a preceeding option indicating filetype') + + if len(opts.macs_file) == 0 and len(opts.affy_file) == 0 : + parser.error('No files were passed in, aborting') + + # call join_mapped_known_genes.py + fn_map = {} + fn_map['macs'] = ' '.join(['-m %s'%fn for fn in opts.macs_file]) + fn_map['bed'] = ' '.join(['-b %s'%fn for fn in opts.bed_file]) + fn_map['array'] = ' '.join(['-a %s'%fn for fn in opts.affy_file]) + join_call = 'join_mapped_known_genes.py --binary-plus %(macs)s %(bed)s %(array)s'%fn_map + p = Popen(join_call, shell=True, stdout=PIPE,stderr=PIPE) + stdout, stderr = p.communicate() + if len(stderr) != 0 : + print stderr + + joined_output = stdout.split('\n') + joined_output = joined_output[:-1] if joined_output[-1] == '' else joined_output + + # determine which fields will end up in the file + header = joined_output[0].split('\t') + + # always want gene and symbol + field_indices = [0,1] + + # macs and bed fields are named by filename + for fn in 
opts.macs_file+opts.bed_file : + field_indices.append(header.index(fn)) + + # affy fields are index(fn)+5, index(fn)+8 + for fn in opts.affy_file : + # just add all the microarray columns + fn_header_indices = [i for i,x in enumerate(header) if x.find(fn) != -1] + field_indices.extend(fn_header_indices) + + #field_indices.append(header.index(fn)) + #field_indices.append(header.index(fn)+5) + #field_indices.append(header.index(fn)+8) + + out_f = open(opts.output,'w') if opts.output else sys.stdout + for line in joined_output : + line = line.split('\t') + out_f.write('\t'.join([line[i] for i in field_indices])+'\n') + + if opts.output : + out_f.close()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chipsequtil-master/scripts/construct_bg_fasta.py Mon Mar 07 16:18:10 2016 -0500 @@ -0,0 +1,235 @@ +#!/usr/bin/env python + +import os +import sys +import warnings + +from collections import defaultdict +from optparse import OptionParser + +from chipsequtil import get_org_settings, RefGeneFile +from chipsequtil.nib import NibDB +from chipsequtil.util import MultiLineHelpFormatter +from TAMO.seq import Fasta + +usage='%prog [options] <type> <organism> <foreground fasta>' +description='Create background sequence databses for motif finding, etc.' +parser = OptionParser(usage=usage,description=description,formatter=MultiLineHelpFormatter()) + + +def rejection_sampling(fg,settings_dict,gc_bins=20) : + + genm_db = NibDB(settings_dict['genome_dir']) + annot = RefGeneFile(settings_dict['annotation_file']) + + + num_peak_bases = 0 + for header, seq in fg.items() : + num_peak_bases += len(seq) + + +if __name__ == '__main__' : + + opts, args = parser.parse_args(sys.argv[1:]) + + if len(args) < 3 : + parser.error('Must provide three non-option arguments') + + sample_type, organism, fg_fn = args[:3] + + settings_dict = get_org_settings(organism) + + fg = Fasta.load(fg_fn) + bg = rejection_sampling(fg,settings_dict) + + +############################################################### +# start Chris' code from rej_samp_bg_rand2.py + the_genes={} #list of distances to nearest TSS + + # for each peak find the chromosome, distance to nearest + # gene, size of peaks in bases, and GC content + the_chrs,dists,sizes,gcs=[],[],[],[] + + # number of bases in the fg sequences + size=0 + + for key in pos_seqs.keys(): + + size+=len(pos_seqs[key]) + + # chromosome first field in fasta headers from bed2seq.bedtoseq + chr=key.split(':')[0] + + # adjust chromosomes in special cases + if re.search('random',chr): + continue + if chr=='chr20': + chr='chrX' + elif chr=='chr21': + chr='chrY' + if not the_genes.has_key(chr): + the_genes[chr]=[] + + # 
start first int in second field of bed2seq.bedtoseq header + start=int(key.split(':')[1].split('-')[0]) + midpoint=int(start+len(pos_seqs[key])/2) + + # figure out which chromosome we're working on + tss_chr=tss[chr.split('chr')[-1]] + + # D is the distances from all the genes, find minimum + D=[(s[0]-midpoint) for s in tss_chr] + + # best distance for this peak + minD=min([abs(x) for x in D]) + best=[d for d in D if abs(d)==minD] + dists.append(best[0]) + + # chromosome for this peak + the_chrs.append(chr) + seq=pos_seqs[key] + + # calculate # bases and GC content + N=len(seq) + sizes.append(N) + gc=len([x for x in seq if (x=='G')or(x=='C')])/N + gcs.append(gc) + + #bin GC content distribution + bins=20 + + # q is # of peaks w/ x% GC content + q=[0]*bins + + for gc in gcs: + for i in range(bins): + win_start=i/bins + win_end=(i+1)/bins + if gc>=win_start and gc<win_end: + q[i]+=1 + continue + + # q is now % peaks w/ x% GC content + q=[x/Nseqs for x in q] + #print q + + # c is # peaks w/ highest GC content + c=max(q)*Nseqs + + # start generating bg sequences + print "Done assembling distance and gc content distributions" + genome_outfile=open(bg,'w') + + # make twice as many + size=round(size/(2*len(pos_seqs))) + bg_gcs,bg_sizes=[],[] + #for key in the_genes.keys(): + #chrom=key.split('chr')[-1] + #the_genes[key]=[x[0] for x in tss[chrom]] + + # C_TX is a list of all genes in (chromosome,gene start) tuples + C_TX=[] + for key in tss.keys(): + chrom=key.split('chr')[-1] + for x in tss[chrom]: + C_TX.append((chrom,x[0])) + + # generate a bg sequence for every fg sequence + for i in range(Nseqs): + + # propose sequences until one is accepted + keep_going=1 + while keep_going: + #random.shuffle(the_chrs) + + # randomize the list of distances from genes + random.shuffle(dists) + #chr=the_chrs[0] + + # pick the first distance, i.e. 
at random + d=dists[0] + + #random.shuffle(the_genes[chr]) + + # randomize the gene list + random.shuffle(C_TX) + + # randomize the peak sizes + random.shuffle(sizes) + + # pick a random gene + (chr,coord)=C_TX[0] + + #coord=the_genes[chr][0] + # propose a starting point for the bg sequence + midpoint=coord-d+random.randint(-100,100) + + # propose a starting size for the bg sequence + size=sizes[0] + start=int(midpoint-int(size/2)) + stop=int(midpoint+int(size/2)) + id='chr'+chr.split('chr')[-1]+':'+str(start)+'-'+str(stop) + r=random.random() + + # randomly choose strand + if r<0.5: strand='+' + else: strand='-' + + # extract the proposed sequence + nib_title,seq=nibfrag.sequence('chr'+chr,start, stop,strand) + if not seq: + print 'NOT FOUND', chr,start,stop, + continue + else: + + N,y=0,0 + # calculate the GC content for the proposed sequence + for line in seq: + s=line.upper() + N+=len(line) + y+=len([x for x in s if (x=='G')or(x=='C')]) + if line[0]=='N': continue + x=float(y)/N + + # determine the GC bin for this sequence + #gc=float(len([x for x in seq if (x=='G')or(x=='C')]))/N + for i in range(bins): + win_start=i/bins + win_end=(i+1)/bins + if x>=win_start and x<win_end: + bin=i + continue + + # pick a uniform random number such that it does not exceed + # the maximum GC content distribution over bins + r=random.random()*c/Nseqs + + # if the random number is <= the GC content for this + # proposed sequence, accept, otherwise reject + if r>q[bin]: + #print 'skip' + continue + else: + #print bin + bg_gcs.append(x) + bg_sizes.append(size) + keep_going-=1 + title='>%s\n'%id + genome_outfile.write(title) + for line in seq: + genome_outfile.write(line.upper()+'\n') + print len(gcs) + print len(bg_gcs) + fg_mean,fg_sdev=mean_sdev(gcs) + print fg_mean,fg_sdev + #bg_mean,bg_sdev=mean_sdev(bg_gcs) + bg_mean=scipy.mean(bg_gcs) + bg_sdev=scipy.std(bg_gcs) + print bg_mean,bg_sdev + fg_size_m,fg_size_dev=mean_sdev(sizes) + bg_size_m,bg_size_dev=mean_sdev(bg_sizes) + 
print fg_size_m,fg_size_dev + print bg_size_m,bg_size_dev + genome_outfile.close() +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chipsequtil-master/scripts/create_pipeline_script.py Mon Mar 07 16:18:10 2016 -0500 @@ -0,0 +1,385 @@ +#!/usr/bin/env python + +from __future__ import with_statement +import getpass +import json +import os +import textwrap + +try: + import readline + import glob + readline.parse_and_bind("tab: complete") + readline.set_completer_delims('') + + comp_states = {} + def basic_complete_file(text,state) : + #if text.strip() == '' : + # text = './' + options = dict([(i,p) for i,p in enumerate(glob.glob(text+'*'))]) + return options.get(state,None) + + readline.set_completer(basic_complete_file) + +except ImportError: + print "Module readline not available." + +import re +import stat +import sys +from optparse import OptionParser +from subprocess import Popen, PIPE + +import chipsequtil +from chipsequtil import get_global_settings, get_local_settings, check_org_settings, GLOBAL_SETTINGS_FN, LOCAL_SETTINGS_FN +from terminalcontroller import TERM_ESCAPE, announce, warn, error, white, bold + +usage = "%prog" +description = """Script for creating a custom run script for +ChIPSeq/DNAse hypersensitivity experiments. User is asked for +paths and settings required for ChIPSeq analysis using the *chipseq_pipeline.py* +utility and produces an executable run script with helpful information on how to +run it. 
Also creates a JSON formatted file containing all the parameters for +this pipeline run.""" +epilog = "Note: this script only works in Unix-style environments" +parser = OptionParser(usage=usage,description=description,epilog=epilog) + + +script_template = """\ +#!/bin/bash + +# required parameters for the pipeline +ORG=%(organism)s +EXP_FN=%(exp_path)s +CNT_FN=%(cnt_path)s + +# chipseq_pipeline.py is the main workhorse of this analysis +# you may change any of the arguments below from their defaults + +chipseq_pipeline.py $ORG $EXP_FN $CNT_FN \\ +%(def_args)s +""" + +start_text = """\ +This is an interactive script that creates an executable script to use for +ChIPSeq analyses. When prompted for experiment and control files, tab +completion is available a la bash or tcsh shells. Press Ctrl-C at any time to +quit. +""" + +end_text = """The script %(script_fn)s has been created to run this pipeline. \ +The script can now be run with: + +$> ./%(script_fn)s + +Have a nice day.""" + + + +def wb(st) : + sys.stdout.write(white(bold(st))) + + +def input(st,default=None) : + + if default is None : + default_str = '' + else : + default_str = ' [default: ' + default + ' ] ' + + out = None + while out is None : + out = raw_input(white(bold(st))+default_str+white(bold(':'))+' \n') + if len(out) == 0 : + out = default + + return out + + +if __name__ == '__main__' : + + TERM_ESCAPE = True + + try : + + pipeline_args = {} + + # herro + announce('ChIPSeq Experiment Pipeline Script Generator') + print textwrap.fill(start_text) + + opts, args = parser.parse_args(sys.argv[1:]) + if len(args) > 0 : + warn("Arguments were passed, but this script doesn't accept any arguments, rudely ignoring them...\n") + + # this dictionary will be used to generate a JSON formatted file with + # all the relevant settings for the pipeline + json_dict = {} + + ############################################################################ + # name of the experiment + 
############################################################################ + def_path = os.path.basename(os.getcwd()) + exp_name = input('Experiment name',def_path) + exp_name = exp_name.replace(' ','_') # shhhhhhhh... + + json_dict['experiment name'] = exp_name + json_dict['analysis path'] = os.getcwd() + + ############################################################################ + # experiment and control file + ############################################################################ + align_text = "The pipeline can accept either BED, BOWTIE, SAM, or " \ + "ELANDEXPORT formatted alignment files. SAM is the default " \ + "format of files provided by the BMC pipeline. Both experiment " \ + "and control files must have the same format." + print textwrap.fill(align_text) + + align_fmt = input("Which format are the alignment files in?",'SAM') + exp_path = input('Experiment alignment path') + exp_path = exp_path.strip() + + lims_exp_url = input('Experiment LIMS sample URL, if applicable','none') + lims_exp_url = lims_exp_url.strip() + + cntrl_path = input('Control alignment path (leave blank for no control)','none') + cntrl_path = cntrl_path.strip() + + lims_cntrl_url = input('Control LIMS sample URL, if applicable','none') + lims_cntrl_url = lims_cntrl_url.strip() + + if cntrl_path == 'none' : + cntrl_path = '' + + if cntrl_path == '' : + print 'Analysis will be run with no control' + + json_dict['experiment path'] = os.path.realpath(exp_path) + json_dict['experiment lims url'] = lims_exp_url + json_dict['control path'] = os.path.realpath(cntrl_path) if cntrl_path != '' else 'none' + json_dict['control lims url'] = lims_cntrl_url + + ############################################################################ + # organism + settings + ############################################################################ + announce('Organism settings configuration') + global_settings = get_global_settings() + local_settings = get_local_settings() + valid_org_settings = 
global_settings.keys() + local_settings.keys() + valid_org_settings.sort() + + org_text = """\ +Below are the organism settings available on this system. The pipeline will +use the settings for one organism (e.g. %(org)s) for the entire execution. If +you do not see a set of settings that correspond to files you need you may +add your own to %(local_org)s. See %(glob_org)s for details. +""" + + print textwrap.fill(org_text%{'org':valid_org_settings[0],'local_org':LOCAL_SETTINGS_FN,'glob_org':GLOBAL_SETTINGS_FN},break_long_words=False) + print + + wb('Available settings\n') + # global settings + print 'Global settings: (%s)'%GLOBAL_SETTINGS_FN + org_sets = [(k,global_settings[k]) for k in sorted(global_settings.keys())] + for org, settings in org_sets : + wb(org.ljust(8)) + print ':', settings.get('description','No description') + #for k,v in settings.items() : + # print ' '*4+k+": "+str(v) + + # local settings + print 'Local settings: (%s)'%LOCAL_SETTINGS_FN + org_sets = [(k,local_settings[k]) for k in sorted(local_settings.keys())] + for org, settings in org_sets : + wb(org.ljust(8)) + print ':', settings.get('description','No description') + #for k,v in settings.items() : + # print ' '*4+k+": "+str(v) + org = '' + all_settings = {} + all_settings.update(global_settings) + all_settings.update(local_settings) + + while org not in valid_org_settings : + org = input('Choose organism configuration, one of ('+','.join(valid_org_settings)+')') + + # check for the required settings + required_settings = ['description','genome_dir','refgene_anno_path','theme_hypotheses','theme_markov'] + if not check_org_settings(org,required_settings) : + warn(textwrap.fill('Selected organism settings must have the following settings defined:\n\ + %s\n\ + Either select another organism or define these settings in your local\ + configuration file.'%required_settings)) + org = '' + print + + json_dict['org'] = org + + 
############################################################################ + # UCSC + ############################################################################ + + ucsc_text = """The pipeline can include a step to automatically make called +peak data available on the web for integration with UCSC genome browser.""" + + print textwrap.fill(ucsc_text,break_long_words=False) + + ucsc_integrate = input('Would you like to integrate this analysis with UCSC genome browser [y/n]?','y') + ucsc_integrate = False if ucsc_integrate == 'n' else True + ucsc_args = '' + stage_dir = '/nfs/antdata/web_stage/%s'%getpass.getuser() + stage_url = 'http://fraenkel.mit.edu/stage/%s'%getpass.getuser() + if ucsc_integrate : + ucsc_args = ['--ucsc'] + ucsc_args = ' '.join(ucsc_args) + + pipeline_args['--stage-dir'] = stage_dir + pipeline_args['--stage-url'] = stage_url + + json_dict['stage dir'] = stage_dir + json_dict['stage url'] = stage_url + + # TODO - consider letting user set these on script creation time + # any utility specific arguments? 
+ # - MACS + # - THEME + + + ############################################################################ + # various pipeline parameters + ############################################################################ + + # --macs-args + macs_args = ['--mfold=10,30','--format=%s'%align_fmt] + pval = '' + while not re.search('^\de-\d+$',pval) : + pval = input('What p-value should MACS use as a cutoff?','1e-5') + macs_args.append('--pvalue=%s'%pval) + pipeline_args['--macs-args'] = ' '.join(macs_args) + + # --map-args + map_args = [] + tss = '' + while tss.upper() not in ('TSS','GENE') : + tss = input('Should gene mapping be made in relation to transcription start site or full gene coordinates [TSS/gene]?','TSS') + if tss == 'TSS' : + map_args.append('--tss') + + window = '' + while not re.search('^\d+,\d+$',window) : + window = input('What window would you like to use for mapping peaks to genes (upstream bases,downstream bases)?','10000,10000') + upstr, downstr = window.split(',') + map_args.extend(['--upstream-window=%s'%upstr,'--downstream-window=%s'%downstr]) + pipeline_args['--map-args'] = ' '.join(map_args) + + # --filter-peaks-args + filt_args = ['--sort-by=pvalue'] + fdr = '' + while not re.search('^\d+(\.\d+)?',fdr) and fdr != 'none' : + fdr = input('What FDR cutoff should be used, in %?','none') + if fdr != 'none' : + filt_args.append("--filter='fdr<%s'"%fdr) + + top = '' + while not re.search('^\d+$',top) and top != 'ALL' : + top = input('How many peak sequences should be used for motif discovery when sorted by p-value [<# peaks>/ALL]','1000') + if top != 'ALL' : + filt_args.append('--top=%s'%top) + + # tag filter for both pos and neg peaks + tags = '' + filt_neg_args = [] + while not re.search('^\d+$',tags) and tags != 'ALL' : + tags = input('What tag count cutoff should be used as a minimum for positive and negative peaks? 
[<# peaks>/None]','20') + if tags != 'None' : + filt_args.append("--filter='tags>%s'"%tags) + filt_neg_args.append("--filter='tags>%s'"%tags) + pipeline_args['--filter-peaks-args'] = ' '.join(filt_args) + pipeline_args['--filter-neg-peaks-args'] = ' '.join(filt_neg_args) + + # --peaks-to-fa-args + peaks_to_fa_args = [] + width = '' + while not re.search('^\d+$',width) and width != 'NA' : + width = input('What width around peak summit should be used for motif analysis (NA to use entire peak)? [<# bases>/NA]','200') + if width != 'NA' : + peaks_to_fa_args.append('--fixed-peak-width=%s'%width) + else : + width = 'none' + pipeline_args['--peaks-to-fa-args'] = ' '.join(peaks_to_fa_args) + + # --parallelize + parallel = input('Use cluster parallelization [y/n]?','y') + parallel = '--parallelize' if parallel.lower() != 'n' else '' + + # each user-specified argument gets its own key + json_dict['format'] = align_fmt + json_dict['mapping type'] = tss + json_dict['mapping window'] = (upstr,downstr) + json_dict['FDR filter'] = fdr + json_dict['peaks used by THEME'] = top + json_dict['fixed peak width'] = width + json_dict['parallelize'] = parallel != '' + json_dict['peak tag count filter'] = tags + + # put all the command line utility args in json_dict as its own dict + json_dict['pipeline args'] = pipeline_args + + ############################################################################ + # done with input, creating script and other stuff + ############################################################################ + # if the experiment and control files are in a different directory, + # create symlinks for them + exp_dir,exp_fn = os.path.split(os.path.abspath(exp_path)) + if exp_dir != os.getcwd() : + wb('Creating symlink for experiment file...\n') + if os.path.exists(exp_fn) : + if os.path.realpath(exp_fn) != os.path.abspath(exp_path) : # existing symlink doesn't point to the same file, prompt to overwrite + ans = raw_input('Symlink %s in current directory points to 
%s but you asked for %s, overwrite symbolic link? y/[n] '%(exp_fn,os.path.realpath(exp_fn),os.path.abspath(exp_path))) + if ans == 'y' : + os.remove(exp_fn) + exp_fn = 'exp_'+exp_fn + os.symlink(exp_path,exp_fn) + else : + exp_fn = 'exp_'+exp_fn + os.symlink(exp_path,exp_fn) + + if cntrl_path != '' : + cntrl_dir,cntrl_fn = os.path.split(os.path.abspath(cntrl_path)) + if cntrl_dir != os.getcwd() : + wb('Creating symlink for control file...\n') + if os.path.exists(cntrl_fn) : + if os.path.realpath(cntrl_fn) != os.path.abspath(cntrl_path) : # existing symlink doesn't point to the same file, prompt to overwrite + ans = raw_input('Symlink %s in current directory points to %s but you asked for %s, overwrite symbolic link? y/[n] '%(cntrl_fn,os.path.realpath(cntrl_fn),os.path.abspath(cntrl_path))) + if ans == 'y' : + os.remove(cntrl_fn) + cntrl_fn = 'cntrl_'+cntrl_fn + os.symlink(cntrl_path,cntrl_fn) + else : + cntrl_fn = 'cntrl_'+cntrl_fn + os.symlink(cntrl_path,cntrl_fn) + else : + cntrl_fn = '' + + # get default chipseq_pipeline.py args + pipeline_args = ' '.join(['%s="%s"'%(k,v) for k,v in pipeline_args.items()]) + print 'chipseq_pipeline.py --exp-name=%s %s %s --print-args'%(exp_name,ucsc_args,pipeline_args) + def_args = Popen('chipseq_pipeline.py --exp-name=%s %s %s %s --print-args'%(exp_name,ucsc_args,parallel,pipeline_args),shell=True,stdout=PIPE,stderr=PIPE).communicate()[0] + + wb('Creating script...\n') + script_fn = '%s_pipeline.sh'%exp_name + with open(script_fn,'w') as script_f : + script_f.write(script_template%{'exp_path':exp_fn,'cnt_path':cntrl_fn,'organism':org,'exp_name':exp_name,'def_args':def_args}) + os.chmod(script_f.name,stat.S_IRWXU|stat.S_IRWXG|stat.S_IROTH) + + print end_text%{'script_fn':script_fn} + + wb('Creating parameter file...\n') + json_fn = '%s_params.json'%exp_name + with open(json_fn,'w') as json_f : + json.dump(json_dict,json_f,indent=4) + + except KeyboardInterrupt : + sys.stderr.write('\n') + error('Script creation interrupted, 
aborting')
# chipsequtil-master/scripts/extract_promoters.py
#!/usr/bin/env python
# Extract promoter sequences (FASTA) for all genes, or a listed subset,
# using the RefGene annotation and nib genome files configured for the
# organism given on the command line.

import re
import sys
from csv import writer
from optparse import OptionParser

from collections import defaultdict

from chipsequtil import get_org_settings, RefGeneFile
from chipsequtil.nib import NibDB
from chipsequtil.util import MultiLineHelpFormatter as MF

usage = "%prog [options] <organism>"
description = """Extract the promoter sequences in FASTA format from all genes
or a list of genes specified in an input file. Gene annotation is RefGene
corresponding to the organism passed in, paths returned by:

$> org_settings.py <organism> refgene_anno_path
$> org_settings.py <organism> genome_dir

must be valid."""
parser = OptionParser(usage=usage,description=description,formatter=MF())
parser.add_option('-u','--upstream',type='int',default=3000,help='upstream window from TSS to extract [default: %default]')
parser.add_option('-d','--downstream',type='int',default=1000,help='downstream window from TSS to extract [default: %default]')
parser.add_option('-l','--gene-list',dest='gene_list',default=None,
        help='file containing a list of gene identifiers to extract, one per line [default: %default]')
gene_type_choices = ['symbol','refgene']
parser.add_option('-t','--gene-type',dest='gene_type',type='choice',
        choices=gene_type_choices,default=gene_type_choices[0],
        help='type of gene identifier in gene list, choose from %s [default: %%default]'%gene_type_choices)
parser.add_option('-o','--output',dest='output',default=None,
        help='file to write fasta records to [default: stdout]')

if __name__ == '__main__' :

    opts, args = parser.parse_args(sys.argv[1:])

    if len(args) != 1 :
        parser.error('Exactly one argument is required')

    # organism-specific annotation and genome sequence locations
    org_settings = get_org_settings(args[0])
    refgene_recs = RefGeneFile(org_settings['refgene_anno_path'])
    nib_db = NibDB(nib_dirs=[org_settings['genome_dir']])

    # optional whitelist of gene identifiers, one per line
    wanted_genes = None
    if opts.gene_list :
        wanted_genes = [line.strip() for line in open(opts.gene_list).readlines()]

    # which RefGene record field the whitelist entries are matched against
    # NOTE(review): 'symbol' identifiers are looked up in the 'bin' field --
    # confirm RefGeneFile stores the gene symbol there (in the raw UCSC
    # refGene schema 'bin' is a numeric index and the symbol is 'name2')
    id_index = 'name' if opts.gene_type == 'refgene' else 'bin'

    # collect promoter coordinates, remembering which gene(s) produced each
    # coordinate so sequences can be labeled after batch extraction
    coord_keys = []
    gene_map = defaultdict(list)
    for rec in refgene_recs :
        if wanted_genes and rec[id_index] not in wanted_genes :
            continue # not a requested gene
        # NOTE(review): the window is anchored at txStart for both strands;
        # for '-' strand genes the TSS is txEnd -- confirm this is intended
        tss = int(rec['txStart'])
        prom_st = max(0,tss-opts.upstream)
        prom_end = min(tss+opts.downstream,nib_db.db_info[rec['chrom']]['nbases'])
        coord_keys.append((rec['chrom'],prom_st,prom_end,rec['strand']))
        gene_map[(rec['chrom'],prom_st,prom_end)].append(rec['bin']+'/'+rec['name'])

    fasta_recs = nib_db.get_fasta_batch(coord_keys)

    out_f = open(opts.output,'w') if opts.output else sys.stdout
    header_regex = re.compile('^.*(chr[0-9MXY]+).*:([0-9]+)-([0-9]+).*$')
    for header, seq in zip(*fasta_recs) :
        # map each sequence back to its gene name(s) via the coordinates
        # embedded in the FASTA header
        match = header_regex.search(header)
        if match is not None :
            chrm, st, end = match.groups()
            gene_names = gene_map.get((chrm,int(st),int(end)))
            if gene_names is not None :
                header = header.strip()+':'+','.join(gene_names)+'\n'
        out_f.write(header+seq+'\n')
# chipsequtil-master/scripts/filter_bed_by_position_count.py
#!/usr/bin/env python
# Cap the number of BED records reported per genomic position; relies on the
# input being position-sorted so duplicate positions are adjacent.

import sys

from csv import reader, writer
from optparse import OptionParser

usage = '%prog [options] <bed file>'
description = """Analyze BED file and filter out alignments above some threshold that align to a single genomic position."""
epilog="Note: only works if BED file is sorted!"
parser = OptionParser(usage=usage,description=description,epilog=epilog)
parser.add_option('-n','--max-count',dest='max_count',default=5,type='int',help='max tag count at a given position, filter above [default: %default]')
parser.add_option('--output',dest='output',default=None,help='write output to file')

if __name__ == '__main__' :

    opts, args = parser.parse_args(sys.argv[1:])

    if len(args) != 1 :
        parser.error('Exactly one sorted .bed file is required')

    bed_reader = reader(open(args[0]),delimiter='\t')
    out_f = open(opts.output,'w') if opts.output else sys.stdout
    bed_writer = writer(out_f,delimiter='\t')

    # stream through the sorted records, emitting at most max_count records
    # for each distinct (chromosome, start, end) position
    prev_pos, emitted = None, 0
    for rec in bed_reader :
        pos = rec[:3] # chromosome, start, end
        if pos != prev_pos :
            # new position, reset the per-position counter
            prev_pos, emitted = pos, 0
        if emitted < opts.max_count :
            bed_writer.writerow(rec)
            emitted += 1

    if opts.output : out_f.close()
# chipsequtil-master/scripts/filter_gps_peaks.py
#!/usr/bin/env python
# Filter, sort, and subset GPS peak records by user-supplied inequality
# expressions (e.g. "IP>100", "pvalue<=1e-9").

import re
import os
import sys
from collections import defaultdict
from optparse import OptionParser, SUPPRESS_HELP
from random import shuffle

from chipsequtil import GPSFile, get_file_parts
from chipsequtil.util import MultiLineHelpFormatter as MF
from terminalcontroller import warn

usage = "%prog [options] <GPS peak file>"
# FIX: help typo corrected ("less than 1e9" -> "1e-9", matching the example)
description = """\
Filter GPS peaks by supplied criteria. Available filter features are:

IP
Control
Fold
qvalue
pvalue
IPvsEMP
IPvsCTR

Filters are provided as expressions using the [-f |--filter] option, e.g. the command

%prog -f "IP>100" --filter="pvalue<=1e-9" <GPS peak file>

finds only peaks with more than 100 tags and a pvalue of less than 1e-9. Any
number of filters may be provided, and only peaks that match *all* filters pass. \
User is warned if filters result in zero results. Only inequality operators are \
valid. Invoking with no filter arguments returns all peaks. To sort, use the \
--sort-by option, e.g.

%prog -f "pvalue<=1e-9" --sort-by=pvalue <GPS peak file>

sorts peaks with a pvalue smaller than 1e-9 with the smallest pvalue peaks. All fields \
are sorted ascending by default. Output is prepended with comments describing what \
the file contains, i.e. which filters are applied, how many records there are, etc.

Note: GPS P_-log10 and Q_-log10 values are converted to normal pvalues and qvalues
"""

parser = OptionParser(usage=usage,description=description,formatter=MF())
parser.add_option('-f','--filter',dest='filters',default=[],action='append',help='add filter expression')
parser.add_option('--sort-by',dest='sort_by',default='',help='comma delimited list of features to sort by, filtered peaks are not sorted by default, if provided peaks are sorted ascending by default')
parser.add_option('--sort-dir',dest='sort_dir',type='choice',choices=['ASCEND','DESCEND'],default='ASCEND',help='direction to sort [default: %default]')
parser.add_option('--top',dest='top',type='int',default=None,help='accepts an integer, output at most this many peaks [default: all]')
parser.add_option('--output',dest='output',default=None,help='filename to output filtered peaks to [default: stdout]')
parser.add_option('--encode-filters',dest='encode_filters',action='store_true',help='write out records to a file <GPS peaks file>_<filters>.xls (incompatible with --output option)')
parser.add_option('--summary',dest='summary',action='store_true',help='only print out summary information for the filter')
parser.add_option('--no-header',dest='no_header',action='store_true',help='do not print out header or metadata info')
parser.add_option('--shuffle',dest='shuffle',action='store_true',help='shuffle order of filtered records, useful for selecting random peaks')

parser.add_option('--print-encoded-fn',dest='print_encoded_fn',action='store_true',help="print out the filename that would be created by --encode-filters")

# inequality tests used to assemble filter predicates
_lt = lambda x,y : x < y
_lte = lambda x,y : x <= y
_gt = lambda x,y : x > y
_gte = lambda x,y : x >= y
_cond_map = {'<':_lt,'<=':_lte,'>':_gt,'>=':_gte,None:None}

def make_condition(low_val=None,low_test=_gte,high_val=None,high_test=_lte) :
    """Build a predicate enforcing optional lower/upper bounds.

    low_val/high_val are floats or None (absent); low_test/high_test are the
    comparison functions parsed from the filter expression.
    """
    # FIX: compare bounds against None explicitly -- a bound of 0.0 is falsy
    # but is still a real constraint (e.g. the filter "0<IPvsCTR" was
    # previously ignored and matched every record)
    if low_val is not None and high_val is None :
        return lambda x: low_test(low_val,x)
    elif low_val is None and high_val is not None :
        return lambda x: high_test(x,high_val)
    elif low_val is not None and high_val is not None :
        return lambda x: low_test(low_val,x) and high_test(x,high_val)
    else :
        return lambda x: True # identity with no constraints

# regex and function for parsing filter strings
numeric_regex_str = r'\d+(?:\.\d*)?(?:(?:e|E)-?\d+)?' # matches numeric-looking patterns, e.g. 1, 1.234, 1e4, 1.234E-5, etc.
separator_regex_str = r'(?:>|>=|<|<=)'
ids_regex_str = r'(?:IP|Control|Fold|qvalue|pvalue|IPvsEMP|IPvsCTR)'
filter_regex = re.compile('^(%(num)s)?(%(sep)s)?(%(id)s)(%(sep)s)?(%(num)s)?$'%{'num':numeric_regex_str,'sep':separator_regex_str,'id':ids_regex_str})

class FilterException(Exception) : pass

def parse_filter(filter_str) :
    """Parse a filter expression like "IP>100" or "1<Fold<=5".

    Returns (field name, predicate). Raises FilterException on bad syntax.
    """
    match = filter_regex.search(filter_str.strip())
    if match is None :
        raise FilterException('Filter %s is formatted incorrectly'%filter_str)
    low_val, low_test, field, high_test, high_val = match.groups()
    low_val = float(low_val) if low_val else low_val
    high_val = float(high_val) if high_val else high_val
    return field, make_condition(low_val,_cond_map[low_test],high_val,_cond_map[high_test])

# NOTE(review): these key names, index positions, and the 10**(x/-10)
# transform were copied from filter_macs_peaks.py and reflect MACS .xls
# column order -- verify they line up with GPSFile.FIELD_NAMES before
# relying on --sort-by
_sort_keys = {'length': lambda x: int(x[3]),
              'tags': lambda x: int(x[5]),
              'pvalue': lambda x: 10**(float(x[6])/-10),
              'fold_enrichment': lambda x: float(x[7]),
              'fdr': lambda x: float(x[8]),
             }


# FIX: summary previously claimed it was generated by filter_macs_peaks.py
summary_str = """\
# This output was generated by filter_gps_peaks.py, filtered from %(peaks_fn)s
# Number of peaks: %(num_recs)d
# Filters: %(filters)s
# Sorted by: %(sort_by)s
# Shuffled: %(shuffled)s
"""
if __name__ == '__main__' :

    opts, args = parser.parse_args(sys.argv[1:])

    if len(args) < 1 :
        parser.error('Must provide one GPS peaks file')

    if opts.output is not None and opts.encode_filters :
        parser.error('--output and --encode-filters options are mutually exclusive')

    # set where to write output
    if opts.encode_filters :
        # construct filename additions encoding the filters applied
        fn_str = ''
        opts.filters.sort()
        for filt in opts.filters :
            filter_str = filt.replace(' ','')
            filter_str = filter_str.replace('>=','_GTE_')
            filter_str = filter_str.replace('<=','_LTE_')
            filter_str = filter_str.replace('>','_GT_')
            filter_str = filter_str.replace('<','_LT_')
            fn_str += '_%s'%filter_str

        if opts.top is not None :
            fn_str += '_top%d'%opts.top

        if len(opts.sort_by) != 0 :
            fn_str += '_sortby_%s'%opts.sort_by

        if opts.shuffle :
            fn_str += '_shuffled'

        peaks_path,peaks_fn,peaks_basefn,peaks_ext = get_file_parts(args[0])
        encoded_fn = os.path.join(peaks_path,peaks_basefn+fn_str+peaks_ext)
        if opts.print_encoded_fn :
            sys.stdout.write(encoded_fn)
            sys.exit(0)
        else :
            out_f = open(encoded_fn,'w')
    elif opts.output :
        out_f = open(opts.output,'w')
    else :
        out_f = sys.stdout

    # parse the filters, grouped by the field each one constrains
    # (loop variable renamed -- previously shadowed the builtin `filter`)
    field_filters = defaultdict(list)
    for filter_expr in opts.filters :
        field, filter_cond = parse_filter(filter_expr)
        field_filters[field].append(filter_cond)

    # start processing GPS file
    peaks = GPSFile(args[0])

    # filter the records
    pass_recs = []
    for peak in peaks :
        # test each of the fields, if any one fails skip the record.
        # FIX: GPS P_-lg10/Q_-lg10 fields hold -log10(value), so
        # value = 10**(-field); the previous 10**(x/-10) conversion was
        # MACS's -10*log10 convention and overstated every p/q-value
        if not all([c(int(peak['IP'])) for c in field_filters['IP']]) or \
           not all([c(int(peak['Control'])) for c in field_filters['Control']]) or \
           not all([c(float(peak['Fold'])) for c in field_filters['Fold']]) or \
           not all([c(10**(-float(peak['Q_-lg10']))) for c in field_filters['qvalue']]) or \
           not all([c(10**(-float(peak['P_-lg10']))) for c in field_filters['pvalue']]) or \
           not all([c(float(peak['IPvsEMP'])) for c in field_filters['IPvsEMP']]) or \
           not all([c(float(peak['IPvsCTR'])) for c in field_filters['IPvsCTR']]) :
            continue
        else :
            pass_recs.append([peak[k] for k in GPSFile.FIELD_NAMES])

    if len(pass_recs) == 0 :
        warn('WARNING: no records remain after filtering\n')
        sys.exit(1)

    # sorting
    if opts.sort_by :
        pass_recs.sort(key=_sort_keys[opts.sort_by],reverse=opts.sort_dir != 'ASCEND')

    # top records
    num_recs = len(pass_recs) if not opts.top else min(len(pass_recs),opts.top)

    # construct the summary string
    filters_str = 'none' if len(opts.filters) == 0 else ', '.join(opts.filters)
    sort_str = 'original order' if not opts.sort_by else opts.sort_by+', '+opts.sort_dir
    shuffled_str = str(opts.shuffle)
    summary = summary_str%{'peaks_fn':args[0],'num_recs':num_recs,
                           'filters':filters_str,
                           'sort_by':sort_str,
                           'shuffled':shuffled_str}

    # print summary only
    if opts.summary :
        sys.stdout.write(summary)
        sys.exit(0)

    # write out the header cuz it's a nice thing to do
    if not opts.no_header :
        out_f.write(summary)
        out_f.write('\t'.join(GPSFile.FIELD_NAMES)+'\n')

    # write out records
    if opts.shuffle :
        shuffle(pass_recs)
    out_recs = pass_recs[:num_recs]

    for rec in out_recs :
        # rec[0] is a tuple of (chromosome,start pos,original string)
        out_f.write('\t'.join([rec[0][2]]+[str(x) for x in rec[1:]])+'\n')

    # good programming practice
    out_f.close()
# chipsequtil-master/scripts/filter_macs_peaks.py
#!/usr/bin/env python
# Filter, sort, and subset MACS peak records by user-supplied inequality
# expressions (e.g. "tags>100", "100<length<=200").

import re
import os
import sys
from collections import defaultdict
from optparse import OptionParser, SUPPRESS_HELP
from random import shuffle

from chipsequtil import MACSFile, MACSOutput, get_file_parts
from chipsequtil.util import MultiLineHelpFormatter as MF
from terminalcontroller import warn

usage = "%prog [options] <MACS peak file>"
# FIX: help typo corrected ("less than 1e9" -> "1e-9", matching the example)
description = """\
Filter MACS peaks by supplied criteria. Available filter features are:

length
tags
pvalue
fold_enrichment
fdr

Filters are provided as expressions using the [-f |--filter] option, e.g. the command

%prog -f "tags>100" --filter="pvalue<=1e-9" --filter="100<length<=200" <MACS peak file>

finds only peaks with more than 100 tags, a pvalue of less than 1e-9, and a length \
between 100, exclusive, and 200, inclusive. Any number of filters may be provided, \
and only peaks that match *all* filters pass. User is warned if filters result in \
zero results. Only inequality operators are valid. Invoking with no filter arguments \
returns all peaks. To sort, use the --sort-by option, e.g.

%prog -f "pvalue<=1e-9" --sort-by=pvalue <MACS peak file>

sorts peaks with a pvalue smaller than 1e-9 with the smallest pvalue peaks. All fields \
are sorted ascending by default. Output is prepended with comments describing what \
the file contains, i.e. which filters are applied, how many records there are, etc.

Note: MACS -10*log10(pvalue) values are converted to normal pvalues
"""

parser = OptionParser(usage=usage,description=description,formatter=MF())
parser.add_option('-f','--filter',dest='filters',default=[],action='append',help='add filter expression')
parser.add_option('--sort-by',dest='sort_by',default='',help='comma delimited list of features to sort by, filtered peaks are not sorted by default, if provided peaks are sorted ascending by default')
parser.add_option('--sort-dir',dest='sort_dir',type='choice',choices=['ASCEND','DESCEND'],default='ASCEND',help='direction to sort [default: %default]')
parser.add_option('--top',dest='top',type='int',default=None,help='accepts an integer, output at most this many peaks [default: all]')
parser.add_option('--output',dest='output',default=None,help='filename to output filtered peaks to [default: stdout]')
parser.add_option('--encode-filters',dest='encode_filters',action='store_true',help='write out records to a file <MACS peaks file>_<filters>.xls (incompatible with --output option)')
parser.add_option('--summary',dest='summary',action='store_true',help='only print out summary information for the filter')
parser.add_option('--no-header',dest='no_header',action='store_true',help='do not print out header or metadata info')
parser.add_option('--shuffle',dest='shuffle',action='store_true',help='shuffle order of filtered records, useful for selecting random peaks')

parser.add_option('--print-encoded-fn',dest='print_encoded_fn',action='store_true',help="print out the filename that would be created by --encode-filters")

# inequality tests used to assemble filter predicates
_lt = lambda x,y : x < y
_lte = lambda x,y : x <= y
_gt = lambda x,y : x > y
_gte = lambda x,y : x >= y
_cond_map = {'<':_lt,'<=':_lte,'>':_gt,'>=':_gte,None:None}

def make_condition(low_val=None,low_test=_gte,high_val=None,high_test=_lte) :
    """Build a predicate enforcing optional lower/upper bounds.

    low_val/high_val are floats or None (absent); low_test/high_test are the
    comparison functions parsed from the filter expression.
    """
    # FIX: compare bounds against None explicitly -- a bound of 0.0 is falsy
    # but is still a real constraint (e.g. the filter "0<fdr" was previously
    # ignored and matched every record)
    if low_val is not None and high_val is None :
        return lambda x: low_test(low_val,x)
    elif low_val is None and high_val is not None :
        return lambda x: high_test(x,high_val)
    elif low_val is not None and high_val is not None :
        return lambda x: low_test(low_val,x) and high_test(x,high_val)
    else :
        return lambda x: True # identity with no constraints

# regex and function for parsing filter strings
numeric_regex_str = r'\d+(?:\.\d*)?(?:(?:e|E)-?\d+)?' # matches numeric-looking patterns, e.g. 1, 1.234, 1e4, 1.234E-5, etc.
separator_regex_str = r'(?:>|>=|<|<=)'
ids_regex_str = r'(?:tags|pvalue|fold_enrichment|fdr|length)'
filter_regex = re.compile('^(%(num)s)?(%(sep)s)?(%(id)s)(%(sep)s)?(%(num)s)?$'%{'num':numeric_regex_str,'sep':separator_regex_str,'id':ids_regex_str})

class FilterException(Exception) : pass

def parse_filter(filter_str) :
    """Parse a filter expression like "tags>100" or "100<length<=200".

    Returns (field name, predicate). Raises FilterException on bad syntax.
    """
    match = filter_regex.search(filter_str.strip())
    if match is None :
        raise FilterException('Filter %s is formatted incorrectly'%filter_str)
    low_val, low_test, field, high_test, high_val = match.groups()
    low_val = float(low_val) if low_val else low_val
    high_val = float(high_val) if high_val else high_val
    return field, make_condition(low_val,_cond_map[low_test],high_val,_cond_map[high_test])

# sort keys index positions in the MACS .xls record order
# (chr,start,end,length,summit,tags,-10*log10(pvalue),fold_enrichment,FDR(%))
# NOTE(review): --sort-by help text says "comma delimited list" but only a
# single key is supported by the lookup below -- confirm intended behavior
_sort_keys = {'length': lambda x: int(x[3]),
              'tags': lambda x: int(x[5]),
              'pvalue': lambda x: 10**(float(x[6])/-10),
              'fold_enrichment': lambda x: float(x[7]),
              'fdr': lambda x: float(x[8]),
             }


summary_str = """\
# This output was generated by filter_macs_peaks.py, filtered from %(macs_fn)s
# Number of peaks: %(num_recs)d
# Filters: %(filters)s
# Sorted by: %(sort_by)s
# Shuffled: %(shuffled)s
"""
if __name__ == '__main__' :

    opts, args = parser.parse_args(sys.argv[1:])

    if len(args) < 1 :
        parser.error('Must provide one MACS peaks file')

    if opts.output is not None and opts.encode_filters :
        parser.error('--output and --encode-filters options are mutually exclusive')

    # set where to write output
    if opts.encode_filters :
        # construct filename additions encoding the filters applied
        fn_str = ''
        opts.filters.sort()
        for filt in opts.filters :
            filter_str = filt.replace(' ','')
            filter_str = filter_str.replace('>=','_GTE_')
            filter_str = filter_str.replace('<=','_LTE_')
            filter_str = filter_str.replace('>','_GT_')
            filter_str = filter_str.replace('<','_LT_')
            fn_str += '_%s'%filter_str

        if opts.top is not None :
            fn_str += '_top%d'%opts.top

        if len(opts.sort_by) != 0 :
            fn_str += '_sortby_%s'%opts.sort_by

        if opts.shuffle :
            fn_str += '_shuffled'

        macs_path,macs_fn,macs_basefn,macs_ext = get_file_parts(args[0])
        encoded_fn = os.path.join(macs_path,macs_basefn+fn_str+macs_ext)
        if opts.print_encoded_fn :
            sys.stdout.write(encoded_fn)
            sys.exit(0)
        else :
            out_f = open(encoded_fn,'w')
    elif opts.output :
        out_f = open(opts.output,'w')
    else :
        out_f = sys.stdout

    # parse the filters, grouped by the field each one constrains
    # (loop variable renamed -- previously shadowed the builtin `filter`)
    field_filters = defaultdict(list)
    for filter_expr in opts.filters :
        field, filter_cond = parse_filter(filter_expr)
        field_filters[field].append(filter_cond)

    # start processing MACS file
    peaks = MACSFile(args[0])

    # filter the records
    pass_recs = []
    for peak in peaks :
        # test each of the fields, if any one fails skip the record;
        # -10*log10(pvalue) is converted back to a plain pvalue for testing
        if not all([c(int(peak['length'])) for c in field_filters['length']]) or \
           not all([c(int(peak['tags'])) for c in field_filters['tags']]) or \
           not all([c(10**(float(peak['-10*log10(pvalue)'])/-10)) for c in field_filters['pvalue']]) or \
           not all([c(float(peak['fold_enrichment'])) for c in field_filters['fold_enrichment']]) or \
           not all([c(float(peak['FDR(%)'])) for c in field_filters['fdr']]) :
            continue
        else :
            pass_recs.append([peak[k] for k in MACSOutput.FIELD_NAMES])

    if len(pass_recs) == 0 :
        warn('WARNING: no records remain after filtering\n')
        sys.exit(1)

    # sorting
    if opts.sort_by :
        pass_recs.sort(key=_sort_keys[opts.sort_by],reverse=opts.sort_dir != 'ASCEND')

    # top records
    num_recs = len(pass_recs) if not opts.top else min(len(pass_recs),opts.top)

    # construct the summary string
    filters_str = 'none' if len(opts.filters) == 0 else ', '.join(opts.filters)
    sort_str = 'original order' if not opts.sort_by else opts.sort_by+', '+opts.sort_dir
    shuffled_str = str(opts.shuffle)
    summary = summary_str%{'macs_fn':args[0],'num_recs':num_recs,
                           'filters':filters_str,
                           'sort_by':sort_str,
                           'shuffled':shuffled_str}

    # print summary only
    if opts.summary :
        sys.stdout.write(summary)
        sys.exit(0)

    # write out the header cuz it's a nice thing to do
    if not opts.no_header :
        out_f.write(summary)
        out_f.write('\t'.join(MACSOutput.FIELD_NAMES)+'\n')

    # write out records
    if opts.shuffle :
        shuffle(pass_recs)
    out_recs = pass_recs[:num_recs]

    for rec in out_recs :
        out_f.write('\t'.join(map(str,rec))+'\n')

    # good programming practice
    out_f.close()
# chipsequtil-master/scripts/filter_mapped_known_genes.py
#!/usr/bin/env python
# Select columns and rows from join_mapped_known_genes.py output based on
# which input files mapped to each gene (via the BIND:/AFFY: MAPPED columns).

import re
import sys

from csv import reader, writer
from collections import defaultdict as dd
from optparse import OptionParser

from chipsequtil.util import MultiLineHelpFormatter as MF

usage = '%prog [options] <mapped known genes file>'
description = """Filter columns and rows from *join_mapped_known_genes.py* output which was \
invoked with *--binary-plus* and *--field-types* flags. Specify full column names for either \
binding or expression data with the *--bind-cols* and *--affy-cols* arguments, respectively. \
The special fieldname *MAPPED* from *join_mapped_known_genes.py* is used to determine whether \
a file contains a mapping for each gene. To filter genes by their associated binding or \
expression data, specify *--bind-filter* or *--affy-filter* as follows:

  - *any* - report gene if at least one input file maps to the gene
  - *all* - report gene if every input file maps to the gene
  - *absent* - report gene if no input file maps to the gene
  - *none* - do not filter genes at all (default)

Results of binding and expression filters are 'and'ed together, e.g.:

--bind-filter=all --affy-filter=absent

returns only genes for which all binding files and none of the expression files map.
"""
epilog='Note: when specifying column names, be sure to escape characters like (,),&,*,etc... \
that shells interpret with a \\, e.g. --bind-cols=-10\\*log10\\(pvalue\\)'
parser = OptionParser(usage=usage,description=description,epilog=epilog, formatter=MF())
parser.add_option('--bind-cols',dest='bind_cols',default='',help='comma delimited list of binding data column names to include, [default: all]')
parser.add_option('--affy-cols',dest='affy_cols',default='',help='comma delimited list of expression data column names to include, [default: all]')
parser.add_option('--bind-filter',dest='bind_filt',type='choice',choices=['any','all','absent','none'],default='none',help='gene set to include based on binding data [default: %default]')
parser.add_option('--affy-filter',dest='affy_filt',type='choice',choices=['any','all','absent','none'],default='none',help='gene set to include based on expression data [default: %default]')
parser.add_option('--output',dest='output',default=None,help='write output to file')


def match_headers(patts,field) :
    """Return True if *field* ends with any pattern in *patts*.

    An empty pattern ('') matches every field, which is how the default
    "include all columns" behavior works.
    """
    for p in patts :
        if field.endswith(p) : return True
    return False

def filter_vector(filt_type,vec) :
    """Apply an any/all/absent/none test to a vector of '0'/'1' MAPPED flags.

    Parameter renamed from `type` -- it shadowed the builtin.
    """
    if filt_type == 'any' :
        return '1' in vec
    elif filt_type == 'all' :
        return all([x=='1' for x in vec])
    elif filt_type == 'absent' :
        return not ('1' in vec)
    else :
        return True # 'none': no row filtering

if __name__ == '__main__' :

    opts, args = parser.parse_args(sys.argv[1:])

    if len(args) != 1 :
        parser.error('Exactly one mapped file must be provided')

    map_fn = args[0]

    map_reader = reader(open(map_fn),delimiter='\t')
    # FIX: use the builtin next() instead of the Python 2-only .next() method
    headers = next(map_reader)
    # column indices by data type; *_map_headers are the binary MAPPED flags
    bind_headers = [i for i,x in enumerate(headers) if x.startswith('BIND:')]
    bind_map_headers = [i for i,x in enumerate(headers) if x.startswith('BIND:') and x.endswith('MAPPED')]
    affy_headers = [i for i,x in enumerate(headers) if x.startswith('AFFY:')]
    affy_map_headers = [i for i,x in enumerate(headers) if x.startswith('AFFY:') and x.endswith('MAPPED')]

    if len(bind_headers) == 0 and len(affy_headers) == 0 :
        parser.error('No BIND: or AFFY: columns were found in the mapping, was *join_mapped_known_genes.py* run with the *--field-types* option?')

    # figure out which columns user wants
    header_indices = [0,1] # always output knowngene and symbol

    bind_header_patts = opts.bind_cols.split(',')
    header_indices += [i for i in bind_headers if match_headers(bind_header_patts,headers[i])]

    affy_header_patts = opts.affy_cols.split(',')
    header_indices += [i for i in affy_headers if match_headers(affy_header_patts,headers[i])]

    out_f = open(opts.output,'w') if opts.output else sys.stdout
    map_writer = writer(out_f,delimiter='\t')

    map_writer.writerow([headers[i] for i in header_indices])
    for rec in map_reader :
        # a row passes only if BOTH the binding and expression filters accept
        # its MAPPED flag vectors
        bind_vector = [rec[i] for i in bind_map_headers]
        bind_pass = filter_vector(opts.bind_filt,bind_vector)

        affy_vector = [rec[i] for i in affy_map_headers]
        affy_pass = filter_vector(opts.affy_filt,affy_vector)

        if bind_pass and affy_pass :
            map_writer.writerow([rec[i] for i in header_indices])

    if opts.output : out_f.close()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chipsequtil-master/scripts/generate_stats_doc.py Mon Mar 07 16:18:10 2016 -0500 @@ -0,0 +1,25 @@ +#!/usr/bin/env python + +from matplotlib.pyplot import * + +from reStUtil import * + +if __name__ == '__main__' : + + # read stats + # - common read sequences + # - overall quality scores + + + # alignment stats + # - # alignments + # - uniquely aligned + # - multi reads + # - fail filter + # - alignments per chromosome bar chart + + + # peak stats + + + # motif stats and plots
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chipsequtil-master/scripts/gerald_stats.py Mon Mar 07 16:18:10 2016 -0500 @@ -0,0 +1,113 @@ +#!/usr/bin/env python + +import sys, re, os +from datetime import datetime +from optparse import OptionParser +from collections import defaultdict as dd +#from progressbar import ProgressBar +from csv import reader, writer + +from chipsequtil import get_file_parts +from chipsequtil.util import MultiLineHelpFormatter as MF +from reStUtil import ReStDocument, ReStSimpleTable + +usage = "%prog [options] <filename> [<filename>...]" +description="""\ +Outputs various stats about the GERALD formatted file(s) input. If multiple +files are provided statistics are aggregated according to the specified output +format. Output formats available via --format=X : + + # *python* - print an eval()'able python dictionary w/ counts + # *rst* - print statistics in a reStructured text table (default) + # *tab* - print statistics in a tab delimited form w/ header names + +Except for *python* format, each input file has its own output line. *python* +summarizes all alignments. 
+""" + +parser = OptionParser(usage=usage,description=description,formatter=MF()) +parser.add_option('--output',dest='output',default=None,help='write output to file [default: stdout]') +parser.add_option('--format',dest='format',type='choice',choices=['python','rst','tab'],default='rst',help='format to print out stats [default: %default]') + +def log(st) : + print datetime.now().isoformat()+' - '+st + +re_digits_nondigits = re.compile(r'\d+|\D+') +def format_with_commas(value,format='%s'): + parts = re_digits_nondigits.findall(format % (value,)) + for i in xrange(len(parts)): + s = parts[i] + if s.isdigit(): + parts[i] = _commafy(s) + break + return ''.join(parts) + +def _commafy(s): + + r = [] + for i, c in enumerate(reversed(s)): + if i and (not (i % 3)): + r.insert(0, ',') + r.insert(0, c) + return ''.join(r) + +if __name__ == '__main__' : + + opts,args = parser.parse_args(sys.argv[1:]) + + gerald_fns = args + + all_stats = dd(int) + stat_dicts = {} + stats_fields = ["sample", + "total alignments", + "% align unique", + "# reads aligned unique", + "% align repeat", + "# reads align repeat", + "% align none", + "# reads align none" + ] + + + data_rows = [] + for gerald_fn in gerald_fns : + stats = stat_dicts[gerald_fn] = dd(int) + + fnpath,fn,fnbase,fnext = get_file_parts(gerald_fn) + gerald_lines = reader(open(gerald_fn),delimiter='\t') + for row in gerald_lines : + m = re.match('^(\d+):(\d+):(\d+)$',row[10]) + if m is not None : + stats['multiread'] += 1 + all_stats['multiread'] += 1 + else : + stats[row[10]] += 1 + all_stats[row[10]] += 1 + + tot_reads = sum(stats.values())/1.-stats.get('QC',0) + unique_reads = sum([v for k,v in stats.items() if k.startswith('chr')]) + repeat_reads = stats.get('multiread',0) + nomap_reads = stats.get('NM',0) + data_row = [fn,format_with_commas(int(tot_reads)), + '%.1f'%(unique_reads/tot_reads*100),format_with_commas(unique_reads), + '%.1f'%(repeat_reads/tot_reads*100),format_with_commas(repeat_reads), + 
'%.1f'%(nomap_reads/tot_reads*100),format_with_commas(nomap_reads)] + + data_rows.append(data_row) + + out_f = open(opts.output,'w') if opts.output is not None else sys.stdout + + if opts.format == 'python' : + out_f.write(dict(all_stats)) + elif opts.format == 'rst' : + doc = ReStDocument(out_f) + table = ReStSimpleTable(header=stats_fields,data=data_rows) + doc.add(table) + doc.write() + elif opts.format == 'tab' : + out_w = writer(out_f,delimiter='\t') + out_w.writerow(stats_fields) + out_w.writerows(data_rows) + + if opts.output is not None : out_f.close()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chipsequtil-master/scripts/gerald_to_bed.py Mon Mar 07 16:18:10 2016 -0500 @@ -0,0 +1,82 @@ +#!/usr/bin/env python + +import os +import re +import sys + +from optparse import OptionParser +from csv import DictReader, DictWriter +from chipsequtil import get_file_parts, GERALDOutput + +usage = "%prog [options] <GERALD file> [<GERALD file>...]" + +description = """\ +Convert the GERALD alignment formatted files into BED format. Input file named +<path>/<filename>.<ext> is translated into <path>/<filename>.bed unless --output +or --stdout is specified, in which case formatted lines are written to file or +standard output, respectively. If multiple input files are supplied with the +--output or --stdout option all formatted lines are concatenated together. +Formatting only occurs for GERALD input lines that have a valid Match Position +field (i.e. successfully aligned somewhere).""" + +parser = OptionParser(usage=usage, description=description) +parser.add_option('--output',dest='output',default=None,help='write all records to file') +parser.add_option('--stdout',dest='stdout',action='store_true',help='write out all formatted lines to stdout') +parser.add_option('--min-fields',dest='min_fields',action='store_true',help='only format the first three fields') +parser.add_option('--pass-only',dest='pass_only',action='store_true',help='only format lines with Y in the Pass Filtering field') +parser.add_option('--chromo-strip',dest='chromo_strip',default='.fa',help='pattern to remove from chromo field in BED output (e.g. 
--chromo-strip=.fa to remve .fa from chrX.fa) [default: %default]') + + + +if __name__ == '__main__' : + + opts,args = parser.parse_args(sys.argv[1:]) + + if len(args) == 0 : + parser.print_usage() + sys.exit(1) + + gerald_fns = args + + # step through the files + for gerald_fn in gerald_fns : + path,fn,fnbase,fnext = get_file_parts(gerald_fn) + bed_lines = [] + + + # where to write output to + if opts.stdout : + f_out = sys.stdout + else : + f_out = open(os.path.join(path,fnbase+'.bed'),'w') + + # process input + gerald_d = DictReader(open(gerald_fn),fieldnames=GERALDOutput.FIELD_NAMES,delimiter='\t') + for line_d in gerald_d : + if (opts.pass_only and line_d['filtering'] == 'Y' and line_d['match_pos'] != '') or (not opts.pass_only and line_d['match_pos'] != '') : + + if opts.chromo_strip is not None : + line_d['match_chromo'] = line_d['match_chromo'].replace(opts.chromo_strip,'') + + outline = [line_d['match_chromo'], # chromosome + line_d['match_pos'], # start + str(int(line_d['match_pos'])+len(line_d['read'])), # end + line_d['read'], # read + '0', # score + '+' if line_d['match_strand'] == 'F' else '-', # strand + '-', # thickStart + '-', # thickEnd + '0,0,255' if line_d['match_strand'] == 'F' else '255,0,0', # itemRgb + ] + outline = '\t'.join(outline) + f_out.write(outline+'\n') + #bed_lines.append(bed) + + # this is the slow way + #for line in open(gerld_fn) : + # grld = GERALDOutput(line) + # if (opts.pass_only and grld.filtering == 'Y' and grld.match_pos != '') or (not opts.pass_only and grld.match_pos != '') : + # bed = gerald_to_bed(grld,opts.min_fields) + # f_out.write(bed.output_format()) + # #bed_lines.append(bed) +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chipsequtil-master/scripts/integrate_macs_ucsc.py Mon Mar 07 16:18:10 2016 -0500 @@ -0,0 +1,112 @@ +#!/usr/bin/env python + +import os +import sys +from optparse import OptionParser +from pypeline import Pypeline, ProcessPypeStep as PPS, PythonPypeStep as PyPS + +from chipsequtil import get_org_settings + +usage = "%prog <org> <stage dir> <stage url> <MACS wiggle directory>" +description = """Process a MACS wiggle directory when macs is invoked +with --wig option, convert all gzipped chromosome wiggle files to +bigWig format, copy to web staging directory <stage dir>, and create +track lines for adding to UCSC genome browser. Requires a <org> argument +that has a path using *org_settings.py <org> ucsc_chrom_sizes* that +points to a sizes file as created by UCSC's *fetchChromSizes <org>* +tool.""" + +parser = OptionParser(usage=usage,description=description) +parser.add_option('--auto',dest='auto',action='store_true',help='run all steps non-interactively (for batch mode, e.g.)') + +if __name__ == '__main__' : + + opts, args = parser.parse_args(sys.argv[1:]) + + if len(args) != 4 : + parser.error('Exactly four non-option arguments required') + + organism, stage_dir, stage_url, macs_dir = args + + pipeline = Pypeline('UCSC Integration',log='ucsc_integ.log') + + steps = [] + + org_settings = get_org_settings(organism) + + macs_path, macs_wiggle_path = os.path.dirname(macs_dir), os.path.basename(macs_dir) + macs_name = macs_wiggle_path.replace('_MACS_wiggle','') + wiggle_dir = macs_name+'_MACS_wiggle' + bigwig_fn = macs_name+'_%s_all_chr.bw' + d = {'wiggle_dir':macs_name+'_MACS_wiggle', + 'chrom_sizes':org_settings['ucsc_chrom_sizes'], + 'treat_bigwig_fn':macs_name+'_treat_all_chr.bw', + 'control_bigwig_fn':macs_name+'_control_all_chr.bw', + 'stage_dir':stage_dir, + 'stage_url':stage_url, + 'pwd':os.getcwd(), + } + + # create bigWig files + zcat_treat_call = "zcat %(wiggle_dir)s/treat/*.gz | " + \ + "grep -v '^track' | 
" + \ + "sed 's/\.fa//g' | " + \ + "wigToBigWig -clip stdin %(chrom_sizes)s " + \ + "%(wiggle_dir)s/treat/%(treat_bigwig_fn)s" + zcat_control_call = "zcat %(wiggle_dir)s/control/*.gz | " + \ + "grep -v '^track' | " + \ + "sed 's/\.fa//g' | " + \ + "wigToBigWig -clip stdin %(chrom_sizes)s " + \ + "%(wiggle_dir)s/control/%(control_bigwig_fn)s" + steps.append(PPS('Convert wig to bigWig',[zcat_treat_call%d,zcat_control_call%d])) + + # create the staging directory + mk_stage_dir_call = "mkdir -p %(stage_dir)s/%(wiggle_dir)s"%d + steps.append(PPS('Create staging directory',[mk_stage_dir_call])) + + # stage bigWig files to staging directory (create links) + stage_treat_call = "ln -fs %(pwd)s/%(wiggle_dir)s/treat/%(treat_bigwig_fn)s " + \ + "%(stage_dir)s/%(wiggle_dir)s/%(treat_bigwig_fn)s" + stage_control_call = "ln -fs %(pwd)s/%(wiggle_dir)s/control/%(control_bigwig_fn)s " + \ + "%(stage_dir)s/%(wiggle_dir)s/%(control_bigwig_fn)s" + steps.append(PPS('Stage bigWig files',[stage_treat_call%d,stage_control_call%d])) + + # generate track lines for treatment and control + treat_track_d = ['track', + 'type=bigWig', + 'name="Treatment"', + 'description="%s Treatment"'%macs_name, + 'bigDataUrl=%(stage_url)s/%(wiggle_dir)s/%(treat_bigwig_fn)s'%d] + treat_track = ' '.join(treat_track_d) + + control_track_d = ['track', + 'type=bigWig', + 'name="Control"', + 'description="%s Control"'%macs_name, + 'bigDataUrl=%(stage_url)s/%(wiggle_dir)s/%(control_bigwig_fn)s'%d] + control_track = ' '.join(control_track_d) + track_str = '\n'.join([treat_track, + control_track]) + + track_fn = wiggle_dir+'_tracks.txt' + def track_call(track_fn, track_str) : + f = open(track_fn,'w') + f.write(track_str+'\n') + f.close() + steps.append(PyPS('Generate track lines file',track_call, + callable_args=(track_fn,track_str)) + ) + + #calls = [zcat_treat_call, + # zcat_control_call, + # mk_stage_dir_call, + # stage_treat_call, + # stage_control_call, + # track_call + # ] + + #print calls + 
#steps.append(PPS('Stage Wiggle',calls)) + + pipeline.add_steps(steps) + pipeline.run(interactive=not opts.auto)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chipsequtil-master/scripts/join_mapped_known_genes.py Mon Mar 07 16:18:10 2016 -0500 @@ -0,0 +1,154 @@ +#!/usr/bin/env python + +import sys +import warnings + +from csv import reader, writer +from collections import defaultdict as dd +from optparse import OptionParser + +usage = '%prog -b <mapped DNA binding file>|-a <mapped microarray file> [-b <mapped DNA binding file> ...] [-a <mapped microarray file> ...]' +description = """Join all files on the first column, concatenating records with \ +matching entries onto one line per entry. Understands DNA binding data as mapped \ +with *map_peaks_to_known_genes.py* utility microarray data as mapped by \ +*probeset_to_known_genes.py* utility, passed to program using *-b* and *-a* options \ +respectively. If a file contains more than one mapping to a gene additional columns \ +are added. At least one file of either type is required. Field names are written as \ +<filename>.<original field name>.<map number> +""" +epilog="Note: microarray files should have been created by bioconductor, and all files should have a row of fieldnames as the first line" +parser = OptionParser(usage=usage,description=description,epilog=epilog) +parser.add_option('-a','--affy-file',dest='affy_file',action='append',default=[],help='add a mapped microarray file') +parser.add_option('-b','--bind-file',dest='bind_file',action='append',default=[],help='add a mapped DNA binding file (e.g. 
MACS, BED)') +#parser.add_option('-b','--bed-file',dest='bed_file',action='append',default=[],help='add a mapped BED formatted peaks file') +parser.add_option('-m','--macs-file',dest='macs_file',action='append',default=[],help='DEPRECATED: use -b instead, add a mapped default MACS formatted peaks (*.xls) file') +parser.add_option('--output',dest='output',default=None,help='file to output joined records to [default: stdout]') +#parser.add_option('--intersect',dest='intersect',action='store_true',help='only output records common to all file passed in') +parser.add_option('--first-only',dest='first_only',action='store_true',help='only output the first mapping to a gene from each file') +parser.add_option('--binary',dest='binary',action='store_true',help='output only one column per file with a 0 or 1 to indicate whether a mapping exists in that file') +parser.add_option('--binary-plus',dest='binary_plus',action='store_true',help='output one column per file with a 0 or 1 to indicate whether a mapping exists in that file in addition to all other columns') +parser.add_option('--field-types',dest='field_types',action='store_true',help='prepend BIND or AFFY to the beginning of all appropriate columns') +#parser.add_option('--symbols',dest='symbols',action='store_true',help='mapped files contain symbols in second column (per map_peaks_to_known_genes.py|probeset_to_known_gene.py --symbol-xref option)') + +if __name__ == '__main__' : + + opts,args = parser.parse_args(sys.argv[1:]) + + if len(args) > 0 : + parser.error('There were non-option command line arguments passed, all files should have a preceeding option indicating filetype') + + if len(opts.macs_file) != 0 : + warnings.warn('The -m option is deprecated, please replace these flags with -b instead. 
Adding MACS filenames to binding filename list.',DeprecationWarning) + opts.bind_file.extend(opts.macs_file) + + if len(opts.bind_file) == 0 and len(opts.affy_file) == 0 : + parser.error('No files were passed in, aborting') + + # union of all genes + all_genes = set() + + # TODO - fix intersect w/ binary + opts.intersect = False + + # TODO - actually make this an option, or the default + opts.symbols = True + if opts.symbols : + symbol_map = {} + + # read all the files in + def get_file_dict(fns,header_prefix='') : + file_map = dd(lambda: dd(list)) + out_fieldnames = [] + blank_entry = [] + for fn in fns : + max_maps = 0 + f = reader(open(fn),delimiter='\t') + #f = open(fn) + fieldnames = f.next() + fieldnames = fieldnames[2:] # we don't want existing knownGeneID or geneSymbol + # read in the data, create a dictionary + for l in f : + if opts.symbols : + gene, symbol, data = l[0],l[1],l[2:] + symbol_map[gene] = symbol + else : + gene, data = l.split('\t',1) + file_map[fn][gene].append(data) + max_maps = max(max_maps,len(file_map[fn][gene])) + all_genes.add(gene) + + # if we're adding a binary column, do it + if opts.binary_plus : + out_fieldnames.append(header_prefix+fn+'.MAPPED') + + # construct the fieldnames for this file + for i in range(max_maps) : + out_fieldnames.extend(['%s%s.%d.%s'%(header_prefix,fn,i,h) for h in fieldnames]) + + # pad out data entries w/ fewer than max_maps + for gene,data in file_map[fn].items() : + while len(data) < max_maps : + data.append(['']*len(fieldnames)) + file_map[fn]['blank'] = [['']*len(fieldnames) for _ in range(max_maps)] + return file_map,out_fieldnames + + #macs_file_map, macs_fieldnames = get_file_dict(opts.macs_file) + #bed_file_map, bed_fieldnames = get_file_dict(opts.bed_file) + bind_prefix = 'BIND:' if opts.field_types else '' + affy_prefix = 'AFFY:' if opts.field_types else '' + bind_file_map, bind_fieldnames = get_file_dict(opts.bind_file,bind_prefix) + affy_file_map, affy_fieldnames = 
get_file_dict(opts.affy_file,affy_prefix) + + # prepare output objects + out_f = open(opts.output,'w') if opts.output else sys.stdout + map_fieldnames = ['knownGeneID'] + if opts.symbols : + map_fieldnames.append('geneSymbol') + #all_fieldnames = map_fieldnames+macs_fieldnames+bed_fieldnames+affy_fieldnames + all_fieldnames = map_fieldnames+bind_fieldnames+affy_fieldnames + if opts.binary : + #all_fieldnames = map_fieldnames+opts.macs_file+opts.bed_file+opts.affy_file + all_fieldnames = [x+'.MAPPED' for x in map_fieldnames+opts.bind_file+opts.affy_file] + join_writer = writer(out_f,delimiter='\t') + join_writer.writerow(all_fieldnames) + + # go through all the genes and print out lines + for gene in all_genes : + gene_line = [gene] + if opts.symbols : + gene_line.append(symbol_map[gene]) + #for filetype_data,fns in zip([macs_file_map,bed_file_map,affy_file_map],[opts.macs_file,opts.bed_file,opts.affy_file]) : + for filetype_data,fns in zip([bind_file_map,affy_file_map],[opts.bind_file,opts.affy_file]) : + for fn,recs in [(fn,filetype_data[fn]) for fn in fns] : + #for fn,recs in d.items() : + if recs.has_key(gene) : + # only output the first entry + if opts.first_only : + gene_line.extend(recs[gene][0]) + # only output a 1 or a zero + elif opts.binary : + gene_line.extend('1') + # else output normally + else : + # add binary column in addition to other output + if opts.binary_plus : + gene_line.extend('1') + for rec in recs[gene] : + gene_line.extend(rec) + else : + # if intersecting, ignore this gene + if opts.intersect : + continue + elif opts.binary : + gene_line.extend('0') + else : + # add binary column in addition to other output + if opts.binary_plus : + gene_line.extend('0') + for blank in filetype_data[fn]['blank'] : + #print len(blank) + gene_line.extend(blank) + #print fn, gene_line[2], len(gene_line), gene_line + join_writer.writerow(gene_line) + + if opts.output : out_f.close()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chipsequtil-master/scripts/kg_to_gff.py Mon Mar 07 16:18:10 2016 -0500 @@ -0,0 +1,78 @@ +#!/usr/bin/env python + +import os +import sys +from csv import DictReader, DictWriter, QUOTE_NONE +from optparse import OptionParser + +from chipsequtil import KnownGeneFile, get_file_parts + +#args = ['/nfs/genomes/mouse_gp_jul_07/anno/knownGene-2010-07-08.txt','/nfs/genomes/mouse_gp_jul_07/anno/kgXref-2010-07-08.txt'] +args = ['/nfs/genomes/mouse_gp_jul_07/anno/knownGene-2010-08-03.gtf','/nfs/genomes/mouse_gp_jul_07/anno/kgXref-2010-07-08.txt'] +usage = '%prog <knownGene annotation>' +description = 'convert a UCSC knownGene annotation to GFF' +parser = OptionParser(usage=usage,description=description) + + +if __name__ == '__main__' : + + opts, args = parser.parse_args(args) + + kg_path,kg_fn,kg_base,kg_ext = get_file_parts(args[0]) + #kg_f = KnownGeneFile(args[0]) + + # xref for finding gene symbols + kgXref_fn = args[1] + kgXref_fieldnames = ['kgID','mRNA','spID','spDisplayID','geneSymbol','refseq','proAcc','description'] + xref_map = dict([(x['kgID'],x) for x in DictReader(open(kgXref_fn),delimiter='\t',fieldnames=kgXref_fieldnames)]) + + gff_headers = ['seqname','source','feature','start','end','score','strand','frame','attributes'] + gff_reader = DictReader(open(args[0]),delimiter='\t',fieldnames=gff_headers) + gff_writer = DictWriter(sys.stdout,delimiter='\t',fieldnames=gff_headers,quotechar='',quoting=QUOTE_NONE,lineterminator='\n') + #gff_writer.writerow(dict([(x,x) for x in gff_headers])) + + for i,rec in enumerate(gff_reader) : + #d = {} + #d['seqname'] = rec['chrom'] + #d['source'] = 'UCSC_knownGene' + #d['feature'] = 'gene' + #d['start'] = rec['txStart'] + #d['end'] = rec['txEnd'] + #d['score'] = '.' + #d['strand'] = rec['strand'] + #d['frame'] = '.' 
+ #gene_name = rec['name'] + + gff_attrs_lst = [x.strip() for x in rec['attributes'].split(';')][:-1] + gff_attrs = {} + for attr in gff_attrs_lst : + k,v = attr.split(' ',1) + gff_attrs[k] = eval(v) + + kg_name = gff_attrs['gene_id'] + + # try to find a gene symbol + gene_id = xref_map[kg_name].get('geneSymbol',None) + #gene_id = kg_name + #if gene_id is None : + # gene_id = xref_map[kg_name].get('mRNA',None) + #if gene_id is None : + # gene_id = xref_map[kg_name].get('refseq',None) + if gene_id is None : # I give up + gene_id = kg_name + + gff_attrs_lst += ['gene_name "%s"'%gene_id] + rec['attributes'] = '; '.join(gff_attrs_lst) + gff_writer.writerow(rec) + + # now write the exons + #d['feature'] = 'exon' + #for j,(st,en) in enumerate(zip(rec['exonStarts'],rec['exonEnds'])) : + # d['start'] = st + # d['end'] = en + # d['attributes'] = '; '.join(['gene_id "%s"'%gene_id,'transcript_id "%s"'%rec['name'],'exon_number "%d"'%(j+1),'ID "%s.exon_%d"'%(rec['name'],j),'PARENT "%s"'%rec['name']]) + # gff_writer.writerow(d) + + + # version with knownGene in gene_name + # version with symbol in gene_name
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chipsequtil-master/scripts/map_intervals.py Mon Mar 07 16:18:10 2016 -0500 @@ -0,0 +1,126 @@ +#!/usr/bin/env python + +import sys + +from collections import defaultdict +from csv import reader +from optparse import OptionParser + +from bx.intervals.intersection import IntervalTree, Interval + +usage = '%prog [options] <from> <to>' +description = """Find records in <to> interval file that map to records in +<from> interval file. Files should be tab delimited and are expected to have +a chromosome column, a start column, and an end column. The indices of these +columns can be specified on the command line but by default are the first +three columns, respectively. Prints out to stdout by default one new line +separated row per row in <from> with a line from <to> where there is a mapping. +If no mapping is found (e.g. when specifying a maximum margin to search within) +the word None is printed. By default only prints nearest record, with ties +settled by smallest line number in <to>.""" +parser = OptionParser(usage=usage,description=description) +parser.add_option('-w','--window',dest='window',type="float",nargs=2, + default=(1e9,1e9), + help="window as <int upstream> <int downstream> to search for intervals [default: %default]") +parser.add_option('-f','--from',dest='from_ind',type="int",nargs=3, + default=(0,1,2), + help="coordinates of chromosome, start, stop in <from> file") +parser.add_option('-i','--skip-from-header',dest='skip_fh',action='store_true', + help="<from> has a header that should be skipped") +parser.add_option('-t','--to',dest='to_ind',type="int",nargs=3, + default=(0,1,2), + help="coordinates of chromosome, start, stop in <to> file") +parser.add_option('-j','--skip-to-header',dest='skip_th',action='store_true', + help="<to> has a header that should be skipped") + +if __name__ == '__main__' : + + opts, args = parser.parse_args(sys.argv[1:]) + + if len(args) != 2 : + parser.error('Exactly 2 non-option 
# window default is 1e9 because no chromosome is more than
# ten billion base pairs, right?!
def find_nearest(t, s, e, window=(1e9,1e9)):
    """Return the interval in tree *t* nearest to [s, e], or None.

    Overlapping intervals win outright (first element of the sorted list
    returned by find()).  Otherwise the closer of the nearest upstream /
    downstream neighbors within window=(max_upstream, max_downstream) is
    returned; on a distance tie the downstream neighbor wins.
    """
    # any interval overlapping the query is automatically nearest
    overlapping = t.find(s, e)
    if overlapping:
        return overlapping[0]

    query = Interval(s, e)
    upstream = t.upstream_of_interval(query, max_dist=window[0])
    downstream = t.downstream_of_interval(query, max_dist=window[1])
    upstream = upstream[0] if upstream else None
    downstream = downstream[0] if downstream else None

    # guard clauses replace the original if/elif ladder; when one side is
    # missing the other (possibly None) is the answer
    if upstream is None:
        return downstream
    if downstream is None:
        return upstream

    # both candidates exist: compare edge-to-edge distances to the query
    up_dist = min(abs(upstream.end - s), abs(e - upstream.start))
    down_dist = min(abs(downstream.end - s), abs(e - downstream.start))
    return upstream if up_dist < down_dist else downstream
interval in tree' + t = find_nearest(chr_trees['chr2'],10388500,10388510) + print '\tCorrect answer: %s, Returned answer: %s'%('mmu-mir-466f-1',t.value),t + print 'interval is after any other interval in tree' + t = find_nearest(chr_trees['chr1'],200000000,200000010) + print '\tCorrect answer: %s, Returned answer: %s'%('mmu-mir-29c',t.value),t + print 'interval is between intervals' + t = find_nearest(chr_trees['chr3'],89773941,89774021) + print '\tCorrect answer: %s, Returned answer: %s'%('mmu-mir-190b',t.value),t + print 'interval is inside another interval' + t = find_nearest(chr_trees['chr3'],89873999,89874001) + print '\tCorrect answer: %s, Returned answer: %s'%('mmu-mir-190b',t.value), t + print 'interval is too far from anything to return anything' + t = find_nearest(chr_trees['chr3'],89773941,89774021,window=10) + print '\tCorrect answer: None, Returned answer: %s'%t + """
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chipsequtil-master/scripts/map_peaks_to_genes.py Mon Mar 07 16:18:10 2016 -0500 @@ -0,0 +1,202 @@ +#!/usr/bin/env python + +import sys, os +from optparse import OptionParser +from collections import defaultdict as dd +from chipsequtil import MACSOutput, BEDOutput, RefGeneOutput, parse_number +from csv import DictReader, DictWriter + +usage = '%prog [options] <refGene file> <peaks file>' +description = """ +Map the peaks in <peaks file> to genes in <refGene file>. <refGene file> is +format is as specified in http://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/refGene.sql. +<peaks file> format is as produced by MACS.""" +epilog = '' +parser = OptionParser(usage=usage,description=description,epilog=epilog) +parser.add_option('--upstream-window',dest='upst_win',type='int',default=5500,help='window width in base pairs to consider promoter region [default: %default]') +parser.add_option('--downstream-window',dest='dnst_win',type='int',default=2500,help='window width in base pairs to consider downstream region [default: %default]') +parser.add_option('--map-output',dest='peak_output',default=sys.stdout,help='filename to output mapped peaks in BED format to [default: stdout]') +parser.add_option('--stats-output',dest='stats_output',default=sys.stderr,help='filename to output summary stats in conversion [default: stderr]') +parser.add_option('--peaks-format',dest='peaks_fmt',default='MACS',type='choice',choices=['MACS','BED'],help='format of peaks input file [default: %default]') + +# TODO - options +#parser.add_option('--use-cds',dest='use_cds',action='store_true',help='use cdsStart and cdsEnd fields instead of txStart and txEnd to do mapping') +#parser.add_option('--capture-intergenic'...) 
def parse_gene_ref(ref_gene):
    """Parse a refGene annotation file into a per-chromosome dictionary.

    ref_gene -- open file-like object of tab-delimited refGene records

    Returns a defaultdict mapping chromosome name -> list of record dicts.
    Scalar fields are coerced with parse_number; the comma-separated
    exonStarts/exonEnds fields are expanded into lists of coordinates with
    the empty entry produced by the trailing comma dropped.
    """
    #FIXME - maybe, if galaxy doesn't work out, figure out how to deal with multiple RefGene mapping formats?
    fieldnames = ['geneName','name','chrom','strand','txStart','txEnd',
                  'cdsStart','cdsEnd','exonCount','exonStarts','exonEnds']
    reader = DictReader(ref_gene, fieldnames=fieldnames, delimiter='\t')
    gene_ref = dd(list)
    for ref_dict in reader:
        # coerce numbers where possible; iterate over a snapshot so the
        # dict can be mutated safely (py3 .items() is a live view, and the
        # original mutated the dict mid-iteration)
        for k, v in list(ref_dict.items()):
            ref_dict[k] = parse_number(v)

        # turn 'x,x,x,...' into a list; assumes parse_number leaves the
        # comma-separated string unconverted (matches original behavior)
        for key in ('exonStarts', 'exonEnds'):
            coords = [parse_number(x) for x in ref_dict[key].split(',')]
            # drop only the trailing empty entry from the trailing comma;
            # the original's remove('') deleted the *first* empty entry and
            # its [-1] raised IndexError on an empty list
            if coords and coords[-1] == '':
                coords.pop()
            ref_dict[key] = coords

        gene_ref[ref_dict['chrom']].append(ref_dict)

    return gene_ref
fraction of length of gene + output_fields = ['chromo', + 'peak loc', + 'accession #', + 'gene symbol', + 'strand', + 'map type', + 'map subtype', + 'score', + 'dist from feature', + ] + if opts.peak_output != sys.stdout : + opts.peak_output = open(opts.peak_output,'w') + peaks_writer = DictWriter(opts.peak_output,output_fields,delimiter='\t',lineterminator='\n') + unique_genes = set() + map_stats = dd(int) + for peak in peaks_reader : + + # if this is a comment or header line get skip it + if peak[fieldnames[0]].startswith('#') or \ + peak[fieldnames[0]] == fieldnames[0] or \ + peak[fieldnames[0]].startswith('track') : continue + + # coerce values to numeric if possible + for k,v in peak.items() : peak[k] = parse_number(v) + + # peak assumed to be in the middle of the reported peak range + peak_loc = (peak[start_field]+peak[end_field])/2 + + chrom_genes = gene_ref[peak[chr_field]] + + if len(chrom_genes) == 0 : + sys.stderr.write('WARNING: peak chromosome %s not found in gene reference, skipping: %s\n'%(peak[chr_field],peak)) + continue + + mapped = False + + # walk through the genes for this chromosome + for gene in chrom_genes : + + # reusable dictionary for output + out_d = {}.fromkeys(output_fields,0) + out_d['map type'] = '' + out_d['chromo'] = peak[chr_field] + out_d['peak loc'] = peak_loc + + # determine intervals for promoter, gene, and downstream + if gene['strand'] == '+' : + promoter_coords = max(gene['txStart']-1-opts.upst_win,0), gene['txStart']-1 + gene_coords = gene['txStart'], gene['txEnd'] + downstream_coords = gene['txEnd']+1, gene['txEnd']+1+opts.dnst_win + else : + promoter_coords = gene['txEnd']+1, gene['txEnd']+1+opts.upst_win # +1 because we're using 1 based indexing + gene_coords = gene['txStart'], gene['txEnd'] + downstream_coords = gene['txStart']-1-opts.dnst_win, gene['txStart']-1 # -1 because we're using 1 based indexing + + # check for promoter + if peak_loc >= promoter_coords[0] and peak_loc <= promoter_coords[1] : + out_d['map type'] 
= 'promoter' + out_d['dist from feature'] = peak_loc - promoter_coords[1] if gene['strand'] == '+' else promoter_coords[0] - peak_loc + + # check for gene + elif peak_loc >= gene_coords[0] and peak_loc <= gene_coords[1] : + # check for intron/exon + exon_coords = zip(gene['exonStarts'],gene['exonEnds']) + in_exon = False + for st,en in exon_coords : + if peak_loc >= st and peak_loc <= en : + in_exon = True + break + out_d['map type'] = 'gene' + out_d['map subtype'] = 'exon' if in_exon else 'intron' + + # score = (peak-TSS)/(TSE-TSS) - peak distance from TSS as fraction of length of gene + gene_len = float(gene_coords[1]-gene_coords[0]) + out_d['score'] = (peak_loc-gene_coords[0])/gene_len if gene['strand'] == '+' else (gene_coords[1]-peak_loc)/gene_len + + # distance calculated from start of gene + out_d['dist from feature'] = peak_loc - promoter_coords[1] if gene['strand'] == '+' else promoter_coords[0] - peak_loc + + map_stats[out_d['map subtype']] += 1 + + # check for downstream + elif peak_loc >= downstream_coords[0] and peak_loc <= downstream_coords[1] : + out_d['map type'] = 'after' + out_d['dist from feature'] = peak_loc - downstream_coords[0] if gene['strand'] == '+' else downstream_coords[1] - peak_loc + + # does not map to this gene + else : + pass + + # map type is not blank if we mapped to something + if out_d['map type'] != '' : + + out_d['accession #'] = gene['name'] + out_d['gene symbol'] = gene['geneName'] + out_d['strand'] = gene['strand'] + + map_stats[out_d['map type']] += 1 + peaks_writer.writerow(out_d) + + unique_genes.add(gene['name']) + mapped = True + + """ + print 'Peak:',peak + print 'Gene:',gene + print 'Peak loc:',peak_loc + print promoter_coords + print gene_coords + print downstream_coords + raw_input('Wait for it...') + """ + + # reset map_type + out_d['map type'] = '' + + if not mapped : + #out_d['map type'] = 'intergenic' + #peaks_writer.writerow(out_d) + map_stats['intergenic'] += 1 + + if opts.peak_output != sys.stdout : + 
opts.peak_output.close() + + if opts.stats_output != sys.stderr : + opts.stats_output = open(opts.stats_output,'w') + + for k,v in map_stats.items() : + opts.stats_output.write('%s: %s\n'%(k,v)) + + if opts.stats_output != sys.stderr : + opts.stats_output.close()
#!/usr/bin/env python
# scripts/map_peaks_to_known_genes.py
#
# Map ChIP-seq peaks (MACS .xls or BED format) onto UCSC knownGene
# annotations, classifying each peak as promoter / gene (exon|intron) /
# downstream, and optionally reporting intergenic peaks.
# NOTE: this is Python 2 code (xrange-free here, but py2 division and
# py2 csv/optparse idioms throughout).

import sys, os
from optparse import OptionParser
from collections import defaultdict as dd
from csv import DictReader, DictWriter

from chipsequtil import MACSFile, BEDFile, KnownGeneFile, parse_number
from chipsequtil.util import MultiLineHelpFormatter

usage = '%prog [options] <knownGene file> <knownGene xRef file> <peaks file>'
description = """
Map the peaks in <peaks file> to genes in <knownGene file>. <knownGene file> is\
format is as specified in http://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/knownGene.sql.\
<peaks file> format is as produced by MACS. If *auto* is chosen (default) file extension \
is examined for *.xls* for default MACS format or *.bed* for BED format. If the --detail\
option is provided, the following extra fields are appended to each row:

peak loc, dist from feature, score, map type, map subtype
"""
epilog = ''
parser = OptionParser(usage=usage,description=description,epilog=epilog,formatter=MultiLineHelpFormatter())
parser.add_option('--upstream-window',dest='upst_win',type='int',default=5500,help='window width in base pairs to consider promoter region [default: %default]')
parser.add_option('--downstream-window',dest='dnst_win',type='int',default=2500,help='window width in base pairs to consider downstream region [default: %default]')
parser.add_option('--tss',dest='tss',action='store_true',help='calculate downstream window from transcription start site instead of transcription end site')
parser.add_option('--map-output',dest='peak_output',default=None,help='filename to output mapped peaks to [default: stdout]')
parser.add_option('--stats-output',dest='stats_output',default=sys.stderr,help='filename to output summary stats in conversion [default: stderr]')
parser.add_option('--peaks-format',dest='peaks_fmt',default='auto',type='choice',choices=['auto','MACS','BED'],help='format of peaks input file [default: %default]')
parser.add_option('--detail',dest='detail',action='store_true',help='add extra fields to output, see description')
parser.add_option('--intergenic',dest='intergenic',action='store_true',help='write intergenic peaks to the gene file as well with None as gene ID')
#parser.add_option('--symbol-xref',dest='symbol_xref',default=None,help='use the kgXref table file supplied to find a gene symbol, output as second column')

# TODO - options
#parser.add_option('--use-cds',dest='use_cds',action='store_true',help='use cdsStart and cdsEnd fields instead of txStart and txEnd to do mapping')
#parser.add_option('--capture-intergenic'...)
#parser.add_option('--map-format',dest='peak_format',type='choice',choices=['default','BED'],help='format of peak output [default: %default]')
#parser.add_option('--stats-format',dest='stats_format',type='choice',choices=['human','python'],help='format of summary stats output [default: %default]')


def parse_gene_ref(ref_gene) :
    """Parse a UCSC knownGene file into a dict mapping chromosome name to
    the list of gene record dicts on that chromosome."""
    reader = KnownGeneFile(ref_gene)
    gene_ref = dd(list)
    for ref_dict in reader :
        gene_ref[ref_dict['chrom']].append(ref_dict)

    return gene_ref


def parse_gene_ref_line(l) :
    """Coerce one raw knownGene row (list of strings) to numbers; fields 9
    and 10 (exonStarts/exonEnds) are comma-separated lists and are split.

    NOTE(review): UCSC exonStarts/exonEnds columns end with a trailing
    comma, so split(',') leaves a trailing '' element — confirm callers
    tolerate it (this helper appears unused; parse_gene_ref uses
    KnownGeneFile instead)."""
    l = map(parse_number, l) # coerce to numbers where possible
    l[9] = map(parse_number, l[9].split(',')) # turn 'x,x,x,...' into list
    l[10] = map(parse_number, l[10].split(','))
    return l


if __name__ == '__main__' :

    opts, args = parser.parse_args(sys.argv[1:])

    if len(args) < 3 :
        parser.error('Must provide three filename arguments')

    gene_ref = parse_gene_ref(args[0])
    xref_fn = args[1]
    peaks_fn = args[2]

    # guess the peaks file format from its extension when requested
    if opts.peaks_fmt == 'auto' :
        path,ext = os.path.splitext(peaks_fn)
        if ext.lower() == '.xls' :
            opts.peaks_fmt = 'MACS'
        elif ext.lower() == '.bed' :
            opts.peaks_fmt = 'BED'
        else :
            parser.error('Could not guess peaks file format by extension (%s), aborting'%ext)

    if opts.peaks_fmt == 'MACS' :
        peaks_reader_cls = MACSFile
        chr_field, start_field, end_field = 'chr', 'start', 'end'
    elif opts.peaks_fmt == 'BED' :
        peaks_reader_cls = BEDFile
        chr_field, start_field, end_field = 'chrom', 'chromStart', 'chromEnd'
    else :
        # should never happen
        fieldnames = []

    #peaks_reader = DictReader(open(args[1]),fieldnames=fieldnames,delimiter='\t')
    peaks_reader = peaks_reader_cls(peaks_fn)

    # default output format:
    if opts.peak_output :
        peak_output = open(opts.peak_output,'w')
    else :
        peak_output = sys.stdout

    # BUGFIX: copy FIELD_NAMES before extending — the original code did
    # `fieldnames = peaks_reader.FIELD_NAMES; fieldnames += [...]`, which
    # mutated the reader class's shared FIELD_NAMES list in place.
    fieldnames = list(peaks_reader.FIELD_NAMES)
    if opts.detail :
        fieldnames += ["peak loc","dist from feature","score","map type","map subtype"]
    output_fields = ['knownGeneID']+fieldnames

    # see if the user wants gene symbols too
    # TODO - actually make this an option, or make it required
    opts.symbol_xref = xref_fn
    if opts.symbol_xref :
        # kgXref table schema (UCSC): maps knownGene IDs to gene symbols
        kgXref_fieldnames = ['kgID','mRNA','spID','spDisplayID','geneSymbol','refseq','protAcc','description']
        symbol_xref_reader = DictReader(open(opts.symbol_xref),fieldnames=kgXref_fieldnames,delimiter='\t')
        symbol_xref_map = {}
        for rec in symbol_xref_reader :
            symbol_xref_map[rec['kgID']] = rec
        output_fields = ['knownGeneID','geneSymbol']+fieldnames

    peaks_writer = DictWriter(peak_output,output_fields,delimiter='\t',extrasaction='ignore',lineterminator='\n')
    peaks_writer.writerow(dict([(k,k) for k in output_fields]))
    unique_genes = set()
    map_stats = dd(int)
    for peak in peaks_reader :

        # if this is a comment or header line get skip it
        if peak[fieldnames[0]].startswith('#') or \
           peak[fieldnames[0]] == fieldnames[0] or \
           peak[fieldnames[0]].startswith('track') : continue

        # coerce values to numeric if possible
        for k,v in peak.items() : peak[k] = parse_number(v)

        # MACS output gives us summit
        if opts.peaks_fmt == 'MACS' :
            peak_loc = peak[start_field]+peak['summit']
        else : # peak assumed to be in the middle of the reported peak range
            peak_loc = (peak[start_field]+peak[end_field])/2

        chrom_genes = gene_ref[peak[chr_field]]

        if len(chrom_genes) == 0 :
            sys.stderr.write('WARNING: peak chromosome %s not found in gene reference, skipping: %s\n'%(peak[chr_field],peak))
            continue

        mapped = False

        # walk through the genes for this chromosome
        for gene in chrom_genes :

            # reusable dictionary for output
            out_d = {}.fromkeys(output_fields,0)
            out_d.update(peak)
            out_d['map type'] = ''
            out_d['chromo'] = peak[chr_field]
            out_d['peak loc'] = peak_loc

            # determine intervals for promoter, gene, and downstream
            if gene['strand'] == '+' :
                promoter_coords = max(gene['txStart']-1-opts.upst_win,0), gene['txStart']-1
                if opts.tss :
                    gene_coords = gene['txStart'], min(gene['txEnd'],gene['txStart']+opts.dnst_win)
                    downstream_coords = gene['txEnd']+1,gene['txStart']+opts.dnst_win
                else :
                    gene_coords = gene['txStart'], gene['txEnd']
                    downstream_coords = gene['txEnd']+1, gene['txEnd']+1+opts.dnst_win
            else :
                promoter_coords = gene['txEnd']+1, gene['txEnd']+1+opts.upst_win # +1 because we're using 1 based indexing
                if opts.tss :
                    # NOTE(review): this branch uses opts.upst_win where the
                    # mirrored '+' strand branch uses opts.dnst_win — looks
                    # like a copy/paste slip, but left unchanged pending
                    # confirmation since it alters scientific output.
                    gene_coords = max(gene['txStart'],gene['txEnd']-opts.upst_win), gene['txEnd']
                    downstream_coords = gene['txEnd']-1-opts.dnst_win, gene['txStart']-1 # -1 because we're using 1 based indexing
                else :
                    gene_coords = gene['txStart'], gene['txEnd']
                    downstream_coords = gene['txStart']-1-opts.dnst_win, gene['txStart']-1 # -1 because we're using 1 based indexing

            # check for promoter
            if peak_loc >= promoter_coords[0] and peak_loc <= promoter_coords[1] :
                out_d['map type'] = 'promoter'
                out_d['dist from feature'] = peak_loc - promoter_coords[1] if gene['strand'] == '+' else promoter_coords[0] - peak_loc

            # check for gene
            elif peak_loc >= gene_coords[0] and peak_loc <= gene_coords[1] :
                # check for intron/exon
                exon_coords = zip(gene['exonStarts'],gene['exonEnds'])
                in_exon = False
                for st,en in exon_coords :
                    if peak_loc >= st and peak_loc <= en :
                        in_exon = True
                        break
                out_d['map type'] = 'gene'
                out_d['map subtype'] = 'exon' if in_exon else 'intron'

                # score = (peak-TSS)/(TSE-TSS) - peak distance from TSS as fraction of length of gene
                gene_len = float(gene_coords[1]-gene_coords[0])
                out_d['score'] = (peak_loc-gene_coords[0])/gene_len if gene['strand'] == '+' else (gene_coords[1]-peak_loc)/gene_len

                # distance calculated from start of gene
                out_d['dist from feature'] = peak_loc - promoter_coords[1] if gene['strand'] == '+' else promoter_coords[0] - peak_loc

                map_stats[out_d['map subtype']] += 1

            # check for downstream
            elif peak_loc >= downstream_coords[0] and peak_loc <= downstream_coords[1] :
                out_d['map type'] = 'after'
                if opts.tss :
                    out_d['dist from feature'] = peak_loc - gene_coords[0] if gene['strand'] == '+' else gene_coords[1] - peak_loc
                else :
                    out_d['dist from feature'] = peak_loc - downstream_coords[0] if gene['strand'] == '+' else downstream_coords[1] - peak_loc

            # does not map to this gene
            else :
                pass

            # map type is not blank if we mapped to something
            if out_d['map type'] != '' :

                #out_d = {'knownGeneID':gene['name']}
                out_d['knownGeneID'] = gene['name']
                if opts.symbol_xref :
                    # NOTE(review): raises KeyError if a knownGene ID is
                    # missing from the kgXref file — confirm inputs always
                    # come from matching UCSC table dumps.
                    out_d['geneSymbol'] = symbol_xref_map[gene['name']]['geneSymbol']
                peaks_writer.writerow(out_d)

                mapped = True

                # reset map_type
                out_d['map type'] = ''

        if not mapped :
            if opts.intergenic :
                # emit a placeholder row so intergenic peaks are retained
                out_d['knownGeneID'] = 'None'
                out_d['geneSymbol'] = 'None'
                out_d['map type'] = 'intergenic'
                peaks_writer.writerow(out_d)
            map_stats['intergenic'] += 1

    if peak_output != sys.stdout :
        peak_output.close()

    #if opts.stats_output != sys.stderr :
    #    opts.stats_output = open(opts.stats_output,'w')

    #for k,v in map_stats.items() :
    #    opts.stats_output.write('%s: %s\n'%(k,v))

    #if opts.stats_output != sys.stderr :
    #    opts.stats_output.close()
#!/usr/bin/env python
# scripts/motif_scan.py
#
# Scan MACS peak sequences with TAMO motifs and plot sampled motif scores
# against binned peak p-values.  Python 2 code (print statements, xrange,
# string.maketrans).

import matplotlib
matplotlib.use('AGG')  # headless backend; must be set before pyplot import

import numpy as np
import os
import random
import string
import sys

from math import log, pow
import matplotlib.pyplot as mp
from multiprocessing import Pool
from optparse import OptionParser
from scipy.stats.stats import pearsonr

from chipsequtil import MACSFile, get_org_settings
from chipsequtil.nib import NibDB
from chipsequtil.sampling import rejection_sample_bg
from TAMO import MotifTools as mt
from TAMO.MotifTools import load

usage = "%prog [options] <org> <peaks fn> <TAMO motif fn>"
desc = "Do some motif scanning stuffs"
parser = OptionParser(usage=usage,description=desc)

parser.add_option('-n','--top-n',dest='top_n',type='int',default=None,
                  help='use top n peaks by pvalue for sequence scanning [default: all]')
parser.add_option('-i','--motif-indices',dest='motif_ind',default='all',
                  help='which indices from <TAMO motif fn> to use [default: %default]')
parser.add_option('-d','--dir',dest='dir',default='motif_results',
                  help='write all results into this directory')
parser.add_option('--fixed-peak-width',dest='fixed_w',type='int',default=None,
                  help='use only a fixed peak window around the summit instead of whole peak')

# translation table mapping each nucleotide to its complement
revcomp_map = string.maketrans('ACGT','TGCA')

def score_sequence(seq,motif) :
    # Return the maximum log-likelihood score of *motif* over all windows of
    # *seq*, scanning both strands.
    # NOTE(review): range(len(seq)-len(motif)) skips the final window (and
    # scans nothing when len(seq) == len(motif)) — likely should be
    # len(seq)-len(motif)+1; unused here (m.bestscan is used instead).
    ll_max = -sys.maxint
    for i in range(len(seq)-len(motif)) :
        # forward strand
        ll_for_sum = 0
        subseq = seq[i:i+len(motif)].upper()
        for n,pos in zip(subseq,motif.ll) :
            ll_for_sum += pos[n]
        # reverse strand
        ll_rev_sum = 0
        subseq = reversed(subseq.translate(revcomp_map))
        for n,pos in zip(subseq,motif.ll) :
            ll_rev_sum += pos[n]
        ll_max = max(ll_max,ll_for_sum,ll_rev_sum)

    return ll_max

# characters not allowed in output image filenames, each mapped to '_'
illegal_fn_chars = '/;& ()'
fn_trans = string.maketrans(illegal_fn_chars,'_'*len(illegal_fn_chars))

def fasta_itr(fn) :
    # Generator over (header, sequence) records of FASTA file *fn*.
    # Header strings retain the leading '>' character.
    # NOTE(review): an empty input file yields a single (None, None) record.
    f = open(fn)
    header = None
    seq = None
    for l in f :
        if l.strip().startswith('>') :
            if seq is not None :
                yield (header,seq)
                seq = None
            header = l.strip()
        else :
            seq = seq+l.strip() if seq is not None else l.strip()

    # last record
    yield (header, seq)

if __name__ == '__main__' :

    opts, args = parser.parse_args(sys.argv[1:])

    if len(args) != 3 :
        parser.error('Exactly 3 non-option arguments must be provided')

    org, peaks_fn, motif_fn = args

    if not os.path.exists(opts.dir) :
        os.mkdir(opts.dir)

    # structured record per peak: chromosome (<=13 chars), coords, pvalue
    peaks_dt = np.dtype([('chr',np.str_,13),('start',np.int32),('end',np.int32),('pvalue',np.float64)])
    if opts.fixed_w is not None :

        # fixed window centered on the MACS summit
        all_peaks = np.array([(r['chr'],
                               r['start']+r['summit']-opts.fixed_w/2.,
                               r['start']+r['summit']+opts.fixed_w/2.,
                               r['-10*log10(pvalue)']) for r in MACSFile(peaks_fn)],
                             dtype=peaks_dt)
    else :
        all_peaks = np.array([(r['chr'],
                               r['start'],
                               r['end'],
                               r['-10*log10(pvalue)']) for r in MACSFile(peaks_fn)],
                             dtype=peaks_dt)

    # -10*log10(pvalue) -> -log10(pvalue)
    all_peaks[:]['pvalue'] /= 10.
    peak_pvals = all_peaks[:]['pvalue']

    # find the sorted order of peaks by descending pvalue
    peak_pval_inds = peak_pvals.argsort()
    peak_pval_inds = peak_pval_inds[::-1] # ascending -> descending
    # NOTE(review): tuple-indexing a 1-D structured array with [inds,:] is
    # rejected by modern numpy ("too many indices") — presumably worked on
    # the numpy version this targeted; confirm before upgrading.
    all_peaks = all_peaks[peak_pval_inds,:]

    # for pvalue vs motif score
    pval_num_bins = 20
    # NOTE(review): py2 integer division; fewer than 20 peaks gives
    # pval_bin_size == 0 and a ZeroDivisionError below.
    pval_bin_size = all_peaks[:]['pvalue'].size/pval_num_bins
    # try to take at least 100 sequences, at most 10% of bin size
    sample_percent = max(min(1.,100./pval_bin_size),0.1)
    pval_bin_memo = {}

    if opts.top_n is not None :
        peaks = all_peaks[0:opts.top_n]
        peak_pvals = peak_pvals[peak_pval_inds][0:opts.top_n]
    else :
        peaks = all_peaks

    # extract fasta sequences for these peaks
    nibDb = NibDB(nib_dirs=get_org_settings(org)['genome_dir'])

    """
    # get the peak sequences
    sys.stderr.write('Getting peak sequences\n')
    fasta_batch = []
    for i in range(peaks.size) :
        fasta_batch.append((str(peaks[i]['chr']),int(peaks[i]['start']),int(peaks[i]['end']),'+'))
    fg_fasta_headers, fg_fasta = nibDb.get_fasta_batch(fasta_batch)

    # need a dict for background sampling
    # headers have genome_dir and .nib in them, strip that out
    sys.stderr.write('Converting nib output to dict\n')
    fg_fasta_headers = list(fg_fasta_headers)
    fg_fasta_dict = {}
    for h,s in zip(fg_fasta_headers,fg_fasta) :
        h = h.replace('>'+get_org_settings(org)['genome_dir']+'/','')
        h = h.replace('.nib','')
        if len(s) > 150 :
            fg_fasta_dict[h] = s

    # now sample the background sequences
    sys.stderr.write('Sampling bg sequences (len(fg_fasta)==%d)\n'%(len(fg_fasta_dict)))
    #bg_fasta_dict = rejection_sample_bg(fg_fasta_dict,org,bg_match_epsilon=1e-3,verbose=True)
    bg_fasta_dict = {}
    bg_fasta = bg_fasta_dict.values()
    """

    # load the motifs
    sys.stderr.write('Movin right along\n')
    motifs = load(motif_fn)

    if opts.motif_ind != 'all' :
        motif_indices = [int(i) for i in opts.motif_ind.split(',') if len(i) != 0]
        motifs = [motifs[i] for i in motif_indices]
    else :
        motif_indices = xrange(len(motifs))

    # use all cores w/ a Pool
    #pool = Pool(processes=opts.n_procs)

    # go through each motif
    job_params = []
    res = []
    #for i,m in zip(motif_indices,motifs) :
    #    job_params.append((i,m,peak_pvals,fg_fasta,bg_fasta,opts.dir))
    #seq_scores = pool.map(analyze_motif_sequences,job_params)

    seq_scores = []
    for m_i,m in zip(motif_indices,motifs) :

        out_dir = opts.dir

        # motif name: third tab field of the TAMO source line if present,
        # otherwise the first whitespace token
        try :
            m_name = m.source.split('\t')[2]
        except :
            m_name = m.source.split()[0]

        print 'starting',m_name

        # pvalue vs motif score
        pval_bin_bounds = []
        pval_bin_pvals = []
        pval_bin_ranges = np.arange(0,all_peaks[:]['pvalue'].size,pval_bin_size)
        for st_i in pval_bin_ranges :

            end_i = min(st_i+pval_bin_size,all_peaks[:]['pvalue'].size-1)
            st_val = all_peaks[st_i]['pvalue']
            end_val = all_peaks[end_i]['pvalue']

            #print st_i, end_i, pval_bin_size, st_val, end_val

            # keep track of the pvalue bounds of each bin
            pval_bin_bounds.append((st_val,end_val))

            # we sample sample_percent% of peaks in the bin to score
            num_to_sample = int(sample_percent*(end_i-st_i))
            inds_to_sample = random.sample(xrange(st_i,end_i),num_to_sample)

            # we memoize the sequences we've seen before so we don't fetch seqs
            # unnecessarily
            unmemoed_inds_to_sample = set(inds_to_sample).difference(set(pval_bin_memo.keys()))

            bin_fasta_batch = []
            for peak_i in unmemoed_inds_to_sample :
                bin_fasta_batch.append((str(all_peaks[peak_i]['chr']),
                                        int(all_peaks[peak_i]['start']),
                                        int(all_peaks[peak_i]['end']),
                                        '+'))

            if len(bin_fasta_batch) != 0 :
                bin_headers, bin_seq = nibDb.get_fasta_batch(bin_fasta_batch)

                for i, ind in enumerate(unmemoed_inds_to_sample) :
                    pval_bin_memo[ind] = bin_seq[i].upper()

            # score the sequences
            pval_bin_pvals.append([])
            for ind in inds_to_sample :
                # normalize best-scan score to [0,1] using motif min/max
                max_score = m.bestscan(pval_bin_memo[ind])
                max_score = (max_score-m.minscore)/(m.maxscore-m.minscore)
                pval_bin_pvals[-1].append(max_score)
            pval_bin_pvals[-1] = np.array(pval_bin_pvals[-1])


        mp.figure(figsize=(4,4))
        font = {'size':'9'}
        mp.rc('font',**font)

        # box plot of the bins
        mp.boxplot(pval_bin_pvals,positions=np.arange(len(pval_bin_pvals)))

        # plot the means of the bins
        #[(x[0]+x[1])/2. for x in pval_bin_bounds]
        mp.plot(np.arange(len(pval_bin_pvals)),
                [x.mean() for x in pval_bin_pvals],'bo')
        mp.title('Sampled motif score vs binned peak pvalue')
        mp.xlabel('Binned -log10(pvalue)')
        mp.ylabel('Maximum normalized motif score')

        img_fn = os.path.join(out_dir,m_name.translate(fn_trans)+'_%d_peakmot.png'%m_i)
        mp.savefig(img_fn)
        mp.clf()

        # NOTE(review): everything below this `continue` is unreachable dead
        # code (it also references fg_fasta/bg_fasta, which are only defined
        # in the commented-out block above); kept as-is.
        continue

        fg_ratios = []
        for seq in fg_fasta :
            #max_score = score_sequence(seq,m)
            max_score = m.bestscan(seq.upper())
            fg_ratios.append((max_score-m.minscore)/(m.maxscore-m.minscore))
        fg_ratios = np.array(fg_ratios)

        bg_ratios = []
        for seq in bg_fasta :
            #max_score = score_sequence(seq,m)
            max_score = m.bestscan(seq.upper())
            bg_ratios.append((max_score-m.minscore)/(m.maxscore-m.minscore))
        bg_ratios = np.array(bg_ratios)

        fg_mean = sum(fg_ratios)/len(fg_ratios)
        fg_std = np.std(fg_ratios)
        bg_mean = sum(bg_ratios)/len(bg_ratios)
        bg_std = np.std(bg_ratios)

        m_mat = np.array((fg_ratios,bg_ratios,peak_pvals))
        fg_score_sort_inds = m_mat[0,:].argsort()

        motif_score_cnts, motif_score_bins = np.histogram(m_mat[0,:],bins=20)
        binned_motif_scores = []
        for st, end in zip(motif_score_bins[:-1],motif_score_bins[1:]) :
            binned_motif_scores.append(m_mat[2,(m_mat[0,:]>=st)&(m_mat[0,:]<end)])

        mp.figure(figsize=(4,4))
        font = {'size':'9'}
        mp.rc('font',**font)

        mp.plot(fg_ratios,peak_pvals,'bo')

        # calculate pearson correlation coefficient
        pear_r, pear_pval = pearsonr(fg_ratios,peak_pvals)
        mp.title('Max motif strength vs peak pvalue\n(r=%.2f,pval=%.2g)'%(pear_r,pear_pval))
        img_fn = os.path.join(out_dir,m_name.translate(fn_trans)+'_%d_corr.png'%m_i)
        mp.savefig(img_fn)
        mp.clf()

        # line plot of average peak p-value for binned motif score
        mp.title('Average peak p-value for binned motif score\n%s'%m_name)
        mp.xlabel('normalized motif score')
        mp.ylabel('-log10(pvalue)')
        mp.boxplot(binned_motif_scores,positions=np.arange(motif_score_bins.size-1),sym='')
        p = mp.plot(np.arange(motif_score_bins.size-1),
                    [x.mean() for x in binned_motif_scores],
                    'bo',
                    label='Mean fg score')
        p = p[0]

        # draw a crosshair
        bg_median_ind = np.argwhere(((motif_score_bins<=bg_mean)[:-1] & (motif_score_bins>=bg_mean)[1:])).ravel()[0]
        bg_median = np.median(binned_motif_scores[bg_median_ind])
        xlim, ylim = p.axes.get_xlim(), p.axes.get_ylim()
        mp.plot([bg_median_ind,bg_median_ind],ylim,'k-',label='Mean bg score=%.2g'%m_mat[1,:].mean())
        mp.plot(xlim,[bg_median,bg_median],'k-')
        mp.xticks(np.arange(motif_score_bins.size)[1::5],['%.2f'%x for x in motif_score_bins[1::5]])
        mp.legend(loc='upper left')

        img_fn = os.path.join(out_dir,m_name.translate(fn_trans)+'_%d_peakmot.png'%m_i)
        mp.savefig(img_fn)
        mp.clf()

        ret_d ={'m_name': m_name,
                'fg_mean': fg_mean,
                'fg_std': fg_std,
                'bg_mean': bg_mean,
                'bg_std': bg_std,
                'fg_scores': fg_ratios,
                'bg_scores': bg_ratios,
                #'wmw_pval': WMWtest(fg_ratios,bg_ratios)
                }

        # binned pvalue vs sampled motif score


        print 'done with',m_name

        seq_scores.append(ret_d)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chipsequtil-master/scripts/nibFrag.py Mon Mar 07 16:18:10 2016 -0500 @@ -0,0 +1,128 @@ +#!/usr/bin/env python +# nibFrag.py - a python implementation of Jim Kent's nibFrag command line utility + +import sys +import warnings +from optparse import OptionParser, OptionGroup + +from chipsequtil import get_file_parts, BEDFile +from chipsequtil.nib import get_nib_batch, validate_nib_file, NibException, NOMASK, MASK, HARDMASK + +usage = '%prog [options] file.nib start end strand [outfile]\n -- or --\n%prog [options] --batch file.nib batchfile [batchfile ...]' +description = """A python implementation of Jim Kent's nibFrag utility that allows outputting to \ +stdout. Otherwise the functionality is identical for the non-batch usage. Batch mode accepts \ +one or more files containing sets of coordinates to extract from the nib file. Only BED formatting \ +is accepted at the moment. All sequences are concatenated together in FASTA format. To retrieve the \ +entire sequence, use END as the end argument.""" +epilog="Note: When specifying --name optionin batch mode, also specify --dbHeader to ensure unique FASTA headers." 
+parser = OptionParser(usage=usage,description=description,epilog=epilog) +#parser.add_option('--output',dest='output',default=sys.stdout,help='filename to write output to [default: stdout]') +parser.add_option('--no-header',dest='no_header',action='store_true',help='only output sequence (no fasta header)') +parser.add_option('--wrap-width',dest='wrap_width',type='int',default=50,help='wrap output sequence at this number of bases, 0 indicates no wrap (sequence ends up on single line) [default: %default]') +parser.add_option('--batch',dest='batch',action='store_true',help='run in batch mode, interpret arguments after nib file as queries') +parser.add_option('--batch-format',dest='batch_format',type='choice',choices=['BED'],default='BED',help='format to interpret batch files [default: %default]') +#parser.add_option('--mask-type',dest='mask_type',type='choice',choices=['NOMASK','MASK','HARDMASK'],default='NOMASK',help='how to handle masked positions, correspond to original nibFrag options --masked and --hardMasked [default: %default]') + +# original nibFrag usage: +#nibFrag - Extract part of a nib file as .fa (all bases/gaps lower case by default) +#usage: +# nibFrag [options] file.nib start end strand out.fa +#where strand is + (plus) or m (minus) +#options: +# -masked - use lower case characters for bases meant to be masked out +# -hardMasked - use upper case for not masked-out and 'N' characters for masked-out bases +# -upper - use upper case characters for all bases +# -name=name Use given name after '>' in output sequence +# -dbHeader=db Add full database info to the header, with or without -name option +# -tbaHeader=db Format header for compatibility with tba, takes database name as argument + +# original nibFrag options +nibFrag_grp = OptionGroup(parser,"Original nibFrag options") +nibFrag_grp.add_option('--masked',dest='masked',action='store_true',help='use lower case characters for bases meant to be masked out') 
+nibFrag_grp.add_option('--hardMasked',dest='hardmasked',action='store_true',help='use upper case for non masked-out and \'N\' characters for masked-out bases') +nibFrag_grp.add_option('--upper',dest='upper',action='store_true',help='use upper case characters for all bases') +nibFrag_grp.add_option('--name',dest='name',default=None,help='Use given name after \'>\' in output sequence') +nibFrag_grp.add_option('--dbHeader',dest='dbHeader',default=None,help='Add full database info to the header, with or without -name option') +nibFrag_grp.add_option('--tbaHeader',dest='tbaHeader',default=None,help='Format header for compatibility with tba, takes database name as argument') +parser.add_option_group(nibFrag_grp) + + +if __name__ == '__main__' : + + opts, args = parser.parse_args(sys.argv[1:]) + + if len(args) < 1 : + parser.print_usage() + parser.exit(1) + + # setup + nib_path = args[0] + nib_dir,nib_fn,nib_base,nib_ext = get_file_parts(nib_path) + + queries = [] + if opts.batch : + + if len(args) < 2 : + parser.error('Two arguments must be supplied in batch mode') + + batch_fns = args[1:] + + for fn in batch_fns : + if opts.batch_format == 'BED' : + for bed in BEDFile(fn) : + if bed['chrom'] != nib_base : + warnings.warn('Chromosome in BED line %s does not match file %s, skipping'%(bed['chrom'],nib_base)) + else : + queries.append((int(bed['chromStart']),int(bed['chromEnd']),bed['strand'])) + else : + + if len(args) < 4 : + parser.error('Four arguments must be supplied in non-batch mode') + + # setup + strand = args[3] + start, end = int(args[1]),args[2] + if end == 'END' : + end = -1 + else : + end = int(end) + if end < start : + parser.error('Stop coordinate %d smaller than start %d'%(end,start)) + + queries.append((start,end,strand)) + + mask_type = NOMASK + if opts.masked : + mask_type = MASK + elif opts.hardmasked : + mask_type = HARDMASK + + # set the output file + if len(args) > 4 : + out_f = open(args[4],'w') + else : + out_f = sys.stdout + + # get the 
sequences from the .nib file + try : + headers, seqs = get_nib_batch(nib_path,queries,mask_type) + except NibException, e : + sys.stderr.write(e.message+'\n') + sys.exit(1) + + nbases = validate_nib_file(nib_path) + + # output all queries + for header, seq in zip(headers,seqs) : + + # write output + out_f.write(header) + + if opts.upper : + seq = seq.upper() + if opts.wrap_width == 0 : + out_f.write(seq+'\n') + else : + for i in xrange(0,len(seq),opts.wrap_width) : + out_f.write(seq[i:i+opts.wrap_width]+'\n') +
#!/usr/bin/env python
# scripts/org_settings.py
#
# Print organism-specific settings/paths in python, bash, or tcsh syntax.
# Python 2 code (ConfigParser module name).

import os
import sys
from optparse import OptionParser
from ConfigParser import ConfigParser, NoSectionError
from pprint import pformat

from chipsequtil import get_org_settings, get_global_settings, get_all_settings, get_local_settings, GLOBAL_SETTINGS_FN, LOCAL_SETTINGS_FN

usage = '%prog [options] [<org key> [<org setting>]]'
description='''Tool for retrieving sets of organism-specific settings and paths.
Original paths are set at install time, and can be overridden in the file ~/.org
settings.cfg. Allows output of settings in a variety of shell environment
syntaxes. The tool attempts to guess which shell environment is being used by
examining the SHELL environment variable unless explicitly set. When run without
an argument, returns a listing of all settings available.
'''
parser = OptionParser(usage=usage,description=description)
# BUGFIX: help text said '[default: %auto]' — optparse only expands the
# literal token '%default', so users saw "%auto" verbatim.
parser.add_option('-s','--syntax',dest='syntax',type='choice',
                  choices=['auto','python','bash','tcsh'],default='auto',
                  help='syntax flavor of output to produce [default: %default]')
parser.add_option('-l','--list',dest='list_sets',action='store_true',
                  help='print all available settings for human consumption')


def obj_to_format(obj,format='python') :
    '''Convert *obj* into a string that can be evaluated in the environment \
    indicated in *format*.

    obj -- a string, a dict of values, or a dict of dicts of values
    format -- python (default), auto, or a shell name (sh/bash/zsh/csh/tcsh);
              'auto' guesses from the SHELL environment variable
    '''

    if format == 'auto' :
        format = os.environ.get('SHELL','python').split('/')[-1]

    r = ''
    if format == 'python' :
        r = pformat(obj)
    elif format in ['sh','bash','zsh','csh','tcsh'] :
        statements = []
        if format in ['sh','bash','zsh'] :
            export_tmpl = 'export %s=%s'
        elif format in ['csh','tcsh'] :
            export_tmpl = 'setenv %s %s'

        # dict
        if isinstance(obj,dict) :
            for k1, v1 in obj.items() :
                # dict of dicts
                if isinstance(v1,dict) :
                    # these should be literal values; variable name is
                    # OUTERKEY_INNERKEY upper-cased
                    # NOTE(review): inner values containing spaces are not
                    # quoted, unlike the flat-dict branch — confirm intended.
                    for k2, v2 in v1.items() :
                        statements.append(export_tmpl%('_'.join([k1,k2]).upper(),\
                            str(v2)))
                else :
                    # quote values containing spaces so the shell parses them
                    v1 = str(v1)
                    s = '\''+v1+'\'' if v1.count(' ') != 0 else str(v1)
                    statements.append(export_tmpl%(k1.upper(),str(s)))
        else :
            return str(obj)

        r = '\n'.join(statements)

    return r


if __name__ == '__main__' :

    opts, args = parser.parse_args(sys.argv[1:])

    # output depends on number of arguments passed
    output = ''

    # return everything we know about
    if len(args) == 0 :

        if opts.list_sets :

            # always use python formatting when listing
            opts.syntax = 'python'

            # global settings
            settings = get_global_settings()
            output = 'Global settings: (%s)\n'%GLOBAL_SETTINGS_FN
            output += obj_to_format(settings,opts.syntax) + '\n'

            # local settings
            settings = get_local_settings()
            output += 'Local settings: (%s)\n'%LOCAL_SETTINGS_FN
            output += obj_to_format(settings,opts.syntax)
        else :
            settings = get_all_settings()
            output += obj_to_format(settings,opts.syntax)


    # return all records from the specific organism
    elif len(args) in (1,2) :

        # make sure our config files have the requested organism
        try :
            settings = get_org_settings(args[0])
        except NoSectionError :
            sys.stderr.write('No entry %s found, available:\n'%args[0]+\
                             pformat(get_all_settings().keys())+'\nExiting\n')
            sys.exit(1)

        # return the requested field from the specific organism
        if len(args) == 2 :

            # make sure the config file has the setting for this organism
            try :
                output = obj_to_format(settings[args[1]],opts.syntax)
            except KeyError :
                sys.stderr.write('Setting %s not found for %s, choices:\n'%(args[1],args[0])+
                                 pformat(settings.keys())+'\nExiting\n')
                sys.exit(2)
        else :
            output = obj_to_format(settings,opts.syntax)
    else :
        # BUGFIX: typo 'argments' in user-facing error message
        parser.error('Provide zero, one, or two arguments, found %s'%args)

    # bon voyage
    sys.stdout.write(output+'\n')
#!/usr/bin/env python
"""Extract FASTA sequences for peaks in MACS (.xls) or BED (.bed) peak files.

Sequences are read from the nib genome directory configured for <organism>
via org_settings.py.  See the option parser description for the header
format written for each record.
"""

import os
import sys
import textwrap
import warnings
from optparse import OptionParser

from chipsequtil import BEDFile, MACSFile, get_file_parts, get_org_settings
from chipsequtil.nib import NibDB
from chipsequtil.sampling import rejection_sample_bg
from chipsequtil.util import MultiLineHelpFormatter
from chipsequtil.seq import write_fasta_to_file


usage='%prog [options] <organism> <peak file> [<peak file> ...]'
description='''Extract sequences for peaks in provided peak file(s). Can \
interpret MACS or BED output, determined automatically by .xls or .bed extensions \
respectively (force explicit format with --peak-format option). Outputs fasta \
sequences for the peaks in all files extracted from the reference genome specified \
by the output of *org_settings.py <organism> genome_dir* to stdout by default.\
Chromosome names in peak files must match nib filenames without extension (e.g. \
peak line: chr1 0 100 searches *genome_dir*/chr1.nib). Fasta records have the \
following format:

><chromosome>:<start>-<end>;fn=<name of file>:<line number>;db_fn=<db filename>;fmt=<format>;<source alignment info>
<sequence...>

<db filename> is the filename where the sequence was extracted, <format> is the \
format of the input file (MACS or BED), and <source alignment info> contains all \
the fields from the originating alignment according to the source format.'''
parser = OptionParser(usage=usage,description=description,formatter=MultiLineHelpFormatter())
parser.add_option('--min-header',dest='min_header',action='store_true',help='only store <chromosome>:<start>-<end> in header')
parser.add_option('--peak-format',dest='peak_format',type='choice',
                  choices=['auto','MACS','BED'],default='auto',
                  help='peak file format, \'auto\' determines format by extension, choices: MACS, BED, auto [default: %default]')
parser.add_option('--output',dest='output',default=None,help='filename to output fasta records to [default: stdout]')
parser.add_option('--fixed-peak-width',dest='fixed_peak_width',type='int',default=None,help='return a fixed number of bases flanking peak summit (*summit* field in MACS, (end-start)/2 in BED), ignoring start/stop coords [default: None]')
parser.add_option('--wrap-width',dest='wrap_width',type='int',default=70,help='wrap fasta sequences to specified width. -1 indicates no wrap [default: %default]')


def bed_to_fasta(fn,db,min_header=False) :
    """Return a list of (header, sequence) tuples for every record in BED
    file *fn*, with sequences extracted from NibDB *db*.

    Reads the global *opts* for --fixed-peak-width.  When *min_header* is
    True the header is only <chrom>:<start>-<end>.
    """
    fastas = []
    bed_recs = BEDFile(fn)
    for i,rec in enumerate(bed_recs) :

        if opts.fixed_peak_width :
            # BUGFIX: midpoint was previously (chromEnd-chromStart)/2, a
            # peak-relative offset, not an absolute chromosome coordinate;
            # the extracted window landed near the chromosome origin
            chrom_start, chrom_end = int(rec['chromStart']), int(rec['chromEnd'])
            midpoint = chrom_start + (chrom_end-chrom_start)//2
            start = max(0,midpoint-opts.fixed_peak_width//2)
            end = min(midpoint+opts.fixed_peak_width//2,db.db_info[rec['chrom']]['nbases'])
        else :
            start, end = int(rec['chromStart']), int(rec['chromEnd'])

        seq = db.get_seq(rec['chrom'], start, end)
        seq_fn = db.db_info[rec['chrom']]['path']

        header = '%s:%s;'%(rec['chrom'],'%d-%d'%(start,end))
        if not min_header :
            # include db_fn= as the usage text promises (was missing for BED)
            header = header.strip()+'%s:%d;db_fn=%s;fmt=BED;'%(fn,i,seq_fn)+ \
                     ';'.join(['%s=%s'%(k,str(v)) for k,v in rec.items()])
        fastas.append((header,seq))

    return fastas


def macs_to_fasta(fn,db,min_header=False) :
    """Return a list of (header, sequence) tuples for every record in MACS
    peak file *fn*, with sequences extracted from NibDB *db*.
    """
    macs_recs = MACSFile(fn)
    fasta = []
    for i,rec in enumerate(macs_recs) :

        if opts.fixed_peak_width :
            # center the window on start+summit, clamped to sequence bounds
            start = max(0, rec['start']+rec['summit']-opts.fixed_peak_width//2)
            end = min(rec['start']+rec['summit']+opts.fixed_peak_width//2, db.db_info[rec['chr']]['nbases'])
            coords = start, end
        else :
            start, end = coords = rec['start'], rec['end']

        seq = db.get_seq(rec['chr'],start,end)
        seq_fn = db.db_info[rec['chr']]['path']

        header = '%s:%s'%(rec['chr'],'%d-%d'%coords)
        if not min_header :
            header += ';%s:%d;db_fn=%s;fmt=MACS;'%(fn,i,seq_fn) + \
                      ';'.join(['%s=%s'%(k,str(v)) for k,v in rec.items()])
        fasta.append((header,seq))

    return fasta


if __name__ == '__main__' :

    opts, args = parser.parse_args(sys.argv[1:])

    if len(args) < 2 :
        parser.error('Must provide at least two non-option arguments')

    # instantiate the NibDB from the provided directory
    organism = args[0]
    nib_dir = get_org_settings(organism)['genome_dir']
    nib_db = NibDB(nib_dirs=[nib_dir])

    # determine specified format
    peak_fmt = opts.peak_format
    peak_fns = args[1:]

    fasta_recs = []
    for peak_fn in peak_fns :
        # if --peak-format is auto, figure format out from extension
        if opts.peak_format == 'auto' :
            fnbase, fnext = os.path.splitext(peak_fn)
            if fnext.lower() == '.bed' : # BED file
                peak_fmt = 'BED'
            elif fnext.lower() == '.xls' : # MACS file
                peak_fmt = 'MACS'
            else :
                warnings.warn('Peak format specified as auto but file extension '
                              'not recognized in file %s, skipping'%peak_fn)
                continue

        if peak_fmt == 'BED' :
            fasta_recs.extend(bed_to_fasta(peak_fn,nib_db,min_header=opts.min_header))
        elif peak_fmt == 'MACS' :
            fasta_recs.extend(macs_to_fasta(peak_fn,nib_db,min_header=opts.min_header))

    # write out foreground to file
    if opts.output :
        if opts.wrap_width == -1 :
            opts.wrap_width = sys.maxint
        # NOTE(review): dict() collapses records with duplicate headers on this
        # path while the stdout path keeps all of them -- confirm intended
        write_fasta_to_file(dict(fasta_recs),opts.output,linelen=opts.wrap_width)
    else :
        for header, seq in fasta_recs :
            if opts.wrap_width != -1 :
                seq = textwrap.fill(seq,opts.wrap_width)
            sys.stdout.write('>%s\n%s\n'%(header,seq))
#!/usr/bin/env python
"""Plot where ChIP-seq peaks fall relative to genes (pie charts), the
distance-to-TSS histogram, and (MACS only) a stacked bar chart of peak
categories by p-value.  Input is a MACS/BED peaks file plus the mapping
file produced by peaks_to_known_genes.py.
"""

import matplotlib
matplotlib.use('AGG')

import matplotlib.pyplot as mp
import numpy as np
import os
import sys

from collections import defaultdict
from csv import reader, writer
from optparse import OptionParser
from StringIO import StringIO

from chipsequtil import MACSFile, BEDFile


usage = '%prog [options] <peaks fn> <gene list fn>'
desc = """Produce a pie chart of the locations of peaks in different bins
(promoter, gene, exon, intron, etc.) and, optionally, save the different
records to their own files for subsequent analysis. Also produce a histogram
of distance from feature values in mapping file. Peaks file is expected
to be as output by MACS, or alternately as a BED file but then the -b plot
is not available. Gene list file is expected to be in the format as
output by peaks_to_known_genes.py script."""
parser = OptionParser(usage=usage,description=desc)
parser.add_option('-b','--bar-fn',dest='bar_fn',default=None,help='filename for pvalue stacked bar chart')
parser.add_option('-g','--gene-pie-fn',dest='gene_pie_fn',default=None,help='filename for pie chart image')
parser.add_option('-p','--peak-pie-fn',dest='peak_pie_fn',default=None,help='filename for pie chart image')
parser.add_option('-f','--dist-fn',dest='dist_fn',default=None,help='filename for distance from feature image')
parser.add_option('-s','--save',dest='save',action='store_true',help='write out files containing peaks for each category')
parser.add_option('-d','--output-dir',dest='out_dir',default='.',help='output files created by --save option to this directory')
parser.add_option('--no-plot',dest='no_plot',action='store_true',help='dont show (but save) the figure produced')
parser.add_option('--peaks-format',dest='peak_fmt',type='choice',choices=['MACS','BED'],default='MACS',help='format of peaks file, either MACS or BED [default: MACS]')

GENE_FIELD_NAMES = ['knowngene_id','gene_symbol']
LOC_FIELD_NAMES = ['peak_loc','dist_from_feature','score','map_type','map_subtype']
# coerce empty strings to None, anything else to int/float
int_or_none = lambda x: int(x) if x != '' else None
float_or_none = lambda x: float(x) if x != '' else None
LOC_FIELD_TYPES = [int_or_none,float_or_none,float_or_none,str,str]


if __name__ == '__main__' :

    opts, args = parser.parse_args(sys.argv[1:])

    if len(args) != 2 :
        parser.error('Exactly 2 non-option argument is required')

    peaks_fn, gene_fn = args

    if opts.peak_fmt == 'BED' :
        peaks_f = BEDFile(peaks_fn)
    else :
        peaks_f = MACSFile(peaks_fn)

    gene_reader = reader(open(gene_fn),delimiter='\t')
    gene_recs, macs_recs, loc_recs = [], [], []
    next(gene_reader)  # discard header; next() works on py2 and py3, .next() did not

    # each mapping row is gene fields + MACS fields + location fields
    gene_field_cnt = len(GENE_FIELD_NAMES)
    macs_field_cnt = len(MACSFile.FIELD_NAMES)
    loc_field_cnt = len(LOC_FIELD_NAMES)
    for rec in gene_reader :

        gene_recs.append(dict(zip(GENE_FIELD_NAMES,rec[:gene_field_cnt])))

        # this automatically coerces recs into correct format
        macs_line = [f(x) for f,x in zip(MACSFile.FIELD_TYPES,rec[gene_field_cnt:gene_field_cnt+macs_field_cnt])]
        macs_recs.append(dict(zip(MACSFile.FIELD_NAMES,macs_line)))

        loc_line = [f(x) for f,x in zip(LOC_FIELD_TYPES,rec[gene_field_cnt+macs_field_cnt:])]
        loc_recs.append(dict(zip(LOC_FIELD_NAMES,loc_line)))

    loc_dist = defaultdict(int)        # category -> # mapping rows
    unique_peaks = defaultdict(set)    # category -> set of chr:start-end keys
    exon_scores, intron_scores = [], []
    dist_to_features = defaultdict(list)
    pvals = defaultdict(list)

    fn_base, fn_ext = os.path.splitext(gene_fn)
    if opts.save :
        def get_writer(fn) :
            # one TSV writer per category, header written up front
            fd = writer(open(fn,'w'),delimiter='\t')
            header = MACSFile.FIELD_NAMES
            if opts.peak_fmt == 'BED' :
                header = BEDFile.FIELD_NAMES
            fd.writerow(GENE_FIELD_NAMES+header+LOC_FIELD_NAMES)
            return fd
        fds = {}

    for gene, peak, loc in zip(gene_recs, macs_recs, loc_recs) :
        # weird case, not sure why this happens
        if loc['map_subtype'] == '0' :
            loc['map_subtype'] = ''
        key = loc['map_type']+'_%s'%loc['map_subtype'] if loc['map_subtype'] != '' else loc['map_type']
        loc_dist[key] += 1
        # BUGFIX: dist_from_feature can be None (empty field) -- int(None) raised
        if loc['dist_from_feature'] is not None :
            dist_to_features[key].append(int(loc['dist_from_feature']))
        if opts.peak_fmt == 'MACS' :
            pvals[key].append(float(peak['-10*log10(pvalue)']))

        map_key = '%s:%d-%d'%(peak['chr'],peak['start'],peak['end'])
        unique_peaks[key].add(map_key)

        if key == 'gene_exon' :
            exon_scores.append(loc['score'])
        elif key == 'gene_intron' :
            intron_scores.append(loc['score'])

        if opts.save :
            row = [gene[f] for f in GENE_FIELD_NAMES] + \
                  [peak[f] for f in MACSFile.FIELD_NAMES] + \
                  [loc[f] for f in LOC_FIELD_NAMES]
            if key not in fds :  # was fds.has_key(key), py2-only
                fn = os.path.join(opts.out_dir,fn_base+'_'+key+fn_ext)
                fds[key] = get_writer(fn)
            fds[key].writerow(row)

    # now find which peaks are intergenic (i.e. mapped to no gene at all)
    intergenic = []
    num_peaks = 0
    # BUGFIX: bare reduce() raised TypeError when unique_peaks was empty
    all_unique_peaks = set().union(*unique_peaks.values())
    for peak in peaks_f :
        map_key = '%s:%d-%d'%(peak['chr'],peak['start'],peak['end'])
        if map_key not in all_unique_peaks :
            unique_peaks['intergenic'].add(map_key)
            intergenic.append(peak)
            if opts.peak_fmt == 'MACS' :
                pvals['intergenic'].append(peak['-10*log10(pvalue)'])
        num_peaks += 1

    num_int = len(intergenic)
    loc_dist['intergenic'] = num_int
    if opts.save :
        fn = os.path.join(opts.out_dir,fn_base+'_intergenic.xls')
        fd = writer(open(fn,'w'),delimiter='\t')
        fd.writerow(MACSFile.FIELD_NAMES)
        fd.writerows([[x[f] for f in MACSFile.FIELD_NAMES] for x in intergenic])

    font = {'size':'9'}
    mp.rc('font',**font)
    fig = mp.figure(figsize=(4,4))

    bin_order = ('intergenic','gene_exon','gene_intron','promoter','after')
    colors = 'bgrcm'

    # pie chart of unique peaks per category
    pie_ax = fig.add_axes((0.15,0.15,0.7,0.7))
    pie_ax.set_title('Gene map distribution\n%d peaks'%num_peaks)
    pie_labels, pie_values = [], []
    for k in bin_order :
        pie_labels.append(k+'\n%d'%(len(unique_peaks[k])))
        pie_values.append(len(unique_peaks[k]))
    pie_ax.pie(pie_values,labels=pie_labels)

    img_fn = fn_base+'_gene_loc.png' if opts.gene_pie_fn is None else opts.gene_pie_fn
    mp.savefig(img_fn)
    mp.clf()

    # pie chart of mapping rows per category
    fig = mp.figure(figsize=(4,4))
    pie_ax = fig.add_axes((0.15,0.15,0.7,0.7))
    pie_ax.set_title('Peak map distribution\n%d peaks'%num_peaks)
    pie_labels, pie_values = [], []
    for k in bin_order :
        pie_labels.append(k+'\n%d'%(loc_dist[k]))
        pie_values.append(loc_dist[k])
    pie_ax.pie(pie_values,labels=pie_labels)

    img_fn = fn_base+'_peak_loc.png' if opts.peak_pie_fn is None else opts.peak_pie_fn
    mp.savefig(img_fn)
    mp.clf()

    # dist to feature histogram
    fig = mp.figure(figsize=(4,4))
    hist_ax = fig.add_axes((0.15,0.15,0.7,0.7))
    hist_ax.set_title('Peak distance from TSS')
    # join all the per-category lists together
    dists = sum(dist_to_features.values(),[])
    pdf, bins, patches = hist_ax.hist(dists,bins=20)
    hist_ax.set_xlim((int(min(dists)),int(max(dists))))

    dist_fn = fn_base+'_dist.png' if opts.dist_fn is None else opts.dist_fn
    mp.savefig(dist_fn)
    mp.clf()

    if opts.peak_fmt == 'MACS' :
        # stacked bar chart of log peak counts per p-value bin and category
        fig = mp.figure(figsize=(4,4))
        bar_ax = fig.add_axes((0.15,0.15,0.7,0.7))
        pval_hists = {}
        min_pval = min([min(v) for v in pvals.values()])
        max_pval = max([max(v) for v in pvals.values()])
        # BUGFIX: the loop variable used to shadow the pvals dict being iterated
        for key, key_pvals in pvals.items() :
            vals, bins = np.histogram(key_pvals,range=(0,max_pval),bins=20)
            lv = np.log10(vals)
            lv[np.isneginf(lv)] = 0.1  # empty bins would otherwise plot as -inf
            pval_hists[key] = lv

        pval_items = [(k,pval_hists[k]) for k in bin_order if k in pval_hists]
        bar_width = 0.85*(max_pval-min_pval)/(len(bins)-1)
        bars = []
        b = bar_ax.bar(bins[:-1],pval_items[0][1],width=bar_width,color=colors[0])
        bars.append(b)

        sum_bottoms = pval_items[0][1]
        for i, (key, lhist) in enumerate(pval_items[1:]) :
            b = bar_ax.bar(bins[:-1],lhist,bottom=sum_bottoms,width=bar_width,color=colors[i+1])
            bars.append(b)
            sum_bottoms += lhist
        bar_ax.legend([b[0] for b in bars],[x[0] for x in pval_items])
        bar_ax.axis((-10,max(bins),0,max(sum_bottoms)))
        bar_ax.set_title('Peak map distribution by pvalue')
        bar_ax.set_xlabel('-10*log10(pvalue)')
        bar_ax.set_ylabel('relative log10(# peaks)')

        pval_fn = fn_base+'_pval_bar.png' if opts.bar_fn is None else opts.bar_fn
        mp.savefig(pval_fn)
#!/usr/bin/env python
"""Compare positive vs. negative MACS peak sets: plot the survival curves of
peak counts over -log10(p-value) thresholds and the log ratio of positive to
negative counts.
"""

import os
import sys

import matplotlib
matplotlib.use('AGG')

from matplotlib.pyplot import *
from numpy import arange, log10
from optparse import OptionParser

from chipsequtil import MACSFile

usage = '%prog [options] <pos peaks fn> <neg peaks fn>'
parser = OptionParser(usage=usage)
parser.add_option('-o','--output',dest='out_fn',default=None,help='filename of output image')

if __name__ == '__main__' :

    opts, args = parser.parse_args(sys.argv[1:])

    # BUGFIX: wrong argument count used to die with a bare unpack ValueError
    if len(args) != 2 :
        parser.error('Exactly two non-option arguments are required')
    pos_fn, neg_fn = args

    pos_f, neg_f = MACSFile(pos_fn), MACSFile(neg_fn)

    # convert -10*log10(pvalue) to -log10(pvalue)
    # (the sorted (pval, peak) lists the original built here were never used)
    pos_pvals = [float(pk['-10*log10(pvalue)'])/10. for pk in pos_f]
    neg_pvals = [float(pk['-10*log10(pvalue)'])/10. for pk in neg_f]

    min_pval, max_pval = min(pos_pvals+neg_pvals), max(pos_pvals+neg_pvals)
    pval_rng = arange(min_pval,max_pval,(max_pval-min_pval)/100.)

    # survival counts: # peaks with pvalue >= threshold, per threshold
    pos_cdf = [len([x for x in pos_pvals if x >= pval]) for pval in pval_rng]
    neg_cdf = [len([x for x in neg_pvals if x >= pval]) for pval in pval_rng]

    # log of counts; plot once just to harvest tick locations, then clear
    pos_logs = [log10(c) for c in pos_cdf]
    neg_logs = [log10(c) for c in neg_cdf]
    plot(pval_rng,pos_logs)
    plot(pval_rng,neg_logs)
    ytics, ylabs = yticks()
    clf()

    # calculate pos/neg proportion for each pvalue threshold; once the
    # negative count hits zero, carry the last ratio forward as "pos only"
    pos_ratio = []
    pos_only = []
    for pos, neg in zip(pos_cdf,neg_cdf) :
        if neg == 0 :
            # BUGFIX: guard against an empty pos_ratio (first bin has no negs)
            pos_only.append(pos_ratio[-1] if pos_ratio else float(pos))
        else :
            # BUGFIX: true division -- py2 int/int floored the ratio to 0 or 1
            pos_ratio.append(float(pos)/neg)

    subplot(211)
    plot(pval_rng, pos_logs, 'b-')
    plot(pval_rng, neg_logs, 'g-')
    yticks(ytics,[int(10**y) for y in ytics])  # label log axis with raw counts
    title('positive vs. negative peaks')
    legend(('positive','negative'),loc='upper right')
    xlabel('-log(p-value)')
    ylabel('# Peaks')
    axis('tight')

    subplot(212)
    plot(pval_rng[:len(pos_ratio)], [log10(r) for r in pos_ratio], 'k-')
    plot(pval_rng[len(pos_ratio):], [log10(r) for r in pos_only], 'k--')
    axis('tight')
    xlabel('-log(p-value)')
    ylabel('log10(# pos / # neg)')

    if opts.out_fn is None :
        pos_base_fn, pos_fn_ext = os.path.splitext(pos_fn)
        out_fn = '%s_pos_v_neg.png'%pos_base_fn
    else :
        out_fn = opts.out_fn
    savefig(out_fn)
#!/usr/bin/env python
"""Map Affymetrix probeset records to UCSC knownGene entries.

Probesets mapping to multiple knownGenes are emitted once per knownGene.
Output is the input microarray rows with knownGene ID (and gene symbol)
prepended.
"""

import gzip
import sys
from collections import defaultdict as dd
from csv import DictReader, DictWriter
from optparse import OptionParser
from sqlite3 import connect

from chipsequtil import KnownGeneFile

usage = '%prog [options] <knownGene annotation> <knownToMOE430 file> <knownGene Xref file> <microarray data file>'
description = 'Maps probset data to knownGene database provided by UCSC. Probesets \
that map to multiple knownGenes have one record per knownGene with duplicate data \
otherwise. Output is knownGene id prepended to each record in microarray data file.'
parser = OptionParser(usage=usage,description=description)
parser.add_option('--output',dest='output',default=None,help='file to output mapping to [default: stdout]')

if __name__ == '__main__' :

    opts, args = parser.parse_args(sys.argv[1:])

    # BUGFIX: four positional arguments are read below (args[0]..args[3]) but
    # the original check only required three, so a 3-arg call crashed later
    if len(args) < 4 :
        parser.error('Incorrect number of arguments provided')

    known_gene_fn = args[0]
    knownToMOE_data_fn = args[1]
    Xref_fn = args[2]
    affy_bioc_fn = args[3]

    # affymetrix file from bioconductor: probeset ID -> full data row
    affy_bioc_f = open(affy_bioc_fn)
    affy_bioc = {}
    affy_bioc_reader = DictReader(affy_bioc_f,delimiter="\t")
    for row in affy_bioc_reader :
        affy_bioc[row['ID']] = row

    # knownGene annotation, keyed by gene name
    kg = KnownGeneFile(known_gene_fn)
    kg_ids = dict([(x['name'],x) for x in kg])

    # affy <-> knownGene mappings; the first two characters of the affy ID
    # are stripped (presumably a fixed prefix -- TODO confirm against data)
    affy_to_kg_map = dd(list)
    affy_to_kg_fields = ['kgID','affyID']
    affy_to_kg_f = open(knownToMOE_data_fn)
    kg_to_affy_map = dd(list)
    for row in DictReader(affy_to_kg_f,fieldnames=affy_to_kg_fields,delimiter="\t") :
        affy_to_kg_map[row['affyID'][2:]].append(row['kgID'])
        kg_to_affy_map[row['kgID']].append(row['affyID'][2:])

    if opts.output :
        out_f = open(opts.output,'w')
    else :
        out_f = sys.stdout

    out_header = ['knownGeneID']+affy_bioc_reader.fieldnames

    # gene symbol cross-reference (always on: the Xref file is a required arg)
    opts.symbol_xref = Xref_fn
    if opts.symbol_xref :
        kgXref_fieldnames = ['kgID','mRNA','spID','spDisplayID','geneSymbol','refseq','protAcc','description']
        symbol_xref_reader = DictReader(open(opts.symbol_xref),fieldnames=kgXref_fieldnames,delimiter='\t')
        symbol_xref_map = {}
        for rec in symbol_xref_reader :
            symbol_xref_map[rec['kgID']] = rec
        out_header = ['knownGeneID','geneSymbol']+affy_bioc_reader.fieldnames

    out_writer = DictWriter(out_f,delimiter='\t',fieldnames=out_header,lineterminator='\n')
    out_writer.writerow(dict(zip(out_header,out_header)))
    for probesetID, data in affy_bioc.items() :
        # BUGFIX: the loop variable used to be named kg_ids, clobbering the
        # knownGene dict built above
        probe_kg_ids = affy_to_kg_map[probesetID]
        for kg_id in probe_kg_ids :
            out_l = {'knownGeneID':kg_id}
            if opts.symbol_xref :
                out_l['geneSymbol'] = symbol_xref_map[kg_id]['geneSymbol']
            out_l.update(data)
            out_writer.writerow(out_l)

    # NOTE: an unreachable `if False:` diagnostic block (detecting probesets
    # that map to non-overlapping loci) was removed here; it was dead code and
    # additionally relied on the kg_ids dict that the original loop clobbered.
#!/usr/bin/env python
"""Generate background/random genomic sequences matching the distributions of
the input FASTA sequences via rejection sampling (see option parser
description for details).
"""

import sys

from optparse import OptionParser

from chipsequtil import check_org_settings
from chipsequtil.util import MultiLineHelpFormatter
from chipsequtil.sampling import rejection_sample_bg
from chipsequtil.seq import fasta_to_dict, write_fasta_to_file

usage = '%prog [options] <organism> <fasta file> [<fasta file> ... ]'
description = """Use rejection sampling to generate a set of background/random \
sequences matching the distance to nearest transcription start site, sequence \
length, and GC content distributions of the input fasta file(s). Generated \
sequences are genomic sequences sampled based on these distributions. All sequences \
from all files are used to generate the background sequences. The following \
command must output a path to a nib genomic sequence directory and refGene \
annotation, respectively :

$> org_settings.py <organism> genome_dir
$> org_settings.py <organism> refgene_anno_path

Utility prints out generated fasta records to stdout by default. Input sequences \
from chr20 are mapped to chrX, chr21 are mapped to chrY, and sequences from chrM \
are not used.
"""
epilog = "Note: script only considers sequences with unique header names, only the last record of those with identical header names is used"
parser = OptionParser(usage=usage,description=description,formatter=MultiLineHelpFormatter())
parser.add_option('-n','--num-seqs',dest='num_seqs',default='1x', help='number of sequences to generate, either absolute number or factor of # input sequences, e.g. 2.5x for 2.5 times the # of input sequences [default: 1x]')
parser.add_option('--output',dest='output',default=None,help='file to output fasta records to [default: stdout]')
parser.add_option('--bed',dest='bed',action='store_true', help='also produce a BED formatted file representing sampled sequences')
parser.add_option('--bed-output',dest='bed_output',default='output.bed',help='with --bed, file to output BED records to [default: %default]')
parser.add_option('-v','--verbose',dest='verbose',action='store_true',help='print out debug information')

if __name__ == '__main__' :

    opts, args = parser.parse_args(sys.argv[1:])

    if len(args) < 2 :
        parser.error('Must be 2 non-option arguments')

    organism, fasta_fns = args[0], args[1:]

    reqd_settings = ['genome_dir','refgene_anno_path']
    if not check_org_settings(organism,reqd_settings) :
        parser.error('The <organism> settings set must contain paths for %s'%reqd_settings)

    # load up all the fasta records; duplicate headers keep only the last one
    fasta_recs = {}
    for fasta_fn in fasta_fns :
        fasta = fasta_to_dict(fasta_fn)
        fasta_recs.update(fasta)

    # parse --num-seqs argument: either "<float>x" (factor) or an integer
    # BUGFIX: int()/float() of a malformed string raises ValueError, not
    # TypeError, so the friendly parser.error message was never reached
    try :
        if opts.num_seqs.endswith('x') :
            num_seq_factor = float(opts.num_seqs[:-1])
            num_seqs = int(len(fasta_recs)*num_seq_factor)
        else :
            num_seqs = int(opts.num_seqs)
    except ValueError :
        parser.error("Incorrect format of --num-seqs argument, must either be an integer or a factor ending with x, e.g. 2.5x")

    # generate the sequences
    gen_seqs = rejection_sample_bg(fasta_recs,organism,num_samples=num_seqs,verbose=opts.verbose)

    # write out to file or stdout
    if opts.output :
        write_fasta_to_file(gen_seqs,opts.output)
    else :
        sys.stdout.write(''.join(['>%s\n%s\n'%(k,v) for k,v in gen_seqs.items()]))

    # headers are chr:start-end, so a BED line is just the header with
    # ':' and '-' replaced by tabs
    if opts.bed :
        bed_f = open(opts.bed_output,'w')
        bed_f.write(''.join([k.replace(':','\t').replace('-','\t')+'\n' for k in gen_seqs.keys()]))
        bed_f.close()
#!/usr/bin/env python
"""Sort BED files by chromosome then start coordinate, concatenating all
input files into one sorted output stream.
"""
import sys, os
from optparse import OptionParser
from collections import defaultdict as dd
from csv import reader, writer


usage = "%prog [options] <BED file> [<BED file> <BED file>...]"
description = """\
Sort the BED formatted files first by chromosome (field 1) and then by start
coordinate (field 2). Lines from all files submitted are concatenated and
sorted in the final output."""
parser = OptionParser(usage=usage,description=description)
parser.add_option('--output',dest='output',default=sys.stdout,help='filename to write the sorted BED lines [default: stdout]')


def sort_bed_files(fns, out_f) :
    """Read BED lines from the files named in *fns*, sort them by
    (chromosome, start coordinate) and write them as tab-delimited rows to
    the open file object *out_f*."""
    chromos = dd(list)

    # bucket lines by chromosome so each bucket can be sorted independently
    for fn in fns :
        bed_reader = reader(open(fn),delimiter='\t')
        for line in bed_reader :
            chromos[line[0]].append(line)

    # write the chromosomes in lexicographic sorted order
    bed_writer = writer(out_f,delimiter='\t')
    for chrom in sorted(chromos.keys()) :
        # sort each chromosome's BED lines by start position
        # BUGFIX: the key used to be `lambda x: int(line[1])`, referencing the
        # leftover loop variable `line` -- a constant key, so lines were never
        # actually sorted by coordinate
        chromos[chrom].sort(key=lambda rec: int(rec[1]))
        bed_writer.writerows(chromos[chrom])


if __name__ == '__main__' :

    opts, args = parser.parse_args(sys.argv[1:])

    if len(args) == 0 :
        parser.error("Must provide at least one file")

    # determine where we're writing to
    if opts.output != sys.stdout :
        f = open(opts.output,'w')
    else :
        f = opts.output

    sort_bed_files(args, f)
#!/usr/bin/env python
"""Split a file into numbered chunk files (<name>.0000, <name>.0001, ...)."""

from optparse import OptionParser
from datetime import datetime
from subprocess import Popen, PIPE
import itertools
import sys, os, getpass, re

usage = "[%prog] [options] filename"
description = """\
Split <filename> into a set of files with either a specific number of lines
(--split-type=lines, default) or into a specific number of files (--split-type=
count). Files are created with .XXXX appended, indicating the number of file
split. Writes files to current working directory unless otherwise specified.
"""

parser = OptionParser(usage=usage,description=description)
parser.add_option('--type',dest='split_type',type='choice',choices=['lines','count'],default='lines',help='how to split the file (WARNING: count does not preserve the sequence of lines in the original file when splitting) [default: %default]')
parser.add_option('--arg',dest='split_arg',type='int',default=1000,help='integer argument for split type [default: %default]')
parser.add_option('--outdir',dest='outdir',default='.',help='directory to put the split files in [default: %default]')

def get_file_parts(fn) :
    """Return (directory, filename, basename-without-extension, extension)
    for path *fn*."""
    fpath,fname = os.path.split(fn)
    fbase,fext = os.path.splitext(fname)
    return fpath,fname,fbase,fext

if __name__ == '__main__' :

    opts, args = parser.parse_args(sys.argv[1:])

    if len(args) < 1 :
        parser.print_usage()
        sys.exit(1)

    filename = args[0]
    abs_filename = os.path.abspath(filename)

    # check to ensure filename exists
    if not os.path.exists(abs_filename) :
        sys.stderr.write('File %s does not exist, exiting\n'%abs_filename)
        parser.print_usage()
        sys.exit(2)

    # split the file
    split_size = opts.split_arg
    fpath,fname,fbase,fext = get_file_parts(abs_filename)

    if opts.split_type == 'lines' :
        # start a new numbered output file every split_size lines
        curr_split = 0
        split_fd = None
        for i,l in enumerate(open(abs_filename)) :
            if i%split_size == 0 :
                if split_fd : split_fd.close()
                split_fd = open(os.path.join(opts.outdir,fname)+'.%04d'%curr_split,'w')
                curr_split += 1
            split_fd.write(l)
        # BUGFIX: the final split file was never closed
        if split_fd : split_fd.close()

    elif opts.split_type == 'count' :
        # create split_size split files by writing lines round robin
        split_fds = [open(os.path.join(opts.outdir,fname)+'.%04d'%x,'w') for x in range(split_size)]
        split_cycle = itertools.cycle(split_fds)
        for l in open(abs_filename) :
            next(split_cycle).write(l)  # next() works on py2 and py3; .next() did not

        # close all the handles
        for fd in split_fds :
            fd.close()

    # NOTE: an unreachable, unfinished 'size' split mode was removed here:
    # --type only accepts 'lines' and 'count', and the dead branch parsed
    # opts.split_arg with a regex although it is declared type='int'
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chipsequtil-master/scripts/split_qsub.py Mon Mar 07 16:18:10 2016 -0500 @@ -0,0 +1,73 @@ +#!/usr/bin/env python + +from __future__ import with_statement +import os +import sys +from optparse import OptionParser +from subprocess import Popen, PIPE + +from chipsequtil import get_file_parts + +usage = "[%prog] [options] <utility> <file> [<file> <file> ...]" +description = """\ +Submit a job using qsub for <utility>, each with one <file> as an argument. Any +options specified on the command line that [%prog] cannot interpret are passed +on to the utility for each call.""" +epilog = "Note: this script only works in Unix-style environments" +parser = OptionParser(usage=usage,description=description,epilog=epilog) +parser.add_option('--suffix',dest='suffix',default=None,help='string to append to stdout files, e.g. <filename>_<--suffix>.<--ext> [default: <utility>]') +parser.add_option('--ext',dest='ext',default='.out',help='file extension to use for stdout files') +parser.add_option('--util-args',dest='util_args',default='',help='double quote wrapped arguments to pass to <utility>') +parser.add_option('--keep-stderr',dest='keep_stderr',action='store_true',help='capture stderr files, useful for debugging') +parser.add_option('--keep-scripts',dest='keep_scripts',action='store_true',help='do not delete qsub scripts generated after job submission') +parser.add_option('--die-on-error',dest='die_on_err',action='store_true',help='if any one of the qsub submissions returns non-zero exit status, stop executing') + + +if __name__ == '__main__' : + + opts, args = parser.parse_args(sys.argv[1:]) + + utility, filenames = args[0], args[1:] + + # try to find the utility + abs_utility = os.path.abspath(utility) + if not os.path.exists(abs_utility) : + # look on the path + abs_utility = Popen('which %s'%utility,shell=True,stdout=PIPE,stderr=PIPE).communicate()[0].strip() + if not os.path.exists(abs_utility) : + raise Exception("Utility %s 
could not be found in the local directory or on the user's path, exiting"%utility) + sys.exit(1) + + upath,uname,ubase,uext = get_file_parts(abs_utility) + + runscript_tmpl = """ +#!/bin/bash + +#$ -N %(jobname)s +#$ -S /bin/sh +#$ -o %(stdout)s +#$ -e %(stderr)s +#$ -cwd +export PYTHONPATH=%(pythonpath)s:${PYTHONPATH} + +%(utility)s %(utilargs)s %(filename)s""" + + suffix = ubase if opts.suffix is None else opts.suffix + for fn in filenames : + abs_fn = os.path.abspath(fn) + fpath,fname,fbase,fext = get_file_parts(abs_fn) + stdout = os.path.join(fpath,fname+'_'+suffix+opts.ext) + stderr = '/dev/null' if not opts.keep_stderr else os.path.join(fpath,fname+'_'+suffix+'.err') + call_script = runscript_tmpl%{'jobname':fname,'utility':abs_utility,'filename':abs_fn,'stdout':stdout,'stderr':stderr,'utilargs':opts.util_args,'pythonpath':os.environ.get('PYTHONPATH','')} + f = open('%s'%abs_fn+'_'+utility+'.script','w') + f.write(call_script) + f.close() + p = Popen('qsub %s'%f.name,shell=True) + p.wait() + if not opts.keep_scripts : + os.remove(f.name) + + if opts.die_on_err and p.returncode != 0 : + with open(stderr,'w') as f : + f.write('qsub returned non-zero exit code for file %s, aborting\n'%fn) + sys.exit(1)
#!/usr/bin/env python

import re
import sys
import time

from optparse import OptionParser
from subprocess import Popen, PIPE

usage = '%prog [options] <job id> [<job id>...]'
desc = 'Poll qstat and wait until all <job id>s are finished'
parser = OptionParser(usage=usage,description=desc)

# Python regex matching PBS-style array job ids like "1234[].server";
# group 1 is the numeric id, group 2 the server part
array_job_match = '^(\d+)\[\]\.(.*)'
# *grep* (basic regex) template used to find array sub-tasks "1234[<n>]"
# in `qstat -t` output -- note the escaping is for grep, not Python re
array_job_regex = '^%s\[[0-9]\+\]'

def is_job_done(jobid) :
    """Return True if *jobid* no longer appears to be queued or running.

    Array jobs (ids matching ``array_job_match``) are considered done when
    `qstat -t` lists no remaining sub-tasks; standalone jobs are probed
    with ``qstat -j`` (SGE-specific) where any non-zero exit status is
    taken to mean the job has finished.
    """

    done = False

    # have to handle array jobs differently than standalone
    array_match = re.search(array_job_match,jobid)
    if array_match is not None :
        idnum, rest = array_match.groups()
        jobid_regex = array_job_regex%idnum
        # shell pipeline: list sub-tasks and keep only matching job ids;
        # empty output means every sub-task has left the queue
        qstat_p = Popen('qstat -t | grep "%s" | cut -f 1 -d " "'%jobid_regex,shell=True,stdout=PIPE)
        stdout, stderr = qstat_p.communicate()
        done = len(stdout) == 0

    else :
        # -j is only for SGE
        qstat_p = Popen('qstat -j %s'%jobid,shell=True,stdout=PIPE,stderr=PIPE)
        qstat_p.wait()
        if qstat_p.returncode == 0 :
            pass
        # assume any != 0 return code means job is done
        else :
            done = True

    return done

if __name__=='__main__':

    opts, args = parser.parse_args(sys.argv[1:])

    # one job id per positional argument, whitespace-stripped
    jobids = map(lambda x: x.strip(), args)

    # wait for all of them, polling every two seconds and reporting progress
    sys.stderr.write('Waiting for jobs to complete\n')
    jobs_done = [False]*len(jobids)
    try :
        while not all(jobs_done) :
            # only re-poll jobs not yet observed to be finished
            jobs_not_done = filter(lambda x: not x[1], enumerate(jobs_done))
            for i, jid in jobs_not_done :
                jobs_done[i] = is_job_done(jobids[i])
            sys.stderr.write('Jobs done: %d/%d\r'%(sum(jobs_done),len(jobs_done)))
            time.sleep(2)
            sys.stderr.flush()
    except KeyboardInterrupt :
        # ^C offers to kill outstanding jobs rather than just abandoning them
        # NOTE(review): assumes a kill_all_jobs.sh script is on the PATH --
        # it kills *all* of the user's jobs, not only the ones waited on here
        sys.stderr.write('\n')
        resp = raw_input('Caught keyboard interrupt, kill all jobs? [y/N] ')
        if resp.lower() == 'y' :
            Popen('kill_all_jobs.sh',shell=True)

    sys.stderr.write('done\n')
#!/usr/bin/env python
import time
from subprocess import Popen, PIPE

if __name__ == '__main__' :

    # Crude pipeline barrier: keep polling qstat until it reports no jobs
    # at all (empty stdout), sleeping briefly between polls so we don't
    # hammer the scheduler.
    while True :
        qstat_output = Popen('qstat',shell=True,stdout=PIPE).communicate()[0]
        if qstat_output == '' :
            break
        time.sleep(1)
#!/usr/bin/env python

from __future__ import with_statement
import os
import re
import sys
import time
from optparse import OptionParser
from subprocess import Popen, PIPE

from chipsequtil import get_file_parts

usage = "[%prog] [options] command"
description = """Wrap the specified command into a qsub script and submit it
for execution. Script captures both stdout and stderr to the current directory.
By default, all of the user's environment variables are put into the script
(compatible with SGE only ATM)."""
epilog = "Note: this script only works in Unix-style environments."
parser = OptionParser(usage=usage,description=description,epilog=epilog)
parser.add_option('--wqsub-name',dest='wqsub_name',default='wqsub',help='job name to submit as <--wqsub-name>_<first non-whitespace chars in command> [default: %default]')
parser.add_option('--wqsub-ext',dest='wqsub_ext',default='.out',help='file extension to use for stdout files')
parser.add_option('--wqsub-keep-script',dest='wqsub_keep_script',action='store_true',help='do not delete qsub script generated after job submission')
parser.add_option('--wqsub-no-env',dest='wqsub_no_env',action='store_true',help='do not include any local environment variables in the script')
parser.add_option('--wqsub-no-submit',dest='wqsub_no_sub',action='store_true',help='create script but do not submit job (useful for generating scripts)')
parser.add_option('--wqsub-drm',dest='drm',default='SGE',type='choice',choices=['SGE','TORQUE'],help='the DRM to generate scripts for [default: %default]')
parser.add_option('--wqsub-drm-arg',dest='drm_args',action='append',default=[],help='arguments to pass as parameters in the job script specific to the DRM, use multiple option flags to specify multiple parameters')
parser.add_option('--wqsub-wait',dest='wait',action='store_true',help='poll the DRM and do not return control until job is finished (only works for TORQUE)')

# Per-DRM job script templates; %(...)s slots are filled from job_dict below
templates = {
'TORQUE': """\
#!/bin/bash

#PBS -N %(jobname)s
#PBS -o %(stdout)s
#PBS -e %(stderr)s
#PBS -d %(cwd)s
%(env)s
%(addnl)s

%(command)s
""",
'SGE':"""\
#!/bin/bash

#$ -N %(jobname)s
#$ -S /bin/bash
#$ -o %(stdout)s
#$ -e %(stderr)s
#$ -cwd
%(env)s
%(addnl)s

%(command)s
"""
}

# directive prefix used after the leading '#' in each DRM's script comments
drm_symb = {
'TORQUE': 'PBS',
'SGE': '$'
}

if __name__ == '__main__' :

    # get the wqsub args out first: any argv token containing 'wqsub'
    # (including argv[0], this script's own name) is treated as ours,
    # everything else is the wrapped command and its arguments
    wqsub_args = []
    other_args = []
    for arg in sys.argv :
        if arg.count('wqsub') != 0 or arg in ['-h','--help'] :
            wqsub_args.append(arg)
        else :
            other_args.append(arg)

    opts, args = parser.parse_args(wqsub_args)

    if len(other_args) == 0 :
        parser.error('Must provide a command')

    command = ' '.join(other_args)
    runscript_tmpl = templates[opts.drm]
    # set up job parameters: stdout/stderr files live in the cwd and are
    # named after the job name
    cmd_exe = os.path.basename(other_args[0])
    jobname = opts.wqsub_name+'_'+cmd_exe
    stdout_fn = jobname+opts.wqsub_ext
    stdout = os.path.abspath(stdout_fn)
    fpath,fname,fbase,fext = get_file_parts(stdout)
    stderr = os.path.abspath(os.path.join(jobname+'.err'))

    # get the user's current environment and put it into the execute script
    # (-V tells the DRM to export the submitter's environment to the job)
    if opts.wqsub_no_env :
        env_str = '# local environment variables omitted'
    else :
        env_str = '#%s -V'%drm_symb[opts.drm]

    # construct additional DRM directive lines from --wqsub-drm-arg flags
    addnl_params = []
    for addnl in opts.drm_args :
        addnl_params.append('#%s %s'%(drm_symb[opts.drm],addnl))
    addnl_params = '\n'.join(addnl_params)

    job_dict = {'jobname':fname,
                'stdout':stdout,
                'stderr':stderr,
                'command':command,
                'env':env_str,
                'cwd':os.getcwd(),
                'addnl':addnl_params}

    call_script = runscript_tmpl%job_dict
    # write the script to file
    script_fn = os.path.abspath(jobname+'.script')
    with open(script_fn,'w') as f :
        f.write(call_script)

    if not opts.wqsub_no_sub :
        p = Popen('qsub %s'%f.name,shell=True,stdout=PIPE)
        p.wait()
        # qsub's stdout carries the new job id (format differs per DRM)
        stdout, stderr = p.communicate()
        if not opts.wqsub_keep_script :
            os.remove(f.name)
        if opts.wait :
            # poll qstat until the job id is no longer known to the DRM
            # NOTE(review): `stdout` here is the raw qsub output line, which
            # on SGE is a sentence ("Your job N ..."), not a bare job id --
            # confirm `qstat <that string>` behaves as intended on SGE
            done = False
            print 'Waiting on job id %s'%stdout.strip()
            while not done :
                qstat_p = Popen('qstat %s'%stdout,shell=True,stdout=PIPE,stderr=PIPE)
                qstat_p.wait()
                if opts.drm == 'TORQUE' :
                    # TORQUE qstat exits 153 for an unknown (finished) job
                    done = False if qstat_p.returncode != 153 else True
                elif opts.drm == 'SGE' :
                    # SGE qstat exits 1 for an unknown (finished) job
                    done = False if qstat_p.returncode != 1 else True
                time.sleep(3) # wait three seconds because it's nice
        else :
            # not waiting: just print the submitted job id for the caller
            if opts.drm == 'TORQUE' :
                print stdout.strip()
            elif opts.drm == 'SGE' :
                # regular job: "Your job <id> ..."
                qsub_output_patt = 'Your job (\d+)'
                m = re.match(qsub_output_patt,stdout.strip())
                if m is not None:
                    print m.group(1)
                    sys.exit(0)

                # might be an array job: "Your job-array <id>...."
                qsub_output_patt = 'Your job-array (\d+)\.'
                m = re.match(qsub_output_patt,stdout.strip())
                if m is not None:
                    print m.group(1)
#!/usr/bin/env python

from __future__ import with_statement
import os
import sys
from optparse import OptionParser
from subprocess import Popen, PIPE

import drmaa

from chipsequtil import get_file_parts

usage = "[%prog] [options] command"
description = """Submit *command* to a DRMAA-enabled job queueing system.
Output of the command goes to file, stderr is ignored unless specified
as an option. By default, all of the user's environment
variables are imported into job environment."""
epilog = "Note: this script only works in Unix-style environments."
parser = OptionParser(usage=usage,description=description,epilog=epilog)
parser.add_option('--wqsub-name',dest='wqsub_name',default='wqsub',help='job name to submit as <--wqsub-name>_<first non-whitespace chars in command> [default: %default]')
parser.add_option('--wqsub-stdout',dest='wqsub_stdout',default=None,help='name of file to write stdout to (equivalent to -o argument in SGE) [default: <wqsub-name>_<command>.out]')
parser.add_option('--wqsub-stderr',dest='wqsub_stderr',default=None,help='name of file to write stderr to (equivalent to -e argument in SGE) [default: <wqsub-name>_<command>.err]')
parser.add_option('--wqsub-join',dest='wqsub_join',action='store_true',help='join stdout and stderr into file indicated by --wqsub-stdout (equivalent to -j flag in SGE)')
parser.add_option('--wqsub-no-env',dest='wqsub_no_env',action='store_true',help='do not include any local environment variables in the script')
parser.add_option('--wqsub-wait',dest='wqsub_wait',action='store_true',help='wait for job to finish executing before returning from script')


if __name__ == '__main__' :

    # get the wqsub args out first: any argv token containing 'wqsub'
    # (including argv[0], this script's own name) is ours, the rest is
    # the command to submit
    wqsub_args = []
    other_args = []
    for arg in sys.argv :
        if arg.count('wqsub') != 0 or arg in ['-h','--help'] :
            wqsub_args.append(arg)
        else :
            other_args.append(arg)

    opts, args = parser.parse_args(wqsub_args)

    if len(other_args) == 0 :
        parser.error('Must provide a command')

    # set up job parameters; default output filenames derive from the job name
    jobname = opts.wqsub_name+'_'+other_args[0]
    stdout_fn = jobname+'.out'
    if opts.wqsub_stdout :
        stdout_fn = opts.wqsub_stdout
    stdout = os.path.abspath(stdout_fn)

    # remove stale output from a previous run so results aren't mixed
    if os.path.exists(stdout) :
        os.remove(stdout)

    stderr_fn = jobname+'.err'
    if opts.wqsub_stderr :
        stderr_fn = opts.wqsub_stderr
    stderr = os.path.abspath(stderr_fn)
    if os.path.exists(stderr) :
        os.remove(stderr)

    # drmaa job submission
    session = drmaa.Session()
    session.initialize()

    # initialize job template
    job_template = session.createJobTemplate()

    # construct DRMAA job: first token is the executable, rest are its args
    command,args = other_args[0],other_args[1:]
    job_template.remoteCommand = command
    job_template.args = args
    job_template.jobName = jobname
    job_template.joinFiles = opts.wqsub_join

    # output and error paths apparently require a ':' in front
    job_template.outputPath = ':'+stdout
    job_template.errorPath = ':'+stderr

    # get the user's current environment and put it into the execute script
    if not opts.wqsub_no_env :
        job_template.jobEnvironment = os.environ

    # submit the job and wait for it
    jobid = session.runJob(job_template)

    if opts.wqsub_wait :
        # submit and wait for job to complete, keyboard interrupt aborts job
        try :
            # NOTE(review): return value is unused; exit status of the job
            # is therefore not propagated to this script's exit code
            retval = session.wait(jobid, drmaa.Session.TIMEOUT_WAIT_FOREVER)

        except KeyboardInterrupt :
            # NOTE(review): despite the message, nothing here actually
            # terminates the submitted job (no session.control call)
            sys.stderr.write('Keyboard interrupt caught (^C), aborting')
            pass

    # clean up
    session.deleteJobTemplate(job_template)
    session.exit()
# Default options applied to `python setup.py install`.
# NOTE(review): this prefix looks like the original author's personal
# install location; most users should override it with --prefix on the
# command line or delete this section before installing system-wide.
[install]
prefix=~/arch/univ
#!/usr/bin/env python
"""distutils install script for the chipsequtil package and its scripts."""

import os
import sys

from distutils.core import setup
#from ez_setup import use_setuptools
#use_setuptools()
#from setuptools import setup

# convenience is king
opj = os.path.join

# make sure org_settings.cfg is in source directory -- it must be copied
# from org_settings.cfg.sample into src/chipsequtil/ before installing
# (see README.txt); warn but continue if it is missing
org_settings_fn = 'org_settings.cfg'
dist_settings_path = opj(os.getcwd(),'src','chipsequtil',org_settings_fn)
if not os.path.exists(dist_settings_path) :
    sys.stderr.write('WARNING: %s could not be found \
in distribution root directory. org_settings.py script may \
not work properly.\n'%dist_settings_path)

# command-line tools installed onto the user's PATH
scripts = ['scripts/build_chipseq_infosite.py',
           'scripts/chipseq_pipeline.py',
           'scripts/combine_gerald_stats.py',
           'scripts/compare_microarray_binding.py',
           'scripts/create_pipeline_script.py',
           'scripts/extract_promoters.py',
           'scripts/filter_bed_by_position_count.py',
           'scripts/filter_macs_peaks.py',
           'scripts/filter_gps_peaks.py',
           'scripts/filter_mapped_known_genes.py',
           'scripts/generate_stats_doc.py',
           'scripts/gerald_stats.py',
           'scripts/gerald_to_bed.py',
           'scripts/integrate_macs_ucsc.py',
           'scripts/join_mapped_known_genes.py',
           'scripts/map_intervals.py',
           'scripts/map_peaks_to_genes.py',
           'scripts/map_peaks_to_known_genes.py',
           'scripts/motif_scan.py',
           'scripts/nibFrag.py',
           'scripts/org_settings.py',
           'scripts/peaks_to_fasta.py',
           'scripts/plot_pos_vs_neg_peaks.py',
           'scripts/plot_peak_loc_dist.py',
           'scripts/probeset_to_known_gene.py',
           'scripts/rejection_sample_fasta.py',
           'scripts/sort_bed.py',
           'scripts/split_file.py',
           'scripts/split_qsub.py',
           'scripts/THEME.sh',
           'scripts/wait_for_qsub.py',
           'scripts/wait_for_jobid.py',
           'scripts/wqsub.py',
           'scripts/wqsub_drmaa.py',
           ]

# setup and install; package sources live under src/, and the organism
# settings file is shipped as package data so org_settings.py can find it
setup(name='chipsequtil',
      version='0.5',
      author='Adam Labadorf',
      author_email='alabadorf@gmail.com',
      package_dir={'':'src'},
      py_modules=['chipsequtil.nib','chipsequtil.util','chipsequtil.plotting',
                  'chipsequtil.sampling','chipsequtil.seq'],
      packages=['chipsequtil'],
      package_data={'': ['org_settings.cfg']},
      scripts=scripts,
      #cmdclass={'uninstall': uninstall},
      )
"""
This module needs documentation.
"""

# FIX: use an explicit relative import. The original bare
# ``from chipsequtil import *`` relied on Python 2 implicit relative
# imports to pick up the chipsequtil.chipsequtil submodule; under
# absolute-import semantics (Python 3, or Py2 with absolute_import) it
# would re-import this package itself instead. Explicit relative imports
# work from Python 2.5 onward, so this is backward-compatible.
from .chipsequtil import *
"""Core file formats and configuration helpers for ChIP-seq analysis:
parsers/iterators for GERALD, BED, GPS, refGene, knownGene and MACS files,
plus access to organism settings from org_settings.cfg files.
Python 2 module (ConfigParser, ``except X, e`` syntax).
"""

import math
import os
import re
import string
import sys

from ConfigParser import ConfigParser
from csv import DictReader
from collections import defaultdict

# the package itself, used below to locate the installed org_settings.cfg
import chipsequtil

# for RefGeneDB
# NOTE(review): implicit relative import (Python 2 only); would need to be
# ``from .util import KeyedBinaryTree`` under absolute-import semantics
from util import KeyedBinaryTree


def get_file_parts(path) :
    """For <path>/<basename>.<ext>, returns 4-tuple (<path>,<basename>.<ext>,<basename>,<ext>)"""
    path,fn = os.path.split(path)
    basename,ext = os.path.splitext(fn)
    return path,fn,basename,ext

def parse_number(n) :
    """Try to cast intput first to float, then int, returning unchanged if both fail"""
    # floats are only attempted when a '.' is present, so '3' -> int(3)
    try :
        return float(n) if '.' in n else int(n)
    except :
        return n


def gerald_to_bed(gerald,min_fields=False) :
    """Convert a GERALDOutput object into a BEDOutput object

    Keyword argument *min_fields* produces BED alignment with only the first
    three fields populated
    """

    # start with every BED field blank, then fill in what we know
    d = {}.fromkeys(BEDOutput.FIELD_NAMES,'')

    # required BED fields; end coordinate is start plus read length
    d['chrom'] = gerald.match_chromo
    d['chromStart'] = gerald.match_pos
    d['chromEnd'] = gerald.match_pos+len(gerald.read)

    # load the remaining information
    if not min_fields :
        # GERALD uses F/R for strand, BED uses +/-
        d['strand'] = '+' if gerald.match_strand == 'F' else '-'
        # TODO consider encoding single-read alignment score into BED score format
        # that's it?

    return BEDOutput(**d)


class GERALDOutput :
    """Container for one line of GERALD alignment output as generated by Illumina
    pipeline version >= 1.3."""

    # one attribute per tab-separated GERALD column, in file order
    FIELD_NAMES = ['machine',
                   'run_number',
                   'lane',
                   'tile',
                   'x_coord',
                   'y_coord',
                   'index',
                   'read_no',
                   'read',
                   'quality_string',
                   'match_chromo',
                   'match_contig',
                   'match_pos',
                   'match_strand',
                   'match_desc',
                   'single_read_score',
                   'paired_read_score',
                   'partner_chromo',
                   'partner_contig',
                   'partner_offset',
                   'partner_strand',
                   'filtering',
                   ]

    def __init__(self,line) :
        """*line* is either a raw GERALD record string or a pre-split list
        of fields; raises FormatException on a wrong field count."""

        if type(line) == str :
            line = line.strip().split('\t')

        if len(line) != len(GERALDOutput.FIELD_NAMES) :
            raise GERALDOutput.FormatException('Expected %d fields in input, \
found %d in line: %s'%
                (len(GERALDOutput.FIELD_NAMES),
                 len(line),
                 line))

        # numeric-looking fields are stored as int/float via parse_number
        for fn,d in zip(GERALDOutput.FIELD_NAMES,line) :
            setattr(self,fn,parse_number(d))

    def __repr__(self) :
        return 'GERALDOutput(%s)'%repr(self.output_format())

    def output_format(self) :
        """Tab delimited string of fields as they would appear in GERALD output file"""
        return '\t'.join([str(getattr(self,d)) for d in GERALDOutput.FIELD_NAMES])+'\n'

    class FormatException(Exception) :
        """GERALD format exception, raised on malformatted input"""
        pass


class SmartFileIter :
    r"""An 'abstract' class implementing a smart file iterator. It is essentially
    a wrapper around a collections.DictReader object that parses fields into
    Python datatypes (int, float, tuple, objects, etc) as they are iterated.
    The constructor argument *f* can be either a valid filename or a file-like
    object. This class should not be directly instantiated - rather it should
    be subclassed with FIELD_NAMES and FIELD_TYPES defined. FIELD_NAMES is a
    list of strings referring to the names of the fields, FIELD_TYPES is a list
    of the same length of callables that will parse the column into the desired
    format. Example::

        >>> s = StringIO('chr1\t0\t100\t+\nchr3\t300\t601\t-\n')
        >>> class IntervalFile(SmartFileIter):
                r'''A SmartFileIter for files with lines formatted like:
                chrom\tstart\tend\tstrand'''
                FIELD_NAMES = ['chrom','start','end','strand']
                FIELD_TYPES= [str,int,int,lambda x: 0 if x == '+' else 1]
        >>> f = IntervalFile(s)
        >>> for r in f :
                print r['chrom'], 'length: ', r['end']-r['start'], 'strand: ',r['strand']

    ``r['start']`` and ``r['end']`` are automatically available as integers,
    so the subraction works as expected. Arbitrary functions that accept a
    single argument and return a value may also be specified.
    """

    def __init__(self,f,skip_line_chars='#') :
        # subclasses must supply the column names and per-column parsers
        if not hasattr(self,'FIELD_NAMES') or not hasattr(self,'FIELD_TYPES') :
            raise Exception('Subclasses must define class members FIELD_NAMES and FIELD_TYPES')
        if isinstance(f,str) :
            f = open(f)
        self._dict_reader = DictReader(f,delimiter='\t',fieldnames=self.FIELD_NAMES)
        self.fieldnames = self.FIELD_NAMES
        # one record of lookahead so comments/headers can be skipped eagerly
        self.curr_line = self._dict_reader.next()
        self.skip_line_chars = skip_line_chars

        # skip initial comment lines
        while self.curr_line[self.FIELD_NAMES[0]][0] in self.skip_line_chars :
            self.curr_line = self._dict_reader.next()

        # skip a header row that repeats the field names
        if self.FIELD_NAMES[0] in self.curr_line.values() :
            self.curr_line = self._dict_reader.next()

    def __iter__(self) :
        return self

    def __getattr__(self,attr) :
        # delegate unknown attributes to the underlying DictReader
        try:
            return self.__dict__[attr]
        except KeyError :
            return getattr(self._dict_reader,attr)

    def next(self) :
        """Emit the next record in the file as a dictionary with parsed values"""

        if self.curr_line is None :
            raise StopIteration()

        line = self.curr_line

        # check for comment
        while line[self.FIELD_NAMES[0]][0] in self.skip_line_chars :
            line = self.curr_line = self._dict_reader.next()

        # apply each column's parser; on failure the raw string is kept
        for k,f in zip(self.FIELD_NAMES, self.FIELD_TYPES) :
            try :
                line[k] = f(line[k])
            except Exception, e :
                #sys.stderr.write('Warning: field %s on line %d could not be properly formatted, exception %s\n'%(k,self._dict_reader.reader.line_num,str(e)))
                line[k] = line[k]

        # advance the lookahead; None marks end-of-file for the next call
        try :
            self.curr_line = self._dict_reader.next()
        except StopIteration :
            self.curr_line = None

        return line


class BEDOutput :
    """*Deprecated*: Use *BEDFile* instead.

    Container for one line of BED alignment output"""

    FIELD_NAMES = ['chrom',
                   'chromStart',
                   'chromEnd',
                   'name',
                   'score',
                   'strand',
                   'thickStart',
                   'thickEnd',
                   'itemRgb',
                   'blockCount',
                   'blockSizes',
                   'blockStarts',
                   ]

    def __init__(self,line='',*args,**kwargs) :
        """*line* is a raw BED record or pre-split field list; keyword
        arguments override values parsed from *line*. At least the three
        required BED fields must be supplied one way or the other."""

        if type(line) == str :
            line = line.strip().split('\t')

        if len(line) < 3 and any([x not in kwargs.keys() for x in ['chrom','chromStart','chromEnd']]) :
            raise BEDOutput.FormatException('Format requres at least 3 fields in \
input, found %d in line: %s'%(len(line),line))
        if len(line) > len(BEDOutput.FIELD_NAMES) :
            raise BEDOutput.FormatException('Format requres at most %d fields in \
input, found %d in line: %s'%
                (len(BEDOutput.FIELD_NAMES),len(line),line))

        # pad missing optional columns with empty strings
        empty_fields = ['']*(len(BEDOutput.FIELD_NAMES)-len(line))
        for fn,d in zip(BEDOutput.FIELD_NAMES,line+empty_fields) :
            setattr(self,fn,parse_number(d))

        # kwargs override line input
        for k,v in kwargs.items() :
            setattr(self,k,parse_number(v))

    def __repr__(self) :
        return 'BEDOutput(%s)'%(repr(self.output_format()))

    def output_format(self) :
        """Returns a string for the BED line as it would appear in a file"""
        return '\t'.join([str(getattr(self,d)) for d in BEDOutput.FIELD_NAMES])+'\n'

    class FormatException(Exception) :
        """BED format exception, raised on malformatted input"""
        pass


class BEDFile(SmartFileIter) :
    '''An iterable object containing the records in the supplied BED formatted
    file. Fieldnames are::

        FIELD_NAMES = ['chrom',
                       'chromStart',
                       'chromEnd',
                       'name',
                       'score',
                       'strand',
                       'thickStart',
                       'thickEnd',
                       'itemRgb',
                       'blockCount',
                       'blockSizes',
                       'blockStarts',
                       ]
    '''

    FIELD_NAMES = BEDOutput.FIELD_NAMES
    # block* columns are comma-separated lists in BED, hence the split lambdas
    FIELD_TYPES = [str,int,int,str,float,str,int,int,str,lambda x: x.split(','), lambda x: x.split(','), lambda x: x.split(',')]


class BEDFile_dictreader(DictReader) :
    '''An iterable object (subclasses csv.DictReader) containing the records in
    the supplied BED formatted file.'''
    FIELD_NAMES = BEDOutput.FIELD_NAMES
    def __init__(self,bed) :
        '''*bed* is either a filename or a file-like object representing a BED file'''
        if isinstance(bed,str) :
            bed = open(bed)
        DictReader.__init__(self,bed,delimiter='\t',
                            fieldnames=BEDOutput.FIELD_NAMES)


class GPSFile(SmartFileIter) :
    '''An iterable object containing the records in the peaks file format
    generated by GPS. Fieldnames are::

        FIELD_NAMES = ["Position",
                       "IP",
                       "Control",
                       "Fold",
                       "Q_-lg10",
                       "P_-lg10",
                       "IPvsEMP",
                       "IPvsCTR",
                       "blank"
                       ]
    '''

    FIELD_NAMES = ["Position",
                   "IP",
                   "Control",
                   "Fold",
                   "Q_-lg10",
                   "P_-lg10",
                   "IPvsEMP",
                   "IPvsCTR",
                   "blank"
                   ]

    # Position 'chrom:pos' is parsed into a ('chr<chrom>', pos, original) tuple
    FIELD_TYPES = [lambda x: ('chr%s'%x.split(':')[0],int(x.split(':')[1]),x),
                   float,
                   float,
                   float,
                   float,
                   float,
                   float,
                   float,
                   str
                   ]

    def __init__(self,gps_fn) :
        # accepts a filename only (unlike SmartFileIter itself)
        f = open(gps_fn)

        SmartFileIter.__init__(self,f)


class AffyBiocFile(DictReader) :
    '''An iterable object (subclasses csv.DictReader) containing microarray data records in
    the supplied bioconductor formatted file.'''

    FIELD_NAMES = [ 'ID',
                    'Symbol',
                    'Name',
                    'M',
                    'A',
                    't',
                    'P.Value',
                    'B'
                    ]

    def __init__(self,affyfn) :
        '''*affyfn* is either a filename or a file-like object representing a bioconductor output file'''
        # NOTE(review): copy-paste bug -- `bed` is undefined here (should be
        # ``affyfn = open(affyfn)``) and the DictReader is built with
        # BEDOutput.FIELD_NAMES instead of AffyBiocFile.FIELD_NAMES, so this
        # class cannot work as written
        if isinstance(affyfn,str) :
            bed = open(bed)
        DictReader.__init__(self,bed,delimiter='\t',
                            fieldnames=BEDOutput.FIELD_NAMES)


class RefGeneOutput(object) :
    # column names of UCSC's refGene table, see
    # http://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/refGene.sql
    FIELD_NAMES = ['bin',
                   'name',
                   'chrom',
                   'strand',
                   'txStart',
                   'txEnd',
                   'cdsStart',
                   'cdsEnd',
                   'exonCount',
                   'exonStarts',
                   'exonEnds',
                   'score',
                   'name2',
                   'cdsStartStat',
                   'cdsEndStat',
                   'exonFrames',]


class RefGeneFile(DictReader) :
    '''An iterable object (subclasses csv.DictReader) containing the records in
    the supplied BED formatted file'''
    def __init__(self,refGene_fn) :
        refGene_f = open(refGene_fn)
        # check for header
        first_line = refGene_f.next()
        if not first_line.strip().startswith('#') :
            refGene_f.seek(0) # first line not header, reset the file pointer
        DictReader.__init__(self,refGene_f,delimiter='\t',fieldnames=RefGeneOutput.FIELD_NAMES)

class RefGeneFile_nottested(SmartFileIter) :
    '''An iterable object containing the records in the supplied UCSC RefGene
    refFlat formatted file (see e.g.
    http://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/refGene.sql)'''
    FIELD_NAMES = ['bin',
                   'name',
                   'chrom',
                   'strand',
                   'txStart',
                   'txEnd',
                   'cdsStart',
                   'cdsEnd',
                   'exonCount',
                   'exonStarts',
                   'exonEnds',
                   'score',
                   'name2',
                   'cdsStartStat',
                   'cdsEndStat',
                   'exonFrames',]
    # exonStarts/exonEnds are comma-separated coordinate lists
    FIELD_TYPES = [str,str,str,str,int,int,int,int,int,
                   lambda x: [int(y) for y in x.split(',') if len(y) > 0],
                   lambda x: [int(y) for y in x.split(',') if len(y) > 0],
                   float,
                   str,str,str,str]

class KnownGeneFile(SmartFileIter) :
    '''An iterable that parses UCSC's KnownGene gene annotation files. Field
    names are::

        FIELD_NAMES = [ 'name',
                        'chrom',
                        'strand',
                        'txStart',
                        'txEnd',
                        'cdsStart',
                        'cdsEnd',
                        'exonCount',
                        'exonStarts',
                        'exonEnds',
                        'proteinID',
                        'alignID',
                        ]
    '''

    FIELD_NAMES = [ 'name',
                    'chrom',
                    'strand',
                    'txStart',
                    'txEnd',
                    'cdsStart',
                    'cdsEnd',
                    'exonCount',
                    'exonStarts',
                    'exonEnds',
                    'proteinID',
                    'alignID',
                    ]

    # function pointers for correct formatting of field names
    # NOTE(review): the first split-lambda lines up with 'exonCount', so a
    # scalar count like '8' is parsed as the list [8]; it looks like one
    # int converter is missing from this list
    FIELD_TYPES = [ str,
                    str,
                    str,
                    int,
                    int,
                    int,
                    int,
                    lambda x: [int(y) for y in x.split(',') if len(y) > 0],
                    lambda x: [int(y) for y in x.split(',') if len(y) > 0],
                    lambda x: [int(y) for y in x.split(',') if len(y) > 0],
                    str,
                    str,
                    ]

    def __init__(self,kg_fn) :
        # NOTE(review): does not call SmartFileIter.__init__, so the comment/
        # header skipping and lookahead logic of the base class is bypassed;
        # iteration is reimplemented below instead
        self.meta_data = []
        self.file_info = {}
        f = open(kg_fn)
        self._dict_reader = DictReader(f,delimiter='\t',fieldnames=KnownGeneFile.FIELD_NAMES)

    def __iter__(self) :
        return self

    def next(self) :
        line = self._dict_reader.next()
        for k,f in zip(self.FIELD_NAMES,self.FIELD_TYPES) :
            line[k] = f(line[k])
        return line


#TODO maybe, finish this
class RefGeneDB :
    '''A class for querying RefGene annotation files. NOT DONE.'''

    def __init__(self,refgene_fn) :
        # one binary tree of genes per chromosome, keyed by txStart
        self._chrom_trees = defaultdict(KeyedBinaryTree)
        refgene_f = RefGeneFile(refgene_fn)
        genes = defaultdict(list)
        for gene in refgene_f :
            genes[gene['chrom']].append(gene)

        # do stuff to ensure a balanced tree for each chromosome
        # NOTE(review): ``min(len(first_half,second_half))`` calls len() with
        # two arguments and would raise TypeError; class is admittedly unfinished
        for chrom,gene_list in genes.items() :
            gene_list.sort(key=lambda x: int(x['txStart']))
            first_half, second_half = gene_list[:len(gene_list)/2],gene_list[len(gene_list)/2:]
            first_half.reverse()
            for i in range(min(len(first_half,second_half))) :
                to_add = first_half.pop(i)
                self._chrom_trees[chrom].addNode(int(to_add['txStart']),to_add)


class MACSFile(SmartFileIter) :
    '''An iterable object containing the records in the supplied MACS peak file.

    This class parses the comments found in the header of MACS peak files and
    extracts metadata into the member dictionary **file_info**. Here is an example
    metadata dictionary::

        >>> f = MACSFile('macs_peaks.xls')
        >>> f.file_info
        {'ChIP-seq file': 'experiment_read_alignments.sam',
         'MACS version': '1.4.0rc2 20110214',
         'Range for calculating regional lambda': '1000 bps and 10000 bps',
         'Redundant rate in control': 0.72999999999999998,
         'Redundant rate in treatment': 0.080000000000000002,
         'band width': 300,
         'control file': 'control_read_alignments.sam',
         'd': 203,
         'effective genome size': 2110000000.0,
         'format': 'SAM',
         'maximum duplicate tags at the same position in control': 2,
         'maximum duplicate tags at the same position in treatment': 2,
         'model fold': '10,30',
         'name': 'my_awesome_ChIP',
         'pvalue cutoff': 1.0000000000000001e-05,
         'tag size': 36,
         'tags after filtering in control': 7879454,
         'tags after filtering in treatment': 23927336,
         'total tags in control': 29703098,
         'total tags in treatment': 26092366}

    The complete header can be found as a list in the **meta_data** member with
    one comment per item. The field names available are::

        FIELD_NAMES = ['chr',
                       'start',
                       'end',
                       'length',
                       'summit',
                       'tags',
                       '-10*log10(pvalue)',
                       'fold_enrichment',
                       'FDR(%)',
                       ]

    '''
    FIELD_NAMES = ['chr',
                   'start',
                   'end',
                   'length',
                   'summit',
                   'tags',
                   '-10*log10(pvalue)',
                   'fold_enrichment',
                   'FDR(%)',
                   ]

    FIELD_TYPES = [str,
                   int,
                   int,
                   int,
                   int,
                   int,
                   float,
                   float,
                   float
                   ]

    # each regex captures (metadata key, value) from one MACS header comment
    _METADATA_REGEXES = [
        u'# This file is generated by (MACS version) (.*)',
        u'# (name) = (.*)',
        u'# (format) = (.*)',
        u'# (ChIP-seq file) = (.*)',
        u'# (control file) = (.*)',
        u'# (effective genome size) = (.*)',
        u'# (band width) = (\d+)',
        u'# (model fold) = (.*)',
        u'# (pvalue cutoff) = (.*)',
        u'# (Range for calculating regional lambda) is: (.*)',
        u'# (tag size) is determined as (\d+) bps',
        u'# (total tags in treatment): (\d+)',
        u'# (tags after filtering in treatment): (\d+)',
        u'# (maximum duplicate tags at the same position in treatment) = (\d+)',
        u'# (Redundant rate in treatment): (.*)',
        u'# (total tags in control): (.*)',
        u'# (tags after filtering in control): (.*)',
        u'# (maximum duplicate tags at the same position in control) = (\d+)',
        u'# (Redundant rate in control): (.*)',
        u'# (d) = (\d+)'
        ]

    def __init__(self,macs_fn) :
        self.meta_data = []
        self.file_info = {}
        if isinstance(macs_fn,str) :
            f = open(macs_fn)
        else :
            f = macs_fn
        # consume the comment header, harvesting metadata as we go; the
        # column-name row (starting with the first five field names) marks
        # the end of the header
        done_with_header = False
        while not done_with_header :
            l = f.next().strip()
            if l.startswith('#') :
                for regex in MACSFile._METADATA_REGEXES :
                    m = re.search(regex,l)
                    if m is not None :
                        self.file_info[m.group(1).strip()] = parse_number(m.group(2).strip())
                self.meta_data.append(l)
            elif l.startswith('\t'.join(MACSOutput.FIELD_NAMES[:5])) :
                self.meta_data.append(l)
                done_with_header = True

        SmartFileIter.__init__(self,f)


# for backwards compatibility, use MACSFile instead...?
class MACSOutput(object) :
    FIELD_NAMES = MACSFile.FIELD_NAMES

# settings shipped with the installed package vs. the user's own overrides
GLOBAL_SETTINGS_FN = os.path.join(os.path.split(chipsequtil.__file__)[0],'org_settings.cfg')
LOCAL_SETTINGS_FN = os.path.expanduser(os.path.join('~','.org_settings.cfg'))
_ALL_SETTINGS, _LOCAL_SETTINGS, _GLOBAL_SETTINGS = range(3)

def _get_org_settings(org_key=None,addnl_configs=[],src=_ALL_SETTINGS) :
    """Utility function used by get_org_settings and get_all_settings, should \
not be called directly"""

    config = ConfigParser()
    # NOTE(review): chipsequtil_base is never used; both names are bound to
    # the same fresh list here
    chipsequtil_base = conf_fns = []
    if src in [_LOCAL_SETTINGS, _ALL_SETTINGS] :
        conf_fns.append(LOCAL_SETTINGS_FN)
    if src in [_GLOBAL_SETTINGS, _ALL_SETTINGS] :
        conf_fns.append(GLOBAL_SETTINGS_FN)
    config.read(conf_fns+addnl_configs)

    d = {}
    if org_key is None :
        # no organism requested: return {section: {key: value}} for all
        for sec in config.sections() :
            # try to cast numeric-looking arguments into float, int
            d[sec] = dict([(k,parse_number(v)) for k,v in config.items(sec)])
    else :
        d = dict([(k,parse_number(v)) for k,v in config.items(org_key)])

    return d


def get_org_settings(org_key,addnl_configs=[]) :
    '''Returns a dict of setting/path values for a given organism as specified
    in system-wide and user's settings. *org_key* is the organism name as found
    in the config file, *e.g.* mm9. *addnl_configs* are filenames of other
    configuration files to add to the set of settings, usually not needed.

    Example usage::

        >>> org_d = get_org_settings('mm9')
        >>> org_d
        {'affy_to_known_path': '/nfs/genomes/mouse_gp_jul_07/anno/knownToMOE43-mm9.txt',
         'annotation_path': '/nfs/genomes/mouse_gp_jul_07/anno/refFlat-mm9.txt',
         'description': "UCSC mm9 (July '07 build) with full TRANSFAC hypothesis set",
         'genome': 'mm9',
         'genome_dir': '/nfs/genomes/mouse_gp_jul_07',
         'genome_size': 2107000000,
         'known_gene_anno_path': '/nfs/genomes/mouse_gp_jul_07/anno/knownGene-mm9.txt',
         'known_gene_xref_path': '/nfs/genomes/mouse_gp_jul_07/anno/kgXref-mm9.txt',
         'refgene_anno_path': '/nfs/genomes/mouse_gp_jul_07/anno/refFlat-mm9.txt',
         'theme_hypotheses': '/nfs/vendata/cwng/TRANSFAC/2010_transfac_vert_all_filtic9.tamo',
         'theme_markov': '/nfs/data/cwng/chipseq/hypotheses/Mouse.markov',
         'ucsc_chrom_sizes': '/nfs/genomes/mouse_gp_jul_07/mm9.chrom.sizes'}
        >>> get_org_settings('mm9')['genome_dir']
        '/nfs/genomes/mouse_gp_jul_07'

    '''
    return _get_org_settings(org_key,addnl_configs=addnl_configs)


def get_all_settings(addnl_configs=[]) :
    '''Returns a dict of setting/path values for every organism as specified in
    system-wide and user's settings.'''
    return _get_org_settings(None,addnl_configs=addnl_configs)


def get_global_settings() :
    '''Returns a dict of the global setting/path values installed with the
    package.'''
    return _get_org_settings(None,src=_GLOBAL_SETTINGS)


def get_local_settings() :
    '''Returns a dict of the current user's setting/path values taken from
    ~/.org_settings.cfg if it exists.'''
    return _get_org_settings(None,src=_LOCAL_SETTINGS)


def check_org_settings(org_key,setting_list) :
    '''Returns true if all setting names in *setting_list* are found in the
    org settings for organism *org_key* and false otherwise. Mostly used
    internally to sanity check org settings.'''
    settings = get_org_settings(org_key)
    return all([s in settings.keys() for s in setting_list])


# translation table mapping each nucleotide to its complement (both cases)
RC_MAP = string.maketrans('acgtACGT','tgcaTGCA')
def reverse_complement(seq) :
    """Reverse complements nucleotide string *seq*. Leaves non-nucleotide characters uneffected."""
    return seq.translate(RC_MAP)[::-1]


def get_gc_content(seq) :
    '''returns the GC content of a DNA sequence as python string'''
    # fraction in [0,1]; raises ZeroDivisionError on an empty sequence
    seq = seq.lower()
    return (seq.count('c')+seq.count('g'))/float(len(seq))


def get_gc_content_distribution(sequences,bins=100) :
    '''returns a list of
    provided sequences. Approximation is performed by binning.'''
    # (docstring is truncated in the original source: returns the list of
    # normalized per-bin frequencies of GC content across *sequences*)
    gc_contents = [get_gc_content(s) for s in sequences]
    gc_contents.sort()

    # count up the sequences for each bin
    bin_counts = [0.]*bins
    for c in gc_contents :
        sample_bin = int(math.floor(c*bins))
        bin_counts[sample_bin] += 1

    # normalize bin counts
    norm_bins = [x/len(sequences) for x in bin_counts]

    # create a closure for this set of sequences
    #def f(seq) :
    #    gc = get_gc_content(seq)
    #    return norm_bins[int(math.floor(gc*bins))]

    return norm_bins


def get_size_distribution(sequences) :
    # generator of sequence lengths (not a materialized list)
    return (len(s) for s in sequences)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chipsequtil-master/src/chipsequtil/motiftools.py Mon Mar 07 16:18:10 2016 -0500 @@ -0,0 +1,2064 @@ +""" +There is a large number of functions and member fucntions here. To get started, +a motif can be instantiated by providing an ambiguity code, a set of aligned DNA +sequences, or from matrices of counts, probabilities or log-likelihoods (akaPSSMs). + +>>> m = MotifTools.Motif_from_text('TGAAACANNSYWT') +>>> print m.oneletter() +TGAAACA..sywT + +Lower case reflects lower information content. For a more detailed view of the distribution +of information, try this:: + + >>> m.textlogo() + # -- 2.30 bits + # + # TGAAACA T + # TGAAACA T + # TGAAACA T + # TGAAACA T + # TGAAACA CCAT + # TGAAACA CCAT + # TGAAACA GTTT + # TGAAACA GTTT -- 0.23 bits + # ------------- + # TGAAACA..sywT + + +Motif objects may be manipulated largely like text strings (with pythonic +indexing):: + + >>> print m[4:5].oneletter + A + >>> print m[4:7].oneletter + ACA + >>> print (m[4:7] + m[1:2]).oneletter + ACAG + >>> print (m[4:7] + m[1:7]).oneletter + ACAGAAACA + +and even padded with blanks:: + + >>> print m[-4:7] + ...TGAAACA + +.. Copyright (2005) Whitehead Institute for Biomedical Research +.. 
All Rights Reserved + +Author: David Benjamin Gordon + +Modified by: Adam Labadorf + +""" +import copy +import math +import os +import pickle +import re +import string +import sys +import tempfile + +pysum = sum + +from random import random,shuffle +from subprocess import call + +from chipsequtil import reverse_complement +class MotifToolsException(Exception) : pass + +one2two = { 'W':'AT', 'M':'AC', 'R':'AG', + 'S':'CG', 'Y':'CT', 'K':'GT'} +two2one = { 'AT': 'W', 'AC': 'M', 'AG': 'R', + 'CG': 'S', 'CT': 'Y', 'GT': 'K'} +revcomp = { 'A':'T', 'T':'A', 'C':'G', 'G':'C', + 'W':'W', 'S':'S', 'K':'M', 'M':'K', + 'Y':'R', 'R':'Y', 'N':'N', + 'B':'N', 'D':'N', 'H':'N', 'V':'N', ' ':'N'} #[12-11-02] Needs fixing + +ACGT = list('ACGT') +YEAST_BG = {'A': 0.31, 'C': .19, 'G': .19, 'T': .31} #Yeast default background freqs + +revcomplement_memo = {'A':'T'} +revcompTBL = string.maketrans("AGCTagctWSKMYRnN", "TCGAtcgaWSMKTYnN") +def revcomplement(seq): + """A quick reverse-complement routine that memo-izes queries, understands + IUPAC ambiguity codes, and preserves case.""" + global revcomplement_memo + try: + rc = revcomplement_memo[seq] + except KeyError: + #_t = map(lambda x,D=revcomp: D[x], seq) + #get = revcomp.get + #_t = map(get, seq) + _t = list(seq.translate(revcompTBL)) + _t.reverse() + rc = ''.join(_t) + revcomplement_memo[seq] = rc + revcomplement_memo[rc] = seq + return rc + + +def Motif_from_ll(ll): + """Constructs a motif object from a log-likelihood matrix, which is in the + form of a list of dictionaries.""" + m = Motif(None,None) + m.compute_from_ll(ll) + return m + +def Motif_from_counts(countmat,beta=0.01,bg={'A':.25,'C':.25,'G':.25,'T':.25}): + """ + Construct a Motif object from a matrix of counts (or probabilities or frequencies). + A default set of uniform background frequencies may be overridden. 
+ + beta refers to the number of pseudocounts that should be distributed over each position + of the PSSM.""" + m = Motif('',bg) + m.compute_from_counts(countmat,beta) + return m + +def Motif_from_text(text,beta=0.05,source='',bg=None): + """Construct a Motif object from a text string constructed from IUPAC + ambiguity codes. + + A default set of uniform background frequencies may be overridden with + a dictionary of the form {'A':.25,'C':.25,'G':.25,'T':.25}). + + beta refers to the number of pseudocounts that should be distributed over each position + of the PSSM.""" + if not bg: bg={'A':.25,'C':.25,'G':.25,'T':.25} + m = Motif('',bg) + m.compute_from_text(text,beta) + m.source = source + return m + +def copy(motif): + """Utility routine for copying motifs""" + a = copy.deepcopy(motif) + #a.__dict__ = motif.__dict__.copy() + return a + +class Motif: + """A pssm model, with scanning, storing, loading, and other operations. A + uniform nucleotide background is assumed if none is provided.""" + def __init__(self,list_of_seqs_or_text=[],backgroundD=None): + self.MAP = 0 + self.evalue = None + self.oneletter = '' + self.nseqs = 0 + self.counts = [] + self.width = 0 + self.fracs = [] + self.logP = [] + self.ll = [] + self.bits = [] + self.totalbits = 0 + self.maxscore = 0 + self.minscore = 0 + self.pvalue = 1 + self.pvalue_rank = 1 + self.church = None + self.church_rank = 1 + self.Cpvalue = 1 + self.Cpvalue_rank= 1 + self.Cchurch = 1 + self.Cchurch_rank= 1 + self.binomial = None + self.binomial_rank=1 + self.E_seq = None + self.frac = None + self.E_site = None + self.E_chi2 = None + self.kellis = None + self.MNCP = None + self.ROC_auc = None + self.realpvalue = None + self.Cfrac = None + self.CRA = None + self.valid = None + self.seeddist = 0 + self.seednum = -1 + self.seedtxt = None + self.family = None + self.source = None + self.threshold = None + self._bestseqs = None + self.bgscale = 1 + self.best_pvalue = None + self.best_factor = None + self.gamma = None + 
self.nbound = 0 + self.matchids = [] + self.overlap = None + self.cumP = [] + self.numbound = 0 + self.nummotif = 0 + self.numboundmotif = 0 + self.dataset = None + self.bgfile = None + self.cverror = None + self.beta = None + self.match_thresh = None + self.progscore = None + if backgroundD: + self.background = backgroundD + else: + #self.background = {'A': 0.31, 'C': .19, 'G': .19, 'T': .31} #Yeast Default + self.background = {'A':.25,'C':.25,'G':.25,'T':.25} # uniform background + + if type(list_of_seqs_or_text) == type(''): + self.seqs = [] + text = list_of_seqs_or_text + self.compute_from_text(text) + else: + self.seqs = list_of_seqs_or_text + if self.seqs: + self._parse_seqs(list_of_seqs_or_text) + self._compute_ll() + self._compute_oneletter() + #self._compute_threshold(2.0) + + def __repr__(self): + return "%s (%d)"%(self.oneletter, self.nseqs) + + def __str__(self): + return "%s (%d)"%(self.oneletter, self.nseqs) + + def summary(self): + """return a text string one-line summary of motif and its metrics""" + m = self + txt = "%-34s (Bits: %5.2f MAP: %7.2f D: %5.3f %3d) E: %7.3f"%( + m, m.totalbits, m.MAP, m.seeddist, m.seednum, nlog10(m.pvalue)) + if m.binomial!=None: txt = txt + ' Bi: %6.2f'%(nlog10(m.binomial)) + if m.church != None: txt = txt + ' ch: %6.2f'%(nlog10(m.church)) + if m.frac != None: txt = txt + ' f: %5.3f'%(m.frac) + if m.E_site != None: txt = txt + ' Es: %6.2f'%(nlog10(m.E_site)) + if m.E_seq != None: txt = txt + ' Eq: %6.2f'%(nlog10(m.E_seq)) + if m.MNCP != None: txt = txt + ' mn: %6.2f'%(m.MNCP) + if m.ROC_auc!= None: txt = txt + ' Ra: %6.4f'%(m.ROC_auc) + if m.E_chi2 != None: + if m.E_chi2 == 0: m.E_chi2=1e-20 + txt = txt + ' x2: %5.2f'%(nlog10(m.E_chi2)) + if m.CRA != None: txt = txt + ' cR: %6.4f'%(m.CRA) + if m.Cfrac != None: txt = txt + ' Cf: %5.3f'%(m.Cfrac) + if m.realpvalue != None: txt = txt + ' P: %6.4e'%(m.realpvalue) + if m.kellis != None: txt = txt + ' k: %6.2f'%(m.kellis) + if m.numbound : txt = txt + ' b: %3d'%(m.numbound) 
+ if m.nummotif : txt = txt + ' nG: %3d'%(m.nummotif) + if m.numboundmotif : txt = txt + ' bn: %3d'%(m.numboundmotif) + + return txt + + def minimal_raw_seqs(self): + '''return minimal list of seqs that represent consensus ''' + seqs = [[], []] + for letter in self.oneletter: + if one2two.has_key(letter): + seqs[0].append(one2two[letter][0]) + seqs[1].append(one2two[letter][1]) + else: + seqs[0].append(letter) + seqs[1].append(letter) + if ''.join(seqs[0]) == ''.join(seqs[1]): + return [''.join(seqs[0])] + else: + return [''.join(seqs[0]), ''.join(seqs[0])] + def _compute_oneletter(self): + """set the oneletter member variable""" + letters = [] + for i in range(self.width): + downcase = None + if self.bits[i] < 0.25: + letters.append('.') + continue + if self.bits[i] < 1.0: downcase = 'True' + tups = [(self.ll[i][x],x) for x in ACGT if self.ll[i][x] > 0.0] + if not tups: #Kludge if all values are negative (can this really happen?) + tups = [(self.ll[i][x],x) for x in ACGT] + tups.sort() + tups.reverse() + tups = [tups[0]] + downcase = 'True' + tups.sort() #Rank by LL + tups.reverse() + bases = [x[1] for x in tups[0:2]] + bases.sort() + if len(bases) == 2: L = two2one[''.join(bases)] + else: L = bases[0] + if downcase: L = L.lower() + letters.append(L) + self.oneletter = ''.join(letters) + def _parse_seqs(self, LOS): + """build a matrix of counts from a list of sequences""" + self.nseqs = len(LOS) + self.width = len(LOS[0]) + for i in range(self.width): + Dc = {'A': 0, 'C': 0, 'T': 0, 'G': 0, 'N': 0} + for seq in LOS: + key = seq[i] + Dc[key] = Dc[key] + 1 + del(Dc['N']) + self.counts.append(Dc) + + def _compute_ll(self): + """compute the log-likelihood matrix from the count matrix""" + self.fracs = [] + self.logP = [] + self.ll = [] + for i in range(self.width): + + Dll = {'A': 0, 'C': 0, 'T': 0, 'G': 0} + Df = {'A': 0, 'C': 0, 'T': 0, 'G': 0} + DlogP= {'A': 0, 'C': 0, 'T': 0, 'G': 0} + + for nuc in self.counts[i].keys(): + + #print 
i,nuc,self.counts[i][nuc],self.nseqs + # Dll[nuc] = log2( position nucleotide count/background sequence count ) + # Dll[nuc] = log2( (count[nuc]+bgscale*bg[nuc])/(bg[nuc]*(num_seqs+bgscale)) ) + + pos_nuc_count = self.counts[i][nuc] + self.bgscale*self.background.get(nuc,0.) + adj_all_nuc_count = (self.nseqs + self.bgscale) * self.background.get(nuc,1e-10) + + Dll[nuc] = math.log(pos_nuc_count/adj_all_nuc_count,2) + + Pij = self.counts[i][nuc] / float(self.nseqs) + Df [nuc] = Pij + if Pij > 0: + DlogP[nuc] = math.log(Pij) / math.log(2.) + else: + DlogP[nuc] = -100 #Near zero + + self.fracs.append(Df) + self.logP.append (DlogP) + self.ll.append (Dll) + self.P = self.fracs + self._compute_bits() + self._compute_ambig_ll() + self._maxscore() + + + def compute_from_ll(self,ll): + """build motif from an inputed log-likelihood matrix + + (This function reverse-calculates the probability matrix and background frequencies + that were used to construct the log-likelihood matrix) + """ + self.ll = ll + self.width = len(ll) + self._compute_bg_from_ll() + self._compute_logP_from_ll() + self._compute_ambig_ll() + self._compute_bits() + self._compute_oneletter() + self._maxscore() + + def _computeP(self): + """compute the probability matrix (from the internal log-probability matrix)""" + P = [] + for i in range(self.width): + #print i, + _p = {} + for L in ACGT: _p[L] = math.pow(2.0,self.logP[i][L]) + P.append(_p) + #print + self.P = P + + def _compute_bits(self): + """set m.totbits to the number of bits and m.bits to a list of bits at + each position""" + bits = [] + totbits = 0 + bgbits = 0 + bg = self.background + UNCERT = lambda x: x*math.log(x)/math.log(2.0) + for letter in ACGT: + bgbits = bgbits + UNCERT(bg[letter]) + for i in range(self.width): + tot = 0 + for letter in ACGT: + Pij = pow(2.0, self.logP[i][letter]) + tot = tot + UNCERT(Pij) + #bit = Pij * self.ll[i][letter] + #if bit > 0: + # tot = tot + bit + #print tot, bgbits, tot-bgbits + 
bits.append(max(0,tot-bgbits)) + totbits = totbits + max(0,tot-bgbits) + self.bits = bits + self.totalbits = totbits + + + def denoise(self,bitthresh=0.5): + """set low-information positions (below bitthresh) to Ns""" + for i in range(self.width): + tot = 0 + for letter in ACGT: + if self.logP: + Pij = pow(2.0, self.logP[i][letter]) + else: + Pij = pow(2.0, self.ll[i][letter]) * self.background[letter] + if Pij > 0.01: + bit = Pij * self.ll[i][letter] + tot = tot + bit + if tot < bitthresh: #Zero Column + for letter in ACGT: + self.ll[i][letter] = 0.0 + self.compute_from_ll(self.ll) + + def giflogo(self,id,title=None,scale=0.8,info_str=''): + """make a gif sequence logo""" + return giflogo(self,id,title,scale) + + def printlogo(self,norm=2.3, height=10.0): + """print a text-rendering of the Motif Logo + + norm + maximum number of bits to show + height + number of lines of text to use to render logo + """ + self._print_bits(norm,height) + def print_textlogo(self,norm=2.3, height=8.0): + """print a text-rendering of the Motif Logo + + norm + maximum number of bits to show + height + number of lines of text to use to render logo + """ + self._print_bits(norm,height) + def _print_bits(self,norm=2.3, height=8.0): + """print a text-rendering of the Motif Logo + + norm + maximum number of bits to show + height + number of lines of text to use to render logo + """ + bits = [] + tots = [] + str = [] + for i in range(self.width): + D = {} + tot = 0 + for letter in ['A', 'C', 'T', 'G']: + if self.logP: + Pij = pow(2.0, self.logP[i][letter]) + else: + Pij = pow(2.0, self.ll[i][letter]) * self.background[letter] + if Pij > 0.01: + '''Old''' + D[letter] = Pij * self.ll[i][letter] + #'''new''' + #Q = self.background[letter] + #D[letter] = ( Pij * math.log(Pij) - Pij * math.log(Q) ) / math.log(2.0) + '''for both old and new''' + tot = tot + D[letter] + bits.append(D) + tots.append(tot) + for i in range(self.width): + s = [] + _l = bits[i].keys() + _l.sort(lambda x,y,D=bits[i]: 
cmp(D[y],D[x])) + for key in _l: + for j in range(int(bits[i][key] / norm * height)): + s.append(key) + str.append(''.join(s)) + fmt = '%%%ds'%height + print '# %s'%('-'*self.width) + for h in range(int(height)): + sys.stdout.write("# ") + for i in range(self.width): + sys.stdout.write((fmt%str[i])[h]) + if h == 0: + sys.stdout.write(' -- %4.2f bits\n'%norm) + elif h == height-1: + sys.stdout.write(' -- %4.2f bits\n'%(norm/height)) + else: + sys.stdout.write('\n') + print '# %s'%('-'*self.width) + print '# %s'%self.oneletter + + def _compute_ambig_ll(self): + """extend log-likelihood matrix to include ambiguity codes + e.g. What the score of a 'S'? Here we use the max of C and G.""" + for Dll in self.ll: + for L in one2two.keys(): + Dll[L] = max(Dll[one2two[L][0]], Dll[one2two[L][1]] ) + Dll['N'] = 0.0 + Dll['B'] = 0.0 + + def compute_from_nmer(self,nmer,beta=0.001): #For reverse compatibility + """See compute_from_text. Here for reverse compatibility""" + self.compute_from_text(nmer,beta) + + def compute_from_text(self,text,beta=0.001): + """compute a matrix values from a text string of ambiguity codes. + Use Motif_from_text utility instead to build motifs on the fly.""" + prevlett = {'B':'A', 'D':'C', 'V':'T', 'H':'G'} + countmat = [] + text = re.sub('[\.\-]','N',text.upper()) + for i in range(len(text)): + D = {'A': 0, 'C': 0, 'T':0, 'G':0} + letter = text[i] + if letter in ['B', 'D', 'V', 'H']: #B == no "A", etc... + _omit = prevlett[letter] + for L in ACGT: + if L != _omit: D[L] = 0.3333 + elif one2two.has_key(letter): #Covers WSMYRK + for L in list(one2two[letter]): + D[L] = 0.5 + elif letter == 'N': + for L in D.keys(): + D[L] = self.background[L] + elif letter == '@': + for L in D.keys(): + D[L] = self.background[L]-(0.0001) + D['A'] = D['A'] + 0.0004 + else: + D[letter] = 1.0 + countmat.append(D) + self.compute_from_counts(countmat,beta) + + def new_bg(self,bg): + """change the ACGT background frequencies to those in the supplied dictionary. 
+ Recompute log-likelihood, etc. with new background. + """ + counts = [] + for pos in self.logP: + D = {} + for L,lp in pos.items(): + D[L] = math.pow(2.0,lp) + counts.append(D) + self.background = bg + self.compute_from_counts(counts,0) + + def addpseudocounts(self,beta=0): + """add pseudocounts uniformly across the matrix""" + self.compute_from_counts(self.counts,beta) + + def compute_from_counts(self,countmat,beta=0): + """build a motif object from a matrix of letter counts.""" + self.counts = countmat + self.width = len(countmat) + self.bgscale = 0 + + maxcount = 0 + #Determine Biggest column + for col in countmat: + tot = pysum(col.values()) + if tot > maxcount : + maxcount = tot + + #Pad counts of remaining columns + for col in countmat: + tot = pysum(col.values()) + pad = maxcount - tot + for L in col.keys(): + col[L] = col[L] + pad * self.background.get(L,0.) + + self.nseqs = maxcount + nseqs = maxcount + + #Add pseudocounts + if beta > 0: + multfactor = {} + bgprob = self.background + pcounts= {} + for L in bgprob.keys(): + pcounts[L] = beta*bgprob[L]*nseqs + for i in range(self.width): + for L in countmat[i].keys(): + _t = (countmat[i][L] + pcounts[L]) #Add pseudo + _t = _t / (1.0 + beta) #Renomalize + countmat[i][L] = _t + + #Build Motif + self.counts = countmat + self._compute_ll() + self._compute_oneletter() + self._maxscore() + + + def _compute_bg_from_ll(self): + """compute background model from log-likelihood matrix + by noting that: pA + pT + pC + pG = 1 + and bgA + bgT + bgC + bgG = 1 + and bgA = bgT, bgC = bgG + and so bgA = 0.5 - bgC + and pA = lA * bgA, etc for T, C, G + so... 
+ (lA + lT)bgA + (lC + lG)bgC = 1 + (lA + lT)bgA + (lC + lG)(0.5 - bgA) = 1 + (lA + lT - lC - lG)bgA +(lC +lG)*0.5 = 1 + bgA = {1 - 0.5(lC + lG)} / (lA + lT - lC - lG) + + Gain accuracy by taking average of bgA over all positions of PSSM + """ + + pow = math.pow + bgATtot = 0 + nocount = 0 + near0 = lambda x:(-0.01 < x and x < 0.01) + for i in range(self.width): + _D = self.ll[i] + ATtot = pow(2,_D['A']) + pow(2,_D['T']) + GCtot = pow(2,_D['C']) + pow(2,_D['G']) + if near0(_D['A']) and near0(_D['T']) and near0(_D['G']) and near0(_D['C']): + nocount = nocount + 1 + continue + if near0(ATtot-GCtot): #Kludge to deal with indeterminate case + nocount = nocount + 1 + continue + bgAT = (1.0 - 0.5*GCtot)/(ATtot - GCtot) + if (bgAT < 0.1) or (bgAT > 1.1): + nocount = nocount + 1 + continue + bgATtot = bgATtot + bgAT + if nocount == self.width: #Kludge to deal with different indeterminate case + self.background = {'A':0.25, 'C':0.25, 'G':0.25, 'T':0.25} + return + bgAT = bgATtot / (self.width - nocount) + bgGC = 0.5 - bgAT + self.background = {'A':bgAT, 'C':bgGC, 'G':bgGC, 'T':bgAT} + + def _compute_logP_from_ll(self): + """compute self's logP matrix from the self.ll (log-likelihood)""" + log = math.log + logP = [] + for i in range(self.width): + D = {} + for L in ACGT: + ''' if ll = log(p/b) then + 2^ll = p/b + and ll = log(p) - log(b) + so log(p) = ll + log(b)''' + #Pij = pow(2.0, self.ll[i][letter]) * self.background[letter] + D[L] = self.ll[i][L] + log(self.background[L])/log(2.) 
+ logP.append(D) + self.logP = logP + + def _print_ll(self): + """print log-likelihood (scoring) matrix""" + print "# ", + for i in range(self.width): + print " %4d "%i, + print + for L in ['A', 'C', 'T', 'G']: + print "#%s "%L, + for i in range(self.width): + print "%8.3f "%self.ll[i][L], + print + def _print_p(self): + """print probability (frequency) matrix""" + print "# ", + for i in range(self.width): + print " %4d "%i, + print + for L in ['A', 'C', 'T', 'G']: + print "#%s "%L, + for i in range(self.width): + print "%8.3f "%math.pow(2,self.logP[i][L]), + print + def _print_counts(self): + """print count matrix""" + print "# ", + for i in range(self.width): + print " %4d "%i, + print + for L in ['A', 'C', 'T', 'G']: + print "#%s "%L, + for i in range(self.width): + print "%8.3f "%self.counts[i][L], + print + + def _maxscore(self): + """sets self.maxscore and self.minscore""" + total = 0 + lowtot= 0 + for lli in self.ll: + total = total + max(lli.values()) + lowtot= lowtot+ min(lli.values()) + self.maxscore = total + self.minscore = lowtot + + def _compute_threshold(self,z=2.0): + """for Motif objects assembled from a set of sequence, + compute a self.threshold with a z-score based on the distribution + of scores in among the original input sequences. 
+ """ + scoretally = [] + for seq in self.seqs: + matches,endpoints,scores = self.scan(seq,-100) + scoretally.append(scores[0]) + ave,std = avestd(scoretally) + self.threshold = ave - z *std + #print '#%s: threshold %5.2f = %5.2f - %4.1f * %5.2f'%( + # self, self.threshold, ave, z, std) + + def bestscanseq(self,seq): + """return score,sequence of the best match to the motif in the supplied sequence""" + matches,endpoints,scores = self.scan(seq,-100) + t = zip(scores,matches) + t.sort() + bestseq = t[-1][1] + bestscore = t[-1][0] + return bestscore, bestseq + + def bestscore(self,seq): + """return the score of the best match to the motif in the supplied sequence""" + return m.bestscan(seq) + + def bestscan(self,seq): + """return the score of the best match to the motif in the supplied sequence""" + matches,endpoints,scores = self.scan(seq,-100) + if not scores: return -100 + scores.sort() + best = scores[-1] + return best + + def matchstartorient(self,seq, factor=0.7): + """returns list of (start,orientation) coordinate pairs of matches to + the motif in the supplied sequence. Factor is multiplied by m.maxscore + to get a match threshold. + """ + ans = [] + txts,endpoints,scores = self.scan(seq,factor=factor) + for txt, startstop in zip(txts,endpoints): + start, stop = startstop + rctxt = reverse_complement(txt) + orient = (self.bestscore(txt,1) >= self.bestscore(rctxt,1)) + ans.append((start,orient)) + return ans + + def scan(self, seq, threshold = '', factor=0.7): + """ + Scan the sequence. Returns three lists: matching sequences, endpoints, + and scores. The value of 'factor' is multiplied by m.maxscore to get a + match threshold if none is supplied + """ + if len(seq) < self.width: + return self._scan_smaller(seq,threshold) + else: + return self._scan(seq,threshold,factor=factor) + + def scansum(self,seq,threshold = -1000): + """ + Sum of scores over every window in the sequence. 
Returns + total, number of matches above threshold, average score, sum of exp(score) + """ + ll = self.ll + sum = 0 + width = self.width + width_r = range(width) + width_rcr = range(width-1,-1,-1) + width_ranges = zip(width_r,width_rcr) + seqcomp = seq.translate(revcompTBL) + + total = 0 + hits = 0 + etotal= 0 + for offset in range(len(seq)-width+1): + total_f = 0 + total_r = 0 + for i,ir in width_ranges: + pos = offset+i + total_f = total_f + ll[i][ seq[pos]] + total_r = total_r + ll[i][seqcomp[pos]] + total_max = max(total_f,total_r) + if total_max >= threshold: + total = total + total_max + etotal = etotal + math.exp(total_max) + hits = hits + 1 + if not hits: + ave = 0 + else: + ave = float(total)/float(hits) + return total,hits,ave,math.log(etotal) + + def score(self, seq, fwd='Y'): + """returns the score of the first w-bases of the sequence, where w is the motif width.""" + matches, endpoints, scores = self._scan(seq,threshold=-100000,forw_only=fwd) + return scores[0] + + def bestscore(self,seq, fwd=''): + """returns the score of the best matching subsequence in seq.""" + matches, endpoints, scores = self._scan(seq,threshold=-100000,forw_only=fwd) + if scores: return max(scores) + else: return -1000 + + def _scan(self, seq,threshold='',forw_only='',factor=0.7): + """internal tility function for performing sequence scans""" + ll = self.ll #Shortcut for Log-likelihood matrix + if not threshold: threshold = factor * self.maxscore + + #print '%5.3f'%(threshold/self.maxscore) + matches = [] + endpoints = [] + scores = [] + width = self.width + width_r = range(width) + width_rcr = range(width-1,-1,-1) + width_ranges = zip(width_r,width_rcr) + + seqcomp = seq.translate(revcompTBL) + + for offset in range(len(seq)-self.width+1): #Check if +/-1 needed + total_f = 0 + total_r = 0 + for i,ir in width_ranges: + pos = offset+i + total_f = total_f + ll[i ][ seq[pos]] + total_r = total_r + ll[ir][seqcomp[pos]] + + if 0 and total_f > 1: + for i,ir in width_ranges: + print 
seq[offset+i],'%6.3f'%ll[i ][ seq[offset+i] ],' ', + print '= %7.3f'%total_f + + if 0: + print "\t\t%s vs %s: F=%6.2f R=%6.2f %6.2f %4.2f"%(seq[offset:offset+self.width], + self.oneletter,total_f,total_r, + self.maxscore, + max([total_f,total_r])/self.maxscore) + if total_f > threshold and ((total_f > total_r) or forw_only): + endpoints.append( (offset,offset+self.width-1) ) + scores.append(total_f) + matches.append(seq[offset:offset+self.width]) + elif total_r > threshold: + endpoints.append( (offset,offset+self.width-1) ) + scores.append(total_r) + matches.append(seq[offset:offset+self.width]) + return matches,endpoints,scores + def _scan_smaller(self, seq, threshold=''): + """internal utility function for performing sequence scans. The sequence + is smaller than the PSSM. Are there good matches to regions of the PSSM?""" + ll = self.ll #Shortcut for Log-likelihood matrix + matches = [] + endpoints = [] + scores = [] + w = self.width + for offset in range(self.width-len(seq)+1): #Check if +/-1 needed + maximum = 0 + for i in range(len(seq)): + maximum = maximum + max(ll[i+offset].values()) + if not threshold: threshold = 0.8 * maximum + total_f = 0 + total_r = 0 + for i in range(len(seq)): + total_f = total_f + ll[i+offset ][ seq[i] ] + total_r = total_r + ll[w-(i+offset)-1][revcomp[seq[i]]] + if 0: + print "\t\t%s vs %s: F=%6.2f R=%6.2f %6.2f %4.2f"%(seq, self.oneletter[offset:offset+len(seq)], + total_f, total_r, maximum, + max([total_f,total_r])/self.maxscore) + if total_f > threshold and total_f > total_r: + endpoints.append( (offset,offset+self.width-1) ) + scores.append(total_f) + matches.append(seq[offset:offset+self.width]) + elif total_r > threshold: + endpoints.append( (offset,offset+self.width-1) ) + scores.append(total_r) + matches.append(seq[offset:offset+self.width]) + return matches,endpoints,scores + + def mask_seq(self,seq): + """return a copy of input sequence in which any regions matching m are + replaced with strings of N's """ + masked = '' + 
matches, endpoints, scores = self.scan(seq) + cursor = 0 + for start, stop in endpoints: + masked = masked + seq[cursor:start] + 'N'*self.width + cursor = stop+1 + masked = masked + seq[cursor:] + return masked + + def masked_neighborhoods(self,seq,flanksize): + """chop up the input sequence into regions surrounding matches to m. + Replace the subsequences that match the motif with N's.""" + ns = self.seq_neighborhoods(seq,flanksize) + return [self.mask_seq(n) for n in ns] + + def seq_neighborhoods(self,seq,flanksize): + """chop up the input sequence into regions surrounding matches to the + motif.""" + subseqs = [] + matches, endpoints, scores = self.scan(seq) + laststart, laststop = -1, -1 + for start, stop in endpoints: + curstart, curstop = max(0,start-flanksize), min(stop+flanksize,len(seq)) + if curstart > laststop: + if laststop != -1: + subseqs.append(seq[laststart:laststop]) + laststart, laststop = curstart, curstop + else: + laststop = curstop + if endpoints: subseqs.append(seq[laststart:laststop]) + return subseqs + + def __sub__(self,other): + pass + """Overloads the '-' operator to compute the Euclidean distance between + probability matrices motifs of equal width.""" + if type(other) != type(self): + print "computing distance of unlike pssms (types %s, %s)"%( + type(other),type(self)) + print 'First: %s'%other + print 'Self: %s'%self + sys.exit(1) + if other.width != self.width: + print "computing distance of unlike pssms (width %d != %d)"%( + other.width,self.width) + sys.exit(1) + D = 0 + FABS = math.fabs + POW = math.pow + for L in self.logP[0].keys(): + for i in range(self.width): + D = D + POW( POW(2,self.logP[i][L]) - POW(2,other.logP[i][L]), 2 ) + #D = D + FABS( POW(2,self.logP[i][L]) - POW(2,other.logP[i][L])) + #D = D + FABS(self.logP[i][L] - other.logP[i][L]) + return math.sqrt(D) + + def maskdiff(self,other): + """a different kind of motif comparison metric. 
See THEME paper for + details""" + return maskdiff(self,other) + + def maxdiff(self): + """compute maximum possible Euclidean distance to another motif. (For + normalizing?)""" + POW = math.pow + D = 0 + for i in range(self.width): + _min = 100 + _max = -100 + for L in ACGT: + val = POW(2,self.logP[i][L]) + if val > _max: + _max = val + _maxL = L + elif val < _min: + _min = val + _minL = L + for L in ACGT: + if L == _minL: + delta = 1-POW(2,self.logP[i][L]) #1-val + D = D + delta*delta + else: + D = D + POW( POW(2,self.logP[i][L]), 2) #0-val + return math.sqrt(D) + + def revcomp(self): + """return reverse complement of motif""" + return revcompmotif(self) + def trimmed(self,thresh=0.1): + """return motif with low-information flanks removed. 'thresh' is in bits.""" + for start in range(0,self.width-1): + if self.bits[start]>=thresh: break + for stop in range(self.width,1,-1): + if self.bits[stop-1]>=thresh: break + m = self[start,stop] + return m + def bestseqs(self,thresh=None): + """return all k-mers that match motif with a score >= thresh""" + if not thresh: + if self._bestseqs: + return self._bestseqs + if not thresh: thresh = 0.8 * self.maxscore + self._bestseqs = bestseqs(self,thresh) + return self._bestseqs + def emit(self,prob_min=0.0,prob_max=1.0): + """consider motif as a generative model, and have it emit a sequence""" + if not self.cumP: + for logcol in self.logP: + tups = [] + for L in ACGT: + p = math.pow(2,logcol[L]) + tups.append((p,L)) + tups.sort() + cumu = [] + tot = 0 + for p,L in tups: + tot = tot + p + cumu.append((tot,L)) + self.cumP.append(cumu) + s = [] + #u = random()+0.01 #Can make higher for more consistent motifs + for cumu in self.cumP: + u = (prob_max-prob_min)*random() + prob_min + #u = random()+0.01 #Can make higher for more consistent motifs + last = 0 + for p,L in cumu: + if last < u and u <= p: + letter = L + break + else: last = p +# print L,'%8.4f'%u,cumu + s.append(L) + #print ''.join(s) + return ''.join(s) + + + def 
random_kmer(self): + """generate one of the many k-mers that matches the motif. See m.emit() + for a more probabilistic generator""" + if not self._bestseqs: self._bestseqs = self.bestseqs() + seqs = self._bestseqs + pos = int(random() * len(seqs)) + print 'Random: ',self.oneletter,seqs[pos][1] + return seqs[pos][1] + + def __getitem__(self,tup): + pass + """ + m.__getitem__(tup) -- Overload m[a,b] to submotif. Less pythonish than [:], but more reliable + """ + if len(tup) != 2: + print "Motif[i,j] requires two arguments, not ",tup + else: + beg, end = tup[0], tup[1] + return submotif(self,beg,end) + def __getslice__(self,beg,end): + pass + """ + m.__getslice__(,beg,end) -- Overload m[a:b] to submotif. + """ + if beg >= end: + #Probably python converted negative idx. Undo + beg = beg - self.width + return submotif(self,beg,end) + def __add__(self,other): + pass + """ + m.__add__(other) -- Overload '+' for concatenating motifs + """ + return merge(self,other,0) + def __len__(self): + pass + """ + m.__len__() -- Overload len(m) to return width + """ + return self.width + def shuffledP(self): + """ + m.shuffledP() -- Generate motif in which probability matrix has been shuffled. 
+ """ + return shuffledP(self) + def copy(self): + """return a 'deep' copy of the motif""" + a = Motif() + a.__dict__ = self.__dict__.copy() + return a + + def random_diff_avestd(self,iters=5000): + """see modules' random_diff_avestd""" + return random_diff_avestd(self,iters) + def bogus_kmers(self,count=200): + """Generate a faked multiple sequence alignment that will reproduce the + probability matrix.""" + + POW = math.pow + #Build p-value inspired matrix + #Make totals cummulative: + # A: 0.1 C: 0.4 T:0.2 G:0.3 + # -> A:0.0 C:0.1 T:0.5 G:0.7 0.0 + + #Take bg into account: + # We want to pick P' for each letter such that: + # P'/0.25 = P/Q + # so P' = 0.25*P/Q + + m = [] + for i in range(self.width): + _col = [] + tot = 0.0 + for L in ACGT: + _col.append( tot ) + tot = tot + POW(2,self.logP[i][L]) * 0.25 / self.background[L] + _col.append(tot) + #Renormalize + for idx in range(len(_col)): + _col[idx] = _col[idx] / _col[-1] + m.append(_col) + + for p in range(0): #Was 5 + for i in range(len(m)): + print '%6.4f '%m[i][p], + print + + seqs=[] + for seqnum in range(count+1): + f = float(seqnum)/(count+1) + s = [] + for i in range(self.width): + for j in range(4): + if (m[i][j] <= f and f < m[i][j+1]): + s.append(ACGT[j]) + break + seqs.append(''.join(s)) + + del(seqs[0]) + #for i in range(count): + # print ">%3d\n%s"%(i,seqs[i]) + + return seqs + + +def minwindowdiff(M1,M2,overlap=5,diffmethod='diff'): + #Alternate method: maskdiff, infomaskdiff + if type(M1) != type(M2): + print "Error: Attempted to compute alignment of objects that are not both Motifs" + print " types %s: %s and %s: %s"%(M1,type(M1),M2,type(M2)) + sys.exit(1) + + if M1.width <= M2.width: A = M1; Borig = M2 + else: A = M2; Borig = M1 + wA = A.width + wB = Borig.width + O = overlap + + if diffmethod == 'diff': + diff_fcn = diff + elif diffmethod == 'maskdiff': + diff_fcn = maskdiff + elif diffmethod == 'infomaskdiff': + diff_fcn = infomaskdiff + + mindiff = 1000 + #print 'minwindodebug wA ', wA, 'wB 
', wB, 'O ', O, 'wA-0', wA-O, 'wB-O', wB-O + for Astart in range(wA-O+1): + subA = A[Astart:Astart+O] + for B in [Borig, Borig.revcomp()]: + for Bstart in range(wB-O+1): + subB = B[Bstart:Bstart+O] + mindiff = min(mindiff, diff_fcn(subA,subB)) + #print 'minwindodebug ',subA, subB, diff_fcn(subA,subB) + return mindiff + + +def minaligndiff(M1,M2,overlap=5,diffmethod='diff'): + #Alternate method: maskdiff, infomaskdiff + if type(M1) != type(M2): + print "Error: Attempted to compute alignment of objects that are not both Motifs" + print " types %s: %s and %s: %s"%(M1,type(M1),M2,type(M2)) + sys.exit(1) + + if M1.width <= M2.width: + A = M1; Borig = M2 + switch = 0 + else: + A = M2; Borig = M1 + switch = 1 + wA = A.width + wB = Borig.width + O = overlap + + ''' + Here is the figure to imagine: + 012345678901234567890 wA: 6 Bstart: 6-3 = 3 + A (A) wB: 11 Bstop: 6+11-3-1= 13 + ------ %%%%%% O: 3 lastA: 6+11-3-3= 11 + ----------- + |O| B + ''' + + if diffmethod == 'diff': + diff_fcn = diff + elif diffmethod == 'maskdiff': + diff_fcn = maskdiff + elif diffmethod == 'infomaskdiff': + diff_fcn = infomaskdiff + + Bstart = wA-O + Bstop = wA+wB-O-1 + lastA = wA+wB-O-O + Dmin = 1000 + Dmins=[] + #print A + #print '%s%s'%(' '*Bstart,Borig) + for B in [Borig, Borig.revcomp()]: + for start in range(0,lastA+1): + Bpos = [] + Apos = [] + for offset in range(wA): + abs = start+offset + if abs >= Bstart and abs <= Bstop: + Apos.append(offset) + Bpos.append(abs-Bstart) + subA = A[min(Apos),max(Apos)+1] + subB = B[min(Bpos),max(Bpos)+1] + #print '%s%s\n%s%s %f'%( + # ' '*start, subA, + # ' '*start, subB, diff_fcn(subA,subB)) + if switch: _diff = diff_fcn(subB,subA) + else: _diff = diff_fcn(subA,subB) + Dmin = min(Dmin, _diff) + return Dmin + +''' +To compare 2 motifs of the same width, there are these five functions: + +m1 - m2 - Euclidean Distance (sqrt(sum_col(sum_row))) +diff(m1,m2) - psuedo-Euclidean (sum_col(sqrt(norm(sum_row)))/#col +maskdiff(m1,m2) - diff, but excluding positions 
with "N" in m2 +infomaskdiff(m1,m2)- diff, but scaling distance by normalized + information content at each position in m2. +diverge(m1,m2) - Mutual information sum[p log (p/q)] + +**Note that maskdiff, infomaskdiff, and diverge are not symmetric functions + +To compare 2 motifs of different widths, there is the function: + +minaligndiff(M1,M2,overlap=5,diffmethod='diff') + +this does a 'sliding' comparison of two motifs and reports the minimum +distance over all alignments. overlap refers to the minumum overlap +required while sliding. Below, overlap is '2'. The default is '5'. + + ------ + ----------- + +You can optionally specify the distance metric as a text string. +The default is 'diff'. + +''' + + +def diff(self,other): + """psuedo-Euclidean (sum_col(sqrt(norm(sum_row)))/#col""" + if type(other) != type(self): + print "computing distance of unlike pssms (types %s, %s)"%( + type(other),type(self)) + print 'First: %s'%other + print 'Self: %s'%self + sys.exit(1) + if other.width != self.width: + print "computing distance of unlike pssms (width %d != %d)"%( + other.width,self.width) + sys.exit(1) + POW = math.pow + Dtot = 0 + for i in range(self.width): + '''Computes distance''' + D = 0 + for L in ACGT: + D = D + POW( POW(2,self.logP[i][L]) - POW(2,other.logP[i][L]), 2 ) + Dtot = Dtot + math.sqrt(D)/math.sqrt(2.0) + return Dtot/self.width + + +def maskdiff(self,other): + """diff, but excluding positions with 'N' in m2. 
Return pseudo-Euclidean + distance, but only include columns that are not background.""" + if type(other) != type(self): + print "computing distance of unlike pssms (types %s, %s)"%( + type(other),type(self)) + print 'First: %s'%other + print 'Self: %s'%self + sys.exit(1) + if other.width != self.width: + print "computing distance of unlike pssms (width %d != %d)"%( + other.width,self.width) + sys.exit(1) + + Dtot = 0 + POW = math.pow + NEAR0= lambda x:(-0.01 < x and x < 0.01) + divisor = 0 + for i in range(self.width): + nearcount = 0 + + '''Implements mask''' + for L in ACGT: + diff = POW(2,other.logP[i][L]) - other.background[L] + if NEAR0(diff): nearcount = nearcount + 1 + if nearcount == 4: + #print 'Skipping position %d :'%i,other.logP[i] + continue + + '''Computes distance''' + divisor = divisor + 1 + D = 0 + for L in ACGT: + D = D + POW( POW(2,self.logP[i][L]) - POW(2,other.logP[i][L]), 2 ) + Dtot = Dtot + math.sqrt(D)/math.sqrt(2.0) + return Dtot/divisor + +def infomaskdiff(self,other): + """Return pseudo-Euclidean distance, but scale column distance by + information content of "other". Used by THEME""" + if type(other) != type(self): + print "computing distance of unlike pssms (types %s, %s)"%( + type(other),type(self)) + print 'First: %s'%other + print 'Self: %s'%self + sys.exit(1) + if other.width != self.width: + print "computing distance of unlike pssms (width %d != %d)"%( + other.width,self.width) + sys.exit(1) + + maxbits = math.log( 1.0/min(other.background.values()) ) / math.log(2.0) + '''or... 
alternatively''' + #print maxbits, max(other.bits) + #print other.bits + maxbits = max(other.bits) + if maxbits < 0.1: #'''There is nothing important here''' + return 1 + + Dtot = 0 + POW = math.pow + divisor = 0 + '''Computes distance''' + for i in range(self.width): + D = 0 + for L in ACGT: + D = D + POW( POW(2,self.logP[i][L]) - POW(2,other.logP[i][L]), 2 ) + col_dist = math.sqrt(D)/math.sqrt(2.0) + col_scale = other.bits[i]/maxbits + divisor = divisor + col_scale + Dtot = Dtot + col_dist*col_scale + return Dtot/divisor + +def diverge(self,other): + """Yet another distance metric""" + if type(other) != type(self): + print "computing distance of unlike pssms (types %s, %s)"%( + type(other),type(self)) + print 'First: %s'%other + print 'Self: %s'%self + sys.exit(1) + if other.width != self.width: + print "computing distance of unlike pssms (width %d != %d)"%( + other.width,self.width) + sys.exit(1) + + Dtot = 0 + POW = math.pow + LOG2 = lambda x:math.log(x)/math.log(2.0) + NEAR0= lambda x:(-0.01 < x and x < 0.01) + divisor = 0 + for i in range(self.width): + nearcount = 0 + + '''Implements mask''' + for L in ACGT: + diff = POW(2,other.logP[i][L]) - self.background[L] + if NEAR0(diff): nearcount = nearcount + 1 + if nearcount == 4: + #print 'Skipping position %d :'%i,other.logP[i] + continue + + '''Computes distance''' + divisor = divisor + 1 + D = 0 + for L in ACGT: + Pself = POW(2, self.logP[i][L]) + Pother= POW(2,other.logP[i][L]) + D = D + Pself * LOG2(Pself/Pother) + Dtot = Dtot + D + return Dtot/divisor + + + +def bestseqs(motif,thresh, seq='',score=0,depth=0,bestcomplete=None,SEQS=[]): + """This function returns a list of all sequences that a motif could + match match with a sum(log-odds) score greater than thresh.""" + if depth == 0: + SEQS = [] #Must be a python 2.1 bug. 
I shouldn't have to do this + if not bestcomplete: + M = motif + maxs = [] + for i in range(M.width): + bestj = 'A' + for j in ['C', 'G', 'T']: + if M.ll[i][j] > M.ll[i][bestj]: + bestj = j + maxs.append(M.ll[i][bestj]) + bestcomplete = [] + for i in range(M.width): + tot = 0 + for j in range(i,M.width): + tot = tot + maxs[j] + bestcomplete.append(tot) + if depth == motif.width: + if score > thresh: + SEQS.append((score,seq)) + #if len(SEQS) > 2000: + # thresh = 1000.0 # Return Early, You don't really want all these sequences, do you? + return + if depth==-1: + print '# %-10s %6.3f %6.3f %2d'%(seq, score, bestcomplete[depth], depth) + if score + bestcomplete[depth] < thresh: return + #if depth > 0 and len(SEQS) > 2000: + # return + for L in ACGT: + newseq = seq + L + newscore = score + motif.ll[depth][L] + bestseqs(motif,thresh,newseq,newscore,depth+1,bestcomplete,SEQS) + if depth == 0: + SEQS.sort() + SEQS.reverse() + return SEQS + +def seqs2fasta(seqs,fasta_file = ''): + """ + seqs2fasta(seqs,fasta_file = '') -- Dumps a Fasta formatted file of sequences, + keyed by the sequence itself:: + + >ACTTTTTGTCCCA + ACTTTTTGTCCCA + >ACTTTTGGGGCCA + ACTTTTGGGGCCA + ... + + """ + if not fasta_file: + fasta_file = tempfile.mktemp() + FH = open(fasta_file,'w') + for i in range(len(seqs)): + FH.write(">%d\n%s\n"%(i,seqs[i])) + FH.close() + return fasta_file + +def top_nmers(N,seqs,with_counts = 0,purge_Ns = ''): + """Assemble list of all nmers (kmers) with width 'N' from supplied sequences. + Option with_counts returns list of (kmer, count) tuples instead. Purge N's + ignores kmers containing N's. 
""" + Nmers = {} + revcompTBL = string.maketrans("AGCTagctnN", "TCGAtcganN") + for seq in seqs: + for i in range(len(seq)-N+1): + Nmer = seq[i:i+N] + if purge_Ns: + if Nmer.find('N') >= 0: continue + _t = list(Nmer.translate(revcompTBL)) + _t.reverse() + NmerRC = ''.join(_t) # _t used until here to revese comp seq + _t = [Nmer, NmerRC] + _t.sort() + NmerKey = _t[0] # _t used until here to get alphabetically first seq + if Nmers.has_key(NmerKey): + Nmers[NmerKey] = Nmers[NmerKey] + 1 + else: + Nmers[NmerKey] = 1 + sorted = Nmers.keys() + sorted.sort(lambda x,y,D=Nmers:cmp(D[y],D[x]) or cmp(x,y)) + #for i in range(10): + # print "# %2d %s %d"%(i,sorted[i],Nmers[sorted[i]]) + if with_counts: + return zip(sorted,map(lambda x,N=Nmers:N[x], sorted)) + else: + return sorted + +def m_matches(seqs,wmer,m): + """Returns list of all kmers among sequences that have at most + m mismatches to the supplied wmer (kmer).""" + matches = [] + width = len(wmer) + for (nmer, count) in top_nmers(width,seqs,'with counts'): + match = 0 + for i in range(width): + if nmer[i] == wmer[i]: + match = match+1 + if match >= m: + for i in range(count): + matches.append(nmer) + return matches + +def compare_seqs(s1, s2): + pass + """ + compare_seqs(s1, s2) + """ + if len(s1) > len(s2): + long = s1 + short = s2 + else: + long = s2 + short = s1 + (maxcount,max_i) = (0,0) + for i in range(len(long)-len(short)+1): + idcount_f = 0 + idcount_r = 0 + for j in range(len(short)): + if short[j] == long[i+j]: + idcount_f = idcount_f + 1 + if short[-(j+1)] == revcomp[long[i+j]]: + idcount_r = idcount_r + 1 + if (idcount_f > maxcount and idcount_f >= idcount_r): + maxcount = idcount_f + max_i = i + elif (idcount_r > maxcount): + maxcount = idcount_r + max_i = i + #print i,j,idcount_f,idcount_r,maxcount + maxfrac = float(maxcount) / len(short) + print maxfrac,maxcount,len(short) + return maxfrac,short,long[max_i:max_i+len(short)] + +def shuffle_bases(m): + """return a new motif object in which the probabilities 
are randomly + re-assigned to different letters at the same position.""" + C = [] + letts = list('ACGT') + for i in range(m.width): + D = {} + vals = m.counts[i].values() + shuffle(vals) + for i in range(4): + D[letts[i]] = vals[i] + C.append(D) + n = Motif() + #n.__dict__ = m.__dict__.copy() #May copy too much information (cached diff information, etc...) + n.compute_from_counts(C) + return n + +def random_diff_avestd(motif,iters=5000): + """Return the average & stddev distance ('diff') between a + motif and "iters" random motifs of the same width.""" + w = motif.width + vals = [] + for i in range(iters): + vals.append(motif - Random_motif(w)) + return avestd(vals) + +def random_motif(w): + """Generate a random motif of width w. Each position will have a dominant + letter with probability around 0.91.""" + C = [] + for i in range(w): + D = {} + tot = 0 + p = int(random.random() * 4) + Lup = ACGT[p] + for L in ACGT: + D[L] = 0.1 + tot = tot + 0.001 + D[Lup] = D[Lup] + 1 + for L in ACGT: + D[L] = D[L]/tot + C.append(D) + m = Motif() + m.compute_from_counts(C) + return m + +def toDict(M): + pass + ''' + toDict(M) -- Convert a 2D array to a list of dictionaries (which is how the motif object + stores information internally). Assumes M entries are in alphabetical order (ACGT) + ''' + if type(M[0]) == type(0.0): + return toDictVect(M) + else: + a = [] + for i in range(len(M)): + a.append(toDictVect(M[i])) + return a + +def toDictVect(V): + pass + """ + toDictVect(V) -- Convert a 1D vector to a dictionary of DNA letters. Assumes values + in V are in alphabetical order (ACGT). + """ + D = {} + for L,i in (('A',0), ('C',1), ('G',2), ('T',3)): + D[L]=V[i] + return D + +def submotif(self,beg,end): + """**Deprecated** Use slice functionality (m[2:4]) instead. 
+ + Utility function + for extracting sub-motifs and padding motifs.""" + bg = self.background.copy() + P = [] + + #Determine if any 'zeros' should be added at begining + #because the user has specified a negative beg index + for i in range(beg,0): + P.append(bg.copy()) + + #Copy relevant content of motif + start = max(beg,0) + stop = min(end,self.width) + for i in range(start,stop): + D = {} + for L in ACGT: + D[L] = math.pow(2.,self.logP[i][L]) + P.append(D) + + #Determine if any 'zeros' should be added at the end + #because the user has specified a width too large + for i in range(self.width,end): + P.append(bg.copy()) + + #print "BEG, END", beg,end + #for i in range(beg,end): + # print i,P[i] + + #Build the Motif + M = copy.deepcopy(self) + #M = Motif(None,bg.copy()) + M.compute_from_counts(P) + M.source = self.source + return M + +def shuffledP(self): + """Construct a motif in which the letter distributions are preserved but + are reassigned to rondom positions in the motif.""" + bg = self.background.copy() + P = [] + + #Copy relevant content of motif + for i in range(0,self.width): + D = {} + _s = ACGT[:] + shuffle(_s) + for L,_L in zip(ACGT,_s): + D[L] = math.pow(2.,self.logP[i][_L]) + P.append(D) + + #Build the Motif + M = copy.deepcopy(self) + #M = Motif(None,bg.copy()) + M.compute_from_counts(P) + M.source = self.source + return M + +def revcompmotif(self): + """Construct the reverse complement of the motif. Use m.revcomp() member + function instead.""" + bg = self.background.copy() + P = [] + + for i in range(self.width): + D = {} + for L in ACGT: + D[L] = math.pow(2.,self.logP[self.width-i-1][revcomp[L]]) + P.append(D) + + #Build the Motif + M = copy.deepcopy(self) + M.compute_from_counts(P) + return M + + +def sum(motifs,weights=[]): + """Perhaps better called 'average'. Constructs a motif by averaging the + probabilities at each position of the (pre-aligned) input motifs. Optional + weights can be assigned, and must be in the same order as the motifs. 
+ """ + if not weights: + weights = [1.0] * len(motifs) + tot = 0.0 + for w in weights: tot=tot+float(w) + weights = [(w/tot) for w in weights] + C = [] + for c in motifs[0].fracs: + D = {} + for L in ACGT: D[L] = 0.0 + C.append(D) + for m,w in zip(motifs,weights): + for i in range(m.width): + for L in ACGT: + C[i][L] = C[i][L] + m.fracs[i][L]*w + motif = Motif_from_counts(C,0.0,bg=motifs[0].background) + return motif.trimmed() + + +def giflogo(motif,id,title=None,scale=0.8): + """Interface to the 'weblogo/seqlogo' perl + scripts that generate colorful sequence logos + """ + return seqlogo(motif,id,title,scale,format='GIF') + + +seqlogo_formats = ('GIF','PDF','EPS','PNG') +illegal_fn_chars = '&;/ ()' +fn_trans = string.maketrans(illegal_fn_chars,'_'*len(illegal_fn_chars)) +def seqlogo(motif,motif_id,title=None,scale=0.8,img_format='GIF') : + """Interface to the'weblogo/seqlogo' perl scripts that generate colorful + sequence logos. Available formats are %s. Replaces illegal filename + characters in *id* parameter (i.e. '%s') with underscores when writing + to file. The executable *seqlogo* must be on your path. + """%(seqlogo_formats,illegal_fn_chars) + #SEQLOGO = TAMOpaths.weblogodir + 'seqlogo' + #TAMOpaths.CHECK(SEQLOGO,'','Weblogo/Seqlogo') + kmers = motif.bogus_kmers(100) + width = float(len(kmers[0]) ) + height = float(4) + m = motif + width, height = width*scale, height*scale + tmp = tempfile.mktemp() + '.fsa' + if title is None: + title = motif_id + + if img_format.upper() not in seqlogo_formats : + raise MotifToolsException('seqlogo requires one of %s'%seqlogo_formats) + + seqs2fasta(kmers,tmp) + fn = id.translate(fn_trans) + cmd = 'seqlogo -F %s -acpY -w%d -h%d -k 1 -M -f %s -o %s -t "%s" '%( + img_format.upper(), width, height, tmp, fn, title) + + call(cmd,shell=True) + return "%s.%s"%(fn,img_format.lower()) + + +def merge(A,B,overlap=0): + """**Deprecated** Use the '+' operator instead. 
+ + Used for concatenating motifs into a new motif, allowing for the averaging + of overlapping bases between them. + """ + if (overlap < 0 or overlap > A.width or overlap >B.width): + print 'Cannot overlap %s with %s by %d bases'%(A.oneletter,B.oneletter,overlap) + return None + + #Build Probability matrix. Width will be A.width + B.width - overlap + w = A.width + B.width - overlap + + P = [] + #Make a copy of A's probabilities into P + for i in range(A.width): + D = {} + logP = A.logP[i] + for L in logP.keys(): + D[L] = math.pow(2,logP[L]) + P.append(D) + #Add B's first 'overlap' probabilities to last 'overlap' probabilities of P + for i in range(overlap): + logP = B.logP[i] + Pidx = len(P)-overlap+i + _tot = 0 + for L in logP.keys(): + P[Pidx][L] = (P[Pidx][L] + math.pow(2,logP[L])) / 2.0 + P[Pidx][L] = max(P[Pidx][L],math.pow(2,logP[L])) + _tot = _tot + P[Pidx][L] + for L in logP.keys(): + P[Pidx][L] = P[Pidx][L] / _tot + #Append B's remaining probabilites to P + for i in range(overlap,B.width): + D = {} + logP = B.logP[i] + for L in logP.keys(): + D[L] = math.pow(2,logP[L]) + P.append(D) + + #Build a motif + M = Motif(None,A.background.copy()) + M.source = A.source,B.source + M.compute_from_counts(P) + return M + +def avestd(vals): + """return an (average, stddev) tuple computed from the supplied list of values""" + (sum, sum2) = (0.,0.) 
+ N = float(len(vals)) + for val in vals: + sum = sum + float(val) + sum2 = sum2 + float(val)*float(val) + if N == 1: + ave = sum + std = 0 + else: + ave = sum / N + std = math.sqrt( (sum2-(N*ave*ave)) / (N-1.0) ) + return ave,std + + +def load(filename): + """load a 'TAMO'-formatted motif file""" + FID = open(filename,'r') + lines = FID.readlines() + FID.close() + motifs = [] + seedD = {} + seedfile = '' + for i in range(len(lines)): + if lines[i][0:10] == 'Log-odds matrix'[0:10]: + w = len(lines[i+1].split())-1 + ll = [] + for pos in range(w): + ll.append({}) + for j in range(0,4): + toks = lines[i+j+2].split() + L = toks[0][1] + for pos in range(w): + ll[pos][L] = float(toks[pos+1]) + m = Motif_from_ll(ll) + motifs.append(m) + if lines[i][0:6] == 'Motif '[0:6]: + toks = lines[i].split() + motifs[-1].nseqs = float(re.sub('[\(\)]','',toks[3])) + motifs[-1].totalbits= float(toks[5]) + motifs[-1].MAP = float(toks[7]) + motifs[-1].seeddist = float(toks[9]) + motifs[-1].seednum = int(toks[10][0:-1]) + motifs[-1].pvalue = math.pow(10,-float(toks[12])) + + if 'ch:' in toks: + _idx = toks.index('ch:') + motifs[-1].church = math.pow(10,-float(toks[_idx+1])) + if 'Es:' in toks: + _idx = toks.index('Es:') + motifs[-1].E_site = math.pow(10,-float(toks[_idx+1])) + if 'x2:' in toks: + _idx = toks.index('x2:') + motifs[-1].E_chi2 = math.pow(10,-float(toks[_idx+1])) + if 'Eq:' in toks: + _idx = toks.index('Eq:') + motifs[-1].E_seq = math.pow(10,-float(toks[_idx+1])) + if 'mn:' in toks: + _idx = toks.index('mn:') + motifs[-1].MNCP = float(toks[_idx+1]) + if 'f:' in toks: + _idx = toks.index('f:') + motifs[-1].frac = float(toks[_idx+1]) + if 'Ra:' in toks: + _idx = toks.index('Ra:') + motifs[-1].ROC_auc = float(toks[_idx+1]) + if 'cR:' in toks: + _idx = toks.index('cR:') + motifs[-1].CRA = float(toks[_idx+1]) + if 'Cf:' in toks: + _idx = toks.index('Cf:') + motifs[-1].Cfrac = float(toks[_idx+1]) + if 'k:' in toks: + _idx = toks.index('k:') + motifs[-1].kellis = float(toks[_idx+1]) 
+ + if 'b:' in toks: + _idx = toks.index('b:') + motifs[-1].numbound = int(toks[_idx+1]) + if 'nG:' in toks: + _idx = toks.index('nG:') + motifs[-1].nummotif = int(toks[_idx+1]) + if 'bn:' in toks: + _idx = toks.index('bn:') + motifs[-1].numboundmotif = int(toks[_idx+1]) + + + + if lines[i][0:10] == 'Threshold: '[0:10]: + toks = lines[i].split() + motifs[-1].threshold= float(toks[1]) + if lines[i][0:5] == 'Seed '[0:5]: + toks = lines[i].split() + id = int(toks[1][0:-1]) #'10:' -> '10' + seedD[id] = toks[2] + if lines[i][0:7] == 'Source: '[0:7]: + motifs[-1].source = lines[i][7:].strip() + if lines[i][0:6] == 'Gamma: '[0:6]: + motifs[-1].gamma = float(lines[i][6:]) + if lines[i][0:6] == 'Evalue: '[0:6]: + motifs[-1].evalue = float(lines[i][7:].strip()) + if lines[i][0:22]=='Program specific score: '[0:22]: + tempprogscore=lines[i][23:].split(":"); + + for i in range(len(tempprogscore)): + tempprogscore[i]=tempprogscore[i].strip() + + if len(tempprogscore)>1: + try: + tempprogscore[1]=float(tempprogscore[1]) + except ValueError: + tempprogscore[1]=tempprogscore[1] + motifs[-1].progscore=tempprogscore + + if lines[i][0:10] == 'fasta file:'[0:10]: + parts=lines[i].strip().split() + motifs[-1].dataset, motifs[-1].beta, motifs[-1].bgfile = \ + parts[2],float(parts[4]), parts[7] + + if lines[i][0:21]=='classification error: '[0:21]: + motifs[-1].cverror=float(lines[i][22:].strip()) + if lines[i][0:20]=='SVM match threshold: '[0:20]: + motifs[-1].match_thresh=float(lines[i][21:].strip()) + if lines[i].find('Using')>=0 and lines[i].find('as seeds')>=0: + '''#Using all (132) motifs in SLT_081503.seeds as seeds:''' + seedfile = lines[i].split()[-3] + for i in range(len(motifs)): + if seedfile: motifs[i].seedfile = seedfile + seednum = motifs[i].seednum + if seedD.has_key(seednum): + motifs[i].seedtxt = seedD[seednum] + return motifs + +def save_motifs(motifs,filename,kmer_count=20): + """Save list of motifs as a 'TAMO'-formatted motif file to the specificied file. 
+ optional kmer_count specificies how many sequences to include in the printed + multiple sequence alignment that recapitulates the probability matrix.""" + try : + print_motifs(motifs,kmer_count,f=filename) + except: + print '!-- Error saving motifs to %s'%filename + raise + +def print_motif(motif,kmer_count=20,istart=0,f=None): + """Print a motif in the 'TAMO'-format. istart specificies the motif number, and + optional kmer_count specificies how many sequences to include in the printed + multiple sequence alignment that recapitulates the probability matrix. """ + print_motifs([motif],kmer_count,istart) + sys.stdout.flush() + +def print_motifs(motifs,kmer_count=20,istart=0,f=None): + """Print list of motifs as a 'TAMO'-formatted motif file to the specificied file. + Optional kmer_count specificies how many sequences to include in the printed + multiple sequence alignment that recapitulates the probability matrix. + istart specifies number from which to begin motif ids.""" + + # handle f input cases + if f is None : + f = sys.stdout + elif isinstance(f,str) : + f = open(f,'w') + + i = istart-1 + for m in motifs: + i = i + 1 + print >>f, "Log-odds matrix for Motif %3d %s"%(i,m) + m._print >>f, _ll() + #print >>f, "Probability matrix for Motif %3d %s"%(i,m) + #m._print >>f, _p() + print >>f, "Sequence Logo" + m._print >>f, _bits() + for newprop in ('gamma', 'church', 'E_site', 'E_seq', 'E_chi2', 'realpvalue', + 'kellis', 'MNCP', 'ROC_auc', 'CRA', 'Cfrac', 'frac', 'binomial'): + if not m.__dict__.has_key(newprop): #Kludge to deal w/ old shelves + m.__dict__[newprop] = None + if m.seedtxt: print >>f, "Seed: %3d %s"%(i,m.seedtxt) + if m.gamma: print >>f, "Gamma: %7.5f"%m.gamma + if m.evalue != None: print >>f, 'Evalue: %6.3e'%m.evalue + if m.progscore is not None : + printableProgscore=(m.progscore[0],str(m.progscore[1])) + print >>f, 'Program specific score: '+ ": ".join(printableProgscore) + + if m.family: print >>f, "Family: ",m.family + if m.source: print >>f, 
"Source: ",m.source + if m.dataset: print >>f, "fasta file: %s beta: %f background sequences: %s"%(m.dataset,m.beta,m.bgfile) + if m.match_thresh: print >>f, "SVM match threshold: ",m.match_thresh + if m.cverror: print >>f, "classification error: ",m.cverror + #Motif 0 NGAGGGGGNN (0) (Bits: 8.24 MAP: 6.53 D: 0.21 0) Enr: 54.000 + print >>f, "Motif %3d %-25s (Bits: %5.2f MAP: %5.2f D: %5.3f %2d) E: %6.3f"%( + i, m, m.totalbits, m.MAP, m.seeddist, m.seednum, nlog10(m.pvalue)), + if m.binomial!=None: print >>f, ' Bi: %5.2f'%nlog10(m.binomial), + if m.church != None: print >>f, ' ch: %5.2f'%nlog10(m.church), + if m.frac != None: print >>f, ' f: %5.2f'%(m.frac), + if m.E_site != None: print >>f, ' Es: %5.2f'%nlog10(m.E_site), + if m.E_seq != None: print >>f, ' Eq: %5.2f'%(nlog10(m.E_seq)), + if m.MNCP != None: print >>f, ' mn: %5.2f'%(m.MNCP), + if m.ROC_auc!= None: print >>f, ' Ra: %6.4f'%(m.ROC_auc), + if m.E_chi2 != None: + if m.E_chi2 == 0: m.E_chi2=1e-20 + print >>f, ' x2: %5.2f'%(nlog10(m.E_chi2)), + if m.CRA != None: print >>f, ' cR: %6.4f'%(m.CRA), + if m.Cfrac != None: print >>f, ' Cf: %6.4f'%(m.Cfrac), + if m.realpvalue != None: print >>f, ' P: %6.4e'%(m.realpvalue) + if m.kellis != None: print >>f, ' k: %5.2f'%(m.kellis), + try: + if m.numbound : print >>f, ' b: %3d'%(m.numbound), + if m.nummotif : print >>f, ' nG: %3d'%(m.nummotif), + if m.numboundmotif : print >>f, ' bn: %3d'%(m.numboundmotif), + except: pass + print >>f, '' + + _max = m.maxscore + m.maxscore = -100 + if kmer_count >= 0: + seqs = m.bogus_kmers(kmer_count) + else: + seqs = m.seqs + + for seq in seqs: + print >>f, seq,i,m.scan(seq)[2][0] + + m.maxscore = _max + print >>f, '*'*m.width + print >>f, "MAP Score: %f"%(m.MAP) + +def nlog10(x,min=1e-323): + """returns -log10(x) with a maximum default value of 323.""" + if x < min: x=min + try: + return math.fabs(math.log(x)/math.log(10)) + except: + return 0 + +def txt2motifs(txt,VERBOSE=1): + """Convert a text string into a list of motifs: + 
Examples: + + 'TGASTCA,GAATC' --> 2 motifs from ambiguity codes + 'results.tamo' --> All motifs in TAMO-format file + 'results.tamo:34,45' --> Motifs 34 and 45 in TAMO-format file + 'results.pickle' --> All motifs in pickle (list or dict of Motifs) + 'results.pickle%GAL4 --> 'GAL4' entry in results.pickle dictionary + 'results.pickle:34,45 -> Motifs 34 and 45 in results.pickle list + """ + motifs = [] + exists = os.path.exists + toks = txt.split(':') + if exists(toks[0]): #It's a file!! + fname = toks[0] + if fname.find('.pickle') > 0: #It's a pickle!! + return pickletxt2motifs(toks) + else: #It's a "Motif" file!! + if VERBOSE: + print "# Loading motif from %s"%fname + allmotifs = load(fname) + if len(toks) == 1: motifs = allmotifs + else: + idxs = [int(x) for x in toks[1].split(',')] + motifs = [allmotifs[x] for x in idxs] + else: #It's a text string!! + fname = 'TXT' + for t in txt.split(','): + motifs.append(Motif_from_text(t)) + for i in range(len(motifs)): motifs[i].index = i + for i in range(len(motifs)): motifs[i].file = fname + return motifs + +def pickletxt2motifs(toks): + """[Utility function] See txt2motifs documentation.""" + fname = toks[0] + print "# Loading motif pickle from %s"%fname + F = open(fname,'r') + DA = pickle.load(F) + F.close() + ans = [] + if type(DA) == type({}): + if len(toks) > 1: + keys = [x.replace('%',' ') for x in toks[1].split(',')] + for k in keys: ans.append(DA[k]) + else: + for k in DA.keys(): DA[k].key = k + ans = DA.values() + else: #Assuming DA is a list + if len(toks) > 1: + idxs = [int(x) for x in toks[1].split(',')] + ans = [DA[x] for x in idxs] + else: + ans = DA + return ans + + +def sortby(motiflist, property, REV=0): + """Sort a motif list according to a particular property""" + mtype = type(Motif()) + for m in motiflist: + if type(m) != mtype: + print "Not a Motif Object: ",m + return + try: + motiflist.sort(lambda x,y,p=property: cmp(x.__dict__[p],y.__dict__[p])) + if REV: motiflist.reverse() + except: + print 
'Could not sort list. Probably, the specificied property "%s" is not posessed by all motifs'%property + +
'''Functions and classes used to interface with .nib files as created by Jim
Kent's nibFrag and faToNib utilities.'''

import glob
import math
import os
import struct
import sys
import warnings
from cStringIO import StringIO
from collections import defaultdict as dd

from chipsequtil import reverse_complement, get_file_parts, BEDFile


# module fields
# Masking modes for sequence extraction (see get_nib_seq_batch for semantics)
NOMASK,MASK,HARDMASK = range(3)


class NibException(Exception) : pass


def _nib_fd(nib) :
    '''Returns filename and file descriptor for nib, detecting whether it is a \
    path or fd appropriately.

    *nib* may be an open file object (rewound to offset 0 as a side effect) or
    a filename string (opened in binary mode).  Raises NibException for any
    other type.'''

    # check to see if nib is a file or a string
    if isinstance(nib,file) :
        nib_fn = nib.name
        nib.seek(0)  # rewind so downstream readers always start at the signature
        nib_f = nib
    elif isinstance(nib,str) :
        nib_fn = nib
        nib_f = open(nib,'rb')
    else :
        raise NibException('Incompatible .nib argument %s with type %s, needs to '
                           'be either <type \'file\'> or <type \'str\'>'%(str(nib),type(nib)))

    return nib_fn, nib_f


def get_nib(nib,start=0,end=-1,strand='+',mask=NOMASK,name=None,dbHeader=None,tbaHeader=None) :
    '''Return a (header,sequence) tuple representing this nibFrag record'''
    headers = get_nib_header_batch(nib,[(start,end,strand,name,dbHeader,tbaHeader),])
    seqs = get_nib_seq_batch(nib,[(start,end,strand)],mask)
    return headers[0], seqs[0]


def get_nib_batch(nib,queries,mask=NOMASK) :
    '''Batch interface for fetching fasta records.  Returns tuple of lists
    (headers,sequences), where headers[i] and sequences[i] both correspond to
    queries[i].'''
    headers = get_nib_header_batch(nib,queries)
    seqs = get_nib_seq_batch(nib,[x[:3] for x in queries],mask=mask)
    return headers, seqs


def get_nib_seq(nib,start=0,end=-1,strand='+',mask=NOMASK) :
    '''Extract subsequence from .nib file like Jim Kent's nibFrag utility.
    Default behavior is to return the entire sequence.

    Extract the nucleotide substring defined by the closed interval [start,end]
    from the sequence found in *nib*. *mask* parameter has the following
    possible values:

      chipsequtil.nib.NOMASK   -- masked positions are not indicated (default)
      chipsequtil.nib.MASK     -- normal bases are capitalized, masked positions lower case
      chipsequtil.nib.HARDMASK -- masked positions are replaced with Ns
    '''
    return get_nib_seq_batch(nib,[(start,end,strand)],mask)[0]


def get_nib_header(nib_fn,start=0,end=-1,strand='+',name=None,dbHeader=None,tbaHeader=None) :
    '''Method for constructing fasta headers compliant with nibFrag utility'''
    # bug fix: previously delegated with the undefined name 'nib' instead of
    # the nib_fn parameter, raising NameError on every call
    headers = get_nib_header_batch(nib_fn,[(start,end,strand,name,dbHeader,tbaHeader),])
    return headers[0]


def get_nib_header_batch(nib,queries) :
    '''Batch method for creating nibFrag headers. *queries* is a list of at most
    6-tuples (start,end,strand,name,dbHeader,tbaHeader) representing queries as
    specified by the original nibFrag utility.  Only start, end, and strand
    fields are required.  Headers are returned in the same order as *queries*.'''

    nib_path, nib_f = _nib_fd(nib)

    nib_dir,nib_fn,nib_base,nib_ext = get_file_parts(nib_path)
    nbases = validate_nib_file(nib)
    headers = []
    header_tmpl = '>%(name)s%(db)s\n'

    for rec in queries :

        # set some defaults if they are not supplied (pad tuple out to 6 fields)
        rec = list(rec)
        rec.extend([None]*(6-len(rec)))
        start, end, strand, name, dbHeader, tbaHeader = rec

        # end == -1 means the caller wants the entire sequence
        if end == -1 :
            end = nbases
        fields = {}
        fields['name'] = nib_path+':%d-%d'%(start,end) if not name else name
        fields['db'] = ''

        if tbaHeader :
            # ignored for some reason in nibFrag when tbaHeader supplied and dbHeader is not
            fields['name'] = '' if not dbHeader else fields['name']
            fields['db'] = '%s.%s:%d-%d of %d'%(tbaHeader,nib_base,start,end,nbases)
        if dbHeader :
            fields['db'] = ':%s.%s:%d-%d:%s:%d'%(dbHeader,nib_base,start,end,strand,nbases)

        headers.append(header_tmpl%fields)

    return headers


def validate_nib_file(nib) :
    '''Validate .nib file header, returning number of bases indicated if successful.
    *nib* argument is either a filename or an open file object.
    '''

    nib_fn, nib_f = _nib_fd(nib)

    # first 4 bytes are a nib file signature
    #TODO - consider attempting to figure out byte order to make truly cross platform
    def_sig = 0x6BE93D3A
    sig = struct.unpack('=l',nib_f.read(4))[0]
    if def_sig != sig :
        raise NibException('Invalid nib file signature in %s, found %s, expected '
                           '%s, perhaps .nib file as not created on this platform?\n\nnibFrag style '
                           'error: %s is not not a good .nib file.'%(nib_fn,hex(sig),hex(def_sig),nib_fn))

    # second 4 bytes are number of bases in sequence
    nbases = struct.unpack('=l',nib_f.read(4))[0]

    return nbases


def get_nib_seq_batch(nib,queries,mask=NOMASK) :
    '''Extract subsequence from .nib file like Jim Kent's nibFrag utility.

    Extract the nucleotide substrings defined by the closed intervals in *queries*
    from the sequence found in *nib*. *nib* argument is either a filename or
    an open file object. Entries in *queries* are 3-tuples defining (start,end,strand)
    sequence coordinates. Sequences are returned in a list in the same order
    as *queries* (the file itself is read in coordinate-sorted order for
    efficiency; *queries* is not modified). *mask* parameter has the following
    possible values:

      chipsequtil.nib.NOMASK   -- masked positions are not indicated (default)
      chipsequtil.nib.MASK     -- normal bases are capitalized, masked positions lower case
      chipsequtil.nib.HARDMASK -- masked positions are replaced with Ns
    '''

    nib_fn, nib_f = _nib_fd(nib)

    nbases = validate_nib_file(nib_f)

    # rest of file is sequence, with each nibble (4 bits) being a base as
    # follows (from http://genome.ucsc.edu/FAQ/FAQformat.html#format8) :
    #
    # 0 - T
    # 1 - C
    # 2 - A
    # 3 - G
    # 4 - N
    #
    # The most significant bit in a nibble is set if the base is masked
    trans_nuc = 'tcagn'

    # translate one nibble into a nucleotide character, honoring *mask* mode
    def trans_nib(nib) :
        nuc = trans_nuc[nib&7]
        mask_bit = nib & 8  # 8 when masked, 0 when not
        if mask in [MASK,HARDMASK] and mask_bit == 0 :
            return nuc.upper()
        # bug fix: mask_bit is 0 or 8, never 1 -- the old test (mask_bit == 1)
        # meant HARDMASK never actually replaced masked bases with N
        if mask == HARDMASK and mask_bit :
            return 'N'
        return nuc

    # bug fix: the old implementation sorted *queries* in place and appended
    # results in sorted order, so sequences came back out of input order
    # (desynchronized from get_nib_header_batch in get_nib_batch) and the
    # caller's list was mutated.  Sort (index,query) pairs instead so the file
    # is still walked front-to-back, and place each result at its original index.
    seqs = [None]*len(queries)

    for q_idx, (start, end, strand) in sorted(enumerate(queries), key=lambda p: p[1]) :

        if start < 0 :
            raise NibException('Received negative start coordinate, this may '
                               'indicate a region on mitochondrial DNA that '
                               'spans reference sequence start and end. This '
                               'utility cannot handle these cases, aborting. '
                               'Requested interval: %s (%d,%d)'%(nib_fn,start,end))

        start, end = map(int,(start,end))

        # end == -1 means caller wants entire sequence
        if end == -1 :
            end = nbases

        if any([nbases < c for c in [start,end]]) :
            raise NibException(('Requested slice (%(start)d,%(end)d) not compatible '
                'with sequence of length %(nbases)d in %(nib_fn)s, aborting\n\nnibFrag '
                'style error: nib read past end of file (%(start)d %(end)d) in file: '
                '%(nib_fn)s')%{'start':start,'end':end,'nbases':nbases,'nib_fn':nib_fn})

        # figure out how many bytes to read through (two bases per byte)
        start_byte,rem_byte = start//2,start%2

        # calculate where we need to move to in the file from the current location
        # + 8 is from the 2*4 bytes header info in the .nib format
        byte_offset = start_byte-nib_f.tell() + 8
        nib_f.seek(byte_offset,1) # seek forward to the beginning byte from current location
        seq_bytes,seq_rem_byte = int(math.ceil((end-start+rem_byte)/2.)),(end+1)%2
        seq_bytes = nib_f.read(seq_bytes+seq_rem_byte)

        # start translating the bytes
        seq = StringIO() # we use StringIO because it is more efficient than concatenating strings
        for c in seq_bytes :
            c_byte = struct.unpack('=b',c)[0]

            # higher nibble
            c_nib = (c_byte & (15<<4))>>4
            nuc = trans_nib(c_nib)
            seq.write(nuc)

            # lower nibble
            c_nib = int(c_byte) & 15
            nuc = trans_nib(c_nib)
            seq.write(nuc)

        # final nucleotide sequence
        seq_str = seq.getvalue()

        # if we're reading to the end, don't clip anything
        if end != nbases :
            # if the coordinate requested was not on a byte boundary, adjust
            if rem_byte == 1 :
                seq_str = seq_str[1:]
            if seq_rem_byte == 1 :
                seq_str = seq_str[:-1]

            # nibFrag apparently uses zero-based indexing, clip off one base
            seq_str = seq_str[:-1]
        seq.close()

        # adjust strand
        if strand == '-' :
            seq_str = reverse_complement(seq_str)
        seqs[q_idx] = seq_str

    return seqs


class SeqDBException(Exception): pass
class NibDBException(Exception): pass
class SeqDB(object) :
    '''Base class for different kinds of sequence databases. Does nothing,
    implement subclasses. Constructor provides _db_map and db_info class members.'''
    def __init__(self) :
        # _db_map maps sequence name -> backing resource (e.g. open file)
        self._db_map = {}
        # db_info maps sequence name -> dict of metadata about that sequence
        self.db_info = dd(dict)

    def get_seq(self,*args, **kwargs) :
        raise SeqDBException('Base class SeqDB has no get_seq implementation')


class NibDB(SeqDB) :
    '''Class providing an interface to a set of .nib files as created by faToNib
    in Jim Kent's software suite.

    Sequences are identified by the basename of the .nib file without the .nib
    extension, e.g. chr1.nib is identified as chr1.

    Some potentially useful information about the entries in the database is
    stored in the *db_info* dictionary.
    '''

    def __init__(self,nib_fns=[],nib_dirs=[]) :
        '''*nib_fns* is a list of paths to specific .nib files desired for the
        NibDB. *nib_dirs* is a list of paths to directories containing .nib
        files such that every .nib file in the directories is added to the NibDB.
        Explicitly passed files take precedence over those found in directories
        when sequence names collide.
        '''
        SeqDB.__init__(self)

        # find all *.nib files in the directories passed
        if isinstance(nib_dirs,str) : # user just provided single directory
            nib_dirs = [nib_dirs]

        dir_nibs = []
        for d in nib_dirs :
            dir_nibs.extend(glob.glob(os.path.join(d,'*.nib')))

        if isinstance(nib_fns,str) :
            nib_fns = [nib_fns]
        # for each .nib found, add to db
        # if there is a collision of names, those specified in files (not dirs)
        # takes precedence without warning (explicit files processed last)
        for fn in dir_nibs+nib_fns :

            # open the nib file
            nib_path,nib_fn,nib_base,nib_ext = get_file_parts(fn)
            fn, nib_f = _nib_fd(fn)
            self._db_map[nib_base] = nib_f

            # store some info, validating the header as we go
            self.db_info[nib_base]['path'] = fn
            nbases = validate_nib_file(self._db_map[nib_base])
            self.db_info[nib_base]['nbases'] = nbases

    def __del__(self) :
        '''import this
        ...Explicit is better than implicit...

        Explicitly close all of the .nib file handles this instance opened.
        '''
        for nib_f in self._db_map.values() :
            nib_f.close()

    def _get_db_map(self,name) :
        '''Gets appropriate file handle for the requested name, raises NibDBException
        if it cannot be found'''
        try :
            return self._db_map[name]
        except KeyError :
            raise NibDBException('Sequence name %s not found in NibDB'%name)

    def get_fasta(self,name,start=0,end=-1,strand='+',mask=NOMASK) :
        '''Get the fasta record for the specified arguments, returns (header,sequence)
        tuple.'''

        nib_f = self._get_db_map(name)
        # NOTE(review): get_nib is expected to be defined earlier in this
        # module (not visible here) -- confirm it returns a (header,seq) tuple
        return get_nib(nib_f,start,end,strand,mask)

    def get_fasta_batch(self,recs,mask=NOMASK) :
        '''Batch version of *get_fasta* method. *recs* is a list of lists/tuples
        with (<chromo>,<start>,<end>,<strand>). Returns list of (header,sequence)
        tuples in the same sequence as the input records.'''

        # gather the records for each chromosome together
        chrom_recs = dd(list)
        for i,r in enumerate(recs) :
            chrom_recs[r[0]].append((i,r)) # recs are (index,<tuple>)

        # extract sequences
        all_chrom_recs = []
        for chrom, rec_list in chrom_recs.items() :
            # sorted lists make sequence extraction efficient
            rec_list.sort(key=lambda x: x[1][1]) # recs are (index,<tuple>)

            # separate indexes from records, extract for this chromo
            indexes, c_recs = zip(*rec_list)

            # get_nib_batch requires list of (<start>,<end>,<strand>) tuples, remove
            # chromo in first position
            c_recs = [r[1:] for r in c_recs]

            nib_f = self._get_db_map(chrom)
            # NOTE(review): get_nib_batch is expected from earlier in this module
            headers, seqs = get_nib_batch(nib_f,c_recs,mask)

            # return the sequences to a (index,(header,sequence)) list
            all_chrom_recs.extend(zip(indexes,zip(headers,seqs)))

        # put the sequences back in the original order
        all_chrom_recs.sort(key=lambda x: x[0]) # recs are (index,<tuple>) again
        indexes, recs = zip(*all_chrom_recs)

        return zip(*recs)

    def get_fasta_from_bed(self,bed,mask=NOMASK) :
        '''Accepts either a chipsequtil.BEDFile instance or a filename for a BED
        file (used to construct a BEDFile instance) and returns the fasta
        records for all records in order.'''

        # determine if *bed* is a filename or a BEDFile
        if isinstance(bed,str) : # filename
            bed = BEDFile(bed)

        # construct the records
        recs = []
        for rec in bed :
            if rec['chrom'].lower().startswith('track') : # track line, skip
                continue
            recs.append((rec['chrom'],int(rec['chromStart']),int(rec['chromEnd']),rec['strand']))

        return self.get_fasta_batch(recs,mask)

    def get_seq(self,name,start=0,end=-1,strand='+',mask=NOMASK) :
        '''Extract sequence from sequence *name*. Other arguments are passed
        directly to *get_nib_seq* function.'''

        nib_f = self._get_db_map(name)
        return get_nib_seq(nib_f,start,end,strand,mask)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chipsequtil-master/src/chipsequtil/plotting.py Mon Mar 07 16:18:10 2016 -0500 @@ -0,0 +1,24 @@ +import math + +from matplotlib.pyplot import hist, plot, savefig, title, show, xticks, yticks, figure, clf + +from chipsequtil import get_gc_content + +def plot_gc_content(sequences,bins=10,fn=None) : + + # calculate all the GC contents, sort them + gc_contents = map(get_gc_content,sequences) + gc_contents.sort() + + f = figure() + points = hist(gc_contents,bins=bins) + if fn : + savefig(fn) + else : + show() + clf() + + +def plot_pos_neg_peaks(pos_peaks,neg_peaks) : + '''Plot # pos peaks/# neg peaks by p-value''' + pass
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chipsequtil-master/src/chipsequtil/sampling.py Mon Mar 07 16:18:10 2016 -0500 @@ -0,0 +1,252 @@ + +import math +import random +import re +import sys +from collections import defaultdict + +from chipsequtil import get_org_settings, get_gc_content, get_gc_content_distribution, RefGeneFile +from nib import NibDB, NibException + +def kl_divergence(p,q) : + """Return Kullback-Leibler divergence for two probability distributions + p and q. p and q should be indexable objects of the same length where + p_i corresponds to q_i. + """ + kl_sum = 0. + for p_i, q_i in zip(p,q) : + if p_i != 0 and q_i != 0 : + kl_sum += p_i * math.log(p_i/q_i) + return kl_sum + +def rejection_sample_bg(fg_dict,organism,bins=100,num_samples=None,verbose=False, + bg_match_epsilon=1e-3) : + '''Generate background sequences according to the size, distance from genes, + and GC content distributions of the supplied foreground sequences. *fg_dict* + is a dictionary of <header>:<sequence> items, where the first part of the + header must contain: + + >chrX:<start>-<end> + + *organism* is a string that will be used to call the *chipsequtil.get_org + settings* function and uses the 'genome_dir' and 'annotation_path' keys. + *bins* is the number of bins to use for representing the GC content + distribution. 
Function returns a dictionary of <header>:<sequence> items + of generated background sequences.''' + + nib_db = NibDB(nib_dirs=[get_org_settings(organism)['genome_dir']]) + tss_fn = get_org_settings(organism)['annotation_path'] + tss = defaultdict(list) + for rec in RefGeneFile(tss_fn) : + tss[rec['chrom']].append((int(rec['txStart']),int(rec['txEnd']),)) + + # for each peak find the chromosome, distance to nearest + # gene, size of peaks in bases, and GC content + num_samples = len(fg_dict) if not num_samples else num_samples + dists,sizes=[],[] + + for header,seq in fg_dict.items() : + + # chromosome first field in fasta headers from bed2seq.bedtoseq + chrom = header.split(':')[0] + + # adjust chromosomes in special cases + if re.search('random',chrom.lower()) or chrom.lower() == 'chrm' : + continue + + # start first int in second field of bed2seq.bedtoseq header + start = int(header.split(':')[1].split('-')[0]) + midpoint = start + len(seq)/2 + + # figure out which chromosome we're working on + tss_chr = tss[chrom] + + # dsts_to_genes is the distance of this peak from all the genes, find minimum + dists_to_genes = [(s[0]-midpoint) for s in tss_chr] + try : + min_dist = min(dists_to_genes,key=lambda x : abs(x)) + dists.append(min_dist) + except : + err_str = 'Warning: no genes were found for sequence with header' \ + ' %s, not using to calculate distributions.\n'%header + sys.stderr.write(err_str) + + # calculate # bases + sizes.append(len(seq)) + + # GC content distribution for the foreground sequences + gc_dist = get_gc_content_distribution(fg_dict.values(),bins=bins) + + # max_gc is # peaks w/ highest GC content + max_gc = max(gc_dist) + + # gene_starts is a list of all genes in (chromosome,gene start) tuples + gene_starts=[] + for key in tss.keys(): + chrom=key.split('chr')[-1] + for x in tss[key]: + gene_starts.append((key,x[0])) + + # encapsulated function for proposing sequences + def propose_sequence(dists, gene_starts, sizes, nib_db) : + # sample a 
random distance from the list of distances + d = random.choice(dists) + + # pick a random gene + chrom, coord = random.choice(gene_starts) + + # propose a starting point for the bg sequence + midpoint = coord-d+random.randint(-100,100) + + # propose a size for the bg sequence + size = random.choice(sizes) + start = int(midpoint-int(size/2)) + stop = int(midpoint+int(size/2)) + + #sys.stderr.write("%s:coord=%d size=%d midpoint=%d d=%d\n"%(chrom,coord,size,midpoint,d)) + # if start or stop are negative, skip and try again + if start < 0 or stop < 0 : seq = None + + # randomly choose strand + strand = '+' if random.random() > 0.5 else '-' + + # extract the proposed sequence + try : + nib_title, seq = nib_db.get_fasta(chrom,start,stop,strand) + except IOError, e : + if verbose : sys.stderr.write('IOError in NibDB, skipping: %s,%d-%d,%s\n'%(chrom,start,stop,strand)) + seq = None + except NibException, e : + if verbose : sys.stderr.write('NibDB.get_fasta error, %s\n'%e) + seq = None + + header = '%s:%d-%d'%(chrom,start,stop) + + return header, seq + + + # build gc content distribution based on seq length and + # distance from TSS foreground distributions + # keep sampling sequences until the distribution stops + # changing a lot (KL divergence < epsilon) + bg_gc_cnts = [1.]*bins + converged = False + epsilon = bg_match_epsilon + if verbose : sys.stderr.write('Building empirical background GC content distribution\n') + while not converged : + + # propose a sequence + header, seq = propose_sequence(dists,gene_starts,sizes,nib_db) + + # sometimes this happens when there is an error, just try again + if seq is None : + continue + + # determine the GC bin for this sequence + gc_content = get_gc_content(seq) + gc_bin = -1 + for i in range(bins) : + win_start = i/float(bins) + win_end = (i+1)/float(bins) + if gc_content >= win_start and gc_content < win_end : + gc_bin = i + break + + # update the gc content distribution + sum_cnts = float(sum(bg_gc_cnts)) + if sum_cnts != 0 : # 
! on first sequence + + # calculate the current distributions + last_gc_p = map(lambda x:x/sum_cnts,bg_gc_cnts) + bg_gc_cnts[gc_bin] += 1 + new_gc_p = map(lambda x:x/sum_cnts,bg_gc_cnts) + + # calculate the kl divergence between last distribution + # and current one, stopping if less than epsilon + kl_d = kl_divergence(new_gc_p,last_gc_p) + if verbose : sys.stderr.write('dist to converge: %.3g\r'%(kl_d-epsilon)) + if kl_d < epsilon : + converged = True + + else : + bg_gc_cnts[gc_bin] += 1 + + if verbose : sys.stderr.write('\ndone\n') + + # add pseudocounts to account for missing data in bg as to avoid + # inappropriate scaling in rejection sampling step + # the fg bin with the largest value that corresponds to an empty + # bg bin is used to calculate the number of pseudocounts so that + # the resulting bg bin has the same propotion of counts in it as + # the original fg bin. This is calculated as: + # + # x_{pseudo} = \frac{p_i\sum_{i=1}^{N}a_i}{1-p_iN} + # + # where p_i is the value of the max fg bin w/ zero in the bg bin + # x_{pseudo} is added to every bin + pseudocounts = 0 + for fg_i, bg_i in zip(gc_dist,bg_gc_cnts) : + if fg_i != 0 and bg_i == 0 and fg_i*len(fg_dict) > pseudocounts : + # if fg_i > 1/sum(bg_gc_cnts) this won't work, but that *shouldn't* + # ever happen + if fg_i >= 1./sum(bg_gc_cnts) : + raise Exception('There was a numeric issue in the rejection sampling routine, please try it again') + sys.stderr.write(str([fg_i,sum(bg_gc_cnts),len(bg_gc_cnts),1.*fg_i*len(bg_gc_cnts),bg_gc_cnts])+'\n') + sys.stderr.flush() + pseudocounts = (fg_i*sum(bg_gc_cnts))/(1-1.*fg_i*len(bg_gc_cnts)) + + bg_gc_cnts = map(lambda x: x+pseudocounts/sum(bg_gc_cnts),bg_gc_cnts) + bg_gc_dist = map(lambda x: x/sum(bg_gc_cnts),bg_gc_cnts) + + # last, find the multiplier that causes the background gc distribution to + # envelope the foreground gc dist + z_coeff = gc_dist[0]/bg_gc_dist[0] + for fg_i, bg_i in zip(gc_dist[1:],bg_gc_dist[1:]) : + z_coeff = max(z_coeff,fg_i/bg_i) + 
bg_gc_dist = map(lambda x: x*z_coeff,bg_gc_dist) + + # start generating bg sequences + bg_dict = {} + + bg_gcs,bg_sizes=[],[] + + # generate a bg sequence for every fg sequence + for i in range(num_samples): + if verbose : sys.stderr.write('%d/%d'%(i,num_samples)) + + # propose sequences until one is accepted + accepted_sequence = False + while not accepted_sequence: + if verbose : sys.stderr.write('.') + + # propose a sequence + header, seq = propose_sequence(dists,gene_starts,sizes,nib_db) + + # problem occured in proposing sequence, just keep going + if seq is None : continue + + # determine the GC bin for this sequence + gc_content = get_gc_content(seq) + gc_bin = -1 + for i in range(bins) : + win_start = i/float(bins) + win_end = (i+1)/float(bins) + if gc_content >= win_start and gc_content < win_end : + gc_bin = i + continue + + # pick a uniform random number such that it does not exceed + # the maximum GC content distribution over bins + # if the random number is <= the GC content for this + # proposed sequence, accept, otherwise reject + r = random.random() * bg_gc_dist[gc_bin] + if r > gc_dist[gc_bin] : + continue + else: + bg_gcs.append(x) + #bg_sizes.append(size) + accepted_sequence = True + bg_dict[header] = seq + + if verbose : sys.stderr.write('\r') + return bg_dict
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chipsequtil-master/src/chipsequtil/seq.py Mon Mar 07 16:18:10 2016 -0500 @@ -0,0 +1,265 @@ +from itertools import izip +from textwrap import wrap + +# FASTA functions and classes +def fasta_itr(f) : + '''Returns a generator that iterates through a FASTA formatted file. + *f* may be either a text or gzipped file, or a file-like python object + representing either of these. Records are returned in the order they + are found.''' + if isinstance(f,str) : + f = open(f) + + # check for magic number 1f 8b indicating gzip file, I dunno, just cuz + if f.read(2) == "\x1f\x8b" : f = gzip.GzipFile(f) + else : f.seek(0) + + curr_header, curr_seq = None, None + for r in f : + if r.startswith('>') : + if curr_header is not None : + yield (curr_header, curr_seq) + curr_header = r[1:].strip() + curr_seq = '' + else : + curr_seq += r.strip() + # return the last record + yield (curr_header,curr_seq) + +def fasta_to_dict(f) : + '''Returns a dictionary whose keys are FASTA headers and values are + sequences. *f* may be a text, gzipped file, or a file-like + python object representing either of these.''' + return dict(fasta_itr(f)) + +def write_fasta_to_file(fasta,f,linelen=None) : + '''Writes the FASTA records in *fasta* to file specified in *f*. *fasta* + may be a dictionary like that returned by *fasta_to_dict* or a *FASTAFile* + instance. *f* may be a filename or a file-like object opened with write + mode.''' + if isinstance(fasta,dict) : + fasta_itr = fasta.iteritems() + else : + fasta_itr = fasta + + if isinstance(f,str) : + f = open(str,'w') + + for header, seq in fasta_itr : + if linelen is not None : + seq = fill(seq,linelen) + f.write('>%s\n%s\n'%(header,seq)) + f.close() + + +class FASTAFile(object) : + '''A file-like object providing information and statistics about the + sequences in a FASTA formatted file. 
Efficiently iterates through a + text or gzipped FASTA file and provides sequential or random access to + the records. Instances store header and sequence data as they are read. + + >>> fasta_str = StringIO(">seq1\\nACATAGGGAT\\n>seq2\\nTTATNTAGATA\\n") + >>> fasta_f = FASTAFile(fasta_str) + >>> [r for r in fasta_f] + [('seq1', 'ACATAGGGAT'), ('seq2', 'TTATNTAGATA')] + >>> fasta_f['seq1'] + ACATAGGGAT + >>> fasta_f.headers + ['seq1', 'seq2'] + >>> fasta_f.sequences + ['ACATAGGGAT', 'TTATNTAGATA'] + + Instances have the following members: + + **headers** + list of FASTA headers in original order + + **sequences** + list of FASTA sequences in original order + + .. NOTE:: + The members **headers** and **sequences** are not available until the + the FASTA records have been iterated once. + + When indexing like `fasta_f['seq1']`, the class assumes all headers are + unique, iterating does not make this assumption. + ''' + + def __init__(self,f) : + self._f = f + self._fasta_itr = fasta_itr(f) + self.headers = [] + self.sequences = [] + self._dict = {} + + def __getitem__(self,key) : + return self._dict[key] + + def __setitem__(self,key,val) : + self._dict[key] = val + + def next(self) : + '''Returns next FASTA record in the file as (header, sequence) tuple.''' + + if self._fasta_itr is None : + self._fasta_itr = izip(self.headers,self.sequences) + + try : + header, seq = self._fasta_itr.next() + except StopIteration, e : + self._fasta_itr = None + self._f = None + raise e + + if self._f is not None : + # this means we're not done reading through the file yet + self.headers.append(header) + self.sequences.append(seq) + self._dict[header] = seq + + return header, seq + + def __iter__(self) : + return self + +# FASTQ functions and classes +def fastq_itr(f) : + '''Returns a generator that iterates through a FASTQ formatted file. + *f* may be either a text or gzipped file, or a file-like python object + representing either of these. 
Records are returned in the order they + are found.''' + if isinstance(f,str) : + f = open(f) + + # check for magic number 1f 8b indicating gzip file, I dunno, just cuz + if f.read(2) == "\x1f\x8b" : f = gzip.GzipFile(f) + else : f.seek(0) + + SEQ, QUAL = 0,1 + in_region = SEQ + curr_header, curr_seq, curr_qual = None, None, None + for r in f : + if r.startswith('@') : + if curr_header is not None : + yield (curr_header, (curr_seq, curr_qual)) + curr_header = r[1:].strip() + curr_seq = '' + curr_qual = '' + in_region = SEQ + elif r.startswith('+') : + in_region = QUAL + else : + curr_field = r.strip() + if in_region == SEQ : + curr_seq += curr_field + elif in_region == QUAL : + curr_qual += curr_field + + # return the last record + yield (curr_header,(curr_seq,curr_qual)) + +def fastq_to_dict(f) : + '''Returns a dictionary whose keys are FASTQ headers and values are + sequences. *f* may be a text, gzipped file, or a file-like + python object representing either of these.''' + return dict(fastq_itr(f)) + +def write_fastq_to_file(fastq,f,linelen=None) : + '''Writes the FASTQ records in *fasta* to file specified in *f*. *fastq* + may be a dictionary like that returned by *fastq_to_dict* or a *FASTQFile* + instance. *f* may be a filename or a file-like object opened with write + mode.''' + if isinstance(fastq,dict) : + fastq_itr = fasta.iteritems() + else : + fastq_itr = fasta + + f_out = open(str,'w') if isinstance(f,str) else f + + for header, (seq, qual) in fastq_itr : + if linelen is not None : + seq = fill(seq,linelen) + f_out.write('>%s\n%s\n'%(header,seq)) + + if isinstance(f,str) : + f_out.close() + + +class FASTQFile(object) : + '''A file-like object providing information and statistics about the + sequences in a FASTQ formatted file. Efficiently iterates through a + text or gzipped FASTQ file and provides sequential or random access to + the records. 
Instances store header and sequence data as they are read + + >>> fastq_str = StringIO("@seq1\\nACATAGGGAT\\n+seq2\\nY^_cccQYJQ\\n + @seq2\\nTTATNTAGAT\\n+seq2\\nY^_cJcQQJQ") + >>> fastq_f = FASTQFile(fastq_str) + >>> [r for r in fastq_f] + [('seq1', ('ACATAGGGAT', 'Y^_cccQYJQ')), ('seq2', ('TTATNTAGATA', 'Y^_cJcQQJQ'))] + >>> fastq_f['seq1'] + ('seq1', ('ACATAGGGAT', 'Y^_cccQYJQ')) + >>> fastq_f.headers + ['seq1', 'seq2'] + >>> fastq_f.sequences + ['ACATAGGGAT', 'TTATNTAGAT'] + >>> fastq_f.quals + ['Y^_cccQYJQ', 'Y^_cJcQQJQ'] + + Instances have the following members: + + **headers** + list of FASTQ headers in original order + + **sequences** + list of FASTQ sequences in original order + + **quals** + list of FASTQ quality scores in original order + + .. NOTE:: + The members **headers**, **sequences**, and **quals** are not available + until the the FASTQ records have been iterated once + + When indexing like `fastq_f['seq1']`, the class assumes all headers are + unique, iterating does not make this assumption. + ''' + + def __init__(self,f) : + self._f = f + self._fastq_itr = fastq_itr(f) + self.headers = [] + self.sequences = [] + self.quals = [] + self._dict = {} + + def __getitem__(self,key) : + return self._dict[key] + + def __setitem__(self,key,val) : + self._dict[key] = val + + def next(self) : + '''Returns next FASTA record in the file as (header, sequence) tuple.''' + + if self._fastq_itr is None : + self._fastq_itr = izip(self.headers,self.sequences) + + try : + header, (seq, qual) = self._fastq_itr.next() + except StopIteration, e : + self._fastq_itr = None + self._f = None + raise e + + if self._f is not None : + # this means we're not done reading through the file yet + self.headers.append(header) + self.sequences.append(seq) + self.quals.append(qual) + self._dict[header] = (seq, qual) + + return header, (seq, qual) + + def __iter__(self) : + return self +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chipsequtil-master/src/chipsequtil/util.py Mon Mar 07 16:18:10 2016 -0500 @@ -0,0 +1,131 @@ +"""Utility/helper classes and functions used by the chipsequtil package. +""" + +import textwrap + +from optparse import IndentedHelpFormatter + +class MultiLineHelpFormatter(IndentedHelpFormatter) : + """An OptionParser formatter that preserves newline characters in + description and epilog fields and word-wraps all sequences of text + not interrupted by newline characters. + """ + + def _format_text(self, text) : + """Wrap paragraphs of text individually separated by + newlines (preserves explicit newline characters). + """ + text_width = self.width - self.current_indent + indent = " "*self.current_indent + output_text = [] + paragraphs = text.split('\n') + for p in paragraphs : + output_text.append(textwrap.fill(p, + text_width, + initial_indent=indent, + subsequent_indent=indent)) + return '\n'.join(output_text) + + + + +# A binary ordered tree example +# shamelessly copied from: http://code.activestate.com/recipes/286239-binary-ordered-tree/ +class CNode: + left , right, data = None, None, 0 + + def __init__(self, data): + # initializes the data members + self.left = None + self.right = None + self.data = data + + +class KeyedBinaryTree : # do this later... 
+ pass + + +class CBOrdTree: + def __init__(self): + # initializes the root member + self.root = None + + def addNode(self, data): + # creates a new node and returns it + return CNode(data) + + def insert(self, root, data): + # inserts a new data + if root == None: + # it there isn't any data + # adds it and returns + return self.addNode(data) + else: + # enters into the tree + if data <= root.data: + # if the data is less than the stored one + # goes into the left-sub-tree + root.left = self.insert(root.left, data) + else: + # processes the right-sub-tree + root.right = self.insert(root.right, data) + return root + + def lookup(self, root, target): + # looks for a value into the tree + if root == None: + return 0 + else: + # if it has found it... + if target == root.data: + return 1 + else: + if target < root.data: + # left side + return self.lookup(root.left, target) + else: + # right side + return self.lookup(root.right, target) + + def minValue(self, root): + # goes down into the left + # arm and returns the last value + while(root.left != None): + root = root.left + return root.data + + def maxDepth(self, root): + if root == None: + return 0 + else: + # computes the two depths + ldepth = self.maxDepth(root.left) + rdepth = self.maxDepth(root.right) + # returns the appropriate depth + return max(ldepth, rdepth) + 1 + + def size(self, root): + if root == None: + return 0 + else: + return self.size(root.left) + 1 + self.size(root.right) + + def printTree(self, root): + # prints the tree path + if root == None: + pass + else: + self.printTree(root.left) + print root.data, + self.printTree(root.right) + + def printRevTree(self, root): + # prints the tree path in reverse + # order + if root == None: + pass + else: + self.printRevTree(root.right) + print root.data, + self.printRevTree(root.left) +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chipsequtil-master/uninstall.py Mon Mar 07 16:18:10 2016 -0500 @@ -0,0 +1,31 @@ +#TODO this doesn't work yet - consider doing this later, use the install +# -f|--force option for now + +# distutils doesn't handle uninstalling things, this class deletes all the files +# this package installs if it has appropriate permissions to do it, otherwise +# print out the files that must be deleted to uninstall +class uninstall(build_py) : + def run(self) : + + + # delete modules + print self.distribution.py_modules + + # delete extensions + print self.distribution.ext_modules + + # delete packages + print self.distribution.packages + + # delete package data + print self.distribution.package_data + + # delete scripts + print self.distribution.scripts + + print self.distribution.get_command_obj('install').get_outputs() + + def remove_path(self,path) : + '''Attempt to remove the specified path, returning non-zero status code on error''' + +