Mercurial > repos > alenail > chipsequtil_old
changeset 5:b2a929e58437 draft
Uploaded
author | alenail |
---|---|
date | Mon, 28 Mar 2016 12:31:17 -0400 |
parents | 19d80c995d7b |
children | 2ef1fca6d530 |
files | chipsequtil-master/docs/._Makefile chipsequtil-master/docs/._get_script_help.py chipsequtil-master/docs/._source chipsequtil-master/docs/Makefile chipsequtil-master/docs/get_script_help.py chipsequtil-master/docs/source/._conf.py chipsequtil-master/docs/source/._index.rst chipsequtil-master/docs/source/._module_reference.rst chipsequtil-master/docs/source/._module_src chipsequtil-master/docs/source/._quick_start.rst chipsequtil-master/docs/source/._script_reference.rst chipsequtil-master/docs/source/conf.py chipsequtil-master/docs/source/index.rst chipsequtil-master/docs/source/module_reference.rst chipsequtil-master/docs/source/module_src/._chipsequtil.rst chipsequtil-master/docs/source/module_src/._file_wrappers.rst chipsequtil-master/docs/source/module_src/._motiftools.rst chipsequtil-master/docs/source/module_src/._nib.rst chipsequtil-master/docs/source/module_src/._org_settings.rst chipsequtil-master/docs/source/module_src/._seq.rst chipsequtil-master/docs/source/module_src/._util.rst chipsequtil-master/docs/source/module_src/chipsequtil.rst chipsequtil-master/docs/source/module_src/file_wrappers.rst chipsequtil-master/docs/source/module_src/motiftools.rst chipsequtil-master/docs/source/module_src/nib.rst chipsequtil-master/docs/source/module_src/org_settings.rst chipsequtil-master/docs/source/module_src/seq.rst chipsequtil-master/docs/source/module_src/util.rst chipsequtil-master/docs/source/quick_start.rst chipsequtil-master/docs/source/script_reference.rst chipsequtil-master/examples/._mapping chipsequtil-master/examples/._nib chipsequtil-master/examples/._seq chipsequtil-master/examples/mapping/._map_to_known_gene.sh chipsequtil-master/examples/mapping/._test_peaks.xls chipsequtil-master/examples/mapping/map_to_known_gene.sh chipsequtil-master/examples/mapping/test_peaks.xls chipsequtil-master/examples/nib/._shuffled_peaks.bed chipsequtil-master/examples/nib/._test_batch_fasta.py chipsequtil-master/examples/nib/._test_nib_db.py 
chipsequtil-master/examples/nib/shuffled_peaks.bed chipsequtil-master/examples/nib/test_batch_fasta.py chipsequtil-master/examples/nib/test_nib_db.py chipsequtil-master/examples/seq/._test_chipsequtil_seq.py chipsequtil-master/examples/seq/test_chipsequtil_seq.py chipsequtil-master/scripts/._THEME.sh chipsequtil-master/scripts/._build_chipseq_infosite.py chipsequtil-master/scripts/._chipseq_pipeline.py chipsequtil-master/scripts/._chipseq_pipeline_wo_ctrl.py chipsequtil-master/scripts/._combine_gerald_stats.py chipsequtil-master/scripts/._compare_microarray_binding.py chipsequtil-master/scripts/._construct_bg_fasta.py chipsequtil-master/scripts/._create_pipeline_script.py chipsequtil-master/scripts/._extract_promoters.py chipsequtil-master/scripts/._filter_bed_by_position_count.py chipsequtil-master/scripts/._filter_gps_peaks.py chipsequtil-master/scripts/._filter_macs_peaks.py chipsequtil-master/scripts/._filter_mapped_known_genes.py chipsequtil-master/scripts/._generate_stats_doc.py chipsequtil-master/scripts/._gerald_stats.py chipsequtil-master/scripts/._gerald_to_bed.py chipsequtil-master/scripts/._integrate_macs_ucsc.py chipsequtil-master/scripts/._join_mapped_known_genes.py chipsequtil-master/scripts/._kg_to_gff.py chipsequtil-master/scripts/._map_intervals.py chipsequtil-master/scripts/._map_peaks_to_genes.py chipsequtil-master/scripts/._map_peaks_to_known_genes.py chipsequtil-master/scripts/._motif_scan.py chipsequtil-master/scripts/._nibFrag.py chipsequtil-master/scripts/._org_settings.py chipsequtil-master/scripts/._peaks_to_fasta.py chipsequtil-master/scripts/._plot_peak_loc_dist.py chipsequtil-master/scripts/._plot_pos_vs_neg_peaks.py chipsequtil-master/scripts/._probeset_to_known_gene.py chipsequtil-master/scripts/._rejection_sample_fasta.py chipsequtil-master/scripts/._sort_bed.py chipsequtil-master/scripts/._split_file.py chipsequtil-master/scripts/._split_qsub.py chipsequtil-master/scripts/._wait_for_jobid.py 
chipsequtil-master/scripts/._wait_for_qsub.py chipsequtil-master/scripts/._wqsub.py chipsequtil-master/scripts/._wqsub_drmaa.py chipsequtil-master/scripts/THEME.sh chipsequtil-master/scripts/build_chipseq_infosite.py chipsequtil-master/scripts/chipseq_pipeline.py chipsequtil-master/scripts/chipseq_pipeline_wo_ctrl.py chipsequtil-master/scripts/combine_gerald_stats.py chipsequtil-master/scripts/compare_microarray_binding.py chipsequtil-master/scripts/construct_bg_fasta.py chipsequtil-master/scripts/create_pipeline_script.py chipsequtil-master/scripts/extract_promoters.py chipsequtil-master/scripts/filter_bed_by_position_count.py chipsequtil-master/scripts/filter_gps_peaks.py chipsequtil-master/scripts/filter_macs_peaks.py chipsequtil-master/scripts/filter_mapped_known_genes.py chipsequtil-master/scripts/generate_stats_doc.py chipsequtil-master/scripts/gerald_stats.py chipsequtil-master/scripts/gerald_to_bed.py chipsequtil-master/scripts/integrate_macs_ucsc.py chipsequtil-master/scripts/join_mapped_known_genes.py chipsequtil-master/scripts/kg_to_gff.py chipsequtil-master/scripts/map_intervals.py chipsequtil-master/scripts/map_peaks_to_genes.py chipsequtil-master/scripts/map_peaks_to_known_genes.py chipsequtil-master/scripts/motif_scan.py chipsequtil-master/scripts/nibFrag.py chipsequtil-master/scripts/org_settings.py chipsequtil-master/scripts/peaks_to_fasta.py chipsequtil-master/scripts/plot_peak_loc_dist.py chipsequtil-master/scripts/plot_pos_vs_neg_peaks.py chipsequtil-master/scripts/probeset_to_known_gene.py chipsequtil-master/scripts/rejection_sample_fasta.py chipsequtil-master/scripts/sort_bed.py chipsequtil-master/scripts/split_file.py chipsequtil-master/scripts/split_qsub.py chipsequtil-master/scripts/wait_for_jobid.py chipsequtil-master/scripts/wait_for_qsub.py chipsequtil-master/scripts/wqsub.py chipsequtil-master/scripts/wqsub_drmaa.py chipsequtil-master/src/._chipsequtil chipsequtil-master/src/chipsequtil/.___init__.py 
chipsequtil-master/src/chipsequtil/._chipsequtil.py chipsequtil-master/src/chipsequtil/._motiftools.py chipsequtil-master/src/chipsequtil/._nib.py chipsequtil-master/src/chipsequtil/._plotting.py chipsequtil-master/src/chipsequtil/._sampling.py chipsequtil-master/src/chipsequtil/._seq.py chipsequtil-master/src/chipsequtil/._util.py chipsequtil-master/src/chipsequtil/__init__.py chipsequtil-master/src/chipsequtil/chipsequtil.py chipsequtil-master/src/chipsequtil/motiftools.py chipsequtil-master/src/chipsequtil/nib.py chipsequtil-master/src/chipsequtil/plotting.py chipsequtil-master/src/chipsequtil/sampling.py chipsequtil-master/src/chipsequtil/seq.py chipsequtil-master/src/chipsequtil/util.py chipsequtil/map_to_known_genes.py chipsequtil/map_to_known_genes.xml chipsequtil/tool_dependencies.xml |
diffstat | 139 files changed, 297 insertions(+), 12242 deletions(-) [+] |
line wrap: on
line diff
--- a/chipsequtil-master/docs/Makefile Mon Mar 28 11:56:10 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,89 +0,0 @@ -# Makefile for Sphinx documentation -# - -# You can set these variables from the command line. -SPHINXOPTS = -SPHINXBUILD = sphinx-build -PAPER = -BUILDDIR = build - -# Internal variables. -PAPEROPT_a4 = -D latex_paper_size=a4 -PAPEROPT_letter = -D latex_paper_size=letter -ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source - -.PHONY: help clean html dirhtml pickle json htmlhelp qthelp latex changes linkcheck doctest - -help: - @echo "Please use \`make <target>' where <target> is one of" - @echo " html to make standalone HTML files" - @echo " dirhtml to make HTML files named index.html in directories" - @echo " pickle to make pickle files" - @echo " json to make JSON files" - @echo " htmlhelp to make HTML files and a HTML help project" - @echo " qthelp to make HTML files and a qthelp project" - @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" - @echo " changes to make an overview of all changed/added/deprecated items" - @echo " linkcheck to check all external links for integrity" - @echo " doctest to run all doctests embedded in the documentation (if enabled)" - -clean: - -rm -rf $(BUILDDIR)/* - -html: - $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html - @echo - @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." - -dirhtml: - $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml - @echo - @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." - -pickle: - $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle - @echo - @echo "Build finished; now you can process the pickle files." - -json: - $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json - @echo - @echo "Build finished; now you can process the JSON files." 
- -htmlhelp: - $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp - @echo - @echo "Build finished; now you can run HTML Help Workshop with the" \ - ".hhp project file in $(BUILDDIR)/htmlhelp." - -qthelp: - $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp - @echo - @echo "Build finished; now you can run "qcollectiongenerator" with the" \ - ".qhcp project file in $(BUILDDIR)/qthelp, like this:" - @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/ChIPSeqUtil.qhcp" - @echo "To view the help file:" - @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/ChIPSeqUtil.qhc" - -latex: - $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex - @echo - @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." - @echo "Run \`make all-pdf' or \`make all-ps' in that directory to" \ - "run these through (pdf)latex." - -changes: - $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes - @echo - @echo "The overview file is in $(BUILDDIR)/changes." - -linkcheck: - $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck - @echo - @echo "Link check complete; look for any errors in the above output " \ - "or in $(BUILDDIR)/linkcheck/output.txt." - -doctest: - $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest - @echo "Testing of doctests in the sources finished, look at the " \ - "results in $(BUILDDIR)/doctest/output.txt."
--- a/chipsequtil-master/docs/get_script_help.py Mon Mar 28 11:56:10 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,99 +0,0 @@ -#!/usr/bin/env python - -import glob -import signal -import time -from subprocess import Popen, PIPE -from textwrap import TextWrapper - -class Alarm(Exception): - pass - -def alarm_handler(signum, frame): - raise Alarm - -signal.signal(signal.SIGALRM, alarm_handler) - -scripts = [#'../scripts/build_chipseq_infosite.py', - '../scripts/chipseq_pipeline.py', - #'../scripts/combine_gerald_stats.py', - #'../scripts/compare_microarray_binding.py', - '../scripts/create_pipeline_script.py', - '../scripts/extract_promoters.py', - '../scripts/filter_bed_by_position_count.py', - '../scripts/filter_macs_peaks.py', - '../scripts/filter_gps_peaks.py', - '../scripts/filter_mapped_known_genes.py', - #'../scripts/generate_stats_doc.py', - '../scripts/gerald_stats.py', - '../scripts/gerald_to_bed.py', - #'../scripts/integrate_macs_ucsc.py', - '../scripts/join_mapped_known_genes.py', - '../scripts/map_intervals.py', - '../scripts/map_peaks_to_genes.py', - '../scripts/map_peaks_to_known_genes.py', - '../scripts/motif_scan.py', - '../scripts/nibFrag.py', - '../scripts/org_settings.py', - '../scripts/peaks_to_fasta.py', - '../scripts/plot_pos_vs_neg_peaks.py', - '../scripts/plot_peak_loc_dist.py', - #'../scripts/probeset_to_known_gene.py', - '../scripts/rejection_sample_fasta.py', - '../scripts/sort_bed.py', - #'../scripts/split_file.py', - #'../scripts/split_qsub.py', - #'../scripts/THEME.sh', - #'../scripts/wait_for_qsub.py', - '../scripts/wait_for_jobid.py', - '../scripts/wqsub.py', - '../scripts/wqsub_drmaa.py', - ] - -if __name__ == '__main__' : - - tw = TextWrapper(initial_indent=" ",subsequent_indent=" ") - script_help_out = '' - refs = '' - for script in scripts : - cmd = 'python %s -h'%script - p = Popen(cmd,shell=True,stdout=PIPE,stderr=PIPE) - - stdout, stderr = None, None - signal.alarm(3) # 3 seconds - try: - stdout, stderr = 
p.communicate() - signal.alarm(0) # reset the alarm - except Alarm: - pass - - script_str = script.replace('../scripts/','') - - - refs += ' - :ref:`%(script_str)s <%(script_str)s>`\n'%{'script_str':script_str} - script_help_out += '.. _%s:\n\n'%script_str - script_help_out += '%s::\n\n'%script_str - if stderr is None : - script_help_out += tw.fill('empty docstring\n') - else : - script_help_out += '\n'.join([' '+x for x in stdout.split('\n')]) - script_help_out += '\n'.join([' '+x for x in stderr.split('\n')]) - script_help_out += '\n\n' - script_help_out += ':ref:`top <top>`\n\n' - - rst_str = """\ -Illumina pipeline script reference -================================== - -The following is the output of the scripts provided by this package when invoked -on the command line with *-h*. - -.. _top: - -Scripts: -%(refs)s - -%(script_help_out)s -"""%{'refs':refs,'script_help_out':script_help_out} - - print rst_str
--- a/chipsequtil-master/docs/source/conf.py Mon Mar 28 11:56:10 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,198 +0,0 @@ -# -*- coding: utf-8 -*- -# -# ChIPSeqUtil documentation build configuration file, created by -# sphinx-quickstart on Mon Oct 31 13:12:52 2011. -# -# This file is execfile()d with the current directory set to its containing dir. -# -# Note that not all possible configuration values are present in this -# autogenerated file. -# -# All configuration values have a default; values that are commented out -# serve to show the default. - -import sys, os - -# If extensions (or modules to document with autodoc) are in another directory, -# add these directories to sys.path here. If the directory is relative to the -# documentation root, use os.path.abspath to make it absolute, like shown here. -#sys.path.append(os.path.abspath('.')) - -# -- General configuration ----------------------------------------------------- - -# Add any Sphinx extension module names here, as strings. They can be extensions -# coming with Sphinx (named 'sphinx.ext.*') or your custom ones. -extensions = ['sphinx.ext.autodoc', 'sphinx.ext.intersphinx', 'sphinx.ext.todo', 'sphinx.ext.coverage', 'sphinx.ext.pngmath'] - -# Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] - -# The suffix of source filenames. -source_suffix = '.rst' - -# The encoding of source files. -#source_encoding = 'utf-8' - -# The master toctree document. -master_doc = 'index' - -# General information about the project. -project = u'ChIPSeqUtil' -copyright = u'2011, Adam Labadorf' - -# The version info for the project you're documenting, acts as replacement for -# |version| and |release|, also used in various other places throughout the -# built documents. -# -# The short X.Y version. -version = '1.5' -# The full version, including alpha/beta/rc tags. -release = '1.5' - -# The language for content autogenerated by Sphinx. 
Refer to documentation -# for a list of supported languages. -#language = None - -# There are two options for replacing |today|: either, you set today to some -# non-false value, then it is used: -#today = '' -# Else, today_fmt is used as the format for a strftime call. -#today_fmt = '%B %d, %Y' - -# List of documents that shouldn't be included in the build. -#unused_docs = [] - -# List of directories, relative to source directory, that shouldn't be searched -# for source files. -exclude_trees = [] - -# The reST default role (used for this markup: `text`) to use for all documents. -#default_role = None - -# If true, '()' will be appended to :func: etc. cross-reference text. -#add_function_parentheses = True - -# If true, the current module name will be prepended to all description -# unit titles (such as .. function::). -#add_module_names = True - -# If true, sectionauthor and moduleauthor directives will be shown in the -# output. They are ignored by default. -#show_authors = False - -# The name of the Pygments (syntax highlighting) style to use. -pygments_style = 'sphinx' - -# A list of ignored prefixes for module index sorting. -#modindex_common_prefix = [] - - -# -- Options for HTML output --------------------------------------------------- - -# The theme to use for HTML and HTML Help pages. Major themes that come with -# Sphinx are currently 'default' and 'sphinxdoc'. -html_theme = 'default' - -# Theme options are theme-specific and customize the look and feel of a theme -# further. For a list of options available for each theme, see the -# documentation. -#html_theme_options = {} - -# Add any paths that contain custom themes here, relative to this directory. -#html_theme_path = [] - -# The name for this set of Sphinx documents. If None, it defaults to -# "<project> v<release> documentation". -#html_title = None - -# A shorter title for the navigation bar. Default is the same as html_title. 
-#html_short_title = None - -# The name of an image file (relative to this directory) to place at the top -# of the sidebar. -#html_logo = None - -# The name of an image file (within the static path) to use as favicon of the -# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 -# pixels large. -#html_favicon = None - -# Add any paths that contain custom static files (such as style sheets) here, -# relative to this directory. They are copied after the builtin static files, -# so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['_static'] - -# If not '', a 'Last updated on:' timestamp is inserted at every page bottom, -# using the given strftime format. -#html_last_updated_fmt = '%b %d, %Y' - -# If true, SmartyPants will be used to convert quotes and dashes to -# typographically correct entities. -#html_use_smartypants = True - -# Custom sidebar templates, maps document names to template names. -#html_sidebars = {} - -# Additional templates that should be rendered to pages, maps page names to -# template names. -#html_additional_pages = {} - -# If false, no module index is generated. -#html_use_modindex = True - -# If false, no index is generated. -#html_use_index = True - -# If true, the index is split into individual pages for each letter. -#html_split_index = False - -# If true, links to the reST sources are added to the pages. -#html_show_sourcelink = True - -# If true, an OpenSearch description file will be output, and all pages will -# contain a <link> tag referring to it. The value of this option must be the -# base URL from which the finished HTML is served. -#html_use_opensearch = '' - -# If nonempty, this is the file name suffix for HTML files (e.g. ".xhtml"). -#html_file_suffix = '' - -# Output file base name for HTML help builder. -htmlhelp_basename = 'ChIPSeqUtildoc' - - -# -- Options for LaTeX output -------------------------------------------------- - -# The paper size ('letter' or 'a4'). 
-#latex_paper_size = 'letter' - -# The font size ('10pt', '11pt' or '12pt'). -#latex_font_size = '10pt' - -# Grouping the document tree into LaTeX files. List of tuples -# (source start file, target name, title, author, documentclass [howto/manual]). -latex_documents = [ - ('index', 'ChIPSeqUtil.tex', u'ChIPSeqUtil Documentation', - u'Adam Labadorf', 'manual'), -] - -# The name of an image file (relative to this directory) to place at the top of -# the title page. -#latex_logo = None - -# For "manual" documents, if this is true, then toplevel headings are parts, -# not chapters. -#latex_use_parts = False - -# Additional stuff for the LaTeX preamble. -#latex_preamble = '' - -# Documents to append as an appendix to all manuals. -#latex_appendices = [] - -# If false, no module index is generated. -#latex_use_modindex = True - - -# Example configuration for intersphinx: refer to the Python standard library. -intersphinx_mapping = {'http://docs.python.org/': None}
--- a/chipsequtil-master/docs/source/index.rst Mon Mar 28 11:56:10 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,51 +0,0 @@ -.. ChIPSeqUtil documentation master file, created by - sphinx-quickstart on Mon Oct 31 13:12:52 2011. - You can adapt this file completely to your liking, but it should at least - contain the root `toctree` directive. - -Welcome to ChIPSeqUtil's documentation! -======================================= - -ChIPSeqUtil is a python module and accompanying set of scripts used in the -analysis of ChIPSeq short read data. It is designed as a 'push-button' solution -that is easy for non-linux-experts to use but is flexible and extensible enough -to accomodate special cases when they inevitably arise. The default pipeline -performs the following analysis steps: - -1. runs a peak caller (MACS by default) -2. optionally creates and stages bigwig files for viewing on UCSC Genome Browser -3. filters peaks based on confidence criteria (e.g. p-value) -4. maps peaks to genes using UCSC knownGene annotations -5. performs hypothesis-based motif analysis using TRANSFAC motifs -6. builds a web page consolidating results - -ChIPSeqUtil has the following dependencies: - - - MACS (or some other peaks caller) - - TAMO - - reStUtil - - pypeline - - bx python - -.. note:: add links to these bullets - -ChIPSeqUtil has only been tested on ubuntu-based linux distributions and no -certification is made for other OSes. That being said, some/all of it may -still work. - -Contents: - -.. toctree:: - :maxdepth: 2 - - quick_start - script_reference - module_reference - -Indices and tables -================== - -* :ref:`genindex` -* :ref:`modindex` -* :ref:`search` -
--- a/chipsequtil-master/docs/source/module_reference.rst Mon Mar 28 11:56:10 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,11 +0,0 @@ - -Module Reference -================ - -The module documentation of the chipsequtil python package is here. - -.. toctree:: - - module_src/chipsequtil - module_src/nib - module_src/seq
--- a/chipsequtil-master/docs/source/module_src/chipsequtil.rst Mon Mar 28 11:56:10 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,27 +0,0 @@ - -chipsequtil -=========== - -Contents --------- - -.. toctree:: - - file_wrappers - org_settings - - -.. automodule:: chipsequtil - :members: - :undoc-members: - -Miscellaneous Functions ------------------------ - -.. autofunction:: get_file_parts -.. autofunction:: parse_number -.. autofunction:: gerald_to_bed -.. autofunction:: reverse_complement -.. autofunction:: get_gc_content -.. autofunction:: get_gc_content_distribution -.. autofunction:: get_size_distribution
--- a/chipsequtil-master/docs/source/module_src/file_wrappers.rst Mon Mar 28 11:56:10 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,28 +0,0 @@ - -File Wrappers -============= - -.. module:: chipsequtil - -.. autoclass:: SmartFileIter - :members: - -SmartFileIter-based classes ---------------------------- - -.. autoclass:: BEDFile -.. autoclass:: GPSFile -.. autoclass:: MACSFile -.. autoclass:: KnownGeneFile - -Other wrappers --------------- - -Not all of the file wrappers in this package have been converted to SmartFileIters -yet, these work but are less robust. - -.. autoclass:: AffyBiocFile -.. autoclass:: GERALDOutput - :members: -.. autoclass:: RefGeneFile -
--- a/chipsequtil-master/docs/source/module_src/motiftools.rst Mon Mar 28 11:56:10 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,56 +0,0 @@ - -Motif Classes and Functions -=========================== - -This module is essentially a copy of TAMO.MotifTools, moved into chipsequtil -for strategic sheep purposes. - -.. automodule:: chipsequtil.motiftools - -The Motif Class ---------------- - -.. autoclass:: Motif - :members: - -Functions ---------- - -.. .. autofunction:: revcomplement -.. autofunction:: Motif_from_ll -.. autofunction:: Motif_from_counts -.. autofunction:: Motif_from_text -.. autofunction:: copy -.. .. autofunction:: minwindowdiff -.. .. autofunction:: minaligndiff -.. autofunction:: diff -.. autofunction:: maskdiff -.. autofunction:: infomaskdiff -.. autofunction:: diverge -.. autofunction:: bestseqs -.. autofunction:: seqs2fasta -.. autofunction:: top_nmers -.. autofunction:: m_matches -.. autofunction:: compare_seqs -.. autofunction:: shuffle_bases -.. autofunction:: random_diff_avestd -.. autofunction:: random_motif -.. autofunction:: toDict -.. autofunction:: toDictVect -.. autofunction:: submotif -.. autofunction:: shuffledP -.. autofunction:: revcompmotif -.. autofunction:: sum -.. autofunction:: giflogo -.. autofunction:: seqlogo -.. autofunction:: merge -.. autofunction:: avestd -.. autofunction:: load -.. autofunction:: save_motifs -.. autofunction:: print_motif -.. autofunction:: print_motifs -.. autofunction:: nlog10 -.. autofunction:: txt2motifs -.. autofunction:: pickletxt2motifs -.. autofunction:: sortby -.. .. autoclass:: MotifToolsException
--- a/chipsequtil-master/docs/source/module_src/nib.rst Mon Mar 28 11:56:10 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,36 +0,0 @@ - -.. module:: chipsequtil.nib - -nibFrag API -=========== - -These functions and classes are a native python implementation of Jim Kent's nibFrag -utility and file format. The scripts and classes read *.nib* files and can -extract sequences from them as fast or faster than the standalone tools, and -also make sequence data accessible and efficient from within python scripts. -There is no provided utility to create *.nib* files, the original source scripts -must be used and are not provided in this distribution. They might be found on -`Jim Kent's homepage <http://users.soe.ucsc.edu/~kent/>`_. - - -The NibDB Class ---------------- - -.. autoclass:: NibDB - :members: - -Functions ---------- - -Most of these functions should not be used directly, rather they are called -by the NibDB class and implement the gritty details of reading *.nib* files. -Use the NibDB class instead unless you know what you're doing. - - -.. autofunction:: get_nib -.. autofunction:: get_nib_batch -.. autofunction:: get_nib_seq -.. autofunction:: get_nib_header -.. autofunction:: get_nib_header_batch -.. autofunction:: validate_nib_file -.. autofunction:: get_nib_seq_batch
--- a/chipsequtil-master/docs/source/module_src/org_settings.rst Mon Mar 28 11:56:10 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,91 +0,0 @@ - -The `org_settings` System -========================= - -Many scripts in this package require a number of different source files that all -correspond to a single reference genome (*e.g.* mm9). The `org_settings` set of -functions and *org_settings.py* script consolidates sets of paths/variables that -correspond to different references to be bundled together in a customizable, -accessible way. The bundles are configured as a package-wide settings on install -and alternatively by a user-specific configuration file. The format of the file -follows the conventions in `configparser`_. - -.. _configparser: http://docs.python.org/library/configparser.html - -Reference genomes are specified in a configuration file as follows:: - - [mm9] - description=UCSC mm9 (July '07 build) with full TRANSFAC hypothesis set - genome=mm9 - genome_dir=/nfs/genomes/mouse_gp_jul_07 - genome_size=2107000000 - ucsc_chrom_sizes=%(genome_dir)s/%(genome)s.chrom.sizes - annotation_path=%(genome_dir)s/anno/refFlat-%(genome)s.txt - refgene_anno_path=%(genome_dir)s/anno/refFlat-%(genome)s.txt - known_gene_anno_path=%(genome_dir)s/anno/knownGene-%(genome)s.txt - known_gene_xref_path=%(genome_dir)s/anno/kgXref-%(genome)s.txt - affy_to_known_path=%(genome_dir)s/anno/knownToMOE43-%(genome)s.txt - theme_hypotheses=/nfs/vendata/cwng/TRANSFAC/2010_transfac_vert_all_filtic9.tamo - theme_markov=/nfs/data/cwng/chipseq/hypotheses/Mouse.markov - -This will make **mm9** available as an organism reference to the `org_settings` -functions. The *ucsc_chrom_sizes*, *annotation_path*, *refgene_anno_path*, -*known_gene_anno_path*, *known_gene_xref_path*, and *affy_to_known_path* are -files downloaded from http://hgdownload.cse.ucsc.edu/downloads.html organims -annotation databases. 
The fields in the above example are all required for the -package to work properly - however, any additional variables may be added as -desired. - -API Functions -------------- - -.. module:: chipsequtil - -.. autofunction:: get_org_settings -.. autofunction:: get_all_settings -.. autofunction:: get_global_settings -.. autofunction:: get_local_settings -.. autofunction:: check_org_settings - -The *org_settings.py* script ----------------------------- - -The script *org_settings.py* is a command line interface into the `org_settings` -system. It has the following usage:: - - $> org_settings.py -h - Usage: org_settings.py [options] [<org key> [<org setting>]] - - Tool for retrieving sets of organism-specific settings and paths. Original - paths are set at install time, and can be overridden in the file ~/.org - settings.cfg. Allows output of settings in a variety of shell environment - syntaxes. The tool attempts to guess which shell environment is being used by - examining the SHELL environment variable unless explicitly set. When run - without an argument, returns a listing of all settings available. - - Options: - -h, --help show this help message and exit - -s SYNTAX, --syntax=SYNTAX - syntax flavor of output to produce - [default: %auto] - -l, --list print all available settings for - human consumption - $> org_settings.py -s bash mm9 genome_dir - /nfs/genomes/mouse_gp_jul_07 - $> - -If you use bash as your shell, you can use shell expansion to conveniently build -commands such as the following:: - - $> map_peaks_to_known_genes.py $(org_settings.py mm9 known_gene_anno_path) \ - $(org_settings.py mm9 known_gene_xref_path) macs_peaks.xls - -Installing ----------- - -The file *org_settings.cfg* exists in the root directory of the source distribution. -This file should be modified and then copied into the *src/chipsequtil/* directory -before installation for org settings that should be available on the system as a -whole. 
Alternatively, users may create the file *.org_settings.cfg* in their home -directories and add sections like the one above so they may customize their own -sets of variables.
--- a/chipsequtil-master/docs/source/module_src/seq.rst Mon Mar 28 11:56:10 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,34 +0,0 @@ - -.. module:: chipsequtil.seq - -Sequence data functions and classes -=================================== - -This module has simple methods for reading in FASTA and FASTQ formatted files. -*fasta_itr* and *fastq_itr* should be used when it is unnecessary or undesired -to have all sequences loaded into memory. *FASTAFile* and *FASTQFile* classes -store all sequence information in memory, but allow efficient dictionary-style -random access to sequences and quality scores as well as repeated whole-file -iteration. - -Functions ---------- - -.. autofunction:: fasta_itr -.. autofunction:: fasta_to_dict -.. autofunction:: write_fasta_to_file - -.. autofunction:: fastq_itr -.. autofunction:: fastq_to_dict -.. autofunction:: write_fastq_to_file - -Classes -------- - -.. autoclass:: FASTAFile - :members: - -.. autoclass:: FASTQFile - :members: - -
--- a/chipsequtil-master/docs/source/module_src/util.rst Mon Mar 28 11:56:10 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,4 +0,0 @@ - -Utility functions and classes -============================= -
--- a/chipsequtil-master/docs/source/quick_start.rst Mon Mar 28 11:56:10 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,5 +0,0 @@ - -Quick Start Documentation -========================= - -
--- a/chipsequtil-master/docs/source/script_reference.rst Mon Mar 28 11:56:10 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,892 +0,0 @@ -Illumina pipeline script reference -================================== - -The following is the output of the scripts provided by this package when invoked -on the command line with *-h*. - -.. _top: - -Scripts: - - :ref:`chipseq_pipeline.py <chipseq_pipeline.py>` - - :ref:`create_pipeline_script.py <create_pipeline_script.py>` - - :ref:`extract_promoters.py <extract_promoters.py>` - - :ref:`filter_bed_by_position_count.py <filter_bed_by_position_count.py>` - - :ref:`filter_macs_peaks.py <filter_macs_peaks.py>` - - :ref:`filter_gps_peaks.py <filter_gps_peaks.py>` - - :ref:`filter_mapped_known_genes.py <filter_mapped_known_genes.py>` - - :ref:`gerald_stats.py <gerald_stats.py>` - - :ref:`gerald_to_bed.py <gerald_to_bed.py>` - - :ref:`join_mapped_known_genes.py <join_mapped_known_genes.py>` - - :ref:`map_intervals.py <map_intervals.py>` - - :ref:`map_peaks_to_genes.py <map_peaks_to_genes.py>` - - :ref:`map_peaks_to_known_genes.py <map_peaks_to_known_genes.py>` - - :ref:`motif_scan.py <motif_scan.py>` - - :ref:`nibFrag.py <nibFrag.py>` - - :ref:`org_settings.py <org_settings.py>` - - :ref:`peaks_to_fasta.py <peaks_to_fasta.py>` - - :ref:`plot_pos_vs_neg_peaks.py <plot_pos_vs_neg_peaks.py>` - - :ref:`plot_peak_loc_dist.py <plot_peak_loc_dist.py>` - - :ref:`rejection_sample_fasta.py <rejection_sample_fasta.py>` - - :ref:`sort_bed.py <sort_bed.py>` - - :ref:`wait_for_jobid.py <wait_for_jobid.py>` - - :ref:`wqsub.py <wqsub.py>` - - :ref:`wqsub_drmaa.py <wqsub_drmaa.py>` - - -.. 
_chipseq_pipeline.py: - -chipseq_pipeline.py:: - - Usage: chipseq_pipeline.py [options] <organism> <experiment alignment filename> [<control alignment filename>] - - 1st generation ChIPSeq analysis pipeline: - - - runs MACS to find peaks and sorts peaks by p-value - - sorts peaks by pvalue and isolates top *n* - - maps peaks to genes - - extracts fasta files for gene peaks in experiments - - constructs background sequences matching foreground distribution - - runs THEME.py on input sequences w/ refinement - - builds an infosite with stats from this analysis - - Control input file is optional. *organism* argument is passed to the - *org_settings.py* command to specify organism specific parameters, ensure - that the following commands return valid paths: - - If running MACS: - - org_settings.py <organism> genome_size - - org_settings.py <organism> genome_dir - - org_settings.py <organsim> refgene_anno_path - - If running THEME: - - org_settings.py <organism> theme_hypotheses - - org_settings.py <organism> theme_markov - - - - Options: - -h, --help show this help message and exit - --auto run all steps non-interactively (for batch mode, e.g.) 
- --steplist=STEPLIST with --auto, run specific steps - --exp-name=EXP_NAME name for the experiment/pipeline, used for convenience - [default: current directory name] - --bed-args=BED_ARGS double quote wrapped arguments for gerald_to_bed.py - [default: --stdout --chromo-strip=.fa] - --macs-exec=MACS_EXEC - the executable to use for MACS, if not an absolute - path it needs to be on your shell environment path - [default: macs14] - --macs-args=MACS_ARGS - double quote wrapped arguments for macs, only changing - --mfold, --tsize, --bw, and --pvalue recommended - [default: --pvalue=1e-5] - --map-args=MAP_ARGS double quote wrapped arguments for mapping peaks to - genes [default: --tss --upstream-window=10000 - --downstream-window=10000] - --filter-peaks-args=FILTER_PEAKS_ARGS - double quote wrapped arguments for - filter_macs_peaks.py [default: --sort-by=pvalue - --top=1000 -f 'tags>20'] - --filter-neg-peaks-args=FILTER_NEG_PEAKS_ARGS - double quote wrapped arguments for - filter_macs_peaks.py applied to negative peaks - [default: -f 'tags>20'] - --peaks-to-fa-args=PEAKS_TO_FA_ARGS - double quote wrapped arguments for peaks_to_fasta.py - [default: --fixed-peak-width=200] - --bg-exec=BG_EXEC the executable to use for generating background - sequences for THEME, if not an absolute path it needs - to be on your shell environment path [default: - rejection_sample_fasta.py] - --bg-args=BG_ARGS double quote wrapped arguments for background sequence - generation utility [default: --num-seq=2.1x] - --theme-args=THEME_ARGS - double quote wrapped arguments for THEME.py [default: - --beta=0.7 --cv=5 --trials=25] - --motif-pval-cutoff=MOTIF_PVAL - the p-value cutoff for sending non-refined enrichmed - motifs to THEME for refinement - --parallelize parallelize portions of the pipeline using qsub, only - works from SGE execution hosts - --ucsc perform tasks for automated integration with UCSC - genome browser [default:False] - --build-infosite-args=INFOSITE_ARGS - arguments to pass 
to build_chipseq_infosite.py - [default: None] - - UCSC Integration Options (with --ucsc): - --stage-dir=STAGE_DIR - root directory where UCSC integration files should be - made available [default: ./] - --stage-url=STAGE_URL - URL where UCSC integration files will be made - available over the web [default: http://localhost/] - - Note: it is advised to leave the --*-args arguments unchanged - unless you really know what you're doing. - - -:ref:`top <top>` - -.. _create_pipeline_script.py: - -create_pipeline_script.py:: - - This is an interactive script that creates an executable script to use - for ChIPSeq analyses. When prompted for experiment and control files, - tab completion is available a la bash or tcsh shells. Press Ctrl-C at - any time to quit. - Usage: create_pipeline_script.py - - Script for creating a custom run script for ChIPSeq/DNAse hypersensitivity - experiments. User is asked for paths and settings required for ChIPSeq - analysis using the *chipseq_pipeline.py* utility and produces an executable - run script with helpful information on how to run it. Also creates a JSON - formatted file containing all the parameters for this pipeline run. - - Options: - -h, --help show this help message and exit - - Note: this script only works in Unix-style environments - - ================= ChIPSeq Experiment Pipeline Script Generator ================= - - -:ref:`top <top>` - -.. _extract_promoters.py: - -extract_promoters.py:: - - Usage: extract_promoters.py [options] <organism> - - Extract the promoter sequences in FASTA format from all genes - or a list of genes specified in an input file. Gene annotation is RefGene - corresponding to the organism passed in, paths returned by: - - $> org_settings.py <organism> refgene_anno_path - $> org_settings.py <organism> genome_dir - - must be valid. 
- - Options: - -h, --help show this help message and exit - -u UPSTREAM, --upstream=UPSTREAM - upstream window from TSS to extract [default: 3000] - -d DOWNSTREAM, --downstream=DOWNSTREAM - downstream window from TSS to extract [default: 1000] - -l GENE_LIST, --gene-list=GENE_LIST - file containing a list of gene identifiers to extract, - one per line [default: none] - -t GENE_TYPE, --gene-type=GENE_TYPE - type of gene identifier in gene list, choose from - ['symbol', 'refgene'] [default: symbol] - -o OUTPUT, --output=OUTPUT - file to write fasta records to [default: stdout] - - -:ref:`top <top>` - -.. _filter_bed_by_position_count.py: - -filter_bed_by_position_count.py:: - - Usage: filter_bed_by_position_count.py [options] <bed file> - - Analyze BED file and filter out alignments above some threshold that align to - a single genomic position. - - Options: - -h, --help show this help message and exit - -n MAX_COUNT, --max-count=MAX_COUNT - max tag count at a given position, filter above - [default: 5] - --output=OUTPUT write output to file - - Note: only works if BED file is sorted! - - -:ref:`top <top>` - -.. _filter_macs_peaks.py: - -filter_macs_peaks.py:: - - Usage: filter_macs_peaks.py [options] <MACS peak file> - - Filter MACS peaks by supplied criteria. Available filter features are: - - length - tags - pvalue - fold_enrichment - fdr - - Filters are provided as expressions using the [-f |--filter] option, e.g. the - command - - filter_macs_peaks.py -f "tags>100" --filter="pvalue<=1e-9" - --filter="100<length<=200" <MACS peak file> - - finds only peaks with more than 100 tags, a pvalue of less than 1e9, and a - length between 100, exclusive, and 200, inclusive. Any number of filters may - be provided, and only peaks that match *all* filters pass. User is warned if - filters result in zero results. Only inequality operators are valid. - Invoking with no filter arguments returns all peaks. To sort, use the --sort- - by option, e.g. 
- - filter_macs_peaks.py -f "pvalue<=1e-9" --sort-by=pvalue <MACS peak file> - - sorts peaks with a pvalue smaller than 1e-9 with the smallest pvalue peaks. - All fields are sorted ascending by default. Output is prepended with comments - describing what the file contains, i.e. which filters are applied, how many - records there are, etc. - - Note: MACS -10*log10(pvalue) values are converted to normal pvalues - - - Options: - -h, --help show this help message and exit - -f FILTERS, --filter=FILTERS - add filter expression - --sort-by=SORT_BY comma delimited list of features to sort by, filtered - peaks are not sorted by default, if provided peaks are - sorted ascending by default - --sort-dir=SORT_DIR direction to sort [default: ASCEND] - --top=TOP accepts an integer, output at most this many peaks - [default: all] - --output=OUTPUT filename to output filtered peaks to [default: stdout] - --encode-filters write out records to a file <MACS peaks - file>_<filters>.xls (incompatible with --output - option) - --summary only print out summary information for the filter - --no-header do not print out header or metadata info - --shuffle shuffle order of filtered records, useful for - selecting random peaks - --print-encoded-fn print out the filename that would be created by - --encode-filters - - -:ref:`top <top>` - -.. _filter_gps_peaks.py: - -filter_gps_peaks.py:: - - Usage: filter_gps_peaks.py [options] <GPS peak file> - - Filter GPS peaks by supplied criteria. Available filter features are: - - IP - Control - Fold - qvalue - pvalue - IPvsEMP - IPvsCTR - - Filters are provided as expressions using the [-f |--filter] option, e.g. the - command - - filter_gps_peaks.py -f "IP>100" --filter="pvalue<=1e-9" <GPS peak file> - - finds only peaks with more than 100 tags and a pvalue of less than 1e9. Any - number of filters may be provided, and only peaks that match *all* filters - pass. User is warned if filters result in zero results. Only inequality - operators are valid. 
Invoking with no filter arguments returns all peaks. To - sort, use the --sort-by option, e.g. - - filter_gps_peaks.py -f "pvalue<=1e-9" --sort-by=pvalue <GPS peak file> - - sorts peaks with a pvalue smaller than 1e-9 with the smallest pvalue peaks. - All fields are sorted ascending by default. Output is prepended with comments - describing what the file contains, i.e. which filters are applied, how many - records there are, etc. - - Note: GPS P_-log10 and Q_-log10 values are converted to normal pvalues and - qvalues - - - Options: - -h, --help show this help message and exit - -f FILTERS, --filter=FILTERS - add filter expression - --sort-by=SORT_BY comma delimited list of features to sort by, filtered - peaks are not sorted by default, if provided peaks are - sorted ascending by default - --sort-dir=SORT_DIR direction to sort [default: ASCEND] - --top=TOP accepts an integer, output at most this many peaks - [default: all] - --output=OUTPUT filename to output filtered peaks to [default: stdout] - --encode-filters write out records to a file <GPS peaks - file>_<filters>.xls (incompatible with --output - option) - --summary only print out summary information for the filter - --no-header do not print out header or metadata info - --shuffle shuffle order of filtered records, useful for - selecting random peaks - --print-encoded-fn print out the filename that would be created by - --encode-filters - - -:ref:`top <top>` - -.. _filter_mapped_known_genes.py: - -filter_mapped_known_genes.py:: - - Usage: filter_mapped_known_genes.py [options] <mapped known genes file> - - Filter columns and rows from *join_mapped_known_genes.py* output which was - invoked with *--binary-plus* and *--field-types* flags. Specify full column - names for either binding or expression data with the *--bind-cols* and - *--affy-cols* arguments, respectively. The special fieldname *MAPPED* from - *join_mapped_known_genes.py* is used to determine whether a file contains a - mapping for each gene. 
To filter genes by their associated binding or - expression data, specify *--bind-filter* or *--affy-filter* as follows: - - - *any* - report gene if at least one input file maps to the gene - - *all* - report gene if every input file maps to the gene - - *absent* - report gene if no input file maps to the gene - - *none* - do not filter genes at all (default) - - Results of binding and expression filters are 'and'ed together, e.g.: - - --bind-filter=all --affy-filter=absent - - returns only genes for which all binding files and none of the expression - files map. - - - Options: - -h, --help show this help message and exit - --bind-cols=BIND_COLS - comma delimited list of binding data column names to - include, [default: all] - --affy-cols=AFFY_COLS - comma delimited list of expression data column names - to include, [default: all] - --bind-filter=BIND_FILT - gene set to include based on binding data [default: - none] - --affy-filter=AFFY_FILT - gene set to include based on expression data [default: - none] - --output=OUTPUT write output to file - - Note: when specifying column names, be sure to escape characters like - (,),&,*,etc... that shells interpret with a \, e.g. --bind- - cols=-10\*log10\(pvalue\) - - -:ref:`top <top>` - -.. _gerald_stats.py: - -gerald_stats.py:: - - Usage: gerald_stats.py [options] <filename> [<filename>...] - - Outputs various stats about the GERALD formatted file(s) input. If multiple - files are provided statistics are aggregated according to the specified output - format. Output formats available via --format=X : - - # *python* - print an eval()'able python dictionary w/ counts - # *rst* - print statistics in a reStructured text table (default) - # *tab* - print statistics in a tab delimited form w/ header names - - Except for *python* format, each input file has its own output line. *python* - summarizes all alignments. 
- - - Options: - -h, --help show this help message and exit - --output=OUTPUT write output to file [default: stdout] - --format=FORMAT format to print out stats [default: rst] - - -:ref:`top <top>` - -.. _gerald_to_bed.py: - -gerald_to_bed.py:: - - Usage: gerald_to_bed.py [options] <GERALD file> [<GERALD file>...] - - Convert the GERALD alignment formatted files into BED format. Input file - named <path>/<filename>.<ext> is translated into <path>/<filename>.bed unless - --output or --stdout is specified, in which case formatted lines are written - to file or standard output, respectively. If multiple input files are - supplied with the --output or --stdout option all formatted lines are - concatenated together. Formatting only occurs for GERALD input lines that have - a valid Match Position field (i.e. successfully aligned somewhere). - - Options: - -h, --help show this help message and exit - --output=OUTPUT write all records to file - --stdout write out all formatted lines to stdout - --min-fields only format the first three fields - --pass-only only format lines with Y in the Pass Filtering field - --chromo-strip=CHROMO_STRIP - pattern to remove from chromo field in BED output - (e.g. --chromo-strip=.fa to remve .fa from chrX.fa) - [default: .fa] - - -:ref:`top <top>` - -.. _join_mapped_known_genes.py: - -join_mapped_known_genes.py:: - - Usage: join_mapped_known_genes.py -b <mapped DNA binding file>|-a <mapped microarray file> [-b <mapped DNA binding file> ...] [-a <mapped microarray file> ...] - - Join all files on the first column, concatenating records with matching - entries onto one line per entry. Understands DNA binding data as mapped with - *map_peaks_to_known_genes.py* utility microarray data as mapped by - *probeset_to_known_genes.py* utility, passed to program using *-b* and *-a* - options respectively. If a file contains more than one mapping to a gene - additional columns are added. At least one file of either type is required. 
- Field names are written as <filename>.<original field name>.<map number> - - Options: - -h, --help show this help message and exit - -a AFFY_FILE, --affy-file=AFFY_FILE - add a mapped microarray file - -b BIND_FILE, --bind-file=BIND_FILE - add a mapped DNA binding file (e.g. MACS, BED) - -m MACS_FILE, --macs-file=MACS_FILE - DEPRECATED: use -b instead, add a mapped default MACS - formatted peaks (*.xls) file - --output=OUTPUT file to output joined records to [default: stdout] - --first-only only output the first mapping to a gene from each file - --binary output only one column per file with a 0 or 1 to - indicate whether a mapping exists in that file - --binary-plus output one column per file with a 0 or 1 to indicate - whether a mapping exists in that file in addition to - all other columns - --field-types prepend BIND or AFFY to the beginning of all - appropriate columns - - Note: microarray files should have been created by bioconductor, and all files - should have a row of fieldnames as the first line - - -:ref:`top <top>` - -.. _map_intervals.py: - -map_intervals.py:: - - Usage: map_intervals.py [options] <from> <to> - - Find records in <to> interval file that map to records in <from> interval - file. Files should be tab delimited and are expected to have a chromosome - column, a start column, and an end column. The indices of these columns can - be specified on the command line but by default are the first three columns, - respectively. Prints out to stdout by default one new line separated row per - row in <from> with a line from <to> where there is a mapping. If no mapping is - found (e.g. when specifying a maximum margin to search within) the word None - is printed. By default only prints nearest record, with ties settled by - smallest line number in <to>. 
- - Options: - -h, --help show this help message and exit - -w WINDOW, --window=WINDOW - window as <int upstream> <int downstream> to search - for intervals [default: (1000000000.0, 1000000000.0)] - -f FROM_IND, --from=FROM_IND - coordinates of chromosome, start, stop in <from> file - -i, --skip-from-header - <from> has a header that should be skipped - -t TO_IND, --to=TO_IND - coordinates of chromosome, start, stop in <to> file - -j, --skip-to-header <to> has a header that should be skipped - - -:ref:`top <top>` - -.. _map_peaks_to_genes.py: - -map_peaks_to_genes.py:: - - Usage: map_peaks_to_genes.py [options] <refGene file> <peaks file> - - Map the peaks in <peaks file> to genes in <refGene file>. <refGene file> is - format is as specified in - http://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/refGene.sql. <peaks - file> format is as produced by MACS. - - Options: - -h, --help show this help message and exit - --upstream-window=UPST_WIN - window width in base pairs to consider promoter region - [default: 5500] - --downstream-window=DNST_WIN - window width in base pairs to consider downstream - region [default: 2500] - --map-output=PEAK_OUTPUT - filename to output mapped peaks in BED format to - [default: stdout] - --stats-output=STATS_OUTPUT - filename to output summary stats in conversion - [default: stderr] - --peaks-format=PEAKS_FMT - format of peaks input file [default: MACS] - - -:ref:`top <top>` - -.. _map_peaks_to_known_genes.py: - -map_peaks_to_known_genes.py:: - - Usage: map_peaks_to_known_genes.py [options] <knownGene file> <knownGene xRef file> <peaks file> - - - Map the peaks in <peaks file> to genes in <knownGene file>. <knownGene file> - isformat is as specified in - http://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/knownGene.sql.<peaks - file> format is as produced by MACS. If *auto* is chosen (default) file - extension is examined for *.xls* for default MACS format or *.bed* for BED - format. 
If the --detailoption is provided, the following extra fields are - appended to each row: - - peak loc, dist from feature, score, map type, map subtype - - - Options: - -h, --help show this help message and exit - --upstream-window=UPST_WIN - window width in base pairs to consider promoter region - [default: 5500] - --downstream-window=DNST_WIN - window width in base pairs to consider downstream - region [default: 2500] - --tss calculate downstream window from transcription start - site instead of transcription end site - --map-output=PEAK_OUTPUT - filename to output mapped peaks to [default: stdout] - --stats-output=STATS_OUTPUT - filename to output summary stats in conversion - [default: stderr] - --peaks-format=PEAKS_FMT - format of peaks input file [default: auto] - --detail add extra fields to output, see description - --intergenic write intergenic peaks to the gene file as well with - None as gene ID - - -:ref:`top <top>` - -.. _motif_scan.py: - -motif_scan.py:: - - Usage: motif_scan.py [options] <org> <peaks fn> <TAMO motif fn> - - Do some motif scanning stuffs - - Options: - -h, --help show this help message and exit - -n TOP_N, --top-n=TOP_N - use top n peaks by pvalue for sequence scanning - [default: all] - -i MOTIF_IND, --motif-indices=MOTIF_IND - which indices from <TAMO motif fn> to use [default: - all] - -d DIR, --dir=DIR write all results into this directory - --fixed-peak-width=FIXED_W - use only a fixed peak window around the summit instead - of whole peak - - -:ref:`top <top>` - -.. _nibFrag.py: - -nibFrag.py:: - - Usage: nibFrag.py [options] file.nib start end strand [outfile] - -- or -- - nibFrag.py [options] --batch file.nib batchfile [batchfile ...] - - A python implementation of Jim Kent's nibFrag utility that allows outputting - to stdout. Otherwise the functionality is identical for the non-batch usage. - Batch mode accepts one or more files containing sets of coordinates to extract - from the nib file. 
Only BED formatting is accepted at the moment. All - sequences are concatenated together in FASTA format. To retrieve the entire - sequence, use END as the end argument. - - Options: - -h, --help show this help message and exit - --no-header only output sequence (no fasta header) - --wrap-width=WRAP_WIDTH - wrap output sequence at this number of bases, 0 - indicates no wrap (sequence ends up on single line) - [default: 50] - --batch run in batch mode, interpret arguments after nib file - as queries - --batch-format=BATCH_FORMAT - format to interpret batch files [default: BED] - - Original nibFrag options: - --masked use lower case characters for bases meant to be masked - out - --hardMasked use upper case for non masked-out and 'N' characters - for masked-out bases - --upper use upper case characters for all bases - --name=NAME Use given name after '>' in output sequence - --dbHeader=DBHEADER - Add full database info to the header, with or without - -name option - --tbaHeader=TBAHEADER - Format header for compatibility with tba, takes - database name as argument - - Note: When specifying --name optionin batch mode, also specify --dbHeader to - ensure unique FASTA headers. - - -:ref:`top <top>` - -.. _org_settings.py: - -org_settings.py:: - - Usage: org_settings.py [options] [<org key> [<org setting>]] - - Tool for retrieving sets of organism-specific settings and paths. Original - paths are set at install time, and can be overridden in the file ~/.org - settings.cfg. Allows output of settings in a variety of shell environment - syntaxes. The tool attempts to guess which shell environment is being used by - examining the SHELL environment variable unless explicitly set. When run - without an argument, returns a listing of all settings available. 
- - Options: - -h, --help show this help message and exit - -s SYNTAX, --syntax=SYNTAX - syntax flavor of output to produce - [default: %auto] - -l, --list print all available settings for - human consumption - - -:ref:`top <top>` - -.. _peaks_to_fasta.py: - -peaks_to_fasta.py:: - - Usage: peaks_to_fasta.py [options] <organism> <peak file> [<peak file> ...] - - Extract sequences for peaks in provided peak file(s). Can interpret MACS or - BED output, determined automatically by .xls or .bed extensions respectively - (force explicit format with --peak-format option). Outputs fasta sequences - for the peaks in all files extracted from the reference genome specified by - the output of *org_settings.py <organism> genome_dir* to stdout by - default.Chromosome names in peak files must match nib filenames without - extension (e.g. peak line: chr1 0 100 searches *genome_dir*/chr1.nib). Fasta - records have the following format: - - ><chromosome>:<start>-<end>;fn=<name of file>:<line number>;db_fn=<db - filename>;fmt=<format>;<source alignment info> - <sequence...> - - <db filename> is the filename where the sequence was extracted, <format> is - the format of the input file (MACS or BED), and <source alignment info> - contains all the fields from the originating alignment according to the source - format. - - Options: - -h, --help show this help message and exit - --min-header only store <chromosome>:<start>-<end> in header - --peak-format=PEAK_FORMAT - peak file format, 'auto' determines format by - extension, choices: MACS, BED, auto [default: auto] - --output=OUTPUT filename to output fasta records to [default: stdout] - --fixed-peak-width=FIXED_PEAK_WIDTH - return a fixed number of bases flanking peak summit - (*summit* field in MACS, (end-start)/2 in BED), - ignoring start/stop coords [default: None] - --wrap-width=WRAP_WIDTH - wrap fasta sequences to specified width. -1 indicates - no wrap [default: 70] - - -:ref:`top <top>` - -.. 
_plot_pos_vs_neg_peaks.py: - -plot_pos_vs_neg_peaks.py:: - - Usage: plot_pos_vs_neg_peaks.py [options] <pos peaks fn> <neg peaks fn> - - Options: - -h, --help show this help message and exit - -o OUT_FN, --output=OUT_FN - filename of output image - - -:ref:`top <top>` - -.. _plot_peak_loc_dist.py: - -plot_peak_loc_dist.py:: - - Usage: plot_peak_loc_dist.py [options] <peaks fn> <gene list fn> - - Produce a pie chart of the locations of peaks in different bins (promoter, - gene, exon, intron, etc.) and, optionally, save the different records to their - own files for subsequent analysis. Also produce a histogram of distance from - feature values in mapping file. Peaks file is expected to be as output by - MACS, or alternately as a BED file but then the -b plot is not available. - Gene list file is expected to be in the format as output by - peaks_to_known_genes.py script. - - Options: - -h, --help show this help message and exit - -b BAR_FN, --bar-fn=BAR_FN - filename for pvalue stacked bar chart - -g GENE_PIE_FN, --gene-pie-fn=GENE_PIE_FN - filename for pie chart image - -p PEAK_PIE_FN, --peak-pie-fn=PEAK_PIE_FN - filename for pie chart image - -f DIST_FN, --dist-fn=DIST_FN - filename for distance from feature image - -s, --save write out files containing peaks for each category - -d OUT_DIR, --output-dir=OUT_DIR - output files created by --save option to this - directory - --no-plot dont show (but save) the figure produced - --peaks-format=PEAK_FMT - format of peaks file, either MACS or BED [default: - MACS] - - -:ref:`top <top>` - -.. _rejection_sample_fasta.py: - -rejection_sample_fasta.py:: - - Usage: rejection_sample_fasta.py [options] <organism> <fasta file> [<fasta file> ... ] - - Use rejection sampling to generate a set of background/random - sequences matching the distance to nearest transcription start site, sequence - length, and GC content distributions of the input fasta file(s). 
Generated - sequences are genomic sequences sampled based on these distributions. All - sequences - from all files are used to generate the background sequences. The following - command must output a path to a nib genomic sequence directory and refGene - annotation, respectively : - - $> org_settings.py <organism> genome_dir - $> org_settings.py <organism> refgene_anno_path - - Utility prints out generated fasta records to stdout by default. Input - sequences - from chr20 are mapped to chrX, chr21 are mapped to chrY, and sequences from - chrM - are not used. - - - Options: - -h, --help show this help message and exit - -n NUM_SEQS, --num-seqs=NUM_SEQS - number of sequences to generate, either absolute - number or factor of # input sequences, e.g. 2.5x for - 2.5 times the # of input sequences [default: 1x] - --output=OUTPUT file to output fasta records to [default: stdout] - --bed also produce a BED formatted file representing sampled - sequences - --bed-output=BED_OUTPUT - with --bed, file to output BED records to [default: - output.bed] - -v, --verbose print out debug information - - -:ref:`top <top>` - -.. _sort_bed.py: - -sort_bed.py:: - - Usage: sort_bed.py [options] <BED file> [<BED file> <BED file>...] - - Sort the BED formatted files first by chromosome (field 1) and then by start - coordinate (field 2). Lines from all files submitted are concatenated and - sorted in the final output. - - Options: - -h, --help show this help message and exit - --output=OUTPUT filename to write the sorted BED lines [default: stdout] - - -:ref:`top <top>` - -.. _wait_for_jobid.py: - -wait_for_jobid.py:: - - Usage: wait_for_jobid.py [options] <job id> [<job id>...] - - Poll qstat and wait until all <job id>s are finished - - Options: - -h, --help show this help message and exit - - -:ref:`top <top>` - -.. _wqsub.py: - -wqsub.py:: - - Usage: [wqsub.py] [options] command - - Wrap the specified command into a qsub script and submit it for execution. 
- Script captures both stdout and stderr to the current directory. By default, - all of the user's environment variables are put into the script (compatible - with SGE only ATM). - - Options: - -h, --help show this help message and exit - --wqsub-name=WQSUB_NAME - job name to submit as <--wqsub-name>_<first non- - whitespace chars in command> [default: wqsub] - --wqsub-ext=WQSUB_EXT - file extension to use for stdout files - --wqsub-keep-script do not delete qsub script generated after job - submission - --wqsub-no-env do not include any local environment variables in the - script - --wqsub-no-submit create script but do not submit job (useful for - generating scripts) - --wqsub-drm=DRM the DRM to generate scripts for [default: SGE] - --wqsub-drm-arg=DRM_ARGS - arguments to pass as parameters in the job script - specific to the DRM, use multiple option flags to - specify multiple parameters - --wqsub-wait poll the DRM and do not return control until job is - finished (only works for TORQUE) - - Note: this script only works in Unix-style environments. - - -:ref:`top <top>` - -.. _wqsub_drmaa.py: - -wqsub_drmaa.py:: - - Traceback (most recent call last): - File "../scripts/wqsub_drmaa.py", line 9, in <module> - import drmaa - ImportError: No module named drmaa - - -:ref:`top <top>` - - -
--- a/chipsequtil-master/examples/mapping/map_to_known_gene.sh Mon Mar 28 11:56:10 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,44 +0,0 @@ -#!/bin/bash - -# Usage: map_peaks_to_known_genes.py [options] <knownGene file> <knownGene xRef file> <peaks file> -# -# -# Map the peaks in <peaks file> to genes in <knownGene file>. <knownGene file> -# is -# format is as specified in -# http://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/knownGene.sql. -# <peaks file> format is as produced by MACS. If *auto* is chosen (default) -# file extension is examined for *.xls* for default MACS format or *.bed* for -# BED format. If the --detailoption is provided, the following extra fields are -# appended to each row: -# -# peak loc, dist from feature, score, map type, map subtype -# -# -# Options: -# -h, --help show this help message and exit -# --upstream-window=UPST_WIN -# window width in base pairs to consider promoter region -# [default: 5500] -# --downstream-window=DNST_WIN -# window width in base pairs to consider downstream -# region [default: 2500] -# --tss calculate downstream window from transcription start -# site instead of transcription end site -# --map-output=PEAK_OUTPUT -# filename to output mapped peaks to [default: stdout] -# --stats-output=STATS_OUTPUT -# filename to output summary stats in conversion -# [default: stderr] -b# --peaks-format=PEAKS_FMT -# format of peaks input file [default: auto] -# --detail add extra fields to output, see description - -ORG=mm9 -KG_FN=$(org_settings.py $ORG known_gene_anno_path) -XREF_FN=$(org_settings.py $ORG known_gene_xref_path) -OPTS="--detail --tss --upstream-window=10000 --downstream-window=10000" -PEAKS_FN=test_peaks.xls - -echo map_peaks_to_known_genes.py $OPTS $KG_FN $XREF_FN $PEAKS_FN -map_peaks_to_known_genes.py $OPTS $KG_FN $XREF_FN $PEAKS_FN \ No newline at end of file
--- a/chipsequtil-master/examples/mapping/test_peaks.xls Mon Mar 28 11:56:10 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,21 +0,0 @@ -# genes: -# uc007aet.1 chr1 - 3195984 3205713 3195984 3195984 2 3195984,3203519, 3197398,3205713, uc007aet.1 -# uc008wgw.1 chr5 + 3522764 3525260 3522764 3522764 1 3522764, 3525260, uc008wgw.1 -# -# chr5 3522663 3522664 1 0 1 0 0 1 - promoter -# chr5 3522863 3522864 1 0 1 0 0 1 - in gene -# chr5 3532563 3532564 1 0 1 0 0 1 - in downsteam -# chr1 3205814 3205815 1 0 1 0 0 1 - promoter -# chr1 3205614 3205615 1 0 1 0 0 1 - in gene -# chr1 3195913 3195914 1 0 1 0 0 1 - in downstream -# chr1 319588 319588 1 0 1 0 0 1 - unmapped -# -# chr1 is - strand, chr5 + strand, assumes 10k window around TSS -chr start end length summit tags -10*log10(pvalue) fold_enrichment FDR(%) -chr5 3522663 3522664 1 0 1 0 0 1 -chr5 3522863 3522864 1 0 1 0 0 1 -chr5 3532564 3532565 1 0 1 0 0 1 -chr1 3205814 3205815 1 0 1 0 0 1 -chr1 3205614 3205615 1 0 1 0 0 1 -chr1 3195913 3195914 1 0 1 0 0 1 -chr1 319588 319588 1 0 1 0 0 1
--- a/chipsequtil-master/examples/nib/shuffled_peaks.bed Mon Mar 28 11:56:10 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,1000 +0,0 @@ -chr19 29505473 29505892 MACS_peak_4348 103.85 -chr5 23950711 23951266 MACS_peak_6268 83.33 -chr1 75303135 75303785 MACS_peak_206 88.17 -chr3 105611391 105612033 MACS_peak_5420 56.03 -chr4 140654843 140655635 MACS_peak_6105 178.49 -chr2 37590398 37590707 MACS_peak_4677 75.45 -chr1 107761995 107762362 MACS_peak_312 96.07 -chr3 153387629 153388143 MACS_peak_5657 52.58 -chr11 88165911 88166520 MACS_peak_1474 62.73 -chr11 109512132 109512551 MACS_peak_1616 128.82 -chr18 57085271 57085755 MACS_peak_4115 107.73 -chr13 96661232 96661599 MACS_peak_2313 62.56 -chr3 95164133 95164494 MACS_peak_5342 93.42 -chr3 107434353 107434982 MACS_peak_5438 65.35 -chr11 6525702 6526208 MACS_peak_1057 56.89 -chr17 71137869 71138311 MACS_peak_3922 65.19 -chr5 120915880 120916171 MACS_peak_6566 100.90 -chr14 115241544 115242039 MACS_peak_2840 66.36 -chr3 115548096 115548809 MACS_peak_5466 146.81 -chr3 143368788 143369115 MACS_peak_5597 63.16 -chr12 73861752 73862246 MACS_peak_1870 80.18 -chr4 83619188 83619568 MACS_peak_5815 52.20 -chr7 80763465 80763988 MACS_peak_7410 71.38 -chr11 78816343 78817112 MACS_peak_1360 53.58 -chr10 80160393 80161035 MACS_peak_822 294.44 -chr13 32893584 32894176 MACS_peak_2117 81.21 -chr10 78218410 78218726 MACS_peak_790 64.14 -chr11 58907018 58907334 MACS_peak_1205 98.43 -chr3 104162680 104163086 MACS_peak_5410 55.17 -chr6 39156271 39156786 MACS_peak_6854 61.68 -chr18 85020575 85021002 MACS_peak_4215 74.27 -chr6 72166566 72167067 MACS_peak_6931 69.03 -chr17 56748737 56749331 MACS_peak_3884 106.79 -chr2 57090575 57091032 MACS_peak_4713 76.38 -chr6 52662598 52663126 MACS_peak_6888 97.50 -chr5 88982859 88983700 MACS_peak_6425 295.50 -chr5 134967688 134968192 MACS_peak_6645 72.85 -chr17 29089160 29089657 MACS_peak_3724 82.93 -chr8 123062177 123062589 MACS_peak_8088 58.85 -chr11 85534180 85534673 MACS_peak_1423 87.33 
-chr15 66990142 66990609 MACS_peak_3114 118.53 -chr8 106966580 106967082 MACS_peak_7997 113.60 -chr11 106888391 106889001 MACS_peak_1583 69.90 -chr19 11848049 11848520 MACS_peak_4306 51.63 -chr15 8584865 8585230 MACS_peak_2922 62.73 -chr17 87913100 87913467 MACS_peak_3983 114.07 -chr13 34254496 34254848 MACS_peak_2122 67.47 -chr1 59914119 59914399 MACS_peak_135 57.79 -chr4 140629745 140629986 MACS_peak_6102 81.30 -chr2 180446822 180447260 MACS_peak_5086 99.29 -chr2 29804429 29804860 MACS_peak_4600 65.92 -chr12 32992278 32992842 MACS_peak_1783 84.01 -chr14 99698259 99698564 MACS_peak_2803 84.57 -chr19 3832712 3833378 MACS_peak_4224 118.71 -chr15 100536597 100537082 MACS_peak_3300 154.87 -chr7 109390646 109391459 MACS_peak_7527 161.60 -chr7 151692825 151693219 MACS_peak_7719 66.56 -chr14 52639405 52639860 MACS_peak_2557 52.74 -chr1 158257693 158258023 MACS_peak_461 64.88 -chr12 76836098 76836626 MACS_peak_1878 62.73 -chr1 182998458 182998880 MACS_peak_570 52.74 -chr2 51797359 51797797 MACS_peak_4703 65.46 -chr8 96707068 96707513 MACS_peak_7960 104.94 -chr3 28143185 28143670 MACS_peak_5131 101.35 -chr6 88889418 88889830 MACS_peak_7010 52.74 -chr2 131937255 131937594 MACS_peak_4912 72.89 -chr7 25688982 25689460 MACS_peak_7246 62.73 -chr19 46938054 46938331 MACS_peak_4436 92.02 -chr7 138515654 138516191 MACS_peak_7671 84.15 -chr14 29767339 29767710 MACS_peak_2466 51.44 -chr15 86002731 86003183 MACS_peak_3240 72.40 -chr15 103088442 103089223 MACS_peak_3322 883.55 -chr19 33127653 33128234 MACS_peak_4366 116.11 -chr5 135450040 135450529 MACS_peak_6650 101.01 -chr15 51080445 51080929 MACS_peak_3050 62.73 -chr9 124009677 124010094 MACS_peak_8582 65.09 -chr1 107856029 107856432 MACS_peak_313 52.07 -chr10 107555226 107555677 MACS_peak_929 79.40 -chr7 55762430 55762866 MACS_peak_7364 91.92 -chr12 96882121 96882495 MACS_peak_1959 70.03 -chr3 68480776 68481485 MACS_peak_5235 78.55 -chr1 89537056 89537406 MACS_peak_259 53.07 -chr14 27335329 27335792 MACS_peak_2450 52.74 -chr17 
56949680 56949993 MACS_peak_3889 81.91 -chr5 118928141 118928605 MACS_peak_6556 117.21 -chr8 84911554 84912100 MACS_peak_7907 71.83 -chr8 129108351 129108844 MACS_peak_8142 54.79 -chr3 78877870 78878229 MACS_peak_5251 71.22 -chr19 18650375 18650861 MACS_peak_4324 62.73 -chr6 87942729 87943305 MACS_peak_6992 81.55 -chr12 92821124 92821370 MACS_peak_1955 69.53 -chr11 18065187 18065398 MACS_peak_1077 97.88 -chr17 84515588 84516165 MACS_peak_3966 458.83 -chr9 92169110 92169873 MACS_peak_8447 108.56 -chr14 14920422 14920757 MACS_peak_2398 123.05 -chr9 34798448 34798810 MACS_peak_8223 70.98 -chr3 94306130 94306466 MACS_peak_5319 95.95 -chr5 115790919 115791717 MACS_peak_6543 254.81 -chr11 68780920 68781624 MACS_peak_1249 96.66 -chr1 55084208 55084643 MACS_peak_101 56.99 -chr11 115938781 115939242 MACS_peak_1655 106.79 -chr7 134851363 134852112 MACS_peak_7651 388.87 -chr2 25413082 25413751 MACS_peak_4557 108.33 -chr9 70760521 70761198 MACS_peak_8400 125.82 -chr1 132526233 132526605 MACS_peak_367 51.37 -chr12 77462231 77462609 MACS_peak_1880 61.61 -chr2 131322118 131322495 MACS_peak_4905 173.69 -chr12 8886534 8886943 MACS_peak_1732 62.46 -chr1 134921392 134922134 MACS_peak_388 97.78 -chr12 50546587 50546853 MACS_peak_1811 72.51 -chr16 44347497 44348102 MACS_peak_3445 67.73 -chr16 91448123 91448772 MACS_peak_3510 110.35 -chr8 96932624 96932949 MACS_peak_7968 67.46 -chr9 50409776 50410148 MACS_peak_8274 68.39 -chr15 39018860 39019403 MACS_peak_3023 96.24 -chrX 7548382 7548918 MACS_peak_8587 182.65 -chr1 36568547 36568801 MACS_peak_47 57.79 -chr3 133241295 133241605 MACS_peak_5543 56.48 -chr3 36470919 36471238 MACS_peak_5148 54.12 -chr5 137974253 137974619 MACS_peak_6683 59.42 -chr4 107278613 107279232 MACS_peak_5866 117.82 -chr8 3621220 3621676 MACS_peak_7722 76.85 -chr11 68792865 68793384 MACS_peak_1250 61.41 -chr11 107283838 107284259 MACS_peak_1593 62.31 -chr17 36162344 36162790 MACS_peak_3801 77.75 -chr2 119176647 119177021 MACS_peak_4841 59.32 -chr14 75947689 75947989 
MACS_peak_2746 115.64 -chr2 32837666 32838081 MACS_peak_4650 56.37 -chr5 21772275 21772751 MACS_peak_6260 88.64 -chr4 88181586 88181956 MACS_peak_5819 83.97 -chr17 46210576 46211375 MACS_peak_3824 152.68 -chr8 113290700 113290975 MACS_peak_8033 68.02 -chr14 100246709 100247166 MACS_peak_2804 114.56 -chr18 21097256 21097529 MACS_peak_4028 188.09 -chr15 58175270 58175626 MACS_peak_3078 52.59 -chr9 61513942 61514355 MACS_peak_8334 216.69 -chr10 92184761 92185425 MACS_peak_881 113.68 -chr2 125450541 125451011 MACS_peak_4863 84.01 -chr7 120579702 120580147 MACS_peak_7571 84.01 -chr17 28313728 28314505 MACS_peak_3710 147.88 -chr17 85092137 85092578 MACS_peak_3972 60.05 -chr7 52391580 52392059 MACS_peak_7336 71.31 -chr4 106607491 106607860 MACS_peak_5861 62.73 -chr15 76531134 76532498 MACS_peak_3158 205.05 -chr12 86815403 86815709 MACS_peak_1937 55.62 -chr8 97381250 97381634 MACS_peak_7975 67.56 -chr2 18892130 18892531 MACS_peak_4517 53.98 -chr13 93362690 93363352 MACS_peak_2290 156.01 -chr4 134276344 134276744 MACS_peak_6023 66.29 -chr5 136189308 136189833 MACS_peak_6660 92.87 -chr13 54712548 54712992 MACS_peak_2192 70.78 -chr3 95116459 95117202 MACS_peak_5338 276.81 -chr15 55668280 55668565 MACS_peak_3068 57.79 -chr7 86508145 86508581 MACS_peak_7430 65.59 -chr13 64134767 64135424 MACS_peak_2229 84.01 -chr14 75405717 75405947 MACS_peak_2740 56.48 -chr2 34655577 34655906 MACS_peak_4662 86.22 -chr2 178420601 178420979 MACS_peak_5071 60.99 -chr7 80675775 80676079 MACS_peak_7406 57.04 -chr6 120314001 120314656 MACS_peak_7092 155.03 -chr11 103889450 103889863 MACS_peak_1547 105.51 -chr1 75209595 75210147 MACS_peak_201 195.59 -chr4 136209837 136210242 MACS_peak_6063 91.62 -chr19 38298472 38299109 MACS_peak_4384 52.09 -chr3 146318049 146318677 MACS_peak_5622 65.43 -chr8 97525645 97526124 MACS_peak_7981 83.75 -chr6 42299260 42299977 MACS_peak_6864 156.01 -chr13 95746101 95746664 MACS_peak_2305 118.67 -chr5 68262648 68262928 MACS_peak_6374 76.86 -chr9 4309901 4310202 
MACS_peak_8156 57.32 -chr2 130455636 130455898 MACS_peak_4896 68.02 -chr7 133920084 133920580 MACS_peak_7627 94.25 -chr3 144712794 144713309 MACS_peak_5603 333.24 -chr4 41492809 41493178 MACS_peak_5745 61.67 -chr6 83725731 83726256 MACS_peak_6965 72.37 -chr14 123928421 123928771 MACS_peak_2892 53.07 -chr11 94409579 94409974 MACS_peak_1489 68.44 -chr2 165618765 165619347 MACS_peak_5039 77.35 -chr1 97210080 97210414 MACS_peak_302 73.84 -chr19 31412009 31412328 MACS_peak_4353 67.09 -chr7 146028031 146028398 MACS_peak_7696 57.49 -chr14 98617003 98617302 MACS_peak_2799 57.51 -chr19 44406048 44406439 MACS_peak_4413 66.95 -chr14 26681413 26681976 MACS_peak_2449 117.79 -chr2 128037989 128038430 MACS_peak_4878 52.74 -chr17 61434287 61434641 MACS_peak_3905 62.86 -chr15 36390225 36390517 MACS_peak_2989 66.98 -chr14 27398759 27399655 MACS_peak_2452 361.52 -chr11 116115836 116116290 MACS_peak_1661 77.03 -chr15 36579667 36580306 MACS_peak_2996 51.96 -chr1 57568835 57569128 MACS_peak_112 60.79 -chr15 67474872 67475357 MACS_peak_3123 158.15 -chr10 19428365 19428826 MACS_peak_632 89.84 -chr14 113392921 113393120 MACS_peak_2836 66.98 -chr15 38448807 38449350 MACS_peak_3019 59.81 -chr14 20991935 20992435 MACS_peak_2406 75.91 -chr6 134006321 134006678 MACS_peak_7142 71.38 -chr12 112127235 112127724 MACS_peak_2013 80.19 -chr14 76244671 76245541 MACS_peak_2752 107.12 -chr11 104164505 104164874 MACS_peak_1549 69.13 -chr7 134536698 134537132 MACS_peak_7646 78.65 -chr1 137867871 137868260 MACS_peak_415 143.30 -chr18 34665859 34666370 MACS_peak_4058 61.96 -chr1 129101475 129101945 MACS_peak_348 77.88 -chr11 72295448 72295925 MACS_peak_1293 156.01 -chr17 24591995 24592521 MACS_peak_3651 114.96 -chr15 3945339 3946408 MACS_peak_2896 271.22 -chr8 122250900 122251332 MACS_peak_8064 71.78 -chr11 115938158 115938571 MACS_peak_1654 58.78 -chr9 114597610 114598135 MACS_peak_8535 92.87 -chr6 43207256 43207620 MACS_peak_6869 70.82 -chr3 152935129 152935658 MACS_peak_5650 60.73 -chr3 94655429 94656010 
MACS_peak_5324 210.22 -chr9 57368841 57369352 MACS_peak_8313 53.61 -chr4 3157974 3158349 MACS_peak_5679 52.74 -chr11 107211666 107212176 MACS_peak_1586 89.67 -chr15 42269449 42270170 MACS_peak_3035 131.90 -chr9 70682529 70683032 MACS_peak_8396 186.94 -chr8 27125446 27126059 MACS_peak_7778 102.67 -chr9 20896025 20896479 MACS_peak_8195 67.10 -chr15 75551370 75551790 MACS_peak_3136 66.67 -chr15 55028995 55029425 MACS_peak_3064 90.94 -chr16 18308240 18308586 MACS_peak_3350 58.96 -chr3 93353745 93354375 MACS_peak_5318 103.23 -chr16 23107242 23107924 MACS_peak_3367 113.70 -chr18 36486603 36487009 MACS_peak_4080 53.46 -chr18 5390330 5390807 MACS_peak_4001 113.10 -chr17 56428661 56429186 MACS_peak_3882 118.67 -chr2 18860310 18861083 MACS_peak_4512 84.24 -chr7 97888242 97888576 MACS_peak_7477 57.36 -chr3 21810071 21810487 MACS_peak_5121 118.67 -chr17 78181904 78182525 MACS_peak_3946 77.05 -chr14 56197450 56198063 MACS_peak_2598 129.98 -chr9 99140804 99141128 MACS_peak_8467 58.22 -chr10 92623323 92623821 MACS_peak_885 100.15 -chr4 140616351 140617131 MACS_peak_6099 80.20 -chr10 61142776 61143539 MACS_peak_744 80.20 -chr7 104485058 104485742 MACS_peak_7488 317.41 -chr11 115939476 115940007 MACS_peak_1656 92.32 -chr10 94580987 94581311 MACS_peak_903 56.69 -chr15 76157364 76157952 MACS_peak_3152 125.64 -chr13 14155415 14155855 MACS_peak_2065 52.91 -chr15 67066485 67066934 MACS_peak_3117 84.01 -chr7 29227640 29228147 MACS_peak_7277 73.17 -chr13 6514405 6514820 MACS_peak_2047 104.32 -chr4 140542557 140543005 MACS_peak_6097 144.88 -chr5 111937855 111938599 MACS_peak_6514 128.49 -chr16 44018427 44018767 MACS_peak_3442 64.02 -chr1 133421664 133422047 MACS_peak_377 82.81 -chrX 166419443 166419942 MACS_peak_8678 54.41 -chr15 93105701 93105937 MACS_peak_3251 154.40 -chr1 108780375 108780748 MACS_peak_320 57.56 -chr11 84636850 84637366 MACS_peak_1410 80.20 -chr17 24995915 24996584 MACS_peak_3656 135.78 -chr14 58033892 58034211 MACS_peak_2613 58.66 -chr13 29847874 29848368 MACS_peak_2108 
158.86 -chr1 13520675 13521060 MACS_peak_11 108.01 -chr2 156137538 156137972 MACS_peak_4990 78.65 -chr8 87550632 87550994 MACS_peak_7941 66.66 -chr3 151768385 151768678 MACS_peak_5634 56.48 -chr3 108012888 108013451 MACS_peak_5443 78.80 -chr13 44597050 44597814 MACS_peak_2154 202.82 -chr2 31917741 31918033 MACS_peak_4624 91.45 -chr3 132521750 132522383 MACS_peak_5537 143.48 -chr12 4879663 4880069 MACS_peak_1724 78.35 -chr6 91628640 91629356 MACS_peak_7022 67.31 -chr3 81433756 81434158 MACS_peak_5257 67.93 -chr7 54138715 54139193 MACS_peak_7359 128.73 -chr5 137102584 137103013 MACS_peak_6672 81.24 -chr8 59967224 59967628 MACS_peak_7830 62.73 -chr14 73689765 73690147 MACS_peak_2729 76.26 -chr11 117671467 117671893 MACS_peak_1678 66.26 -chr1 133214967 133215419 MACS_peak_376 174.60 -chr15 72853276 72853636 MACS_peak_3127 52.74 -chr11 109334214 109334929 MACS_peak_1611 51.65 -chrX 45266253 45266899 MACS_peak_8617 128.35 -chr2 131877465 131877919 MACS_peak_4910 132.70 -chr9 20779965 20780304 MACS_peak_8192 60.28 -chr3 90068955 90069393 MACS_peak_5310 78.47 -chr5 76187734 76188295 MACS_peak_6404 65.35 -chr11 104180396 104181197 MACS_peak_1550 84.55 -chr9 43839155 43839734 MACS_peak_8243 170.66 -chr15 85812555 85813334 MACS_peak_3239 124.98 -chr16 30691946 30692407 MACS_peak_3394 226.47 -chr2 110401236 110401587 MACS_peak_4817 63.11 -chr5 125914300 125914711 MACS_peak_6623 65.51 -chr2 166483417 166483707 MACS_peak_5047 52.10 -chr8 60131046 60131454 MACS_peak_7833 52.74 -chr1 153024254 153024901 MACS_peak_439 66.68 -chr6 135133407 135133856 MACS_peak_7151 66.57 -chr7 82993032 82993383 MACS_peak_7423 52.99 -chr12 36728733 36729377 MACS_peak_1795 106.79 -chr19 54161870 54162283 MACS_peak_4440 93.96 -chr13 21366775 21367132 MACS_peak_2077 58.81 -chr7 140828409 140828831 MACS_peak_7684 68.81 -chr7 52771782 52772179 MACS_peak_7344 57.63 -chr11 57258571 57259095 MACS_peak_1191 98.14 -chr10 19855329 19855821 MACS_peak_636 125.55 -chr9 48594723 48595281 MACS_peak_8268 79.19 -chr4 
41278073 41278681 MACS_peak_5744 81.51 -chr18 44988493 44988911 MACS_peak_4095 69.91 -chr1 74438395 74439179 MACS_peak_195 162.82 -chr3 108830511 108830918 MACS_peak_5453 62.62 -chr13 96427044 96427529 MACS_peak_2310 152.26 -chr1 142384049 142384472 MACS_peak_423 79.51 -chr1 179064649 179064883 MACS_peak_543 74.87 -chr3 105490131 105490555 MACS_peak_5418 63.69 -chr2 90508129 90508418 MACS_peak_4780 76.86 -chr15 81846602 81847127 MACS_peak_3217 117.96 -chr18 3270592 3271094 MACS_peak_3989 195.43 -chr1 108606863 108607335 MACS_peak_318 95.47 -chr13 75935312 75935640 MACS_peak_2250 63.30 -chr16 30789953 30790403 MACS_peak_3396 148.02 -chr10 111409491 111409958 MACS_peak_950 131.28 -chr9 40880928 40881362 MACS_peak_8236 65.72 -chr8 123191898 123192493 MACS_peak_8089 118.67 -chr12 86713029 86713482 MACS_peak_1934 95.23 -chr18 65281748 65282564 MACS_peak_4150 161.32 -chr9 37296593 37297143 MACS_peak_8230 129.00 -chr18 75530251 75530647 MACS_peak_4189 68.37 -chr14 64162422 64162897 MACS_peak_2650 62.73 -chr10 82222485 82222777 MACS_peak_854 129.73 -chr10 51248911 51249493 MACS_peak_714 104.02 -chr19 45612299 45612910 MACS_peak_4419 89.38 -chr16 59515986 59516330 MACS_peak_3480 72.46 -chr1 37364506 37364915 MACS_peak_55 77.39 -chr9 107436160 107436580 MACS_peak_8495 73.91 -chr6 123239085 123239498 MACS_peak_7098 80.30 -chr8 24145434 24145873 MACS_peak_7767 65.39 -chr17 59064066 59064738 MACS_peak_3903 193.40 -chr18 81626532 81626968 MACS_peak_4206 53.17 -chr8 72498191 72498470 MACS_peak_7850 76.86 -chr2 127033717 127034197 MACS_peak_4869 122.43 -chr3 153427354 153428352 MACS_peak_5658 173.73 -chr13 95777240 95777685 MACS_peak_2306 62.73 -chr6 90654616 90655084 MACS_peak_7016 74.57 -chr6 115545743 115546136 MACS_peak_7072 52.74 -chr7 52392685 52393201 MACS_peak_7337 105.13 -chr1 174445177 174445620 MACS_peak_534 65.13 -chr5 139853354 139853932 MACS_peak_6702 77.65 -chr17 44266175 44266552 MACS_peak_3809 97.39 -chr9 78919711 78920097 MACS_peak_8424 50.34 -chr2 120210305 
120210628 MACS_peak_4846 69.96 -chr8 97679869 97680475 MACS_peak_7985 130.84 -chr14 70029196 70029514 MACS_peak_2696 58.75 -chr11 97574402 97574747 MACS_peak_1511 53.47 -chr2 56968627 56969614 MACS_peak_4711 285.57 -chr7 26472954 26473335 MACS_peak_7258 62.73 -chr1 146985918 146986334 MACS_peak_432 62.66 -chr6 30276109 30276518 MACS_peak_6816 73.17 -chr18 4969715 4970163 MACS_peak_3999 62.73 -chr6 85298851 85299333 MACS_peak_6971 130.45 -chr18 62318702 62319054 MACS_peak_4130 55.87 -chr7 97493416 97493783 MACS_peak_7473 95.23 -chr5 84728325 84728797 MACS_peak_6420 65.95 -chr15 96290510 96290960 MACS_peak_3260 75.41 -chr5 64493902 64494502 MACS_peak_6348 155.47 -chr12 70683782 70684144 MACS_peak_1854 74.38 -chr7 28259485 28260176 MACS_peak_7269 157.45 -chr3 102072769 102073100 MACS_peak_5391 64.80 -chr3 121177634 121178278 MACS_peak_5487 124.60 -chr3 141995570 141995959 MACS_peak_5587 74.70 -chr10 12681163 12681522 MACS_peak_617 57.44 -chr7 35770301 35770804 MACS_peak_7310 130.48 -chr3 107901318 107901701 MACS_peak_5442 68.87 -chr4 155406985 155407313 MACS_peak_6229 63.11 -chr14 46277533 46277983 MACS_peak_2523 63.09 -chr7 142790268 142790503 MACS_peak_7693 89.37 -chr9 66360249 66360570 MACS_peak_8377 57.12 -chr15 95621015 95621459 MACS_peak_3254 77.44 -chr4 71861086 71862075 MACS_peak_5807 206.03 -chr11 121065722 121066055 MACS_peak_1707 99.74 -chr19 9041528 9042024 MACS_peak_4289 107.21 -chr8 98477882 98478259 MACS_peak_7992 69.80 -chr18 75722207 75722491 MACS_peak_4193 91.45 -chr15 57812241 57812965 MACS_peak_3074 52.05 -chr3 58917608 58918452 MACS_peak_5209 152.48 -chr4 41660450 41660822 MACS_peak_5747 57.64 -chr11 11641587 11641928 MACS_peak_1066 55.63 -chr8 50911172 50911534 MACS_peak_7818 70.98 -chr11 120209562 120209886 MACS_peak_1697 66.59 -chr14 66971802 66972207 MACS_peak_2681 62.73 -chr3 98621426 98621709 MACS_peak_5379 72.51 -chr12 49775350 49775758 MACS_peak_1809 67.51 -chr12 17040311 17040756 MACS_peak_1752 86.89 -chr14 70465516 70466231 
MACS_peak_2709 158.40 -chr4 106926454 106926892 MACS_peak_5863 65.46 -chr11 5221117 5221579 MACS_peak_1043 129.84 -chr11 51762768 51763314 MACS_peak_1170 137.47 -chr12 73948553 73949012 MACS_peak_1872 142.64 -chr15 12123626 12124218 MACS_peak_2933 94.43 -chr15 12246914 12247416 MACS_peak_2937 294.48 -chr2 7924537 7924842 MACS_peak_4478 63.31 -chr16 56916814 56917191 MACS_peak_3470 51.00 -chr14 57190198 57191173 MACS_peak_2608 120.19 -chr5 138011402 138012367 MACS_peak_6684 707.79 -chr1 36153980 36154800 MACS_peak_40 119.59 -chr9 105397273 105397630 MACS_peak_8483 90.84 -chr4 148542288 148542494 MACS_peak_6147 99.30 -chr7 134234472 134235313 MACS_peak_7633 215.08 -chr1 187186557 187186854 MACS_peak_584 84.01 -chr2 156703464 156703925 MACS_peak_5000 135.84 -chr2 45507624 45507896 MACS_peak_4694 66.03 -chr2 25110687 25111472 MACS_peak_4543 265.66 -chr13 23494534 23495087 MACS_peak_2082 74.85 -chr2 118738734 118739174 MACS_peak_4833 58.77 -chrX 11733021 11733752 MACS_peak_8601 84.09 -chr3 153560124 153560559 MACS_peak_5664 53.23 -chr8 97479035 97479520 MACS_peak_7976 156.01 -chr9 114662010 114662635 MACS_peak_8538 65.64 -chr18 56618529 56618905 MACS_peak_4110 56.89 -chr17 34057391 34058028 MACS_peak_3762 56.35 -chr1 99519858 99520254 MACS_peak_306 57.70 -chr4 136194817 136195184 MACS_peak_6060 98.40 -chr7 16611238 16611688 MACS_peak_7204 52.74 -chr1 60215214 60215684 MACS_peak_140 88.89 -chr6 149257575 149258040 MACS_peak_7180 65.32 -chr4 8159311 8159627 MACS_peak_5687 55.94 -chr14 45660604 45661144 MACS_peak_2518 98.84 -chr11 84024342 84024705 MACS_peak_1402 50.69 -chr11 108110784 108111439 MACS_peak_1606 82.28 -chr7 87590346 87590812 MACS_peak_7448 54.95 -chr9 35018443 35018749 MACS_peak_8226 74.08 -chr7 61764305 61764697 MACS_peak_7375 62.07 -chr3 137620670 137621228 MACS_peak_5569 110.82 -chr8 89147603 89148183 MACS_peak_7945 106.99 -chr10 80982282 80982979 MACS_peak_848 148.39 -chr2 113012940 113013326 MACS_peak_4821 56.59 -chr16 93767743 93768080 MACS_peak_3559 
109.86 -chr2 4483390 4484698 MACS_peak_4459 128.33 -chr6 128792917 128793800 MACS_peak_7130 123.88 -chr5 148241425 148242026 MACS_peak_6759 97.14 -chr4 34829946 34830380 MACS_peak_5730 65.72 -chr3 37558222 37559100 MACS_peak_5161 173.11 -chr2 90894346 90894793 MACS_peak_4781 72.76 -chr8 107486121 107486440 MACS_peak_7999 81.28 -chr7 140064742 140065140 MACS_peak_7681 73.17 -chr12 30367083 30367515 MACS_peak_1770 73.89 -chrX 11711607 11711970 MACS_peak_8600 84.60 -chr15 5058192 5058833 MACS_peak_2901 89.38 -chr7 104727397 104728070 MACS_peak_7489 70.84 -chr6 133055524 133055892 MACS_peak_7138 62.47 -chr3 95558657 95559026 MACS_peak_5348 79.54 -chr17 35326947 35327306 MACS_peak_3781 66.90 -chr14 52816486 52817050 MACS_peak_2560 64.66 -chr1 87632880 87633301 MACS_peak_233 58.17 -chr9 57495286 57495888 MACS_peak_8318 128.82 -chr11 87571803 87572391 MACS_peak_1454 59.68 -chr4 101511482 101511839 MACS_peak_5847 71.38 -chr15 12251825 12252367 MACS_peak_2938 54.29 -chr8 24276703 24277308 MACS_peak_7770 91.11 -chr6 117981548 117981940 MACS_peak_7086 68.66 -chr7 118300107 118300551 MACS_peak_7564 60.79 -chr5 77553172 77553619 MACS_peak_6415 77.65 -chr7 133428410 133429279 MACS_peak_7615 176.96 -chr5 54386367 54386928 MACS_peak_6343 135.99 -chr2 157967843 157968322 MACS_peak_5015 52.74 -chr1 13579885 13580466 MACS_peak_13 106.79 -chr17 47825794 47826338 MACS_peak_3846 103.13 -chr15 96115848 96116105 MACS_peak_3259 89.28 -chr6 8018474 8018781 MACS_peak_6782 61.94 -chr1 58769938 58770557 MACS_peak_122 106.56 -chr18 13100063 13100479 MACS_peak_4020 101.54 -chr1 95462306 95462739 MACS_peak_289 61.51 -chr13 8456217 8456514 MACS_peak_2052 121.95 -chr8 87426937 87427392 MACS_peak_7932 147.42 -chr3 69488182 69488508 MACS_peak_5241 55.06 -chr5 108495385 108495819 MACS_peak_6502 53.36 -chr7 26391500 26391968 MACS_peak_7253 62.73 -chr14 122222542 122222864 MACS_peak_2879 60.02 -chr7 16880847 16881143 MACS_peak_7213 72.15 -chr10 84379493 84379935 MACS_peak_862 65.19 -chr1 93218296 
93218729 MACS_peak_276 67.88 -chr7 134005243 134005813 MACS_peak_7631 156.01 -chr9 25059978 25060419 MACS_peak_8215 65.26 -chr2 4802272 4802882 MACS_peak_4463 90.71 -chr9 114640488 114640918 MACS_peak_8537 63.14 -chr1 155044510 155044840 MACS_peak_453 52.74 -chr2 181598797 181599317 MACS_peak_5098 52.05 -chr16 30227325 30227906 MACS_peak_3389 144.56 -chr2 33582864 33583285 MACS_peak_4654 61.53 -chr2 38920882 38921337 MACS_peak_4683 62.69 -chr12 8639627 8640095 MACS_peak_1730 89.27 -chr1 193244835 193245254 MACS_peak_592 63.53 -chr19 28042093 28042496 MACS_peak_4342 185.71 -chr18 67399653 67399896 MACS_peak_4155 104.20 -chr15 81702453 81702935 MACS_peak_3214 70.29 -chr2 4354267 4354521 MACS_peak_4454 66.03 -chr17 71599086 71599527 MACS_peak_3929 65.92 -chr11 115016216 115016976 MACS_peak_1629 260.88 -chr13 49402730 49403235 MACS_peak_2170 96.40 -chr1 173607566 173608216 MACS_peak_521 161.65 -chr4 149943597 149944833 MACS_peak_6170 714.32 -chr2 30033180 30033596 MACS_peak_4605 60.37 -chr12 73775435 73775826 MACS_peak_1868 193.90 -chr19 6686904 6687279 MACS_peak_4266 106.79 -chr13 94372068 94372377 MACS_peak_2298 52.57 -chr3 134875358 134875676 MACS_peak_5549 74.76 -chr14 35123318 35123817 MACS_peak_2496 91.32 -chr4 134064080 134064462 MACS_peak_6020 65.66 -chr7 38451614 38451982 MACS_peak_7318 51.67 -chr2 59721277 59721585 MACS_peak_4718 52.74 -chr4 148521433 148521717 MACS_peak_6146 50.43 -chr6 29651055 29651524 MACS_peak_6813 99.40 -chr2 25283862 25284454 MACS_peak_4553 118.30 -chr1 180335685 180336069 MACS_peak_548 70.43 -chr15 9000420 9001374 MACS_peak_2924 50.16 -chr17 76783407 76783741 MACS_peak_3945 73.33 -chr10 79377042 79377510 MACS_peak_798 105.25 -chr4 137813129 137813637 MACS_peak_6069 79.35 -chr19 23347935 23348633 MACS_peak_4335 168.98 -chr2 77014459 77014721 MACS_peak_4764 57.79 -chr17 27725137 27725648 MACS_peak_3703 81.61 -chr3 84271282 84271895 MACS_peak_5266 107.11 -chr4 149036130 149036714 MACS_peak_6156 127.11 -chr17 36157226 36157966 
MACS_peak_3800 58.06 -chr9 113925463 113925818 MACS_peak_8532 139.16 -chr18 62455027 62455412 MACS_peak_4131 69.19 -chr2 143717397 143717821 MACS_peak_4930 57.72 -chr14 70058939 70059262 MACS_peak_2698 78.56 -chr9 8004492 8005091 MACS_peak_8167 146.39 -chr2 22750741 22751369 MACS_peak_4526 107.68 -chr11 113663893 113664272 MACS_peak_1623 118.67 -chr11 60643876 60644376 MACS_peak_1224 75.91 -chr13 55463887 55464601 MACS_peak_2197 89.38 -chr3 138158153 138158835 MACS_peak_5576 101.13 -chr9 61779725 61780181 MACS_peak_8335 52.74 -chr5 141092685 141093127 MACS_peak_6724 60.64 -chr4 151560621 151560894 MACS_peak_6192 80.71 -chr12 71087816 71088552 MACS_peak_1856 105.21 -chr3 136623971 136624307 MACS_peak_5565 54.21 -chr18 64675715 64676128 MACS_peak_4137 67.15 -chr5 93521864 93522250 MACS_peak_6451 50.34 -chr14 27666233 27666572 MACS_peak_2457 95.10 -chr17 65649466 65649790 MACS_peak_3914 74.21 -chr3 96961630 96962284 MACS_peak_5365 62.03 -chr19 46681813 46682242 MACS_peak_4433 64.27 -chr5 33677654 33678040 MACS_peak_6302 133.38 -chr1 155123197 155123551 MACS_peak_454 52.75 -chr11 104222718 104223628 MACS_peak_1551 135.09 -chr12 40834638 40835084 MACS_peak_1801 104.85 -chr5 140797328 140797751 MACS_peak_6714 136.27 -chr8 124636207 124636603 MACS_peak_8095 55.87 -chr1 33776550 33777146 MACS_peak_29 54.49 -chr2 127277423 127277785 MACS_peak_4871 62.22 -chr16 11144052 11144357 MACS_peak_3337 75.98 -chr2 71759141 71759569 MACS_peak_4740 61.84 -chr5 144654264 144654609 MACS_peak_6750 202.40 -chr6 136416896 136417766 MACS_peak_7155 107.68 -chr19 61160284 61160710 MACS_peak_4445 79.27 -chr5 135513632 135514247 MACS_peak_6652 52.16 -chr10 69559457 69559926 MACS_peak_764 75.43 -chr19 34625289 34625732 MACS_peak_4369 58.58 -chr3 129778582 129778971 MACS_peak_5530 52.74 -chr3 40549079 40549989 MACS_peak_5170 139.15 -chr12 63655639 63655947 MACS_peak_1841 75.70 -chr12 88027775 88028206 MACS_peak_1939 57.42 -chr4 149930560 149930906 MACS_peak_6169 51.58 -chr7 26175003 26175297 
MACS_peak_7251 193.27 -chr3 137631502 137632466 MACS_peak_5570 270.50 -chr7 75095358 75096309 MACS_peak_7396 276.27 -chr13 112597147 112598260 MACS_peak_2361 120.74 -chr8 73397210 73397892 MACS_peak_7870 88.07 -chr10 57870391 57870790 MACS_peak_723 61.56 -chr12 21379875 21380271 MACS_peak_1759 71.11 -chr4 149229209 149229627 MACS_peak_6162 68.37 -chr11 79454167 79454630 MACS_peak_1371 103.29 -chr2 118577801 118578303 MACS_peak_4831 83.83 -chr12 90031052 90031356 MACS_peak_1953 67.25 -chr3 89221936 89222334 MACS_peak_5302 55.52 -chr11 49015967 49017374 MACS_peak_1149 140.92 -chr5 101854756 101855187 MACS_peak_6476 57.25 -chr14 55045118 55046069 MACS_peak_2572 278.29 -chr8 122360636 122360974 MACS_peak_8068 72.98 -chr6 29559590 29559996 MACS_peak_6810 66.51 -chr8 37675573 37676057 MACS_peak_7806 52.74 -chr7 135604640 135605584 MACS_peak_7660 140.03 -chr7 75215546 75215889 MACS_peak_7400 73.17 -chr11 6387328 6387780 MACS_peak_1054 66.34 -chr6 97171581 97172040 MACS_peak_7033 76.80 -chr2 71652110 71652536 MACS_peak_4738 52.74 -chr14 70205001 70205574 MACS_peak_2702 90.17 -chr7 4636478 4636845 MACS_peak_7189 84.01 -chr1 163697037 163697580 MACS_peak_480 152.14 -chr14 69905127 69905516 MACS_peak_2694 60.69 -chr4 105905243 105905631 MACS_peak_5853 53.13 -chr19 43763805 43764296 MACS_peak_4405 84.01 -chr15 98863988 98864259 MACS_peak_3288 109.33 -chr8 28268378 28268863 MACS_peak_7783 143.30 -chr5 50210130 50210512 MACS_peak_6329 203.50 -chr1 49424163 49424526 MACS_peak_75 70.90 -chr11 114416815 114417130 MACS_peak_1628 75.04 -chr2 29967973 29968359 MACS_peak_4603 84.01 -chr11 87275081 87275514 MACS_peak_1443 92.18 -chr9 72510503 72510911 MACS_peak_8403 63.21 -chr18 32996570 32997045 MACS_peak_4049 106.79 -chr7 108812030 108812396 MACS_peak_7522 58.27 -chr11 61377499 61378145 MACS_peak_1228 59.79 -chr5 141051472 141051938 MACS_peak_6718 69.03 -chr13 36416595 36416984 MACS_peak_2127 55.95 -chr9 14446069 14446592 MACS_peak_8182 98.84 -chr10 117850777 117851031 MACS_peak_971 
57.79 -chr8 126502767 126503316 MACS_peak_8125 69.46 -chr6 66891898 66892295 MACS_peak_6917 59.59 -chr4 122959709 122960251 MACS_peak_5935 121.49 -chr12 60308039 60308451 MACS_peak_1839 80.38 -chr5 137108320 137108562 MACS_peak_6673 91.45 -chr4 129373773 129374378 MACS_peak_5972 131.90 -chr2 45268392 45268789 MACS_peak_4692 56.57 -chr5 141120758 141121124 MACS_peak_6726 71.31 -chr16 30453372 30453956 MACS_peak_3392 106.68 -chrX 71542249 71542578 MACS_peak_8628 64.97 -chr12 72743380 72743811 MACS_peak_1864 61.64 -chrX 108755267 108755697 MACS_peak_8648 65.99 -chr9 45983547 45983831 MACS_peak_8258 57.79 -chr14 63049340 63049683 MACS_peak_2644 61.79 -chr7 105719591 105719912 MACS_peak_7498 88.72 -chr7 65987933 65988294 MACS_peak_7377 74.48 -chr7 26496882 26497392 MACS_peak_7260 73.32 -chr3 157588086 157588412 MACS_peak_5676 86.48 -chr5 66089157 66089851 MACS_peak_6367 87.07 -chr1 63823189 63823575 MACS_peak_148 56.59 -chr19 8872798 8873377 MACS_peak_4283 89.38 -chr2 179759459 179759977 MACS_peak_5073 51.39 -chr6 128611850 128612175 MACS_peak_7127 74.12 -chr6 125049277 125049748 MACS_peak_7109 130.85 -chr14 58645884 58646276 MACS_peak_2621 52.85 -chr7 20080932 20081328 MACS_peak_7239 66.58 -chr2 131917466 131917870 MACS_peak_4911 59.48 -chr5 3152015 3152483 MACS_peak_6238 160.90 -chr2 132512500 132512844 MACS_peak_4916 59.86 -chrX 99352299 99352715 MACS_peak_8645 66.94 -chr18 55059820 55060479 MACS_peak_4108 118.53 -chr3 40456923 40457382 MACS_peak_5169 255.48 -chr11 57331929 57332212 MACS_peak_1192 57.79 -chr9 65389306 65389659 MACS_peak_8364 62.73 -chr6 30252722 30253129 MACS_peak_6815 70.78 -chr9 74844269 74844678 MACS_peak_8409 69.94 -chr3 79787772 79788143 MACS_peak_5255 89.40 -chr5 97259867 97260133 MACS_peak_6460 68.02 -chr7 147392845 147393149 MACS_peak_7705 66.02 -chrX 71516418 71516883 MACS_peak_8627 52.74 -chr4 135841295 135841647 MACS_peak_6056 62.73 -chr17 34781424 34781705 MACS_peak_3772 57.79 -chr6 108654497 108654889 MACS_peak_7052 82.04 -chr1 88337836 
88338284 MACS_peak_244 100.57 -chr16 18876401 18877082 MACS_peak_3353 122.18 -chr15 86033062 86033641 MACS_peak_3242 60.29 -chr11 16851380 16851985 MACS_peak_1071 71.31 -chr7 125272857 125273444 MACS_peak_7585 163.78 -chr12 53738815 53739179 MACS_peak_1825 70.07 -chr2 156665349 156666095 MACS_peak_4998 100.62 -chr7 133942356 133942843 MACS_peak_7630 98.04 -chr9 90020990 90021384 MACS_peak_8443 52.74 -chr11 83658247 83658585 MACS_peak_1398 60.37 -chr14 52103248 52103624 MACS_peak_2554 55.37 -chr18 36446981 36447399 MACS_peak_4078 89.45 -chr14 22367170 22367534 MACS_peak_2429 72.40 -chr15 53498017 53498814 MACS_peak_3061 116.56 -chr11 87256810 87257164 MACS_peak_1441 111.94 -chr9 122859679 122860394 MACS_peak_8568 109.13 -chr1 23930853 23931253 MACS_peak_23 249.39 -chr12 70598412 70598748 MACS_peak_1852 89.26 -chr13 51943389 51943842 MACS_peak_2182 104.20 -chr19 29138427 29138708 MACS_peak_4343 57.79 -chr8 81885020 81885368 MACS_peak_7895 87.82 -chr11 106277303 106277812 MACS_peak_1574 61.29 -chr14 119583365 119583707 MACS_peak_2862 52.74 -chr6 32801035 32801303 MACS_peak_6834 51.52 -chr10 94394483 94395062 MACS_peak_900 88.16 -chr3 37565697 37565925 MACS_peak_5162 90.55 -chr3 145588349 145588654 MACS_peak_5611 80.31 -chr19 23061529 23061970 MACS_peak_4334 65.26 -chr17 26989228 26989502 MACS_peak_3680 76.86 -chr1 95970905 95971720 MACS_peak_298 160.55 -chr4 108520352 108520992 MACS_peak_5880 118.67 -chr3 26391575 26392055 MACS_peak_5126 88.33 -chr3 8919741 8920292 MACS_peak_5104 77.70 -chr1 29104970 29105334 MACS_peak_25 70.82 -chr16 58637925 58638495 MACS_peak_3475 107.78 -chr15 57966348 57966985 MACS_peak_3075 149.78 -chr13 115022343 115022626 MACS_peak_2368 64.17 -chr11 67905507 67905890 MACS_peak_1243 200.30 -chr17 29330165 29330593 MACS_peak_3730 142.62 -chr11 119161198 119161769 MACS_peak_1689 135.03 -chr4 140249323 140249820 MACS_peak_6089 56.52 -chr1 35926096 35926623 MACS_peak_38 55.35 -chr1 59412217 59412591 MACS_peak_129 51.22 -chr2 181414705 181415126 
MACS_peak_5096 57.92 -chr17 57418275 57418714 MACS_peak_3896 78.38 -chr8 87246451 87247128 MACS_peak_7927 93.62 -chr12 81913169 81913458 MACS_peak_1914 52.00 -chr9 88275002 88275236 MACS_peak_8441 69.53 -chr11 103078799 103079762 MACS_peak_1540 129.73 -chr7 148141747 148142194 MACS_peak_7708 52.74 -chr19 41338432 41338860 MACS_peak_4389 51.90 -chr16 91538765 91539078 MACS_peak_3515 52.29 -chr7 132761686 132762056 MACS_peak_7609 66.03 -chr5 138070239 138070549 MACS_peak_6688 62.84 -chr1 174294816 174295355 MACS_peak_528 124.08 -chr19 41912152 41912595 MACS_peak_4394 51.03 -chr3 96217894 96218423 MACS_peak_5355 67.16 -chr8 11393666 11393996 MACS_peak_7740 77.69 -chr15 37172600 37172975 MACS_peak_3007 91.99 -chr1 173611130 173611397 MACS_peak_522 76.86 -chr1 133022808 133023128 MACS_peak_372 72.77 -chr1 88454389 88454942 MACS_peak_252 84.11 -chr5 34856205 34856831 MACS_peak_6311 57.06 -chr7 71082000 71082779 MACS_peak_7382 80.20 -chr14 63736378 63736637 MACS_peak_2648 59.25 -chr19 32843677 32843920 MACS_peak_4364 63.61 -chr3 138702613 138702924 MACS_peak_5578 62.02 -chr17 86566107 86566474 MACS_peak_3976 84.24 -chr8 96910090 96910444 MACS_peak_7966 52.74 -chr13 112430419 112430951 MACS_peak_2359 81.28 -chr10 42013834 42014255 MACS_peak_700 66.60 -chr11 31517779 31518060 MACS_peak_1118 68.02 -chr18 5101351 5101714 MACS_peak_4000 52.05 -chr9 62724326 62725109 MACS_peak_8338 201.21 -chr9 99083674 99084236 MACS_peak_8465 132.54 -chr4 134827884 134829500 MACS_peak_6036 231.64 -chr17 13498739 13499070 MACS_peak_3624 73.59 -chr2 103006169 103006579 MACS_peak_4802 67.36 -chr15 6925244 6925735 MACS_peak_2914 54.92 -chr7 53078238 53078607 MACS_peak_7352 51.59 -chr2 90910384 90910774 MACS_peak_4782 68.81 -chr14 60870155 60870663 MACS_peak_2627 98.84 -chr2 118798450 118798941 MACS_peak_4834 74.57 -chr11 100870661 100871158 MACS_peak_1523 172.41 -chr11 87562630 87563498 MACS_peak_1450 403.76 -chr1 88154721 88155315 MACS_peak_239 86.42 -chr11 83112056 83112724 MACS_peak_1391 71.38 
-chr12 101425557 101426166 MACS_peak_1973 147.38 -chr6 85401368 85402272 MACS_peak_6974 228.57 -chr11 78966191 78966722 MACS_peak_1368 117.55 -chr3 129236434 129236906 MACS_peak_5523 68.23 -chr9 109777897 109778345 MACS_peak_8516 77.56 -chr3 88426615 88427427 MACS_peak_5292 205.76 -chr1 46004702 46005146 MACS_peak_69 84.01 -chr5 76126811 76127048 MACS_peak_6401 66.98 -chr10 59405079 59405490 MACS_peak_730 58.60 -chr1 9690569 9690998 MACS_peak_3 64.27 -chr11 88281205 88281634 MACS_peak_1478 51.83 -chr10 21199165 21199653 MACS_peak_652 194.61 -chr1 173433353 173434146 MACS_peak_518 161.22 -chr12 35731430 35731910 MACS_peak_1792 101.79 -chr15 38446130 38446559 MACS_peak_3017 57.57 -chr4 144679039 144679338 MACS_peak_6130 76.57 -chr10 92865497 92865758 MACS_peak_887 57.79 -chr14 121027316 121027735 MACS_peak_2865 55.30 -chr3 96530843 96531751 MACS_peak_5362 201.73 -chr16 91406386 91406908 MACS_peak_3506 89.38 -chr5 67336216 67336546 MACS_peak_6372 54.72 -chr3 89746156 89746412 MACS_peak_5305 60.79 -chr14 106991035 106991399 MACS_peak_2826 58.26 -chr1 36186077 36186421 MACS_peak_41 72.46 -chr14 66211596 66212092 MACS_peak_2670 58.08 -chr2 127911067 127911519 MACS_peak_4875 63.04 -chr8 73335210 73335509 MACS_peak_7867 62.14 -chr17 24291509 24291795 MACS_peak_3646 66.03 -chr16 92938013 92938490 MACS_peak_3549 62.73 -chr11 3279575 3280176 MACS_peak_1024 89.38 -chr6 32447109 32447460 MACS_peak_6833 67.55 -chr1 133724229 133724871 MACS_peak_379 70.44 -chr3 138152249 138152833 MACS_peak_5575 98.84 -chr1 38420121 38420414 MACS_peak_60 72.51 -chr14 55224814 55225200 MACS_peak_2579 50.34 -chr4 140624561 140624920 MACS_peak_6101 83.78 -chr2 106328336 106328622 MACS_peak_4811 57.79 -chr11 114335454 114335791 MACS_peak_1626 77.00 -chr1 133850783 133851479 MACS_peak_381 101.13 -chr15 101084784 101085109 MACS_peak_3307 66.49 -chr1 121422851 121423230 MACS_peak_328 58.94 -chr5 50093335 50093768 MACS_peak_6328 78.73 -chr17 44569507 44569864 MACS_peak_3811 119.82 -chr9 40965392 40966162 
MACS_peak_8238 135.71 -chr18 57409148 57409560 MACS_peak_4116 95.69 -chr11 106227571 106228161 MACS_peak_1573 68.64 -chr12 106264328 106264856 MACS_peak_1994 115.56 -chr11 51694649 51695006 MACS_peak_1167 50.61 -chr14 73304152 73304540 MACS_peak_2723 56.45 -chr13 38249483 38249806 MACS_peak_2137 53.73 -chr17 23939899 23940349 MACS_peak_3642 104.48 -chr8 13353101 13353525 MACS_peak_7749 169.06 -chr6 134203272 134203641 MACS_peak_7145 68.62 -chr13 3869743 3870092 MACS_peak_2043 57.73 -chr14 71173919 71174385 MACS_peak_2717 65.91 -chr15 8711544 8712011 MACS_peak_2923 52.74 -chr14 60883642 60884282 MACS_peak_2628 73.32 -chr6 100238263 100238692 MACS_peak_7041 79.31 -chr18 43246353 43246775 MACS_peak_4094 54.45 -chr3 32427840 32428279 MACS_peak_5142 65.39 -chr4 114176976 114177339 MACS_peak_5890 69.10 -chr15 24413374 24413825 MACS_peak_2953 72.47 -chr17 24388206 24388689 MACS_peak_3648 52.74 -chr2 31983332 31984086 MACS_peak_4626 138.76 -chr1 82784012 82784449 MACS_peak_226 85.75 -chr11 115527669 115527986 MACS_peak_1642 74.85 -chr4 133958319 133958921 MACS_peak_6017 111.16 -chr3 33698778 33699287 MACS_peak_5144 72.46 -chr14 122276884 122277969 MACS_peak_2882 101.55 -chr12 87310571 87310976 MACS_peak_1938 95.23 -chr13 58545231 58545547 MACS_peak_2208 75.30 -chr4 151382308 151383335 MACS_peak_6186 211.80 -chr4 107730838 107731131 MACS_peak_5875 76.86 -chr7 127973750 127974191 MACS_peak_7591 56.60 -chr13 51831495 51831924 MACS_peak_2177 53.62 -chr6 113256331 113257237 MACS_peak_7059 493.29 -chr18 75366936 75367315 MACS_peak_4186 69.64 -chr8 83893580 83893897 MACS_peak_7903 64.52 -chr6 82852344 82853038 MACS_peak_6953 269.15 -chr5 123271183 123271619 MACS_peak_6591 123.24 -chr14 47344721 47345104 MACS_peak_2524 52.77 -chr3 152575073 152575476 MACS_peak_5646 52.74 -chr3 145596911 145597202 MACS_peak_5612 83.77 -chr9 63985221 63985609 MACS_peak_8354 62.36 -chr1 58851809 58852216 MACS_peak_124 60.72 -chr4 119140346 119140886 MACS_peak_5928 82.34 -chr17 24486224 24486632 
MACS_peak_3649 78.65 -chr6 34614205 34614765 MACS_peak_6840 62.73 -chr17 50210060 50210574 MACS_peak_3861 61.75 -chr18 31945760 31946081 MACS_peak_4043 57.12 -chr17 23680231 23680767 MACS_peak_3638 53.90 -chr15 38129140 38129536 MACS_peak_3012 59.66 -chr8 113782043 113782424 MACS_peak_8040 69.49 -chr18 36388485 36388844 MACS_peak_4075 60.49 -chr14 35176361 35177236 MACS_peak_2498 90.81 -chr15 58986223 58986533 MACS_peak_3088 76.07 -chr15 38230489 38231164 MACS_peak_3015 108.56 -chr2 26207239 26207961 MACS_peak_4567 93.92 -chr17 31700930 31701353 MACS_peak_3750 79.51 -chr14 69764756 69765167 MACS_peak_2692 66.11 -chr1 82630866 82631590 MACS_peak_222 118.15 -chr13 63463080 63463547 MACS_peak_2222 137.87 -chr9 88333602 88334003 MACS_peak_8442 170.03 -chr1 108414102 108414881 MACS_peak_317 194.11 -chr17 71202093 71202486 MACS_peak_3924 71.93 -chr2 4397930 4398339 MACS_peak_4456 96.07 -chr19 6313436 6313784 MACS_peak_4261 67.80 -chr17 47430218 47430545 MACS_peak_3836 57.96 -chr1 88383641 88384502 MACS_peak_245 82.34 -chr15 99307421 99307832 MACS_peak_3292 54.82 -chr10 87525443 87525844 MACS_peak_870 52.21 -chr1 137560338 137560774 MACS_peak_409 84.01 -chr2 98177089 98177336 MACS_peak_4795 66.43 -chr6 146904425 146904852 MACS_peak_7174 57.52 -chr3 88489494 88489890 MACS_peak_5293 71.68 -chrX 160870684 160870964 MACS_peak_8672 76.86 -chr5 96978763 96979127 MACS_peak_6459 66.50 -chr11 117515601 117516136 MACS_peak_1677 87.38 -chr8 129497368 129498236 MACS_peak_8150 121.81 -chr9 44134590 44135169 MACS_peak_8247 90.62 -chr3 157699307 157699805 MACS_peak_5678 169.27 -chr1 184472944 184473657 MACS_peak_582 98.66 -chr2 165780325 165780779 MACS_peak_5042 95.23 -chr12 52792839 52793385 MACS_peak_1814 54.49 -chr13 23855588 23856068 MACS_peak_2092 126.79 -chr18 83456033 83456433 MACS_peak_4211 68.08 -chr10 14425071 14425602 MACS_peak_620 94.14 -chr13 41582171 41582615 MACS_peak_2143 65.06 -chr10 94786191 94786440 MACS_peak_905 106.55 -chr8 109816960 109817536 MACS_peak_8026 135.33 
-chr12 81878967 81879611 MACS_peak_1912 119.33 -chr7 26059620 26059929 MACS_peak_7250 118.92 -chr4 62176380 62176995 MACS_peak_5799 54.70 -chr18 53436462 53436882 MACS_peak_4104 79.74 -chr11 51502212 51502759 MACS_peak_1164 56.31 -chr19 37281848 37282416 MACS_peak_4379 68.11 -chr16 92710525 92711133 MACS_peak_3535 77.20 -chr3 120874131 120875192 MACS_peak_5483 293.69 -chr13 17696479 17697123 MACS_peak_2068 170.31 -chr19 46077680 46078113 MACS_peak_4423 65.79 -chr5 143682333 143682913 MACS_peak_6736 222.86 -chr1 154568166 154568685 MACS_peak_448 95.23 -chr9 81108308 81108891 MACS_peak_8430 145.29 -chr12 44270399 44270849 MACS_peak_1806 72.54 -chr5 106128761 106129096 MACS_peak_6489 54.49 -chr4 120903868 120904161 MACS_peak_5932 76.86 -chr3 68460731 68461147 MACS_peak_5234 137.12 -chr3 58329271 58329978 MACS_peak_5204 131.98 -chr4 151462216 151462538 MACS_peak_6188 74.40 -chr17 34059082 34059521 MACS_peak_3763 62.02 -chr15 76554091 76554392 MACS_peak_3159 72.02 -chr2 117013160 117013597 MACS_peak_4828 61.25 -chr6 8719505 8720279 MACS_peak_6785 237.86 -chr6 102485635 102486092 MACS_peak_7043 67.00 -chr18 65959921 65960294 MACS_peak_4153 65.80 -chr11 3308916 3309259 MACS_peak_1025 175.63 -chr11 44333429 44333838 MACS_peak_1139 94.32 -chr1 9933866 9934292 MACS_peak_4 151.02 -chr2 49701652 49702023 MACS_peak_4700 86.63 -chr5 147072457 147073005 MACS_peak_6754 70.79 -chr14 21993252 21993662 MACS_peak_2423 117.33 -chr14 76186318 76187158 MACS_peak_2750 90.56 -chr3 139131096 139131484 MACS_peak_5580 50.20 -chr7 26466797 26467478 MACS_peak_7255 106.79 -chr1 123464347 123464745 MACS_peak_335 70.93 -chr7 132615669 132615970 MACS_peak_7605 72.43 -chr10 86169149 86169550 MACS_peak_867 103.08 -chr13 49863349 49863865 MACS_peak_2174 74.46 -chr7 86508733 86509023 MACS_peak_7431 62.85 -chr11 77378344 77378754 MACS_peak_1326 82.92 -chr6 86634390 86634683 MACS_peak_6980 103.74 -chr10 116871908 116872390 MACS_peak_961 57.59 -chr14 41750272 41750682 MACS_peak_2513 95.75 -chr4 151469965 
151470437 MACS_peak_6189 78.61 -chr10 60875630 60876040 MACS_peak_740 67.36 -chr6 91572361 91572723 MACS_peak_7021 70.91 -chr6 128795347 128795755 MACS_peak_7131 94.41 -chr18 58701674 58702163 MACS_peak_4120 87.63 -chr5 110698548 110699046 MACS_peak_6511 143.30 -chrX 93325546 93325822 MACS_peak_8637 91.45 -chr7 134432899 134433190 MACS_peak_7644 106.55 -chr12 29320679 29321102 MACS_peak_1769 54.01 -chr10 20124963 20125579 MACS_peak_639 217.11 -chr3 106416991 106417459 MACS_peak_5432 62.73 -chr10 33739040 33739699 MACS_peak_678 218.19 -chr8 74878095 74878556 MACS_peak_7883 94.87 -chr11 83382803 83383271 MACS_peak_1395 68.88 -chr3 58862746 58863399 MACS_peak_5208 157.00 -chr15 77685670 77686204 MACS_peak_3162 62.73 -chr14 86615475 86615825 MACS_peak_2782 87.58 -chr6 124867137 124867557 MACS_peak_7105 62.38 -chr13 55514580 55515153 MACS_peak_2199 117.94 -chr5 102254699 102255015 MACS_peak_6477 60.56 -chr2 25923796 25924257 MACS_peak_4561 56.91 -chr3 152061055 152061395 MACS_peak_5639 51.22 -chr6 86334798 86335187 MACS_peak_6976 60.17 -chr17 53706687 53707067 MACS_peak_3872 69.57 -chr7 18595991 18596427 MACS_peak_7225 65.59 -chr2 166445552 166446686 MACS_peak_5044 237.97 -chrX 12567874 12568307 MACS_peak_8607 92.18 -chr9 57187655 57188087 MACS_peak_8309 150.82 -chr3 96553930 96554350 MACS_peak_5363 58.24 -chr11 49794755 49795218 MACS_peak_1156 97.10 -chr12 86453801 86454179 MACS_peak_1931 69.72 -chr11 113544835 113545435 MACS_peak_1622 189.84 -chr14 14820991 14821374 MACS_peak_2395 50.82 -chr11 115936845 115937272 MACS_peak_1653 53.75 -chr9 66358765 66359165 MACS_peak_8375 66.29 -chr3 14944703 14945194 MACS_peak_5112 88.33 -chr4 147514611 147514949 MACS_peak_6139 51.38 -chr2 154429029 154429495 MACS_peak_4972 62.73 -chr17 21082204 21082596 MACS_peak_3632 82.04 -chr17 14304400 14304838 MACS_peak_3625 65.46 -chr5 65288018 65288287 MACS_peak_6358 72.51 -chr2 26518695 26519103 MACS_peak_4576 63.21 -chr1 9272967 9273590 MACS_peak_2 182.17 -chr9 79658076 79658784 
MACS_peak_8427 64.31 -chr12 100872379 100872633 MACS_peak_1969 57.15 -chr2 128684574 128684999 MACS_peak_4881 74.42 -chr2 154558527 154559018 MACS_peak_4975 69.69 -chr19 61171106 61171535 MACS_peak_4446 66.05 -chr6 88431783 88432112 MACS_peak_7003 73.77 -chr3 136303454 136303877 MACS_peak_5562 56.77 -chr12 71328424 71329097 MACS_peak_1859 114.45 -chr6 72059443 72059831 MACS_peak_6930 67.17 -chr7 26514419 26514743 MACS_peak_7262 83.43 -chr12 88174557 88175940 MACS_peak_1945 236.16 -chr17 29530544 29530849 MACS_peak_3735 56.94 -chr10 80895647 80896054 MACS_peak_845 153.53 -chr7 82925765 82926126 MACS_peak_7421 129.84 -chr13 105607134 105607556 MACS_peak_2347 50.79 -chr10 19311634 19312299 MACS_peak_631 73.24 -chr11 113667257 113667748 MACS_peak_1624 426.44 -chr17 29301427 29301965 MACS_peak_3729 91.69 -chr3 28680019 28680490 MACS_peak_5135 89.03 -chr12 70548827 70549327 MACS_peak_1849 143.57 -chr9 57758663 57758985 MACS_peak_8322 57.05 -chr12 55963541 55963840 MACS_peak_1831 67.73 -chr10 80814336 80815092 MACS_peak_838 73.53 -chr9 106106639 106107165 MACS_peak_8487 92.77 -chr18 31948500 31948917 MACS_peak_4044 115.28 -chr3 97817136 97817796 MACS_peak_5372 108.56 -chr5 115608262 115608636 MACS_peak_6540 61.95 -chr1 92677004 92677359 MACS_peak_270 56.79 -chr4 154536407 154536875 MACS_peak_6204 179.88 -chr11 78866726 78867449 MACS_peak_1362 160.60 -chr1 58502152 58502677 MACS_peak_120 107.68 -chr1 78654241 78654789 MACS_peak_210 256.08 -chr17 91266249 91266654 MACS_peak_3987 67.72 -chr10 80392804 80393085 MACS_peak_829 60.79 -chr9 123168920 123169217 MACS_peak_8575 89.95 -chr18 64648305 64648906 MACS_peak_4135 50.43 -chr16 45025492 45026112 MACS_peak_3450 84.01 -chr11 120459679 120460020 MACS_peak_1701 71.31 -chr13 19606884 19607473 MACS_peak_2072 72.34 -chr7 134324277 134324565 MACS_peak_7637 91.45 -chr12 32908423 32908906 MACS_peak_1781 89.00 -chr15 24842100 24842543 MACS_peak_2955 75.93 -chr18 6489913 6490350 MACS_peak_4007 97.14 -chr14 122247568 122247901 
MACS_peak_2881 54.94 -chr10 98856446 98856735 MACS_peak_921 64.17 -chr1 182741146 182741385 MACS_peak_566 91.45 -chr4 45045503 45045718 MACS_peak_5770 81.78 -chr1 95531960 95532820 MACS_peak_290 140.79 -chr2 152643374 152643779 MACS_peak_4954 153.80 -chr10 79317823 79318310 MACS_peak_795 114.99 -chr5 106146989 106147692 MACS_peak_6490 267.90 -chr4 34634278 34634904 MACS_peak_5728 116.61
--- a/chipsequtil-master/examples/nib/test_batch_fasta.py Mon Mar 28 11:56:10 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,9 +0,0 @@ -from chipsequtil import get_org_settings, BEDFile -from chipsequtil.nib import NibDB -from pprint import pprint - -genome_dir = get_org_settings('mm9')['genome_dir'] -db = NibDB(nib_dirs=[genome_dir]) -fasta_headers,seqs = db.get_fasta_from_bed('shuffled_peaks.bed') - -pprint(seqs[:10])
--- a/chipsequtil-master/examples/nib/test_nib_db.py Mon Mar 28 11:56:10 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,47 +0,0 @@ -from chipsequtil import get_org_settings, BEDFile -from chipsequtil.nib import NibDB -from pprint import pprint - -# see `org_settings.py -h` for more info on get_org_settings(<organism>) function -genome_dir = get_org_settings('mm9')['genome_dir'] - -# NibDB is an interface to a collection of nib files, typically corresponding -# to chromosomes of a genome - -# example with only one nib file -print 'NibDB with a single nib file' -db = NibDB(nib_fns=[genome_dir+'/chr1.nib']) - -print 'NibDB info:' -pprint(dict(db.db_info)) - -# get a fasta record for some sequence -print 'Example fasta record: chr1:1e8-1e8+100' -print db.get_fasta('chr1',1e8,1e8+100) - -# get just the sequence -print 'Same example, only sequence:' -print db.get_seq('chr1',1e8,1e8+100) -print - - -# example with a directory of nib files -print 'NibDB with a directory of nib files' -db = NibDB(nib_dirs=[genome_dir]) - -# get a fasta record for some sequence -print 'Example fasta record: chr1:1e8-1e8+100' -print db.get_fasta('chr1',1e8,1e8+100) - -print 'Example fasta record: chr1:1e8-1e8+100' -print db.get_fasta('chr2',1e8,1e8+100) - -print 'Example fasta record: chr1:1e8-1e8+100' -print db.get_fasta('chrX',1e8,1e8+100) - - -# example of fetching all sequences from a bed file -fasta_headers,seqs = db.get_fasta_from_bed('shuffled_peaks.bed') - -print 'Num. peaks:',len(open('shuffled_peaks.bed').readlines()) -pprint(seqs[:10])
--- a/chipsequtil-master/examples/seq/test_chipsequtil_seq.py Mon Mar 28 11:56:10 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,19 +0,0 @@ -from StringIO import StringIO -from chipsequtil.seq import FASTAFile, FASTQFile - -fasta_str = StringIO(">seq1\nACATAGGGAT\n>seq2\nTTATNTAGATA\n") -fasta_f = FASTAFile(fasta_str) -print fasta_f.headers - -print "[r for r in fasta_f]", [r for r in fasta_f] -print "fasta_f['seq1']", fasta_f['seq1'] -print "fasta_f.headers", fasta_f.headers -print "fasta_f.sequences", fasta_f.sequences - -fastq_str = StringIO("@seq1\nACATAGGGAT\n+seq2\nY^_cccQYJQ\n@seq2\nTTATNTAGATA\n+seq2\nY^_cJcQQJQ") -fastq_f = FASTQFile(fastq_str) -print "[r for r in fastq_f]", [r for r in fastq_f] -print "fastq_f['seq1']", fastq_f['seq1'] -print "fastq_f.headers", fastq_f.headers -print "fastq_f.sequences", fastq_f.sequences -print "fastq_f.quals", fastq_f.quals
--- a/chipsequtil-master/scripts/THEME.sh Mon Mar 28 11:56:10 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,177 +0,0 @@ -#!/bin/bash - -THEME_EXE=/nfs/data/cwng/archive/cvEM.64/THEME_edit.py - -OPT_SPEC=' -{ -"NAME": "THEME.sh", -"DESC": "Run old THEME version", -"ARGS": ["FG_FASTA","BG_FASTA","HYP_FN","MARKOV"], -"OPTS": { - "CV":{"LONG":"--cv","DEFAULT":5,"TYPE":"int","HELP":"number of cross validation folds [default:%default]"}, - "NOREFINE":{"LONG":"--no-refine","ACTION":"store_true","HELP":"do not run with refinement"}, - "BETA":{"LONG":"--beta","DEFAULT":0.7,"TYPE":"float","HELP":"beta parameter to use [default:%default]"}, - "DELTA":{"LONG":"--delta","DEFAULT":0.001,"TYPE":"float","HELP":"delta parameter to use [default:%default]"}, - "RANDOMIZE":{"LONG":"--randomization","ACTION":"store_true","HELP":"run randomization"}, - "MOTIF_FN":{"LONG":"--motif-file","DEFAULT":"dummy.out","HELP":"filename to write motif results to [default:%default]"}, - "OUTPUT_FN":{"LONG":"--output-filename","DEFAULT":"dummy.txt","HELP":"filename to write motif results to [default:%default]"}, - "RANDOM_FN":{"LONG":"--random-output","DEFAULT":"random.txt","HELP":"filename to write motif results to [default:%default]"}, - "DUMP":{"LONG":"--dump","ACTION":"store_true","HELP":"dump categtories to file"}, - "REM_COM":{"LONG":"--remove-common","ACTION":"store_true","HELP":"remove common sequences from analysis"}, - "NOPARALLEL":{"LONG":"--no-parallelize","ACTION":"store_true","HELP":"do not use wqsub.py for parallelization"}, - "INTERACTIVE":{"LONG":"--interactive","ACTION":"store_true","HELP":"run the script interactively"}, - "HYP_INDS":{"LONG":"--hyp-indices","DEFAULT":"ALL","HELP":"0-based indices of hypotheses to run [default: %default]"}, - "VERBOSE":{"SHORT":"-v","LONG":"--verbose","ACTION":"store_true","HELP":"print out the commands that are being run"}, - "TRIALS":{"LONG":"--trials","HELP":"this option is here only for backwards compatibility with THEME.py"} - } 
-}' -OUTPUT=$(echo $OPT_SPEC | getopts.py --shell=bash -- $@) -GETOPTS_RET=$? -if [ $GETOPTS_RET -ne 0 ]; then - exit 1 -fi -$OUTPUT - -INTERACTIVE_FLAG="--auto" -if [ $INTERACTIVE != "None" ]; then - INTERACTIVE_FLAG= -fi - -eval "$(steplist.py $INTERACTIVE_FLAG -t "Run THEME" THEME "Wait for jobs" "Combine results")" - -# run THEME -OUTDIR=THEME_data -test \! -e $OUTDIR && mkdir $OUTDIR - -WQSUB_EXE="wqsub.py" -if [ $NOPARALLEL != "None" ]; then - WQSUB_EXE= -fi - -RANDOMIZE_FLAG= -if [ $RANDOMIZE != "None" ]; then - RANDOMIZE_FLAG="-randomization" -fi - -RC= -if [ $RC ]; then - RC='-rc' -fi - -if [ $HYP_INDS != "ALL" ]; then - HYP_INDS=$(parse_steplist.py $HYP_INDS) - HYP_INDS_STATUS=$? - if [ $HYP_INDS_STATUS != 0 ]; then - echo "Incorrectly formatted argument to --hyp-indices option, aborting" - exit $HYP_INDS_STATUS - fi -else - NUM_HYPS=`grep -c '^Source' $HYP_FN` - NUM_HYPS=$(($NUM_HYPS-1)) - HYP_INDS=$(seq 0 $NUM_HYPS) -fi - -JOBIDS= -next_step && \ -for i in $HYP_INDS -do - - WQSUB= - REDIRECT= - if [ ! -z $WQSUB_EXE ]; then - WQSUB="$WQSUB_EXE --wqsub-name=THEME_$i" - fi - - OUTPRE=$OUTDIR/$i - - CMD="$WQSUB python $THEME_EXE $FG_FASTA $BG_FASTA $i \ - -fse $HYP_FN -markov $MARKOV -cv $CV -beta $BETA \ - -delta $DELTA -motif_file $OUTPRE.tamo -out_file $OUTPRE.txt \ - $RC" - JOBID=$($WQSUB $CMD) - JOBIDS="$JOBID $JOBIDS" - if [ $VERBOSE != "None" ]; then - echo $WQSUB $CMD - fi - - if [ $RANDOMIZE != "None" ]; then - - WQSUB="$WQSUB_EXE --wqsub-name=THEME_rand_$i" - - CMD="$WQSUB python $THEME_EXE $FG_FASTA $BG_FASTA $i \ - -fse $HYP_FN -markov $MARKOV -cv $CV -beta $BETA \ - -delta $DELTA -out_file ${OUTPRE}_rand_output.txt \ - -random_file ${OUTPRE}_rand.txt $RC -randomization" - - JOBID=$($WQSUB $CMD) - JOBIDS="$JOBID $JOBIDS" - - if [ $VERBOSE != "None" ]; then - echo $WQSUB $CMD -randomization - fi - fi - -done - - -# wait for jobs -next_step && wait_for_jobid.py $JOBIDS - -# compile results -next_step -DO_COMPILE=$? 
-if [ $DO_COMPILE == 0 ]; then - - rm -f $MOTIF_FN && touch $MOTIF_FN - ( - cd $OUTDIR - ls *.tamo | sort -n | xargs -n1 -I{} -t cat {} >> ../$MOTIF_FN - ) - - if [ $NOPARALLEL == "None" ]; then - mv -f *.{err,out} THEME_data - fi - - if [ $RANDOMIZE != "None" ]; then - rm -f $RANDOM_FN && touch $RANDOM_FN - ( - cd $OUTDIR - for ind in $HYP_INDS - do - out_fn="${ind}_rand.txt" - echo "Consolidating $out_fn" - python >> ../$RANDOM_FN << EOF -import re -import sys - -from TAMO.MotifTools import load - -ind = re.match('(\d+)',"$out_fn").group(1) - -motif = load("$HYP_FN")[int(ind)] - -src = motif.source.split() -if len(src) == 0 : - print 'Got weird motif source: %s\n'%src -src = src[0]+'_%s'%ind - -mot_str = str(motif) - -cverrs = [] -for l in open("$out_fn") : - m = re.match("trial: \d+ mean test error: (\d+\.\d+)$",l) - if m is not None : - cverrs.append(float(m.group(1))) - -print "\t".join([src,mot_str,str(sum(cverrs)/len(cverrs)),repr(cverrs)]) -sys.stdout.flush() - -EOF - done - - ) - - compile_THEME_results.py $MOTIF_FN $RANDOM_FN --output=$OUTPUT_FN - - fi -fi
--- a/chipsequtil-master/scripts/build_chipseq_infosite.py Mon Mar 28 11:56:10 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,675 +0,0 @@ -#!/usr/bin/env python - -import getpass -import glob -import json -import matplotlib -matplotlib.use('AGG') -import matplotlib.pyplot as mp -import os -import re -import shutil -import sys - -from collections import defaultdict -from csv import reader, writer, DictReader -from math import log -from optparse import OptionParser -from subprocess import call - -from chipsequtil import MACSFile, get_org_settings -from reStUtil import * - -usage = '%prog [options] [<peak filename> <peak filename> ...]' -parser = OptionParser(usage=usage) -parser.add_option('-d','--dir',dest='dir',default='.',help='Source directory [default: %default]') -parser.add_option('-n','--name',dest='name',help='Experiment name [default: current directory name]') -parser.add_option('--skip-motif-scan',dest='skip_motif_scan',action='store_true',help="skip motif_scan.py, but still build motifs into document (assumes motif_scan.py was previously run)") -parser.add_option('--skip-motif-stuff',dest='skip_motif_stuff',action='store_true',help="motif stuff takes a long time, manually skip it if no motif results are available or you don't care about them") - -{ - "experiment path": "/nfs/antdata/analysis/100809_P/100809_St7-10ul-p300_degenhar_Fraenkel_L2_mm9_sorted.bed", - "analysis path": "/net/ventral/nfs/people/labadorf/analysis/100809_P_St7_10ul", - "stage url": "http://fraenkel.mit.edu/stage/labadorf", - "peak files": { - "100809_P_St7_10ul_mfold10,30_pval1e-5": { - "total tags in control": 9331149, - "total tags in treatment": 10064908, - "Range for calculating regional lambda": "1000 bps and 10000 bps", - "tag size": 35, - "name": "100809_P_St7_10ul_mfold10,30_pval1e-5", - "model fold": "10,30", - "format": "BED", - "tags after filtering in treatment": 5099883, - "band width": 150, - "Redundant rate in control": 0.40999999999999998, - "Redundant 
rate in treatment": 0.48999999999999999, - "effective genome size": 2110000000.0, - "d": 145, - "maximum duplicate tags at the same position in control": 1, - "control file": "cntrl_6-3_sorted_filterbed.txt", - "MACS version": "1.4.0beta", - "ChIP-seq file": "exp_100809_St7-10ul-p300_degenhar_Fraenkel_L2_mm9_sorted.bed", - "tags after filtering in control": 5481613, - "maximum duplicate tags at the same position in treatment": 2, - "pvalue cutoff": 1.0000000000000001e-05 - } - }, - "format": "BED", - "FDR filter": "none", - "experiment name": "100809_P_St7_10ul", - "mapping type": "TSS", - "pipeline args": { - "--filter-peaks-args": "--sort-by=pvalue --top=200", - "--macs-args": "--mfold=10,30 --tsize=35 --bw=150 --format=BED --pvalue=1e-5", - "--map-args": "--tss --upstream-window=10000 --downstream-window=10000" - }, - "org": "mm9", - "control path": "/nfs/antdata/analysis/090828_42JVC/6-3/6-3_sorted_filterbed.txt", - "mapping window": [ - "10000", - "10000" - ], - "peaks used by THEME": "200", - "stage_dir": "/nfs/antdata/web_stage/labadorf" -} - -if __name__ == '__main__' : - - opts, args = parser.parse_args(sys.argv[1:]) - - exp_dir = os.path.abspath(opts.dir) - exp_name = opts.name if opts.name is not None else os.path.basename(exp_dir) - - # 1. find the param JSON file - param_json_fn = glob.glob('*params.json') - if len(param_json_fn) == 0 : - sys.stderr.write('Could not find parameter file, building one as best I can\n') - curr_user = getpass.getuser() - json_d = {'analysis path':os.getcwd(), - 'stage url':'http://fraenkel.mit.edu/stage/'+curr_user, - 'stage dir':'/nfs/antdata/web_stage/'+curr_user - } - else : - if len(param_json_fn) > 1 : - sys.stderr.write('Found more than one parameter file, picking the first one: %s\n'%','.join(param_json_fn)) - param_json_fn = param_json_fn[0] - json_d = json.load(open(param_json_fn)) - - # 2. 
make a new directory to save all the stuff - infosite_dir_name = exp_name+'_infosite' - infosite_path = os.path.join(os.getcwd(),infosite_dir_name) - if not os.path.exists(infosite_path) : - os.mkdir(infosite_path) - - infosite_img_path = os.path.join(infosite_path,'images') - if not os.path.exists(infosite_img_path) : - os.mkdir(infosite_img_path) - - # 3. setup web staging directory - stage_dir_path = os.path.join(json_d['stage dir'],infosite_dir_name) - if not os.path.exists(stage_dir_path) : - os.symlink(infosite_path,stage_dir_path) - - # 4. get the peaks files stats, don't want negative peaks - if len(args) == 0 : - peaks_fns = glob.glob('*_peaks.xls') - peaks_fns = filter(lambda x: 'negative' not in x,peaks_fns) - else : - peaks_fns = args - analysis_sets = [] - peak_json = json_d['peak files'] = {} - - # analyze all the peak files - for peak_fn in peaks_fns : - print 'processing:',peak_fn - macs_f = MACSFile(peak_fn) - peak_json[peak_fn] = macs_f.file_info - - # positive peaks - peak_stats = defaultdict(list) - num_peaks = 0 - pos_chr_dist = defaultdict(int) - for peak in macs_f : - pos_chr_dist[peak['chr']] += 1 - peak_stats['length'].append(peak['length']) - peak_stats['tags'].append(peak['tags']) - peak_stats['pvalue'].append(peak['-10*log10(pvalue)']) - peak_stats['fold_enrichment'].append(peak['fold_enrichment']) - peak_stats['fdr'].append(peak['FDR(%)']) - num_peaks += 1 - - peak_json[peak_fn]['positive peaks'] = num_peaks - peak_json[peak_fn]['reads under peaks'] = sum(peak_stats['tags']) - - # extract paired peaks info out of output.txt - output_fn = peak_json[peak_fn]['name']+'_output.txt' - output_regexes = ('#2 number of (paired peaks): (\d+)',) - for l in open(output_fn) : - for regex in output_regexes : - m = re.search(regex,l) - if m is not None : - peak_json[peak_fn][m.group(1)] = int(m.group(2)) - - # do the negative peaks - # negative peak file is now filtered - neg_peak_fns = glob.glob(peak_json[peak_fn]['name']+'_negative_peaks_*.xls') - 
- #TODO - do check for file exists - if neg_peak_fns : - neg_peak_fn = neg_peak_fns[0] - neg_peak_f = MACSFile(neg_peak_fn) - - neg_peak_stats = defaultdict(list) - num_peaks = 0 - neg_chr_dist = defaultdict(int) - for peak in neg_peak_f : - neg_chr_dist[peak['chr']] += 1 - neg_peak_stats['length'].append(peak['length']) - neg_peak_stats['tags'].append(peak['tags']) - neg_peak_stats['pvalue'].append(peak['-10*log10(pvalue)']) - neg_peak_stats['fold_enrichment'].append(peak['fold_enrichment']) - neg_peak_stats['fdr'].append(peak['FDR(%)']) - num_peaks += 1 - - peak_json[peak_fn]['negative peaks'] = num_peaks - peak_json[peak_fn]['reads under negative peaks'] = sum(peak_stats['tags']) - else : - peak_json[peak_fn]['negative peaks'] = 'NA' - peak_json[peak_fn]['reads under negative peaks'] = 'NA' - - # save the track lines - ucsc_track_fn = peak_json[peak_fn]['name']+'_MACS_wiggle_tracks.txt' - if os.path.exists(ucsc_track_fn) : - peak_json[peak_fn]['ucsc tracks'] = open(ucsc_track_fn).readlines() - - font = {'size':'9'} - mp.rc('font',**font) - - figsize = (3.5,3.5) - subplots_sizes = {'top':0.8,'left':0.15,'right':0.95} - hist_labels = ('+ peaks','- peaks') - # create histograms for each of the attributes - len_hist_name = macs_f.file_info['name']+'_length.png' - len_hist_fn = os.path.join(infosite_img_path,len_hist_name) - len_hist_url = json_d['stage url']+'/'+infosite_dir_name+'/images/'+len_hist_name - peak_json[peak_fn]['length distribution url'] = len_hist_url - mp.figure(figsize=figsize) - mp.subplots_adjust(**subplots_sizes) - mp.hist((peak_stats['length'],neg_peak_stats['length']),label=hist_labels,bins=20,log=True) - mp.title('%s\npeak length distribution'%macs_f.file_info['name']) - mp.xlabel('peak length') - mp.ylabel('# peaks') - mp.legend() - mp.savefig(len_hist_fn) - mp.clf() - - tags_hist_name = macs_f.file_info['name']+'_tags.png' - tags_hist_fn = os.path.join(infosite_img_path,tags_hist_name) - tags_hist_url = json_d['stage 
url']+'/'+infosite_dir_name+'/images/'+tags_hist_name - peak_json[peak_fn]['tag distribution url'] = tags_hist_url - mp.figure(figsize=figsize) - mp.subplots_adjust(**subplots_sizes) - mp.hist((peak_stats['tags'],neg_peak_stats['tags']),label=hist_labels,bins=20,log=True) - mp.title('%s\npeak tag count distribution'%macs_f.file_info['name']) - mp.xlabel('# tags') - mp.ylabel('# peaks') - mp.legend() - mp.savefig(tags_hist_fn) - mp.clf() - - pval_hist_name = macs_f.file_info['name']+'_pval.png' - pval_hist_fn = os.path.join(infosite_img_path,pval_hist_name) - pval_hist_url = json_d['stage url']+'/'+infosite_dir_name+'/images/'+pval_hist_name - peak_json[peak_fn]['pvalue distribution url'] = pval_hist_url - mp.figure(figsize=figsize) - mp.subplots_adjust(**subplots_sizes) - mp.hist((peak_stats['pvalue'],neg_peak_stats['pvalue']),label=hist_labels,bins=20,log=True) - mp.title('%s\npeak -10*log10(p-valuek) distribution'%macs_f.file_info['name']) - mp.xlabel('-10*log10(p-value)') - mp.ylabel('# peaks') - mp.legend() - mp.savefig(pval_hist_fn) - mp.clf() - - fold_hist_name = macs_f.file_info['name']+'_fold.png' - fold_hist_fn = os.path.join(infosite_img_path,fold_hist_name) - fold_hist_url = json_d['stage url']+'/'+infosite_dir_name+'/images/'+fold_hist_name - peak_json[peak_fn]['fold distribution url'] = fold_hist_url - mp.figure(figsize=figsize) - mp.subplots_adjust(**subplots_sizes) - mp.hist((peak_stats['fold_enrichment'],neg_peak_stats['fold_enrichment']),label=hist_labels,bins=20,log=True) - mp.title('%s\npeak fold enrichment distribution'%macs_f.file_info['name']) - mp.xlabel('fold enrichment') - mp.ylabel('# peaks') - mp.legend() - mp.savefig(fold_hist_fn) - mp.clf() - - fdr_hist_name = macs_f.file_info['name']+'_fdr.png' - fdr_hist_fn = os.path.join(infosite_img_path,fdr_hist_name) - fdr_hist_url = json_d['stage url']+'/'+infosite_dir_name+'/images/'+fdr_hist_name - peak_json[peak_fn]['fdr distribution url'] = fdr_hist_url - mp.figure(figsize=figsize) - 
mp.subplots_adjust(**subplots_sizes) - mp.hist(peak_stats['fdr'],label=hist_labels[0],bins=20,log=True) - mp.title('%s\npeak fdr distribution'%macs_f.file_info['name']) - mp.xlabel('fdr') - mp.ylabel('# peaks') - mp.legend() - mp.savefig(fdr_hist_fn) - mp.clf() - - chr_dist_name = macs_f.file_info['name']+'_chr_dist.png' - chr_dist_fn = os.path.join(infosite_img_path,chr_dist_name) - chr_dist_url = json_d['stage url']+'/'+infosite_dir_name+'/images/'+chr_dist_name - peak_json[peak_fn]['chr distribution url'] = chr_dist_url - chromos = [] - if json_d.has_key('org') : - chr_sizes_fn = get_org_settings(json_d['org'])['ucsc_chrom_sizes'] - chromos = [r[0] for r in reader(open(chr_sizes_fn),delimiter='\t')] - else : - chromos = list(set(pos_chr_dist.keys()).union(neg_chr_dist.keys())) - standard_chromos = filter(lambda x: re.search('^chr[0-9MXY]+$',x) is not None,chromos) - - # hack chrM, chrX and chrY so they sort right - if 'chrM' in standard_chromos : - standard_chromos[standard_chromos.index('chrM')] = 'chr100' - if 'chrX' in standard_chromos : - standard_chromos[standard_chromos.index('chrX')] = 'chr101' - if 'chrY' in standard_chromos : - standard_chromos[standard_chromos.index('chrY')] = 'chr102' - - standard_chromos.sort(key=lambda x: int(x.replace('chr',''))) - - # unhack chrM, chrX and chrY so they display right - if 'chr100' in standard_chromos : - standard_chromos[standard_chromos.index('chr100')] = 'chrM' - if 'chr101' in standard_chromos : - standard_chromos[standard_chromos.index('chr101')] = 'chrX' - if 'chr102' in standard_chromos : - standard_chromos[standard_chromos.index('chr102')] = 'chrY' - - other_chromos = filter(lambda x: re.search('^chr[0-9MXY]+$',x) is None,chromos) - - pos_plot_chr_dist = defaultdict(int) - neg_plot_chr_dist = defaultdict(int) - for chrom in standard_chromos : - pos_plot_chr_dist[chrom] += pos_chr_dist.get(chrom,0) - neg_plot_chr_dist[chrom] += neg_chr_dist.get(chrom,0) - for chrom in other_chromos : - 
pos_plot_chr_dist['Other'] += pos_chr_dist.get(chrom,0) - neg_plot_chr_dist['Other'] += neg_chr_dist.get(chrom,0) - chromos.append('Other') - mp.figure(figsize=figsize) - mp.subplots_adjust(bottom=0.18,**subplots_sizes) - mp.bar(range(len(chromos)), - [pos_plot_chr_dist[k] for k in chromos], - width=0.45, - color='b', - label='Positive' - ) - mp.bar([x+0.45 for x in range(len(chromos))], - [neg_plot_chr_dist[k] for k in chromos], - width=0.45, - color='g', - label='Negative' - ) - mp.xticks([x+0.45 for x in range(len(chromos))],chromos,rotation=90) - mp.title('%s\nPeaks by chromosome'%macs_f.file_info['name']) - mp.xlabel('Chromosome') - mp.ylabel('# peaks') - mp.legend() - mp.savefig(chr_dist_fn) - mp.clf() - - # pos vs neg peaks - pos_v_neg_name = '%s_pos_v_neg.png'%macs_f.file_info['name'] - pos_v_neg_fn = os.path.join(infosite_img_path,pos_v_neg_name) - pos_v_neg_url = json_d['stage url']+'/'+infosite_dir_name+'/images/'+pos_v_neg_name - peak_json[peak_fn]['pos v neg url'] = pos_v_neg_url - cmd = 'plot_pos_vs_neg_peaks.py --output=%s %s %s'%(pos_v_neg_fn,peak_fn, neg_peak_fn) - sys.stderr.write(cmd+'\n') - r = call(cmd,shell=True) - - # motif stuff - if opts.skip_motif_scan or opts.skip_motif_stuff : - sys.stderr.write('Obediently skipping motif stuff\n') - else : - # not exactly sure the best way to find the filtered macs file yet, - # just take the .xls file with the longest filename? 
- filtered_peak_fns = glob.glob('%s_peaks_*'%macs_f.file_info['name']) - filtered_peak_fns.sort(key=lambda x: len(x),reverse=True) - filtered_peak_fn = filtered_peak_fns[0] - - motif_results_fns = glob.glob('%s_motifs_beta*_cv*[0-9].tamo'%macs_f.file_info['name']) - motif_results_fn = motif_results_fns[0] - #TODO - do check for file exists - - # motif_scan.py <org> <peak fn> <TAMO motif fn> - fixed_peak_width = '' - if json_d['fixed peak width'] != 'none' : - fixed_peak_width = '--fixed-peak-width=%s'%json_d['fixed peak width'] - - cmd = 'motif_scan.py %s --dir=%s/images/ %s %s %s' - cmd = cmd%(fixed_peak_width,infosite_dir_name,json_d['org'],filtered_peak_fn,motif_results_fn) - sys.stderr.write(cmd+'\n') - call(cmd,shell=True) - - # pot_peaks_vs_motifs.py <peaks fn> <seq score fn> <bg score fn> - - - # 5. build reSt document - reSt_fn = exp_name+'_info.rst' - reSt_path = os.path.join(infosite_path,reSt_fn) - reSt_html_name = exp_name+'_info.html' - reSt_html_path = os.path.join(infosite_path,reSt_html_name) - reSt_url = json_d['stage url'] + '/' + infosite_dir_name + '/' + reSt_html_name - doc = ReStDocument(reSt_path) - doc.add(ReStSection("Infopage for %s"%exp_name)) - - # basic experiment stats table - ident = lambda x: x or 'unknown' - stat_key_labels_fmts = [ - ('org','Organism',ident), - ('analysis path','Analysis Path',ident), - ('experiment path','Experiment Path',ident), - ('control path','Control Path',ident), - ('format','Read Format',ident), - ('FDR filter','FDR filter',ident), - ('mapping type','Gene Mapping Type',ident), - ('mapping window','Gene Mapping Window',lambda x: x and '-%s,%s'%tuple(x)), - ('peaks used by THEME','Peaks used by THEME',ident) - ] - stat_rows = [('**%s**'%label, fmt(json_d.get(key))) for key,label,fmt in stat_key_labels_fmts] - doc.add(ReStSimpleTable(None,stat_rows)) - - doc.add(ReStSection('MACS Peak File Stats',level=2)) - - # go through peak files - peak_recs = json_d['peak files'] - fl_str = lambda x: x and 
'%.2g'%float(x) - stat_key_labels_fmts = [ - ('paired peaks','*paired peaks*',ident), - ('positive peaks','*positive peaks*',ident), - ('negative peaks','*negative peaks*',ident), - ('reads under peaks','*reads under positive peaks*',ident), - ('total tags in treatment','*Treatment Tags*',ident), - ('tags after filtering in treatment','after filtering',ident), - ('Redundant rate in treatment','redunancy rate',fl_str), - ('maximum duplicate tags at the same position in treatment','max dup. tags',ident), - ('total tags in control','*Control Tags*',ident), - ('tags after filtering in control','after filtering',ident), - ('Redundant rate in control','redunancy rate',fl_str), - ('maximum duplicate tags at the same position in control','max dup. tags',ident), - ('peak tag count filter','*Minimum peak tag count*',ident), - ('d','*MACS d*',ident), - ('band width','*band width*',ident), - ('MACS version','*MACS version*',ident), - ('pvalue cutoff','*p-value cutoff*',lambda x: '1e%d'%int(log(x,10))), - ] - - for peak_fn,peak_stats in peak_recs.items() : - - # add the new section and stats table - doc.add(ReStSection(peak_fn,level=3)) - stat_rows = [('*%s*'%label, fmt(peak_stats.get(key))) for key,label,fmt in stat_key_labels_fmts] - doc.add(ReStSimpleTable(None,stat_rows)) - - # link to the peaks file - peak_infosite_name = os.path.join(infosite_dir_name,peak_fn) - peak_infosite_path = os.path.abspath(peak_infosite_name) - peak_infosite_url = json_d['stage url'] + '/' + peak_infosite_name - call('cp %s %s'%(peak_fn,os.path.join(infosite_dir_name,peak_fn)),shell=True) - doc.add(ReStSimpleTable(None,[('**MACS Peaks File**','`%s`_'%peak_infosite_url)])) - doc.add(ReStHyperlink(peak_infosite_url,url=peak_infosite_url)) - - # UCSC track info - if peak_stats.has_key('ucsc tracks') : - ucsc_tbl = ReStSimpleTable(('**UCSC Genome Browser Track Lines**',), - [[x] for x in peak_stats['ucsc tracks']]) - doc.add(ucsc_tbl) - else : - doc.add(ReStSimpleTable(None,[['UCSC integration was 
not enabled for this experiment']])) - - # peak quality plots - img_tbl1 = ReStSimpleTable(None, [ - [ - ReStImage(peak_stats['pos v neg url'],options={'width':'600px','align':'center'}), - ] - ] - ) - doc.add(img_tbl1) - - img_tbl2 = ReStSimpleTable(None, [ - [ - ReStImage(peak_stats['length distribution url'],options={'width':'250px','align':'center'}), - ReStImage(peak_stats['tag distribution url'],options={'width':'250px','align':'center'}), - ReStImage(peak_stats['pvalue distribution url'],options={'width':'250px','align':'center'}) - ], - [ - ReStImage(peak_stats['fold distribution url'],options={'width':'250px','align':'center'}), - ReStImage(peak_stats['fdr distribution url'],options={'width':'250px','align':'center'}), - ReStImage(peak_stats['chr distribution url'],options={'width':'250px','align':'center'}) - ] - ] - ) - doc.add(img_tbl2) - - # gene info - gene_fn = peak_stats['name']+'_genes.txt' - gene_link = os.path.join(infosite_dir_name,gene_fn) - if not os.path.exists(gene_link) : - shutil.copyfile(gene_fn,gene_link) - gene_url = json_d['stage url']+'/'+gene_link - - # gather other gene mapping stats - # knownGeneID - # geneSymbol - # chr - # start - # end - # length - # summit - # tags - # -10*log10(pvalue) - # fold_enrichment - # FDR(%) - # peak - # loc - # dist - # from - # feature - # score - # map - # type - # map - # subtype - - gene_reader = DictReader(open(gene_fn),delimiter='\t') - gene_stats = defaultdict(set) - gene_pvals = defaultdict(float) - for rec in gene_reader : - gene_stats['num knownGenes'].add(rec['knownGeneID']) - gene_stats['num geneSymbols'].add(rec['geneSymbol']) - gene_pvals[rec['geneSymbol']] = max(gene_pvals[rec['geneSymbol']],float(rec['-10*log10(pvalue)'])) - gene_pvals = gene_pvals.items() - gene_pvals.sort(key=lambda x: x[1],reverse=True) - for k,v in gene_pvals[:20]: - print k,v - gene_mapping_data = [('**# knownGenes mapped**',len(gene_stats['num knownGenes'])), - ('**# gene symbols mapped**',len(gene_stats['num 
geneSymbols'])), - ('**Top 10 gene symbols**',','.join([x[0] for x in gene_pvals[:10]])), - ('**All gene mappings**','`%s`_'%gene_url) - ] - - # plots from plot_peak_loc_dist.py - gene_pie_name = exp_name+'_gene_map.png' - peak_pie_name = exp_name+'_peak_map.png' - hist_name = exp_name+'_peak_dist.png' - pval_bar_name = exp_name+'_pval_bar.png' - peak_loc_d = {'out_dir':infosite_path, - 'gene_pie_fn':os.path.join(infosite_path,'images',gene_pie_name), - 'peak_pie_fn':os.path.join(infosite_path,'images',peak_pie_name), - 'pval_bar_fn':os.path.join(infosite_path,'images',pval_bar_name), - 'hist_fn':os.path.join(infosite_path,'images',hist_name), - 'peak_fn':peak_fn, - 'gene_name':gene_fn - } - cmd = 'plot_peak_loc_dist.py --save -d %(out_dir)s -g %(gene_pie_fn)s ' \ - '-p %(peak_pie_fn)s -f %(hist_fn)s -b %(pval_bar_fn)s ' \ - '%(peak_fn)s %(gene_name)s' - sys.stderr.write(cmd%peak_loc_d+'\n') - call(cmd%peak_loc_d,shell=True) - peak_stats['gene map url'] = json_d['stage url']+'/'+infosite_dir_name+'/images/'+gene_pie_name - peak_stats['peak map url'] = json_d['stage url']+'/'+infosite_dir_name+'/images/'+peak_pie_name - peak_stats['pval bar url'] = json_d['stage url']+'/'+infosite_dir_name+'/images/'+pval_bar_name - peak_stats['dist url'] = json_d['stage url']+'/'+infosite_dir_name+'/images/'+hist_name - - # make links to the different peaks files - feature_patts = ('promoter.txt','gene_exon.txt','gene_intron.txt','after.txt','intergenic.xls') - feature_data = [] - feature_urls = [] - - for patt in feature_patts : - feature_fn = '%s_*_%s'%(peak_stats['name'],patt) - feature_path = glob.glob(os.path.join(infosite_dir_name,feature_fn)) - if len(feature_path) == 0 : - sys.stderr.write('Warning: %s could not be found, skipping feature type\n'%os.path.join(infosite_dir_name,feature_fn)) - continue - feature_path = feature_path[0] - feature_url = json_d['stage url']+'/'+feature_path - - # create UCSC formatted versions of the files - if patt.endswith('.txt') : # these 
have gene columns - feature_type = patt.replace('.txt','') - ucsc_feature_fn = feature_fn.replace('.txt','_ucsc.txt') - st,en = 2,4 - elif patt.endswith('.xls') : - feature_type = patt.replace('.xls','') - ucsc_feature_fn = feature_fn.replace('.xls','_ucsc.xls') - st,en = 0,2 - - ucsc_feature_path = os.path.join(infosite_dir_name,ucsc_feature_fn) - ucsc_feature_f = open(ucsc_feature_path,'w') - ucsc_feature_writer = writer(ucsc_feature_f,delimiter='\t') - for l in reader(open(feature_path),delimiter='\t') : - rec = l[0:st] + \ - ['%s:%s-%s'%tuple(l[st:en+1])] + \ - l[en+1:] - ucsc_feature_writer.writerow(rec) - ucsc_feature_f.close() - - ucsc_feature_url = json_d['stage url']+'/'+ucsc_feature_path - - feature_data.append(('**%s peaks**'%feature_type,'`%s`_ `UCSC %s`_'%(feature_url,feature_type))) - feature_urls.append(ReStHyperlink(feature_url,url=feature_url)) - feature_urls.append(ReStHyperlink('UCSC %s'%feature_type,url=ucsc_feature_url)) - - gene_mapping_data.extend(feature_data) - feat_tbl = ReStSimpleTable(('**Gene mapping data**',''),gene_mapping_data) - doc.add(feat_tbl) - doc.add(ReStHyperlink(gene_url,url=gene_url)) - for url in feature_urls : - doc.add(url) - - img_tbl3 = ReStSimpleTable(None, [ - [ - ReStImage(peak_stats['gene map url'],options={'align':'center'}), - ReStImage(peak_stats['peak map url'],options={'align':'center'}) - ], - [ - ReStImage(peak_stats['pval bar url'],options={'align':'center'}), - ReStImage(peak_stats['dist url'],options={'align':'center'}) - ] - ] - ) - doc.add(img_tbl3) - - # now put some motif stuff up there - - - if opts.skip_motif_stuff : - sys.stderr.write('Obediently skipping even more motif stuff\n') - else : - # THEME refines all motifs, display the top 30 - - # for now, just list a table of the top 30 significant, unrefined motifs - doc.add(ReStSection('%s Top 30 Refined Motif Results'%peak_stats['name'],level=3)) - motif_results_fns = glob.glob('%s_motifs_beta*_cv*[0-9].txt'%macs_f.file_info['name']) 
#catRun_mfold10,30_pval1e-5_motifs_beta0.0_cv5.txt - #TODO - do check for file exists - - motif_results_fn = motif_results_fns[0] - - motif_reader = reader(open(motif_results_fn),delimiter='\t') - - motif_header = motif_reader.next() - motif_data = [] - top_n = 30 - motif_fmts = (ident,ident,int,fl_str,fl_str,fl_str,fl_str,fl_str,fl_str) - motif_plot_urls = [] - for rec in motif_reader : - motif_data.append([f(x) for f,x in zip(motif_fmts,rec)]) - """ - if rec[2] in motif_sig_inds_d.keys() : - from_id = motif_sig_inds_d[rec[2]] - try : - old_id_fn = glob.glob(infosite_dir_name+'/images/*_%d_peakmot.png'%from_id)[0] - new_id_fn = old_id_fn.replace('_%d_'%from_id,'_%s_'%rec[2]) - os.rename(old_id_fn,new_id_fn) - except : - sys.stderr.write("Couldn't rename file for pattern %s, just " \ - "assuming its there\n"%(infosite_dir_name+'/images/*_%d_peakmot.png'%from_id)) - """ - new_id_fn = glob.glob(infosite_dir_name+'/images/*_%s_peakmot.png'%rec[2])[0] - motif_plot_urls.append(json_d['stage url']+'/'+new_id_fn) - - doc.add(ReStSimpleTable(['**%s**'%x for x in motif_header],motif_data[:top_n])) - - # create another file with the full table - motif_results_base, motif_results_ext = os.path.splitext(motif_results_fn) - motif_doc_fn = motif_results_base+'.rst' - motif_doc_path = os.path.join(infosite_path,motif_doc_fn) - motif_doc_html_fn = motif_results_base+'.html' - motif_doc_html_path = os.path.join(infosite_path,motif_doc_html_fn) - motif_doc_url = json_d['stage url']+'/'+infosite_dir_name+'/'+motif_doc_html_fn - motif_doc = ReStDocument(motif_doc_path) - motif_doc.add(ReStSection('%s Full Motif Results'%peak_stats['name'])) - motif_doc.add('`Back to main infopage`_') - motif_doc.add(ReStSimpleTable(['**%s**'%x for x in motif_header],motif_data)) - motif_doc.add('`Back to main infopage`_') - motif_doc.add(ReStHyperlink('Back to main infopage',url=reSt_url)) - motif_doc.write() - motif_doc.close() - rst2html_call = 'rst2html.py 
--stylesheet-path=/nfs/antdata/web_stage/css/lsr.css ' \ - '%s %s'%(motif_doc_path,motif_doc_html_path) - sys.stderr.write(rst2html_call+'\n') - r = call(rst2html_call,shell=True) - doc.add('`All refined motifs`_') - doc.add(ReStHyperlink('All refined motifs',url=motif_doc_url)) - - # individual motif plots - plt_tbl = [] - for i,url in enumerate(motif_plot_urls[:30]) : - if i%3 == 0 : - plt_tbl.append([]) - plt_tbl[-1].append(ReStImage(url)) - - doc.add(ReStSimpleTable(('**Peak strength vs refined motif strength**','(based on top 2000 peak sequences by pvalue)',''),plt_tbl)) - - doc.write() - doc.close() - - # 6. convert reSt to PDF and HTML - rst2html_call = 'rst2html.py --stylesheet-path=/nfs/antdata/web_stage/css/lsr.css ' \ - '%s %s'%(reSt_path,reSt_html_path) - sys.stderr.write(rst2html_call+'\n') - r = call(rst2html_call,shell=True) - - pdf_name = exp_name+'_info.pdf' - pdf_path = os.path.join(infosite_path,pdf_name) - r = call('rst2pdf %s -o %s'%(reSt_path,pdf_path),shell=True) - - # 7. write out url to infosite - print json_d['stage url']+'/'+infosite_dir_name+'/'+reSt_html_name - open(infosite_dir_name+'_url.txt','w').write(json_d['stage url']+'/'+infosite_dir_name+'/'+reSt_html_name+'\n')
--- a/chipsequtil-master/scripts/chipseq_pipeline.py Mon Mar 28 11:56:10 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,331 +0,0 @@ -#!/usr/bin/env python - -import os -from subprocess import Popen, PIPE -import string -import sys -from optparse import OptionParser, OptionGroup, SUPPRESS_HELP - -from pypeline import Pypeline, ProcessPypeStep as PPS, PythonPypeStep as PyPS, parse_steplist -from chipsequtil import get_file_parts, get_org_settings -from chipsequtil.util import MultiLineHelpFormatter -from TAMO import MotifTools -from TAMO.MD.THEME import parser as theme_parser - -usage = "%prog [options] <organism> <experiment alignment filename> [<control alignment filename>]" -description = """1st generation ChIPSeq analysis pipeline: - - - runs MACS to find peaks and sorts peaks by p-value - - sorts peaks by pvalue and isolates top *n* - - maps peaks to genes - - extracts fasta files for gene peaks in experiments - - constructs background sequences matching foreground distribution - - runs THEME.py on input sequences w/ refinement - - builds an infosite with stats from this analysis - -Control input file is optional. 
*organism* argument is passed to the -*org_settings.py* command to specify organism specific parameters, ensure -that the following commands return valid paths: - -If running MACS: - - org_settings.py <organism> genome_size - - org_settings.py <organism> genome_dir - - org_settings.py <organsim> refgene_anno_path - -If running THEME: - - org_settings.py <organism> theme_hypotheses - - org_settings.py <organism> theme_markov - -""" - -epilog = """Note: it is advised to leave the --*-args arguments unchanged -unless you really know what you're doing.""" - -parser = OptionParser(usage=usage,description=description,epilog=epilog,formatter=MultiLineHelpFormatter()) -parser.add_option('--auto',dest='auto',action='store_true',help='run all steps non-interactively (for batch mode, e.g.)') -parser.add_option('--steplist',dest='steplist',default='',help='with --auto, run specific steps') -parser.add_option('--exp-name',dest='exp_name',default=os.path.basename(os.getcwd()),help='name for the experiment/pipeline, used for convenience [default: current directory name]') -parser.add_option('--bed-args',dest='bed_args',default='--stdout --chromo-strip=.fa',help='double quote wrapped arguments for gerald_to_bed.py [default: %default]') -#parser.add_option('--stats-args',dest='stats_args',default='',help='double quote wrapped arguments for gerald_stats.py [default: %default]') -parser.add_option('--macs-exec',dest='macs_exec',default='macs14',help='the executable to use for MACS, if not an absolute path it needs to be on your shell environment path [default: %default]') -parser.add_option('--macs-args',dest='macs_args',default='--pvalue=1e-5',help='double quote wrapped arguments for macs, only changing --mfold, --tsize, --bw, and --pvalue recommended [default: %default]') -parser.add_option('--map-args',dest='map_args',default='--tss --upstream-window=10000 --downstream-window=10000',help='double quote wrapped arguments for mapping peaks to genes [default: %default]') 
-parser.add_option('--filter-peaks-args',dest='filter_peaks_args',default="--sort-by=pvalue --top=1000 -f 'tags>20'",help='double quote wrapped arguments for filter_macs_peaks.py [default: %default]') -parser.add_option('--filter-neg-peaks-args',dest='filter_neg_peaks_args',default="-f 'tags>20'",help='double quote wrapped arguments for filter_macs_peaks.py applied to negative peaks [default: %default]') -parser.add_option('--peaks-to-fa-args',dest='peaks_to_fa_args',default='--fixed-peak-width=200',help='double quote wrapped arguments for peaks_to_fasta.py [default: %default]') -parser.add_option('--bg-exec',dest='bg_exec',default='rejection_sample_fasta.py',help='the executable to use for generating background sequences for THEME, if not an absolute path it needs to be on your shell environment path [default: %default]') -parser.add_option('--bg-args',dest='bg_args',default='--num-seq=2.1x',help='double quote wrapped arguments for background sequence generation utility [default: %default]') -parser.add_option('--theme-args',dest='theme_args',default='--beta=0.7 --cv=5 --trials=25',help='double quote wrapped arguments for THEME.py [default: %default]') -parser.add_option('--motif-pval-cutoff',dest='motif_pval',type='float',default=1e-5,help='the p-value cutoff for sending non-refined enrichmed motifs to THEME for refinement') -parser.add_option('--parallelize',dest='parallelize',action='store_true',help='parallelize portions of the pipeline using qsub, only works from SGE execution hosts') -parser.add_option('--ucsc',dest='ucsc',action='store_true',default=False,help='perform tasks for automated integration with UCSC genome browser [default:%default]') -parser.add_option('--build-infosite-args',dest='infosite_args',default='',help='arguments to pass to build_chipseq_infosite.py [default: None]') - -ucsc_group = OptionGroup(parser,"UCSC Integration Options (with --ucsc)") -ucsc_group.add_option('--stage-dir',dest='stage_dir',default='./',help='root directory where 
UCSC integration files should be made available [default: %default]') -ucsc_group.add_option('--stage-url',dest='stage_url',default='http://localhost/',help='URL where UCSC integration files will be made available over the web [default: %default]') -parser.add_option_group(ucsc_group) - -#parallel_group = OptionGroup(parser,"Parallelization Options (with --parallelize)",description="These options are relevant to parallelization of the pipeline, functionality is in beta status until further notice") -#parallel_group.add_option('--split-args',dest='split_args',default='--type=count --arg=16',help='double quote wrapped arguments for split_file.py [default: %default]') -#parallel_group.add_option('--qsub-args',dest='qsub_args',default='--die-on-err',help='double quote wrapped arguments for split_qsub.py [default: %default]') -#parser.add_option_group(parallel_group) - -parser.add_option('--print-args',dest='print_args',action='store_true',help=SUPPRESS_HELP) # secret ninja option - - -if __name__ == '__main__' : - - # parse command line arguments - opts, args = parser.parse_args(sys.argv[1:]) - - # stick it up here, so when we print out args it's updated - if opts.ucsc and opts.macs_args.find('--wig') == -1 : - opts.macs_args += " --wig" - - # just print out all options as passed in for script generating purposes - if opts.print_args : - opts_strs = [] - all_opts = [] - all_opts.extend(parser.option_list) - all_opts.extend(*[x.option_list for x in parser.option_groups]) - for opt in all_opts : - opt_str = opt.get_opt_string() - if opt_str in ['--help','--print-args'] : - pass - elif opt_str == '--steplist' and not opts.auto : - pass - #elif opt_str in ['--stage-dir','--stage-url'] and not opts.ucsc : - # pass - #elif opt_str in ['--split-args','--qsub-args'] and not opts.parallelize : - # pass - elif opt.action == 'store' : - arg = str(getattr(opts,opt.dest)) - if arg.count(' ') > 0 or arg.find(' -') != -1 or arg.startswith('-') or arg.find('--') != -1 : - 
opts_strs.append(' %s="%s"'%(opt.get_opt_string(),str(getattr(opts,opt.dest)))) - else : - opts_strs.append(' %s=%s'%(opt.get_opt_string(),str(getattr(opts,opt.dest)))) - elif opt.action == 'store_true' and getattr(opts,opt.dest) : - opts_strs.append(' %s'%opt.get_opt_string()) - opts_strs.append(' $@') - sys.stdout.write(' \\\n'.join(opts_strs)+'\n') - sys.exit(0) - - if len(args) < 2 : - parser.error('Must provide two non-option arguments') - - # filenames and paths - organism, experiment_fn = args[0:2] - control_fn = None - if len(args) > 2 : - control_fn = args[2] - - org_settings = get_org_settings(organism) - refgene_fn = org_settings['refgene_anno_path'] - kg_ref = org_settings['known_gene_anno_path'] - kg_xref = org_settings['known_gene_xref_path'] - - exp_fpath,exp_fname,exp_fbase,exp_fext = get_file_parts(experiment_fn) - exp_wrk_dir = os.path.abspath('.exp_%s_%s'%(exp_fbase,opts.exp_name)) - - if control_fn : - cnt_fpath,cnt_fname,cnt_fbase,cnt_fext = get_file_parts(control_fn) - cnt_wrk_dir = os.path.abspath('.cnt_%s_%s'%(cnt_fbase,opts.exp_name)) - - # the pipeline - #log_fn = os.path.join(opts.exp_name+'_pipeline.log') - pipeline = Pypeline('Analysis pipeline for %s'%opts.exp_name) - - steps = [] - - #if opts.parallelize : - # # split up files - # calls = ["mkdir %s"%exp_wrk_dir, - # "split_file.py %s --outdir=%s %s"%(opts.split_args,exp_wrk_dir,experiment_fn),] - # if control_fn : - # calls.extend(["mkdir %s"%cnt_wrk_dir, - # "split_file.py %s --outdir=%s %s"%(opts.split_args,cnt_wrk_dir,control_fn), - # ]) - # steps.append(PPS('Split files',calls,env=os.environ)) - - ############################################################################ - # run macs - ############################################################################ - cnt_flag = '' - if control_fn : - cnt_flag = '-c %s'%control_fn - - # parse macs_args so we can extract mfold and pvalue...in a rather silly way - macs_mfold = [x for x in opts.macs_args.split(' ') if 'mfold' in x] - 
macs_mfold = macs_mfold[0].split('=',1)[1] if len(macs_mfold) >= 1 else 'DEF' - - macs_pvalue = [x for x in opts.macs_args.split(' ') if 'pvalue' in x] - macs_pvalue = macs_pvalue[0].split('=',1)[1] if len(macs_pvalue) >= 1 else 'DEF' - macs_name = opts.exp_name+'_mfold%s_pval%s'%(macs_mfold,macs_pvalue) - - macs_peaks_fn = macs_name+'_peaks.xls' - macs_neg_peaks_fn = macs_name+'_negative_peaks.xls' - macs_screen_output_fn = macs_name+'_output.txt' - - macs_d = {'exp_fn':experiment_fn, - 'cnt_flag':cnt_flag, - 'name':macs_name, - 'macs_exec':opts.macs_exec, - 'macs_args':opts.macs_args, - 'macs_out':macs_screen_output_fn, - 'gsize':org_settings['genome_size'], - } - calls = ["%(macs_exec)s --gsize=%(gsize)s -t %(exp_fn)s %(cnt_flag)s --name=%(name)s %(macs_args)s 2>&1 | tee %(macs_out)s"%macs_d] - steps.append(PPS('Run MACS',calls,env=os.environ)) - - - ############################################################################ - # process and stage wiggle files - ############################################################################ - if opts.ucsc : - wiggle_dir = macs_name+'_MACS_wiggle' - ucsc_d = {'org':organism, - 'stage_dir':opts.stage_dir, - 'stage_url':opts.stage_url, - 'macs_dir':wiggle_dir, - } - - calls = ["integrate_macs_ucsc.py --auto %(org)s %(stage_dir)s %(stage_url)s %(macs_dir)s"%ucsc_d] - steps.append(PPS("UCSC Integration",calls)) - - - ############################################################################ - # map peaks to genes - ############################################################################ - map_fn = "%s_genes.txt"%macs_name - map_stats_fn = "%s_genes_stats.xls"%macs_name - map_d = {'kg_ref':kg_ref, - 'kg_xref':kg_xref, - 'peaks_fn':macs_peaks_fn, - 'bed_peaks_fn':macs_name+'_peaks.bed', - 'map_fn':map_fn, - 'map_stats_fn':map_stats_fn, - 'map_args':opts.map_args - } - # make sure peak files don't have .fa at the end of their chromosomes - calls = ["sed -i 's/\.fa//g' %(peaks_fn)s %(bed_peaks_fn)s"%map_d] - c = 
"map_peaks_to_known_genes.py %(map_args)s --map-output=%(map_fn)s " + \ - "--detail --stats-output=%(map_stats_fn)s %(kg_ref)s %(kg_xref)s " + \ - "%(peaks_fn)s" - calls.append(c%map_d) - steps.append(PPS('Map peaks to genes',calls,env=os.environ)) - - - ############################################################################ - # filter macs peaks - ############################################################################ - filtered_d = {'filter_peaks_args':opts.filter_peaks_args, - 'filter_neg_peaks_args':opts.filter_neg_peaks_args, - 'peaks_fn':macs_peaks_fn, - 'neg_peaks_fn':macs_neg_peaks_fn - } - c = "filter_macs_peaks.py --print-encoded-fn --encode-filters " \ - "%(filter_peaks_args)s %(peaks_fn)s" - filtered_peaks_fn = Popen(c%filtered_d,shell=True,stdout=PIPE).communicate()[0] - filtered_neg_peaks_fn = macs_name + '_negative_peak_filt.xls' - calls = ["filter_macs_peaks.py --encode-filters %(filter_peaks_args)s %(peaks_fn)s"%filtered_d] - if control_fn is not None : - calls.append("filter_macs_peaks.py --encode-filters %(filter_neg_peaks_args)s %(neg_peaks_fn)s"%filtered_d) - steps.append(PPS('Filter MACS peaks',calls,env=os.environ)) - - - ############################################################################ - # THEME - ############################################################################ - # extract foreground and generate background sequences - fg_fn = filtered_peaks_fn.replace('.xls','.fa') - fg_d = {'opts':opts.peaks_to_fa_args, - 'organism':organism, - 'fg_fn':fg_fn, - 'peaks_fn':filtered_peaks_fn} - calls = ["peaks_to_fasta.py %(opts)s --output=%(fg_fn)s %(organism)s %(peaks_fn)s"%fg_d] - steps.append(PPS('Peaks to Fasta',calls,env=os.environ)) - - bg_fn = "%s_bg.fa"%macs_name - bg_d = {'opts':opts.bg_args, - 'organism':organism, - 'fg_fn':fg_fn, - 'bg_fn':bg_fn} - calls = ["rejection_sample_fasta.py %(opts)s --output=%(bg_fn)s %(organism)s %(fg_fn)s"%bg_d] - steps.append(PPS('Generate Background Sequences',calls,env=os.environ)) 
- - # run THEME on fg - theme_opts, theme_args = theme_parser.parse_args(opts.theme_args.split(' ')) - hyp_fn = org_settings['theme_hypotheses'] - markov_fn = org_settings['theme_markov'] - - # run THEME w/ randomization by running each motif individuall - # this is because TAMO.MD has a memory leak - raw_motif_fn = '%s_motifs_beta%s_cv%s.tamo'%(macs_name,theme_opts.beta,theme_opts.cv) - random_cv_fn = '%s_motifs_beta%s_cv%s_rand.txt'%(macs_name,theme_opts.beta,theme_opts.cv) - - # new old THEME call - #Usage: THEME.sh [options] <FG_FASTA> <BG_FASTA> <HYP_FN> <MARKOV> - # - #Run old THEME version - # - #Options: - # -h, --help show this help message and exit - # --hyp-indices=HYP_INDS - # 0-based indices of hypotheses to run [default: ALL] - # --no-refine do not run with refinement - # --no-parallelize do not use wqsub.py for parallelization - # -v, --verbose print out the commands that are being run - # --dump dump categtories to file - # --output-filename=OUTPUT_FN - # filename to write motif results to [default:dummy.txt] - # --random-output=RANDOM_FN - # filename to write motif results to - # [default:random.txt] - # --motif-file=MOTIF_FN - # filename to write motif results to [default:dummy.out] - # --beta=BETA beta parameter to use [default:0.7] - # --delta=DELTA delta parameter to use [default:0.001] - # --remove-common remove common sequences from analysis - # --randomization run randomization - # --cv=CV number of cross validation folds [default:5] - # --interactive run the script interactively - - motif_fn = '%s_motifs_beta%s_cv%s.txt'%(macs_name,theme_opts.beta,theme_opts.cv) - theme_d = {'opts':opts.theme_args, - 'fg_fn':fg_fn, - 'bg_fn':bg_fn, - 'hyp':hyp_fn, - 'markov':markov_fn, - 'tamo_motif_fn':raw_motif_fn, - 'random_fn':random_cv_fn, - 'motif_fn':motif_fn - } - - theme_call = "THEME.sh %(opts)s %(fg_fn)s %(bg_fn)s %(hyp)s %(markov)s " \ - "--motif-file=%(tamo_motif_fn)s " \ - "--random-output=%(random_fn)s " \ - "--output-filename=%(motif_fn)s " 
\ - "--randomization" - - calls = [theme_call%theme_d] - steps.append(PPS('Run THEME',calls,env=os.environ)) - - # build infosite - calls = ['build_chipseq_infosite.py %s'%opts.infosite_args] - steps.append(PPS('Build infosite',calls,env=os.environ)) - - # cleanup - rm_str = "rm -f %(d)s/*.out %(d)s/*.err %(d)s/*.script %(d)s/*.stats %(d)s/*.bed" - calls = [rm_str%{'d':exp_wrk_dir}] - - if control_fn : - calls.append(rm_str%{'d':cnt_wrk_dir}) - #steps.append(PPS('Clean up',calls,env=os.environ)) - - pipeline.add_steps(steps) - if opts.auto and opts.steplist is not None : - steplist = parse_steplist(opts.steplist,pipeline) - else : - steplist = None - pipeline.run(interactive=not opts.auto,steplist=steplist)
--- a/chipsequtil-master/scripts/chipseq_pipeline_wo_ctrl.py Mon Mar 28 11:56:10 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,172 +0,0 @@ -#!/usr/bin/env python - -import os -import sys -from optparse import OptionParser, OptionGroup - -from pypeline import Pypeline, ProcessPypeStep as PPS -from chipsequtil import get_file_parts, get_org_settings -from chipsequtil.util import MultiLineHelpFormatter - -usage = "%prog [options] <organism> <experiment GERALD alignment filename> [<control GERALD alignment filename>]" -description = """1st generation ChIPSeq analysis pipeline: - - - converts Illumina GERALD alignment files to BED format - - calculates statistics on input alignments - - runs MACS to find peaks - - maps peaks to genes - - extracts fasta files for gene peaks in experiments - - constructs background sequences matching foreground distribution - - runs THEME.py on input sequences - - runs THEME.py randomization - - creates documentation on entire pipeline run - -Control input file is optional. 
*organism* argument is passed to the -*org_settings.py* command to specify organism specific parameters, ensure -that the following commands return valid paths: - -If running MACS: - - org_settings.py <organism> genome_size - - org_settings.py <organism> genome_dir - - org_settings.py <organsim> annotation_path - -If running THEME: - - org_settings.py <organism> theme_hypotheses - - org_settings.py <organism> theme_markov - -""" - -epilog = """Note: it is advised to leave the --*-args arguments unchanged -unless you really know what you're doing.""" - -parser = OptionParser(usage=usage,description=description,epilog=epilog,formatter=MultiLineHelpFormatter()) -parser.add_option('--auto',dest='auto',action='store_true',help='run all steps non-interactively (for batch mode, e.g.)') -parser.add_option('--exp-name',dest='exp_name',default=os.path.basename(os.getcwd()),help='name for the experiment/pipeline, used for convenience [default: current directory name]') -parser.add_option('--split-args',dest='split_args',default='--type=count --arg=16',help='double quote wrapped arguments for split_file.py [default: %default]') -parser.add_option('--bed-args',dest='bed_args',default='--stdout --chromo-strip=.fa',help='double quote wrapped arguments for gerald_to_bed.py [default: %default]') -parser.add_option('--stats-args',dest='stats_args',default='',help='double quote wrapped arguments for gerald_stats.py [default: %default]') -parser.add_option('--qsub-args',dest='qsub_args',default='--die-on-err',help='double quote wrapped arguments for split_qsub.py [default: %default]') -parser.add_option('--macs-args',dest='macs_args',default='--mfold=10 --tsize=35 --bw=150 --pvalue=1e-5',help='double quote wrapped arguments for macs, only changing --mfold, --tsize, --bw, and --pvalue recommended [default: %default]') -parser.add_option('--pk-to-fa-args',dest='pk_to_fa_args',default='--bg-type=rej_samp',help='double quote wrapped arguments for peaks_to_fasta.py [default: %default]') 
-parser.add_option('--theme-args',dest='theme_args',default='--beta=0.7 --cv=5',help='double quote wrapped arguments for THEME.py [default: %default]') - - -if __name__ == '__main__' : - - # parse command line arguments - opts, args = parser.parse_args(sys.argv[1:]) - - if len(args) < 3 : - parser.error('Must provide two non-option arguments') - - # filenames and paths - organism, experiment_fn, control_fn = args[0:3] - control_fn = None - if len(args) > 3 : - control_fn = args[2] - - org_settings = get_org_settings(organism) - refseq_fn = org_settings['annotation_path'] - - exp_fpath,exp_fname,exp_fbase,exp_fext = get_file_parts(experiment_fn) - exp_wrk_dir = os.path.abspath('.exp_%s_%s'%(exp_fbase,opts.exp_name)) - - if control_fn : - cnt_fpath,cnt_fname,cnt_fbase,cnt_fext = get_file_parts(control_fn) - cnt_wrk_dir = os.path.abspath('.cnt_%s_%s'%(cnt_fbase,opts.exp_name)) - - # the pipeline - pipeline = Pypeline() - - steps = [] - - # split up files - calls = ["mkdir %s"%exp_wrk_dir, - "split_file.py %s --outdir=%s %s"%(opts.split_args,exp_wrk_dir,experiment_fn),] - if control_fn : - calls.extend(["mkdir %s"%cnt_wrk_dir, - "split_file.py %s --outdir=%s %s"%(opts.split_args,cnt_wrk_dir,control_fn), - ]) - steps.append(PPS('Split files',calls,env=os.environ)) - - # convert to BED format - exp_bed_fn = "%s_exp.bed"%exp_fbase - calls = ["split_qsub.py %s --ext=.bed gerald_to_bed.py --util-args=\"%s\" %s/*.[0-9][0-9][0-9][0-9]"%(opts.qsub_args,opts.bed_args,exp_wrk_dir), - "wait_for_qsub.py", - "cat %s/*.bed > %s"%(exp_wrk_dir,exp_bed_fn), - ] - - if control_fn : - cnt_bed_fn = "%s_cnt.bed"%cnt_fbase - calls.extend(["split_qsub.py %s --ext=.bed gerald_to_bed.py --util-args=\"%s\" %s/*.[0-9][0-9][0-9][0-9]"%(opts.qsub_args,opts.bed_args,cnt_wrk_dir), - "wait_for_qsub.py", - "cat %s/*.bed > %s"%(cnt_wrk_dir,cnt_bed_fn), - ]) - - steps.append(PPS('Convert GERALD to BED format',calls,env=os.environ)) - - #steps.append(PPS('Helloooooooo nurse','echo Helloooooooo nurse')) - 
# generate alignment statistics - exp_stats_fn = '%s_stats.txt'%exp_fbase - calls = ["split_qsub.py %s --ext=.stats gerald_stats.py --util-args=\"%s\" %s/*.[0-9][0-9][0-9][0-9]"%(opts.qsub_args,opts.stats_args,exp_wrk_dir), - "wait_for_qsub.py", - "combine_gerald_stats.py %s/*.stats > %s"%(exp_wrk_dir,exp_stats_fn), - ] - - if control_fn : - cnt_stats_fn = '%s_stats.txt'%cnt_fbase - calls.extend(["split_qsub.py %s --ext=.stats gerald_stats.py --util-args=\"%s\" %s/*.[0-9][0-9][0-9][0-9]"%(opts.qsub_args,opts.stats_args,cnt_wrk_dir), - "wait_for_qsub.py", - "combine_gerald_stats.py %s/*.stats > %s"%(cnt_wrk_dir,cnt_stats_fn), - ]) - steps.append(PPS('Calculate alignment statistics',calls,env=os.environ)) - - # run macs - cnt_flag = '' - if control_fn : - cnt_flag = '-c %s'cnt_bed_fn - - macs_d = {'exp_fn':exp_bed_fn, - 'cnt_flag':cnt_flag, - 'name':opts.exp_name, - 'macs_args':opts.macs_args, - 'gsize':org_settings['genome_size'], - } - calls = ["macs --gsize=%(gsize)s -t %(exp_fn)s %(cnt_flag)s --name=%(name)s --format=BED %(macs_args)s"%macs_d] - steps.append(PPS('Run MACS',calls,env=os.environ)) - - # map peaks to genes - peaks_fn = "%s_peaks.bed"%opts.exp_name - map_fn = "%s_genes.txt"%opts.exp_name - map_stats_fn = "%s_genes_stats.txt"%opts.exp_name - calls = ["map_peaks_to_genes.py --peaks-format=BED %(refGene_fn)s %(peaks_fn)s --map-output=%(map_fn)s --stats-output=%(map_stats_fn)s"%{'refGene_fn':refseq_fn,'peaks_fn':peaks_fn,'map_fn':map_fn,'map_stats_fn':map_stats_fn}] - steps.append(PPS('Map peaks to genes',calls,env=os.environ)) - - # THEME - # extract foreground and generate background sequences - fg_fn = "%s_peaks.fa"%opts.exp_name - bg_fn = "%s_bg.fa"%opts.exp_name - nib_dir = org_settings['genome_dir'] - calls = ["peaks_to_fasta.py %(opts)s --output=%(fg_fn)s --bg-fn=%(bg_fn)s %(organism)s %(peaks_fn)s"%{'opts':opts.pk_to_fa_args,'organism':organism,'fg_fn':fg_fn,'bg_fn':bg_fn,'peaks_fn':peaks_fn}] - steps.append(PPS('Peaks to 
Fasta',calls,env=os.environ)) - - # run THEME on fg - motif_fn = '%s_motifs.txt'%opts.exp_name - hyp_fn = org_settings['theme_hypotheses'] - markov_fn = org_settings['theme_markov'] - calls = ["THEME.py %(opts)s --motif-file=%(motif_fn)s %(fg_fn)s %(bg_fn)s %(hyp)s %(markov)s"%{'opts':opts.theme_args,'motif_fn':motif_fn,'fg_fn':fg_fn,'bg_fn':bg_fn,'hyp':hyp_fn,'markov':markov_fn}] - steps.append(PPS('Run THEME on foreground',calls,env=os.environ)) - - # run THEME randomization - random_motif_fn = '%s_motifs_rand.txt'%opts.exp_name - calls = ["THEME.py %(opts)s --randomization --motif-file=%(motif_fn)s %(fg_fn)s %(bg_fn)s %(hyp)s %(markov)s"%{'opts':opts.theme_args,'motif_fn':random_motif_fn,'fg_fn':fg_fn,'bg_fn':bg_fn,'hyp':hyp_fn,'markov':markov_fn}] - steps.append(PPS('Run THEME randomization',calls,env=os.environ)) - - # cleanup - rm_str = "rm -f %(d)s/*.out %(d)s/*.err %(d)s/*.script %(d)s/*.stats %(d)s/*.bed" - calls = [rm_str%{'d':exp_wrk_dir}, - rm_str%{'d':cnt_wrk_dir}] - steps.append(PPS('Clean up',calls,env=os.environ)) - - pipeline.add_steps(steps) - pipeline.run(interactive=not opts.auto)
--- a/chipsequtil-master/scripts/combine_gerald_stats.py Mon Mar 28 11:56:10 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,27 +0,0 @@ -#!/usr/bin/env python - -import sys, re, os -from optparse import OptionParser -from collections import defaultdict as dd - -parser = OptionParser() - -if __name__ == '__main__' : - - opts, args = parser.parse_args(sys.argv[1:]) - - all_stats = dd(int) - for fn in args : - d = eval(open(fn).read()) - for k,v in d.items() : - all_stats[k] += v - all_stats['tot. aligns'] += v - - keys = all_stats.keys() - keys.sort() - keys.remove('tot. aligns') - - for k in keys : - print k,':',all_stats[k],'(%.4f)'%(float(all_stats[k])/all_stats['tot. aligns']) - - print 'tot. aligns',':',all_stats['tot. aligns']
--- a/chipsequtil-master/scripts/compare_microarray_binding.py Mon Mar 28 11:56:10 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,81 +0,0 @@ -#!/usr/bin/env python - -import sys - -from csv import reader, writer -from collections import defaultdict as dd -from optparse import OptionParser -from subprocess import Popen, PIPE - -from chipsequtil import MACSOutput, BEDOutput, AffyBiocFile - -usage = '%prog -m <mapped MACS peaks file>|-b <mapped BED peaks file>|-a <mapped microarray file> [-m <MACS peaks file> ...] [-b <mapped BED peaks file> ...] [-a <mapped microarray file> ...]' -description = """Join all files on the first column, concatenating records with \ -matching entries onto one line per entry. Understands MACS peaks data as mapped \ -with *map_peaks_to_known_genes.py* utility microarray data as mapped by \ -*probeset_to_known_genes.py* utility, passed to program using *-m* and *-a* options \ -respectively. Output is a file where genes with binding data (MACS, BED files) have \ -column with a 1, 0 otherwise, and genes with microarray expression values have logFC \ -and adjusted p-value colums for each microarray file input. Internally, uses \ -*join_mapped_known_genes.py* with --binary-plus option to perform mapping and parses \ -output. 
MACS fields are listed first, followed by BED fields, followed by microarray \ -fields.""" - -epilog="Note: microarray files should have been created by bioconductor, and all files should have a row of fieldnames as the first line" -parser = OptionParser(usage=usage,description=description,epilog=epilog) -parser.add_option('-a','--affy-file',dest='affy_file',action='append',default=[],help='add a mapped microarray file') -parser.add_option('-b','--bed-file',dest='bed_file',action='append',default=[],help='add a mapped BED formatted peaks (*.bed) file') -parser.add_option('-m','--macs-file',dest='macs_file',action='append',default=[],help='add a mapped default MACS formatted peaks (*.xls) file') -parser.add_option('--output',dest='output',default=None,help='file to output joined records to [default: stdout]') - -if __name__ == '__main__' : - - opts,args = parser.parse_args(sys.argv[1:]) - - if len(args) > 0 : - parser.error('There were non-option command line arguments passed, all files should have a preceeding option indicating filetype') - - if len(opts.macs_file) == 0 and len(opts.affy_file) == 0 : - parser.error('No files were passed in, aborting') - - # call join_mapped_known_genes.py - fn_map = {} - fn_map['macs'] = ' '.join(['-m %s'%fn for fn in opts.macs_file]) - fn_map['bed'] = ' '.join(['-b %s'%fn for fn in opts.bed_file]) - fn_map['array'] = ' '.join(['-a %s'%fn for fn in opts.affy_file]) - join_call = 'join_mapped_known_genes.py --binary-plus %(macs)s %(bed)s %(array)s'%fn_map - p = Popen(join_call, shell=True, stdout=PIPE,stderr=PIPE) - stdout, stderr = p.communicate() - if len(stderr) != 0 : - print stderr - - joined_output = stdout.split('\n') - joined_output = joined_output[:-1] if joined_output[-1] == '' else joined_output - - # determine which fields will end up in the file - header = joined_output[0].split('\t') - - # always want gene and symbol - field_indices = [0,1] - - # macs and bed fields are named by filename - for fn in 
opts.macs_file+opts.bed_file : - field_indices.append(header.index(fn)) - - # affy fields are index(fn)+5, index(fn)+8 - for fn in opts.affy_file : - # just add all the microarray columns - fn_header_indices = [i for i,x in enumerate(header) if x.find(fn) != -1] - field_indices.extend(fn_header_indices) - - #field_indices.append(header.index(fn)) - #field_indices.append(header.index(fn)+5) - #field_indices.append(header.index(fn)+8) - - out_f = open(opts.output,'w') if opts.output else sys.stdout - for line in joined_output : - line = line.split('\t') - out_f.write('\t'.join([line[i] for i in field_indices])+'\n') - - if opts.output : - out_f.close()
--- a/chipsequtil-master/scripts/construct_bg_fasta.py Mon Mar 28 11:56:10 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,235 +0,0 @@ -#!/usr/bin/env python - -import os -import sys -import warnings - -from collections import defaultdict -from optparse import OptionParser - -from chipsequtil import get_org_settings, RefGeneFile -from chipsequtil.nib import NibDB -from chipsequtil.util import MultiLineHelpFormatter -from TAMO.seq import Fasta - -usage='%prog [options] <type> <organism> <foreground fasta>' -description='Create background sequence databses for motif finding, etc.' -parser = OptionParser(usage=usage,description=description,formatter=MultiLineHelpFormatter()) - - -def rejection_sampling(fg,settings_dict,gc_bins=20) : - - genm_db = NibDB(settings_dict['genome_dir']) - annot = RefGeneFile(settings_dict['annotation_file']) - - - num_peak_bases = 0 - for header, seq in fg.items() : - num_peak_bases += len(seq) - - -if __name__ == '__main__' : - - opts, args = parser.parse_args(sys.argv[1:]) - - if len(args) < 3 : - parser.error('Must provide three non-option arguments') - - sample_type, organism, fg_fn = args[:3] - - settings_dict = get_org_settings(organism) - - fg = Fasta.load(fg_fn) - bg = rejection_sampling(fg,settings_dict) - - -############################################################### -# start Chris' code from rej_samp_bg_rand2.py - the_genes={} #list of distances to nearest TSS - - # for each peak find the chromosome, distance to nearest - # gene, size of peaks in bases, and GC content - the_chrs,dists,sizes,gcs=[],[],[],[] - - # number of bases in the fg sequences - size=0 - - for key in pos_seqs.keys(): - - size+=len(pos_seqs[key]) - - # chromosome first field in fasta headers from bed2seq.bedtoseq - chr=key.split(':')[0] - - # adjust chromosomes in special cases - if re.search('random',chr): - continue - if chr=='chr20': - chr='chrX' - elif chr=='chr21': - chr='chrY' - if not the_genes.has_key(chr): - the_genes[chr]=[] - - # 
start first int in second field of bed2seq.bedtoseq header - start=int(key.split(':')[1].split('-')[0]) - midpoint=int(start+len(pos_seqs[key])/2) - - # figure out which chromosome we're working on - tss_chr=tss[chr.split('chr')[-1]] - - # D is the distances from all the genes, find minimum - D=[(s[0]-midpoint) for s in tss_chr] - - # best distance for this peak - minD=min([abs(x) for x in D]) - best=[d for d in D if abs(d)==minD] - dists.append(best[0]) - - # chromosome for this peak - the_chrs.append(chr) - seq=pos_seqs[key] - - # calculate # bases and GC content - N=len(seq) - sizes.append(N) - gc=len([x for x in seq if (x=='G')or(x=='C')])/N - gcs.append(gc) - - #bin GC content distribution - bins=20 - - # q is # of peaks w/ x% GC content - q=[0]*bins - - for gc in gcs: - for i in range(bins): - win_start=i/bins - win_end=(i+1)/bins - if gc>=win_start and gc<win_end: - q[i]+=1 - continue - - # q is now % peaks w/ x% GC content - q=[x/Nseqs for x in q] - #print q - - # c is # peaks w/ highest GC content - c=max(q)*Nseqs - - # start generating bg sequences - print "Done assembling distance and gc content distributions" - genome_outfile=open(bg,'w') - - # make twice as many - size=round(size/(2*len(pos_seqs))) - bg_gcs,bg_sizes=[],[] - #for key in the_genes.keys(): - #chrom=key.split('chr')[-1] - #the_genes[key]=[x[0] for x in tss[chrom]] - - # C_TX is a list of all genes in (chromosome,gene start) tuples - C_TX=[] - for key in tss.keys(): - chrom=key.split('chr')[-1] - for x in tss[chrom]: - C_TX.append((chrom,x[0])) - - # generate a bg sequence for every fg sequence - for i in range(Nseqs): - - # propose sequences until one is accepted - keep_going=1 - while keep_going: - #random.shuffle(the_chrs) - - # randomize the list of distances from genes - random.shuffle(dists) - #chr=the_chrs[0] - - # pick the first distance, i.e. 
at random - d=dists[0] - - #random.shuffle(the_genes[chr]) - - # randomize the gene list - random.shuffle(C_TX) - - # randomize the peak sizes - random.shuffle(sizes) - - # pick a random gene - (chr,coord)=C_TX[0] - - #coord=the_genes[chr][0] - # propose a starting point for the bg sequence - midpoint=coord-d+random.randint(-100,100) - - # propose a starting size for the bg sequence - size=sizes[0] - start=int(midpoint-int(size/2)) - stop=int(midpoint+int(size/2)) - id='chr'+chr.split('chr')[-1]+':'+str(start)+'-'+str(stop) - r=random.random() - - # randomly choose strand - if r<0.5: strand='+' - else: strand='-' - - # extract the proposed sequence - nib_title,seq=nibfrag.sequence('chr'+chr,start, stop,strand) - if not seq: - print 'NOT FOUND', chr,start,stop, - continue - else: - - N,y=0,0 - # calculate the GC content for the proposed sequence - for line in seq: - s=line.upper() - N+=len(line) - y+=len([x for x in s if (x=='G')or(x=='C')]) - if line[0]=='N': continue - x=float(y)/N - - # determine the GC bin for this sequence - #gc=float(len([x for x in seq if (x=='G')or(x=='C')]))/N - for i in range(bins): - win_start=i/bins - win_end=(i+1)/bins - if x>=win_start and x<win_end: - bin=i - continue - - # pick a uniform random number such that it does not exceed - # the maximum GC content distribution over bins - r=random.random()*c/Nseqs - - # if the random number is <= the GC content for this - # proposed sequence, accept, otherwise reject - if r>q[bin]: - #print 'skip' - continue - else: - #print bin - bg_gcs.append(x) - bg_sizes.append(size) - keep_going-=1 - title='>%s\n'%id - genome_outfile.write(title) - for line in seq: - genome_outfile.write(line.upper()+'\n') - print len(gcs) - print len(bg_gcs) - fg_mean,fg_sdev=mean_sdev(gcs) - print fg_mean,fg_sdev - #bg_mean,bg_sdev=mean_sdev(bg_gcs) - bg_mean=scipy.mean(bg_gcs) - bg_sdev=scipy.std(bg_gcs) - print bg_mean,bg_sdev - fg_size_m,fg_size_dev=mean_sdev(sizes) - bg_size_m,bg_size_dev=mean_sdev(bg_sizes) - 
print fg_size_m,fg_size_dev - print bg_size_m,bg_size_dev - genome_outfile.close() -
--- a/chipsequtil-master/scripts/create_pipeline_script.py Mon Mar 28 11:56:10 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,385 +0,0 @@ -#!/usr/bin/env python - -from __future__ import with_statement -import getpass -import json -import os -import textwrap - -try: - import readline - import glob - readline.parse_and_bind("tab: complete") - readline.set_completer_delims('') - - comp_states = {} - def basic_complete_file(text,state) : - #if text.strip() == '' : - # text = './' - options = dict([(i,p) for i,p in enumerate(glob.glob(text+'*'))]) - return options.get(state,None) - - readline.set_completer(basic_complete_file) - -except ImportError: - print "Module readline not available." - -import re -import stat -import sys -from optparse import OptionParser -from subprocess import Popen, PIPE - -import chipsequtil -from chipsequtil import get_global_settings, get_local_settings, check_org_settings, GLOBAL_SETTINGS_FN, LOCAL_SETTINGS_FN -from terminalcontroller import TERM_ESCAPE, announce, warn, error, white, bold - -usage = "%prog" -description = """Script for creating a custom run script for -ChIPSeq/DNAse hypersensitivity experiments. User is asked for -paths and settings required for ChIPSeq analysis using the *chipseq_pipeline.py* -utility and produces an executable run script with helpful information on how to -run it. 
Also creates a JSON formatted file containing all the parameters for -this pipeline run.""" -epilog = "Note: this script only works in Unix-style environments" -parser = OptionParser(usage=usage,description=description,epilog=epilog) - - -script_template = """\ -#!/bin/bash - -# required parameters for the pipeline -ORG=%(organism)s -EXP_FN=%(exp_path)s -CNT_FN=%(cnt_path)s - -# chipseq_pipeline.py is the main workhorse of this analysis -# you may change any of the arguments below from their defaults - -chipseq_pipeline.py $ORG $EXP_FN $CNT_FN \\ -%(def_args)s -""" - -start_text = """\ -This is an interactive script that creates an executable script to use for -ChIPSeq analyses. When prompted for experiment and control files, tab -completion is available a la bash or tcsh shells. Press Ctrl-C at any time to -quit. -""" - -end_text = """The script %(script_fn)s has been created to run this pipeline. \ -The script can now be run with: - -$> ./%(script_fn)s - -Have a nice day.""" - - - -def wb(st) : - sys.stdout.write(white(bold(st))) - - -def input(st,default=None) : - - if default is None : - default_str = '' - else : - default_str = ' [default: ' + default + ' ] ' - - out = None - while out is None : - out = raw_input(white(bold(st))+default_str+white(bold(':'))+' \n') - if len(out) == 0 : - out = default - - return out - - -if __name__ == '__main__' : - - TERM_ESCAPE = True - - try : - - pipeline_args = {} - - # herro - announce('ChIPSeq Experiment Pipeline Script Generator') - print textwrap.fill(start_text) - - opts, args = parser.parse_args(sys.argv[1:]) - if len(args) > 0 : - warn("Arguments were passed, but this script doesn't accept any arguments, rudely ignoring them...\n") - - # this dictionary will be used to generate a JSON formatted file with - # all the relevant settings for the pipeline - json_dict = {} - - ############################################################################ - # name of the experiment - 
############################################################################ - def_path = os.path.basename(os.getcwd()) - exp_name = input('Experiment name',def_path) - exp_name = exp_name.replace(' ','_') # shhhhhhhh... - - json_dict['experiment name'] = exp_name - json_dict['analysis path'] = os.getcwd() - - ############################################################################ - # experiment and control file - ############################################################################ - align_text = "The pipeline can accept either BED, BOWTIE, SAM, or " \ - "ELANDEXPORT formatted alignment files. SAM is the default " \ - "format of files provided by the BMC pipeline. Both experiment " \ - "and control files must have the same format." - print textwrap.fill(align_text) - - align_fmt = input("Which format are the alignment files in?",'SAM') - exp_path = input('Experiment alignment path') - exp_path = exp_path.strip() - - lims_exp_url = input('Experiment LIMS sample URL, if applicable','none') - lims_exp_url = lims_exp_url.strip() - - cntrl_path = input('Control alignment path (leave blank for no control)','none') - cntrl_path = cntrl_path.strip() - - lims_cntrl_url = input('Control LIMS sample URL, if applicable','none') - lims_cntrl_url = lims_cntrl_url.strip() - - if cntrl_path == 'none' : - cntrl_path = '' - - if cntrl_path == '' : - print 'Analysis will be run with no control' - - json_dict['experiment path'] = os.path.realpath(exp_path) - json_dict['experiment lims url'] = lims_exp_url - json_dict['control path'] = os.path.realpath(cntrl_path) if cntrl_path != '' else 'none' - json_dict['control lims url'] = lims_cntrl_url - - ############################################################################ - # organism + settings - ############################################################################ - announce('Organism settings configuration') - global_settings = get_global_settings() - local_settings = get_local_settings() - valid_org_settings = 
global_settings.keys() + local_settings.keys() - valid_org_settings.sort() - - org_text = """\ -Below are the organism settings available on this system. The pipeline will -use the settings for one organism (e.g. %(org)s) for the entire execution. If -you do not see a set of settings that correspond to files you need you may -add your own to %(local_org)s. See %(glob_org)s for details. -""" - - print textwrap.fill(org_text%{'org':valid_org_settings[0],'local_org':LOCAL_SETTINGS_FN,'glob_org':GLOBAL_SETTINGS_FN},break_long_words=False) - print - - wb('Available settings\n') - # global settings - print 'Global settings: (%s)'%GLOBAL_SETTINGS_FN - org_sets = [(k,global_settings[k]) for k in sorted(global_settings.keys())] - for org, settings in org_sets : - wb(org.ljust(8)) - print ':', settings.get('description','No description') - #for k,v in settings.items() : - # print ' '*4+k+": "+str(v) - - # local settings - print 'Local settings: (%s)'%LOCAL_SETTINGS_FN - org_sets = [(k,local_settings[k]) for k in sorted(local_settings.keys())] - for org, settings in org_sets : - wb(org.ljust(8)) - print ':', settings.get('description','No description') - #for k,v in settings.items() : - # print ' '*4+k+": "+str(v) - org = '' - all_settings = {} - all_settings.update(global_settings) - all_settings.update(local_settings) - - while org not in valid_org_settings : - org = input('Choose organism configuration, one of ('+','.join(valid_org_settings)+')') - - # check for the required settings - required_settings = ['description','genome_dir','refgene_anno_path','theme_hypotheses','theme_markov'] - if not check_org_settings(org,required_settings) : - warn(textwrap.fill('Selected organism settings must have the following settings defined:\n\ - %s\n\ - Either select another organism or define these settings in your local\ - configuration file.'%required_settings)) - org = '' - print - - json_dict['org'] = org - - 
############################################################################ - # UCSC - ############################################################################ - - ucsc_text = """The pipeline can include a step to automatically make called -peak data available on the web for integration with UCSC genome browser.""" - - print textwrap.fill(ucsc_text,break_long_words=False) - - ucsc_integrate = input('Would you like to integrate this analysis with UCSC genome browser [y/n]?','y') - ucsc_integrate = False if ucsc_integrate == 'n' else True - ucsc_args = '' - stage_dir = '/nfs/antdata/web_stage/%s'%getpass.getuser() - stage_url = 'http://fraenkel.mit.edu/stage/%s'%getpass.getuser() - if ucsc_integrate : - ucsc_args = ['--ucsc'] - ucsc_args = ' '.join(ucsc_args) - - pipeline_args['--stage-dir'] = stage_dir - pipeline_args['--stage-url'] = stage_url - - json_dict['stage dir'] = stage_dir - json_dict['stage url'] = stage_url - - # TODO - consider letting user set these on script creation time - # any utility specific arguments? 
- # - MACS - # - THEME - - - ############################################################################ - # various pipeline parameters - ############################################################################ - - # --macs-args - macs_args = ['--mfold=10,30','--format=%s'%align_fmt] - pval = '' - while not re.search('^\de-\d+$',pval) : - pval = input('What p-value should MACS use as a cutoff?','1e-5') - macs_args.append('--pvalue=%s'%pval) - pipeline_args['--macs-args'] = ' '.join(macs_args) - - # --map-args - map_args = [] - tss = '' - while tss.upper() not in ('TSS','GENE') : - tss = input('Should gene mapping be made in relation to transcription start site or full gene coordinates [TSS/gene]?','TSS') - if tss == 'TSS' : - map_args.append('--tss') - - window = '' - while not re.search('^\d+,\d+$',window) : - window = input('What window would you like to use for mapping peaks to genes (upstream bases,downstream bases)?','10000,10000') - upstr, downstr = window.split(',') - map_args.extend(['--upstream-window=%s'%upstr,'--downstream-window=%s'%downstr]) - pipeline_args['--map-args'] = ' '.join(map_args) - - # --filter-peaks-args - filt_args = ['--sort-by=pvalue'] - fdr = '' - while not re.search('^\d+(\.\d+)?',fdr) and fdr != 'none' : - fdr = input('What FDR cutoff should be used, in %?','none') - if fdr != 'none' : - filt_args.append("--filter='fdr<%s'"%fdr) - - top = '' - while not re.search('^\d+$',top) and top != 'ALL' : - top = input('How many peak sequences should be used for motif discovery when sorted by p-value [<# peaks>/ALL]','1000') - if top != 'ALL' : - filt_args.append('--top=%s'%top) - - # tag filter for both pos and neg peaks - tags = '' - filt_neg_args = [] - while not re.search('^\d+$',tags) and tags != 'ALL' : - tags = input('What tag count cutoff should be used as a minimum for positive and negative peaks? 
[<# peaks>/None]','20') - if tags != 'None' : - filt_args.append("--filter='tags>%s'"%tags) - filt_neg_args.append("--filter='tags>%s'"%tags) - pipeline_args['--filter-peaks-args'] = ' '.join(filt_args) - pipeline_args['--filter-neg-peaks-args'] = ' '.join(filt_neg_args) - - # --peaks-to-fa-args - peaks_to_fa_args = [] - width = '' - while not re.search('^\d+$',width) and width != 'NA' : - width = input('What width around peak summit should be used for motif analysis (NA to use entire peak)? [<# bases>/NA]','200') - if width != 'NA' : - peaks_to_fa_args.append('--fixed-peak-width=%s'%width) - else : - width = 'none' - pipeline_args['--peaks-to-fa-args'] = ' '.join(peaks_to_fa_args) - - # --parallelize - parallel = input('Use cluster parallelization [y/n]?','y') - parallel = '--parallelize' if parallel.lower() != 'n' else '' - - # each user-specified argument gets its own key - json_dict['format'] = align_fmt - json_dict['mapping type'] = tss - json_dict['mapping window'] = (upstr,downstr) - json_dict['FDR filter'] = fdr - json_dict['peaks used by THEME'] = top - json_dict['fixed peak width'] = width - json_dict['parallelize'] = parallel != '' - json_dict['peak tag count filter'] = tags - - # put all the command line utility args in json_dict as its own dict - json_dict['pipeline args'] = pipeline_args - - ############################################################################ - # done with input, creating script and other stuff - ############################################################################ - # if the experiment and control files are in a different directory, - # create symlinks for them - exp_dir,exp_fn = os.path.split(os.path.abspath(exp_path)) - if exp_dir != os.getcwd() : - wb('Creating symlink for experiment file...\n') - if os.path.exists(exp_fn) : - if os.path.realpath(exp_fn) != os.path.abspath(exp_path) : # existing symlink doesn't point to the same file, prompt to overwrite - ans = raw_input('Symlink %s in current directory points to 
%s but you asked for %s, overwrite symbolic link? y/[n] '%(exp_fn,os.path.realpath(exp_fn),os.path.abspath(exp_path))) - if ans == 'y' : - os.remove(exp_fn) - exp_fn = 'exp_'+exp_fn - os.symlink(exp_path,exp_fn) - else : - exp_fn = 'exp_'+exp_fn - os.symlink(exp_path,exp_fn) - - if cntrl_path != '' : - cntrl_dir,cntrl_fn = os.path.split(os.path.abspath(cntrl_path)) - if cntrl_dir != os.getcwd() : - wb('Creating symlink for control file...\n') - if os.path.exists(cntrl_fn) : - if os.path.realpath(cntrl_fn) != os.path.abspath(cntrl_path) : # existing symlink doesn't point to the same file, prompt to overwrite - ans = raw_input('Symlink %s in current directory points to %s but you asked for %s, overwrite symbolic link? y/[n] '%(cntrl_fn,os.path.realpath(cntrl_fn),os.path.abspath(cntrl_path))) - if ans == 'y' : - os.remove(cntrl_fn) - cntrl_fn = 'cntrl_'+cntrl_fn - os.symlink(cntrl_path,cntrl_fn) - else : - cntrl_fn = 'cntrl_'+cntrl_fn - os.symlink(cntrl_path,cntrl_fn) - else : - cntrl_fn = '' - - # get default chipseq_pipeline.py args - pipeline_args = ' '.join(['%s="%s"'%(k,v) for k,v in pipeline_args.items()]) - print 'chipseq_pipeline.py --exp-name=%s %s %s --print-args'%(exp_name,ucsc_args,pipeline_args) - def_args = Popen('chipseq_pipeline.py --exp-name=%s %s %s %s --print-args'%(exp_name,ucsc_args,parallel,pipeline_args),shell=True,stdout=PIPE,stderr=PIPE).communicate()[0] - - wb('Creating script...\n') - script_fn = '%s_pipeline.sh'%exp_name - with open(script_fn,'w') as script_f : - script_f.write(script_template%{'exp_path':exp_fn,'cnt_path':cntrl_fn,'organism':org,'exp_name':exp_name,'def_args':def_args}) - os.chmod(script_f.name,stat.S_IRWXU|stat.S_IRWXG|stat.S_IROTH) - - print end_text%{'script_fn':script_fn} - - wb('Creating parameter file...\n') - json_fn = '%s_params.json'%exp_name - with open(json_fn,'w') as json_f : - json.dump(json_dict,json_f,indent=4) - - except KeyboardInterrupt : - sys.stderr.write('\n') - error('Script creation interrupted, 
aborting')
--- a/chipsequtil-master/scripts/extract_promoters.py Mon Mar 28 11:56:10 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,79 +0,0 @@ -#!/usr/bin/env python - -import re -import sys -from csv import writer -from optparse import OptionParser - -from collections import defaultdict - -from chipsequtil import get_org_settings, RefGeneFile -from chipsequtil.nib import NibDB -from chipsequtil.util import MultiLineHelpFormatter as MF - -usage = "%prog [options] <organism>" -description = """Extract the promoter sequences in FASTA format from all genes -or a list of genes specified in an input file. Gene annotation is RefGene -corresponding to the organism passed in, paths returned by: - -$> org_settings.py <organism> refgene_anno_path -$> org_settings.py <organism> genome_dir - -must be valid.""" -parser = OptionParser(usage=usage,description=description,formatter=MF()) -parser.add_option('-u','--upstream',type='int',default=3000,help='upstream window from TSS to extract [default: %default]') -parser.add_option('-d','--downstream',type='int',default=1000,help='downstream window from TSS to extract [default: %default]') -parser.add_option('-l','--gene-list',dest='gene_list',default=None, - help='file containing a list of gene identifiers to extract, one per line [default: %default]') -gene_type_choices = ['symbol','refgene'] -parser.add_option('-t','--gene-type',dest='gene_type',type='choice', - choices=gene_type_choices,default=gene_type_choices[0], - help='type of gene identifier in gene list, choose from %s [default: %%default]'%gene_type_choices) -parser.add_option('-o','--output',dest='output',default=None, - help='file to write fasta records to [default: stdout]') - -if __name__ == '__main__' : - - opts, args = parser.parse_args(sys.argv[1:]) - - if len(args) != 1 : - parser.error('Exactly one argument is required') - - org_settings = get_org_settings(args[0]) - - refgene_fn = org_settings['refgene_anno_path'] - refgene_f = RefGeneFile(refgene_fn) - - 
nib_db = NibDB(nib_dirs=[org_settings['genome_dir']]) - - gene_list = None - if opts.gene_list : - gene_list = [x.strip() for x in open(opts.gene_list).readlines()] - - id_index = 'bin' - if opts.gene_type != gene_type_choices[0] : - if opts.gene_type == 'refgene' : - id_index = 'name' - - seq_recs = [] - gene_map = defaultdict(list) - for rec in refgene_f : - if gene_list and rec[id_index] not in gene_list : continue # skip this one - st, end = max(0,int(rec['txStart'])-opts.upstream), min(int(rec['txStart'])+opts.downstream,nib_db.db_info[rec['chrom']]['nbases']) - key = (rec['chrom'],st,end,rec['strand']) - seq_recs.append(key) - gene_map[key[:-1]].append(rec['bin']+'/'+rec['name']) - - fasta_recs = nib_db.get_fasta_batch(seq_recs) - - out_f = open(opts.output,'w') if opts.output else sys.stdout - header_regex = re.compile('^.*(chr[0-9MXY]+).*:([0-9]+)-([0-9]+).*$') - for header, seq in zip(*fasta_recs) : - # map sequences back to gene names using the header - reg_obj = header_regex.search(header) - if reg_obj is not None : - chrm,st,end = reg_obj.groups() - gene_names = gene_map.get((chrm,int(st),int(end))) - if gene_names is not None : - header = header.strip()+':'+','.join(gene_names)+'\n' - out_f.write(header+seq+'\n')
--- a/chipsequtil-master/scripts/filter_bed_by_position_count.py Mon Mar 28 11:56:10 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,39 +0,0 @@ -#!/usr/bin/env python - -import sys - -from csv import reader, writer -from optparse import OptionParser - -usage = '%prog [options] <bed file>' -description = """Analyze BED file and filter out alignments above some threshold \ -that align to a single genomic position.""" -epilog="Note: only works if BED file is sorted!" -parser = OptionParser(usage=usage,description=description,epilog=epilog) -parser.add_option('-n','--max-count',dest='max_count',default=5,type='int',help='max tag count at a given position, filter above [default: %default]') -parser.add_option('--output',dest='output',default=None,help='write output to file') - -if __name__ == '__main__' : - - opts, args = parser.parse_args(sys.argv[1:]) - - if len(args) != 1 : - parser.error('Exactly one sorted .bed file is required') - - bed_fn = args[0] - - bed_reader = reader(open(bed_fn),delimiter='\t') - out_f = open(opts.output,'w') if opts.output else sys.stdout - bed_writer = writer(out_f,delimiter='\t') - - curr_key, curr_key_count = None, 0 - for rec in bed_reader : - key = rec[:3] # chromosome, start, end - if key != curr_key : - curr_key, curr_key_count = key, 0 - if curr_key_count < opts.max_count : - bed_writer.writerow(rec) - curr_key_count += 1 - else : - continue - if opts.output : out_f.close()
--- a/chipsequtil-master/scripts/filter_gps_peaks.py Mon Mar 28 11:56:10 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,215 +0,0 @@ -#!/usr/bin/env python - -import re -import os -import sys -from collections import defaultdict -from optparse import OptionParser, SUPPRESS_HELP -from random import shuffle - -from chipsequtil import GPSFile, get_file_parts -from chipsequtil.util import MultiLineHelpFormatter as MF -from terminalcontroller import warn - -usage = "%prog [options] <GPS peak file>" -description = """\ -Filter GPS peaks by supplied criteria. Available filter features are: - -IP -Control -Fold -qvalue -pvalue -IPvsEMP -IPvsCTR - -Filters are provided as expressions using the [-f |--filter] option, e.g. the command - -%prog -f "IP>100" --filter="pvalue<=1e-9" <GPS peak file> - -finds only peaks with more than 100 tags and a pvalue of less than 1e9. Any -number of filters may be provided, and only peaks that match *all* filters pass. \ -User is warned if filters result in zero results. Only inequality operators are \ -valid. Invoking with no filter arguments returns all peaks. To sort, use the \ ---sort-by option, e.g. - -%prog -f "pvalue<=1e-9" --sort-by=pvalue <GPS peak file> - -sorts peaks with a pvalue smaller than 1e-9 with the smallest pvalue peaks. All fields \ -are sorted ascending by default. Output is prepended with comments describing what \ -the file contains, i.e. which filters are applied, how many records there are, etc. 
- -Note: GPS P_-log10 and Q_-log10 values are converted to normal pvalues and qvalues -""" - -parser = OptionParser(usage=usage,description=description,formatter=MF()) -parser.add_option('-f','--filter',dest='filters',default=[],action='append',help='add filter expression') -parser.add_option('--sort-by',dest='sort_by',default='',help='comma delimited list of features to sort by, filtered peaks are not sorted by default, if provided peaks are sorted ascending by default') -parser.add_option('--sort-dir',dest='sort_dir',type='choice',choices=['ASCEND','DESCEND'],default='ASCEND',help='direction to sort [default: %default]') -parser.add_option('--top',dest='top',type='int',default=None,help='accepts an integer, output at most this many peaks [default: all]') -parser.add_option('--output',dest='output',default=None,help='filename to output filtered peaks to [default: stdout]') -parser.add_option('--encode-filters',dest='encode_filters',action='store_true',help='write out records to a file <GPS peaks file>_<filters>.xls (incompatible with --output option)') -parser.add_option('--summary',dest='summary',action='store_true',help='only print out summary information for the filter') -parser.add_option('--no-header',dest='no_header',action='store_true',help='do not print out header or metadata info') -parser.add_option('--shuffle',dest='shuffle',action='store_true',help='shuffle order of filtered records, useful for selecting random peaks') - -parser.add_option('--print-encoded-fn',dest='print_encoded_fn',action='store_true',help="print out the filename that would be created by --encode-filters") - -# make condition function objects using closures -_lt = lambda x,y : x < y -_lte = lambda x,y : x <= y -_gt = lambda x,y : x > y -_gte = lambda x,y : x >= y -_cond_map = {'<':_lt,'<=':_lte,'>':_gt,'>=':_gte,None:None} - -def make_condition(low_val=None,low_test=_gte,high_val=None,high_test=_lte) : - if low_val and not high_val : - return lambda x: low_test(low_val,x) - elif not 
low_val and high_val : - return lambda x: high_test(x,high_val) - elif low_val and high_val : - return lambda x: low_test(low_val,x) and high_test(x,high_val) - else : - return lambda x: True # identity with no constraints - -# regex and function for parsing filter strings -numeric_regex_str = r'\d+(?:\.\d*)?(?:(?:e|E)-?\d+)?' # matches numeric-looking patterns, e.g. 1, 1.234, 1e4, 1.234E-5, etc. -separator_regex_str = r'(?:>|>=|<|<=)' -ids_regex_str = r'(?:IP|Control|Fold|qvalue|pvalue|IPvsEMP|IPvsCTR)' -filter_regex = re.compile('^(%(num)s)?(%(sep)s)?(%(id)s)(%(sep)s)?(%(num)s)?$'%{'num':numeric_regex_str,'sep':separator_regex_str,'id':ids_regex_str}) - -class FilterException(Exception) : pass - -def parse_filter(filter_str) : - match = filter_regex.search(filter_str.strip()) - if match is None : - raise FilterException('Filter %s is formatted incorrectly'%filter_str) - low_val, low_test, field, high_test, high_val = match.groups() - low_val = float(low_val) if low_val else low_val - high_val = float(high_val) if high_val else high_val - return field, make_condition(low_val,_cond_map[low_test],high_val,_cond_map[high_test]) - -_sort_keys = {'length': lambda x: int(x[3]), - 'tags': lambda x: int(x[5]), - 'pvalue': lambda x: 10**(float(x[6])/-10), - 'fold_enrichment': lambda x: float(x[7]), - 'fdr': lambda x: float(x[8]), - } - - -summary_str = """\ -# This output was generated by filter_macs_peaks.py, filtered from %(macs_fn)s -# Number of peaks: %(num_recs)d -# Filters: %(filters)s -# Sorted by: %(sort_by)s -# Shuffled: %(shuffled)s -""" -if __name__ == '__main__' : - - opts, args = parser.parse_args(sys.argv[1:]) - - if len(args) < 1 : - parser.error('Must provide one GPS peaks file') - - if opts.output is not None and opts.encode_filters : - parser.error('--output and --encode-filters options are mutually exclusive') - - # set where to write output - if opts.encode_filters : - # construct filename additions - fn_str = '' - opts.filters.sort() - for filt in 
opts.filters : - filter_str = filt.replace(' ','') - filter_str = filter_str.replace('>=','_GTE_') - filter_str = filter_str.replace('<=','_LTE_') - filter_str = filter_str.replace('>','_GT_') - filter_str = filter_str.replace('<','_LT_') - fn_str += '_%s'%filter_str - - if opts.top is not None : - fn_str += '_top%d'%opts.top - - if len(opts.sort_by) != 0 : - fn_str += '_sortby_%s'%opts.sort_by - - if opts.shuffle : - fn_str += '_shuffled' - - macs_path,macs_fn,macs_basefn,macs_ext = get_file_parts(args[0]) - encoded_fn = os.path.join(macs_path,macs_basefn+fn_str+macs_ext) - if opts.print_encoded_fn : - sys.stdout.write(encoded_fn) - sys.exit(0) - else : - out_f = open(encoded_fn,'w') - elif opts.output : - out_f = open(opts.output,'w') - else : - out_f = sys.stdout - - # parse the filters - field_filters = defaultdict(list) - for filter in opts.filters : - field, filter_cond = parse_filter(filter) - field_filters[field].append(filter_cond) - - # start processing GPS file - peaks = GPSFile(args[0]) - - # filter the records - pass_recs = [] - for peak in peaks : - # test each of the fields, if any one fails skip the record - if not all([c(int(peak['IP'])) for c in field_filters['IP']]) or \ - not all([c(int(peak['Control'])) for c in field_filters['Control']]) or \ - not all([c(float(peak['Fold'])) for c in field_filters['Fold']]) or \ - not all([c(10**(float(peak['Q_-lg10'])/-10)) for c in field_filters['qvalue']]) or \ - not all([c(10**(float(peak['P_-lg10'])/-10)) for c in field_filters['pvalue']]) or \ - not all([c(float(peak['IPvsEMP'])) for c in field_filters['IPvsEMP']]) or \ - not all([c(float(peak['IPvsCTR'])) for c in field_filters['IPvsCTR']]) : - continue - else : - pass_recs.append([peak[k] for k in GPSFile.FIELD_NAMES]) - - if len(pass_recs) == 0 : - warn('WARNING: no records remain after filtering\n') - sys.exit(1) - - # sorting - if opts.sort_by : - pass_recs.sort(key=_sort_keys[opts.sort_by],reverse=opts.sort_dir != 'ASCEND') - - # top records - 
num_recs = len(pass_recs) if not opts.top else min(len(pass_recs),opts.top) - - # construct the summary string - filters_str = 'none' if len(opts.filters) == 0 else ', '.join(opts.filters) - sort_str = 'original order' if not opts.sort_by else opts.sort_by+', '+opts.sort_dir - shuffled_str = str(opts.shuffle) - summary = summary_str%{'macs_fn':args[0],'num_recs':num_recs, - 'filters':filters_str, - 'sort_by':sort_str, - 'shuffled':shuffled_str} - - # print summary only - if opts.summary : - sys.stdout.write(summary) - sys.exit(0) - - # write out the header cuz it's a nice thing to do - if not opts.no_header : - out_f.write(summary) - out_f.write('\t'.join(GPSFile.FIELD_NAMES)+'\n') - - # write out records - if opts.shuffle : - shuffle(pass_recs) - out_recs = pass_recs[:num_recs] - - for rec in out_recs : - # rec[0] is a tuple of (chromosome,start pos,original string) - out_f.write('\t'.join([rec[0][2]]+map(str,rec[1:]))+'\n') - - # good programming practice - out_f.close()
--- a/chipsequtil-master/scripts/filter_macs_peaks.py Mon Mar 28 11:56:10 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,210 +0,0 @@ -#!/usr/bin/env python - -import re -import os -import sys -from collections import defaultdict -from optparse import OptionParser, SUPPRESS_HELP -from random import shuffle - -from chipsequtil import MACSFile, MACSOutput, get_file_parts -from chipsequtil.util import MultiLineHelpFormatter as MF -from terminalcontroller import warn - -usage = "%prog [options] <MACS peak file>" -description = """\ -Filter MACS peaks by supplied criteria. Available filter features are: - -length -tags -pvalue -fold_enrichment -fdr - -Filters are provided as expressions using the [-f |--filter] option, e.g. the command - -%prog -f "tags>100" --filter="pvalue<=1e-9" --filter="100<length<=200" <MACS peak file> - -finds only peaks with more than 100 tags, a pvalue of less than 1e9, and a length \ -between 100, exclusive, and 200, inclusive. Any number of filters may be provided, \ -and only peaks that match *all* filters pass. User is warned if filters result in \ -zero results. Only inequality operators are valid. Invoking with no filter arguments \ -returns all peaks. To sort, use the --sort-by option, e.g. - -%prog -f "pvalue<=1e-9" --sort-by=pvalue <MACS peak file> - -sorts peaks with a pvalue smaller than 1e-9 with the smallest pvalue peaks. All fields \ -are sorted ascending by default. Output is prepended with comments describing what \ -the file contains, i.e. which filters are applied, how many records there are, etc. 
- -Note: MACS -10*log10(pvalue) values are converted to normal pvalues -""" - -parser = OptionParser(usage=usage,description=description,formatter=MF()) -parser.add_option('-f','--filter',dest='filters',default=[],action='append',help='add filter expression') -parser.add_option('--sort-by',dest='sort_by',default='',help='comma delimited list of features to sort by, filtered peaks are not sorted by default, if provided peaks are sorted ascending by default') -parser.add_option('--sort-dir',dest='sort_dir',type='choice',choices=['ASCEND','DESCEND'],default='ASCEND',help='direction to sort [default: %default]') -parser.add_option('--top',dest='top',type='int',default=None,help='accepts an integer, output at most this many peaks [default: all]') -parser.add_option('--output',dest='output',default=None,help='filename to output filtered peaks to [default: stdout]') -parser.add_option('--encode-filters',dest='encode_filters',action='store_true',help='write out records to a file <MACS peaks file>_<filters>.xls (incompatible with --output option)') -parser.add_option('--summary',dest='summary',action='store_true',help='only print out summary information for the filter') -parser.add_option('--no-header',dest='no_header',action='store_true',help='do not print out header or metadata info') -parser.add_option('--shuffle',dest='shuffle',action='store_true',help='shuffle order of filtered records, useful for selecting random peaks') - -parser.add_option('--print-encoded-fn',dest='print_encoded_fn',action='store_true',help="print out the filename that would be created by --encode-filters") - -# make condition function objects using closures -_lt = lambda x,y : x < y -_lte = lambda x,y : x <= y -_gt = lambda x,y : x > y -_gte = lambda x,y : x >= y -_cond_map = {'<':_lt,'<=':_lte,'>':_gt,'>=':_gte,None:None} - -def make_condition(low_val=None,low_test=_gte,high_val=None,high_test=_lte) : - if low_val and not high_val : - return lambda x: low_test(low_val,x) - elif not low_val and 
high_val : - return lambda x: high_test(x,high_val) - elif low_val and high_val : - return lambda x: low_test(low_val,x) and high_test(x,high_val) - else : - return lambda x: True # identity with no constraints - -# regex and function for parsing filter strings -numeric_regex_str = r'\d+(?:\.\d*)?(?:(?:e|E)-?\d+)?' # matches numeric-looking patterns, e.g. 1, 1.234, 1e4, 1.234E-5, etc. -separator_regex_str = r'(?:>|>=|<|<=)' -ids_regex_str = r'(?:tags|pvalue|fold_enrichment|fdr|length)' -filter_regex = re.compile('^(%(num)s)?(%(sep)s)?(%(id)s)(%(sep)s)?(%(num)s)?$'%{'num':numeric_regex_str,'sep':separator_regex_str,'id':ids_regex_str}) - -class FilterException(Exception) : pass - -def parse_filter(filter_str) : - match = filter_regex.search(filter_str.strip()) - if match is None : - raise FilterException('Filter %s is formatted incorrectly'%filter_str) - low_val, low_test, field, high_test, high_val = match.groups() - low_val = float(low_val) if low_val else low_val - high_val = float(high_val) if high_val else high_val - return field, make_condition(low_val,_cond_map[low_test],high_val,_cond_map[high_test]) - -_sort_keys = {'length': lambda x: int(x[3]), - 'tags': lambda x: int(x[5]), - 'pvalue': lambda x: 10**(float(x[6])/-10), - 'fold_enrichment': lambda x: float(x[7]), - 'fdr': lambda x: float(x[8]), - } - - -summary_str = """\ -# This output was generated by filter_macs_peaks.py, filtered from %(macs_fn)s -# Number of peaks: %(num_recs)d -# Filters: %(filters)s -# Sorted by: %(sort_by)s -# Shuffled: %(shuffled)s -""" -if __name__ == '__main__' : - - opts, args = parser.parse_args(sys.argv[1:]) - - if len(args) < 1 : - parser.error('Must provide one MACS peaks file') - - if opts.output is not None and opts.encode_filters : - parser.error('--output and --encode-filters options are mutually exclusive') - - # set where to write output - if opts.encode_filters : - # construct filename additions - fn_str = '' - opts.filters.sort() - for filt in opts.filters : - 
filter_str = filt.replace(' ','') - filter_str = filter_str.replace('>=','_GTE_') - filter_str = filter_str.replace('<=','_LTE_') - filter_str = filter_str.replace('>','_GT_') - filter_str = filter_str.replace('<','_LT_') - fn_str += '_%s'%filter_str - - if opts.top is not None : - fn_str += '_top%d'%opts.top - - if len(opts.sort_by) != 0 : - fn_str += '_sortby_%s'%opts.sort_by - - if opts.shuffle : - fn_str += '_shuffled' - - macs_path,macs_fn,macs_basefn,macs_ext = get_file_parts(args[0]) - encoded_fn = os.path.join(macs_path,macs_basefn+fn_str+macs_ext) - if opts.print_encoded_fn : - sys.stdout.write(encoded_fn) - sys.exit(0) - else : - out_f = open(encoded_fn,'w') - elif opts.output : - out_f = open(opts.output,'w') - else : - out_f = sys.stdout - - # parse the filters - field_filters = defaultdict(list) - for filter in opts.filters : - field, filter_cond = parse_filter(filter) - field_filters[field].append(filter_cond) - - # start processing MACS file - peaks = MACSFile(args[0]) - - # filter the records - pass_recs = [] - for peak in peaks : - # test each of the fields, if any one fails skip the record - if not all([c(int(peak['length'])) for c in field_filters['length']]) or \ - not all([c(int(peak['tags'])) for c in field_filters['tags']]) or \ - not all([c(10**(float(peak['-10*log10(pvalue)'])/-10)) for c in field_filters['pvalue']]) or \ - not all([c(float(peak['fold_enrichment'])) for c in field_filters['fold_enrichment']]) or \ - not all([c(float(peak['FDR(%)'])) for c in field_filters['fdr']]) : - continue - else : - pass_recs.append([peak[k] for k in MACSOutput.FIELD_NAMES]) - - if len(pass_recs) == 0 : - warn('WARNING: no records remain after filtering\n') - sys.exit(1) - - # sorting - if opts.sort_by : - pass_recs.sort(key=_sort_keys[opts.sort_by],reverse=opts.sort_dir != 'ASCEND') - - # top records - num_recs = len(pass_recs) if not opts.top else min(len(pass_recs),opts.top) - - # construct the summary string - filters_str = 'none' if 
len(opts.filters) == 0 else ', '.join(opts.filters) - sort_str = 'original order' if not opts.sort_by else opts.sort_by+', '+opts.sort_dir - shuffled_str = str(opts.shuffle) - summary = summary_str%{'macs_fn':args[0],'num_recs':num_recs, - 'filters':filters_str, - 'sort_by':sort_str, - 'shuffled':shuffled_str} - - # print summary only - if opts.summary : - sys.stdout.write(summary) - sys.exit(0) - - # write out the header cuz it's a nice thing to do - if not opts.no_header : - out_f.write(summary) - out_f.write('\t'.join(MACSOutput.FIELD_NAMES)+'\n') - - # write out records - if opts.shuffle : - shuffle(pass_recs) - out_recs = pass_recs[:num_recs] - - for rec in out_recs : - out_f.write('\t'.join(map(str,rec))+'\n') - - # good programming practice - out_f.close()
--- a/chipsequtil-master/scripts/filter_mapped_known_genes.py Mon Mar 28 11:56:10 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,98 +0,0 @@ -#!/usr/bin/env python - -import re -import sys - -from csv import reader, writer -from collections import defaultdict as dd -from optparse import OptionParser - -from chipsequtil.util import MultiLineHelpFormatter as MF - -usage = '%prog [options] <mapped known genes file>' -description = """Filter columns and rows from *join_mapped_known_genes.py* output which was \ -invoked with *--binary-plus* and *--field-types* flags. Specify full column names for either \ -binding or expression data with the *--bind-cols* and *--affy-cols* arguments, respectively. \ -The special fieldname *MAPPED* from *join_mapped_known_genes.py* is used to determine whether \ -a file contains a mapping for each gene. To filter genes by their associated binding or \ -expression data, specify *--bind-filter* or *--affy-filter* as follows: - - - *any* - report gene if at least one input file maps to the gene - - *all* - report gene if every input file maps to the gene - - *absent* - report gene if no input file maps to the gene - - *none* - do not filter genes at all (default) - -Results of binding and expression filters are 'and'ed together, e.g.: - ---bind-filter=all --affy-filter=absent - -returns only genes for which all binding files and none of the expression files map. -""" -epilog='Note: when specifying column names, be sure to escape characters like (,),&,*,etc... \ -that shells interpret with a \\, e.g. 
--bind-cols=-10\\*log10\\(pvalue\\)' -parser = OptionParser(usage=usage,description=description,epilog=epilog, formatter=MF()) -parser.add_option('--bind-cols',dest='bind_cols',default='',help='comma delimited list of binding data column names to include, [default: all]') -parser.add_option('--affy-cols',dest='affy_cols',default='',help='comma delimited list of expression data column names to include, [default: all]') -parser.add_option('--bind-filter',dest='bind_filt',type='choice',choices=['any','all','absent','none'],default='none',help='gene set to include based on binding data [default: %default]') -parser.add_option('--affy-filter',dest='affy_filt',type='choice',choices=['any','all','absent','none'],default='none',help='gene set to include based on expression data [default: %default]') -parser.add_option('--output',dest='output',default=None,help='write output to file') - - -def match_headers(patts,field) : - for p in patts : - if field.endswith(p) : return True - return False - -def filter_vector(type,vec) : - if type == 'any' : - return '1' in vec - elif type == 'all' : - return all([x=='1' for x in vec]) - elif type == 'absent' : - return not ('1' in vec) - else : - return True - -if __name__ == '__main__' : - - opts, args = parser.parse_args(sys.argv[1:]) - - if len(args) != 1 : - parser.error('Exactly one mapped file must be provided') - - map_fn = args[0] - - map_reader = reader(open(map_fn),delimiter='\t') - headers = map_reader.next() - bind_headers = [i for i,x in enumerate(headers) if x.startswith('BIND:')] - bind_map_headers = [i for i,x in enumerate(headers) if x.startswith('BIND:') and x.endswith('MAPPED')] - affy_headers = [i for i,x in enumerate(headers) if x.startswith('AFFY:')] - affy_map_headers = [i for i,x in enumerate(headers) if x.startswith('AFFY:') and x.endswith('MAPPED')] - - if len(bind_headers) == 0 and len(affy_headers) == 0 : - parser.error('No BIND: or AFFY: columns were found in the mapping, was *join_mapped_known_genes.py* run 
with the *--field-types* option?') - - # figure out which columns user wants - header_indices = [0,1] # always output knowngene and symbol - - bind_header_patts = opts.bind_cols.split(',') - header_indices += [i for i in bind_headers if match_headers(bind_header_patts,headers[i])] - - affy_header_patts = opts.affy_cols.split(',') - header_indices += [i for i in affy_headers if match_headers(affy_header_patts,headers[i])] - - out_f = open(opts.output,'w') if opts.output else sys.stdout - map_writer = writer(out_f,delimiter='\t') - - map_writer.writerow([headers[i] for i in header_indices]) - for rec in map_reader : - bind_vector = [rec[i] for i in bind_map_headers] - bind_pass = filter_vector(opts.bind_filt,bind_vector) - - affy_vector = [rec[i] for i in affy_map_headers] - affy_pass = filter_vector(opts.affy_filt,affy_vector) - - if bind_pass and affy_pass : - map_writer.writerow([rec[i] for i in header_indices]) - - if opts.output : out_f.close()
--- a/chipsequtil-master/scripts/generate_stats_doc.py Mon Mar 28 11:56:10 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,25 +0,0 @@ -#!/usr/bin/env python - -from matplotlib.pyplot import * - -from reStUtil import * - -if __name__ == '__main__' : - - # read stats - # - common read sequences - # - overall quality scores - - - # alignment stats - # - # alignments - # - uniquely aligned - # - multi reads - # - fail filter - # - alignments per chromosome bar chart - - - # peak stats - - - # motif stats and plots
--- a/chipsequtil-master/scripts/gerald_stats.py Mon Mar 28 11:56:10 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,113 +0,0 @@ -#!/usr/bin/env python - -import sys, re, os -from datetime import datetime -from optparse import OptionParser -from collections import defaultdict as dd -#from progressbar import ProgressBar -from csv import reader, writer - -from chipsequtil import get_file_parts -from chipsequtil.util import MultiLineHelpFormatter as MF -from reStUtil import ReStDocument, ReStSimpleTable - -usage = "%prog [options] <filename> [<filename>...]" -description="""\ -Outputs various stats about the GERALD formatted file(s) input. If multiple -files are provided statistics are aggregated according to the specified output -format. Output formats available via --format=X : - - # *python* - print an eval()'able python dictionary w/ counts - # *rst* - print statistics in a reStructured text table (default) - # *tab* - print statistics in a tab delimited form w/ header names - -Except for *python* format, each input file has its own output line. *python* -summarizes all alignments. 
-""" - -parser = OptionParser(usage=usage,description=description,formatter=MF()) -parser.add_option('--output',dest='output',default=None,help='write output to file [default: stdout]') -parser.add_option('--format',dest='format',type='choice',choices=['python','rst','tab'],default='rst',help='format to print out stats [default: %default]') - -def log(st) : - print datetime.now().isoformat()+' - '+st - -re_digits_nondigits = re.compile(r'\d+|\D+') -def format_with_commas(value,format='%s'): - parts = re_digits_nondigits.findall(format % (value,)) - for i in xrange(len(parts)): - s = parts[i] - if s.isdigit(): - parts[i] = _commafy(s) - break - return ''.join(parts) - -def _commafy(s): - - r = [] - for i, c in enumerate(reversed(s)): - if i and (not (i % 3)): - r.insert(0, ',') - r.insert(0, c) - return ''.join(r) - -if __name__ == '__main__' : - - opts,args = parser.parse_args(sys.argv[1:]) - - gerald_fns = args - - all_stats = dd(int) - stat_dicts = {} - stats_fields = ["sample", - "total alignments", - "% align unique", - "# reads aligned unique", - "% align repeat", - "# reads align repeat", - "% align none", - "# reads align none" - ] - - - data_rows = [] - for gerald_fn in gerald_fns : - stats = stat_dicts[gerald_fn] = dd(int) - - fnpath,fn,fnbase,fnext = get_file_parts(gerald_fn) - gerald_lines = reader(open(gerald_fn),delimiter='\t') - for row in gerald_lines : - m = re.match('^(\d+):(\d+):(\d+)$',row[10]) - if m is not None : - stats['multiread'] += 1 - all_stats['multiread'] += 1 - else : - stats[row[10]] += 1 - all_stats[row[10]] += 1 - - tot_reads = sum(stats.values())/1.-stats.get('QC',0) - unique_reads = sum([v for k,v in stats.items() if k.startswith('chr')]) - repeat_reads = stats.get('multiread',0) - nomap_reads = stats.get('NM',0) - data_row = [fn,format_with_commas(int(tot_reads)), - '%.1f'%(unique_reads/tot_reads*100),format_with_commas(unique_reads), - '%.1f'%(repeat_reads/tot_reads*100),format_with_commas(repeat_reads), - 
'%.1f'%(nomap_reads/tot_reads*100),format_with_commas(nomap_reads)] - - data_rows.append(data_row) - - out_f = open(opts.output,'w') if opts.output is not None else sys.stdout - - if opts.format == 'python' : - out_f.write(dict(all_stats)) - elif opts.format == 'rst' : - doc = ReStDocument(out_f) - table = ReStSimpleTable(header=stats_fields,data=data_rows) - doc.add(table) - doc.write() - elif opts.format == 'tab' : - out_w = writer(out_f,delimiter='\t') - out_w.writerow(stats_fields) - out_w.writerows(data_rows) - - if opts.output is not None : out_f.close()
--- a/chipsequtil-master/scripts/gerald_to_bed.py Mon Mar 28 11:56:10 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,82 +0,0 @@ -#!/usr/bin/env python - -import os -import re -import sys - -from optparse import OptionParser -from csv import DictReader, DictWriter -from chipsequtil import get_file_parts, GERALDOutput - -usage = "%prog [options] <GERALD file> [<GERALD file>...]" - -description = """\ -Convert the GERALD alignment formatted files into BED format. Input file named -<path>/<filename>.<ext> is translated into <path>/<filename>.bed unless --output -or --stdout is specified, in which case formatted lines are written to file or -standard output, respectively. If multiple input files are supplied with the ---output or --stdout option all formatted lines are concatenated together. -Formatting only occurs for GERALD input lines that have a valid Match Position -field (i.e. successfully aligned somewhere).""" - -parser = OptionParser(usage=usage, description=description) -parser.add_option('--output',dest='output',default=None,help='write all records to file') -parser.add_option('--stdout',dest='stdout',action='store_true',help='write out all formatted lines to stdout') -parser.add_option('--min-fields',dest='min_fields',action='store_true',help='only format the first three fields') -parser.add_option('--pass-only',dest='pass_only',action='store_true',help='only format lines with Y in the Pass Filtering field') -parser.add_option('--chromo-strip',dest='chromo_strip',default='.fa',help='pattern to remove from chromo field in BED output (e.g. 
--chromo-strip=.fa to remve .fa from chrX.fa) [default: %default]') - - - -if __name__ == '__main__' : - - opts,args = parser.parse_args(sys.argv[1:]) - - if len(args) == 0 : - parser.print_usage() - sys.exit(1) - - gerald_fns = args - - # step through the files - for gerald_fn in gerald_fns : - path,fn,fnbase,fnext = get_file_parts(gerald_fn) - bed_lines = [] - - - # where to write output to - if opts.stdout : - f_out = sys.stdout - else : - f_out = open(os.path.join(path,fnbase+'.bed'),'w') - - # process input - gerald_d = DictReader(open(gerald_fn),fieldnames=GERALDOutput.FIELD_NAMES,delimiter='\t') - for line_d in gerald_d : - if (opts.pass_only and line_d['filtering'] == 'Y' and line_d['match_pos'] != '') or (not opts.pass_only and line_d['match_pos'] != '') : - - if opts.chromo_strip is not None : - line_d['match_chromo'] = line_d['match_chromo'].replace(opts.chromo_strip,'') - - outline = [line_d['match_chromo'], # chromosome - line_d['match_pos'], # start - str(int(line_d['match_pos'])+len(line_d['read'])), # end - line_d['read'], # read - '0', # score - '+' if line_d['match_strand'] == 'F' else '-', # strand - '-', # thickStart - '-', # thickEnd - '0,0,255' if line_d['match_strand'] == 'F' else '255,0,0', # itemRgb - ] - outline = '\t'.join(outline) - f_out.write(outline+'\n') - #bed_lines.append(bed) - - # this is the slow way - #for line in open(gerld_fn) : - # grld = GERALDOutput(line) - # if (opts.pass_only and grld.filtering == 'Y' and grld.match_pos != '') or (not opts.pass_only and grld.match_pos != '') : - # bed = gerald_to_bed(grld,opts.min_fields) - # f_out.write(bed.output_format()) - # #bed_lines.append(bed) -
--- a/chipsequtil-master/scripts/integrate_macs_ucsc.py Mon Mar 28 11:56:10 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,112 +0,0 @@ -#!/usr/bin/env python - -import os -import sys -from optparse import OptionParser -from pypeline import Pypeline, ProcessPypeStep as PPS, PythonPypeStep as PyPS - -from chipsequtil import get_org_settings - -usage = "%prog <org> <stage dir> <stage url> <MACS wiggle directory>" -description = """Process a MACS wiggle directory when macs is invoked -with --wig option, convert all gzipped chromosome wiggle files to -bigWig format, copy to web staging directory <stage dir>, and create -track lines for adding to UCSC genome browser. Requires a <org> argument -that has a path using *org_settings.py <org> ucsc_chrom_sizes* that -points to a sizes file as created by UCSC's *fetchChromSizes <org>* -tool.""" - -parser = OptionParser(usage=usage,description=description) -parser.add_option('--auto',dest='auto',action='store_true',help='run all steps non-interactively (for batch mode, e.g.)') - -if __name__ == '__main__' : - - opts, args = parser.parse_args(sys.argv[1:]) - - if len(args) != 4 : - parser.error('Exactly four non-option arguments required') - - organism, stage_dir, stage_url, macs_dir = args - - pipeline = Pypeline('UCSC Integration',log='ucsc_integ.log') - - steps = [] - - org_settings = get_org_settings(organism) - - macs_path, macs_wiggle_path = os.path.dirname(macs_dir), os.path.basename(macs_dir) - macs_name = macs_wiggle_path.replace('_MACS_wiggle','') - wiggle_dir = macs_name+'_MACS_wiggle' - bigwig_fn = macs_name+'_%s_all_chr.bw' - d = {'wiggle_dir':macs_name+'_MACS_wiggle', - 'chrom_sizes':org_settings['ucsc_chrom_sizes'], - 'treat_bigwig_fn':macs_name+'_treat_all_chr.bw', - 'control_bigwig_fn':macs_name+'_control_all_chr.bw', - 'stage_dir':stage_dir, - 'stage_url':stage_url, - 'pwd':os.getcwd(), - } - - # create bigWig files - zcat_treat_call = "zcat %(wiggle_dir)s/treat/*.gz | " + \ - "grep -v '^track' | 
" + \ - "sed 's/\.fa//g' | " + \ - "wigToBigWig -clip stdin %(chrom_sizes)s " + \ - "%(wiggle_dir)s/treat/%(treat_bigwig_fn)s" - zcat_control_call = "zcat %(wiggle_dir)s/control/*.gz | " + \ - "grep -v '^track' | " + \ - "sed 's/\.fa//g' | " + \ - "wigToBigWig -clip stdin %(chrom_sizes)s " + \ - "%(wiggle_dir)s/control/%(control_bigwig_fn)s" - steps.append(PPS('Convert wig to bigWig',[zcat_treat_call%d,zcat_control_call%d])) - - # create the staging directory - mk_stage_dir_call = "mkdir -p %(stage_dir)s/%(wiggle_dir)s"%d - steps.append(PPS('Create staging directory',[mk_stage_dir_call])) - - # stage bigWig files to staging directory (create links) - stage_treat_call = "ln -fs %(pwd)s/%(wiggle_dir)s/treat/%(treat_bigwig_fn)s " + \ - "%(stage_dir)s/%(wiggle_dir)s/%(treat_bigwig_fn)s" - stage_control_call = "ln -fs %(pwd)s/%(wiggle_dir)s/control/%(control_bigwig_fn)s " + \ - "%(stage_dir)s/%(wiggle_dir)s/%(control_bigwig_fn)s" - steps.append(PPS('Stage bigWig files',[stage_treat_call%d,stage_control_call%d])) - - # generate track lines for treatment and control - treat_track_d = ['track', - 'type=bigWig', - 'name="Treatment"', - 'description="%s Treatment"'%macs_name, - 'bigDataUrl=%(stage_url)s/%(wiggle_dir)s/%(treat_bigwig_fn)s'%d] - treat_track = ' '.join(treat_track_d) - - control_track_d = ['track', - 'type=bigWig', - 'name="Control"', - 'description="%s Control"'%macs_name, - 'bigDataUrl=%(stage_url)s/%(wiggle_dir)s/%(control_bigwig_fn)s'%d] - control_track = ' '.join(control_track_d) - track_str = '\n'.join([treat_track, - control_track]) - - track_fn = wiggle_dir+'_tracks.txt' - def track_call(track_fn, track_str) : - f = open(track_fn,'w') - f.write(track_str+'\n') - f.close() - steps.append(PyPS('Generate track lines file',track_call, - callable_args=(track_fn,track_str)) - ) - - #calls = [zcat_treat_call, - # zcat_control_call, - # mk_stage_dir_call, - # stage_treat_call, - # stage_control_call, - # track_call - # ] - - #print calls - 
#steps.append(PPS('Stage Wiggle',calls)) - - pipeline.add_steps(steps) - pipeline.run(interactive=not opts.auto)
--- a/chipsequtil-master/scripts/join_mapped_known_genes.py Mon Mar 28 11:56:10 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,154 +0,0 @@ -#!/usr/bin/env python - -import sys -import warnings - -from csv import reader, writer -from collections import defaultdict as dd -from optparse import OptionParser - -usage = '%prog -b <mapped DNA binding file>|-a <mapped microarray file> [-b <mapped DNA binding file> ...] [-a <mapped microarray file> ...]' -description = """Join all files on the first column, concatenating records with \ -matching entries onto one line per entry. Understands DNA binding data as mapped \ -with *map_peaks_to_known_genes.py* utility microarray data as mapped by \ -*probeset_to_known_genes.py* utility, passed to program using *-b* and *-a* options \ -respectively. If a file contains more than one mapping to a gene additional columns \ -are added. At least one file of either type is required. Field names are written as \ -<filename>.<original field name>.<map number> -""" -epilog="Note: microarray files should have been created by bioconductor, and all files should have a row of fieldnames as the first line" -parser = OptionParser(usage=usage,description=description,epilog=epilog) -parser.add_option('-a','--affy-file',dest='affy_file',action='append',default=[],help='add a mapped microarray file') -parser.add_option('-b','--bind-file',dest='bind_file',action='append',default=[],help='add a mapped DNA binding file (e.g. 
MACS, BED)') -#parser.add_option('-b','--bed-file',dest='bed_file',action='append',default=[],help='add a mapped BED formatted peaks file') -parser.add_option('-m','--macs-file',dest='macs_file',action='append',default=[],help='DEPRECATED: use -b instead, add a mapped default MACS formatted peaks (*.xls) file') -parser.add_option('--output',dest='output',default=None,help='file to output joined records to [default: stdout]') -#parser.add_option('--intersect',dest='intersect',action='store_true',help='only output records common to all file passed in') -parser.add_option('--first-only',dest='first_only',action='store_true',help='only output the first mapping to a gene from each file') -parser.add_option('--binary',dest='binary',action='store_true',help='output only one column per file with a 0 or 1 to indicate whether a mapping exists in that file') -parser.add_option('--binary-plus',dest='binary_plus',action='store_true',help='output one column per file with a 0 or 1 to indicate whether a mapping exists in that file in addition to all other columns') -parser.add_option('--field-types',dest='field_types',action='store_true',help='prepend BIND or AFFY to the beginning of all appropriate columns') -#parser.add_option('--symbols',dest='symbols',action='store_true',help='mapped files contain symbols in second column (per map_peaks_to_known_genes.py|probeset_to_known_gene.py --symbol-xref option)') - -if __name__ == '__main__' : - - opts,args = parser.parse_args(sys.argv[1:]) - - if len(args) > 0 : - parser.error('There were non-option command line arguments passed, all files should have a preceeding option indicating filetype') - - if len(opts.macs_file) != 0 : - warnings.warn('The -m option is deprecated, please replace these flags with -b instead. 
Adding MACS filenames to binding filename list.',DeprecationWarning) - opts.bind_file.extend(opts.macs_file) - - if len(opts.bind_file) == 0 and len(opts.affy_file) == 0 : - parser.error('No files were passed in, aborting') - - # union of all genes - all_genes = set() - - # TODO - fix intersect w/ binary - opts.intersect = False - - # TODO - actually make this an option, or the default - opts.symbols = True - if opts.symbols : - symbol_map = {} - - # read all the files in - def get_file_dict(fns,header_prefix='') : - file_map = dd(lambda: dd(list)) - out_fieldnames = [] - blank_entry = [] - for fn in fns : - max_maps = 0 - f = reader(open(fn),delimiter='\t') - #f = open(fn) - fieldnames = f.next() - fieldnames = fieldnames[2:] # we don't want existing knownGeneID or geneSymbol - # read in the data, create a dictionary - for l in f : - if opts.symbols : - gene, symbol, data = l[0],l[1],l[2:] - symbol_map[gene] = symbol - else : - gene, data = l.split('\t',1) - file_map[fn][gene].append(data) - max_maps = max(max_maps,len(file_map[fn][gene])) - all_genes.add(gene) - - # if we're adding a binary column, do it - if opts.binary_plus : - out_fieldnames.append(header_prefix+fn+'.MAPPED') - - # construct the fieldnames for this file - for i in range(max_maps) : - out_fieldnames.extend(['%s%s.%d.%s'%(header_prefix,fn,i,h) for h in fieldnames]) - - # pad out data entries w/ fewer than max_maps - for gene,data in file_map[fn].items() : - while len(data) < max_maps : - data.append(['']*len(fieldnames)) - file_map[fn]['blank'] = [['']*len(fieldnames) for _ in range(max_maps)] - return file_map,out_fieldnames - - #macs_file_map, macs_fieldnames = get_file_dict(opts.macs_file) - #bed_file_map, bed_fieldnames = get_file_dict(opts.bed_file) - bind_prefix = 'BIND:' if opts.field_types else '' - affy_prefix = 'AFFY:' if opts.field_types else '' - bind_file_map, bind_fieldnames = get_file_dict(opts.bind_file,bind_prefix) - affy_file_map, affy_fieldnames = 
get_file_dict(opts.affy_file,affy_prefix) - - # prepare output objects - out_f = open(opts.output,'w') if opts.output else sys.stdout - map_fieldnames = ['knownGeneID'] - if opts.symbols : - map_fieldnames.append('geneSymbol') - #all_fieldnames = map_fieldnames+macs_fieldnames+bed_fieldnames+affy_fieldnames - all_fieldnames = map_fieldnames+bind_fieldnames+affy_fieldnames - if opts.binary : - #all_fieldnames = map_fieldnames+opts.macs_file+opts.bed_file+opts.affy_file - all_fieldnames = [x+'.MAPPED' for x in map_fieldnames+opts.bind_file+opts.affy_file] - join_writer = writer(out_f,delimiter='\t') - join_writer.writerow(all_fieldnames) - - # go through all the genes and print out lines - for gene in all_genes : - gene_line = [gene] - if opts.symbols : - gene_line.append(symbol_map[gene]) - #for filetype_data,fns in zip([macs_file_map,bed_file_map,affy_file_map],[opts.macs_file,opts.bed_file,opts.affy_file]) : - for filetype_data,fns in zip([bind_file_map,affy_file_map],[opts.bind_file,opts.affy_file]) : - for fn,recs in [(fn,filetype_data[fn]) for fn in fns] : - #for fn,recs in d.items() : - if recs.has_key(gene) : - # only output the first entry - if opts.first_only : - gene_line.extend(recs[gene][0]) - # only output a 1 or a zero - elif opts.binary : - gene_line.extend('1') - # else output normally - else : - # add binary column in addition to other output - if opts.binary_plus : - gene_line.extend('1') - for rec in recs[gene] : - gene_line.extend(rec) - else : - # if intersecting, ignore this gene - if opts.intersect : - continue - elif opts.binary : - gene_line.extend('0') - else : - # add binary column in addition to other output - if opts.binary_plus : - gene_line.extend('0') - for blank in filetype_data[fn]['blank'] : - #print len(blank) - gene_line.extend(blank) - #print fn, gene_line[2], len(gene_line), gene_line - join_writer.writerow(gene_line) - - if opts.output : out_f.close()
--- a/chipsequtil-master/scripts/kg_to_gff.py Mon Mar 28 11:56:10 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,78 +0,0 @@ -#!/usr/bin/env python - -import os -import sys -from csv import DictReader, DictWriter, QUOTE_NONE -from optparse import OptionParser - -from chipsequtil import KnownGeneFile, get_file_parts - -#args = ['/nfs/genomes/mouse_gp_jul_07/anno/knownGene-2010-07-08.txt','/nfs/genomes/mouse_gp_jul_07/anno/kgXref-2010-07-08.txt'] -args = ['/nfs/genomes/mouse_gp_jul_07/anno/knownGene-2010-08-03.gtf','/nfs/genomes/mouse_gp_jul_07/anno/kgXref-2010-07-08.txt'] -usage = '%prog <knownGene annotation>' -description = 'convert a UCSC knownGene annotation to GFF' -parser = OptionParser(usage=usage,description=description) - - -if __name__ == '__main__' : - - opts, args = parser.parse_args(args) - - kg_path,kg_fn,kg_base,kg_ext = get_file_parts(args[0]) - #kg_f = KnownGeneFile(args[0]) - - # xref for finding gene symbols - kgXref_fn = args[1] - kgXref_fieldnames = ['kgID','mRNA','spID','spDisplayID','geneSymbol','refseq','proAcc','description'] - xref_map = dict([(x['kgID'],x) for x in DictReader(open(kgXref_fn),delimiter='\t',fieldnames=kgXref_fieldnames)]) - - gff_headers = ['seqname','source','feature','start','end','score','strand','frame','attributes'] - gff_reader = DictReader(open(args[0]),delimiter='\t',fieldnames=gff_headers) - gff_writer = DictWriter(sys.stdout,delimiter='\t',fieldnames=gff_headers,quotechar='',quoting=QUOTE_NONE,lineterminator='\n') - #gff_writer.writerow(dict([(x,x) for x in gff_headers])) - - for i,rec in enumerate(gff_reader) : - #d = {} - #d['seqname'] = rec['chrom'] - #d['source'] = 'UCSC_knownGene' - #d['feature'] = 'gene' - #d['start'] = rec['txStart'] - #d['end'] = rec['txEnd'] - #d['score'] = '.' - #d['strand'] = rec['strand'] - #d['frame'] = '.' 
- #gene_name = rec['name'] - - gff_attrs_lst = [x.strip() for x in rec['attributes'].split(';')][:-1] - gff_attrs = {} - for attr in gff_attrs_lst : - k,v = attr.split(' ',1) - gff_attrs[k] = eval(v) - - kg_name = gff_attrs['gene_id'] - - # try to find a gene symbol - gene_id = xref_map[kg_name].get('geneSymbol',None) - #gene_id = kg_name - #if gene_id is None : - # gene_id = xref_map[kg_name].get('mRNA',None) - #if gene_id is None : - # gene_id = xref_map[kg_name].get('refseq',None) - if gene_id is None : # I give up - gene_id = kg_name - - gff_attrs_lst += ['gene_name "%s"'%gene_id] - rec['attributes'] = '; '.join(gff_attrs_lst) - gff_writer.writerow(rec) - - # now write the exons - #d['feature'] = 'exon' - #for j,(st,en) in enumerate(zip(rec['exonStarts'],rec['exonEnds'])) : - # d['start'] = st - # d['end'] = en - # d['attributes'] = '; '.join(['gene_id "%s"'%gene_id,'transcript_id "%s"'%rec['name'],'exon_number "%d"'%(j+1),'ID "%s.exon_%d"'%(rec['name'],j),'PARENT "%s"'%rec['name']]) - # gff_writer.writerow(d) - - - # version with knownGene in gene_name - # version with symbol in gene_name
--- a/chipsequtil-master/scripts/map_intervals.py Mon Mar 28 11:56:10 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,126 +0,0 @@ -#!/usr/bin/env python - -import sys - -from collections import defaultdict -from csv import reader -from optparse import OptionParser - -from bx.intervals.intersection import IntervalTree, Interval - -usage = '%prog [options] <from> <to>' -description = """Find records in <to> interval file that map to records in -<from> interval file. Files should be tab delimited and are expected to have -a chromosome column, a start column, and an end column. The indices of these -columns can be specified on the command line but by default are the first -three columns, respectively. Prints out to stdout by default one new line -separated row per row in <from> with a line from <to> where there is a mapping. -If no mapping is found (e.g. when specifying a maximum margin to search within) -the word None is printed. By default only prints nearest record, with ties -settled by smallest line number in <to>.""" -parser = OptionParser(usage=usage,description=description) -parser.add_option('-w','--window',dest='window',type="float",nargs=2, - default=(1e9,1e9), - help="window as <int upstream> <int downstream> to search for intervals [default: %default]") -parser.add_option('-f','--from',dest='from_ind',type="int",nargs=3, - default=(0,1,2), - help="coordinates of chromosome, start, stop in <from> file") -parser.add_option('-i','--skip-from-header',dest='skip_fh',action='store_true', - help="<from> has a header that should be skipped") -parser.add_option('-t','--to',dest='to_ind',type="int",nargs=3, - default=(0,1,2), - help="coordinates of chromosome, start, stop in <to> file") -parser.add_option('-j','--skip-to-header',dest='skip_th',action='store_true', - help="<to> has a header that should be skipped") - -if __name__ == '__main__' : - - opts, args = parser.parse_args(sys.argv[1:]) - - if len(args) != 2 : - parser.error('Exactly 2 non-option 
arguments are required') - - from_fn, to_fn = args - - chr_trees = defaultdict(IntervalTree) - chr_sizes = defaultdict(lambda : dict(minstart=sys.maxint,maxend=0)) - - if any([x > 1e9 for x in opts.window]) : - parser.error('Window maximum is +/- 1e9') - - to_reader = reader(open(to_fn),delimiter='\t') - if opts.skip_th : - to_header = to_reader.next() - - to_chr, to_st, to_en = opts.to_ind - for r in to_reader : - i = Interval(int(r[to_st]), - int(r[to_en]), - value=r, - chrom=r[to_chr] - ) - chr_trees[r[to_chr]].insert_interval(i) - chr_sizes[r[to_chr]]['minstart'] = min(int(r[to_st]),chr_sizes[r[to_chr]]['minstart']) - chr_sizes[r[to_chr]]['maxend'] = max(int(r[to_st]),chr_sizes[r[to_chr]]['maxend']) - - # window default is 1e9 because no chromosome is more than - # ten billion base pairs, right?! - def find_nearest(t,s,e,window=(1e9,1e9)) : - - # look for record within intervals - inside = t.find(s,e) - - if len(inside) >= 1 : # pick the first one, list returned is sorted - return inside[0] - - i = Interval(s,e) - before = t.upstream_of_interval(i,max_dist=window[0]) - after = t.downstream_of_interval(i,max_dist=window[1]) - - before = before[0] if len(before) != 0 else None - after = after[0] if len(after) != 0 else None - - if before and after : - b_dist = min(abs(before.end-s),abs(e-before.start)) - a_dist = min(abs(after.end-s),abs(e-after.start)) - nearest = before if b_dist < a_dist else after - elif before : - nearest = before - elif after : - nearest = after - else : - nearest = None - return nearest - - # now go through the from file - from_reader = reader(open(from_fn),delimiter='\t') - if opts.skip_fh : from_reader.next() - - from_chr, from_st, from_en = opts.from_ind - if opts.skip_th : - print '\t'.join(to_header) - for r in from_reader : - t = find_nearest(chr_trees[r[from_chr]],int(r[from_st]),int(r[from_en]), - window=opts.window) - if t : - print '\t'.join(t.value) - else : - print t - """ - # tests - print 'interval is before any other 
interval in tree' - t = find_nearest(chr_trees['chr2'],10388500,10388510) - print '\tCorrect answer: %s, Returned answer: %s'%('mmu-mir-466f-1',t.value),t - print 'interval is after any other interval in tree' - t = find_nearest(chr_trees['chr1'],200000000,200000010) - print '\tCorrect answer: %s, Returned answer: %s'%('mmu-mir-29c',t.value),t - print 'interval is between intervals' - t = find_nearest(chr_trees['chr3'],89773941,89774021) - print '\tCorrect answer: %s, Returned answer: %s'%('mmu-mir-190b',t.value),t - print 'interval is inside another interval' - t = find_nearest(chr_trees['chr3'],89873999,89874001) - print '\tCorrect answer: %s, Returned answer: %s'%('mmu-mir-190b',t.value), t - print 'interval is too far from anything to return anything' - t = find_nearest(chr_trees['chr3'],89773941,89774021,window=10) - print '\tCorrect answer: None, Returned answer: %s'%t - """
--- a/chipsequtil-master/scripts/map_peaks_to_genes.py Mon Mar 28 11:56:10 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,202 +0,0 @@ -#!/usr/bin/env python - -import sys, os -from optparse import OptionParser -from collections import defaultdict as dd -from chipsequtil import MACSOutput, BEDOutput, RefGeneOutput, parse_number -from csv import DictReader, DictWriter - -usage = '%prog [options] <refGene file> <peaks file>' -description = """ -Map the peaks in <peaks file> to genes in <refGene file>. <refGene file> is -format is as specified in http://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/refGene.sql. -<peaks file> format is as produced by MACS.""" -epilog = '' -parser = OptionParser(usage=usage,description=description,epilog=epilog) -parser.add_option('--upstream-window',dest='upst_win',type='int',default=5500,help='window width in base pairs to consider promoter region [default: %default]') -parser.add_option('--downstream-window',dest='dnst_win',type='int',default=2500,help='window width in base pairs to consider downstream region [default: %default]') -parser.add_option('--map-output',dest='peak_output',default=sys.stdout,help='filename to output mapped peaks in BED format to [default: stdout]') -parser.add_option('--stats-output',dest='stats_output',default=sys.stderr,help='filename to output summary stats in conversion [default: stderr]') -parser.add_option('--peaks-format',dest='peaks_fmt',default='MACS',type='choice',choices=['MACS','BED'],help='format of peaks input file [default: %default]') - -# TODO - options -#parser.add_option('--use-cds',dest='use_cds',action='store_true',help='use cdsStart and cdsEnd fields instead of txStart and txEnd to do mapping') -#parser.add_option('--capture-intergenic'...) 
-#parser.add_option('--map-format',dest='peak_format',type='choice',choices=['default','BED'],help='format of peak output [default: %default]') -#parser.add_option('--stats-format',dest='stats_format',type='choice',choices=['human','python'],help='format of summary stats output [default: %default]') - -def parse_gene_ref(ref_gene) : - #FIXME - maybe, if galaxy doesn't work out, figure out how to deal with multiple RefGene mapping formats? - fieldnames = ['geneName','name','chrom','strand','txStart','txEnd','cdsStart','cdsEnd','exonCount','exonStarts','exonEnds'] - reader = DictReader(ref_gene,fieldnames=fieldnames,delimiter='\t') - gene_ref = dd(list) - for ref_dict in reader : - for k,v in ref_dict.items() : - # coerce numbers where possible - ref_dict[k] = parse_number(v) - - # turn 'x,x,x,...' into a list - ref_dict['exonStarts'] = [parse_number(x) for x in ref_dict['exonStarts'].split(',')] - if ref_dict['exonStarts'][-1] == '' : ref_dict['exonStarts'].remove('') - ref_dict['exonEnds'] = [parse_number(x) for x in ref_dict['exonEnds'].split(',')] - if ref_dict['exonEnds'][-1] == '' : ref_dict['exonEnds'].remove('') - - gene_ref[ref_dict['chrom']].append(ref_dict) - - return gene_ref - - -if __name__ == '__main__' : - - opts, args = parser.parse_args(sys.argv[1:]) - - if len(args) < 2 : - parser.error('Must provide two filename arguments') - - gene_ref = parse_gene_ref(open(args[0])) - if opts.peaks_fmt == 'MACS' : - fieldnames = MACSOutput.FIELD_NAMES - chr_field, start_field, end_field = 'chr', 'start', 'end' - elif opts.peaks_fmt == 'BED' : - fieldnames = BEDOutput.FIELD_NAMES - chr_field, start_field, end_field = 'chrom', 'chromStart', 'chromEnd' - else : - fieldnames = [] - - peaks_reader = DictReader(open(args[1]),fieldnames=fieldnames,delimiter='\t') - - # default output format: - # <chromo> <peak loc> <accession #> <gene symbol> <strand> <map type> <map subtype> <score> <dist from feature> - # score = (peak-TSS)/(TSE-TSS) - peak distance from TSS as 
fraction of length of gene - output_fields = ['chromo', - 'peak loc', - 'accession #', - 'gene symbol', - 'strand', - 'map type', - 'map subtype', - 'score', - 'dist from feature', - ] - if opts.peak_output != sys.stdout : - opts.peak_output = open(opts.peak_output,'w') - peaks_writer = DictWriter(opts.peak_output,output_fields,delimiter='\t',lineterminator='\n') - unique_genes = set() - map_stats = dd(int) - for peak in peaks_reader : - - # if this is a comment or header line get skip it - if peak[fieldnames[0]].startswith('#') or \ - peak[fieldnames[0]] == fieldnames[0] or \ - peak[fieldnames[0]].startswith('track') : continue - - # coerce values to numeric if possible - for k,v in peak.items() : peak[k] = parse_number(v) - - # peak assumed to be in the middle of the reported peak range - peak_loc = (peak[start_field]+peak[end_field])/2 - - chrom_genes = gene_ref[peak[chr_field]] - - if len(chrom_genes) == 0 : - sys.stderr.write('WARNING: peak chromosome %s not found in gene reference, skipping: %s\n'%(peak[chr_field],peak)) - continue - - mapped = False - - # walk through the genes for this chromosome - for gene in chrom_genes : - - # reusable dictionary for output - out_d = {}.fromkeys(output_fields,0) - out_d['map type'] = '' - out_d['chromo'] = peak[chr_field] - out_d['peak loc'] = peak_loc - - # determine intervals for promoter, gene, and downstream - if gene['strand'] == '+' : - promoter_coords = max(gene['txStart']-1-opts.upst_win,0), gene['txStart']-1 - gene_coords = gene['txStart'], gene['txEnd'] - downstream_coords = gene['txEnd']+1, gene['txEnd']+1+opts.dnst_win - else : - promoter_coords = gene['txEnd']+1, gene['txEnd']+1+opts.upst_win # +1 because we're using 1 based indexing - gene_coords = gene['txStart'], gene['txEnd'] - downstream_coords = gene['txStart']-1-opts.dnst_win, gene['txStart']-1 # -1 because we're using 1 based indexing - - # check for promoter - if peak_loc >= promoter_coords[0] and peak_loc <= promoter_coords[1] : - out_d['map type'] 
= 'promoter' - out_d['dist from feature'] = peak_loc - promoter_coords[1] if gene['strand'] == '+' else promoter_coords[0] - peak_loc - - # check for gene - elif peak_loc >= gene_coords[0] and peak_loc <= gene_coords[1] : - # check for intron/exon - exon_coords = zip(gene['exonStarts'],gene['exonEnds']) - in_exon = False - for st,en in exon_coords : - if peak_loc >= st and peak_loc <= en : - in_exon = True - break - out_d['map type'] = 'gene' - out_d['map subtype'] = 'exon' if in_exon else 'intron' - - # score = (peak-TSS)/(TSE-TSS) - peak distance from TSS as fraction of length of gene - gene_len = float(gene_coords[1]-gene_coords[0]) - out_d['score'] = (peak_loc-gene_coords[0])/gene_len if gene['strand'] == '+' else (gene_coords[1]-peak_loc)/gene_len - - # distance calculated from start of gene - out_d['dist from feature'] = peak_loc - promoter_coords[1] if gene['strand'] == '+' else promoter_coords[0] - peak_loc - - map_stats[out_d['map subtype']] += 1 - - # check for downstream - elif peak_loc >= downstream_coords[0] and peak_loc <= downstream_coords[1] : - out_d['map type'] = 'after' - out_d['dist from feature'] = peak_loc - downstream_coords[0] if gene['strand'] == '+' else downstream_coords[1] - peak_loc - - # does not map to this gene - else : - pass - - # map type is not blank if we mapped to something - if out_d['map type'] != '' : - - out_d['accession #'] = gene['name'] - out_d['gene symbol'] = gene['geneName'] - out_d['strand'] = gene['strand'] - - map_stats[out_d['map type']] += 1 - peaks_writer.writerow(out_d) - - unique_genes.add(gene['name']) - mapped = True - - """ - print 'Peak:',peak - print 'Gene:',gene - print 'Peak loc:',peak_loc - print promoter_coords - print gene_coords - print downstream_coords - raw_input('Wait for it...') - """ - - # reset map_type - out_d['map type'] = '' - - if not mapped : - #out_d['map type'] = 'intergenic' - #peaks_writer.writerow(out_d) - map_stats['intergenic'] += 1 - - if opts.peak_output != sys.stdout : - 
opts.peak_output.close() - - if opts.stats_output != sys.stderr : - opts.stats_output = open(opts.stats_output,'w') - - for k,v in map_stats.items() : - opts.stats_output.write('%s: %s\n'%(k,v)) - - if opts.stats_output != sys.stderr : - opts.stats_output.close()
--- a/chipsequtil-master/scripts/map_peaks_to_known_genes.py Mon Mar 28 11:56:10 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,233 +0,0 @@ -#!/usr/bin/env python - -import sys, os -from optparse import OptionParser -from collections import defaultdict as dd -from csv import DictReader, DictWriter - -from chipsequtil import MACSFile, BEDFile, KnownGeneFile, parse_number -from chipsequtil.util import MultiLineHelpFormatter - -usage = '%prog [options] <knownGene file> <knownGene xRef file> <peaks file>' -description = """ -Map the peaks in <peaks file> to genes in <knownGene file>. <knownGene file> is\ -format is as specified in http://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/knownGene.sql.\ -<peaks file> format is as produced by MACS. If *auto* is chosen (default) file extension \ -is examined for *.xls* for default MACS format or *.bed* for BED format. If the --detail\ -option is provided, the following extra fields are appended to each row: - -peak loc, dist from feature, score, map type, map subtype -""" -epilog = '' -parser = OptionParser(usage=usage,description=description,epilog=epilog,formatter=MultiLineHelpFormatter()) -parser.add_option('--upstream-window',dest='upst_win',type='int',default=5500,help='window width in base pairs to consider promoter region [default: %default]') -parser.add_option('--downstream-window',dest='dnst_win',type='int',default=2500,help='window width in base pairs to consider downstream region [default: %default]') -parser.add_option('--tss',dest='tss',action='store_true',help='calculate downstream window from transcription start site instead of transcription end site') -parser.add_option('--map-output',dest='peak_output',default=None,help='filename to output mapped peaks to [default: stdout]') -parser.add_option('--stats-output',dest='stats_output',default=sys.stderr,help='filename to output summary stats in conversion [default: stderr]') 
-parser.add_option('--peaks-format',dest='peaks_fmt',default='auto',type='choice',choices=['auto','MACS','BED'],help='format of peaks input file [default: %default]') -parser.add_option('--detail',dest='detail',action='store_true',help='add extra fields to output, see description') -parser.add_option('--intergenic',dest='intergenic',action='store_true',help='write intergenic peaks to the gene file as well with None as gene ID') -#parser.add_option('--symbol-xref',dest='symbol_xref',default=None,help='use the kgXref table file supplied to find a gene symbol, output as second column') - -# TODO - options -#parser.add_option('--use-cds',dest='use_cds',action='store_true',help='use cdsStart and cdsEnd fields instead of txStart and txEnd to do mapping') -#parser.add_option('--capture-intergenic'...) -#parser.add_option('--map-format',dest='peak_format',type='choice',choices=['default','BED'],help='format of peak output [default: %default]') -#parser.add_option('--stats-format',dest='stats_format',type='choice',choices=['human','python'],help='format of summary stats output [default: %default]') - -def parse_gene_ref(ref_gene) : - reader = KnownGeneFile(ref_gene) - gene_ref = dd(list) - for ref_dict in reader : - gene_ref[ref_dict['chrom']].append(ref_dict) - - return gene_ref - -def parse_gene_ref_line(l) : - l = map(parse_number, l) # coerce to numbers where possible - l[9] = map(parse_number, l[9].split(',')) # turn 'x,x,x,...' 
into list - l[10] = map(parse_number, l[10].split(',')) - return l - -if __name__ == '__main__' : - - opts, args = parser.parse_args(sys.argv[1:]) - - if len(args) < 3 : - parser.error('Must provide three filename arguments') - - gene_ref = parse_gene_ref(args[0]) - xref_fn = args[1] - peaks_fn = args[2] - if opts.peaks_fmt == 'auto' : - path,ext = os.path.splitext(peaks_fn) - if ext.lower() == '.xls' : - opts.peaks_fmt = 'MACS' - elif ext.lower() == '.bed' : - opts.peaks_fmt = 'BED' - else : - parser.error('Could not guess peaks file format by extension (%s), aborting'%ext) - - if opts.peaks_fmt == 'MACS' : - peaks_reader_cls = MACSFile - chr_field, start_field, end_field = 'chr', 'start', 'end' - elif opts.peaks_fmt == 'BED' : - peaks_reader_cls = BEDFile - chr_field, start_field, end_field = 'chrom', 'chromStart', 'chromEnd' - else : - # should never happen - fieldnames = [] - - #peaks_reader = DictReader(open(args[1]),fieldnames=fieldnames,delimiter='\t') - peaks_reader = peaks_reader_cls(peaks_fn) - - # default output format: - if opts.peak_output : - peak_output = open(opts.peak_output,'w') - else : - peak_output = sys.stdout - - fieldnames = peaks_reader.FIELD_NAMES - if opts.detail : - fieldnames += ["peak loc","dist from feature","score","map type","map subtype"] - output_fields = ['knownGeneID']+fieldnames - - # see if the user wants gene symbols too - # TODO - actually make this an option, or make it required - opts.symbol_xref = xref_fn - if opts.symbol_xref : - kgXref_fieldnames = ['kgID','mRNA','spID','spDisplayID','geneSymbol','refseq','protAcc','description'] - symbol_xref_reader = DictReader(open(opts.symbol_xref),fieldnames=kgXref_fieldnames,delimiter='\t') - symbol_xref_map = {} - for rec in symbol_xref_reader : - symbol_xref_map[rec['kgID']] = rec - output_fields = ['knownGeneID','geneSymbol']+fieldnames - - peaks_writer = DictWriter(peak_output,output_fields,delimiter='\t',extrasaction='ignore',lineterminator='\n') - 
peaks_writer.writerow(dict([(k,k) for k in output_fields])) - unique_genes = set() - map_stats = dd(int) - for peak in peaks_reader : - - # if this is a comment or header line get skip it - if peak[fieldnames[0]].startswith('#') or \ - peak[fieldnames[0]] == fieldnames[0] or \ - peak[fieldnames[0]].startswith('track') : continue - - # coerce values to numeric if possible - for k,v in peak.items() : peak[k] = parse_number(v) - - # MACS output gives us summit - if opts.peaks_fmt == 'MACS' : - peak_loc = peak[start_field]+peak['summit'] - else : # peak assumed to be in the middle of the reported peak range - peak_loc = (peak[start_field]+peak[end_field])/2 - - chrom_genes = gene_ref[peak[chr_field]] - - if len(chrom_genes) == 0 : - sys.stderr.write('WARNING: peak chromosome %s not found in gene reference, skipping: %s\n'%(peak[chr_field],peak)) - continue - - mapped = False - - # walk through the genes for this chromosome - for gene in chrom_genes : - - # reusable dictionary for output - out_d = {}.fromkeys(output_fields,0) - out_d.update(peak) - out_d['map type'] = '' - out_d['chromo'] = peak[chr_field] - out_d['peak loc'] = peak_loc - - # determine intervals for promoter, gene, and downstream - if gene['strand'] == '+' : - promoter_coords = max(gene['txStart']-1-opts.upst_win,0), gene['txStart']-1 - if opts.tss : - gene_coords = gene['txStart'], min(gene['txEnd'],gene['txStart']+opts.dnst_win) - downstream_coords = gene['txEnd']+1,gene['txStart']+opts.dnst_win - else : - gene_coords = gene['txStart'], gene['txEnd'] - downstream_coords = gene['txEnd']+1, gene['txEnd']+1+opts.dnst_win - else : - promoter_coords = gene['txEnd']+1, gene['txEnd']+1+opts.upst_win # +1 because we're using 1 based indexing - if opts.tss : - gene_coords = max(gene['txStart'],gene['txEnd']-opts.upst_win), gene['txEnd'] - downstream_coords = gene['txEnd']-1-opts.dnst_win, gene['txStart']-1 # -1 because we're using 1 based indexing - else : - gene_coords = gene['txStart'], gene['txEnd'] - 
downstream_coords = gene['txStart']-1-opts.dnst_win, gene['txStart']-1 # -1 because we're using 1 based indexing - - # check for promoter - if peak_loc >= promoter_coords[0] and peak_loc <= promoter_coords[1] : - out_d['map type'] = 'promoter' - out_d['dist from feature'] = peak_loc - promoter_coords[1] if gene['strand'] == '+' else promoter_coords[0] - peak_loc - - # check for gene - elif peak_loc >= gene_coords[0] and peak_loc <= gene_coords[1] : - # check for intron/exon - exon_coords = zip(gene['exonStarts'],gene['exonEnds']) - in_exon = False - for st,en in exon_coords : - if peak_loc >= st and peak_loc <= en : - in_exon = True - break - out_d['map type'] = 'gene' - out_d['map subtype'] = 'exon' if in_exon else 'intron' - - # score = (peak-TSS)/(TSE-TSS) - peak distance from TSS as fraction of length of gene - gene_len = float(gene_coords[1]-gene_coords[0]) - out_d['score'] = (peak_loc-gene_coords[0])/gene_len if gene['strand'] == '+' else (gene_coords[1]-peak_loc)/gene_len - - # distance calculated from start of gene - out_d['dist from feature'] = peak_loc - promoter_coords[1] if gene['strand'] == '+' else promoter_coords[0] - peak_loc - - map_stats[out_d['map subtype']] += 1 - - # check for downstream - elif peak_loc >= downstream_coords[0] and peak_loc <= downstream_coords[1] : - out_d['map type'] = 'after' - if opts.tss : - out_d['dist from feature'] = peak_loc - gene_coords[0] if gene['strand'] == '+' else gene_coords[1] - peak_loc - else : - out_d['dist from feature'] = peak_loc - downstream_coords[0] if gene['strand'] == '+' else downstream_coords[1] - peak_loc - - # does not map to this gene - else : - pass - - # map type is not blank if we mapped to something - if out_d['map type'] != '' : - - #out_d = {'knownGeneID':gene['name']} - out_d['knownGeneID'] = gene['name'] - if opts.symbol_xref : - out_d['geneSymbol'] = symbol_xref_map[gene['name']]['geneSymbol'] - peaks_writer.writerow(out_d) - - mapped = True - - # reset map_type - out_d['map type'] = '' 
- - if not mapped : - if opts.intergenic : - out_d['knownGeneID'] = 'None' - out_d['geneSymbol'] = 'None' - out_d['map type'] = 'intergenic' - peaks_writer.writerow(out_d) - map_stats['intergenic'] += 1 - - if peak_output != sys.stdout : - peak_output.close() - - #if opts.stats_output != sys.stderr : - # opts.stats_output = open(opts.stats_output,'w') - - #for k,v in map_stats.items() : - # opts.stats_output.write('%s: %s\n'%(k,v)) - - #if opts.stats_output != sys.stderr : - # opts.stats_output.close()
--- a/chipsequtil-master/scripts/motif_scan.py Mon Mar 28 11:56:10 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,330 +0,0 @@ -#!/usr/bin/env python - -import matplotlib -matplotlib.use('AGG') - -import numpy as np -import os -import random -import string -import sys - -from math import log, pow -import matplotlib.pyplot as mp -from multiprocessing import Pool -from optparse import OptionParser -from scipy.stats.stats import pearsonr - -from chipsequtil import MACSFile, get_org_settings -from chipsequtil.nib import NibDB -from chipsequtil.sampling import rejection_sample_bg -from TAMO import MotifTools as mt -from TAMO.MotifTools import load - -usage = "%prog [options] <org> <peaks fn> <TAMO motif fn>" -desc = "Do some motif scanning stuffs" -parser = OptionParser(usage=usage,description=desc) - -parser.add_option('-n','--top-n',dest='top_n',type='int',default=None, - help='use top n peaks by pvalue for sequence scanning [default: all]') -parser.add_option('-i','--motif-indices',dest='motif_ind',default='all', - help='which indices from <TAMO motif fn> to use [default: %default]') -parser.add_option('-d','--dir',dest='dir',default='motif_results', - help='write all results into this directory') -parser.add_option('--fixed-peak-width',dest='fixed_w',type='int',default=None, - help='use only a fixed peak window around the summit instead of whole peak') - -revcomp_map = string.maketrans('ACGT','TGCA') - -def score_sequence(seq,motif) : - ll_max = -sys.maxint - for i in range(len(seq)-len(motif)) : - # forward strand - ll_for_sum = 0 - subseq = seq[i:i+len(motif)].upper() - for n,pos in zip(subseq,motif.ll) : - ll_for_sum += pos[n] - # reverse strand - ll_rev_sum = 0 - subseq = reversed(subseq.translate(revcomp_map)) - for n,pos in zip(subseq,motif.ll) : - ll_rev_sum += pos[n] - ll_max = max(ll_max,ll_for_sum,ll_rev_sum) - - return ll_max - -illegal_fn_chars = '/;& ()' -fn_trans = string.maketrans(illegal_fn_chars,'_'*len(illegal_fn_chars)) - -def 
fasta_itr(fn) : - f = open(fn) - header = None - seq = None - for l in f : - if l.strip().startswith('>') : - if seq is not None : - yield (header,seq) - seq = None - header = l.strip() - else : - seq = seq+l.strip() if seq is not None else l.strip() - - # last record - yield (header, seq) - -if __name__ == '__main__' : - - opts, args = parser.parse_args(sys.argv[1:]) - - if len(args) != 3 : - parser.error('Exactly 3 non-option arguments must be provided') - - org, peaks_fn, motif_fn = args - - if not os.path.exists(opts.dir) : - os.mkdir(opts.dir) - - peaks_dt = np.dtype([('chr',np.str_,13),('start',np.int32),('end',np.int32),('pvalue',np.float64)]) - if opts.fixed_w is not None : - - all_peaks = np.array([(r['chr'], - r['start']+r['summit']-opts.fixed_w/2., - r['start']+r['summit']+opts.fixed_w/2., - r['-10*log10(pvalue)']) for r in MACSFile(peaks_fn)], - dtype=peaks_dt) - else : - all_peaks = np.array([(r['chr'], - r['start'], - r['end'], - r['-10*log10(pvalue)']) for r in MACSFile(peaks_fn)], - dtype=peaks_dt) - - # -10*log10(pvalue) -> -log10(pvalue) - all_peaks[:]['pvalue'] /= 10. 
- peak_pvals = all_peaks[:]['pvalue'] - - # find the sorted order of peaks by descending pvalue - peak_pval_inds = peak_pvals.argsort() - peak_pval_inds = peak_pval_inds[::-1] # ascending -> descending - all_peaks = all_peaks[peak_pval_inds,:] - - # for pvalue vs motif score - pval_num_bins = 20 - pval_bin_size = all_peaks[:]['pvalue'].size/pval_num_bins - # try to take at least 100 sequences, at most 10% of bin size - sample_percent = max(min(1.,100./pval_bin_size),0.1) - pval_bin_memo = {} - - if opts.top_n is not None : - peaks = all_peaks[0:opts.top_n] - peak_pvals = peak_pvals[peak_pval_inds][0:opts.top_n] - else : - peaks = all_peaks - - # extract fasta sequences for these peaks - nibDb = NibDB(nib_dirs=get_org_settings(org)['genome_dir']) - - """ - # get the peak sequences - sys.stderr.write('Getting peak sequences\n') - fasta_batch = [] - for i in range(peaks.size) : - fasta_batch.append((str(peaks[i]['chr']),int(peaks[i]['start']),int(peaks[i]['end']),'+')) - fg_fasta_headers, fg_fasta = nibDb.get_fasta_batch(fasta_batch) - - # need a dict for background sampling - # headers have genome_dir and .nib in them, strip that out - sys.stderr.write('Converting nib output to dict\n') - fg_fasta_headers = list(fg_fasta_headers) - fg_fasta_dict = {} - for h,s in zip(fg_fasta_headers,fg_fasta) : - h = h.replace('>'+get_org_settings(org)['genome_dir']+'/','') - h = h.replace('.nib','') - if len(s) > 150 : - fg_fasta_dict[h] = s - - # now sample the background sequences - sys.stderr.write('Sampling bg sequences (len(fg_fasta)==%d)\n'%(len(fg_fasta_dict))) - #bg_fasta_dict = rejection_sample_bg(fg_fasta_dict,org,bg_match_epsilon=1e-3,verbose=True) - bg_fasta_dict = {} - bg_fasta = bg_fasta_dict.values() - """ - - # load the motifs - sys.stderr.write('Movin right along\n') - motifs = load(motif_fn) - - if opts.motif_ind != 'all' : - motif_indices = [int(i) for i in opts.motif_ind.split(',') if len(i) != 0] - motifs = [motifs[i] for i in motif_indices] - else : - 
motif_indices = xrange(len(motifs)) - - # use all cores w/ a Pool - #pool = Pool(processes=opts.n_procs) - - # go through each motif - job_params = [] - res = [] - #for i,m in zip(motif_indices,motifs) : - # job_params.append((i,m,peak_pvals,fg_fasta,bg_fasta,opts.dir)) - #seq_scores = pool.map(analyze_motif_sequences,job_params) - - seq_scores = [] - for m_i,m in zip(motif_indices,motifs) : - - out_dir = opts.dir - - try : - m_name = m.source.split('\t')[2] - except : - m_name = m.source.split()[0] - - print 'starting',m_name - - # pvalue vs motif score - pval_bin_bounds = [] - pval_bin_pvals = [] - pval_bin_ranges = np.arange(0,all_peaks[:]['pvalue'].size,pval_bin_size) - for st_i in pval_bin_ranges : - - end_i = min(st_i+pval_bin_size,all_peaks[:]['pvalue'].size-1) - st_val = all_peaks[st_i]['pvalue'] - end_val = all_peaks[end_i]['pvalue'] - - #print st_i, end_i, pval_bin_size, st_val, end_val - - # keep track of the pvalue bounds of each bin - pval_bin_bounds.append((st_val,end_val)) - - # we sample sample_percent% of peaks in the bin to score - num_to_sample = int(sample_percent*(end_i-st_i)) - inds_to_sample = random.sample(xrange(st_i,end_i),num_to_sample) - - # we memoize the sequences we've seen before so we don't fetch seqs - # unnecessarily - unmemoed_inds_to_sample = set(inds_to_sample).difference(set(pval_bin_memo.keys())) - - bin_fasta_batch = [] - for peak_i in unmemoed_inds_to_sample : - bin_fasta_batch.append((str(all_peaks[peak_i]['chr']), - int(all_peaks[peak_i]['start']), - int(all_peaks[peak_i]['end']), - '+')) - - if len(bin_fasta_batch) != 0 : - bin_headers, bin_seq = nibDb.get_fasta_batch(bin_fasta_batch) - - for i, ind in enumerate(unmemoed_inds_to_sample) : - pval_bin_memo[ind] = bin_seq[i].upper() - - # score the sequences - pval_bin_pvals.append([]) - for ind in inds_to_sample : - max_score = m.bestscan(pval_bin_memo[ind]) - max_score = (max_score-m.minscore)/(m.maxscore-m.minscore) - pval_bin_pvals[-1].append(max_score) - 
pval_bin_pvals[-1] = np.array(pval_bin_pvals[-1]) - - - mp.figure(figsize=(4,4)) - font = {'size':'9'} - mp.rc('font',**font) - - # box plot of the bins - mp.boxplot(pval_bin_pvals,positions=np.arange(len(pval_bin_pvals))) - - # plot the means of the bins - #[(x[0]+x[1])/2. for x in pval_bin_bounds] - mp.plot(np.arange(len(pval_bin_pvals)), - [x.mean() for x in pval_bin_pvals],'bo') - mp.title('Sampled motif score vs binned peak pvalue') - mp.xlabel('Binned -log10(pvalue)') - mp.ylabel('Maximum normalized motif score') - - img_fn = os.path.join(out_dir,m_name.translate(fn_trans)+'_%d_peakmot.png'%m_i) - mp.savefig(img_fn) - mp.clf() - - continue - - fg_ratios = [] - for seq in fg_fasta : - #max_score = score_sequence(seq,m) - max_score = m.bestscan(seq.upper()) - fg_ratios.append((max_score-m.minscore)/(m.maxscore-m.minscore)) - fg_ratios = np.array(fg_ratios) - - bg_ratios = [] - for seq in bg_fasta : - #max_score = score_sequence(seq,m) - max_score = m.bestscan(seq.upper()) - bg_ratios.append((max_score-m.minscore)/(m.maxscore-m.minscore)) - bg_ratios = np.array(bg_ratios) - - fg_mean = sum(fg_ratios)/len(fg_ratios) - fg_std = np.std(fg_ratios) - bg_mean = sum(bg_ratios)/len(bg_ratios) - bg_std = np.std(bg_ratios) - - m_mat = np.array((fg_ratios,bg_ratios,peak_pvals)) - fg_score_sort_inds = m_mat[0,:].argsort() - - motif_score_cnts, motif_score_bins = np.histogram(m_mat[0,:],bins=20) - binned_motif_scores = [] - for st, end in zip(motif_score_bins[:-1],motif_score_bins[1:]) : - binned_motif_scores.append(m_mat[2,(m_mat[0,:]>=st)&(m_mat[0,:]<end)]) - - mp.figure(figsize=(4,4)) - font = {'size':'9'} - mp.rc('font',**font) - - mp.plot(fg_ratios,peak_pvals,'bo') - - # calculate pearson correlation coefficient - pear_r, pear_pval = pearsonr(fg_ratios,peak_pvals) - mp.title('Max motif strength vs peak pvalue\n(r=%.2f,pval=%.2g)'%(pear_r,pear_pval)) - img_fn = os.path.join(out_dir,m_name.translate(fn_trans)+'_%d_corr.png'%m_i) - mp.savefig(img_fn) - mp.clf() - - # line 
plot of average peak p-value for binned motif score - mp.title('Average peak p-value for binned motif score\n%s'%m_name) - mp.xlabel('normalized motif score') - mp.ylabel('-log10(pvalue)') - mp.boxplot(binned_motif_scores,positions=np.arange(motif_score_bins.size-1),sym='') - p = mp.plot(np.arange(motif_score_bins.size-1), - [x.mean() for x in binned_motif_scores], - 'bo', - label='Mean fg score') - p = p[0] - - # draw a crosshair - bg_median_ind = np.argwhere(((motif_score_bins<=bg_mean)[:-1] & (motif_score_bins>=bg_mean)[1:])).ravel()[0] - bg_median = np.median(binned_motif_scores[bg_median_ind]) - xlim, ylim = p.axes.get_xlim(), p.axes.get_ylim() - mp.plot([bg_median_ind,bg_median_ind],ylim,'k-',label='Mean bg score=%.2g'%m_mat[1,:].mean()) - mp.plot(xlim,[bg_median,bg_median],'k-') - mp.xticks(np.arange(motif_score_bins.size)[1::5],['%.2f'%x for x in motif_score_bins[1::5]]) - mp.legend(loc='upper left') - - img_fn = os.path.join(out_dir,m_name.translate(fn_trans)+'_%d_peakmot.png'%m_i) - mp.savefig(img_fn) - mp.clf() - - ret_d ={'m_name': m_name, - 'fg_mean': fg_mean, - 'fg_std': fg_std, - 'bg_mean': bg_mean, - 'bg_std': bg_std, - 'fg_scores': fg_ratios, - 'bg_scores': bg_ratios, - #'wmw_pval': WMWtest(fg_ratios,bg_ratios) - } - - # binned pvalue vs sampled motif score - - - print 'done with',m_name - - seq_scores.append(ret_d)
--- a/chipsequtil-master/scripts/nibFrag.py Mon Mar 28 11:56:10 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,128 +0,0 @@ -#!/usr/bin/env python -# nibFrag.py - a python implementation of Jim Kent's nibFrag command line utility - -import sys -import warnings -from optparse import OptionParser, OptionGroup - -from chipsequtil import get_file_parts, BEDFile -from chipsequtil.nib import get_nib_batch, validate_nib_file, NibException, NOMASK, MASK, HARDMASK - -usage = '%prog [options] file.nib start end strand [outfile]\n -- or --\n%prog [options] --batch file.nib batchfile [batchfile ...]' -description = """A python implementation of Jim Kent's nibFrag utility that allows outputting to \ -stdout. Otherwise the functionality is identical for the non-batch usage. Batch mode accepts \ -one or more files containing sets of coordinates to extract from the nib file. Only BED formatting \ -is accepted at the moment. All sequences are concatenated together in FASTA format. To retrieve the \ -entire sequence, use END as the end argument.""" -epilog="Note: When specifying --name optionin batch mode, also specify --dbHeader to ensure unique FASTA headers." 
-parser = OptionParser(usage=usage,description=description,epilog=epilog) -#parser.add_option('--output',dest='output',default=sys.stdout,help='filename to write output to [default: stdout]') -parser.add_option('--no-header',dest='no_header',action='store_true',help='only output sequence (no fasta header)') -parser.add_option('--wrap-width',dest='wrap_width',type='int',default=50,help='wrap output sequence at this number of bases, 0 indicates no wrap (sequence ends up on single line) [default: %default]') -parser.add_option('--batch',dest='batch',action='store_true',help='run in batch mode, interpret arguments after nib file as queries') -parser.add_option('--batch-format',dest='batch_format',type='choice',choices=['BED'],default='BED',help='format to interpret batch files [default: %default]') -#parser.add_option('--mask-type',dest='mask_type',type='choice',choices=['NOMASK','MASK','HARDMASK'],default='NOMASK',help='how to handle masked positions, correspond to original nibFrag options --masked and --hardMasked [default: %default]') - -# original nibFrag usage: -#nibFrag - Extract part of a nib file as .fa (all bases/gaps lower case by default) -#usage: -# nibFrag [options] file.nib start end strand out.fa -#where strand is + (plus) or m (minus) -#options: -# -masked - use lower case characters for bases meant to be masked out -# -hardMasked - use upper case for not masked-out and 'N' characters for masked-out bases -# -upper - use upper case characters for all bases -# -name=name Use given name after '>' in output sequence -# -dbHeader=db Add full database info to the header, with or without -name option -# -tbaHeader=db Format header for compatibility with tba, takes database name as argument - -# original nibFrag options -nibFrag_grp = OptionGroup(parser,"Original nibFrag options") -nibFrag_grp.add_option('--masked',dest='masked',action='store_true',help='use lower case characters for bases meant to be masked out') 
-nibFrag_grp.add_option('--hardMasked',dest='hardmasked',action='store_true',help='use upper case for non masked-out and \'N\' characters for masked-out bases') -nibFrag_grp.add_option('--upper',dest='upper',action='store_true',help='use upper case characters for all bases') -nibFrag_grp.add_option('--name',dest='name',default=None,help='Use given name after \'>\' in output sequence') -nibFrag_grp.add_option('--dbHeader',dest='dbHeader',default=None,help='Add full database info to the header, with or without -name option') -nibFrag_grp.add_option('--tbaHeader',dest='tbaHeader',default=None,help='Format header for compatibility with tba, takes database name as argument') -parser.add_option_group(nibFrag_grp) - - -if __name__ == '__main__' : - - opts, args = parser.parse_args(sys.argv[1:]) - - if len(args) < 1 : - parser.print_usage() - parser.exit(1) - - # setup - nib_path = args[0] - nib_dir,nib_fn,nib_base,nib_ext = get_file_parts(nib_path) - - queries = [] - if opts.batch : - - if len(args) < 2 : - parser.error('Two arguments must be supplied in batch mode') - - batch_fns = args[1:] - - for fn in batch_fns : - if opts.batch_format == 'BED' : - for bed in BEDFile(fn) : - if bed['chrom'] != nib_base : - warnings.warn('Chromosome in BED line %s does not match file %s, skipping'%(bed['chrom'],nib_base)) - else : - queries.append((int(bed['chromStart']),int(bed['chromEnd']),bed['strand'])) - else : - - if len(args) < 4 : - parser.error('Four arguments must be supplied in non-batch mode') - - # setup - strand = args[3] - start, end = int(args[1]),args[2] - if end == 'END' : - end = -1 - else : - end = int(end) - if end < start : - parser.error('Stop coordinate %d smaller than start %d'%(end,start)) - - queries.append((start,end,strand)) - - mask_type = NOMASK - if opts.masked : - mask_type = MASK - elif opts.hardmasked : - mask_type = HARDMASK - - # set the output file - if len(args) > 4 : - out_f = open(args[4],'w') - else : - out_f = sys.stdout - - # get the 
sequences from the .nib file - try : - headers, seqs = get_nib_batch(nib_path,queries,mask_type) - except NibException, e : - sys.stderr.write(e.message+'\n') - sys.exit(1) - - nbases = validate_nib_file(nib_path) - - # output all queries - for header, seq in zip(headers,seqs) : - - # write output - out_f.write(header) - - if opts.upper : - seq = seq.upper() - if opts.wrap_width == 0 : - out_f.write(seq+'\n') - else : - for i in xrange(0,len(seq),opts.wrap_width) : - out_f.write(seq[i:i+opts.wrap_width]+'\n') -
--- a/chipsequtil-master/scripts/org_settings.py Mon Mar 28 11:56:10 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,126 +0,0 @@ -#!/usr/bin/env python - -import os -import sys -from optparse import OptionParser -from ConfigParser import ConfigParser, NoSectionError -from pprint import pformat - -from chipsequtil import get_org_settings, get_global_settings, get_all_settings, get_local_settings, GLOBAL_SETTINGS_FN, LOCAL_SETTINGS_FN - -usage = '%prog [options] [<org key> [<org setting>]]' -description='''Tool for retrieving sets of organism-specific settings and paths. -Original paths are set at install time, and can be overridden in the file ~/.org -settings.cfg. Allows output of settings in a variety of shell environment -syntaxes. The tool attempts to guess which shell environment is being used by -examining the SHELL environment variable unless explicitly set. When run without -an argument, returns a listing of all settings available. -''' -parser = OptionParser(usage=usage,description=description) -parser.add_option('-s','--syntax',dest='syntax',type='choice',\ - choices=['auto','python','bash','tcsh'],default='auto',help='syntax flavor \ - of output to produce [default: %auto]') -parser.add_option('-l','--list',dest='list_sets',action='store_true',help='print \ - all available settings for human consumption') - - -def obj_to_format(obj,format='python') : - '''Convert *obj* into a string that can be evaluated in the environment \ - indicated in *format*. 
- - obj -- a string, a dict of values, or a dict of dicts of values - format -- python (default), or bash - ''' - - if format == 'auto' : - format = os.environ.get('SHELL','python').split('/')[-1] - - r = '' - if format == 'python' : - r = pformat(obj) - elif format in ['sh','bash','zsh','csh','tcsh'] : - statements = [] - if format in ['sh','bash','zsh'] : - export_tmpl = 'export %s=%s' - elif format in ['csh','tcsh'] : - export_tmpl = 'setenv %s %s' - - # dict - if isinstance(obj,dict) : - for k1, v1 in obj.items() : - # dict of dicts - if isinstance(v1,dict) : - # these should be literal values - for k2, v2 in v1.items() : - statements.append(export_tmpl%('_'.join([k1,k2]).upper(),\ - str(v2))) - else : - v1 = str(v1) - s = '\''+v1+'\'' if v1.count(' ') != 0 else str(v1) - statements.append(export_tmpl%(k1.upper(),str(s))) - else : - return str(obj) - - r = '\n'.join(statements) - - return r - - -if __name__ == '__main__' : - - opts, args = parser.parse_args(sys.argv[1:]) - - # output depends on number of arguments passed - output = '' - - # return everything we know about - if len(args) == 0 : - - if opts.list_sets : - - # always use python formatting when listing - opts.syntax = 'python' - - # global settings - settings = get_global_settings() - output = 'Global settings: (%s)\n'%GLOBAL_SETTINGS_FN - output += obj_to_format(settings,opts.syntax) + '\n' - - # local settings - settings = get_local_settings() - output += 'Local settings: (%s)\n'%LOCAL_SETTINGS_FN - output += obj_to_format(settings,opts.syntax) - else : - settings = get_all_settings() - output += obj_to_format(settings,opts.syntax) - - - # return all records from the specific organism - elif len(args) in (1,2) : - - # make sure our config files have the requested organism - try : - settings = get_org_settings(args[0]) - except NoSectionError : - sys.stderr.write('No entry %s found, available:\n'%args[0]+\ - pformat(get_all_settings().keys())+'\nExiting\n') - sys.exit(1) - - # return the requested 
field from the specific organism - if len(args) == 2 : - - # make sure the config file has the setting for this organism - try : - output = obj_to_format(settings[args[1]],opts.syntax) - except KeyError : - sys.stderr.write('Setting %s not found for %s, choices:\n'%(args[1],args[0])+ - pformat(settings.keys())+'\nExiting\n') - sys.exit(2) - else : - output = obj_to_format(settings,opts.syntax) - else : - parser.error('Provide zero, one, or two argments, found %s'%args) - - # bon voyage - sys.stdout.write(output+'\n') -
--- a/chipsequtil-master/scripts/peaks_to_fasta.py Mon Mar 28 11:56:10 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,144 +0,0 @@ -#!/usr/bin/env python - -import os -import sys -import textwrap -import warnings -from optparse import OptionParser - -from chipsequtil import BEDFile, MACSFile, get_file_parts, get_org_settings -from chipsequtil.nib import NibDB -from chipsequtil.sampling import rejection_sample_bg -from chipsequtil.util import MultiLineHelpFormatter -from chipsequtil.seq import write_fasta_to_file - - -usage='%prog [options] <organism> <peak file> [<peak file> ...]' -description='''Extract sequences for peaks in provided peak file(s). Can \ -interpret MACS or BED output, determined automatically by .xls or .bed extensions \ -respectively (force explicit format with --peak-format option). Outputs fasta \ -sequences for the peaks in all files extracted from the reference genome specified \ -by the output of *org_settings.py <organism> genome_dir* to stdout by default.\ -Chromosome names in peak files must match nib filenames without extension (e.g. \ -peak line: chr1 0 100 searches *genome_dir*/chr1.nib). 
Fasta records have the \ -following format: - -><chromosome>:<start>-<end>;fn=<name of file>:<line number>;db_fn=<db filename>;fmt=<format>;<source alignment info> -<sequence...> - -<db filename> is the filename where the sequence was extracted, <format> is the \ -format of the input file (MACS or BED), and <source alignment info> contains all \ -the fields from the originating alignment according to the source format.''' -parser = OptionParser(usage=usage,description=description,formatter=MultiLineHelpFormatter()) -parser.add_option('--min-header',dest='min_header',action='store_true',help='only store <chromosome>:<start>-<end> in header') -parser.add_option('--peak-format',dest='peak_format',type='choice', - choices=['auto','MACS','BED'],default='auto', - help='peak file format, \'auto\' determines format by extension, choices: MACS, BED, auto [default: %default]') -parser.add_option('--output',dest='output',default=None,help='filename to output fasta records to [default: stdout]') -parser.add_option('--fixed-peak-width',dest='fixed_peak_width',type='int',default=None,help='return a fixed number of bases flanking peak summit (*summit* field in MACS, (end-start)/2 in BED), ignoring start/stop coords [default: None]') -parser.add_option('--wrap-width',dest='wrap_width',type='int',default=70,help='wrap fasta sequences to specified width. 
-1 indicates no wrap [default: %default]') - - -def bed_to_fasta(fn,db,min_header=False) : - #headers,seqs = db.get_fasta_from_bed(fn) - fastas = [] - bed_recs = BEDFile(fn) - for i,rec in enumerate(bed_recs) : - - if opts.fixed_peak_width : - midpoint = (rec['chromEnd']-rec['chromStart'])/2 - start = max(0,midpoint-opts.fixed_peak_width/2) - end = min(midpoint+opts.fixed_peak_width/2,db.db_info[rec['chrom']]['nbases']) - coords = start, end - else : - coords = start,end = int(rec['chromStart']), int(rec['chromEnd']) - - seq = db.get_seq(rec['chrom'], start, end) - seq_fn = db.db_info[rec['chrom']]['path'] - - header = '%s:%s;'%(rec['chrom'],'%d-%d'%(start,end)) - if not min_header : - header = header.strip()+'%s:%d;fmt=BED;'%(fn,i)+ \ - ';'.join(['%s=%s'%(k,str(v)) for k,v in rec.items()]) - fastas.append((header,seq)) - - return fastas - - -def macs_to_fasta(fn,db,min_header=False) : - macs_recs = MACSFile(fn) - fasta = [] - for i,rec in enumerate(macs_recs) : - - if opts.fixed_peak_width : - # adjust start and end peak position based on summit, ensuring we don't step outside of the reference sequence bounds - start = max(0, rec['start']+rec['summit']-opts.fixed_peak_width/2) - end = min(rec['start']+rec['summit']+opts.fixed_peak_width/2, db.db_info[rec['chr']]['nbases']) - coords = start, end - else : - start, end = coords = rec['start'], rec['end'] - - seq = db.get_seq(rec['chr'],start,end) - seq_fn = db.db_info[rec['chr']]['path'] - - header = '%s:%s'%(rec['chr'],'%d-%d'%coords) - if not min_header : - header += ';%s:%d;db_fn=%s;fmt=MACS;'%(fn,i,seq_fn) + \ - ';'.join(['%s=%s'%(k,str(v)) for k,v in rec.items()]) - fasta.append((header,seq)) - - return fasta - - -if __name__ == '__main__' : - - opts, args = parser.parse_args(sys.argv[1:]) - - if len(args) < 2 : - parser.error('Must provide at least two non-option arguments') - - # instantiate the NibDB from the provided directory - organism = args[0] - nib_dir = get_org_settings(organism)['genome_dir'] - nib_db 
= NibDB(nib_dirs=[nib_dir]) - - # determine specified format - peak_fmt = opts.peak_format - - peak_fns = args[1:] - - # determine if there is an output file - if opts.output : - out_f = open(opts.output,'w') - else : - out_f = sys.stdout - - fasta_recs = [] - for peak_fn in peak_fns : - # if --peak-format is auto, figure format out from extension - if opts.peak_format == 'auto' : - fnbase, fnext = os.path.splitext(peak_fn) - if fnext.lower() == '.bed' : # BED file - peak_fmt = 'BED' - elif fnext.lower() == '.xls' : # MACS file - peak_fmt = 'MACS' - else : - warnings.warn('Peak format specified as auto but file extension \ - not recognized in file %s, skipping'%peak_fn) - continue - - if peak_fmt == 'BED' : - fasta_recs.extend(bed_to_fasta(peak_fn,nib_db,min_header=opts.min_header)) - elif peak_fmt == 'MACS' : - fasta_recs.extend(macs_to_fasta(peak_fn,nib_db,min_header=opts.min_header)) - - # write out foreground to file - if opts.output : - if opts.wrap_width == -1 : - opts.wrap_width = sys.maxint - write_fasta_to_file(dict(fasta_recs),opts.output,linelen=opts.wrap_width) - else : - for header, seq in fasta_recs : - if opts.wrap_width != -1 : - seq = textwrap.fill(seq,opts.wrap_width) - sys.stdout.write('>%s\n%s\n'%(header,seq))
--- a/chipsequtil-master/scripts/plot_peak_loc_dist.py Mon Mar 28 11:56:10 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,225 +0,0 @@ -#!/usr/bin/env python - -import matplotlib -matplotlib.use('AGG') - -import matplotlib.pyplot as mp -import numpy as np -import os -import sys - -from collections import defaultdict -from csv import reader, writer -from optparse import OptionParser -from StringIO import StringIO - -from chipsequtil import MACSFile, BEDFile - - -usage = '%prog [options] <peaks fn> <gene list fn>' -desc = """Produce a pie chart of the locations of peaks in different bins -(promoter, gene, exon, intron, etc.) and, optionally, save the different -records to their own files for subsequent analysis. Also produce a histogram -of distance from feature values in mapping file. Peaks file is expected -to be as output by MACS, or alternately as a BED file but then the -b plot -is not available. Gene list file is expected to be in the format as -output by peaks_to_known_genes.py script.""" -parser = OptionParser(usage=usage,description=desc) -parser.add_option('-b','--bar-fn',dest='bar_fn',default=None,help='filename for pvalue stacked bar chart') -parser.add_option('-g','--gene-pie-fn',dest='gene_pie_fn',default=None,help='filename for pie chart image') -parser.add_option('-p','--peak-pie-fn',dest='peak_pie_fn',default=None,help='filename for pie chart image') -parser.add_option('-f','--dist-fn',dest='dist_fn',default=None,help='filename for distance from feature image') -parser.add_option('-s','--save',dest='save',action='store_true',help='write out files containing peaks for each category') -parser.add_option('-d','--output-dir',dest='out_dir',default='.',help='output files created by --save option to this directory') -parser.add_option('--no-plot',dest='no_plot',action='store_true',help='dont show (but save) the figure produced') 
-parser.add_option('--peaks-format',dest='peak_fmt',type='choice',choices=['MACS','BED'],default='MACS',help='format of peaks file, either MACS or BED [default: MACS]') - -GENE_FIELD_NAMES = ['knowngene_id','gene_symbol'] -LOC_FIELD_NAMES = ['peak_loc','dist_from_feature','score','map_type','map_subtype'] -int_or_none = lambda x: int(x) if x != '' else None -float_or_none = lambda x: float(x) if x != '' else None -LOC_FIELD_TYPES = [int_or_none,float_or_none,float_or_none,str,str] - - -if __name__ == '__main__' : - - opts, args = parser.parse_args(sys.argv[1:]) - - if len(args) != 2 : - parser.error('Exactly 2 non-option argument is required') - - peaks_fn, gene_fn = args - - if opts.peak_fmt == 'BED' : - peaks_f = BEDFile(peaks_fn) - else : - peaks_f = MACSFile(peaks_fn) - - gene_reader = reader(open(gene_fn),delimiter='\t') - gene_recs, macs_recs, loc_recs = [], [], [] - gene_reader.next() # get rid of header - - gene_field_cnt = len(GENE_FIELD_NAMES) - macs_field_cnt = len(MACSFile.FIELD_NAMES) - loc_field_cnt = len(LOC_FIELD_NAMES) - for rec in gene_reader : - - gene_recs.append(dict(zip(GENE_FIELD_NAMES,rec[:gene_field_cnt]))) - - # this automatically coerces recs into correct format - macs_line = [f(x) for f,x in zip(MACSFile.FIELD_TYPES,rec[gene_field_cnt:gene_field_cnt+macs_field_cnt])] - macs_recs.append(dict(zip(MACSFile.FIELD_NAMES,macs_line))) - - loc_line = [f(x) for f,x in zip(LOC_FIELD_TYPES,rec[gene_field_cnt+macs_field_cnt:])] - loc_recs.append(dict(zip(LOC_FIELD_NAMES,loc_line))) - - loc_dist = defaultdict(int) - unique_peaks = defaultdict(set) - exon_scores, intron_scores = [], [] - dist_to_features = defaultdict(list) - pvals = defaultdict(list) - - fn_base, fn_ext = os.path.splitext(gene_fn) - if opts.save : - def get_writer(fn) : - fd = writer(open(fn,'w'),delimiter='\t') - header = MACSFile.FIELD_NAMES - if opts.peak_fmt == 'BED' : - header = BEDFile.FIELD_NAMES - fd.writerow(GENE_FIELD_NAMES+header+LOC_FIELD_NAMES) - return fd - fds = {} - - 
for gene, peak, loc in zip(gene_recs, macs_recs, loc_recs) : - # weird case, not sure why this happens - if loc['map_subtype'] == '0' : - loc['map_subtype'] = '' - key = loc['map_type']+'_%s'%loc['map_subtype'] if loc['map_subtype'] != '' else loc['map_type'] - loc_dist[key] += 1 - dist_to_features[key].append(int(loc['dist_from_feature'])) - if opts.peak_fmt == 'MACS' : - pvals[key].append(float(peak['-10*log10(pvalue)'])) - - map_key = '%s:%d-%d'%(peak['chr'],peak['start'],peak['end']) - unique_peaks[key].add(map_key) - - if key == 'gene_exon' : - exon_scores.append(loc['score']) - elif key == 'gene_intron' : - intron_scores.append(loc['score']) - - if opts.save : - row = [gene[f] for f in GENE_FIELD_NAMES] + \ - [peak[f] for f in MACSFile.FIELD_NAMES] + \ - [loc[f] for f in LOC_FIELD_NAMES] - if not fds.has_key(key) : - fn = os.path.join(opts.out_dir,fn_base+'_'+key+fn_ext) - fds[key] = get_writer(fn) - fds[key].writerow(row) - - # now find which peaks are intergenic - intergenic = [] - num_peaks = 0 - all_unique_peaks = reduce(lambda x,y: x.union(y), unique_peaks.values()) - for l in peaks_f : - peak = l - map_key = '%s:%d-%d'%(peak['chr'],peak['start'],peak['end']) - if map_key not in all_unique_peaks : - unique_peaks['intergenic'].add(map_key) - intergenic.append(peak) - if opts.peak_fmt == 'MACS' : - pvals['intergenic'].append(peak['-10*log10(pvalue)']) - num_peaks += 1 - - num_int = len(intergenic) - loc_dist['intergenic'] = num_int - if opts.save : - fn = os.path.join(opts.out_dir,fn_base+'_intergenic.xls') - fd = writer(open(fn,'w'),delimiter='\t') - fd.writerow(MACSFile.FIELD_NAMES) - fd.writerows([[x[f] for f in MACSFile.FIELD_NAMES] for x in intergenic]) - - exon_scores, intron_scores = np.array(exon_scores), np.array(intron_scores) - - font = {'size':'9'} - mp.rc('font',**font) - fig = mp.figure(figsize=(4,4)) - - bin_order = ('intergenic','gene_exon','gene_intron','promoter','after') - colors = 'bgrcm' - - # pie chart - #pie_ax_rect = [0.1,0.35, 
0.4125, 0.525 ] # left, bottom, width, height - pie_ax = fig.add_axes((0.15,0.15,0.7,0.7)) - pie_ax.set_title('Gene map distribution\n%d peaks'%num_peaks) - pie_labels, pie_values = [], [] - for k in bin_order : - pie_labels.append(k+'\n%d'%(len(unique_peaks[k]))) - pie_values.append(len(unique_peaks[k])) - pie_ax.pie(pie_values,labels=pie_labels) - - img_fn = fn_base+'_gene_loc.png' if opts.gene_pie_fn is None else opts.gene_pie_fn - mp.savefig(img_fn) - mp.clf() - - - fig = mp.figure(figsize=(4,4)) - pie_ax = fig.add_axes((0.15,0.15,0.7,0.7)) - pie_ax.set_title('Peak map distribution\n%d peaks'%num_peaks) - pie_labels, pie_values = [], [] - for k in bin_order : - pie_labels.append(k+'\n%d'%(loc_dist[k])) - pie_values.append(loc_dist[k]) - pie_ax.pie(pie_values,labels=pie_labels) - - img_fn = fn_base+'_peak_loc.png' if opts.peak_pie_fn is None else opts.peak_pie_fn - mp.savefig(img_fn) - mp.clf() - - fig = mp.figure(figsize=(4,4)) - # dist to feature histogram - #hist_ax_rect = [0.65,0.45,0.25,0.45] - hist_ax = fig.add_axes((0.15,0.15,0.7,0.7)) - hist_ax.set_title('Peak distance from TSS') - # join all the lists together - dists = sum(dist_to_features.values(),[]) - pdf, bins, patches = hist_ax.hist(dists,bins=20) - #h = mp.hist(dists,bins=20) - hist_ax.set_xlim((int(min(dists)),int(max(dists)))) - - dist_fn = fn_base+'_dist.png' if opts.dist_fn is None else opts.dist_fn - mp.savefig(dist_fn) - mp.clf() - - if opts.peak_fmt == 'MACS' : - fig = mp.figure(figsize=(4,4)) - bar_ax = fig.add_axes((0.15,0.15,0.7,0.7)) - pval_hists = {} - min_pval, max_pval = min([min(v) for v in pvals.values()]), max([max(v) for v in pvals.values()]) - for key,pvals in pvals.items() : - vals, bins = np.histogram(pvals,range=(0,max_pval),bins=20) - lv = np.log10(vals) - lv[np.isneginf(lv)] = 0.1 - pval_hists[key] = lv - - pval_items = [(k,pval_hists[k]) for k in bin_order if pval_hists.has_key(k)] - bar_width = 0.85*(max_pval-min_pval)/(len(bins)-1) - print max_pval, min_pval, len(bins) 
- print 'bar_width:',bar_width - bars = [] - b = bar_ax.bar(bins[:-1],pval_items[0][1],width=bar_width,color=colors[0]) - bars.append(b) - - sum_bottoms = pval_items[0][1] - for i, (key, pvals) in enumerate(pval_items[1:]) : - b = bar_ax.bar(bins[:-1],pvals,bottom=sum_bottoms,width=bar_width,color=colors[i+1]) - bars.append(b) - sum_bottoms += pvals - bar_ax.legend([b[0] for b in bars],[x[0] for x in pval_items]) - bar_ax.axis((-10,max(bins),0,max(sum_bottoms))) - bar_ax.set_title('Peak map distribution by pvalue') - bar_ax.set_xlabel('-10*log10(pvalue)') - bar_ax.set_ylabel('relative log10(# peaks)') - - pval_fn = fn_base+'_pval_bar.png' if opts.bar_fn is None else opts.bar_fn - mp.savefig(pval_fn)
--- a/chipsequtil-master/scripts/plot_pos_vs_neg_peaks.py Mon Mar 28 11:56:10 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,103 +0,0 @@ -#!/usr/bin/env python - -import os -import sys - -import matplotlib -matplotlib.use('AGG') - -from matplotlib.pyplot import * -from numpy import arange, log10 -from optparse import OptionParser - -from chipsequtil import MACSFile - -usage = '%prog [options] <pos peaks fn> <neg peaks fn>' -parser = OptionParser(usage=usage) -parser.add_option('-o','--output',dest='out_fn',default=None,help='filename of output image') - -if __name__ == '__main__' : - - opts, args = parser.parse_args(sys.argv[1:]) - pos_fn, neg_fn = args - - pos_f, neg_f = MACSFile(pos_fn), MACSFile(neg_fn) - - pos_peaks = [] - pos_pvals = [] - for pk in pos_f : - pos_pvals.append(float(pk['-10*log10(pvalue)'])/10.) - pos_peaks.append((pk['-10*log10(pvalue)'],pk)) - - pos_peaks.sort() - - neg_peaks = [] - neg_pvals = [] - for pk in neg_f : - neg_pvals.append(float(pk['-10*log10(pvalue)'])/10.) - neg_peaks.append((pk['-10*log10(pvalue)'],pk)) - - neg_peaks.sort() - - min_pval, max_pval = min(pos_pvals+neg_pvals), max(pos_pvals+neg_pvals) - - pval_rng = arange(min_pval,max_pval,(max_pval-min_pval)/100.) 
- - # construct cdfs - pos_cdf, neg_cdf = [], [] - for pval in pval_rng : - pos_cdf.append(len(filter(lambda x: x >= pval,pos_pvals))) - neg_cdf.append(len(filter(lambda x: x >= pval,neg_pvals))) - - # normalize cdfs - pos_cdf_norm = [1.*x/max(pos_cdf) for x in pos_cdf] - neg_cdf_norm = [1.*x/max(neg_cdf) for x in neg_cdf] - - # log of pvals - pos_logs = map(log10,pos_cdf) - neg_logs = map(log10,neg_cdf) - plot(pval_rng,pos_logs) - plot(pval_rng,neg_logs) - ytics, ylabs = yticks() - clf() - - # normalize logs for plotting - pos_logs_norm = [1.-x/max(pos_logs) for x in pos_logs] - neg_logs_norm = [1.-x/max(neg_logs) for x in neg_logs] - - # calculate pos proportion for each pvalue - pos_ratio = [] - pos_only = [] - for pos, neg in zip(pos_cdf,neg_cdf) : - #pos_ratio.append(pos/(pos+neg)) - if neg == 0 : - pos_only.append(pos_ratio[-1]) - #pos_ratio.append(pos_ratio[-1]) - else : - pos_ratio.append(pos/neg) - - subplot(211) - plot(pval_rng, pos_logs, 'b-') - plot(pval_rng, neg_logs, 'g-') - yticks(ytics,[int(10**y) for y in ytics]) - title('positive vs. negative peaks') - legend(('positive','negative'),loc='upper right') - xlabel('-log(p-value)') - ylabel('# Peaks') - axis('tight') - - subplot(212) - plot(pval_rng[:len(pos_ratio)], map(log10,pos_ratio), 'k-') - plot(pval_rng[len(pos_ratio):], map(log10,pos_only),'k--') - #plot(pval_rng,pos_ratio, 'k-') - axis('tight') - xlabel('-log(p-value)') - #ylabel('# pos / (# pos + # neg)') - ylabel('log10(# pos / # neg)') - - if opts.out_fn is None : - pos_base_fn, pos_fn_ext = os.path.splitext(pos_fn) - out_fn = '%s_pos_v_neg.png'%pos_base_fn - else : - out_fn = opts.out_fn - savefig(out_fn)
--- a/chipsequtil-master/scripts/probeset_to_known_gene.py Mon Mar 28 11:56:10 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,124 +0,0 @@ -#!/usr/bin/env python - -import gzip -import sys -from collections import defaultdict as dd -from csv import DictReader, DictWriter -from optparse import OptionParser -from sqlite3 import connect - -from chipsequtil import KnownGeneFile - -# TODO make these parameters? -#affy_anno_fn = 'Mouse430A_2.na30.annot.csv' - -usage = '%prog [options] <knownGene annotation> <knownToMOE430 file> <knownGene Xref file> <microarray data file>' -description = 'Maps probset data to knownGene database provided by UCSC. Probesets \ -that map to multiple knownGenes have one record per knownGene with duplicate data \ -otherwise. Output is knownGene id prepended to each record in microarray data file.' -parser = OptionParser(usage=usage,description=description) -parser.add_option('--output',dest='output',default=None,help='file to output mapping to [default: stdout]') -#parser.add_option('--symbol-xref',dest='symbol_xref',default=None,help='use the provided kgXref file to output gene symbols as second column') - -if __name__ == '__main__' : - - opts, args = parser.parse_args(sys.argv[1:]) - - #affy_bioc_fn = 'microarray_analysis/cbfb_vector_BH_all.txt' - #knownToMOE_sql_fn = 'knownToMOE430.sql' - #knownToMOE_data_fn = 'knownToMOE430.txt' - - if len(args) < 3 : - parser.error('Incorrect number of arguments provided') - - known_gene_fn = args[0] - knownToMOE_data_fn = args[1] - Xref_fn = args[2] - affy_bioc_fn = args[3] - - # affymetrix file from bioconductor - affy_bioc_f = open(affy_bioc_fn) - affy_bioc = {} - affy_bioc_reader = DictReader(affy_bioc_f,delimiter="\t") - for row in affy_bioc_reader : - affy_bioc[row['ID']] = row - - # knownGene annotation - kg = KnownGeneFile(known_gene_fn) - kg_ids = dict([(x['name'],x) for x in kg]) - - # affy to knownGene - affy_to_kg_map = dd(list) - affy_to_kg_fields = ['kgID','affyID'] - 
affy_to_kg_f = open(knownToMOE_data_fn) - kg_to_affy_map = dd(list) - for row in DictReader(affy_to_kg_f,fieldnames=affy_to_kg_fields,delimiter="\t") : - affy_to_kg_map[row['affyID'][2:]].append(row['kgID']) - kg_to_affy_map[row['kgID']].append(row['affyID'][2:]) - - if opts.output : - out_f = open(opts.output,'w') - else : - out_f = sys.stdout - - out_header = ['knownGeneID']+affy_bioc_reader.fieldnames - - # see if the user wants gene symbols too - opts.symbol_xref = Xref_fn - if opts.symbol_xref : - kgXref_fieldnames = ['kgID','mRNA','spID','spDisplayID','geneSymbol','refseq','protAcc','description'] - symbol_xref_reader = DictReader(open(opts.symbol_xref),fieldnames=kgXref_fieldnames,delimiter='\t') - symbol_xref_map = {} - for rec in symbol_xref_reader : - symbol_xref_map[rec['kgID']] = rec - out_header = ['knownGeneID','geneSymbol']+affy_bioc_reader.fieldnames - - out_writer = DictWriter(out_f,delimiter='\t',fieldnames=out_header,lineterminator='\n') - out_writer.writerow(dict(zip(out_header,out_header))) - for probesetID, data in affy_bioc.items() : - kg_ids = affy_to_kg_map[probesetID] - for kg_id in kg_ids : - out_l = {'knownGeneID':kg_id} - if opts.symbol_xref : - out_l['geneSymbol'] = symbol_xref_map[kg_id]['geneSymbol'] - out_l.update(data) - out_writer.writerow(out_l) - - # figure out if any probsets map to non-overlapping loci - # dirty dirty dirty dirty - if False : - affy_id_loci = {} - for affyID, kgIDs in affy_to_kg_map.items() : - # check all pairwise kgIDs to make sure they all overlap in transcription start sites - kg_id_loci = dd(list) - for i, kgID1 in enumerate(kgIDs) : - kgID1_rec = kg_ids[kgID1] - kg_id_loci[kgID1].append(kgID1_rec) - for j, kgID2 in enumerate(kgIDs) : - kgID2_rec = kg_ids[kgID2] - # these are all gene overlap conditions - #kg1Start = kgID1_rec['txEnd'] if kgID1_rec['strand'] == '-' else kgID1_rec['txStart'] - #kg1End = kgID1_rec['txStart'] if kgID1_rec['strand'] == '-' else kgID1_rec['txEnd'] - #kg2Start = 
kgID2_rec['txEnd'] if kgID2_rec['strand'] == '-' else kgID2_rec['txStart'] - #kg2End = kgID2_rec['txStart'] if kgID2_rec['strand'] == '-' else kgID2_rec['txEnd'] - kg1Start, kg1End = kgID1_rec['txStart'], kgID1_rec['txEnd'] - kg2Start, kg2End = kgID2_rec['txStart'], kgID2_rec['txEnd'] - if (kg2Start <= kg1Start <= kg2End or \ - kg1Start <= kg2Start <= kg1End or \ - (kg2Start < kg1Start and kg2End > kg1End) or \ - (kg1Start < kg2Start and kg1End > kg2End)) and \ - kgID1_rec['chrom'] == kgID2_rec['chrom'] and \ - i != j : - # we have overlap - pass - elif i != j : - # doesn't overlap oh noes - kg_id_loci[kgID1].append(kgID2_rec) - for kg_id, kg_recs in kg_id_loci.items() : - if len(kg_recs) != 1 : - affy_id_loci[affyID] = (kg_id, len(kg_recs),len(kgIDs),kg_recs,kgIDs) - - if len(affy_id_loci) != 0 : - sys.stderr.write('Warning: %d probeset ids map to non-overlapping loci'%len(affy_id_loci)) - -
--- a/chipsequtil-master/scripts/rejection_sample_fasta.py Mon Mar 28 11:56:10 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,78 +0,0 @@ -#!/usr/bin/env python - -import sys - -from optparse import OptionParser - -from chipsequtil import check_org_settings -from chipsequtil.util import MultiLineHelpFormatter -from chipsequtil.sampling import rejection_sample_bg -from chipsequtil.seq import fasta_to_dict, write_fasta_to_file - -usage = '%prog [options] <organism> <fasta file> [<fasta file> ... ]' -description = """Use rejection sampling to generate a set of background/random \ -sequences matching the distance to nearest transcription start site, sequence \ -length, and GC content distributions of the input fasta file(s). Generated \ -sequences are genomic sequences sampled based on these distributions. All sequences \ -from all files are used to generate the background sequences. The following \ -command must output a path to a nib genomic sequence directory and refGene \ -annotation, respectively : - -$> org_settings.py <organism> genome_dir -$> org_settings.py <organism> refgene_anno_path - -Utility prints out generated fasta records to stdout by default. Input sequences \ -from chr20 are mapped to chrX, chr21 are mapped to chrY, and sequences from chrM \ -are not used. -""" -epilog = "Note: script only considers sequences with unique header names, only the last record of those with identical header names is used" -parser = OptionParser(usage=usage,description=description,formatter=MultiLineHelpFormatter()) -parser.add_option('-n','--num-seqs',dest='num_seqs',default='1x', help='number of sequences to generate, either absolute number or factor of # input sequences, e.g. 
2.5x for 2.5 times the # of input sequences [default: 1x]') -parser.add_option('--output',dest='output',default=None,help='file to output fasta records to [default: stdout]') -parser.add_option('--bed',dest='bed',action='store_true', help='also produce a BED formatted file representing sampled sequences') -parser.add_option('--bed-output',dest='bed_output',default='output.bed',help='with --bed, file to output BED records to [default: %default]') -parser.add_option('-v','--verbose',dest='verbose',action='store_true',help='print out debug information') - -if __name__ == '__main__' : - - opts, args = parser.parse_args(sys.argv[1:]) - - if len(args) < 2 : - parser.error('Must be 2 non-option arguments') - - organism, fasta_fns = args[0], args[1:] - - reqd_settings = ['genome_dir','refgene_anno_path'] - if not check_org_settings(organism,reqd_settings) : - parser.error('The <organism> settings set must contain paths for %s'%reqd_settings) - - # load up all the fasta records - fasta_recs = {} - for fasta_fn in fasta_fns : - fasta = fasta_to_dict(fasta_fn) - fasta_recs.update(fasta) - - # parse --num-seqs argument - if opts.num_seqs.endswith('x') : - num_seq_factor = float(opts.num_seqs[:-1]) - num_seqs = int(len(fasta_recs)*num_seq_factor) - else : - try : - num_seqs = int(opts.num_seqs) - except TypeError : - parser.error("Incorrect format of --num-seqs argument, must either be an integer or a factor ending with x, e.g. 2.5x") - - # generate the sequences - gen_seqs = rejection_sample_bg(fasta_recs,organism,num_samples=num_seqs,verbose=opts.verbose) - - # write out to file - if opts.output : - write_fasta_to_file(gen_seqs,opts.output) - else : - sys.stdout.write(''.join(['>%s\n%s\n'%(k,v) for k,v in gen_seqs.items()])) - - if opts.bed : - bed_f = open(opts.bed_output,'w') - bed_f.write(''.join([k.replace(':','\t').replace('-','\t')+'\n' for k in gen_seqs.keys()])) - bed_f.close() -
--- a/chipsequtil-master/scripts/sort_bed.py Mon Mar 28 11:56:10 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,44 +0,0 @@ -#!/usr/bin/env python -import sys, os -from optparse import OptionParser -from collections import defaultdict as dd -from csv import reader, writer - - -usage = "%prog [options] <BED file> [<BED file> <BED file>...]" -description = """\ -Sort the BED formatted files first by chromosome (field 1) and then by start -coordinate (field 2). Lines from all files submitted are concatenated and -sorted in the final output.""" -parser = OptionParser(usage=usage,description=description) -parser.add_option('--output',dest='output',default=sys.stdout,help='filename to write the sorted BED lines [default: stdout]') - -if __name__ == '__main__' : - - opts, args = parser.parse_args(sys.argv[1:]) - - if len(args) == 0 : - parser.error("Must provide at least one file") - - fns = args - chromos = dd(list) - - # load each chromosome separately - for fn in fns : - bed_reader = reader(open(fn),delimiter='\t') - for line in bed_reader : - chromos[line[0]].append(line) - - # determine where we're writing to - if opts.output != sys.stdout : - f = open(opts.output,'w') - else : - f = opts.output - - # write the chromos in lexicographic sorted order - bed_writer = writer(f,delimiter='\t') - for k in sorted(chromos.keys()) : - - # sort each chromosome's BED lines by stat position - chromos[k].sort(key=lambda x: int(line[1])) - bed_writer.writerows(chromos[k])
--- a/chipsequtil-master/scripts/split_file.py Mon Mar 28 11:56:10 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,83 +0,0 @@ -#!/usr/bin/env python - -from optparse import OptionParser -from datetime import datetime -from subprocess import Popen, PIPE -import itertools -import sys, os, getpass, re - -usage = "[%prog] [options] filename" -description = """\ -Split <filename> into a set of files with either a specific number of lines -(--split-type=lines, default) or into a specific number of files (--split-type= -count). Files are created with .XXXX appended, indicating the number of file -split. Writes files to current working directory unless otherwise specified. -""" - -parser = OptionParser(usage=usage,description=description) -parser.add_option('--type',dest='split_type',type='choice',choices=['lines','count'],default='lines',help='how to split the file (WARNING: count does not preserve the sequence of lines in the original file when splitting) [default: %default]') -#parser.add_option('--split-arg',dest='split_arg',default='1000',help='integer argument for split type (size specified as Xb, XK, XM, or XG, others are integers) [default: %default]') -parser.add_option('--arg',dest='split_arg',type='int',default=1000,help='integer argument for split type [default: %default]') -parser.add_option('--outdir',dest='outdir',default='.',help='directory to put the split files in [default: %default]') - -def get_file_parts(fn) : - fpath,fname = os.path.split(fn) - fbase,fext = os.path.splitext(fname) - return fpath,fname,fbase,fext - -if __name__ == '__main__' : - - opts, args = parser.parse_args(sys.argv[1:]) - - if len(args) < 1 : - parser.print_usage() - sys.exit(1) - - filename = args[0] - abs_filename = os.path.abspath(filename) - - # check to ensure filename exists - if not os.path.exists(abs_filename) : - sys.stderr.write('File %s does not exist, exiting\n'%abs_filename) - parser.print_usage() - sys.exit(2) - - # split the file - split_size = 
opts.split_arg - fpath,fname,fbase,fext = get_file_parts(abs_filename) - if opts.split_type == 'lines' : - curr_split = 0 # for first condition - split_fd = None - for i,l in enumerate(open(abs_filename)) : - if i%split_size == 0 : - if split_fd : split_fd.close() # close it if we aren't on the first split - split_fd = open(os.path.join(opts.outdir,fname)+'.%04d'%curr_split,'w') - curr_split += 1 - split_fd.write(l) - nlines = i - elif opts.split_type == 'count' : - # create split_size split files by writing lines round robin - split_fds = [open(os.path.join(opts.outdir,fname)+'.%04d'%x,'w') for x in range(split_size)] - split_cycle = itertools.cycle(split_fds) - for i,l in enumerate(open(abs_filename)) : - split_cycle.next().write(l) - nlines = i - - # close all the handles - [fd.close() for fd in split_fds] - - elif opts.split_type == 'size' : - # parse split_arg argument, into integer if split_type is 'size' - if opts.split_type == 'size' : - m = re.match('^(\d+)([bKMG])$',opts.split_arg) - if m is None : - sys.stderr.write("Incorrect --split-arg argument for --split-type=size, I understand only X[bKMG], exiting\n") - parser.print_usage() - sys.exit(3) - else : - size_d = {'b':1,'K':1024,'M':pow(1024,2),'G':pow(1024,3)} - split_size = int(m.groups()[0])*size_d[m.groups()[1]] - - fd = open(abs_filename) - curr_split_size = 0 -
--- a/chipsequtil-master/scripts/split_qsub.py Mon Mar 28 11:56:10 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,73 +0,0 @@ -#!/usr/bin/env python - -from __future__ import with_statement -import os -import sys -from optparse import OptionParser -from subprocess import Popen, PIPE - -from chipsequtil import get_file_parts - -usage = "[%prog] [options] <utility> <file> [<file> <file> ...]" -description = """\ -Submit a job using qsub for <utility>, each with one <file> as an argument. Any -options specified on the command line that [%prog] cannot interpret are passed -on to the utility for each call.""" -epilog = "Note: this script only works in Unix-style environments" -parser = OptionParser(usage=usage,description=description,epilog=epilog) -parser.add_option('--suffix',dest='suffix',default=None,help='string to append to stdout files, e.g. <filename>_<--suffix>.<--ext> [default: <utility>]') -parser.add_option('--ext',dest='ext',default='.out',help='file extension to use for stdout files') -parser.add_option('--util-args',dest='util_args',default='',help='double quote wrapped arguments to pass to <utility>') -parser.add_option('--keep-stderr',dest='keep_stderr',action='store_true',help='capture stderr files, useful for debugging') -parser.add_option('--keep-scripts',dest='keep_scripts',action='store_true',help='do not delete qsub scripts generated after job submission') -parser.add_option('--die-on-error',dest='die_on_err',action='store_true',help='if any one of the qsub submissions returns non-zero exit status, stop executing') - - -if __name__ == '__main__' : - - opts, args = parser.parse_args(sys.argv[1:]) - - utility, filenames = args[0], args[1:] - - # try to find the utility - abs_utility = os.path.abspath(utility) - if not os.path.exists(abs_utility) : - # look on the path - abs_utility = Popen('which %s'%utility,shell=True,stdout=PIPE,stderr=PIPE).communicate()[0].strip() - if not os.path.exists(abs_utility) : - raise Exception("Utility %s 
could not be found in the local directory or on the user's path, exiting"%utility) - sys.exit(1) - - upath,uname,ubase,uext = get_file_parts(abs_utility) - - runscript_tmpl = """ -#!/bin/bash - -#$ -N %(jobname)s -#$ -S /bin/sh -#$ -o %(stdout)s -#$ -e %(stderr)s -#$ -cwd -export PYTHONPATH=%(pythonpath)s:${PYTHONPATH} - -%(utility)s %(utilargs)s %(filename)s""" - - suffix = ubase if opts.suffix is None else opts.suffix - for fn in filenames : - abs_fn = os.path.abspath(fn) - fpath,fname,fbase,fext = get_file_parts(abs_fn) - stdout = os.path.join(fpath,fname+'_'+suffix+opts.ext) - stderr = '/dev/null' if not opts.keep_stderr else os.path.join(fpath,fname+'_'+suffix+'.err') - call_script = runscript_tmpl%{'jobname':fname,'utility':abs_utility,'filename':abs_fn,'stdout':stdout,'stderr':stderr,'utilargs':opts.util_args,'pythonpath':os.environ.get('PYTHONPATH','')} - f = open('%s'%abs_fn+'_'+utility+'.script','w') - f.write(call_script) - f.close() - p = Popen('qsub %s'%f.name,shell=True) - p.wait() - if not opts.keep_scripts : - os.remove(f.name) - - if opts.die_on_err and p.returncode != 0 : - with open(stderr,'w') as f : - f.write('qsub returned non-zero exit code for file %s, aborting\n'%fn) - sys.exit(1)
--- a/chipsequtil-master/scripts/wait_for_jobid.py Mon Mar 28 11:56:10 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,65 +0,0 @@ -#!/usr/bin/env python - -import re -import sys -import time - -from optparse import OptionParser -from subprocess import Popen, PIPE - -usage = '%prog [options] <job id> [<job id>...]' -desc = 'Poll qstat and wait until all <job id>s are finished' -parser = OptionParser(usage=usage,description=desc) - -array_job_match = '^(\d+)\[\]\.(.*)' -array_job_regex = '^%s\[[0-9]\+\]' - -def is_job_done(jobid) : - - done = False - - # have to handle array jobs differently than standalone - array_match = re.search(array_job_match,jobid) - if array_match is not None : - idnum, rest = array_match.groups() - jobid_regex = array_job_regex%idnum - qstat_p = Popen('qstat -t | grep "%s" | cut -f 1 -d " "'%jobid_regex,shell=True,stdout=PIPE) - stdout, stderr = qstat_p.communicate() - done = len(stdout) == 0 - - else : - # -j is only for SGE - qstat_p = Popen('qstat -j %s'%jobid,shell=True,stdout=PIPE,stderr=PIPE) - qstat_p.wait() - if qstat_p.returncode == 0 : - pass - # assume any != 0 return code means job is done - else : - done = True - - return done - -if __name__=='__main__': - - opts, args = parser.parse_args(sys.argv[1:]) - - jobids = map(lambda x: x.strip(), args) - - # wait for all of them - sys.stderr.write('Waiting for jobs to complete\n') - jobs_done = [False]*len(jobids) - try : - while not all(jobs_done) : - jobs_not_done = filter(lambda x: not x[1], enumerate(jobs_done)) - for i, jid in jobs_not_done : - jobs_done[i] = is_job_done(jobids[i]) - sys.stderr.write('Jobs done: %d/%d\r'%(sum(jobs_done),len(jobs_done))) - time.sleep(2) - sys.stderr.flush() - except KeyboardInterrupt : - sys.stderr.write('\n') - resp = raw_input('Caught keyboard interrupt, kill all jobs? [y/N] ') - if resp.lower() == 'y' : - Popen('kill_all_jobs.sh',shell=True) - - sys.stderr.write('done\n')
--- a/chipsequtil-master/scripts/wait_for_qsub.py Mon Mar 28 11:56:10 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,14 +0,0 @@ -#!/usr/bin/env python -import time -from subprocess import Popen, PIPE - -if __name__ == '__main__' : - - # this is gross, but it works when you need to stall a pipeline until all your jobs are done - done = False - while not done : - qstat_output = Popen('qstat',shell=True,stdout=PIPE).communicate()[0] - if qstat_output == '' : - done = True - else : - time.sleep(1)
--- a/chipsequtil-master/scripts/wqsub.py Mon Mar 28 11:56:10 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,145 +0,0 @@ -#!/usr/bin/env python - -from __future__ import with_statement -import os -import re -import sys -import time -from optparse import OptionParser -from subprocess import Popen, PIPE - -from chipsequtil import get_file_parts - -usage = "[%prog] [options] command" -description = """Wrap the specified command into a qsub script and submit it -for execution. Script captures both stdout and stderr to the current directory. -By default, all of the user's environment variables are put into the script -(compatible with SGE only ATM).""" -epilog = "Note: this script only works in Unix-style environments." -parser = OptionParser(usage=usage,description=description,epilog=epilog) -parser.add_option('--wqsub-name',dest='wqsub_name',default='wqsub',help='job name to submit as <--wqsub-name>_<first non-whitespace chars in command> [default: %default]') -parser.add_option('--wqsub-ext',dest='wqsub_ext',default='.out',help='file extension to use for stdout files') -parser.add_option('--wqsub-keep-script',dest='wqsub_keep_script',action='store_true',help='do not delete qsub script generated after job submission') -parser.add_option('--wqsub-no-env',dest='wqsub_no_env',action='store_true',help='do not include any local environment variables in the script') -parser.add_option('--wqsub-no-submit',dest='wqsub_no_sub',action='store_true',help='create script but do not submit job (useful for generating scripts)') -parser.add_option('--wqsub-drm',dest='drm',default='SGE',type='choice',choices=['SGE','TORQUE'],help='the DRM to generate scripts for [default: %default]') -parser.add_option('--wqsub-drm-arg',dest='drm_args',action='append',default=[],help='arguments to pass as parameters in the job script specific to the DRM, use multiple option flags to specify multiple parameters') -parser.add_option('--wqsub-wait',dest='wait',action='store_true',help='poll 
the DRM and do not return control until job is finished (only works for TORQUE)') - -templates = { -'TORQUE': """\ -#!/bin/bash - -#PBS -N %(jobname)s -#PBS -o %(stdout)s -#PBS -e %(stderr)s -#PBS -d %(cwd)s -%(env)s -%(addnl)s - -%(command)s -""", -'SGE':"""\ -#!/bin/bash - -#$ -N %(jobname)s -#$ -S /bin/bash -#$ -o %(stdout)s -#$ -e %(stderr)s -#$ -cwd -%(env)s -%(addnl)s - -%(command)s -""" -} - -drm_symb = { -'TORQUE': 'PBS', -'SGE': '$' -} - -if __name__ == '__main__' : - - # get the wqsub args out first - wqsub_args = [] - other_args = [] - for arg in sys.argv : - if arg.count('wqsub') != 0 or arg in ['-h','--help'] : - wqsub_args.append(arg) - else : - other_args.append(arg) - - opts, args = parser.parse_args(wqsub_args) - - if len(other_args) == 0 : - parser.error('Must provide a command') - - command = ' '.join(other_args) - runscript_tmpl = templates[opts.drm] - # set up job parameters - cmd_exe = os.path.basename(other_args[0]) - jobname = opts.wqsub_name+'_'+cmd_exe - stdout_fn = jobname+opts.wqsub_ext - stdout = os.path.abspath(stdout_fn) - fpath,fname,fbase,fext = get_file_parts(stdout) - stderr = os.path.abspath(os.path.join(jobname+'.err')) - - # get the user's current environment and put it into the execute script - if opts.wqsub_no_env : - env_str = '# local environment variables omitted' - else : - env_str = '#%s -V'%drm_symb[opts.drm] - - # construct the script - addnl_params = [] - for addnl in opts.drm_args : - addnl_params.append('#%s %s'%(drm_symb[opts.drm],addnl)) - addnl_params = '\n'.join(addnl_params) - - job_dict = {'jobname':fname, - 'stdout':stdout, - 'stderr':stderr, - 'command':command, - 'env':env_str, - 'cwd':os.getcwd(), - 'addnl':addnl_params} - - call_script = runscript_tmpl%job_dict - # write the script to file - script_fn = os.path.abspath(jobname+'.script') - with open(script_fn,'w') as f : - f.write(call_script) - - if not opts.wqsub_no_sub : - p = Popen('qsub %s'%f.name,shell=True,stdout=PIPE) - p.wait() - stdout, stderr = 
p.communicate() - if not opts.wqsub_keep_script : - os.remove(f.name) - if opts.wait : - done = False - print 'Waiting on job id %s'%stdout.strip() - while not done : - qstat_p = Popen('qstat %s'%stdout,shell=True,stdout=PIPE,stderr=PIPE) - qstat_p.wait() - if opts.drm == 'TORQUE' : - done = False if qstat_p.returncode != 153 else True - elif opts.drm == 'SGE' : - done = False if qstat_p.returncode != 1 else True - time.sleep(3) # wait three seconds because it's nice - else : - if opts.drm == 'TORQUE' : - print stdout.strip() - elif opts.drm == 'SGE' : - qsub_output_patt = 'Your job (\d+)' - m = re.match(qsub_output_patt,stdout.strip()) - if m is not None: - print m.group(1) - sys.exit(0) - - # might be an array job - qsub_output_patt = 'Your job-array (\d+)\.' - m = re.match(qsub_output_patt,stdout.strip()) - if m is not None: - print m.group(1)
--- a/chipsequtil-master/scripts/wqsub_drmaa.py Mon Mar 28 11:56:10 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,98 +0,0 @@ -#!/usr/bin/env python - -from __future__ import with_statement -import os -import sys -from optparse import OptionParser -from subprocess import Popen, PIPE - -import drmaa - -from chipsequtil import get_file_parts - -usage = "[%prog] [options] command" -description = """Submit *command* to a DRMAA-enabled job queueing system. -Output of the command goes to file, stderr is ignored unless specified -as an option. By default, all of the user's environment -variables are imported into job environment.""" -epilog = "Note: this script only works in Unix-style environments." -parser = OptionParser(usage=usage,description=description,epilog=epilog) -parser.add_option('--wqsub-name',dest='wqsub_name',default='wqsub',help='job name to submit as <--wqsub-name>_<first non-whitespace chars in command> [default: %default]') -parser.add_option('--wqsub-stdout',dest='wqsub_stdout',default=None,help='name of file to write stdout to (equivalent to -o argument in SGE) [default: <wqsub-name>_<command>.out]') -parser.add_option('--wqsub-stderr',dest='wqsub_stderr',default=None,help='name of file to write stderr to (equivalent to -e argument in SGE) [default: <wqsub-name>_<command>.err]') -parser.add_option('--wqsub-join',dest='wqsub_join',action='store_true',help='join stdout and stderr into file indicated by --wqsub-stdout (equivalent to -j flag in SGE)') -parser.add_option('--wqsub-no-env',dest='wqsub_no_env',action='store_true',help='do not include any local environment variables in the script') -parser.add_option('--wqsub-wait',dest='wqsub_wait',action='store_true',help='wait for job to finish executing before returning from script') - - -if __name__ == '__main__' : - - # get the wqsub args out first - wqsub_args = [] - other_args = [] - for arg in sys.argv : - if arg.count('wqsub') != 0 or arg in ['-h','--help'] : - wqsub_args.append(arg) - 
else : - other_args.append(arg) - - opts, args = parser.parse_args(wqsub_args) - - if len(other_args) == 0 : - parser.error('Must provide a command') - - # set up job parameters - jobname = opts.wqsub_name+'_'+other_args[0] - stdout_fn = jobname+'.out' - if opts.wqsub_stdout : - stdout_fn = opts.wqsub_stdout - stdout = os.path.abspath(stdout_fn) - - if os.path.exists(stdout) : - os.remove(stdout) - - stderr_fn = jobname+'.err' - if opts.wqsub_stderr : - stderr_fn = opts.wqsub_stderr - stderr = os.path.abspath(stderr_fn) - if os.path.exists(stderr) : - os.remove(stderr) - - # drmaa job submission - session = drmaa.Session() - session.initialize() - - # initialize job template - job_template = session.createJobTemplate() - - # construct DRMAA job - command,args = other_args[0],other_args[1:] - job_template.remoteCommand = command - job_template.args = args - job_template.jobName = jobname - job_template.joinFiles = opts.wqsub_join - - # output and error paths apparently require a ':' in front - job_template.outputPath = ':'+stdout - job_template.errorPath = ':'+stderr - - # get the user's current environment and put it into the execute script - if not opts.wqsub_no_env : - job_template.jobEnvironment = os.environ - - # submit the job and wait for it - jobid = session.runJob(job_template) - - if opts.wqsub_wait : - # submit and wait for job to complete, keyboard interrupt aborts job - try : - - retval = session.wait(jobid, drmaa.Session.TIMEOUT_WAIT_FOREVER) - - except KeyboardInterrupt : - sys.stderr.write('Keyboard interrupt caught (^C), aborting') - pass - - # clean up - session.deleteJobTemplate(job_template) - session.exit()
"""
Top-level package for chipsequtil.

Re-exports the public names of the ``chipsequtil.chipsequtil`` submodule
(file parsers such as BEDFile, MACSFile, KnownGeneFile, GPSFile, and the
org_settings helper functions) at the package level, so callers can write
``from chipsequtil import BEDFile`` directly.
"""

# Python 2 implicit relative import: pulls everything from the sibling
# module chipsequtil/chipsequtil.py into the package namespace.
# NOTE(review): this form is invalid under Python 3 absolute-import rules;
# it would need to become 'from chipsequtil.chipsequtil import *'.
from chipsequtil import *
"""Core file-format parsers and settings utilities for ChIP-Seq analysis.

Provides containers/iterators for GERALD, BED, GPS, MACS, refGene and
knownGene formatted files, helpers for per-organism settings stored in
org_settings.cfg files, and small sequence utilities (reverse complement,
GC content).
"""
import math
import os
import re
import string
import sys

from ConfigParser import ConfigParser
from csv import DictReader
from collections import defaultdict

import chipsequtil

# for RefGeneDB
from util import KeyedBinaryTree


def get_file_parts(path) :
    """For <path>/<basename>.<ext>, returns 4-tuple (<path>,<basename>.<ext>,<basename>,<ext>)"""
    path,fn = os.path.split(path)
    basename,ext = os.path.splitext(fn)
    return path,fn,basename,ext

def parse_number(n) :
    """Try to cast input to float (if it contains a '.') or int, returning
    the value unchanged if the cast fails."""
    try :
        return float(n) if '.' in n else int(n)
    except (TypeError, ValueError) :
        # non-numeric string, or non-string input (TypeError from ''.in'')
        return n


def gerald_to_bed(gerald,min_fields=False) :
    """Convert a GERALDOutput object into a BEDOutput object

    Keyword argument *min_fields* produces BED alignment with only the first
    three fields populated
    """

    d = {}.fromkeys(BEDOutput.FIELD_NAMES,'')

    # required BED fields
    d['chrom'] = gerald.match_chromo
    d['chromStart'] = gerald.match_pos
    d['chromEnd'] = gerald.match_pos+len(gerald.read)

    # load the remaining information
    if not min_fields :
        # GERALD strand is F/R, BED is +/-
        d['strand'] = '+' if gerald.match_strand == 'F' else '-'
        # TODO consider encoding single-read alignment score into BED score format

    return BEDOutput(**d)


class GERALDOutput :
    """Container for one line of GERALD alignment output as generated by Illumina
    pipeline version >= 1.3."""

    FIELD_NAMES = ['machine',
                   'run_number',
                   'lane',
                   'tile',
                   'x_coord',
                   'y_coord',
                   'index',
                   'read_no',
                   'read',
                   'quality_string',
                   'match_chromo',
                   'match_contig',
                   'match_pos',
                   'match_strand',
                   'match_desc',
                   'single_read_score',
                   'paired_read_score',
                   'partner_chromo',
                   'partner_contig',
                   'partner_offset',
                   'partner_strand',
                   'filtering',
                  ]

    def __init__(self,line) :
        """*line* is a raw tab-delimited GERALD record or a pre-split field list."""
        if type(line) == str :
            line = line.strip().split('\t')

        if len(line) != len(GERALDOutput.FIELD_NAMES) :
            raise GERALDOutput.FormatException(
                'Expected %d fields in input, found %d in line: %s'%(
                    len(GERALDOutput.FIELD_NAMES),len(line),line))

        # numeric-looking fields are stored as int/float, others as str
        for fn,d in zip(GERALDOutput.FIELD_NAMES,line) :
            setattr(self,fn,parse_number(d))

    def __repr__(self) :
        return 'GERALDOutput(%s)'%repr(self.output_format())

    def output_format(self) :
        """Tab delimited string of fields as they would appear in GERALD output file"""
        return '\t'.join([str(getattr(self,d)) for d in GERALDOutput.FIELD_NAMES])+'\n'

    class FormatException(Exception) :
        """GERALD format exception, raised on malformatted input"""
        pass


class SmartFileIter :
    r"""An 'abstract' class implementing a smart file iterator.  It is essentially
    a wrapper around a csv.DictReader object that parses fields into Python
    datatypes (int, float, tuple, objects, etc) as they are iterated.  The
    constructor argument *f* can be either a valid filename or a file-like
    object.  This class should not be directly instantiated - rather it should
    be subclassed with FIELD_NAMES and FIELD_TYPES defined.  FIELD_NAMES is a
    list of strings referring to the names of the fields, FIELD_TYPES is a list
    of the same length of callables that will parse the column into the desired
    format.  Example::

      >>> s = StringIO('chr1\t0\t100\t+\nchr3\t300\t601\t-\n')
      >>> class IntervalFile(SmartFileIter):
              r'''A SmartFileIter for files with lines formatted like:
              chrom\tstart\tend\tstrand'''
              FIELD_NAMES = ['chrom','start','end','strand']
              FIELD_TYPES= [str,int,int,lambda x: 0 if x == '+' else 1]
      >>> f = IntervalFile(s)
      >>> for r in f :
              print r['chrom'], 'length: ', r['end']-r['start'], 'strand: ',r['strand']

    ``r['start']`` and ``r['end']`` are automatically available as integers,
    so the subtraction works as expected.  Arbitrary functions that accept a
    single argument and return a value may also be specified.
    """

    def __init__(self,f,skip_line_chars='#') :
        if not hasattr(self,'FIELD_NAMES') or not hasattr(self,'FIELD_TYPES') :
            raise Exception('Subclasses must define class members FIELD_NAMES and FIELD_TYPES')
        if isinstance(f,str) :
            f = open(f)
        self._dict_reader = DictReader(f,delimiter='\t',fieldnames=self.FIELD_NAMES)
        self.fieldnames = self.FIELD_NAMES
        self.curr_line = self._dict_reader.next()
        self.skip_line_chars = skip_line_chars

        # skip initial comment lines
        while self.curr_line[self.FIELD_NAMES[0]][0] in self.skip_line_chars :
            self.curr_line = self._dict_reader.next()

        # skip a header row that just repeats the field names
        if self.FIELD_NAMES[0] in self.curr_line.values() :
            self.curr_line = self._dict_reader.next()

    def __iter__(self) :
        return self

    def __getattr__(self,attr) :
        # delegate unknown attributes to the underlying DictReader
        try:
            return self.__dict__[attr]
        except KeyError :
            return getattr(self._dict_reader,attr)

    def next(self) :
        """Emit the next record in the file as a dictionary with parsed values"""

        if self.curr_line is None :
            raise StopIteration()

        line = self.curr_line

        # check for comment
        while line[self.FIELD_NAMES[0]][0] in self.skip_line_chars :
            line = self.curr_line = self._dict_reader.next()

        for k,f in zip(self.FIELD_NAMES, self.FIELD_TYPES) :
            try :
                line[k] = f(line[k])
            except Exception :
                # leave the raw string in place when the parser fails
                line[k] = line[k]

        # read one record ahead so the last record can end iteration cleanly
        try :
            self.curr_line = self._dict_reader.next()
        except StopIteration :
            self.curr_line = None

        return line


class BEDOutput :
    """*Deprecated*: Use *BEDFile* instead.

    Container for one line of BED alignment output"""

    FIELD_NAMES = ['chrom',
                   'chromStart',
                   'chromEnd',
                   'name',
                   'score',
                   'strand',
                   'thickStart',
                   'thickEnd',
                   'itemRgb',
                   'blockCount',
                   'blockSizes',
                   'blockStarts',
                  ]

    def __init__(self,line='',*args,**kwargs) :
        """*line* is a raw BED line or pre-split field list; keyword arguments
        named after FIELD_NAMES override values parsed from *line*."""
        if type(line) == str :
            line = line.strip().split('\t')

        # BED requires at least chrom, chromStart, chromEnd
        if len(line) < 3 and any([x not in kwargs.keys() for x in ['chrom','chromStart','chromEnd']]) :
            raise BEDOutput.FormatException(
                'Format requires at least 3 fields in input, found %d in line: %s'%(len(line),line))
        if len(line) > len(BEDOutput.FIELD_NAMES) :
            raise BEDOutput.FormatException(
                'Format requires at most %d fields in input, found %d in line: %s'%(
                    len(BEDOutput.FIELD_NAMES),len(line),line))

        # pad out missing trailing fields with empty strings
        empty_fields = ['']*(len(BEDOutput.FIELD_NAMES)-len(line))
        for fn,d in zip(BEDOutput.FIELD_NAMES,line+empty_fields) :
            setattr(self,fn,parse_number(d))

        # kwargs override line input
        for k,v in kwargs.items() :
            setattr(self,k,parse_number(v))

    def __repr__(self) :
        return 'BEDOutput(%s)'%(repr(self.output_format()))

    def output_format(self) :
        """Returns a string for the BED line as it would appear in a file"""
        return '\t'.join([str(getattr(self,d)) for d in BEDOutput.FIELD_NAMES])+'\n'

    class FormatException(Exception) :
        """BED format exception, raised on malformatted input"""
        pass


class BEDFile(SmartFileIter) :
    '''An iterable object containing the records in the supplied BED formatted
    file.  Fieldnames are::

      FIELD_NAMES = ['chrom',
                     'chromStart',
                     'chromEnd',
                     'name',
                     'score',
                     'strand',
                     'thickStart',
                     'thickEnd',
                     'itemRgb',
                     'blockCount',
                     'blockSizes',
                     'blockStarts',
                    ]
    '''

    FIELD_NAMES = BEDOutput.FIELD_NAMES
    FIELD_TYPES = [str,int,int,str,float,str,int,int,str,lambda x: x.split(','), lambda x: x.split(','), lambda x: x.split(',')]


class BEDFile_dictreader(DictReader) :
    '''An iterable object (subclasses csv.DictReader) containing the records in
    the supplied BED formatted file.'''
    FIELD_NAMES = BEDOutput.FIELD_NAMES
    def __init__(self,bed) :
        '''*bed* is either a filename or a file-like object representing a BED file'''
        if isinstance(bed,str) :
            bed = open(bed)
        DictReader.__init__(self,bed,delimiter='\t',
                            fieldnames=BEDOutput.FIELD_NAMES)


class GPSFile(SmartFileIter) :
    '''An iterable object containing the records in the peaks file format
    generated by GPS.  Fieldnames are::

      FIELD_NAMES = ["Position",
                     "IP",
                     "Control",
                     "Fold",
                     "Q_-lg10",
                     "P_-lg10",
                     "IPvsEMP",
                     "IPvsCTR",
                     "blank"
                    ]
    '''

    FIELD_NAMES = ["Position",
                   "IP",
                   "Control",
                   "Fold",
                   "Q_-lg10",
                   "P_-lg10",
                   "IPvsEMP",
                   "IPvsCTR",
                   "blank"
                  ]

    # Position 'chrom:coord' is parsed into ('chr<chrom>', coord, original)
    FIELD_TYPES = [lambda x: ('chr%s'%x.split(':')[0],int(x.split(':')[1]),x),
                   float,
                   float,
                   float,
                   float,
                   float,
                   float,
                   float,
                   str
                  ]

    def __init__(self,gps_fn) :
        f = open(gps_fn)
        SmartFileIter.__init__(self,f)


class AffyBiocFile(DictReader) :
    '''An iterable object (subclasses csv.DictReader) containing microarray data records in
    the supplied bioconductor formatted file.'''

    FIELD_NAMES = [ 'ID',
                    'Symbol',
                    'Name',
                    'M',
                    'A',
                    't',
                    'P.Value',
                    'B'
                  ]

    def __init__(self,affyfn) :
        '''*affyfn* is either a filename or a file-like object representing a bioconductor output file'''
        # BUGFIX: original referenced an unbound name ('bed = open(bed)',
        # NameError on any call) and registered BEDOutput.FIELD_NAMES
        # instead of this class's own field names
        if isinstance(affyfn,str) :
            affyfn = open(affyfn)
        DictReader.__init__(self,affyfn,delimiter='\t',
                            fieldnames=AffyBiocFile.FIELD_NAMES)


class RefGeneOutput(object) :
    """Field names for UCSC refGene annotation records."""
    # http://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/refGene.sql
    FIELD_NAMES = ['bin',
                   'name',
                   'chrom',
                   'strand',
                   'txStart',
                   'txEnd',
                   'cdsStart',
                   'cdsEnd',
                   'exonCount',
                   'exonStarts',
                   'exonEnds',
                   'score',
                   'name2',
                   'cdsStartStat',
                   'cdsEndStat',
                   'exonFrames',]


class RefGeneFile(DictReader) :
    '''An iterable object (subclasses csv.DictReader) containing the records in
    the supplied refGene formatted file'''
    def __init__(self,refGene_fn) :
        refGene_f = open(refGene_fn)
        # check for header
        first_line = refGene_f.next()
        if not first_line.strip().startswith('#') :
            refGene_f.seek(0) # first line not header, reset the file pointer
        DictReader.__init__(self,refGene_f,delimiter='\t',fieldnames=RefGeneOutput.FIELD_NAMES)

class RefGeneFile_nottested(SmartFileIter) :
    '''An iterable object containing the records in the supplied UCSC RefGene
    refFlat formatted file (see e.g.
    http://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/refGene.sql)'''
    FIELD_NAMES = ['bin',
                   'name',
                   'chrom',
                   'strand',
                   'txStart',
                   'txEnd',
                   'cdsStart',
                   'cdsEnd',
                   'exonCount',
                   'exonStarts',
                   'exonEnds',
                   'score',
                   'name2',
                   'cdsStartStat',
                   'cdsEndStat',
                   'exonFrames',]
    FIELD_TYPES = [str,str,str,str,int,int,int,int,int,
                   lambda x: [int(y) for y in x.split(',') if len(y) > 0],
                   lambda x: [int(y) for y in x.split(',') if len(y) > 0],
                   float,
                   str,str,str,str]

class KnownGeneFile(SmartFileIter) :
    '''An iterable that parses UCSC's KnownGene gene annotation files.  Field
    names are::

      FIELD_NAMES = [ 'name',
                      'chrom',
                      'strand',
                      'txStart',
                      'txEnd',
                      'cdsStart',
                      'cdsEnd',
                      'exonCount',
                      'exonStarts',
                      'exonEnds',
                      'proteinID',
                      'alignID',
                    ]
    '''

    FIELD_NAMES = [ 'name',
                    'chrom',
                    'strand',
                    'txStart',
                    'txEnd',
                    'cdsStart',
                    'cdsEnd',
                    'exonCount',
                    'exonStarts',
                    'exonEnds',
                    'proteinID',
                    'alignID',
                  ]

    # function pointers for correct formatting of field names
    FIELD_TYPES = [ str,
                    str,
                    str,
                    int,
                    int,
                    int,
                    int,
                    lambda x: [int(y) for y in x.split(',') if len(y) > 0],
                    lambda x: [int(y) for y in x.split(',') if len(y) > 0],
                    lambda x: [int(y) for y in x.split(',') if len(y) > 0],
                    str,
                    str,
                  ]

    def __init__(self,kg_fn) :
        # NOTE(review): deliberately does NOT call SmartFileIter.__init__,
        # so comment/header skipping is bypassed for KnownGene files
        self.meta_data = []
        self.file_info = {}
        f = open(kg_fn)
        self._dict_reader = DictReader(f,delimiter='\t',fieldnames=KnownGeneFile.FIELD_NAMES)

    def __iter__(self) :
        return self

    def next(self) :
        line = self._dict_reader.next()
        for k,f in zip(self.FIELD_NAMES,self.FIELD_TYPES) :
            line[k] = f(line[k])
        return line


#TODO maybe, finish this
class RefGeneDB :
    '''A class for querying RefGene annotation files. NOT DONE.'''

    def __init__(self,refgene_fn) :
        self._chrom_trees = defaultdict(KeyedBinaryTree)
        refgene_f = RefGeneFile(refgene_fn)
        genes = defaultdict(list)
        for gene in refgene_f :
            genes[gene['chrom']].append(gene)

        # do stuff to ensure a balanced tree for each chromosome
        for chrom,gene_list in genes.items() :
            gene_list.sort(key=lambda x: int(x['txStart']))
            first_half, second_half = gene_list[:len(gene_list)/2],gene_list[len(gene_list)/2:]
            first_half.reverse()
            # BUGFIX: original called len(first_half,second_half) which is a
            # TypeError; take the min of the two list lengths
            for i in range(min(len(first_half),len(second_half))) :
                # NOTE(review): pop(i) from a shrinking list skips elements;
                # pop(0) may have been intended - class is marked NOT DONE
                to_add = first_half.pop(i)
                self._chrom_trees[chrom].addNode(int(to_add['txStart']),to_add)


class MACSFile(SmartFileIter) :
    '''An iterable object containing the records in the supplied MACS peak file.

    This class parses the comments found in the header of MACS peak files and
    extracts metadata into the member dictionary **file_info** (e.g. 'MACS
    version', 'name', 'tag size', 'total tags in treatment', 'd', ...).  The
    complete header can be found as a list in the **meta_data** member with
    one comment per item.  The field names available are::

      FIELD_NAMES = ['chr',
                     'start',
                     'end',
                     'length',
                     'summit',
                     'tags',
                     '-10*log10(pvalue)',
                     'fold_enrichment',
                     'FDR(%)',
                    ]

    '''
    FIELD_NAMES = ['chr',
                   'start',
                   'end',
                   'length',
                   'summit',
                   'tags',
                   '-10*log10(pvalue)',
                   'fold_enrichment',
                   'FDR(%)',
                  ]

    FIELD_TYPES = [str,
                   int,
                   int,
                   int,
                   int,
                   int,
                   float,
                   float,
                   float
                  ]

    # each regex captures (metadata key, value) from one MACS header comment
    _METADATA_REGEXES = [
        u'# This file is generated by (MACS version) (.*)',
        u'# (name) = (.*)',
        u'# (format) = (.*)',
        u'# (ChIP-seq file) = (.*)',
        u'# (control file) = (.*)',
        u'# (effective genome size) = (.*)',
        u'# (band width) = (\d+)',
        u'# (model fold) = (.*)',
        u'# (pvalue cutoff) = (.*)',
        u'# (Range for calculating regional lambda) is: (.*)',
        u'# (tag size) is determined as (\d+) bps',
        u'# (total tags in treatment): (\d+)',
        u'# (tags after filtering in treatment): (\d+)',
        u'# (maximum duplicate tags at the same position in treatment) = (\d+)',
        u'# (Redundant rate in treatment): (.*)',
        u'# (total tags in control): (.*)',
        u'# (tags after filtering in control): (.*)',
        u'# (maximum duplicate tags at the same position in control) = (\d+)',
        u'# (Redundant rate in control): (.*)',
        u'# (d) = (\d+)'
        ]

    def __init__(self,macs_fn) :
        self.meta_data = []
        self.file_info = {}
        if isinstance(macs_fn,str) :
            f = open(macs_fn)
        else :
            f = macs_fn
        # consume the comment header, harvesting metadata as we go; the
        # header ends at the column-name row
        done_with_header = False
        while not done_with_header :
            l = f.next().strip()
            if l.startswith('#') :
                for regex in MACSFile._METADATA_REGEXES :
                    m = re.search(regex,l)
                    if m is not None :
                        self.file_info[m.group(1).strip()] = parse_number(m.group(2).strip())
                self.meta_data.append(l)
            elif l.startswith('\t'.join(MACSOutput.FIELD_NAMES[:5])) :
                self.meta_data.append(l)
                done_with_header = True

        SmartFileIter.__init__(self,f)


# for backwards compatibility, use MACSFile instead...?
class MACSOutput(object) :
    FIELD_NAMES = MACSFile.FIELD_NAMES

# settings file locations: package-wide and per-user
GLOBAL_SETTINGS_FN = os.path.join(os.path.split(chipsequtil.__file__)[0],'org_settings.cfg')
LOCAL_SETTINGS_FN = os.path.expanduser(os.path.join('~','.org_settings.cfg'))
_ALL_SETTINGS, _LOCAL_SETTINGS, _GLOBAL_SETTINGS = range(3)

def _get_org_settings(org_key=None,addnl_configs=[],src=_ALL_SETTINGS) :
    """Utility function used by get_org_settings and get_all_settings, should \
not be called directly"""

    config = ConfigParser()
    # local settings are read first so ConfigParser lets global values
    # override per the order passed to config.read
    conf_fns = []
    if src in [_LOCAL_SETTINGS, _ALL_SETTINGS] :
        conf_fns.append(LOCAL_SETTINGS_FN)
    if src in [_GLOBAL_SETTINGS, _ALL_SETTINGS] :
        conf_fns.append(GLOBAL_SETTINGS_FN)
    config.read(conf_fns+addnl_configs)

    d = {}
    if org_key is None :
        for sec in config.sections() :
            # try to cast numeric-looking arguments into float, int
            d[sec] = dict([(k,parse_number(v)) for k,v in config.items(sec)])
    else :
        d = dict([(k,parse_number(v)) for k,v in config.items(org_key)])

    return d


def get_org_settings(org_key,addnl_configs=[]) :
    '''Returns a dict of setting/path values for a given organism as specified
    in system-wide and user's settings.  *org_key* is the organism name as found
    in the config file, *e.g.* mm9.  *addnl_configs* are filenames of other
    configuration files to add to the set of settings, usually not needed.

    Example usage::

      >>> get_org_settings('mm9')['genome_dir']
      '/nfs/genomes/mouse_gp_jul_07'

    '''
    return _get_org_settings(org_key,addnl_configs=addnl_configs)


def get_all_settings(addnl_configs=[]) :
    '''Returns a dict of setting/path values for every organism as specified in
    system-wide and user's settings.'''
    return _get_org_settings(None,addnl_configs=addnl_configs)


def get_global_settings() :
    '''Returns a dict of the global setting/path values installed with the
    package.'''
    return _get_org_settings(None,src=_GLOBAL_SETTINGS)


def get_local_settings() :
    '''Returns a dict of the current user's setting/path values taken from
    ~/.org_settings.cfg if it exists.'''
    return _get_org_settings(None,src=_LOCAL_SETTINGS)


def check_org_settings(org_key,setting_list) :
    '''Returns true if all setting names in *setting_list* are found in the
    org settings for organism *org_key* and false otherwise.  Mostly used
    internally to sanity check org settings.'''
    settings = get_org_settings(org_key)
    return all([s in settings.keys() for s in setting_list])


# translation table mapping each nucleotide to its complement, case-preserving
RC_MAP = string.maketrans('acgtACGT','tgcaTGCA')
def reverse_complement(seq) :
    """Reverse complements nucleotide string *seq*. Leaves non-nucleotide characters unaffected."""
    return seq.translate(RC_MAP)[::-1]


def get_gc_content(seq) :
    '''returns the GC content of a DNA sequence as python string'''
    # NOTE(review): raises ZeroDivisionError on an empty sequence
    seq = seq.lower()
    return (seq.count('c')+seq.count('g'))/float(len(seq))


def get_gc_content_distribution(sequences,bins=100) :
    '''returns a list of *bins* normalized counts approximating the GC content
    distribution of the provided sequences. Approximation is performed by binning.'''
    gc_contents = [get_gc_content(s) for s in sequences]
    gc_contents.sort()

    # count up the sequences for each bin
    bin_counts = [0.]*bins
    for c in gc_contents :
        # BUGFIX: clamp so a GC content of exactly 1.0 falls in the last
        # bin instead of raising IndexError
        sample_bin = min(int(math.floor(c*bins)),bins-1)
        bin_counts[sample_bin] += 1

    # normalize bin counts
    norm_bins = [x/len(sequences) for x in bin_counts]

    # create a closure for this set of sequences
    #def f(seq) :
    #    gc = get_gc_content(seq)
    #    return norm_bins[int(math.floor(gc*bins))]

    return norm_bins


def get_size_distribution(sequences) :
    # NOTE(review): returns a generator of lengths, not a distribution
    return (len(s) for s in sequences)
--- a/chipsequtil-master/src/chipsequtil/motiftools.py Mon Mar 28 11:56:10 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,2064 +0,0 @@ -""" -There is a large number of functions and member fucntions here. To get started, -a motif can be instantiated by providing an ambiguity code, a set of aligned DNA -sequences, or from matrices of counts, probabilities or log-likelihoods (akaPSSMs). - ->>> m = MotifTools.Motif_from_text('TGAAACANNSYWT') ->>> print m.oneletter() -TGAAACA..sywT - -Lower case reflects lower information content. For a more detailed view of the distribution -of information, try this:: - - >>> m.textlogo() - # -- 2.30 bits - # - # TGAAACA T - # TGAAACA T - # TGAAACA T - # TGAAACA T - # TGAAACA CCAT - # TGAAACA CCAT - # TGAAACA GTTT - # TGAAACA GTTT -- 0.23 bits - # ------------- - # TGAAACA..sywT - - -Motif objects may be manipulated largely like text strings (with pythonic -indexing):: - - >>> print m[4:5].oneletter - A - >>> print m[4:7].oneletter - ACA - >>> print (m[4:7] + m[1:2]).oneletter - ACAG - >>> print (m[4:7] + m[1:7]).oneletter - ACAGAAACA - -and even padded with blanks:: - - >>> print m[-4:7] - ...TGAAACA - -.. Copyright (2005) Whitehead Institute for Biomedical Research -.. 
All Rights Reserved - -Author: David Benjamin Gordon - -Modified by: Adam Labadorf - -""" -import copy -import math -import os -import pickle -import re -import string -import sys -import tempfile - -pysum = sum - -from random import random,shuffle -from subprocess import call - -from chipsequtil import reverse_complement -class MotifToolsException(Exception) : pass - -one2two = { 'W':'AT', 'M':'AC', 'R':'AG', - 'S':'CG', 'Y':'CT', 'K':'GT'} -two2one = { 'AT': 'W', 'AC': 'M', 'AG': 'R', - 'CG': 'S', 'CT': 'Y', 'GT': 'K'} -revcomp = { 'A':'T', 'T':'A', 'C':'G', 'G':'C', - 'W':'W', 'S':'S', 'K':'M', 'M':'K', - 'Y':'R', 'R':'Y', 'N':'N', - 'B':'N', 'D':'N', 'H':'N', 'V':'N', ' ':'N'} #[12-11-02] Needs fixing - -ACGT = list('ACGT') -YEAST_BG = {'A': 0.31, 'C': .19, 'G': .19, 'T': .31} #Yeast default background freqs - -revcomplement_memo = {'A':'T'} -revcompTBL = string.maketrans("AGCTagctWSKMYRnN", "TCGAtcgaWSMKTYnN") -def revcomplement(seq): - """A quick reverse-complement routine that memo-izes queries, understands - IUPAC ambiguity codes, and preserves case.""" - global revcomplement_memo - try: - rc = revcomplement_memo[seq] - except KeyError: - #_t = map(lambda x,D=revcomp: D[x], seq) - #get = revcomp.get - #_t = map(get, seq) - _t = list(seq.translate(revcompTBL)) - _t.reverse() - rc = ''.join(_t) - revcomplement_memo[seq] = rc - revcomplement_memo[rc] = seq - return rc - - -def Motif_from_ll(ll): - """Constructs a motif object from a log-likelihood matrix, which is in the - form of a list of dictionaries.""" - m = Motif(None,None) - m.compute_from_ll(ll) - return m - -def Motif_from_counts(countmat,beta=0.01,bg={'A':.25,'C':.25,'G':.25,'T':.25}): - """ - Construct a Motif object from a matrix of counts (or probabilities or frequencies). - A default set of uniform background frequencies may be overridden. 
- - beta refers to the number of pseudocounts that should be distributed over each position - of the PSSM.""" - m = Motif('',bg) - m.compute_from_counts(countmat,beta) - return m - -def Motif_from_text(text,beta=0.05,source='',bg=None): - """Construct a Motif object from a text string constructed from IUPAC - ambiguity codes. - - A default set of uniform background frequencies may be overridden with - a dictionary of the form {'A':.25,'C':.25,'G':.25,'T':.25}). - - beta refers to the number of pseudocounts that should be distributed over each position - of the PSSM.""" - if not bg: bg={'A':.25,'C':.25,'G':.25,'T':.25} - m = Motif('',bg) - m.compute_from_text(text,beta) - m.source = source - return m - -def copy(motif): - """Utility routine for copying motifs""" - a = copy.deepcopy(motif) - #a.__dict__ = motif.__dict__.copy() - return a - -class Motif: - """A pssm model, with scanning, storing, loading, and other operations. A - uniform nucleotide background is assumed if none is provided.""" - def __init__(self,list_of_seqs_or_text=[],backgroundD=None): - self.MAP = 0 - self.evalue = None - self.oneletter = '' - self.nseqs = 0 - self.counts = [] - self.width = 0 - self.fracs = [] - self.logP = [] - self.ll = [] - self.bits = [] - self.totalbits = 0 - self.maxscore = 0 - self.minscore = 0 - self.pvalue = 1 - self.pvalue_rank = 1 - self.church = None - self.church_rank = 1 - self.Cpvalue = 1 - self.Cpvalue_rank= 1 - self.Cchurch = 1 - self.Cchurch_rank= 1 - self.binomial = None - self.binomial_rank=1 - self.E_seq = None - self.frac = None - self.E_site = None - self.E_chi2 = None - self.kellis = None - self.MNCP = None - self.ROC_auc = None - self.realpvalue = None - self.Cfrac = None - self.CRA = None - self.valid = None - self.seeddist = 0 - self.seednum = -1 - self.seedtxt = None - self.family = None - self.source = None - self.threshold = None - self._bestseqs = None - self.bgscale = 1 - self.best_pvalue = None - self.best_factor = None - self.gamma = None - 
self.nbound = 0 - self.matchids = [] - self.overlap = None - self.cumP = [] - self.numbound = 0 - self.nummotif = 0 - self.numboundmotif = 0 - self.dataset = None - self.bgfile = None - self.cverror = None - self.beta = None - self.match_thresh = None - self.progscore = None - if backgroundD: - self.background = backgroundD - else: - #self.background = {'A': 0.31, 'C': .19, 'G': .19, 'T': .31} #Yeast Default - self.background = {'A':.25,'C':.25,'G':.25,'T':.25} # uniform background - - if type(list_of_seqs_or_text) == type(''): - self.seqs = [] - text = list_of_seqs_or_text - self.compute_from_text(text) - else: - self.seqs = list_of_seqs_or_text - if self.seqs: - self._parse_seqs(list_of_seqs_or_text) - self._compute_ll() - self._compute_oneletter() - #self._compute_threshold(2.0) - - def __repr__(self): - return "%s (%d)"%(self.oneletter, self.nseqs) - - def __str__(self): - return "%s (%d)"%(self.oneletter, self.nseqs) - - def summary(self): - """return a text string one-line summary of motif and its metrics""" - m = self - txt = "%-34s (Bits: %5.2f MAP: %7.2f D: %5.3f %3d) E: %7.3f"%( - m, m.totalbits, m.MAP, m.seeddist, m.seednum, nlog10(m.pvalue)) - if m.binomial!=None: txt = txt + ' Bi: %6.2f'%(nlog10(m.binomial)) - if m.church != None: txt = txt + ' ch: %6.2f'%(nlog10(m.church)) - if m.frac != None: txt = txt + ' f: %5.3f'%(m.frac) - if m.E_site != None: txt = txt + ' Es: %6.2f'%(nlog10(m.E_site)) - if m.E_seq != None: txt = txt + ' Eq: %6.2f'%(nlog10(m.E_seq)) - if m.MNCP != None: txt = txt + ' mn: %6.2f'%(m.MNCP) - if m.ROC_auc!= None: txt = txt + ' Ra: %6.4f'%(m.ROC_auc) - if m.E_chi2 != None: - if m.E_chi2 == 0: m.E_chi2=1e-20 - txt = txt + ' x2: %5.2f'%(nlog10(m.E_chi2)) - if m.CRA != None: txt = txt + ' cR: %6.4f'%(m.CRA) - if m.Cfrac != None: txt = txt + ' Cf: %5.3f'%(m.Cfrac) - if m.realpvalue != None: txt = txt + ' P: %6.4e'%(m.realpvalue) - if m.kellis != None: txt = txt + ' k: %6.2f'%(m.kellis) - if m.numbound : txt = txt + ' b: %3d'%(m.numbound) 
- if m.nummotif : txt = txt + ' nG: %3d'%(m.nummotif) - if m.numboundmotif : txt = txt + ' bn: %3d'%(m.numboundmotif) - - return txt - - def minimal_raw_seqs(self): - '''return minimal list of seqs that represent consensus ''' - seqs = [[], []] - for letter in self.oneletter: - if one2two.has_key(letter): - seqs[0].append(one2two[letter][0]) - seqs[1].append(one2two[letter][1]) - else: - seqs[0].append(letter) - seqs[1].append(letter) - if ''.join(seqs[0]) == ''.join(seqs[1]): - return [''.join(seqs[0])] - else: - return [''.join(seqs[0]), ''.join(seqs[0])] - def _compute_oneletter(self): - """set the oneletter member variable""" - letters = [] - for i in range(self.width): - downcase = None - if self.bits[i] < 0.25: - letters.append('.') - continue - if self.bits[i] < 1.0: downcase = 'True' - tups = [(self.ll[i][x],x) for x in ACGT if self.ll[i][x] > 0.0] - if not tups: #Kludge if all values are negative (can this really happen?) - tups = [(self.ll[i][x],x) for x in ACGT] - tups.sort() - tups.reverse() - tups = [tups[0]] - downcase = 'True' - tups.sort() #Rank by LL - tups.reverse() - bases = [x[1] for x in tups[0:2]] - bases.sort() - if len(bases) == 2: L = two2one[''.join(bases)] - else: L = bases[0] - if downcase: L = L.lower() - letters.append(L) - self.oneletter = ''.join(letters) - def _parse_seqs(self, LOS): - """build a matrix of counts from a list of sequences""" - self.nseqs = len(LOS) - self.width = len(LOS[0]) - for i in range(self.width): - Dc = {'A': 0, 'C': 0, 'T': 0, 'G': 0, 'N': 0} - for seq in LOS: - key = seq[i] - Dc[key] = Dc[key] + 1 - del(Dc['N']) - self.counts.append(Dc) - - def _compute_ll(self): - """compute the log-likelihood matrix from the count matrix""" - self.fracs = [] - self.logP = [] - self.ll = [] - for i in range(self.width): - - Dll = {'A': 0, 'C': 0, 'T': 0, 'G': 0} - Df = {'A': 0, 'C': 0, 'T': 0, 'G': 0} - DlogP= {'A': 0, 'C': 0, 'T': 0, 'G': 0} - - for nuc in self.counts[i].keys(): - - #print 
i,nuc,self.counts[i][nuc],self.nseqs - # Dll[nuc] = log2( position nucleotide count/background sequence count ) - # Dll[nuc] = log2( (count[nuc]+bgscale*bg[nuc])/(bg[nuc]*(num_seqs+bgscale)) ) - - pos_nuc_count = self.counts[i][nuc] + self.bgscale*self.background.get(nuc,0.) - adj_all_nuc_count = (self.nseqs + self.bgscale) * self.background.get(nuc,1e-10) - - Dll[nuc] = math.log(pos_nuc_count/adj_all_nuc_count,2) - - Pij = self.counts[i][nuc] / float(self.nseqs) - Df [nuc] = Pij - if Pij > 0: - DlogP[nuc] = math.log(Pij) / math.log(2.) - else: - DlogP[nuc] = -100 #Near zero - - self.fracs.append(Df) - self.logP.append (DlogP) - self.ll.append (Dll) - self.P = self.fracs - self._compute_bits() - self._compute_ambig_ll() - self._maxscore() - - - def compute_from_ll(self,ll): - """build motif from an inputed log-likelihood matrix - - (This function reverse-calculates the probability matrix and background frequencies - that were used to construct the log-likelihood matrix) - """ - self.ll = ll - self.width = len(ll) - self._compute_bg_from_ll() - self._compute_logP_from_ll() - self._compute_ambig_ll() - self._compute_bits() - self._compute_oneletter() - self._maxscore() - - def _computeP(self): - """compute the probability matrix (from the internal log-probability matrix)""" - P = [] - for i in range(self.width): - #print i, - _p = {} - for L in ACGT: _p[L] = math.pow(2.0,self.logP[i][L]) - P.append(_p) - #print - self.P = P - - def _compute_bits(self): - """set m.totbits to the number of bits and m.bits to a list of bits at - each position""" - bits = [] - totbits = 0 - bgbits = 0 - bg = self.background - UNCERT = lambda x: x*math.log(x)/math.log(2.0) - for letter in ACGT: - bgbits = bgbits + UNCERT(bg[letter]) - for i in range(self.width): - tot = 0 - for letter in ACGT: - Pij = pow(2.0, self.logP[i][letter]) - tot = tot + UNCERT(Pij) - #bit = Pij * self.ll[i][letter] - #if bit > 0: - # tot = tot + bit - #print tot, bgbits, tot-bgbits - 
bits.append(max(0,tot-bgbits)) - totbits = totbits + max(0,tot-bgbits) - self.bits = bits - self.totalbits = totbits - - - def denoise(self,bitthresh=0.5): - """set low-information positions (below bitthresh) to Ns""" - for i in range(self.width): - tot = 0 - for letter in ACGT: - if self.logP: - Pij = pow(2.0, self.logP[i][letter]) - else: - Pij = pow(2.0, self.ll[i][letter]) * self.background[letter] - if Pij > 0.01: - bit = Pij * self.ll[i][letter] - tot = tot + bit - if tot < bitthresh: #Zero Column - for letter in ACGT: - self.ll[i][letter] = 0.0 - self.compute_from_ll(self.ll) - - def giflogo(self,id,title=None,scale=0.8,info_str=''): - """make a gif sequence logo""" - return giflogo(self,id,title,scale) - - def printlogo(self,norm=2.3, height=10.0): - """print a text-rendering of the Motif Logo - - norm - maximum number of bits to show - height - number of lines of text to use to render logo - """ - self._print_bits(norm,height) - def print_textlogo(self,norm=2.3, height=8.0): - """print a text-rendering of the Motif Logo - - norm - maximum number of bits to show - height - number of lines of text to use to render logo - """ - self._print_bits(norm,height) - def _print_bits(self,norm=2.3, height=8.0): - """print a text-rendering of the Motif Logo - - norm - maximum number of bits to show - height - number of lines of text to use to render logo - """ - bits = [] - tots = [] - str = [] - for i in range(self.width): - D = {} - tot = 0 - for letter in ['A', 'C', 'T', 'G']: - if self.logP: - Pij = pow(2.0, self.logP[i][letter]) - else: - Pij = pow(2.0, self.ll[i][letter]) * self.background[letter] - if Pij > 0.01: - '''Old''' - D[letter] = Pij * self.ll[i][letter] - #'''new''' - #Q = self.background[letter] - #D[letter] = ( Pij * math.log(Pij) - Pij * math.log(Q) ) / math.log(2.0) - '''for both old and new''' - tot = tot + D[letter] - bits.append(D) - tots.append(tot) - for i in range(self.width): - s = [] - _l = bits[i].keys() - _l.sort(lambda x,y,D=bits[i]: 
cmp(D[y],D[x])) - for key in _l: - for j in range(int(bits[i][key] / norm * height)): - s.append(key) - str.append(''.join(s)) - fmt = '%%%ds'%height - print '# %s'%('-'*self.width) - for h in range(int(height)): - sys.stdout.write("# ") - for i in range(self.width): - sys.stdout.write((fmt%str[i])[h]) - if h == 0: - sys.stdout.write(' -- %4.2f bits\n'%norm) - elif h == height-1: - sys.stdout.write(' -- %4.2f bits\n'%(norm/height)) - else: - sys.stdout.write('\n') - print '# %s'%('-'*self.width) - print '# %s'%self.oneletter - - def _compute_ambig_ll(self): - """extend log-likelihood matrix to include ambiguity codes - e.g. What the score of a 'S'? Here we use the max of C and G.""" - for Dll in self.ll: - for L in one2two.keys(): - Dll[L] = max(Dll[one2two[L][0]], Dll[one2two[L][1]] ) - Dll['N'] = 0.0 - Dll['B'] = 0.0 - - def compute_from_nmer(self,nmer,beta=0.001): #For reverse compatibility - """See compute_from_text. Here for reverse compatibility""" - self.compute_from_text(nmer,beta) - - def compute_from_text(self,text,beta=0.001): - """compute a matrix values from a text string of ambiguity codes. - Use Motif_from_text utility instead to build motifs on the fly.""" - prevlett = {'B':'A', 'D':'C', 'V':'T', 'H':'G'} - countmat = [] - text = re.sub('[\.\-]','N',text.upper()) - for i in range(len(text)): - D = {'A': 0, 'C': 0, 'T':0, 'G':0} - letter = text[i] - if letter in ['B', 'D', 'V', 'H']: #B == no "A", etc... - _omit = prevlett[letter] - for L in ACGT: - if L != _omit: D[L] = 0.3333 - elif one2two.has_key(letter): #Covers WSMYRK - for L in list(one2two[letter]): - D[L] = 0.5 - elif letter == 'N': - for L in D.keys(): - D[L] = self.background[L] - elif letter == '@': - for L in D.keys(): - D[L] = self.background[L]-(0.0001) - D['A'] = D['A'] + 0.0004 - else: - D[letter] = 1.0 - countmat.append(D) - self.compute_from_counts(countmat,beta) - - def new_bg(self,bg): - """change the ACGT background frequencies to those in the supplied dictionary. 
- Recompute log-likelihood, etc. with new background. - """ - counts = [] - for pos in self.logP: - D = {} - for L,lp in pos.items(): - D[L] = math.pow(2.0,lp) - counts.append(D) - self.background = bg - self.compute_from_counts(counts,0) - - def addpseudocounts(self,beta=0): - """add pseudocounts uniformly across the matrix""" - self.compute_from_counts(self.counts,beta) - - def compute_from_counts(self,countmat,beta=0): - """build a motif object from a matrix of letter counts.""" - self.counts = countmat - self.width = len(countmat) - self.bgscale = 0 - - maxcount = 0 - #Determine Biggest column - for col in countmat: - tot = pysum(col.values()) - if tot > maxcount : - maxcount = tot - - #Pad counts of remaining columns - for col in countmat: - tot = pysum(col.values()) - pad = maxcount - tot - for L in col.keys(): - col[L] = col[L] + pad * self.background.get(L,0.) - - self.nseqs = maxcount - nseqs = maxcount - - #Add pseudocounts - if beta > 0: - multfactor = {} - bgprob = self.background - pcounts= {} - for L in bgprob.keys(): - pcounts[L] = beta*bgprob[L]*nseqs - for i in range(self.width): - for L in countmat[i].keys(): - _t = (countmat[i][L] + pcounts[L]) #Add pseudo - _t = _t / (1.0 + beta) #Renomalize - countmat[i][L] = _t - - #Build Motif - self.counts = countmat - self._compute_ll() - self._compute_oneletter() - self._maxscore() - - - def _compute_bg_from_ll(self): - """compute background model from log-likelihood matrix - by noting that: pA + pT + pC + pG = 1 - and bgA + bgT + bgC + bgG = 1 - and bgA = bgT, bgC = bgG - and so bgA = 0.5 - bgC - and pA = lA * bgA, etc for T, C, G - so... 
- (lA + lT)bgA + (lC + lG)bgC = 1 - (lA + lT)bgA + (lC + lG)(0.5 - bgA) = 1 - (lA + lT - lC - lG)bgA +(lC +lG)*0.5 = 1 - bgA = {1 - 0.5(lC + lG)} / (lA + lT - lC - lG) - + Gain accuracy by taking average of bgA over all positions of PSSM - """ - - pow = math.pow - bgATtot = 0 - nocount = 0 - near0 = lambda x:(-0.01 < x and x < 0.01) - for i in range(self.width): - _D = self.ll[i] - ATtot = pow(2,_D['A']) + pow(2,_D['T']) - GCtot = pow(2,_D['C']) + pow(2,_D['G']) - if near0(_D['A']) and near0(_D['T']) and near0(_D['G']) and near0(_D['C']): - nocount = nocount + 1 - continue - if near0(ATtot-GCtot): #Kludge to deal with indeterminate case - nocount = nocount + 1 - continue - bgAT = (1.0 - 0.5*GCtot)/(ATtot - GCtot) - if (bgAT < 0.1) or (bgAT > 1.1): - nocount = nocount + 1 - continue - bgATtot = bgATtot + bgAT - if nocount == self.width: #Kludge to deal with different indeterminate case - self.background = {'A':0.25, 'C':0.25, 'G':0.25, 'T':0.25} - return - bgAT = bgATtot / (self.width - nocount) - bgGC = 0.5 - bgAT - self.background = {'A':bgAT, 'C':bgGC, 'G':bgGC, 'T':bgAT} - - def _compute_logP_from_ll(self): - """compute self's logP matrix from the self.ll (log-likelihood)""" - log = math.log - logP = [] - for i in range(self.width): - D = {} - for L in ACGT: - ''' if ll = log(p/b) then - 2^ll = p/b - and ll = log(p) - log(b) - so log(p) = ll + log(b)''' - #Pij = pow(2.0, self.ll[i][letter]) * self.background[letter] - D[L] = self.ll[i][L] + log(self.background[L])/log(2.) 
- logP.append(D) - self.logP = logP - - def _print_ll(self): - """print log-likelihood (scoring) matrix""" - print "# ", - for i in range(self.width): - print " %4d "%i, - print - for L in ['A', 'C', 'T', 'G']: - print "#%s "%L, - for i in range(self.width): - print "%8.3f "%self.ll[i][L], - print - def _print_p(self): - """print probability (frequency) matrix""" - print "# ", - for i in range(self.width): - print " %4d "%i, - print - for L in ['A', 'C', 'T', 'G']: - print "#%s "%L, - for i in range(self.width): - print "%8.3f "%math.pow(2,self.logP[i][L]), - print - def _print_counts(self): - """print count matrix""" - print "# ", - for i in range(self.width): - print " %4d "%i, - print - for L in ['A', 'C', 'T', 'G']: - print "#%s "%L, - for i in range(self.width): - print "%8.3f "%self.counts[i][L], - print - - def _maxscore(self): - """sets self.maxscore and self.minscore""" - total = 0 - lowtot= 0 - for lli in self.ll: - total = total + max(lli.values()) - lowtot= lowtot+ min(lli.values()) - self.maxscore = total - self.minscore = lowtot - - def _compute_threshold(self,z=2.0): - """for Motif objects assembled from a set of sequence, - compute a self.threshold with a z-score based on the distribution - of scores in among the original input sequences. 
- """ - scoretally = [] - for seq in self.seqs: - matches,endpoints,scores = self.scan(seq,-100) - scoretally.append(scores[0]) - ave,std = avestd(scoretally) - self.threshold = ave - z *std - #print '#%s: threshold %5.2f = %5.2f - %4.1f * %5.2f'%( - # self, self.threshold, ave, z, std) - - def bestscanseq(self,seq): - """return score,sequence of the best match to the motif in the supplied sequence""" - matches,endpoints,scores = self.scan(seq,-100) - t = zip(scores,matches) - t.sort() - bestseq = t[-1][1] - bestscore = t[-1][0] - return bestscore, bestseq - - def bestscore(self,seq): - """return the score of the best match to the motif in the supplied sequence""" - return m.bestscan(seq) - - def bestscan(self,seq): - """return the score of the best match to the motif in the supplied sequence""" - matches,endpoints,scores = self.scan(seq,-100) - if not scores: return -100 - scores.sort() - best = scores[-1] - return best - - def matchstartorient(self,seq, factor=0.7): - """returns list of (start,orientation) coordinate pairs of matches to - the motif in the supplied sequence. Factor is multiplied by m.maxscore - to get a match threshold. - """ - ans = [] - txts,endpoints,scores = self.scan(seq,factor=factor) - for txt, startstop in zip(txts,endpoints): - start, stop = startstop - rctxt = reverse_complement(txt) - orient = (self.bestscore(txt,1) >= self.bestscore(rctxt,1)) - ans.append((start,orient)) - return ans - - def scan(self, seq, threshold = '', factor=0.7): - """ - Scan the sequence. Returns three lists: matching sequences, endpoints, - and scores. The value of 'factor' is multiplied by m.maxscore to get a - match threshold if none is supplied - """ - if len(seq) < self.width: - return self._scan_smaller(seq,threshold) - else: - return self._scan(seq,threshold,factor=factor) - - def scansum(self,seq,threshold = -1000): - """ - Sum of scores over every window in the sequence. 
Returns - total, number of matches above threshold, average score, sum of exp(score) - """ - ll = self.ll - sum = 0 - width = self.width - width_r = range(width) - width_rcr = range(width-1,-1,-1) - width_ranges = zip(width_r,width_rcr) - seqcomp = seq.translate(revcompTBL) - - total = 0 - hits = 0 - etotal= 0 - for offset in range(len(seq)-width+1): - total_f = 0 - total_r = 0 - for i,ir in width_ranges: - pos = offset+i - total_f = total_f + ll[i][ seq[pos]] - total_r = total_r + ll[i][seqcomp[pos]] - total_max = max(total_f,total_r) - if total_max >= threshold: - total = total + total_max - etotal = etotal + math.exp(total_max) - hits = hits + 1 - if not hits: - ave = 0 - else: - ave = float(total)/float(hits) - return total,hits,ave,math.log(etotal) - - def score(self, seq, fwd='Y'): - """returns the score of the first w-bases of the sequence, where w is the motif width.""" - matches, endpoints, scores = self._scan(seq,threshold=-100000,forw_only=fwd) - return scores[0] - - def bestscore(self,seq, fwd=''): - """returns the score of the best matching subsequence in seq.""" - matches, endpoints, scores = self._scan(seq,threshold=-100000,forw_only=fwd) - if scores: return max(scores) - else: return -1000 - - def _scan(self, seq,threshold='',forw_only='',factor=0.7): - """internal tility function for performing sequence scans""" - ll = self.ll #Shortcut for Log-likelihood matrix - if not threshold: threshold = factor * self.maxscore - - #print '%5.3f'%(threshold/self.maxscore) - matches = [] - endpoints = [] - scores = [] - width = self.width - width_r = range(width) - width_rcr = range(width-1,-1,-1) - width_ranges = zip(width_r,width_rcr) - - seqcomp = seq.translate(revcompTBL) - - for offset in range(len(seq)-self.width+1): #Check if +/-1 needed - total_f = 0 - total_r = 0 - for i,ir in width_ranges: - pos = offset+i - total_f = total_f + ll[i ][ seq[pos]] - total_r = total_r + ll[ir][seqcomp[pos]] - - if 0 and total_f > 1: - for i,ir in width_ranges: - print 
seq[offset+i],'%6.3f'%ll[i ][ seq[offset+i] ],' ', - print '= %7.3f'%total_f - - if 0: - print "\t\t%s vs %s: F=%6.2f R=%6.2f %6.2f %4.2f"%(seq[offset:offset+self.width], - self.oneletter,total_f,total_r, - self.maxscore, - max([total_f,total_r])/self.maxscore) - if total_f > threshold and ((total_f > total_r) or forw_only): - endpoints.append( (offset,offset+self.width-1) ) - scores.append(total_f) - matches.append(seq[offset:offset+self.width]) - elif total_r > threshold: - endpoints.append( (offset,offset+self.width-1) ) - scores.append(total_r) - matches.append(seq[offset:offset+self.width]) - return matches,endpoints,scores - def _scan_smaller(self, seq, threshold=''): - """internal utility function for performing sequence scans. The sequence - is smaller than the PSSM. Are there good matches to regions of the PSSM?""" - ll = self.ll #Shortcut for Log-likelihood matrix - matches = [] - endpoints = [] - scores = [] - w = self.width - for offset in range(self.width-len(seq)+1): #Check if +/-1 needed - maximum = 0 - for i in range(len(seq)): - maximum = maximum + max(ll[i+offset].values()) - if not threshold: threshold = 0.8 * maximum - total_f = 0 - total_r = 0 - for i in range(len(seq)): - total_f = total_f + ll[i+offset ][ seq[i] ] - total_r = total_r + ll[w-(i+offset)-1][revcomp[seq[i]]] - if 0: - print "\t\t%s vs %s: F=%6.2f R=%6.2f %6.2f %4.2f"%(seq, self.oneletter[offset:offset+len(seq)], - total_f, total_r, maximum, - max([total_f,total_r])/self.maxscore) - if total_f > threshold and total_f > total_r: - endpoints.append( (offset,offset+self.width-1) ) - scores.append(total_f) - matches.append(seq[offset:offset+self.width]) - elif total_r > threshold: - endpoints.append( (offset,offset+self.width-1) ) - scores.append(total_r) - matches.append(seq[offset:offset+self.width]) - return matches,endpoints,scores - - def mask_seq(self,seq): - """return a copy of input sequence in which any regions matching m are - replaced with strings of N's """ - masked = '' - 
matches, endpoints, scores = self.scan(seq) - cursor = 0 - for start, stop in endpoints: - masked = masked + seq[cursor:start] + 'N'*self.width - cursor = stop+1 - masked = masked + seq[cursor:] - return masked - - def masked_neighborhoods(self,seq,flanksize): - """chop up the input sequence into regions surrounding matches to m. - Replace the subsequences that match the motif with N's.""" - ns = self.seq_neighborhoods(seq,flanksize) - return [self.mask_seq(n) for n in ns] - - def seq_neighborhoods(self,seq,flanksize): - """chop up the input sequence into regions surrounding matches to the - motif.""" - subseqs = [] - matches, endpoints, scores = self.scan(seq) - laststart, laststop = -1, -1 - for start, stop in endpoints: - curstart, curstop = max(0,start-flanksize), min(stop+flanksize,len(seq)) - if curstart > laststop: - if laststop != -1: - subseqs.append(seq[laststart:laststop]) - laststart, laststop = curstart, curstop - else: - laststop = curstop - if endpoints: subseqs.append(seq[laststart:laststop]) - return subseqs - - def __sub__(self,other): - pass - """Overloads the '-' operator to compute the Euclidean distance between - probability matrices motifs of equal width.""" - if type(other) != type(self): - print "computing distance of unlike pssms (types %s, %s)"%( - type(other),type(self)) - print 'First: %s'%other - print 'Self: %s'%self - sys.exit(1) - if other.width != self.width: - print "computing distance of unlike pssms (width %d != %d)"%( - other.width,self.width) - sys.exit(1) - D = 0 - FABS = math.fabs - POW = math.pow - for L in self.logP[0].keys(): - for i in range(self.width): - D = D + POW( POW(2,self.logP[i][L]) - POW(2,other.logP[i][L]), 2 ) - #D = D + FABS( POW(2,self.logP[i][L]) - POW(2,other.logP[i][L])) - #D = D + FABS(self.logP[i][L] - other.logP[i][L]) - return math.sqrt(D) - - def maskdiff(self,other): - """a different kind of motif comparison metric. 
See THEME paper for - details""" - return maskdiff(self,other) - - def maxdiff(self): - """compute maximum possible Euclidean distance to another motif. (For - normalizing?)""" - POW = math.pow - D = 0 - for i in range(self.width): - _min = 100 - _max = -100 - for L in ACGT: - val = POW(2,self.logP[i][L]) - if val > _max: - _max = val - _maxL = L - elif val < _min: - _min = val - _minL = L - for L in ACGT: - if L == _minL: - delta = 1-POW(2,self.logP[i][L]) #1-val - D = D + delta*delta - else: - D = D + POW( POW(2,self.logP[i][L]), 2) #0-val - return math.sqrt(D) - - def revcomp(self): - """return reverse complement of motif""" - return revcompmotif(self) - def trimmed(self,thresh=0.1): - """return motif with low-information flanks removed. 'thresh' is in bits.""" - for start in range(0,self.width-1): - if self.bits[start]>=thresh: break - for stop in range(self.width,1,-1): - if self.bits[stop-1]>=thresh: break - m = self[start,stop] - return m - def bestseqs(self,thresh=None): - """return all k-mers that match motif with a score >= thresh""" - if not thresh: - if self._bestseqs: - return self._bestseqs - if not thresh: thresh = 0.8 * self.maxscore - self._bestseqs = bestseqs(self,thresh) - return self._bestseqs - def emit(self,prob_min=0.0,prob_max=1.0): - """consider motif as a generative model, and have it emit a sequence""" - if not self.cumP: - for logcol in self.logP: - tups = [] - for L in ACGT: - p = math.pow(2,logcol[L]) - tups.append((p,L)) - tups.sort() - cumu = [] - tot = 0 - for p,L in tups: - tot = tot + p - cumu.append((tot,L)) - self.cumP.append(cumu) - s = [] - #u = random()+0.01 #Can make higher for more consistent motifs - for cumu in self.cumP: - u = (prob_max-prob_min)*random() + prob_min - #u = random()+0.01 #Can make higher for more consistent motifs - last = 0 - for p,L in cumu: - if last < u and u <= p: - letter = L - break - else: last = p -# print L,'%8.4f'%u,cumu - s.append(L) - #print ''.join(s) - return ''.join(s) - - - def 
random_kmer(self): - """generate one of the many k-mers that matches the motif. See m.emit() - for a more probabilistic generator""" - if not self._bestseqs: self._bestseqs = self.bestseqs() - seqs = self._bestseqs - pos = int(random() * len(seqs)) - print 'Random: ',self.oneletter,seqs[pos][1] - return seqs[pos][1] - - def __getitem__(self,tup): - pass - """ - m.__getitem__(tup) -- Overload m[a,b] to submotif. Less pythonish than [:], but more reliable - """ - if len(tup) != 2: - print "Motif[i,j] requires two arguments, not ",tup - else: - beg, end = tup[0], tup[1] - return submotif(self,beg,end) - def __getslice__(self,beg,end): - pass - """ - m.__getslice__(,beg,end) -- Overload m[a:b] to submotif. - """ - if beg >= end: - #Probably python converted negative idx. Undo - beg = beg - self.width - return submotif(self,beg,end) - def __add__(self,other): - pass - """ - m.__add__(other) -- Overload '+' for concatenating motifs - """ - return merge(self,other,0) - def __len__(self): - pass - """ - m.__len__() -- Overload len(m) to return width - """ - return self.width - def shuffledP(self): - """ - m.shuffledP() -- Generate motif in which probability matrix has been shuffled. 
- """ - return shuffledP(self) - def copy(self): - """return a 'deep' copy of the motif""" - a = Motif() - a.__dict__ = self.__dict__.copy() - return a - - def random_diff_avestd(self,iters=5000): - """see modules' random_diff_avestd""" - return random_diff_avestd(self,iters) - def bogus_kmers(self,count=200): - """Generate a faked multiple sequence alignment that will reproduce the - probability matrix.""" - - POW = math.pow - #Build p-value inspired matrix - #Make totals cummulative: - # A: 0.1 C: 0.4 T:0.2 G:0.3 - # -> A:0.0 C:0.1 T:0.5 G:0.7 0.0 - - #Take bg into account: - # We want to pick P' for each letter such that: - # P'/0.25 = P/Q - # so P' = 0.25*P/Q - - m = [] - for i in range(self.width): - _col = [] - tot = 0.0 - for L in ACGT: - _col.append( tot ) - tot = tot + POW(2,self.logP[i][L]) * 0.25 / self.background[L] - _col.append(tot) - #Renormalize - for idx in range(len(_col)): - _col[idx] = _col[idx] / _col[-1] - m.append(_col) - - for p in range(0): #Was 5 - for i in range(len(m)): - print '%6.4f '%m[i][p], - print - - seqs=[] - for seqnum in range(count+1): - f = float(seqnum)/(count+1) - s = [] - for i in range(self.width): - for j in range(4): - if (m[i][j] <= f and f < m[i][j+1]): - s.append(ACGT[j]) - break - seqs.append(''.join(s)) - - del(seqs[0]) - #for i in range(count): - # print ">%3d\n%s"%(i,seqs[i]) - - return seqs - - -def minwindowdiff(M1,M2,overlap=5,diffmethod='diff'): - #Alternate method: maskdiff, infomaskdiff - if type(M1) != type(M2): - print "Error: Attempted to compute alignment of objects that are not both Motifs" - print " types %s: %s and %s: %s"%(M1,type(M1),M2,type(M2)) - sys.exit(1) - - if M1.width <= M2.width: A = M1; Borig = M2 - else: A = M2; Borig = M1 - wA = A.width - wB = Borig.width - O = overlap - - if diffmethod == 'diff': - diff_fcn = diff - elif diffmethod == 'maskdiff': - diff_fcn = maskdiff - elif diffmethod == 'infomaskdiff': - diff_fcn = infomaskdiff - - mindiff = 1000 - #print 'minwindodebug wA ', wA, 'wB 
', wB, 'O ', O, 'wA-0', wA-O, 'wB-O', wB-O - for Astart in range(wA-O+1): - subA = A[Astart:Astart+O] - for B in [Borig, Borig.revcomp()]: - for Bstart in range(wB-O+1): - subB = B[Bstart:Bstart+O] - mindiff = min(mindiff, diff_fcn(subA,subB)) - #print 'minwindodebug ',subA, subB, diff_fcn(subA,subB) - return mindiff - - -def minaligndiff(M1,M2,overlap=5,diffmethod='diff'): - #Alternate method: maskdiff, infomaskdiff - if type(M1) != type(M2): - print "Error: Attempted to compute alignment of objects that are not both Motifs" - print " types %s: %s and %s: %s"%(M1,type(M1),M2,type(M2)) - sys.exit(1) - - if M1.width <= M2.width: - A = M1; Borig = M2 - switch = 0 - else: - A = M2; Borig = M1 - switch = 1 - wA = A.width - wB = Borig.width - O = overlap - - ''' - Here is the figure to imagine: - 012345678901234567890 wA: 6 Bstart: 6-3 = 3 - A (A) wB: 11 Bstop: 6+11-3-1= 13 - ------ %%%%%% O: 3 lastA: 6+11-3-3= 11 - ----------- - |O| B - ''' - - if diffmethod == 'diff': - diff_fcn = diff - elif diffmethod == 'maskdiff': - diff_fcn = maskdiff - elif diffmethod == 'infomaskdiff': - diff_fcn = infomaskdiff - - Bstart = wA-O - Bstop = wA+wB-O-1 - lastA = wA+wB-O-O - Dmin = 1000 - Dmins=[] - #print A - #print '%s%s'%(' '*Bstart,Borig) - for B in [Borig, Borig.revcomp()]: - for start in range(0,lastA+1): - Bpos = [] - Apos = [] - for offset in range(wA): - abs = start+offset - if abs >= Bstart and abs <= Bstop: - Apos.append(offset) - Bpos.append(abs-Bstart) - subA = A[min(Apos),max(Apos)+1] - subB = B[min(Bpos),max(Bpos)+1] - #print '%s%s\n%s%s %f'%( - # ' '*start, subA, - # ' '*start, subB, diff_fcn(subA,subB)) - if switch: _diff = diff_fcn(subB,subA) - else: _diff = diff_fcn(subA,subB) - Dmin = min(Dmin, _diff) - return Dmin - -''' -To compare 2 motifs of the same width, there are these five functions: - -m1 - m2 - Euclidean Distance (sqrt(sum_col(sum_row))) -diff(m1,m2) - psuedo-Euclidean (sum_col(sqrt(norm(sum_row)))/#col -maskdiff(m1,m2) - diff, but excluding positions 
with "N" in m2 -infomaskdiff(m1,m2)- diff, but scaling distance by normalized - information content at each position in m2. -diverge(m1,m2) - Mutual information sum[p log (p/q)] - -**Note that maskdiff, infomaskdiff, and diverge are not symmetric functions - -To compare 2 motifs of different widths, there is the function: - -minaligndiff(M1,M2,overlap=5,diffmethod='diff') - -this does a 'sliding' comparison of two motifs and reports the minimum -distance over all alignments. overlap refers to the minumum overlap -required while sliding. Below, overlap is '2'. The default is '5'. - - ------ - ----------- - -You can optionally specify the distance metric as a text string. -The default is 'diff'. - -''' - - -def diff(self,other): - """psuedo-Euclidean (sum_col(sqrt(norm(sum_row)))/#col""" - if type(other) != type(self): - print "computing distance of unlike pssms (types %s, %s)"%( - type(other),type(self)) - print 'First: %s'%other - print 'Self: %s'%self - sys.exit(1) - if other.width != self.width: - print "computing distance of unlike pssms (width %d != %d)"%( - other.width,self.width) - sys.exit(1) - POW = math.pow - Dtot = 0 - for i in range(self.width): - '''Computes distance''' - D = 0 - for L in ACGT: - D = D + POW( POW(2,self.logP[i][L]) - POW(2,other.logP[i][L]), 2 ) - Dtot = Dtot + math.sqrt(D)/math.sqrt(2.0) - return Dtot/self.width - - -def maskdiff(self,other): - """diff, but excluding positions with 'N' in m2. 
Return pseudo-Euclidean - distance, but only include columns that are not background.""" - if type(other) != type(self): - print "computing distance of unlike pssms (types %s, %s)"%( - type(other),type(self)) - print 'First: %s'%other - print 'Self: %s'%self - sys.exit(1) - if other.width != self.width: - print "computing distance of unlike pssms (width %d != %d)"%( - other.width,self.width) - sys.exit(1) - - Dtot = 0 - POW = math.pow - NEAR0= lambda x:(-0.01 < x and x < 0.01) - divisor = 0 - for i in range(self.width): - nearcount = 0 - - '''Implements mask''' - for L in ACGT: - diff = POW(2,other.logP[i][L]) - other.background[L] - if NEAR0(diff): nearcount = nearcount + 1 - if nearcount == 4: - #print 'Skipping position %d :'%i,other.logP[i] - continue - - '''Computes distance''' - divisor = divisor + 1 - D = 0 - for L in ACGT: - D = D + POW( POW(2,self.logP[i][L]) - POW(2,other.logP[i][L]), 2 ) - Dtot = Dtot + math.sqrt(D)/math.sqrt(2.0) - return Dtot/divisor - -def infomaskdiff(self,other): - """Return pseudo-Euclidean distance, but scale column distance by - information content of "other". Used by THEME""" - if type(other) != type(self): - print "computing distance of unlike pssms (types %s, %s)"%( - type(other),type(self)) - print 'First: %s'%other - print 'Self: %s'%self - sys.exit(1) - if other.width != self.width: - print "computing distance of unlike pssms (width %d != %d)"%( - other.width,self.width) - sys.exit(1) - - maxbits = math.log( 1.0/min(other.background.values()) ) / math.log(2.0) - '''or... 
alternatively''' - #print maxbits, max(other.bits) - #print other.bits - maxbits = max(other.bits) - if maxbits < 0.1: #'''There is nothing important here''' - return 1 - - Dtot = 0 - POW = math.pow - divisor = 0 - '''Computes distance''' - for i in range(self.width): - D = 0 - for L in ACGT: - D = D + POW( POW(2,self.logP[i][L]) - POW(2,other.logP[i][L]), 2 ) - col_dist = math.sqrt(D)/math.sqrt(2.0) - col_scale = other.bits[i]/maxbits - divisor = divisor + col_scale - Dtot = Dtot + col_dist*col_scale - return Dtot/divisor - -def diverge(self,other): - """Yet another distance metric""" - if type(other) != type(self): - print "computing distance of unlike pssms (types %s, %s)"%( - type(other),type(self)) - print 'First: %s'%other - print 'Self: %s'%self - sys.exit(1) - if other.width != self.width: - print "computing distance of unlike pssms (width %d != %d)"%( - other.width,self.width) - sys.exit(1) - - Dtot = 0 - POW = math.pow - LOG2 = lambda x:math.log(x)/math.log(2.0) - NEAR0= lambda x:(-0.01 < x and x < 0.01) - divisor = 0 - for i in range(self.width): - nearcount = 0 - - '''Implements mask''' - for L in ACGT: - diff = POW(2,other.logP[i][L]) - self.background[L] - if NEAR0(diff): nearcount = nearcount + 1 - if nearcount == 4: - #print 'Skipping position %d :'%i,other.logP[i] - continue - - '''Computes distance''' - divisor = divisor + 1 - D = 0 - for L in ACGT: - Pself = POW(2, self.logP[i][L]) - Pother= POW(2,other.logP[i][L]) - D = D + Pself * LOG2(Pself/Pother) - Dtot = Dtot + D - return Dtot/divisor - - - -def bestseqs(motif,thresh, seq='',score=0,depth=0,bestcomplete=None,SEQS=[]): - """This function returns a list of all sequences that a motif could - match match with a sum(log-odds) score greater than thresh.""" - if depth == 0: - SEQS = [] #Must be a python 2.1 bug. 
I shouldn't have to do this - if not bestcomplete: - M = motif - maxs = [] - for i in range(M.width): - bestj = 'A' - for j in ['C', 'G', 'T']: - if M.ll[i][j] > M.ll[i][bestj]: - bestj = j - maxs.append(M.ll[i][bestj]) - bestcomplete = [] - for i in range(M.width): - tot = 0 - for j in range(i,M.width): - tot = tot + maxs[j] - bestcomplete.append(tot) - if depth == motif.width: - if score > thresh: - SEQS.append((score,seq)) - #if len(SEQS) > 2000: - # thresh = 1000.0 # Return Early, You don't really want all these sequences, do you? - return - if depth==-1: - print '# %-10s %6.3f %6.3f %2d'%(seq, score, bestcomplete[depth], depth) - if score + bestcomplete[depth] < thresh: return - #if depth > 0 and len(SEQS) > 2000: - # return - for L in ACGT: - newseq = seq + L - newscore = score + motif.ll[depth][L] - bestseqs(motif,thresh,newseq,newscore,depth+1,bestcomplete,SEQS) - if depth == 0: - SEQS.sort() - SEQS.reverse() - return SEQS - -def seqs2fasta(seqs,fasta_file = ''): - """ - seqs2fasta(seqs,fasta_file = '') -- Dumps a Fasta formatted file of sequences, - keyed by the sequence itself:: - - >ACTTTTTGTCCCA - ACTTTTTGTCCCA - >ACTTTTGGGGCCA - ACTTTTGGGGCCA - ... - - """ - if not fasta_file: - fasta_file = tempfile.mktemp() - FH = open(fasta_file,'w') - for i in range(len(seqs)): - FH.write(">%d\n%s\n"%(i,seqs[i])) - FH.close() - return fasta_file - -def top_nmers(N,seqs,with_counts = 0,purge_Ns = ''): - """Assemble list of all nmers (kmers) with width 'N' from supplied sequences. - Option with_counts returns list of (kmer, count) tuples instead. Purge N's - ignores kmers containing N's. 
""" - Nmers = {} - revcompTBL = string.maketrans("AGCTagctnN", "TCGAtcganN") - for seq in seqs: - for i in range(len(seq)-N+1): - Nmer = seq[i:i+N] - if purge_Ns: - if Nmer.find('N') >= 0: continue - _t = list(Nmer.translate(revcompTBL)) - _t.reverse() - NmerRC = ''.join(_t) # _t used until here to revese comp seq - _t = [Nmer, NmerRC] - _t.sort() - NmerKey = _t[0] # _t used until here to get alphabetically first seq - if Nmers.has_key(NmerKey): - Nmers[NmerKey] = Nmers[NmerKey] + 1 - else: - Nmers[NmerKey] = 1 - sorted = Nmers.keys() - sorted.sort(lambda x,y,D=Nmers:cmp(D[y],D[x]) or cmp(x,y)) - #for i in range(10): - # print "# %2d %s %d"%(i,sorted[i],Nmers[sorted[i]]) - if with_counts: - return zip(sorted,map(lambda x,N=Nmers:N[x], sorted)) - else: - return sorted - -def m_matches(seqs,wmer,m): - """Returns list of all kmers among sequences that have at most - m mismatches to the supplied wmer (kmer).""" - matches = [] - width = len(wmer) - for (nmer, count) in top_nmers(width,seqs,'with counts'): - match = 0 - for i in range(width): - if nmer[i] == wmer[i]: - match = match+1 - if match >= m: - for i in range(count): - matches.append(nmer) - return matches - -def compare_seqs(s1, s2): - pass - """ - compare_seqs(s1, s2) - """ - if len(s1) > len(s2): - long = s1 - short = s2 - else: - long = s2 - short = s1 - (maxcount,max_i) = (0,0) - for i in range(len(long)-len(short)+1): - idcount_f = 0 - idcount_r = 0 - for j in range(len(short)): - if short[j] == long[i+j]: - idcount_f = idcount_f + 1 - if short[-(j+1)] == revcomp[long[i+j]]: - idcount_r = idcount_r + 1 - if (idcount_f > maxcount and idcount_f >= idcount_r): - maxcount = idcount_f - max_i = i - elif (idcount_r > maxcount): - maxcount = idcount_r - max_i = i - #print i,j,idcount_f,idcount_r,maxcount - maxfrac = float(maxcount) / len(short) - print maxfrac,maxcount,len(short) - return maxfrac,short,long[max_i:max_i+len(short)] - -def shuffle_bases(m): - """return a new motif object in which the probabilities 
are randomly - re-assigned to different letters at the same position.""" - C = [] - letts = list('ACGT') - for i in range(m.width): - D = {} - vals = m.counts[i].values() - shuffle(vals) - for i in range(4): - D[letts[i]] = vals[i] - C.append(D) - n = Motif() - #n.__dict__ = m.__dict__.copy() #May copy too much information (cached diff information, etc...) - n.compute_from_counts(C) - return n - -def random_diff_avestd(motif,iters=5000): - """Return the average & stddev distance ('diff') between a - motif and "iters" random motifs of the same width.""" - w = motif.width - vals = [] - for i in range(iters): - vals.append(motif - Random_motif(w)) - return avestd(vals) - -def random_motif(w): - """Generate a random motif of width w. Each position will have a dominant - letter with probability around 0.91.""" - C = [] - for i in range(w): - D = {} - tot = 0 - p = int(random.random() * 4) - Lup = ACGT[p] - for L in ACGT: - D[L] = 0.1 - tot = tot + 0.001 - D[Lup] = D[Lup] + 1 - for L in ACGT: - D[L] = D[L]/tot - C.append(D) - m = Motif() - m.compute_from_counts(C) - return m - -def toDict(M): - pass - ''' - toDict(M) -- Convert a 2D array to a list of dictionaries (which is how the motif object - stores information internally). Assumes M entries are in alphabetical order (ACGT) - ''' - if type(M[0]) == type(0.0): - return toDictVect(M) - else: - a = [] - for i in range(len(M)): - a.append(toDictVect(M[i])) - return a - -def toDictVect(V): - pass - """ - toDictVect(V) -- Convert a 1D vector to a dictionary of DNA letters. Assumes values - in V are in alphabetical order (ACGT). - """ - D = {} - for L,i in (('A',0), ('C',1), ('G',2), ('T',3)): - D[L]=V[i] - return D - -def submotif(self,beg,end): - """**Deprecated** Use slice functionality (m[2:4]) instead. 
- - Utility function - for extracting sub-motifs and padding motifs.""" - bg = self.background.copy() - P = [] - - #Determine if any 'zeros' should be added at begining - #because the user has specified a negative beg index - for i in range(beg,0): - P.append(bg.copy()) - - #Copy relevant content of motif - start = max(beg,0) - stop = min(end,self.width) - for i in range(start,stop): - D = {} - for L in ACGT: - D[L] = math.pow(2.,self.logP[i][L]) - P.append(D) - - #Determine if any 'zeros' should be added at the end - #because the user has specified a width too large - for i in range(self.width,end): - P.append(bg.copy()) - - #print "BEG, END", beg,end - #for i in range(beg,end): - # print i,P[i] - - #Build the Motif - M = copy.deepcopy(self) - #M = Motif(None,bg.copy()) - M.compute_from_counts(P) - M.source = self.source - return M - -def shuffledP(self): - """Construct a motif in which the letter distributions are preserved but - are reassigned to rondom positions in the motif.""" - bg = self.background.copy() - P = [] - - #Copy relevant content of motif - for i in range(0,self.width): - D = {} - _s = ACGT[:] - shuffle(_s) - for L,_L in zip(ACGT,_s): - D[L] = math.pow(2.,self.logP[i][_L]) - P.append(D) - - #Build the Motif - M = copy.deepcopy(self) - #M = Motif(None,bg.copy()) - M.compute_from_counts(P) - M.source = self.source - return M - -def revcompmotif(self): - """Construct the reverse complement of the motif. Use m.revcomp() member - function instead.""" - bg = self.background.copy() - P = [] - - for i in range(self.width): - D = {} - for L in ACGT: - D[L] = math.pow(2.,self.logP[self.width-i-1][revcomp[L]]) - P.append(D) - - #Build the Motif - M = copy.deepcopy(self) - M.compute_from_counts(P) - return M - - -def sum(motifs,weights=[]): - """Perhaps better called 'average'. Constructs a motif by averaging the - probabilities at each position of the (pre-aligned) input motifs. Optional - weights can be assigned, and must be in the same order as the motifs. 
- """ - if not weights: - weights = [1.0] * len(motifs) - tot = 0.0 - for w in weights: tot=tot+float(w) - weights = [(w/tot) for w in weights] - C = [] - for c in motifs[0].fracs: - D = {} - for L in ACGT: D[L] = 0.0 - C.append(D) - for m,w in zip(motifs,weights): - for i in range(m.width): - for L in ACGT: - C[i][L] = C[i][L] + m.fracs[i][L]*w - motif = Motif_from_counts(C,0.0,bg=motifs[0].background) - return motif.trimmed() - - -def giflogo(motif,id,title=None,scale=0.8): - """Interface to the 'weblogo/seqlogo' perl - scripts that generate colorful sequence logos - """ - return seqlogo(motif,id,title,scale,format='GIF') - - -seqlogo_formats = ('GIF','PDF','EPS','PNG') -illegal_fn_chars = '&;/ ()' -fn_trans = string.maketrans(illegal_fn_chars,'_'*len(illegal_fn_chars)) -def seqlogo(motif,motif_id,title=None,scale=0.8,img_format='GIF') : - """Interface to the'weblogo/seqlogo' perl scripts that generate colorful - sequence logos. Available formats are %s. Replaces illegal filename - characters in *id* parameter (i.e. '%s') with underscores when writing - to file. The executable *seqlogo* must be on your path. - """%(seqlogo_formats,illegal_fn_chars) - #SEQLOGO = TAMOpaths.weblogodir + 'seqlogo' - #TAMOpaths.CHECK(SEQLOGO,'','Weblogo/Seqlogo') - kmers = motif.bogus_kmers(100) - width = float(len(kmers[0]) ) - height = float(4) - m = motif - width, height = width*scale, height*scale - tmp = tempfile.mktemp() + '.fsa' - if title is None: - title = motif_id - - if img_format.upper() not in seqlogo_formats : - raise MotifToolsException('seqlogo requires one of %s'%seqlogo_formats) - - seqs2fasta(kmers,tmp) - fn = id.translate(fn_trans) - cmd = 'seqlogo -F %s -acpY -w%d -h%d -k 1 -M -f %s -o %s -t "%s" '%( - img_format.upper(), width, height, tmp, fn, title) - - call(cmd,shell=True) - return "%s.%s"%(fn,img_format.lower()) - - -def merge(A,B,overlap=0): - """**Deprecated** Use the '+' operator instead. 
- - Used for concatenating motifs into a new motif, allowing for the averaging - of overlapping bases between them. - """ - if (overlap < 0 or overlap > A.width or overlap >B.width): - print 'Cannot overlap %s with %s by %d bases'%(A.oneletter,B.oneletter,overlap) - return None - - #Build Probability matrix. Width will be A.width + B.width - overlap - w = A.width + B.width - overlap - - P = [] - #Make a copy of A's probabilities into P - for i in range(A.width): - D = {} - logP = A.logP[i] - for L in logP.keys(): - D[L] = math.pow(2,logP[L]) - P.append(D) - #Add B's first 'overlap' probabilities to last 'overlap' probabilities of P - for i in range(overlap): - logP = B.logP[i] - Pidx = len(P)-overlap+i - _tot = 0 - for L in logP.keys(): - P[Pidx][L] = (P[Pidx][L] + math.pow(2,logP[L])) / 2.0 - P[Pidx][L] = max(P[Pidx][L],math.pow(2,logP[L])) - _tot = _tot + P[Pidx][L] - for L in logP.keys(): - P[Pidx][L] = P[Pidx][L] / _tot - #Append B's remaining probabilites to P - for i in range(overlap,B.width): - D = {} - logP = B.logP[i] - for L in logP.keys(): - D[L] = math.pow(2,logP[L]) - P.append(D) - - #Build a motif - M = Motif(None,A.background.copy()) - M.source = A.source,B.source - M.compute_from_counts(P) - return M - -def avestd(vals): - """return an (average, stddev) tuple computed from the supplied list of values""" - (sum, sum2) = (0.,0.) 
- N = float(len(vals)) - for val in vals: - sum = sum + float(val) - sum2 = sum2 + float(val)*float(val) - if N == 1: - ave = sum - std = 0 - else: - ave = sum / N - std = math.sqrt( (sum2-(N*ave*ave)) / (N-1.0) ) - return ave,std - - -def load(filename): - """load a 'TAMO'-formatted motif file""" - FID = open(filename,'r') - lines = FID.readlines() - FID.close() - motifs = [] - seedD = {} - seedfile = '' - for i in range(len(lines)): - if lines[i][0:10] == 'Log-odds matrix'[0:10]: - w = len(lines[i+1].split())-1 - ll = [] - for pos in range(w): - ll.append({}) - for j in range(0,4): - toks = lines[i+j+2].split() - L = toks[0][1] - for pos in range(w): - ll[pos][L] = float(toks[pos+1]) - m = Motif_from_ll(ll) - motifs.append(m) - if lines[i][0:6] == 'Motif '[0:6]: - toks = lines[i].split() - motifs[-1].nseqs = float(re.sub('[\(\)]','',toks[3])) - motifs[-1].totalbits= float(toks[5]) - motifs[-1].MAP = float(toks[7]) - motifs[-1].seeddist = float(toks[9]) - motifs[-1].seednum = int(toks[10][0:-1]) - motifs[-1].pvalue = math.pow(10,-float(toks[12])) - - if 'ch:' in toks: - _idx = toks.index('ch:') - motifs[-1].church = math.pow(10,-float(toks[_idx+1])) - if 'Es:' in toks: - _idx = toks.index('Es:') - motifs[-1].E_site = math.pow(10,-float(toks[_idx+1])) - if 'x2:' in toks: - _idx = toks.index('x2:') - motifs[-1].E_chi2 = math.pow(10,-float(toks[_idx+1])) - if 'Eq:' in toks: - _idx = toks.index('Eq:') - motifs[-1].E_seq = math.pow(10,-float(toks[_idx+1])) - if 'mn:' in toks: - _idx = toks.index('mn:') - motifs[-1].MNCP = float(toks[_idx+1]) - if 'f:' in toks: - _idx = toks.index('f:') - motifs[-1].frac = float(toks[_idx+1]) - if 'Ra:' in toks: - _idx = toks.index('Ra:') - motifs[-1].ROC_auc = float(toks[_idx+1]) - if 'cR:' in toks: - _idx = toks.index('cR:') - motifs[-1].CRA = float(toks[_idx+1]) - if 'Cf:' in toks: - _idx = toks.index('Cf:') - motifs[-1].Cfrac = float(toks[_idx+1]) - if 'k:' in toks: - _idx = toks.index('k:') - motifs[-1].kellis = float(toks[_idx+1]) 
- - if 'b:' in toks: - _idx = toks.index('b:') - motifs[-1].numbound = int(toks[_idx+1]) - if 'nG:' in toks: - _idx = toks.index('nG:') - motifs[-1].nummotif = int(toks[_idx+1]) - if 'bn:' in toks: - _idx = toks.index('bn:') - motifs[-1].numboundmotif = int(toks[_idx+1]) - - - - if lines[i][0:10] == 'Threshold: '[0:10]: - toks = lines[i].split() - motifs[-1].threshold= float(toks[1]) - if lines[i][0:5] == 'Seed '[0:5]: - toks = lines[i].split() - id = int(toks[1][0:-1]) #'10:' -> '10' - seedD[id] = toks[2] - if lines[i][0:7] == 'Source: '[0:7]: - motifs[-1].source = lines[i][7:].strip() - if lines[i][0:6] == 'Gamma: '[0:6]: - motifs[-1].gamma = float(lines[i][6:]) - if lines[i][0:6] == 'Evalue: '[0:6]: - motifs[-1].evalue = float(lines[i][7:].strip()) - if lines[i][0:22]=='Program specific score: '[0:22]: - tempprogscore=lines[i][23:].split(":"); - - for i in range(len(tempprogscore)): - tempprogscore[i]=tempprogscore[i].strip() - - if len(tempprogscore)>1: - try: - tempprogscore[1]=float(tempprogscore[1]) - except ValueError: - tempprogscore[1]=tempprogscore[1] - motifs[-1].progscore=tempprogscore - - if lines[i][0:10] == 'fasta file:'[0:10]: - parts=lines[i].strip().split() - motifs[-1].dataset, motifs[-1].beta, motifs[-1].bgfile = \ - parts[2],float(parts[4]), parts[7] - - if lines[i][0:21]=='classification error: '[0:21]: - motifs[-1].cverror=float(lines[i][22:].strip()) - if lines[i][0:20]=='SVM match threshold: '[0:20]: - motifs[-1].match_thresh=float(lines[i][21:].strip()) - if lines[i].find('Using')>=0 and lines[i].find('as seeds')>=0: - '''#Using all (132) motifs in SLT_081503.seeds as seeds:''' - seedfile = lines[i].split()[-3] - for i in range(len(motifs)): - if seedfile: motifs[i].seedfile = seedfile - seednum = motifs[i].seednum - if seedD.has_key(seednum): - motifs[i].seedtxt = seedD[seednum] - return motifs - -def save_motifs(motifs,filename,kmer_count=20): - """Save list of motifs as a 'TAMO'-formatted motif file to the specificied file. 
- optional kmer_count specificies how many sequences to include in the printed - multiple sequence alignment that recapitulates the probability matrix.""" - try : - print_motifs(motifs,kmer_count,f=filename) - except: - print '!-- Error saving motifs to %s'%filename - raise - -def print_motif(motif,kmer_count=20,istart=0,f=None): - """Print a motif in the 'TAMO'-format. istart specificies the motif number, and - optional kmer_count specificies how many sequences to include in the printed - multiple sequence alignment that recapitulates the probability matrix. """ - print_motifs([motif],kmer_count,istart) - sys.stdout.flush() - -def print_motifs(motifs,kmer_count=20,istart=0,f=None): - """Print list of motifs as a 'TAMO'-formatted motif file to the specificied file. - Optional kmer_count specificies how many sequences to include in the printed - multiple sequence alignment that recapitulates the probability matrix. - istart specifies number from which to begin motif ids.""" - - # handle f input cases - if f is None : - f = sys.stdout - elif isinstance(f,str) : - f = open(f,'w') - - i = istart-1 - for m in motifs: - i = i + 1 - print >>f, "Log-odds matrix for Motif %3d %s"%(i,m) - m._print >>f, _ll() - #print >>f, "Probability matrix for Motif %3d %s"%(i,m) - #m._print >>f, _p() - print >>f, "Sequence Logo" - m._print >>f, _bits() - for newprop in ('gamma', 'church', 'E_site', 'E_seq', 'E_chi2', 'realpvalue', - 'kellis', 'MNCP', 'ROC_auc', 'CRA', 'Cfrac', 'frac', 'binomial'): - if not m.__dict__.has_key(newprop): #Kludge to deal w/ old shelves - m.__dict__[newprop] = None - if m.seedtxt: print >>f, "Seed: %3d %s"%(i,m.seedtxt) - if m.gamma: print >>f, "Gamma: %7.5f"%m.gamma - if m.evalue != None: print >>f, 'Evalue: %6.3e'%m.evalue - if m.progscore is not None : - printableProgscore=(m.progscore[0],str(m.progscore[1])) - print >>f, 'Program specific score: '+ ": ".join(printableProgscore) - - if m.family: print >>f, "Family: ",m.family - if m.source: print >>f, 
"Source: ",m.source - if m.dataset: print >>f, "fasta file: %s beta: %f background sequences: %s"%(m.dataset,m.beta,m.bgfile) - if m.match_thresh: print >>f, "SVM match threshold: ",m.match_thresh - if m.cverror: print >>f, "classification error: ",m.cverror - #Motif 0 NGAGGGGGNN (0) (Bits: 8.24 MAP: 6.53 D: 0.21 0) Enr: 54.000 - print >>f, "Motif %3d %-25s (Bits: %5.2f MAP: %5.2f D: %5.3f %2d) E: %6.3f"%( - i, m, m.totalbits, m.MAP, m.seeddist, m.seednum, nlog10(m.pvalue)), - if m.binomial!=None: print >>f, ' Bi: %5.2f'%nlog10(m.binomial), - if m.church != None: print >>f, ' ch: %5.2f'%nlog10(m.church), - if m.frac != None: print >>f, ' f: %5.2f'%(m.frac), - if m.E_site != None: print >>f, ' Es: %5.2f'%nlog10(m.E_site), - if m.E_seq != None: print >>f, ' Eq: %5.2f'%(nlog10(m.E_seq)), - if m.MNCP != None: print >>f, ' mn: %5.2f'%(m.MNCP), - if m.ROC_auc!= None: print >>f, ' Ra: %6.4f'%(m.ROC_auc), - if m.E_chi2 != None: - if m.E_chi2 == 0: m.E_chi2=1e-20 - print >>f, ' x2: %5.2f'%(nlog10(m.E_chi2)), - if m.CRA != None: print >>f, ' cR: %6.4f'%(m.CRA), - if m.Cfrac != None: print >>f, ' Cf: %6.4f'%(m.Cfrac), - if m.realpvalue != None: print >>f, ' P: %6.4e'%(m.realpvalue) - if m.kellis != None: print >>f, ' k: %5.2f'%(m.kellis), - try: - if m.numbound : print >>f, ' b: %3d'%(m.numbound), - if m.nummotif : print >>f, ' nG: %3d'%(m.nummotif), - if m.numboundmotif : print >>f, ' bn: %3d'%(m.numboundmotif), - except: pass - print >>f, '' - - _max = m.maxscore - m.maxscore = -100 - if kmer_count >= 0: - seqs = m.bogus_kmers(kmer_count) - else: - seqs = m.seqs - - for seq in seqs: - print >>f, seq,i,m.scan(seq)[2][0] - - m.maxscore = _max - print >>f, '*'*m.width - print >>f, "MAP Score: %f"%(m.MAP) - -def nlog10(x,min=1e-323): - """returns -log10(x) with a maximum default value of 323.""" - if x < min: x=min - try: - return math.fabs(math.log(x)/math.log(10)) - except: - return 0 - -def txt2motifs(txt,VERBOSE=1): - """Convert a text string into a list of motifs: - 
Examples: - - 'TGASTCA,GAATC' --> 2 motifs from ambiguity codes - 'results.tamo' --> All motifs in TAMO-format file - 'results.tamo:34,45' --> Motifs 34 and 45 in TAMO-format file - 'results.pickle' --> All motifs in pickle (list or dict of Motifs) - 'results.pickle%GAL4 --> 'GAL4' entry in results.pickle dictionary - 'results.pickle:34,45 -> Motifs 34 and 45 in results.pickle list - """ - motifs = [] - exists = os.path.exists - toks = txt.split(':') - if exists(toks[0]): #It's a file!! - fname = toks[0] - if fname.find('.pickle') > 0: #It's a pickle!! - return pickletxt2motifs(toks) - else: #It's a "Motif" file!! - if VERBOSE: - print "# Loading motif from %s"%fname - allmotifs = load(fname) - if len(toks) == 1: motifs = allmotifs - else: - idxs = [int(x) for x in toks[1].split(',')] - motifs = [allmotifs[x] for x in idxs] - else: #It's a text string!! - fname = 'TXT' - for t in txt.split(','): - motifs.append(Motif_from_text(t)) - for i in range(len(motifs)): motifs[i].index = i - for i in range(len(motifs)): motifs[i].file = fname - return motifs - -def pickletxt2motifs(toks): - """[Utility function] See txt2motifs documentation.""" - fname = toks[0] - print "# Loading motif pickle from %s"%fname - F = open(fname,'r') - DA = pickle.load(F) - F.close() - ans = [] - if type(DA) == type({}): - if len(toks) > 1: - keys = [x.replace('%',' ') for x in toks[1].split(',')] - for k in keys: ans.append(DA[k]) - else: - for k in DA.keys(): DA[k].key = k - ans = DA.values() - else: #Assuming DA is a list - if len(toks) > 1: - idxs = [int(x) for x in toks[1].split(',')] - ans = [DA[x] for x in idxs] - else: - ans = DA - return ans - - -def sortby(motiflist, property, REV=0): - """Sort a motif list according to a particular property""" - mtype = type(Motif()) - for m in motiflist: - if type(m) != mtype: - print "Not a Motif Object: ",m - return - try: - motiflist.sort(lambda x,y,p=property: cmp(x.__dict__[p],y.__dict__[p])) - if REV: motiflist.reverse() - except: - print 
'Could not sort list. Probably, the specificied property "%s" is not posessed by all motifs'%property - -
--- a/chipsequtil-master/src/chipsequtil/nib.py Mon Mar 28 11:56:10 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,393 +0,0 @@ -'''Functions and classes used to interface with .nib files as created by Jim -Kent's nibFrag and faToNib utilities.''' - -import glob -import math -import os -import struct -import sys -import warnings -from cStringIO import StringIO -from collections import defaultdict as dd - -from chipsequtil import reverse_complement, get_file_parts, BEDFile - - -# module fields -NOMASK,MASK,HARDMASK = range(3) - - -class NibException(Exception) : pass - - -def _nib_fd(nib) : - '''Returns filename and file descriptor for nib, detecting whether it is a \ - path or fd appropriately''' - - # check to see if nib is a file or a string - if isinstance(nib,file) : - nib_fn = nib.name - nib.seek(0) - nib_f = nib - elif isinstance(nib,str) : - nib_fn = nib - nib_f = open(nib,'rb') - else : - raise NibException('Incompatible .nib argument %s with type %s, needs to \ - be either <type \'file\'> or <type \'str\'>'%(str(nib),type(nib))) - - return nib_fn, nib_f - - -def get_nib(nib,start=0,end=-1,strand='+',mask=NOMASK,name=None,dbHeader=None,tbaHeader=None) : - '''Return a (header,sequence) tuple representing this nibFrag record''' - headers = get_nib_header_batch(nib,[(start,end,strand,name,dbHeader,tbaHeader),]) - seqs = get_nib_seq_batch(nib,[(start,end,strand)],mask) - return headers[0], seqs[0] - - -def get_nib_batch(nib,queries,mask=NOMASK) : - '''Batch interface for fetching fasta records. Returns tuple of lists - (headers,sequences)''' - headers = get_nib_header_batch(nib,queries) - seqs = get_nib_seq_batch(nib,[x[:3] for x in queries],mask=mask) - return headers, seqs - - -def get_nib_seq(nib,start=0,end=-1,strand='+',mask=NOMASK) : - '''Extract subsequence from .nib file like Jim Kent's nibFrag utility. - Default behavior is to return the entire sequence. 
- - Extract the nucleotide substring defined by the closed interval [start,end] - from the sequence found in *nib_fn*. *mask* parameter has the following - possible values: - - chipsequtil.nib.NOMASK -- masked positions are not indicated (default) - chipsequtil.nib.MASK -- masked positions are capitalized, normal bases lower case - chipsequtil.nib.NOMASK -- masked positions are replaced with Ns - ''' - return get_nib_seq_batch(nib,[(start,end,strand)],mask)[0] - - -def get_nib_header(nib_fn,start=0,end=-1,strand='+',name=None,dbHeader=None,tbaHeader=None) : - '''Method for constructing fasta headers compliant with nibFrag utility''' - headers = get_nib_header_batch(nib,[(start,end,strand,name,dbHeader,tbaHeader),]) - return headers[0] - - -def get_nib_header_batch(nib,queries) : - '''Batch method for creating nibFrag headers. *queries* is a list of at most - 6-tuples (start,end,strand,name,dbHeader,tbaHeader) representing queries as - specified by the original nibFrag utility. Only start, end, and strand - fields are required.''' - - nib_path, nib_f = _nib_fd(nib) - - nib_dir,nib_fn,nib_base,nib_ext = get_file_parts(nib_path) - nbases = validate_nib_file(nib) - headers = [] - header_tmpl = '>%(name)s%(db)s\n' - - for rec in queries : - - # set some defaults if they are not supplied - rec = list(rec) - rec.extend([None]*(6-len(rec))) - start, end, strand, name, dbHeader, tbaHeader = rec - - if end == -1 : - end = nbases - fields = {} - fields['name'] = nib_path+':%d-%d'%(start,end) if not name else name - fields['db'] = '' - - if tbaHeader : - # ignored for some reason in nibFrag when tbaHeader supplied and dbHeader is not - fields['name'] = '' if not dbHeader else fields['name'] - fields['db'] = '%s.%s:%d-%d of %d'%(tbaHeader,nib_base,start,end,nbases) - if dbHeader : - fields['db'] = ':%s.%s:%d-%d:%s:%d'%(dbHeader,nib_base,start,end,strand,nbases) - - headers.append(header_tmpl%fields) - - return headers - - -def validate_nib_file(nib) : - '''Validate .nib file 
header, returning number of bases indicated if successful. - *nib* argument is either a filename or an open file object. - ''' - - nib_fn, nib_f = _nib_fd(nib) - - # first 4 bytes are a nib file signature - #TODO - consider attempting to figure out byte order to make truly cross platform - def_sig = 0x6BE93D3A - sig = struct.unpack('=l',nib_f.read(4))[0] - if def_sig != sig : - raise NibException('Invalid nib file signature in %s, found %s, expected \ - %s, perhaps .nib file as not created on this platform?\n\nnibFrag style \ - error: %s is not not a good .nib file.'%(nib_fn,hex(sig),hex(def_sig),nib_fn)) - - # second 4 bytes are number of bases in sequence - nbases = struct.unpack('=l',nib_f.read(4))[0] - - return nbases - - -def get_nib_seq_batch(nib,queries,mask=NOMASK) : - '''Extract subsequence from .nib file like Jim Kent's nibFrag utility. - - Extract the nucleotide substrings defined by the closed intervals in *queries* - from the sequence found in *nib*. *nib* argument is either a filename or - an open file object. Entries in *queries* are 3-tuples defining (start,end,strand) - sequence coordinates. Sequences are returned in order in a list as - strings. 
*mask* parameter has the following possible values: - - chipsequtil.nib.NOMASK -- masked positions are not indicated (default) - chipsequtil.nib.MASK -- masked positions are capitalized, normal bases lower case - chipsequtil.nib.NOMASK -- masked positions are replaced with Ns - ''' - - nib_fn, nib_f = _nib_fd(nib) - - nbases = validate_nib_file(nib_f) - - # rest of file is sequence, with each nibble (4 bytes) being a base as \ - # follows (from http://genome.ucsc.edu/FAQ/FAQformat.html#format8) : - # - # 0 - T - # 1 - C - # 2 - A - # 3 - G - # 4 - N - # - # The most significant bit in a nibble is set if the base is masked - trans_nuc = 'tcagn' - - # start translating the nibbles into nucleotides - def trans_nib(nib) : - nuc = trans_nuc[nib&7] - mask_bit = nib & 8 - if mask in [MASK,HARDMASK] and mask_bit == 0 : - return nuc.upper() - if mask == HARDMASK and mask_bit == 1 : - return 'N' - return nuc - - headers = [] # stores headers - seqs = [] # stores sequences - - # sort the coords so we can walk most efficiently through the file - queries.sort() - - for start, end, strand in queries : - - if start < 0 : - raise NibException('Received negative start coordinate, this may '\ - 'indicate a region on mitochondrial DNA that '\ - 'spans reference sequence start and end. This '\ - 'utility cannot handle these cases, aborting. 
'\ - 'Requested interval: %s (%d,%d)'%(nib_fn,start,end)) - - start, end = map(int,(start,end)) - - # end == -1 means caller wants entire sequence - if end == -1 : - end = nbases - - if any([nbases < c for c in [start,end]]) : - raise NibException(('Requested slice (%(start)d,%(end)d) not compatible ' \ - 'with sequence of length %(nbases)d in %(nib_fn)s, aborting\n\nnibFrag '\ - 'style error: nib read past end of file (%(start)d %(end)d) in file: '\ - '%(nib_fn)s')%{'start':start,'end':end,'nbases':nbases,'nib_fn':nib_fn}) - - # figure out how many bytes to read through - start_byte,rem_byte = start/2,start%2 - - # calculate where we need to move to in the file from the current location - # + 8 is from the 2*4 bytes header info in the .nib format - byte_offset = start_byte-nib_f.tell() + 8 - nib_f.seek(byte_offset,1) # seek forward to the beginning byte from current location - seq_bytes,seq_rem_byte = int(math.ceil((end-start+rem_byte)/2.)),(end+1)%2 - seq_bytes = nib_f.read(seq_bytes+seq_rem_byte) - - # start translating the bytes - seq = StringIO() # we use StringIO because it is more efficient than concatenating strings - for c in seq_bytes : - c_byte = struct.unpack('=b',c)[0] - - # higher nibble - c_nib = (c_byte & (15<<4))>>4 - nuc = trans_nib(c_nib) - seq.write(nuc) - - # lower nibble - c_nib = int(c_byte) & 15 - nuc = trans_nib(c_nib) - seq.write(nuc) - - # final nucleotide sequence - seq_str = seq.getvalue() - - # if we're reading to the end, don't clip anything - if end != nbases : - # if the coordinate requested was not on a byte boundary, adjust - if rem_byte == 1 : - seq_str = seq_str[1:] - if seq_rem_byte == 1 : - seq_str = seq_str[:-1] - - # nibFrag apparently uses zero-based indexing, clip off one base - seq_str = seq_str[:-1] - seq.close() - - # adjust strand - if strand == '-' : - seq_str = reverse_complement(seq_str) - seqs.append(seq_str) - - return seqs - - -class SeqDBException(Exception): pass -class NibDBException(Exception): pass - - 
-class SeqDB(object) : - '''Base class for different kinds of sequence databases. Does nothing, - implement subclasses. Constructor rovides _db_map and db_info class members.''' - def __init__(self) : - self._db_map = {} - self.db_info = dd(dict) - - def get_seq(self,*args, **kwargs) : - raise SeqDBException('Base class SeqDB has no get_seq implementation') - - -class NibDB(SeqDB) : - '''Class providing an interface to a set of .nib files as created by faToNib - in Jim Kent's software suite. - - Sequences are identified by the basename of the .nib file without the .nib - extension, e.g. chr1.nib is identified as chr1. - - Some potentially useful information about the entries in the database is - stored in the *nib_info* dictionary. - ''' - - def __init__(self,nib_fns=[],nib_dirs=[]) : - '''*nib_fns* is a list of paths to specific .nib files desired for the - NibDB. *nib_dirs* is a list of paths to directories containing .nib - files such that every .nib file in the directories is added to the NibDB. - Explicitly passed files take precedence over those found in directories - when sequence names collide. - ''' - SeqDB.__init__(self) - - # find all *.nib files in the directories passed - if isinstance(nib_dirs,str) : # user just provided single directory - nib_dirs = [nib_dirs] - - dir_nibs = [] - for d in nib_dirs : - dir_nibs.extend(glob.glob(os.path.join(d,'*.nib'))) - - if isinstance(nib_fns,str) : - nib_fns = [nib_fns] - # for each .nib found, add to db - # if there is a collision of names, those specified in files (not dirs) - # takes precedence without warning - for fn in dir_nibs+nib_fns : - - # open the nib file - nib_path,nib_fn,nib_base,nib_ext = get_file_parts(fn) - fn, nib_f = _nib_fd(fn) - self._db_map[nib_base] = nib_f - - # store some info - self.db_info[nib_base]['path'] = fn - nbases = validate_nib_file(self._db_map[nib_base]) - self.db_info[nib_base]['nbases'] = nbases - - def __del__(self) : - '''import this - ...Explicit is better than implicit... 
- ''' - for nib_f in self._db_map.values() : - nib_f.close() - - def _get_db_map(self,name) : - '''Gets appropriate file handle for the requested name, raises NibDBException - if it cannot be found''' - try : - return self._db_map[name] - except KeyError : - raise NibDBException('Sequence name %s not found in NibDB'%name) - - def get_fasta(self,name,start=0,end=-1,strand='+',mask=NOMASK) : - '''Get the fasta record for the specified arguments, returns (header,sequence) - tuple.''' - - nib_f = self._get_db_map(name) - return get_nib(nib_f,start,end,strand,mask) - - def get_fasta_batch(self,recs,mask=NOMASK) : - '''Batch version of *get_fasta* method. *recs* is a list of lists/tuples - with (<chromo>,<start>,<end>,<strand>). Returns list of (header,sequence) - tuples in the same sequence as the input records.''' - - # gather the records for each chromosome together - chrom_recs = dd(list) - for i,r in enumerate(recs) : - chrom_recs[r[0]].append((i,r)) # recs are (index,<tuple>) - - # extract sequences - all_chrom_recs = [] - for chrom, rec_list in chrom_recs.items() : - # sorted lists make sequence extraction efficient - rec_list.sort(key=lambda x: x[1][1]) # recs are (index,<tuple>) - - # separate indexes from records, extract for this chromo - indexes, c_recs = zip(*rec_list) - - # get_nib_batch requires list of (<start>,<end>,<strand>) tuples, remove - # chromo in first position - c_recs = [r[1:] for r in c_recs] - - nib_f = self._get_db_map(chrom) - headers, seqs = get_nib_batch(nib_f,c_recs,mask) - - # return the sequences to a (index,(header,sequence)) list - all_chrom_recs.extend(zip(indexes,zip(headers,seqs))) - - # put the sequences back in the original order - all_chrom_recs.sort(key=lambda x: x[0]) # recs are (index,<tuple>) again - indexes, recs = zip(*all_chrom_recs) - - return zip(*recs) - - def get_fasta_from_bed(self,bed,mask=NOMASK) : - '''Accepts either a chipsequtil.BEDFile instance or a filename for a BED - file (used to construct a BEDFile 
instance) and returns the fasta - records for all records in order.''' - - # determine if *bed* is a filename or a BEDFile - if isinstance(bed,str) : # filename - bed = BEDFile(bed) - - # construct the records - recs = [] - for rec in bed : - if rec['chrom'].lower().startswith('track') : # track line, skip - continue - recs.append((rec['chrom'],int(rec['chromStart']),int(rec['chromEnd']),rec['strand'])) - - return self.get_fasta_batch(recs,mask) - - def get_seq(self,name,start=0,end=-1,strand='+',mask=NOMASK) : - '''Extract sequence from sequence *name*. Other arguments are passed - directly to *get_nib_seq* function.''' - - nib_f = self._get_db_map(name) - return get_nib_seq(nib_f,start,end,strand,mask)
--- a/chipsequtil-master/src/chipsequtil/plotting.py Mon Mar 28 11:56:10 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,24 +0,0 @@ -import math - -from matplotlib.pyplot import hist, plot, savefig, title, show, xticks, yticks, figure, clf - -from chipsequtil import get_gc_content - -def plot_gc_content(sequences,bins=10,fn=None) : - - # calculate all the GC contents, sort them - gc_contents = map(get_gc_content,sequences) - gc_contents.sort() - - f = figure() - points = hist(gc_contents,bins=bins) - if fn : - savefig(fn) - else : - show() - clf() - - -def plot_pos_neg_peaks(pos_peaks,neg_peaks) : - '''Plot # pos peaks/# neg peaks by p-value''' - pass
--- a/chipsequtil-master/src/chipsequtil/sampling.py Mon Mar 28 11:56:10 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,252 +0,0 @@ - -import math -import random -import re -import sys -from collections import defaultdict - -from chipsequtil import get_org_settings, get_gc_content, get_gc_content_distribution, RefGeneFile -from nib import NibDB, NibException - -def kl_divergence(p,q) : - """Return Kullback-Leibler divergence for two probability distributions - p and q. p and q should be indexable objects of the same length where - p_i corresponds to q_i. - """ - kl_sum = 0. - for p_i, q_i in zip(p,q) : - if p_i != 0 and q_i != 0 : - kl_sum += p_i * math.log(p_i/q_i) - return kl_sum - -def rejection_sample_bg(fg_dict,organism,bins=100,num_samples=None,verbose=False, - bg_match_epsilon=1e-3) : - '''Generate background sequences according to the size, distance from genes, - and GC content distributions of the supplied foreground sequences. *fg_dict* - is a dictionary of <header>:<sequence> items, where the first part of the - header must contain: - - >chrX:<start>-<end> - - *organism* is a string that will be used to call the *chipsequtil.get_org - settings* function and uses the 'genome_dir' and 'annotation_path' keys. - *bins* is the number of bins to use for representing the GC content - distribution. 
Function returns a dictionary of <header>:<sequence> items - of generated background sequences.''' - - nib_db = NibDB(nib_dirs=[get_org_settings(organism)['genome_dir']]) - tss_fn = get_org_settings(organism)['annotation_path'] - tss = defaultdict(list) - for rec in RefGeneFile(tss_fn) : - tss[rec['chrom']].append((int(rec['txStart']),int(rec['txEnd']),)) - - # for each peak find the chromosome, distance to nearest - # gene, size of peaks in bases, and GC content - num_samples = len(fg_dict) if not num_samples else num_samples - dists,sizes=[],[] - - for header,seq in fg_dict.items() : - - # chromosome first field in fasta headers from bed2seq.bedtoseq - chrom = header.split(':')[0] - - # adjust chromosomes in special cases - if re.search('random',chrom.lower()) or chrom.lower() == 'chrm' : - continue - - # start first int in second field of bed2seq.bedtoseq header - start = int(header.split(':')[1].split('-')[0]) - midpoint = start + len(seq)/2 - - # figure out which chromosome we're working on - tss_chr = tss[chrom] - - # dsts_to_genes is the distance of this peak from all the genes, find minimum - dists_to_genes = [(s[0]-midpoint) for s in tss_chr] - try : - min_dist = min(dists_to_genes,key=lambda x : abs(x)) - dists.append(min_dist) - except : - err_str = 'Warning: no genes were found for sequence with header' \ - ' %s, not using to calculate distributions.\n'%header - sys.stderr.write(err_str) - - # calculate # bases - sizes.append(len(seq)) - - # GC content distribution for the foreground sequences - gc_dist = get_gc_content_distribution(fg_dict.values(),bins=bins) - - # max_gc is # peaks w/ highest GC content - max_gc = max(gc_dist) - - # gene_starts is a list of all genes in (chromosome,gene start) tuples - gene_starts=[] - for key in tss.keys(): - chrom=key.split('chr')[-1] - for x in tss[key]: - gene_starts.append((key,x[0])) - - # encapsulated function for proposing sequences - def propose_sequence(dists, gene_starts, sizes, nib_db) : - # sample a 
random distance from the list of distances - d = random.choice(dists) - - # pick a random gene - chrom, coord = random.choice(gene_starts) - - # propose a starting point for the bg sequence - midpoint = coord-d+random.randint(-100,100) - - # propose a size for the bg sequence - size = random.choice(sizes) - start = int(midpoint-int(size/2)) - stop = int(midpoint+int(size/2)) - - #sys.stderr.write("%s:coord=%d size=%d midpoint=%d d=%d\n"%(chrom,coord,size,midpoint,d)) - # if start or stop are negative, skip and try again - if start < 0 or stop < 0 : seq = None - - # randomly choose strand - strand = '+' if random.random() > 0.5 else '-' - - # extract the proposed sequence - try : - nib_title, seq = nib_db.get_fasta(chrom,start,stop,strand) - except IOError, e : - if verbose : sys.stderr.write('IOError in NibDB, skipping: %s,%d-%d,%s\n'%(chrom,start,stop,strand)) - seq = None - except NibException, e : - if verbose : sys.stderr.write('NibDB.get_fasta error, %s\n'%e) - seq = None - - header = '%s:%d-%d'%(chrom,start,stop) - - return header, seq - - - # build gc content distribution based on seq length and - # distance from TSS foreground distributions - # keep sampling sequences until the distribution stops - # changing a lot (KL divergence < epsilon) - bg_gc_cnts = [1.]*bins - converged = False - epsilon = bg_match_epsilon - if verbose : sys.stderr.write('Building empirical background GC content distribution\n') - while not converged : - - # propose a sequence - header, seq = propose_sequence(dists,gene_starts,sizes,nib_db) - - # sometimes this happens when there is an error, just try again - if seq is None : - continue - - # determine the GC bin for this sequence - gc_content = get_gc_content(seq) - gc_bin = -1 - for i in range(bins) : - win_start = i/float(bins) - win_end = (i+1)/float(bins) - if gc_content >= win_start and gc_content < win_end : - gc_bin = i - break - - # update the gc content distribution - sum_cnts = float(sum(bg_gc_cnts)) - if sum_cnts != 0 : # 
! on first sequence - - # calculate the current distributions - last_gc_p = map(lambda x:x/sum_cnts,bg_gc_cnts) - bg_gc_cnts[gc_bin] += 1 - new_gc_p = map(lambda x:x/sum_cnts,bg_gc_cnts) - - # calculate the kl divergence between last distribution - # and current one, stopping if less than epsilon - kl_d = kl_divergence(new_gc_p,last_gc_p) - if verbose : sys.stderr.write('dist to converge: %.3g\r'%(kl_d-epsilon)) - if kl_d < epsilon : - converged = True - - else : - bg_gc_cnts[gc_bin] += 1 - - if verbose : sys.stderr.write('\ndone\n') - - # add pseudocounts to account for missing data in bg as to avoid - # inappropriate scaling in rejection sampling step - # the fg bin with the largest value that corresponds to an empty - # bg bin is used to calculate the number of pseudocounts so that - # the resulting bg bin has the same propotion of counts in it as - # the original fg bin. This is calculated as: - # - # x_{pseudo} = \frac{p_i\sum_{i=1}^{N}a_i}{1-p_iN} - # - # where p_i is the value of the max fg bin w/ zero in the bg bin - # x_{pseudo} is added to every bin - pseudocounts = 0 - for fg_i, bg_i in zip(gc_dist,bg_gc_cnts) : - if fg_i != 0 and bg_i == 0 and fg_i*len(fg_dict) > pseudocounts : - # if fg_i > 1/sum(bg_gc_cnts) this won't work, but that *shouldn't* - # ever happen - if fg_i >= 1./sum(bg_gc_cnts) : - raise Exception('There was a numeric issue in the rejection sampling routine, please try it again') - sys.stderr.write(str([fg_i,sum(bg_gc_cnts),len(bg_gc_cnts),1.*fg_i*len(bg_gc_cnts),bg_gc_cnts])+'\n') - sys.stderr.flush() - pseudocounts = (fg_i*sum(bg_gc_cnts))/(1-1.*fg_i*len(bg_gc_cnts)) - - bg_gc_cnts = map(lambda x: x+pseudocounts/sum(bg_gc_cnts),bg_gc_cnts) - bg_gc_dist = map(lambda x: x/sum(bg_gc_cnts),bg_gc_cnts) - - # last, find the multiplier that causes the background gc distribution to - # envelope the foreground gc dist - z_coeff = gc_dist[0]/bg_gc_dist[0] - for fg_i, bg_i in zip(gc_dist[1:],bg_gc_dist[1:]) : - z_coeff = max(z_coeff,fg_i/bg_i) - 
bg_gc_dist = map(lambda x: x*z_coeff,bg_gc_dist) - - # start generating bg sequences - bg_dict = {} - - bg_gcs,bg_sizes=[],[] - - # generate a bg sequence for every fg sequence - for i in range(num_samples): - if verbose : sys.stderr.write('%d/%d'%(i,num_samples)) - - # propose sequences until one is accepted - accepted_sequence = False - while not accepted_sequence: - if verbose : sys.stderr.write('.') - - # propose a sequence - header, seq = propose_sequence(dists,gene_starts,sizes,nib_db) - - # problem occured in proposing sequence, just keep going - if seq is None : continue - - # determine the GC bin for this sequence - gc_content = get_gc_content(seq) - gc_bin = -1 - for i in range(bins) : - win_start = i/float(bins) - win_end = (i+1)/float(bins) - if gc_content >= win_start and gc_content < win_end : - gc_bin = i - continue - - # pick a uniform random number such that it does not exceed - # the maximum GC content distribution over bins - # if the random number is <= the GC content for this - # proposed sequence, accept, otherwise reject - r = random.random() * bg_gc_dist[gc_bin] - if r > gc_dist[gc_bin] : - continue - else: - bg_gcs.append(x) - #bg_sizes.append(size) - accepted_sequence = True - bg_dict[header] = seq - - if verbose : sys.stderr.write('\r') - return bg_dict
--- a/chipsequtil-master/src/chipsequtil/seq.py Mon Mar 28 11:56:10 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,265 +0,0 @@ -from itertools import izip -from textwrap import wrap - -# FASTA functions and classes -def fasta_itr(f) : - '''Returns a generator that iterates through a FASTA formatted file. - *f* may be either a text or gzipped file, or a file-like python object - representing either of these. Records are returned in the order they - are found.''' - if isinstance(f,str) : - f = open(f) - - # check for magic number 1f 8b indicating gzip file, I dunno, just cuz - if f.read(2) == "\x1f\x8b" : f = gzip.GzipFile(f) - else : f.seek(0) - - curr_header, curr_seq = None, None - for r in f : - if r.startswith('>') : - if curr_header is not None : - yield (curr_header, curr_seq) - curr_header = r[1:].strip() - curr_seq = '' - else : - curr_seq += r.strip() - # return the last record - yield (curr_header,curr_seq) - -def fasta_to_dict(f) : - '''Returns a dictionary whose keys are FASTA headers and values are - sequences. *f* may be a text, gzipped file, or a file-like - python object representing either of these.''' - return dict(fasta_itr(f)) - -def write_fasta_to_file(fasta,f,linelen=None) : - '''Writes the FASTA records in *fasta* to file specified in *f*. *fasta* - may be a dictionary like that returned by *fasta_to_dict* or a *FASTAFile* - instance. *f* may be a filename or a file-like object opened with write - mode.''' - if isinstance(fasta,dict) : - fasta_itr = fasta.iteritems() - else : - fasta_itr = fasta - - if isinstance(f,str) : - f = open(str,'w') - - for header, seq in fasta_itr : - if linelen is not None : - seq = fill(seq,linelen) - f.write('>%s\n%s\n'%(header,seq)) - f.close() - - -class FASTAFile(object) : - '''A file-like object providing information and statistics about the - sequences in a FASTA formatted file. 
Efficiently iterates through a - text or gzipped FASTA file and provides sequential or random access to - the records. Instances store header and sequence data as they are read. - - >>> fasta_str = StringIO(">seq1\\nACATAGGGAT\\n>seq2\\nTTATNTAGATA\\n") - >>> fasta_f = FASTAFile(fasta_str) - >>> [r for r in fasta_f] - [('seq1', 'ACATAGGGAT'), ('seq2', 'TTATNTAGATA')] - >>> fasta_f['seq1'] - ACATAGGGAT - >>> fasta_f.headers - ['seq1', 'seq2'] - >>> fasta_f.sequences - ['ACATAGGGAT', 'TTATNTAGATA'] - - Instances have the following members: - - **headers** - list of FASTA headers in original order - - **sequences** - list of FASTA sequences in original order - - .. NOTE:: - The members **headers** and **sequences** are not available until the - the FASTA records have been iterated once. - - When indexing like `fasta_f['seq1']`, the class assumes all headers are - unique, iterating does not make this assumption. - ''' - - def __init__(self,f) : - self._f = f - self._fasta_itr = fasta_itr(f) - self.headers = [] - self.sequences = [] - self._dict = {} - - def __getitem__(self,key) : - return self._dict[key] - - def __setitem__(self,key,val) : - self._dict[key] = val - - def next(self) : - '''Returns next FASTA record in the file as (header, sequence) tuple.''' - - if self._fasta_itr is None : - self._fasta_itr = izip(self.headers,self.sequences) - - try : - header, seq = self._fasta_itr.next() - except StopIteration, e : - self._fasta_itr = None - self._f = None - raise e - - if self._f is not None : - # this means we're not done reading through the file yet - self.headers.append(header) - self.sequences.append(seq) - self._dict[header] = seq - - return header, seq - - def __iter__(self) : - return self - -# FASTQ functions and classes -def fastq_itr(f) : - '''Returns a generator that iterates through a FASTQ formatted file. - *f* may be either a text or gzipped file, or a file-like python object - representing either of these. 
Records are returned in the order they - are found.''' - if isinstance(f,str) : - f = open(f) - - # check for magic number 1f 8b indicating gzip file, I dunno, just cuz - if f.read(2) == "\x1f\x8b" : f = gzip.GzipFile(f) - else : f.seek(0) - - SEQ, QUAL = 0,1 - in_region = SEQ - curr_header, curr_seq, curr_qual = None, None, None - for r in f : - if r.startswith('@') : - if curr_header is not None : - yield (curr_header, (curr_seq, curr_qual)) - curr_header = r[1:].strip() - curr_seq = '' - curr_qual = '' - in_region = SEQ - elif r.startswith('+') : - in_region = QUAL - else : - curr_field = r.strip() - if in_region == SEQ : - curr_seq += curr_field - elif in_region == QUAL : - curr_qual += curr_field - - # return the last record - yield (curr_header,(curr_seq,curr_qual)) - -def fastq_to_dict(f) : - '''Returns a dictionary whose keys are FASTQ headers and values are - sequences. *f* may be a text, gzipped file, or a file-like - python object representing either of these.''' - return dict(fastq_itr(f)) - -def write_fastq_to_file(fastq,f,linelen=None) : - '''Writes the FASTQ records in *fasta* to file specified in *f*. *fastq* - may be a dictionary like that returned by *fastq_to_dict* or a *FASTQFile* - instance. *f* may be a filename or a file-like object opened with write - mode.''' - if isinstance(fastq,dict) : - fastq_itr = fasta.iteritems() - else : - fastq_itr = fasta - - f_out = open(str,'w') if isinstance(f,str) else f - - for header, (seq, qual) in fastq_itr : - if linelen is not None : - seq = fill(seq,linelen) - f_out.write('>%s\n%s\n'%(header,seq)) - - if isinstance(f,str) : - f_out.close() - - -class FASTQFile(object) : - '''A file-like object providing information and statistics about the - sequences in a FASTQ formatted file. Efficiently iterates through a - text or gzipped FASTQ file and provides sequential or random access to - the records. 
Instances store header and sequence data as they are read - - >>> fastq_str = StringIO("@seq1\\nACATAGGGAT\\n+seq2\\nY^_cccQYJQ\\n - @seq2\\nTTATNTAGAT\\n+seq2\\nY^_cJcQQJQ") - >>> fastq_f = FASTQFile(fastq_str) - >>> [r for r in fastq_f] - [('seq1', ('ACATAGGGAT', 'Y^_cccQYJQ')), ('seq2', ('TTATNTAGATA', 'Y^_cJcQQJQ'))] - >>> fastq_f['seq1'] - ('seq1', ('ACATAGGGAT', 'Y^_cccQYJQ')) - >>> fastq_f.headers - ['seq1', 'seq2'] - >>> fastq_f.sequences - ['ACATAGGGAT', 'TTATNTAGAT'] - >>> fastq_f.quals - ['Y^_cccQYJQ', 'Y^_cJcQQJQ'] - - Instances have the following members: - - **headers** - list of FASTQ headers in original order - - **sequences** - list of FASTQ sequences in original order - - **quals** - list of FASTQ quality scores in original order - - .. NOTE:: - The members **headers**, **sequences**, and **quals** are not available - until the the FASTQ records have been iterated once - - When indexing like `fastq_f['seq1']`, the class assumes all headers are - unique, iterating does not make this assumption. - ''' - - def __init__(self,f) : - self._f = f - self._fastq_itr = fastq_itr(f) - self.headers = [] - self.sequences = [] - self.quals = [] - self._dict = {} - - def __getitem__(self,key) : - return self._dict[key] - - def __setitem__(self,key,val) : - self._dict[key] = val - - def next(self) : - '''Returns next FASTA record in the file as (header, sequence) tuple.''' - - if self._fastq_itr is None : - self._fastq_itr = izip(self.headers,self.sequences) - - try : - header, (seq, qual) = self._fastq_itr.next() - except StopIteration, e : - self._fastq_itr = None - self._f = None - raise e - - if self._f is not None : - # this means we're not done reading through the file yet - self.headers.append(header) - self.sequences.append(seq) - self.quals.append(qual) - self._dict[header] = (seq, qual) - - return header, (seq, qual) - - def __iter__(self) : - return self -
--- a/chipsequtil-master/src/chipsequtil/util.py Mon Mar 28 11:56:10 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,131 +0,0 @@ -"""Utility/helper classes and functions used by the chipsequtil package. -""" - -import textwrap - -from optparse import IndentedHelpFormatter - -class MultiLineHelpFormatter(IndentedHelpFormatter) : - """An OptionParser formatter that preserves newline characters in - description and epilog fields and word-wraps all sequences of text - not interrupted by newline characters. - """ - - def _format_text(self, text) : - """Wrap paragraphs of text individually separated by - newlines (preserves explicit newline characters). - """ - text_width = self.width - self.current_indent - indent = " "*self.current_indent - output_text = [] - paragraphs = text.split('\n') - for p in paragraphs : - output_text.append(textwrap.fill(p, - text_width, - initial_indent=indent, - subsequent_indent=indent)) - return '\n'.join(output_text) - - - - -# A binary ordered tree example -# shamelessly copied from: http://code.activestate.com/recipes/286239-binary-ordered-tree/ -class CNode: - left , right, data = None, None, 0 - - def __init__(self, data): - # initializes the data members - self.left = None - self.right = None - self.data = data - - -class KeyedBinaryTree : # do this later... 
- pass - - -class CBOrdTree: - def __init__(self): - # initializes the root member - self.root = None - - def addNode(self, data): - # creates a new node and returns it - return CNode(data) - - def insert(self, root, data): - # inserts a new data - if root == None: - # it there isn't any data - # adds it and returns - return self.addNode(data) - else: - # enters into the tree - if data <= root.data: - # if the data is less than the stored one - # goes into the left-sub-tree - root.left = self.insert(root.left, data) - else: - # processes the right-sub-tree - root.right = self.insert(root.right, data) - return root - - def lookup(self, root, target): - # looks for a value into the tree - if root == None: - return 0 - else: - # if it has found it... - if target == root.data: - return 1 - else: - if target < root.data: - # left side - return self.lookup(root.left, target) - else: - # right side - return self.lookup(root.right, target) - - def minValue(self, root): - # goes down into the left - # arm and returns the last value - while(root.left != None): - root = root.left - return root.data - - def maxDepth(self, root): - if root == None: - return 0 - else: - # computes the two depths - ldepth = self.maxDepth(root.left) - rdepth = self.maxDepth(root.right) - # returns the appropriate depth - return max(ldepth, rdepth) + 1 - - def size(self, root): - if root == None: - return 0 - else: - return self.size(root.left) + 1 + self.size(root.right) - - def printTree(self, root): - # prints the tree path - if root == None: - pass - else: - self.printTree(root.left) - print root.data, - self.printTree(root.right) - - def printRevTree(self, root): - # prints the tree path in reverse - # order - if root == None: - pass - else: - self.printRevTree(root.right) - print root.data, - self.printRevTree(root.left) -
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chipsequtil/map_to_known_genes.py Mon Mar 28 12:31:17 2016 -0400 @@ -0,0 +1,236 @@ +#!/usr/local/bin/python + +import sys, os +from optparse import OptionParser +from collections import defaultdict as dd +from csv import DictReader, DictWriter + +from chipsequtil import MACSFile, BEDFile, KnownGeneFile, parse_number +from chipsequtil.util import MultiLineHelpFormatter + +usage = '%prog [options] <knownGene file> <knownGene xRef file> <peaks file>' +description = """ +Map the peaks in <peaks file> to genes in <knownGene file>. <knownGene file> is\ +format is as specified in http://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/knownGene.sql.\ +<peaks file> format is as produced by MACS. If *auto* is chosen (default) file extension \ +is examined for *.xls* for default MACS format or *.bed* for BED format. If the --detail \ +option is provided, the following extra fields are appended to each row: + +peak loc, dist from feature, map type, map subtype +""" +epilog = '' +parser = OptionParser(usage=usage,description=description,epilog=epilog,formatter=MultiLineHelpFormatter()) +parser.add_option('--upstream-window',dest='upst_win',type='int',default=5500,help='window width in base pairs to consider promoter region [default: %default]') +parser.add_option('--downstream-window',dest='dnst_win',type='int',default=2500,help='window width in base pairs to consider downstream region [default: %default]') +parser.add_option('--tss',dest='tss',action='store_true',help='calculate downstream window from transcription start site instead of transcription end site') +parser.add_option('--map-output',dest='peak_output',default=None,help='filename to output mapped peaks to [default: stdout]') +parser.add_option('--stats-output',dest='stats_output',default=sys.stderr,help='filename to output summary stats in conversion [default: stderr]') 
+parser.add_option('--peaks-format',dest='peaks_fmt',default='auto',type='choice',choices=['auto','MACS','BED'],help='format of peaks input file [default: %default]') +parser.add_option('--detail',dest='detail',action='store_true',help='add extra fields to output, see description') +parser.add_option('--intergenic',dest='intergenic',action='store_true',help='write intergenic peaks to the gene file as well with None as gene ID') +#parser.add_option('--symbol-xref',dest='symbol_xref',default=None,help='use the kgXref table file supplied to find a gene symbol, output as second column') + +# TODO - options +#parser.add_option('--use-cds',dest='use_cds',action='store_true',help='use cdsStart and cdsEnd fields instead of txStart and txEnd to do mapping') +#parser.add_option('--capture-intergenic'...) +#parser.add_option('--map-format',dest='peak_format',type='choice',choices=['default','BED'],help='format of peak output [default: %default]') +#parser.add_option('--stats-format',dest='stats_format',type='choice',choices=['human','python'],help='format of summary stats output [default: %default]') + +def parse_gene_ref(ref_gene) : + reader = KnownGeneFile(ref_gene) + gene_ref = dd(list) + for ref_dict in reader : + gene_ref[ref_dict['chrom']].append(ref_dict) + + return gene_ref + +def parse_gene_ref_line(l) : + l = map(parse_number, l) # coerce to numbers where possible + l[9] = map(parse_number, l[9].split(',')) # turn 'x,x,x,...' 
into list + l[10] = map(parse_number, l[10].split(',')) + return l + +if __name__ == '__main__' : + + opts, args = parser.parse_args(sys.argv[1:]) + + if len(args) < 3 : + parser.error('Must provide three filename arguments') + + gene_ref = parse_gene_ref(args[0]) + xref_fn = args[1] + peaks_fn = args[2] + if opts.peaks_fmt == 'auto' : + path,ext = os.path.splitext(peaks_fn) + if ext.lower() == '.xls' : + opts.peaks_fmt = 'MACS' + elif ext.lower() == '.bed' : + opts.peaks_fmt = 'BED' + elif ext.lower() == '.narrowpeak' : + opts.peaks_fmt = 'BED' + else : + parser.error('Could not guess peaks file format by extension (%s), aborting'%ext) + + if opts.peaks_fmt == 'MACS' : + peaks_reader_cls = MACSFile + chr_field, start_field, end_field = 'chr', 'start', 'end' + elif opts.peaks_fmt == 'BED' : + peaks_reader_cls = BEDFile + chr_field, start_field, end_field = 'chrom', 'chromStart', 'chromEnd' + else : + # should never happen + fieldnames = [] + + #peaks_reader = DictReader(open(args[1]),fieldnames=fieldnames,delimiter='\t') + peaks_reader = peaks_reader_cls(peaks_fn) + + # default output format: + if opts.peak_output : + peak_output = open(opts.peak_output,'w') + else : + peak_output = sys.stdout + + fieldnames = peaks_reader.FIELD_NAMES + if opts.detail : + fieldnames += ["peak loc","dist from feature","map type","map subtype"]#"score" + output_fields = ['knownGeneID']+fieldnames + + # see if the user wants gene symbols too + # TODO - actually make this an option, or make it required + opts.symbol_xref = xref_fn + if opts.symbol_xref : + kgXref_fieldnames = ['kgID','mRNA','spID','spDisplayID','geneSymbol','refseq','protAcc','description'] + symbol_xref_reader = DictReader(open(opts.symbol_xref),fieldnames=kgXref_fieldnames,delimiter='\t') + symbol_xref_map = {} + for rec in symbol_xref_reader : + symbol_xref_map[rec['kgID']] = rec + output_fields = ['knownGeneID','geneSymbol']+fieldnames + + peaks_writer = 
DictWriter(peak_output,output_fields,delimiter='\t',extrasaction='ignore',lineterminator='\n') + peaks_writer.writerow(dict([(k,k) for k in output_fields])) + unique_genes = set() + map_stats = dd(int) + for peak in peaks_reader : + + # if this is a comment or header line get skip it + if peak[fieldnames[0]].startswith('#') or \ + peak[fieldnames[0]] == fieldnames[0] or \ + peak[fieldnames[0]].startswith('track') : continue + + # coerce values to numeric if possible + for k,v in peak.items() : peak[k] = parse_number(v) + + # MACS output gives us summit + if opts.peaks_fmt == 'MACS' : + peak_loc = peak[start_field]+peak['summit'] + else : # peak assumed to be in the middle of the reported peak range + peak_loc = (peak[start_field]+peak[end_field])/2 + + chrom_genes = gene_ref[peak[chr_field]] + + if len(chrom_genes) == 0 : + sys.stderr.write('WARNING: peak chromosome %s not found in gene reference, skipping: %s\n'%(peak[chr_field],peak)) + continue + + mapped = False + + # walk through the genes for this chromosome + for gene in chrom_genes : + + # reusable dictionary for output + out_d = {}.fromkeys(output_fields,0) + out_d.update(peak) + out_d['map type'] = '' + out_d['chromo'] = peak[chr_field] + out_d['peak loc'] = peak_loc + + # determine intervals for promoter, gene, and downstream + if gene['strand'] == '+' : + promoter_coords = max(gene['txStart']-1-opts.upst_win,0), gene['txStart']-1 + if opts.tss : + gene_coords = gene['txStart'], min(gene['txEnd'],gene['txStart']+opts.dnst_win) + downstream_coords = gene['txEnd']+1,gene['txStart']+opts.dnst_win + else : + gene_coords = gene['txStart'], gene['txEnd'] + downstream_coords = gene['txEnd']+1, gene['txEnd']+1+opts.dnst_win + else : + promoter_coords = gene['txEnd']+1, gene['txEnd']+1+opts.upst_win # +1 because we're using 1 based indexing + if opts.tss : + gene_coords = max(gene['txStart'],gene['txEnd']-opts.upst_win), gene['txEnd'] + downstream_coords = gene['txEnd']-1-opts.dnst_win, gene['txStart']-1 # -1 
because we're using 1 based indexing + else : + gene_coords = gene['txStart'], gene['txEnd'] + downstream_coords = gene['txStart']-1-opts.dnst_win, gene['txStart']-1 # -1 because we're using 1 based indexing + + # check for promoter + if peak_loc >= promoter_coords[0] and peak_loc <= promoter_coords[1] : + out_d['map type'] = 'promoter' + out_d['dist from feature'] = peak_loc - promoter_coords[1] if gene['strand'] == '+' else promoter_coords[0] - peak_loc + + # check for gene + elif peak_loc >= gene_coords[0] and peak_loc <= gene_coords[1] : + # check for intron/exon + exon_coords = zip(gene['exonStarts'],gene['exonEnds']) + in_exon = False + for st,en in exon_coords : + if peak_loc >= st and peak_loc <= en : + in_exon = True + break + out_d['map type'] = 'gene' + out_d['map subtype'] = 'exon' if in_exon else 'intron' + + #Commented out to keep score reported in bed file - AJD 7/29/14 + # score = (peak-TSS)/(TSE-TSS) - peak distance from TSS as fraction of length of gene + #gene_len = float(gene_coords[1]-gene_coords[0]) + #out_d['score'] = (peak_loc-gene_coords[0])/gene_len if gene['strand'] == '+' else (gene_coords[1]-peak_loc)/gene_len + + # distance calculated from start of gene + out_d['dist from feature'] = peak_loc - promoter_coords[1] if gene['strand'] == '+' else promoter_coords[0] - peak_loc + + map_stats[out_d['map subtype']] += 1 + + # check for downstream + elif peak_loc >= downstream_coords[0] and peak_loc <= downstream_coords[1] : + out_d['map type'] = 'after' + if opts.tss : + out_d['dist from feature'] = peak_loc - gene_coords[0] if gene['strand'] == '+' else gene_coords[1] - peak_loc + else : + out_d['dist from feature'] = peak_loc - downstream_coords[0] if gene['strand'] == '+' else downstream_coords[1] - peak_loc + + # does not map to this gene + else : + pass + + # map type is not blank if we mapped to something + if out_d['map type'] != '' : + + #out_d = {'knownGeneID':gene['name']} + out_d['knownGeneID'] = gene['name'] + if opts.symbol_xref : 
+ out_d['geneSymbol'] = symbol_xref_map[gene['name']]['geneSymbol'] + peaks_writer.writerow(out_d) + + mapped = True + + # reset map_type + out_d['map type'] = '' + + if not mapped : + if opts.intergenic : + out_d['knownGeneID'] = 'None' + out_d['geneSymbol'] = 'None' + out_d['map type'] = 'intergenic' + peaks_writer.writerow(out_d) + map_stats['intergenic'] += 1 + + if peak_output != sys.stdout : + peak_output.close() + + #if opts.stats_output != sys.stderr : + # opts.stats_output = open(opts.stats_output,'w') + + #for k,v in map_stats.items() : + # opts.stats_output.write('%s: %s\n'%(k,v)) + + #if opts.stats_output != sys.stderr : + # opts.stats_output.close()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chipsequtil/map_to_known_genes.xml Mon Mar 28 12:31:17 2016 -0400 @@ -0,0 +1,46 @@ +<tool id="chipsequtil_maptoknowngenes" name="Map Peaks to Known Genes" version="0.1"> + <description> + Map the peaks in <peaks file> to genes in <knownGene file>. <knownGene file> isformat is as specified in http://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/knownGene.sql.<peaks file> format is as produced by MACS. If *auto* is chosen (default) file extension is examined for *.xls* for default MACS format or *.bed* for BED format. If the --detail option is provided, the following extra fields are appended to each row: + peak loc, dist from feature, map type, map subtype + </description> + <parallelism method="basic"></parallelism> + <requirements> + <requirement type="package">chipsequtil</requirement> + </requirements> + <command interpreter="python"> + map_to_known_genes.py + $tss + --upstream-window=$upst_win + --downstream-window=$dnst_win + --map-output=$peaksOutput + --peaks-format=$peaks_fmt + $detail + $intergenic + $knownGeneFile $knownGeneRef $macsPeaksFile + + </command> + <inputs> + <param name="knownGeneFile" type="data" label="knownGene file" help="" optional="false" /> + <param name="knownGeneRef" type="data" label="knownGene xRef file" help="" optional="false" /> + <param name="macsPeaksFile" type="data" label="Peaks File" help="" optional="false" /> + <param name="peaksOutput" type="text" label="Output filename" help="filename to output mapped peaks to" optional="false" /> + + <param name="upst_win" type="integer" label="Upstream Window" help="Window width in base pairs to consider promoter region [default: %default]" optional="false" value="5500" /> + <param name="dnst_win" type="integer" label="Downstream Window" help="Window width in base pairs to consider downstream region [default: %default]" optional="false" value="2500" /> + + <param name="tss" checked="true" label="calculate downstream window from 
transcription start site instead of transcription end site" type="boolean" truevalue="--tss" falsevalue="" help="" /> + + <param name="peaks_fmt" type="select" label="Peaks Format" help="Format of peaks input file" optional="false"> + <option value="auto">auto</option> + <option value="MACS">MACS</option> + <option selected="true" value="BED">BED</option> + </param> + + <param name="detail" checked="false" label="Add extra fields to output" type="boolean" truevalue="--detail" falsevalue="" help="" /> + <param name="intergenic" checked="false" label="Write intergenic peaks to the gene file as well with None as gene ID" type="boolean" truevalue="--intergenic" falsevalue="" help="" /> + </inputs> + <outputs> + <data format="txt" hidden="false" name="default"/> + </outputs> + <help></help> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chipsequtil/tool_dependencies.xml Mon Mar 28 12:31:17 2016 -0400 @@ -0,0 +1,15 @@ +<?xml version="1.0"?> +<tool_dependency> + <package name="chipsequtil" version="1.0"> + <install version="1.0"> + <actions> + <action type="download_by_url">https://github.com/adamlabadorf/chipsequtil/archive/master.zip</action> + <action type="shell_command">unzip chipsequtil-master.zip -d chipsequtil</action> + <action type="shell_command">cd chipsequtil</action> + <action type="shell_command">cp org_settings.cfg src/chipsequtil/</action> + <action type="shell_command">python setup.py install</action> + </actions> + </install> + <readme></readme> + </package> +</tool_dependency>