changeset 3:6ad594db0143

Uploaded
author g2cmnty@test-web1.g2.bx.psu.edu
date Tue, 21 Jun 2011 16:25:34 -0400
parents 81a79b4a3f6f
children a188de29f06e
files bam_to_bigwig/README.txt bam_to_bigwig/bam_to_bigwig.xml bam_to_bigwig/bam_to_wiggle.py filtering.py filtering.xml
diffstat 5 files changed, 193 insertions(+), 166 deletions(-) [+]
line wrap: on
line diff
--- a/bam_to_bigwig/README.txt	Tue Jun 21 09:44:52 2011 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,18 +0,0 @@
-Convert a BAM file into a BigWig coverage file. This can be used directly from 
-Galaxy for display at UCSC. The advantage over standard Wiggle format is that 
-the data is stored in a compressed format and can be retrieved by genome
-region. This allows you to view regions of arbitrarily large Wiggle file data
-at UCSC while avoiding the upload costs.
-
-The latest version of the bam_to_wiggle.py script is available at:
-
-https://github.com/chapmanb/bcbb/blob/master/nextgen/scripts/bam_to_wiggle.py
-
-Place the script in the same directory as the XML configuration file, or
-provide a soft link to it.
-
-This requires:
-
-Python2, version 2.6 or better
-pysam (http://code.google.com/p/pysam/)
-wigToBigWig from UCSC (http://hgdownload.cse.ucsc.edu/admin/exe/)
--- a/bam_to_bigwig/bam_to_bigwig.xml	Tue Jun 21 09:44:52 2011 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,25 +0,0 @@
-<tool id="bam_to_bigwig" name="BAM to BigWig" version="0.0.2">
-  <description>Calculates coverage from a BAM alignment file</description>
-  <command interpreter="python">bam_to_wiggle.py $align --outfile=$out</command>
-  <inputs>
-    <param format="bam" name="align" type="data" label="BAM alignment file"/>
-  </inputs>
-  <outputs>
-    <data format="bigwig" name="out" />
-  </outputs>
-
-<help>
-**What it does**
-
-Creates a coverage file in BigWig format, given a BAM alignment file. 
-
-**Input**
-
-A BAM alignment file. This needs to have the genome database build used in alignment annotated. If your file has '?' for the database build, click on the pencil icon to edit the alignment attributes, and specify the organism used to align against.
-
-**Output**
-
-BigWig files can be loaded directly from Galaxy into the UCSC browser. They can be loaded incrementally by UCSC, so a single file can be used to represent the entire genome without having to upload the entire thing as a custom track.
-</help>
-
-</tool>
--- a/bam_to_bigwig/bam_to_wiggle.py	Tue Jun 21 09:44:52 2011 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,123 +0,0 @@
-#!/usr/bin/env python
-"""Convert BAM files to BigWig file format in a specified region.
-
-Usage:
-    bam_to_wiggle.py <BAM file> [<YAML config>]
-    [--outfile=<output file name>
-     --chrom=<chrom>
-     --start=<start>
-     --end=<end>]
-
-chrom start and end are optional, in which case they default to everything.
-
-The config file is in YAML format and specifies the location of the wigToBigWig
-program from UCSC:
-
-program:
-  ucsc_bigwig: wigToBigWig
-
-If not specified, these will be assumed to be present in the system path.
-
-The script requires:
-    pysam (http://code.google.com/p/pysam/)
-    wigToBigWig from UCSC (http://hgdownload.cse.ucsc.edu/admin/exe/)
-If a configuration file is used, then PyYAML is also required (http://pyyaml.org/)
-"""
-import os
-import sys
-import subprocess
-import tempfile
-from optparse import OptionParser
-from contextlib import contextmanager, closing
-
-import pysam
-
-def main(bam_file, config_file=None, chrom='all', start=0, end=None,
-         outfile=None):
-    if config_file:
-        import yaml
-        with open(config_file) as in_handle:
-            config = yaml.load(in_handle)
-    else:
-        config = {"program": {"ucsc_bigwig" : "wigToBigWig"}}
-    if outfile is None:
-        outfile = "%s.bigwig" % os.path.splitext(bam_file)[0]
-    if start > 0:
-        start = int(start) - 1
-    if end is not None:
-        end = int(end)
-    regions = [(chrom, start, end)]
-    if os.path.abspath(bam_file) == os.path.abspath(outfile):
-        sys.stderr.write("Bad arguments, input and output files are the same.\n")
-        sys.exit(1)
-    if not (os.path.exists(outfile) and os.path.getsize(outfile) > 0):
-        #Use a temp file to avoid any possiblity of not having write permission
-        out_handle = tempfile.NamedTemporaryFile(delete=False)
-        wig_file = out_handle.name
-        with closing(out_handle):
-            chr_sizes, wig_valid = write_bam_track(bam_file, regions, config, out_handle)
-        try:
-            if wig_valid:
-                convert_to_bigwig(wig_file, chr_sizes, config, outfile)
-        finally:
-            os.remove(wig_file)
-
-@contextmanager
-def indexed_bam(bam_file, config):
-    if not os.path.exists(bam_file + ".bai"):
-        pysam.index(bam_file)
-    sam_reader = pysam.Samfile(bam_file, "rb")
-    yield sam_reader
-    sam_reader.close()
-
-def write_bam_track(bam_file, regions, config, out_handle):
-    out_handle.write("track %s\n" % " ".join(["type=wiggle_0",
-        "name=%s" % os.path.splitext(os.path.split(bam_file)[-1])[0],
-        "visibility=full",
-        ]))
-    is_valid = False
-    with indexed_bam(bam_file, config) as work_bam:
-        sizes = zip(work_bam.references, work_bam.lengths)
-        if len(regions) == 1 and regions[0][0] == "all":
-            regions = [(name, 0, length) for name, length in sizes]
-        for chrom, start, end in regions:
-            if end is None and chrom in work_bam.references:
-                end = work_bam.lengths[work_bam.references.index(chrom)]
-            assert end is not None, "Could not find %s in header" % chrom
-            out_handle.write("variableStep chrom=%s\n" % chrom)
-            for col in work_bam.pileup(chrom, start, end):
-                out_handle.write("%s %s\n" % (col.pos+1, col.n))
-                is_valid = True
-    return sizes, is_valid
-
-def convert_to_bigwig(wig_file, chr_sizes, config, bw_file=None):
-    if not bw_file:
-        bw_file = "%s.bigwig" % (os.path.splitext(wig_file)[0])
-    size_file = "%s-sizes.txt" % (os.path.splitext(wig_file)[0])
-    with open(size_file, "w") as out_handle:
-        for chrom, size in chr_sizes:
-            out_handle.write("%s\t%s\n" % (chrom, size))
-    try:
-        cl = [config["program"]["ucsc_bigwig"], wig_file, size_file, bw_file]
-        subprocess.check_call(cl)
-    finally:
-        os.remove(size_file)
-    return bw_file
-
-if __name__ == "__main__":
-    parser = OptionParser()
-    parser.add_option("-o", "--outfile", dest="outfile")
-    parser.add_option("-c", "--chrom", dest="chrom")
-    parser.add_option("-s", "--start", dest="start")
-    parser.add_option("-e", "--end", dest="end")
-    (options, args) = parser.parse_args()
-    if len(args) not in [1, 2]:
-        print "Incorrect arguments"
-        print __doc__
-        sys.exit()
-    kwargs = dict(
-        outfile=options.outfile,
-        chrom=options.chrom or 'all',
-        start=options.start or 0,
-        end=options.end)
-    main(*args, **kwargs)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/filtering.py	Tue Jun 21 16:25:34 2011 -0400
@@ -0,0 +1,128 @@
+#!/usr/bin/env python
+# This tool takes a tab-delimited text file as input and creates filters on columns based on certain properties.
+# The tool will skip over invalid lines within the file, informing the user about the number of lines skipped.
+
+from __future__ import division
+import sys, re, os.path
+from galaxy import eggs
+
+# Older py compatibility
+try:
+    set()
+except:
+    from sets import Set as set
+
+assert sys.version_info[:2] >= ( 2, 4 )
+
+def get_operands( filter_condition ):
+    # Note that the order of all_operators is important
+    items_to_strip = ['+', '-', '**', '*', '//', '/', '%', '<<', '>>', '&', '|', '^', '~', '<=', '<', '>=', '>', '==', '!=', '<>', ' and ', ' or ', ' not ', ' is ', ' is not ', ' in ', ' not in ']
+    for item in items_to_strip:
+        if filter_condition.find( item ) >= 0:
+            filter_condition = filter_condition.replace( item, ' ' )
+    operands = set( filter_condition.split( ' ' ) )
+    return operands
+
+def stop_err( msg ):
+    sys.stderr.write( msg )
+    sys.exit()
+
+in_fname = sys.argv[1]
+out_fname = sys.argv[2]
+cond_text = sys.argv[3]
+try:
+    in_columns = int( sys.argv[4] )
+    assert sys.argv[5]  #check to see that the column types varaible isn't null
+    in_column_types = sys.argv[5].split( ',' )
+except:
+    stop_err( "Data does not appear to be tabular.  This tool can only be used with tab-delimited data." )
+
+# Unescape if input has been escaped
+mapped_str = {
+    '__lt__': '<',
+    '__le__': '<=',
+    '__eq__': '==',
+    '__ne__': '!=',
+    '__gt__': '>',
+    '__ge__': '>=',
+    '__sq__': '\'',
+    '__dq__': '"',
+}
+for key, value in mapped_str.items():
+    cond_text = cond_text.replace( key, value )
+    
+# Attempt to determine if the condition includes executable stuff and, if so, exit
+secured = dir()
+operands = get_operands(cond_text)
+for operand in operands:
+    try:
+        check = int( operand )
+    except:
+        if operand in secured:
+            stop_err( "Illegal value '%s' in condition '%s'" % ( operand, cond_text ) )
+
+# Prepare the column variable names and wrappers for column data types
+cols, type_casts = [], []
+for col in range( 1, in_columns + 1 ):
+    col_name = "c%d" % col
+    cols.append( col_name )
+    col_type = in_column_types[ col - 1 ]
+    type_cast = "%s(%s)" % ( col_type, col_name )
+    type_casts.append( type_cast )
+ 
+col_str = ', '.join( cols )    # 'c1, c2, c3, c4'
+type_cast_str = ', '.join( type_casts )  # 'str(c1), int(c2), int(c3), str(c4)'
+assign = "%s = line.split( '\\t' )" % col_str
+wrap = "%s = %s" % ( col_str, type_cast_str )
+skipped_lines = 0
+first_invalid_line = 0
+invalid_line = None
+lines_kept = 0
+total_lines = 0
+out = open( out_fname, 'wt' )
+    
+# Read and filter input file, skipping invalid lines
+code = '''
+for i, line in enumerate( file( in_fname ) ):
+    total_lines += 1
+    line = line.rstrip( '\\r\\n' )
+    if not line or line.startswith( '#' ):
+        skipped_lines += 1
+        if not invalid_line:
+            first_invalid_line = i + 1
+            invalid_line = line
+        continue
+    try:
+        %s
+        %s
+        if %s:
+            lines_kept += 1
+            print >> out, line
+    except:
+        skipped_lines += 1
+        if not invalid_line:
+            first_invalid_line = i + 1
+            invalid_line = line
+''' % ( assign, wrap, cond_text )
+
+valid_filter = True
+try:
+    exec code
+except Exception, e:
+    out.close()
+    if str( e ).startswith( 'invalid syntax' ):
+        valid_filter = False
+        stop_err( 'Filter condition "%s" likely invalid. See tool tips, syntax and examples.' % cond_text )
+    else:
+        stop_err( str( e ) )
+
+if valid_filter:
+    out.close()
+    valid_lines = total_lines - skipped_lines
+    print 'Filtering with %s, ' % cond_text
+    if valid_lines > 0:
+        print 'kept %4.2f%% of %d lines.' % ( 100.0*lines_kept/valid_lines, total_lines )
+    else:
+        print 'Possible invalid filter condition "%s" or non-existent column referenced. See tool tips, syntax and examples.' % cond_text
+    if skipped_lines > 0:
+        print 'Skipped %d invalid lines starting at line #%d: "%s"' % ( skipped_lines, first_invalid_line, invalid_line )
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/filtering.xml	Tue Jun 21 16:25:34 2011 -0400
@@ -0,0 +1,65 @@
+<tool id="Filter1" name="Filter" version="1.0.1">
+  <description>data on any column using simple expressions</description>
+  <command interpreter="python">
+    filtering.py $input $out_file1 "$cond" ${input.metadata.columns} "${input.metadata.column_types}"
+  </command>
+  <inputs>
+    <param format="tabular" name="input" type="data" label="Filter" help="Query missing? See TIP below."/>
+    <param name="cond" size="40" type="text" value="c1=='chr22'" label="With following condition" help="Double equal signs, ==, must be used as shown above. To filter for an arbitrary string, use the Select tool.">
+      <validator type="empty_field" message="Enter a valid filtering condition, see syntax and examples below."/>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="input" name="out_file1" metadata_source="input"/>
+  </outputs>
+  <tests>
+    <test>
+      <param name="input" value="1.bed"/>
+      <param name="cond" value="c1=='chr22'"/>
+      <output name="out_file1" file="filter1_test1.bed"/>
+    </test>
+    <test>
+      <param name="input" value="7.bed"/>
+      <param name="cond" value="c1=='chr1' and c3-c2>=2000 and c6=='+'"/>
+      <output name="out_file1" file="filter1_test2.bed"/>
+    </test>
+  </tests>
+  <help>
+
+.. class:: warningmark
+
+Double equal signs, ==, must be used as *"equal to"* (e.g., **c1 == 'chr22'**)
+
+.. class:: infomark
+
+**TIP:** Attempting to apply a filtering condition may throw exceptions if the data type (e.g., string, integer) in every line of the columns being filtered is not appropriate for the condition (e.g., attempting certain numerical calculations on strings).  If an exception is thrown when applying the condition to a line, that line is skipped as invalid for the filter condition.  The number of invalid skipped lines is documented in the resulting history item as a "Condition/data issue".
+
+.. class:: infomark
+
+**TIP:** If your data is not TAB delimited, use *Text Manipulation-&gt;Convert*
+
+-----
+
+**Syntax**
+
+The filter tool allows you to restrict the dataset using simple conditional statements.
+
+- Columns are referenced with **c** and a **number**. For example, **c1** refers to the first column of a tab-delimited file
+- Make sure that multi-character operators contain no white space ( e.g., **&lt;=** is valid while **&lt; =** is not valid )
+- When using 'equal-to' operator **double equal sign '==' must be used** ( e.g., **c1=='chr1'** )
+- Non-numerical values must be included in single or double quotes ( e.g., **c6=='+'** )
+- Filtering condition can include logical operators, but **make sure operators are all lower case** ( e.g., **(c1!='chrX' and c1!='chrY') or not c6=='+'** )
+
+-----
+
+**Example**
+
+- **c1=='chr1'** selects lines in which the first column is chr1
+- **c3-c2&lt;100*c4** selects lines where subtracting column 3 from column 2 is less than the value of column 4 times 100
+- **len(c2.split(',')) &lt; 4** will select lines where the second column has less than four comma separated elements
+- **c2>=1** selects lines in which the value of column 2 is greater than or equal to 1
+- Numbers should not contain commas - **c2&lt;=44,554,350** will not work, but **c2&lt;=44554350** will
+- Some words in the data can be used, but must be single or double quoted ( e.g., **c3=='exon'** )
+
+</help>
+</tool>