Mercurial > repos > boris > getalleleseq

--- a/getalleleseq.py	Fri Jul 19 17:49:30 2013 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,121 +0,0 @@
-#!/usr/bin/env python
-# Boris Rebolledo-Jaramillo (boris-at-bx.psu.edu)
-#
-#usage: getalleleseq.py [-h] [-l INT] [-j FILE] [-d DIR] alleles
-#
-#Given a table with minor and major alleles per position, it generates the
-#minor and major allele sequences in FASTA format
-#
-#positional arguments:
-#  alleles               Table containing minor and major allele base per
-#                        position. cols: [id, chr, pos, A, C, G, T, cvrg,
-#                        plody, major, minor, freq_minor]
-#
-#optional arguments:
-#  -h, --help            show this help message and exit
-#  -l INT, --seq-length INT
-#                        Background sequence length. Bases in an artifical
-#                        all-N-sequence of length INT will be replaced by
-#                        either the major or minor allele base accordingly
-#  -j FILE, --major-seq FILE
-#                        File to write major allele sequences in FASTA multiple
-#                        alignment format.
-#  -d DIR, --minor-dir DIR
-#                        Per sample minor allele sequences will be written to
-#                        this directory
-#
-# The expected columns in the alleles table follow Nicholas Stoler's
-# Variant Annotator tool format.  See Variant Annotator in Galaxy's tool shed
-# http://testtoolshed.g2.bx.psu.edu/repos/nick/allele_counts_1 for more details
-#
-# Expected columns:
-# 1.  sample_id
-# 2.  chr
-# 3.  position
-# 4   counts for A's
-# 5.  counts for C's
-# 6.  counts for G's
-# 7.  counts for T's
-# 8.  Coverage
-# 9.  Number of alleles passing a given criteria
-# 10. Major allele
-# 11. Minor allele
-# 12. Minor allele frequency in position
-
-import sys
-import os
-import argparse
-
-def createseq(sample, allele, seq_size, table):
-    """Generate major or minor allele sequence"""
-    out_sequence = ['N' for i in range(seq_size)]
-    sample_data  = [line for line in table if line[0] == sample]
-
-    for entry in sample_data:
-        position = int(entry[2])
-        number_of_alleles = int(entry[8])
-        major_allele = entry[9].strip()
-        minor_allele = entry[10].strip()
-
-        if allele == 'major':
-            out_sequence[position-1] = major_allele
-        elif allele == 'minor':
-            if number_of_alleles == 2:
-                out_sequence[position-1] = minor_allele
-            else:
-                out_sequence[position-1] = major_allele
-    return out_sequence
-
-def printseq(sample,allele,seq,output):
-    """Print out sequence"""
-    #print >> output, '>{0}_{1}'.format(sample,allele)
-    print >> output, '>{0}{1}'.format(sample,allele)
-    for i in range(0,len(seq),70):
-        print >> output, ''.join(seq[i:i+70])
-
-def main():
-    parser = argparse.ArgumentParser(description='Given a table with minor and major alleles per position, it generates the minor and major allele sequences in FASTA format', epilog='Boris Rebolledo-Jaramillo (boris-at-bx.psu.edu)')
-    parser.add_argument('alleles', type=str, help='Table containing minor and major allele base per position. cols: [id, chr, pos, A, C, G, T, cvrg, plody, major, minor, freq_minor] ')
-    parser.add_argument('-l','--seq-length', type=int, metavar='INT', help='Background sequence length. Bases in an artifical all-N-sequence of length INT will be replaced by either the major or minor allele base accordingly')
-    parser.add_argument('-j','--major-seq', type=str, metavar='FILE', help='File to write major allele sequences in FASTA multiple alignment format.')
-    parser.add_argument('-d', '--minor-dir', type=str, metavar='DIR', default='.', help="Per sample minor allele sequences will be written to this directory (Default: current directory)")
-    parser.add_argument('-p', '--minor-prefix', type=str, metavar='STR', nargs='?', const='', default='', help=argparse.SUPPRESS) #Galaxy compatibility
-    args = parser.parse_args()
-
-
-    try:
-        table = [line.strip().split('\t') for line in list(open(args.alleles)) if "#" not in line]
-        samples = sorted(list(set([ line[0] for line in table ])))
-    except:
-        sys.exit('\nERROR: Could not open %s\n' % args.alleles)
-    try:
-        major_out = open(args.major_seq, 'w+')
-    except:
-        sys.exit('\nCould not create %s\n' % args.major_seq)
-
-    # Single file for all major allele sequences in FASTA multiple alignment
-    for sample in samples:
-        sequence = createseq(sample,'major',args.seq_length,table)
-        #printseq(sample,'major',sequence,major_out)
-        printseq(sample,'',sequence,major_out)
-    major_out.close()
-
-    # Sample specific minor allele sequence in FASTA format
-    try:
-        os.makedirs(args.minor_dir)
-    except:
-        pass
-
-    for sample in samples:
-        if args.minor_prefix: # to fit Galaxy requirements
-            name = sample.replace('_','')
-            minor_name = "%s_%s_%s" % ('primary',args.minor_prefix,name+'-minor_visible_fasta')
-        else: # for non-Galaxy
-            minor_name = sample+'-minor.fa'
-        minor_out = open(os.path.join(args.minor_dir, minor_name), 'w+')
-        sequence = createseq(sample,'minor',args.seq_length,table)
-        #printseq(sample,'minor',sequence,minor_out)
-        printseq(sample,'_minor',sequence,minor_out)
-        minor_out.close()
-
-if __name__ == "__main__": main()
\ No newline at end of file
--- a/getalleleseq.xml	Fri Jul 19 17:49:30 2013 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,109 +0,0 @@
-<tool id="getalleleseq" name="FASTA from allele counts" version="0.0.1" force_history_refresh="True">
-  <description>Generate major and minor allele sequences from alleles table</description>
-  <command interpreter="python">getalleleseq.py
-                                   $alleles
-                                -l $seq_length
-                                -j $major_seq
-                                -d $__new_file_path__
-                                -p $major_seq.id
-</command>
-  <inputs>
-    <param format="tabular" name="alleles" type="data" label="Table containing major and minor alleles base per position" help="must be tabular and follow *Count alleles* tool output format"/>
-    <param name="seq_length" type="integer" value="16569" label="Background sequence length" help="e.g. 16569 for mitochondrial variants"/>
-  </inputs>
-  <outputs>
-    <data format="fasta" name="major_seq"/>
-  </outputs>
-  <tests>
-    <test>
-      <param name="alleles" value="test-table-getalleleseq.tab"/>
-      <param name="seq_length" value="16569"/>
-      <output name="major_seq" file="test-major-allele-out-getalleleseq.fa"/>
-    </test>
-  </tests>
-
-  <help>
-
-
-The major allele sequence of a sample is simply the sequence consisting of the most frequent nucleotide per position.
-Replacing the major allele for the second most frequent allele at diploid positions generates the minor allele sequence.
-
------
-
-.. class:: infomark
-
-**What it does**
-
-It takes the table generated from the Count alleles tool to derive a major and minor allele sequence per sample.
-Since all sequences share the same length all the major allele sequences are included into a single file (with proper headers per sample)
-to create a multiple sequence alignment in FASTA format that can be used for downstream phylogenetic analyses.
-In contrast, the minor allele sequences are informed as single FASTA files per sample to ease their downstream manipulation.
-
------
-
-.. class:: warningmark
-
-**Note**
-
-Please, follow the format described below for the input file:
-
------
-
-.. class:: infomark
-
-**Formats**
-
-**Count alleles tool output format**
-
-Columns::
-
-    1.  sample id
-    2.  chromosome
-    3.  position
-    4   counts for A's
-    5.  counts for C's
-    6.  counts for G's
-    7.  counts for T's
-    8.  Coverage
-    9.  Number of alleles passing frequency threshold
-    10. Major allele
-    11. Minor allele
-    12. Minor allele frequency in position
-
-
-**FASTA multiple alignment**
-
-See http://www.bioperl.org/wiki/FASTA_multiple_alignment_format
-
------
-
-**Example**
-
-- For the following dataset::
-
-    S9	chrM	3	3	0	2	214	219	0	T	A	0.013698630137
-    S9	chrM	4	3	249	3	0	255	0	C	N	0.0
-    S9	chrM	5	245	1	1	0	247	1	A	N	0.0
-    S11	chrM	6	0	292	0	0	292	1	C	.	0.0
-    S7	chrM	6	0	254	0	0	254	1	C	.	0.0
-    S9	chrM	6	2	306	2	0	310	0	C	N	0.0
-    S11	chrM	7	281	0	3	0	284	0	A	G	0.0105633802817
-    S7	chrM	7	249	0	2	0	251	1	A	G	0.00796812749004
-    etc. for all covered positions per sample...
-
-- Running this tool with background sequence length 16569 will produce 4 files::
-
-    1. Multiple alignment FASTA file containing the major allele sequences of samples S7, S9 and S11
-    2. minor allele sequence of sample S7
-    3. minor allele sequence of sample S9
-    4. minor allele sequence of sample S11
-
------
-
-**Citation**
-
-If you use this tool, please cite Dickins B, Rebolledo-Jaramillo B, et al. *In preparation.*
-(boris-at-bx.psu.edu)
-
-  </help>
-</tool>
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/getalleleseq/getalleleseq.py	Tue Mar 18 09:05:02 2014 -0400
@@ -0,0 +1,130 @@
+#!/usr/bin/env python
+# Boris Rebolledo-Jaramillo (boris-at-bx.psu.edu)
+#
+#usage: getalleleseq.py [-h] [-l INT] [-j FILE] [-d DIR] alleles
+#
+#Given a table with minor and major alleles per position, it generates the
+#minor and major allele sequences in FASTA format
+#
+#positional arguments:
+#  alleles               Table containing minor and major allele base per
+#                        position. cols: [id, chr, pos, A, C, G, T, cvrg,
+#                        ploidy, major, minor, freq_minor]
+#
+#optional arguments:
+#  -h, --help            show this help message and exit
+#  -l INT, --seq-length INT
+#                        Background sequence length. Bases in an artificial
+#                        all-N-sequence of length INT will be replaced by
+#                        either the major or minor allele base accordingly
+#  -j FILE, --major-seq FILE
+#                        File to write major allele sequences in FASTA multiple
+#                        alignment format.
+#  -d DIR, --minor-dir DIR
+#                        Per sample minor allele sequences will be written to
+#                        this directory
+#
+# The expected columns in the alleles table follow Nicholas Stoler's
+# Variant Annotator tool format.  See Variant Annotator in Galaxy's tool shed
+# http://testtoolshed.g2.bx.psu.edu/repos/nick/allele_counts_1 for more details
+#
+# Expected columns:
+# 1.  sample_id
+# 2.  chr
+# 3.  position
+# 4.  counts for A's
+# 5.  counts for C's
+# 6.  counts for G's
+# 7.  counts for T's
+# (8.  counts for a's)
+# (9.  counts for c's)
+# (10. counts for g's)
+# (11. counts for t's)
+# 8.  (12.) Coverage
+# 9.  (13.) Number of alleles passing a given criteria
+# 10. (14.) Major allele
+# 11. (15.) Minor allele
+# 12. (16.) Minor allele frequency in position
+
+import sys
+import os
+import argparse
+
+def createseq(sample, allele, seq_size, table):
+    """Generate major or minor allele sequence"""
+    out_sequence = ['N' for i in range(seq_size)]  # background: list of seq_size 'N' characters
+    sample_data  = [line for line in table if line[0] == sample]  # rows whose first column equals this sample id
+
+    for entry in sample_data:
+        position = int(entry[2])  # third column; turned into a 0-based index below via position-1
+        if len(entry)==12:  # 12-column layout: allele count, major, minor are columns 9-11
+            number_of_alleles = int(entry[8])
+            major_allele = entry[9].strip()
+            minor_allele = entry[10].strip()
+        else:  # any other row width is read as the 16-column layout (columns 13-15)
+            number_of_alleles = int(entry[12])
+            major_allele = entry[13].strip()
+            minor_allele = entry[14].strip()
+
+        if allele == 'major':  # any value other than 'major'/'minor' leaves the background untouched
+            out_sequence[position-1] = major_allele  # NOTE(review): position > seq_size raises IndexError; position 0 writes the last base
+        elif allele == 'minor':
+            if number_of_alleles >= 2:  # minor allele is used only when at least two alleles are reported
+                out_sequence[position-1] = minor_allele
+            else:  # otherwise the minor sequence falls back to the major allele
+                out_sequence[position-1] = major_allele
+    return out_sequence  # list of single-base strings, one element per position
+
+def printseq(sample,allele,seq,output):
+    """Print out sequence"""
+    #print >> output, '>{0}_{1}'.format(sample,allele)
+    print >> output, '>{0}{1}'.format(sample,allele)  # FASTA header: sample directly followed by the allele suffix (Python 2 print-to-file syntax)
+    for i in range(0,len(seq),70):  # walk the sequence in steps of 70 elements
+        print >> output, ''.join(seq[i:i+70])  # one output line per chunk of at most 70 elements
+
+def main():  # command-line entry point: parse arguments, read the table, write major and minor FASTA output
+    parser = argparse.ArgumentParser(description='Given a table with minor and major alleles per position, it generates the minor and major allele sequences in FASTA format', epilog='Boris Rebolledo-Jaramillo (boris-at-bx.psu.edu)')
+    parser.add_argument('alleles', type=str, help='Table containing minor and major allele base per position. cols: [id, chr, pos, A, C, G, T, cvrg, plody, major, minor, freq_minor] ')
+    parser.add_argument('-l','--seq-length', type=int, metavar='INT', help='Background sequence length. Bases in an artifical all-N-sequence of length INT will be replaced by either the major or minor allele base accordingly')
+    parser.add_argument('-j','--major-seq', type=str, metavar='FILE', help='File to write major allele sequences in FASTA multiple alignment format.')
+    parser.add_argument('-d', '--minor-dir', type=str, metavar='DIR', default='.', help="Per sample minor allele sequences will be written to this directory (Default: current directory)")
+    parser.add_argument('-p', '--minor-prefix', type=str, metavar='STR', nargs='?', const='', default='', help=argparse.SUPPRESS) #Galaxy compatibility
+    args = parser.parse_args()  # NOTE(review): -l and -j declare no default, so both are None when omitted
+
+
+    try:
+        table = [line.strip().split('\t') for line in list(open(args.alleles)) if "#" not in line]  # drops every line containing '#' anywhere; the file handle is never closed explicitly
+        samples = sorted(list(set([ line[0] for line in table ])))  # unique first-column values, sorted
+    except:  # NOTE(review): bare except -- any failure while reading or parsing is reported as "Could not open"
+        sys.exit('\nERROR: Could not open %s\n' % args.alleles)
+    try:
+        major_out = open(args.major_seq, 'w+')
+    except:  # bare except: also reached when -j was omitted and major_seq is None
+        sys.exit('\nCould not create %s\n' % args.major_seq)
+
+    # Single file for all major allele sequences in FASTA multiple alignment
+    for sample in samples:
+        sequence = createseq(sample,'major',args.seq_length,table)
+        #printseq(sample,'major',sequence,major_out)
+        printseq(sample,'',sequence,major_out)  # empty suffix: the header is just the sample id
+    major_out.close()
+
+    # Sample specific minor allele sequence in FASTA format
+    try:
+        os.makedirs(args.minor_dir)
+    except:  # best effort: every error from makedirs is ignored
+        pass
+
+    for sample in samples:
+        if args.minor_prefix: # to fit Galaxy requirements
+            name = sample.replace('_','')  # underscores are removed from the sample id
+            minor_name = "%s_%s_%s" % ('primary',args.minor_prefix,name+'-minor_visible_fasta')  # primary_<prefix>_<name>-minor_visible_fasta
+        else: # for non-Galaxy
+            minor_name = sample+'-minor.fa'
+        minor_out = open(os.path.join(args.minor_dir, minor_name), 'w+')  # one output file per sample; open errors are not caught here
+        sequence = createseq(sample,'minor',args.seq_length,table)
+        #printseq(sample,'minor',sequence,minor_out)
+        printseq(sample,'_minor',sequence,minor_out)  # header becomes <sample>_minor
+        minor_out.close()
+
+if __name__ == "__main__": main()  # call main() only when this module is the entry script
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/getalleleseq/getalleleseq.xml	Tue Mar 18 09:05:02 2014 -0400
@@ -0,0 +1,108 @@
+<tool id="getalleleseq" name="FASTA from allele counts" version="0.0.1" force_history_refresh="True">
+  <description>Generate major and minor allele sequences from alleles table</description>
+  <command interpreter="python">getalleleseq.py
+                                   $alleles
+                                -l $seq_length
+                                -j $major_seq
+                                -p $major_seq.id
+</command>
+  <inputs>
+    <param format="tabular" name="alleles" type="data" label="Table containing major and minor alleles base per position" help="must be tabular and follow the Variant Annotator tool output format"/>
+    <param name="seq_length" type="integer" value="16569" label="Background sequence length" help="e.g. 16569 for mitochondrial variants"/>
+  </inputs>
+  <outputs>
+    <data format="fasta" name="major_seq"/>
+  </outputs>
+  <tests>
+    <test>
+      <param name="alleles" value="test-table-getalleleseq.tab"/>
+      <param name="seq_length" value="16569"/>
+      <output name="major_seq" file="test-major-allele-out-getalleleseq.fa"/>
+    </test>
+  </tests>
+
+  <help>
+
+
+The major allele sequence of a sample is simply the sequence consisting of the most frequent nucleotide per position.
+Replacing the major allele with the second most frequent allele at diploid positions generates the minor allele sequence.
+
+-----
+
+.. class:: infomark
+
+**What it does**
+
+It takes the table generated from the Variant Annotator tool to derive a major and minor allele sequence per sample.
+Since all sequences share the same length, all the major allele sequences are included in a single file (with proper headers per sample)
+to create a multiple sequence alignment in FASTA format that can be used for downstream phylogenetic analyses.
+In contrast, the minor allele sequences are written as separate FASTA files, one per sample, to ease their downstream manipulation.
+
+-----
+
+.. class:: warningmark
+
+**Note**
+
+Please follow the format described below for the input file:
+
+-----
+
+.. class:: infomark
+
+**Formats**
+
+**Variant Annotator tool output format**
+
+Columns::
+
+    1.  sample id
+    2.  chromosome
+    3.  position
+    4.  counts for A's
+    5.  counts for C's
+    6.  counts for G's
+    7.  counts for T's
+    8.  Coverage
+    9.  Number of alleles passing frequency threshold
+    10. Major allele
+    11. Minor allele
+    12. Minor allele frequency in position
+
+
+**FASTA multiple alignment**
+
+See http://www.bioperl.org/wiki/FASTA_multiple_alignment_format
+
+-----
+
+**Example**
+
+- For the following dataset::
+
+    S9	chrM	3	3	0	2	214	219	0	T	A	0.013698630137
+    S9	chrM	4	3	249	3	0	255	0	C	N	0.0
+    S9	chrM	5	245	1	1	0	247	1	A	N	0.0
+    S11	chrM	6	0	292	0	0	292	1	C	.	0.0
+    S7	chrM	6	0	254	0	0	254	1	C	.	0.0
+    S9	chrM	6	2	306	2	0	310	0	C	N	0.0
+    S11	chrM	7	281	0	3	0	284	0	A	G	0.0105633802817
+    S7	chrM	7	249	0	2	0	251	1	A	G	0.00796812749004
+    etc. for all covered positions per sample...
+
+- Running this tool with background sequence length 16569 will produce 4 files::
+
+    1. Multiple alignment FASTA file containing the major allele sequences of samples S7, S9 and S11
+    2. minor allele sequence of sample S7
+    3. minor allele sequence of sample S9
+    4. minor allele sequence of sample S11
+
+-----
+
+**Citation**
+
+If you use this tool, please cite Dickins B, Rebolledo-Jaramillo B, et al (2014). *Accepted in Biotechniques*
+(boris-at-bx.psu.edu)
+
+  </help>
+</tool>
\ No newline at end of file
Binary file getalleleseq/test-data/._test-major-allele-out-getalleleseq.fa has changed