# HG changeset patch
# User boris
# Date 1395147902 14400
# Node ID 28640b50ba2f070ac082b17ca989bdfd0c02ee3f
# Parent 8d0a0c488a8e4ed4ab5a7ba0d3b830f67cfd8fc1
Updated xml. Output is written to working directory instead of Galaxy's new file directory
diff -r 8d0a0c488a8e -r 28640b50ba2f getalleleseq.py
--- a/getalleleseq.py Fri Jul 19 17:49:30 2013 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,121 +0,0 @@
-#!/usr/bin/env python
-# Boris Rebolledo-Jaramillo (boris-at-bx.psu.edu)
-#
-#usage: getalleleseq.py [-h] [-l INT] [-j FILE] [-d DIR] alleles
-#
-#Given a table with minor and major alleles per position, it generates the
-#minor and major allele sequences in FASTA format
-#
-#positional arguments:
-# alleles Table containing minor and major allele base per
-# position. cols: [id, chr, pos, A, C, G, T, cvrg,
-# plody, major, minor, freq_minor]
-#
-#optional arguments:
-# -h, --help show this help message and exit
-# -l INT, --seq-length INT
-# Background sequence length. Bases in an artifical
-# all-N-sequence of length INT will be replaced by
-# either the major or minor allele base accordingly
-# -j FILE, --major-seq FILE
-# File to write major allele sequences in FASTA multiple
-# alignment format.
-# -d DIR, --minor-dir DIR
-# Per sample minor allele sequences will be written to
-# this directory
-#
-# The expected columns in the alleles table follow Nicholas Stoler's
-# Variant Annotator tool format. See Variant Annotator in Galaxy's tool shed
-# http://testtoolshed.g2.bx.psu.edu/repos/nick/allele_counts_1 for more details
-#
-# Expected columns:
-# 1. sample_id
-# 2. chr
-# 3. position
-# 4 counts for A's
-# 5. counts for C's
-# 6. counts for G's
-# 7. counts for T's
-# 8. Coverage
-# 9. Number of alleles passing a given criteria
-# 10. Major allele
-# 11. Minor allele
-# 12. Minor allele frequency in position
-
-import sys
-import os
-import argparse
-
-def createseq(sample, allele, seq_size, table):
- """Generate major or minor allele sequence"""
- out_sequence = ['N' for i in range(seq_size)]
- sample_data = [line for line in table if line[0] == sample]
-
- for entry in sample_data:
- position = int(entry[2])
- number_of_alleles = int(entry[8])
- major_allele = entry[9].strip()
- minor_allele = entry[10].strip()
-
- if allele == 'major':
- out_sequence[position-1] = major_allele
- elif allele == 'minor':
- if number_of_alleles == 2:
- out_sequence[position-1] = minor_allele
- else:
- out_sequence[position-1] = major_allele
- return out_sequence
-
-def printseq(sample,allele,seq,output):
- """Print out sequence"""
- #print >> output, '>{0}_{1}'.format(sample,allele)
- print >> output, '>{0}{1}'.format(sample,allele)
- for i in range(0,len(seq),70):
- print >> output, ''.join(seq[i:i+70])
-
-def main():
- parser = argparse.ArgumentParser(description='Given a table with minor and major alleles per position, it generates the minor and major allele sequences in FASTA format', epilog='Boris Rebolledo-Jaramillo (boris-at-bx.psu.edu)')
- parser.add_argument('alleles', type=str, help='Table containing minor and major allele base per position. cols: [id, chr, pos, A, C, G, T, cvrg, plody, major, minor, freq_minor] ')
- parser.add_argument('-l','--seq-length', type=int, metavar='INT', help='Background sequence length. Bases in an artifical all-N-sequence of length INT will be replaced by either the major or minor allele base accordingly')
- parser.add_argument('-j','--major-seq', type=str, metavar='FILE', help='File to write major allele sequences in FASTA multiple alignment format.')
- parser.add_argument('-d', '--minor-dir', type=str, metavar='DIR', default='.', help="Per sample minor allele sequences will be written to this directory (Default: current directory)")
- parser.add_argument('-p', '--minor-prefix', type=str, metavar='STR', nargs='?', const='', default='', help=argparse.SUPPRESS) #Galaxy compatibility
- args = parser.parse_args()
-
-
- try:
- table = [line.strip().split('\t') for line in list(open(args.alleles)) if "#" not in line]
- samples = sorted(list(set([ line[0] for line in table ])))
- except:
- sys.exit('\nERROR: Could not open %s\n' % args.alleles)
- try:
- major_out = open(args.major_seq, 'w+')
- except:
- sys.exit('\nCould not create %s\n' % args.major_seq)
-
- # Single file for all major allele sequences in FASTA multiple alignment
- for sample in samples:
- sequence = createseq(sample,'major',args.seq_length,table)
- #printseq(sample,'major',sequence,major_out)
- printseq(sample,'',sequence,major_out)
- major_out.close()
-
- # Sample specific minor allele sequence in FASTA format
- try:
- os.makedirs(args.minor_dir)
- except:
- pass
-
- for sample in samples:
- if args.minor_prefix: # to fit Galaxy requirements
- name = sample.replace('_','')
- minor_name = "%s_%s_%s" % ('primary',args.minor_prefix,name+'-minor_visible_fasta')
- else: # for non-Galaxy
- minor_name = sample+'-minor.fa'
- minor_out = open(os.path.join(args.minor_dir, minor_name), 'w+')
- sequence = createseq(sample,'minor',args.seq_length,table)
- #printseq(sample,'minor',sequence,minor_out)
- printseq(sample,'_minor',sequence,minor_out)
- minor_out.close()
-
-if __name__ == "__main__": main()
\ No newline at end of file
diff -r 8d0a0c488a8e -r 28640b50ba2f getalleleseq.xml
--- a/getalleleseq.xml Fri Jul 19 17:49:30 2013 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,109 +0,0 @@
-
- Generate major and minor allele sequences from alleles table
- getalleleseq.py
- $alleles
- -l $seq_length
- -j $major_seq
- -d $__new_file_path__
- -p $major_seq.id
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-The major allele sequence of a sample is simply the sequence consisting of the most frequent nucleotide per position.
-Replacing the major allele for the second most frequent allele at diploid positions generates the minor allele sequence.
-
------
-
-.. class:: infomark
-
-**What it does**
-
-It takes the table generated from the Count alleles tool to derive a major and minor allele sequence per sample.
-Since all sequences share the same length all the major allele sequences are included into a single file (with proper headers per sample)
-to create a multiple sequence alignment in FASTA format that can be used for downstream phylogenetic analyses.
-In contrast, the minor allele sequences are informed as single FASTA files per sample to ease their downstream manipulation.
-
------
-
-.. class:: warningmark
-
-**Note**
-
-Please, follow the format described below for the input file:
-
------
-
-.. class:: infomark
-
-**Formats**
-
-**Count alleles tool output format**
-
-Columns::
-
- 1. sample id
- 2. chromosome
- 3. position
- 4 counts for A's
- 5. counts for C's
- 6. counts for G's
- 7. counts for T's
- 8. Coverage
- 9. Number of alleles passing frequency threshold
- 10. Major allele
- 11. Minor allele
- 12. Minor allele frequency in position
-
-
-**FASTA multiple alignment**
-
-See http://www.bioperl.org/wiki/FASTA_multiple_alignment_format
-
------
-
-**Example**
-
-- For the following dataset::
-
- S9 chrM 3 3 0 2 214 219 0 T A 0.013698630137
- S9 chrM 4 3 249 3 0 255 0 C N 0.0
- S9 chrM 5 245 1 1 0 247 1 A N 0.0
- S11 chrM 6 0 292 0 0 292 1 C . 0.0
- S7 chrM 6 0 254 0 0 254 1 C . 0.0
- S9 chrM 6 2 306 2 0 310 0 C N 0.0
- S11 chrM 7 281 0 3 0 284 0 A G 0.0105633802817
- S7 chrM 7 249 0 2 0 251 1 A G 0.00796812749004
- etc. for all covered positions per sample...
-
-- Running this tool with background sequence length 16569 will produce 4 files::
-
- 1. Multiple alignment FASTA file containing the major allele sequences of samples S7, S9 and S11
- 2. minor allele sequence of sample S7
- 3. minor allele sequence of sample S9
- 4. minor allele sequence of sample S11
-
------
-
-**Citation**
-
-If you use this tool, please cite Dickins B, Rebolledo-Jaramillo B, et al. *In preparation.*
-(boris-at-bx.psu.edu)
-
-
-
\ No newline at end of file
diff -r 8d0a0c488a8e -r 28640b50ba2f getalleleseq/getalleleseq.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/getalleleseq/getalleleseq.py Tue Mar 18 09:05:02 2014 -0400
@@ -0,0 +1,130 @@
+#!/usr/bin/env python
+# Boris Rebolledo-Jaramillo (boris-at-bx.psu.edu)
+#
+#usage: getalleleseq.py [-h] [-l INT] [-j FILE] [-d DIR] alleles
+#
+#Given a table with minor and major alleles per position, it generates the
+#minor and major allele sequences in FASTA format
+#
+#positional arguments:
+# alleles Table containing minor and major allele base per
+# position. cols: [id, chr, pos, A, C, G, T, cvrg,
+# plody, major, minor, freq_minor]
+#
+#optional arguments:
+# -h, --help show this help message and exit
+# -l INT, --seq-length INT
+# Background sequence length. Bases in an artifical
+# all-N-sequence of length INT will be replaced by
+# either the major or minor allele base accordingly
+# -j FILE, --major-seq FILE
+# File to write major allele sequences in FASTA multiple
+# alignment format.
+# -d DIR, --minor-dir DIR
+# Per sample minor allele sequences will be written to
+# this directory
+#
+# The expected columns in the alleles table follow Nicholas Stoler's
+# Variant Annotator tool format. See Variant Annotator in Galaxy's tool shed
+# http://testtoolshed.g2.bx.psu.edu/repos/nick/allele_counts_1 for more details
+#
+# Expected columns:
+# 1. sample_id
+# 2. chr
+# 3. position
+# 4 counts for A's
+# 5. counts for C's
+# 6. counts for G's
+# 7. counts for T's
+# (8. counts for a's)
+# (9. counts for c's)
+# (10. counts for g's)
+# (11. counts for t's)
+# 8. (12.) Coverage
+# 9. (13.) Number of alleles passing a given criteria
+# 10. (14.) Major allele
+# 11. (15.) Minor allele
+# 12. (16.) Minor allele frequency in position
+
+import sys
+import os
+import argparse
+
+def createseq(sample, allele, seq_size, table):
+ """Generate major or minor allele sequence"""
+ out_sequence = ['N' for i in range(seq_size)]
+ sample_data = [line for line in table if line[0] == sample]
+
+ for entry in sample_data:
+ position = int(entry[2])
+ if len(entry)==12:
+ number_of_alleles = int(entry[8])
+ major_allele = entry[9].strip()
+ minor_allele = entry[10].strip()
+ else:
+ number_of_alleles = int(entry[12])
+ major_allele = entry[13].strip()
+ minor_allele = entry[14].strip()
+
+ if allele == 'major':
+ out_sequence[position-1] = major_allele
+ elif allele == 'minor':
+ if number_of_alleles >= 2:
+ out_sequence[position-1] = minor_allele
+ else:
+ out_sequence[position-1] = major_allele
+ return out_sequence
+
+def printseq(sample,allele,seq,output):
+ """Print out sequence"""
+ #print >> output, '>{0}_{1}'.format(sample,allele)
+ print >> output, '>{0}{1}'.format(sample,allele)
+ for i in range(0,len(seq),70):
+ print >> output, ''.join(seq[i:i+70])
+
+def main():
+ parser = argparse.ArgumentParser(description='Given a table with minor and major alleles per position, it generates the minor and major allele sequences in FASTA format', epilog='Boris Rebolledo-Jaramillo (boris-at-bx.psu.edu)')
+ parser.add_argument('alleles', type=str, help='Table containing minor and major allele base per position. cols: [id, chr, pos, A, C, G, T, cvrg, plody, major, minor, freq_minor] ')
+ parser.add_argument('-l','--seq-length', type=int, metavar='INT', help='Background sequence length. Bases in an artifical all-N-sequence of length INT will be replaced by either the major or minor allele base accordingly')
+ parser.add_argument('-j','--major-seq', type=str, metavar='FILE', help='File to write major allele sequences in FASTA multiple alignment format.')
+ parser.add_argument('-d', '--minor-dir', type=str, metavar='DIR', default='.', help="Per sample minor allele sequences will be written to this directory (Default: current directory)")
+ parser.add_argument('-p', '--minor-prefix', type=str, metavar='STR', nargs='?', const='', default='', help=argparse.SUPPRESS) #Galaxy compatibility
+ args = parser.parse_args()
+
+
+ try:
+ table = [line.strip().split('\t') for line in list(open(args.alleles)) if "#" not in line]
+ samples = sorted(list(set([ line[0] for line in table ])))
+ except:
+ sys.exit('\nERROR: Could not open %s\n' % args.alleles)
+ try:
+ major_out = open(args.major_seq, 'w+')
+ except:
+ sys.exit('\nCould not create %s\n' % args.major_seq)
+
+ # Single file for all major allele sequences in FASTA multiple alignment
+ for sample in samples:
+ sequence = createseq(sample,'major',args.seq_length,table)
+ #printseq(sample,'major',sequence,major_out)
+ printseq(sample,'',sequence,major_out)
+ major_out.close()
+
+ # Sample specific minor allele sequence in FASTA format
+ try:
+ os.makedirs(args.minor_dir)
+ except:
+ pass
+
+ for sample in samples:
+ if args.minor_prefix: # to fit Galaxy requirements
+ name = sample.replace('_','')
+ minor_name = "%s_%s_%s" % ('primary',args.minor_prefix,name+'-minor_visible_fasta')
+ else: # for non-Galaxy
+ minor_name = sample+'-minor.fa'
+ minor_out = open(os.path.join(args.minor_dir, minor_name), 'w+')
+ sequence = createseq(sample,'minor',args.seq_length,table)
+ #printseq(sample,'minor',sequence,minor_out)
+ printseq(sample,'_minor',sequence,minor_out)
+ minor_out.close()
+
+if __name__ == "__main__": main()
\ No newline at end of file
diff -r 8d0a0c488a8e -r 28640b50ba2f getalleleseq/getalleleseq.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/getalleleseq/getalleleseq.xml Tue Mar 18 09:05:02 2014 -0400
@@ -0,0 +1,108 @@
+
+ Generate major and minor allele sequences from alleles table
+ getalleleseq.py
+ $alleles
+ -l $seq_length
+ -j $major_seq
+ -p $major_seq.id
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+The major allele sequence of a sample is simply the sequence consisting of the most frequent nucleotide per position.
+Replacing the major allele for the second most frequent allele at diploid positions generates the minor allele sequence.
+
+-----
+
+.. class:: infomark
+
+**What it does**
+
+It takes the table generated from the Variant Annotator tool to derive a major and minor allele sequence per sample.
+Since all sequences share the same length all the major allele sequences are included into a single file (with proper headers per sample)
+to create a multiple sequence alignment in FASTA format that can be used for downstream phylogenetic analyses.
+In contrast, the minor allele sequences are informed as single FASTA files per sample to ease their downstream manipulation.
+
+-----
+
+.. class:: warningmark
+
+**Note**
+
+Please, follow the format described below for the input file:
+
+-----
+
+.. class:: infomark
+
+**Formats**
+
+**Variant Annotator tool output format**
+
+Columns::
+
+ 1. sample id
+ 2. chromosome
+ 3. position
+ 4 counts for A's
+ 5. counts for C's
+ 6. counts for G's
+ 7. counts for T's
+ 8. Coverage
+ 9. Number of alleles passing frequency threshold
+ 10. Major allele
+ 11. Minor allele
+ 12. Minor allele frequency in position
+
+
+**FASTA multiple alignment**
+
+See http://www.bioperl.org/wiki/FASTA_multiple_alignment_format
+
+-----
+
+**Example**
+
+- For the following dataset::
+
+ S9 chrM 3 3 0 2 214 219 0 T A 0.013698630137
+ S9 chrM 4 3 249 3 0 255 0 C N 0.0
+ S9 chrM 5 245 1 1 0 247 1 A N 0.0
+ S11 chrM 6 0 292 0 0 292 1 C . 0.0
+ S7 chrM 6 0 254 0 0 254 1 C . 0.0
+ S9 chrM 6 2 306 2 0 310 0 C N 0.0
+ S11 chrM 7 281 0 3 0 284 0 A G 0.0105633802817
+ S7 chrM 7 249 0 2 0 251 1 A G 0.00796812749004
+ etc. for all covered positions per sample...
+
+- Running this tool with background sequence length 16569 will produce 4 files::
+
+ 1. Multiple alignment FASTA file containing the major allele sequences of samples S7, S9 and S11
+ 2. minor allele sequence of sample S7
+ 3. minor allele sequence of sample S9
+ 4. minor allele sequence of sample S11
+
+-----
+
+**Citation**
+
+If you use this tool, please cite Dickins B, Rebolledo-Jaramillo B, et al (2014). *Acccepted in Biotechniques*
+(boris-at-bx.psu.edu)
+
+
+
\ No newline at end of file
diff -r 8d0a0c488a8e -r 28640b50ba2f getalleleseq/test-data/._test-major-allele-out-getalleleseq.fa
Binary file getalleleseq/test-data/._test-major-allele-out-getalleleseq.fa has changed