Mercurial > repos > boris > getalleleseq
changeset 7:28640b50ba2f draft default tip
Updated xml. Output is written to working directory instead of Galaxy's new file directory
author | boris |
---|---|
date | Tue, 18 Mar 2014 09:05:02 -0400 |
parents | 8d0a0c488a8e |
children | |
files | getalleleseq.py getalleleseq.xml getalleleseq/getalleleseq.py getalleleseq/getalleleseq.xml getalleleseq/test-data/._test-major-allele-out-getalleleseq.fa |
diffstat | 5 files changed, 238 insertions(+), 230 deletions(-) [+] |
line wrap: on
line diff
--- a/getalleleseq.py Fri Jul 19 17:49:30 2013 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,121 +0,0 @@ -#!/usr/bin/env python -# Boris Rebolledo-Jaramillo (boris-at-bx.psu.edu) -# -#usage: getalleleseq.py [-h] [-l INT] [-j FILE] [-d DIR] alleles -# -#Given a table with minor and major alleles per position, it generates the -#minor and major allele sequences in FASTA format -# -#positional arguments: -# alleles Table containing minor and major allele base per -# position. cols: [id, chr, pos, A, C, G, T, cvrg, -# plody, major, minor, freq_minor] -# -#optional arguments: -# -h, --help show this help message and exit -# -l INT, --seq-length INT -# Background sequence length. Bases in an artifical -# all-N-sequence of length INT will be replaced by -# either the major or minor allele base accordingly -# -j FILE, --major-seq FILE -# File to write major allele sequences in FASTA multiple -# alignment format. -# -d DIR, --minor-dir DIR -# Per sample minor allele sequences will be written to -# this directory -# -# The expected columns in the alleles table follow Nicholas Stoler's -# Variant Annotator tool format. See Variant Annotator in Galaxy's tool shed -# http://testtoolshed.g2.bx.psu.edu/repos/nick/allele_counts_1 for more details -# -# Expected columns: -# 1. sample_id -# 2. chr -# 3. position -# 4 counts for A's -# 5. counts for C's -# 6. counts for G's -# 7. counts for T's -# 8. Coverage -# 9. Number of alleles passing a given criteria -# 10. Major allele -# 11. Minor allele -# 12. 
Minor allele frequency in position - -import sys -import os -import argparse - -def createseq(sample, allele, seq_size, table): - """Generate major or minor allele sequence""" - out_sequence = ['N' for i in range(seq_size)] - sample_data = [line for line in table if line[0] == sample] - - for entry in sample_data: - position = int(entry[2]) - number_of_alleles = int(entry[8]) - major_allele = entry[9].strip() - minor_allele = entry[10].strip() - - if allele == 'major': - out_sequence[position-1] = major_allele - elif allele == 'minor': - if number_of_alleles == 2: - out_sequence[position-1] = minor_allele - else: - out_sequence[position-1] = major_allele - return out_sequence - -def printseq(sample,allele,seq,output): - """Print out sequence""" - #print >> output, '>{0}_{1}'.format(sample,allele) - print >> output, '>{0}{1}'.format(sample,allele) - for i in range(0,len(seq),70): - print >> output, ''.join(seq[i:i+70]) - -def main(): - parser = argparse.ArgumentParser(description='Given a table with minor and major alleles per position, it generates the minor and major allele sequences in FASTA format', epilog='Boris Rebolledo-Jaramillo (boris-at-bx.psu.edu)') - parser.add_argument('alleles', type=str, help='Table containing minor and major allele base per position. cols: [id, chr, pos, A, C, G, T, cvrg, plody, major, minor, freq_minor] ') - parser.add_argument('-l','--seq-length', type=int, metavar='INT', help='Background sequence length. 
Bases in an artifical all-N-sequence of length INT will be replaced by either the major or minor allele base accordingly') - parser.add_argument('-j','--major-seq', type=str, metavar='FILE', help='File to write major allele sequences in FASTA multiple alignment format.') - parser.add_argument('-d', '--minor-dir', type=str, metavar='DIR', default='.', help="Per sample minor allele sequences will be written to this directory (Default: current directory)") - parser.add_argument('-p', '--minor-prefix', type=str, metavar='STR', nargs='?', const='', default='', help=argparse.SUPPRESS) #Galaxy compatibility - args = parser.parse_args() - - - try: - table = [line.strip().split('\t') for line in list(open(args.alleles)) if "#" not in line] - samples = sorted(list(set([ line[0] for line in table ]))) - except: - sys.exit('\nERROR: Could not open %s\n' % args.alleles) - try: - major_out = open(args.major_seq, 'w+') - except: - sys.exit('\nCould not create %s\n' % args.major_seq) - - # Single file for all major allele sequences in FASTA multiple alignment - for sample in samples: - sequence = createseq(sample,'major',args.seq_length,table) - #printseq(sample,'major',sequence,major_out) - printseq(sample,'',sequence,major_out) - major_out.close() - - # Sample specific minor allele sequence in FASTA format - try: - os.makedirs(args.minor_dir) - except: - pass - - for sample in samples: - if args.minor_prefix: # to fit Galaxy requirements - name = sample.replace('_','') - minor_name = "%s_%s_%s" % ('primary',args.minor_prefix,name+'-minor_visible_fasta') - else: # for non-Galaxy - minor_name = sample+'-minor.fa' - minor_out = open(os.path.join(args.minor_dir, minor_name), 'w+') - sequence = createseq(sample,'minor',args.seq_length,table) - #printseq(sample,'minor',sequence,minor_out) - printseq(sample,'_minor',sequence,minor_out) - minor_out.close() - -if __name__ == "__main__": main() \ No newline at end of file
--- a/getalleleseq.xml Fri Jul 19 17:49:30 2013 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,109 +0,0 @@ -<tool id="getalleleseq" name="FASTA from allele counts" version="0.0.1" force_history_refresh="True"> - <description>Generate major and minor allele sequences from alleles table</description> - <command interpreter="python">getalleleseq.py - $alleles - -l $seq_length - -j $major_seq - -d $__new_file_path__ - -p $major_seq.id -</command> - <inputs> - <param format="tabular" name="alleles" type="data" label="Table containing major and minor alleles base per position" help="must be tabular and follow *Count alleles* tool output format"/> - <param name="seq_length" type="integer" value="16569" label="Background sequence length" help="e.g. 16569 for mitochondrial variants"/> - </inputs> - <outputs> - <data format="fasta" name="major_seq"/> - </outputs> - <tests> - <test> - <param name="alleles" value="test-table-getalleleseq.tab"/> - <param name="seq_length" value="16569"/> - <output name="major_seq" file="test-major-allele-out-getalleleseq.fa"/> - </test> - </tests> - - <help> - - -The major allele sequence of a sample is simply the sequence consisting of the most frequent nucleotide per position. -Replacing the major allele for the second most frequent allele at diploid positions generates the minor allele sequence. - ------ - -.. class:: infomark - -**What it does** - -It takes the table generated from the Count alleles tool to derive a major and minor allele sequence per sample. -Since all sequences share the same length all the major allele sequences are included into a single file (with proper headers per sample) -to create a multiple sequence alignment in FASTA format that can be used for downstream phylogenetic analyses. -In contrast, the minor allele sequences are informed as single FASTA files per sample to ease their downstream manipulation. - ------ - -.. 
class:: warningmark - -**Note** - -Please, follow the format described below for the input file: - ------ - -.. class:: infomark - -**Formats** - -**Count alleles tool output format** - -Columns:: - - 1. sample id - 2. chromosome - 3. position - 4 counts for A's - 5. counts for C's - 6. counts for G's - 7. counts for T's - 8. Coverage - 9. Number of alleles passing frequency threshold - 10. Major allele - 11. Minor allele - 12. Minor allele frequency in position - - -**FASTA multiple alignment** - -See http://www.bioperl.org/wiki/FASTA_multiple_alignment_format - ------ - -**Example** - -- For the following dataset:: - - S9 chrM 3 3 0 2 214 219 0 T A 0.013698630137 - S9 chrM 4 3 249 3 0 255 0 C N 0.0 - S9 chrM 5 245 1 1 0 247 1 A N 0.0 - S11 chrM 6 0 292 0 0 292 1 C . 0.0 - S7 chrM 6 0 254 0 0 254 1 C . 0.0 - S9 chrM 6 2 306 2 0 310 0 C N 0.0 - S11 chrM 7 281 0 3 0 284 0 A G 0.0105633802817 - S7 chrM 7 249 0 2 0 251 1 A G 0.00796812749004 - etc. for all covered positions per sample... - -- Running this tool with background sequence length 16569 will produce 4 files:: - - 1. Multiple alignment FASTA file containing the major allele sequences of samples S7, S9 and S11 - 2. minor allele sequence of sample S7 - 3. minor allele sequence of sample S9 - 4. minor allele sequence of sample S11 - ------ - -**Citation** - -If you use this tool, please cite Dickins B, Rebolledo-Jaramillo B, et al. *In preparation.* -(boris-at-bx.psu.edu) - - </help> -</tool> \ No newline at end of file
#!/usr/bin/env python
# File: getalleleseq/getalleleseq.py
# (added in changeset 7:28640b50ba2f, Tue Mar 18 09:05:02 2014 -0400)
# Boris Rebolledo-Jaramillo (boris-at-bx.psu.edu)
#
# NOTE: the usage text below mirrors the --help output of this script.
#
#usage: getalleleseq.py [-h] [-l INT] [-j FILE] [-d DIR] alleles
#
#Given a table with minor and major alleles per position, it generates the
#minor and major allele sequences in FASTA format
#
#positional arguments:
#  alleles               Table containing minor and major allele base per
#                        position. cols: [id, chr, pos, A, C, G, T, cvrg,
#                        plody, major, minor, freq_minor]
#
#optional arguments:
#  -h, --help            show this help message and exit
#  -l INT, --seq-length INT
#                        Background sequence length. Bases in an artifical
#                        all-N-sequence of length INT will be replaced by
#                        either the major or minor allele base accordingly
#  -j FILE, --major-seq FILE
#                        File to write major allele sequences in FASTA multiple
#                        alignment format.
#  -d DIR, --minor-dir DIR
#                        Per sample minor allele sequences will be written to
#                        this directory
#
# The expected columns in the alleles table follow Nicholas Stoler's
# Variant Annotator tool format. See Variant Annotator in Galaxy's tool shed
# http://testtoolshed.g2.bx.psu.edu/repos/nick/allele_counts_1 for more details
#
# Expected columns (numbers in parentheses apply to the 16-column,
# strand-aware layout that also carries lower-case base counts):
#  1.        sample_id
#  2.        chr
#  3.        position
#  4.        counts for A's
#  5.        counts for C's
#  6.        counts for G's
#  7.        counts for T's
#  (8.       counts for a's)
#  (9.       counts for c's)
#  (10.      counts for g's)
#  (11.      counts for t's)
#  8.  (12.) Coverage
#  9.  (13.) Number of alleles passing a given criteria
#  10. (14.) Major allele
#  11. (15.) Minor allele
#  12. (16.) Minor allele frequency in position

import sys
import os
import argparse

# Number of bases written per FASTA sequence line.
FASTA_LINE_WIDTH = 70

# Number of columns of the layout without per-strand (lower-case) counts.
UNSTRANDED_COLUMN_COUNT = 12


def createseq(sample, allele, seq_size, table):
    """Generate the major or minor allele sequence of one sample.

    Starts from an all-'N' sequence of length ``seq_size`` and, for every
    row of ``table`` whose first column equals ``sample``, replaces the base
    at the row's (1-based) position.

    Arguments:
        sample:   sample id to select rows by (column 1 of the table).
        allele:   'major' or 'minor'. For 'minor', the minor allele is used
                  where the row reports at least two alleles and the major
                  allele otherwise. Any other value leaves the sequence
                  untouched (rows are still validated).
        seq_size: length of the background sequence.
        table:    list of rows, each a list of column strings. Rows with
                  exactly 12 columns use the 12-column layout; all other
                  rows are read with the 16-column layout.

    Returns:
        A list of single-base strings of length ``seq_size``.

    Raises:
        IndexError: a selected row is too short for its layout, or its
                    position lies outside 1..seq_size.
        ValueError: the position or allele count is not an integer.
    """
    out_sequence = ['N'] * seq_size

    for entry in table:
        if entry[0] != sample:
            continue

        # The three columns of interest are contiguous; only their offset
        # differs between the two supported layouts.
        if len(entry) == UNSTRANDED_COLUMN_COUNT:
            first = 8
        else:
            first = 12
        if len(entry) < first + 3:
            raise IndexError(
                'row for sample %s has %d columns; expected 12 or 16'
                % (sample, len(entry)))

        position = int(entry[2])
        # Without this check a position <= 0 would be taken as a negative
        # list index and silently overwrite the end of the sequence.
        if not 1 <= position <= seq_size:
            raise IndexError(
                'position %d of sample %s is outside the sequence (1-%d)'
                % (position, sample, seq_size))

        number_of_alleles = int(entry[first])
        major_allele = entry[first + 1].strip()
        minor_allele = entry[first + 2].strip()

        if allele == 'minor' and number_of_alleles >= 2:
            out_sequence[position - 1] = minor_allele
        elif allele in ('major', 'minor'):
            out_sequence[position - 1] = major_allele
    return out_sequence


def printseq(sample, allele, seq, output):
    """Write one FASTA record to the open text file ``output``.

    The header line is '>' followed by ``sample`` and ``allele`` joined
    without a separator; ``seq`` (a sequence of base strings) follows in
    lines of at most 70 bases.
    """
    # output.write() is used instead of the Python-2-only "print >>" form so
    # the script runs unchanged under both Python 2 and Python 3.
    output.write('>{0}{1}\n'.format(sample, allele))
    for start in range(0, len(seq), FASTA_LINE_WIDTH):
        output.write(''.join(seq[start:start + FASTA_LINE_WIDTH]) + '\n')


def _read_table(path):
    """Return the data rows of the tab-separated alleles table at ``path``."""
    rows = []
    with open(path) as handle:
        for line in handle:
            stripped = line.strip()
            # Skip blank lines (they would become a phantom sample '') and
            # header/comment lines. Only a leading '#' marks a comment, so a
            # data row that merely contains '#' in a field is kept.
            if not stripped or stripped.startswith('#'):
                continue
            rows.append(stripped.split('\t'))
    return rows


def main():
    """Command line entry point.

    Reads the alleles table, writes every sample's major allele sequence to
    the single --major-seq file, and writes each sample's minor allele
    sequence to its own file inside --minor-dir.
    """
    parser = argparse.ArgumentParser(description='Given a table with minor and major alleles per position, it generates the minor and major allele sequences in FASTA format', epilog='Boris Rebolledo-Jaramillo (boris-at-bx.psu.edu)')
    parser.add_argument('alleles', type=str, help='Table containing minor and major allele base per position. cols: [id, chr, pos, A, C, G, T, cvrg, plody, major, minor, freq_minor] ')
    parser.add_argument('-l','--seq-length', type=int, metavar='INT', help='Background sequence length. Bases in an artifical all-N-sequence of length INT will be replaced by either the major or minor allele base accordingly')
    parser.add_argument('-j','--major-seq', type=str, metavar='FILE', help='File to write major allele sequences in FASTA multiple alignment format.')
    parser.add_argument('-d', '--minor-dir', type=str, metavar='DIR', default='.', help="Per sample minor allele sequences will be written to this directory (Default: current directory)")
    parser.add_argument('-p', '--minor-prefix', type=str, metavar='STR', nargs='?', const='', default='', help=argparse.SUPPRESS) #Galaxy compatibility
    args = parser.parse_args()

    # Both options are needed to do any work; report their absence clearly
    # instead of failing later with an unrelated error.
    if args.seq_length is None:
        parser.error('the background sequence length (-l/--seq-length) is required')
    if args.major_seq is None:
        parser.error('the major allele output file (-j/--major-seq) is required')

    try:
        table = _read_table(args.alleles)
    except (IOError, OSError):
        sys.exit('\nERROR: Could not open %s\n' % args.alleles)

    # Group the rows once so each sequence is built from its own sample's
    # rows instead of re-scanning the whole table for every sample.
    rows_by_sample = {}
    for row in table:
        rows_by_sample.setdefault(row[0], []).append(row)
    samples = sorted(rows_by_sample)

    try:
        major_out = open(args.major_seq, 'w')
    except (IOError, OSError):
        sys.exit('\nCould not create %s\n' % args.major_seq)

    try:
        # Single file for all major allele sequences in FASTA multiple alignment
        with major_out:
            for sample in samples:
                sequence = createseq(sample, 'major', args.seq_length, rows_by_sample[sample])
                printseq(sample, '', sequence, major_out)

        # Sample specific minor allele sequence in FASTA format
        try:
            os.makedirs(args.minor_dir)
        except OSError:
            # An already existing directory is fine; anything else is fatal.
            if not os.path.isdir(args.minor_dir):
                sys.exit('\nCould not create %s\n' % args.minor_dir)

        for sample in samples:
            if args.minor_prefix: # to fit Galaxy requirements
                name = sample.replace('_','')
                minor_name = "%s_%s_%s" % ('primary',args.minor_prefix,name+'-minor_visible_fasta')
            else: # for non-Galaxy
                minor_name = sample+'-minor.fa'
            sequence = createseq(sample, 'minor', args.seq_length, rows_by_sample[sample])
            with open(os.path.join(args.minor_dir, minor_name), 'w') as minor_out:
                printseq(sample, '_minor', sequence, minor_out)
    except (IndexError, ValueError) as err:
        sys.exit('\nERROR: malformed alleles table %s: %s\n' % (args.alleles, err))


if __name__ == "__main__":
    main()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/getalleleseq/getalleleseq.xml Tue Mar 18 09:05:02 2014 -0400 @@ -0,0 +1,108 @@ +<tool id="getalleleseq" name="FASTA from allele counts" version="0.0.1" force_history_refresh="True"> + <description>Generate major and minor allele sequences from alleles table</description> + <command interpreter="python">getalleleseq.py + $alleles + -l $seq_length + -j $major_seq + -p $major_seq.id +</command> + <inputs> + <param format="tabular" name="alleles" type="data" label="Table containing major and minor alleles base per position" help="must be tabular and follow the Variant Annotator tool output format"/> + <param name="seq_length" type="integer" value="16569" label="Background sequence length" help="e.g. 16569 for mitochondrial variants"/> + </inputs> + <outputs> + <data format="fasta" name="major_seq"/> + </outputs> + <tests> + <test> + <param name="alleles" value="test-table-getalleleseq.tab"/> + <param name="seq_length" value="16569"/> + <output name="major_seq" file="test-major-allele-out-getalleleseq.fa"/> + </test> + </tests> + + <help> + + +The major allele sequence of a sample is simply the sequence consisting of the most frequent nucleotide per position. +Replacing the major allele for the second most frequent allele at diploid positions generates the minor allele sequence. + +----- + +.. class:: infomark + +**What it does** + +It takes the table generated from the Variant Annotator tool to derive a major and minor allele sequence per sample. +Since all sequences share the same length all the major allele sequences are included into a single file (with proper headers per sample) +to create a multiple sequence alignment in FASTA format that can be used for downstream phylogenetic analyses. +In contrast, the minor allele sequences are informed as single FASTA files per sample to ease their downstream manipulation. + +----- + +.. 
class:: warningmark + +**Note** + +Please, follow the format described below for the input file: + +----- + +.. class:: infomark + +**Formats** + +**Variant Annotator tool output format** + +Columns:: + + 1. sample id + 2. chromosome + 3. position + 4. counts for A's + 5. counts for C's + 6. counts for G's + 7. counts for T's + 8. Coverage + 9. Number of alleles passing frequency threshold + 10. Major allele + 11. Minor allele + 12. Minor allele frequency in position + + +**FASTA multiple alignment** + +See http://www.bioperl.org/wiki/FASTA_multiple_alignment_format + +----- + +**Example** + +- For the following dataset:: + + S9 chrM 3 3 0 2 214 219 0 T A 0.013698630137 + S9 chrM 4 3 249 3 0 255 0 C N 0.0 + S9 chrM 5 245 1 1 0 247 1 A N 0.0 + S11 chrM 6 0 292 0 0 292 1 C . 0.0 + S7 chrM 6 0 254 0 0 254 1 C . 0.0 + S9 chrM 6 2 306 2 0 310 0 C N 0.0 + S11 chrM 7 281 0 3 0 284 0 A G 0.0105633802817 + S7 chrM 7 249 0 2 0 251 1 A G 0.00796812749004 + etc. for all covered positions per sample... + +- Running this tool with background sequence length 16569 will produce 4 files:: + + 1. Multiple alignment FASTA file containing the major allele sequences of samples S7, S9 and S11 + 2. minor allele sequence of sample S7 + 3. minor allele sequence of sample S9 + 4. minor allele sequence of sample S11 + +----- + +**Citation** + +If you use this tool, please cite Dickins B, Rebolledo-Jaramillo B, et al (2014). *Accepted in Biotechniques* +(boris-at-bx.psu.edu) + + </help> +</tool> \ No newline at end of file