# HG changeset patch # User boris # Date 1372135625 14400 # Node ID dc78c20610a14a1909743a4555f0ac94261859e0 # Parent 8f63bfc151e9fdccc51f35b224c54e17ea2d4fbf Deleted selected files diff -r 8f63bfc151e9 -r dc78c20610a1 getalleleseq/getalleleseq.py --- a/getalleleseq/getalleleseq.py Tue Jun 25 00:46:02 2013 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,118 +0,0 @@ -#!/usr/bin/env python -# Boris Rebolledo-Jaramillo (boris-at-bx.psu.edu) -# -#usage: getalleleseq.py [-h] [-l INT] [-j FILE] [-d DIR] alleles -# -#Given a table with minor and major alleles per position, it generates the -#minor and major allele sequences in FASTA format -# -#positional arguments: -# alleles Table containing minor and major allele base per -# position. cols: [id, chr, pos, A, C, G, T, cvrg, -# plody, major, minor, freq_minor] -# -#optional arguments: -# -h, --help show this help message and exit -# -l INT, --seq-length INT -# Background sequence length. Bases in an artifical -# all-N-sequence of length INT will be replaced by -# either the major or minor allele base accordingly -# -j FILE, --major-seq FILE -# File to write major allele sequences in FASTA multiple -# alignment format. -# -d DIR, --minor-dir DIR -# Per sample minor allele sequences will be written to -# this directory -# -# The expected columns in the alleles table follow Nicholas Stoler's -# allele-count tool format. See allele-count in Galaxy's tool shed -# http://testtoolshed.g2.bx.psu.edu/repos/nick/allele_counts_1 for more details -# -# Expected columns: -# 1. sample_id -# 2. chr -# 3. position -# 4 counts for A's -# 5. counts for C's -# 6. counts for G's -# 7. counts for T's -# 8. Coverage -# 9. Number of alleles passing a given criteria -# 10. Major allele -# 11. Minor allele -# 12. Minor allele frequency in position - -import sys -import os -import argparse - -def createseq(sample, allele, seq_size, table): - """Generate major or minor allele sequence""" - out_sequence = ['N' for i in range(seq_size)] - sample_data = [line for line in table if line[0] == sample] - - for entry in sample_data: - position = int(entry[2]) - number_of_alleles = int(entry[8]) - major_allele = entry[9].strip() - minor_allele = entry[10].strip() - - if allele == 'major': - out_sequence[position-1] = major_allele - elif allele == 'minor': - if number_of_alleles == 2: - out_sequence[position-1] = minor_allele - else: - out_sequence[position-1] = major_allele - return out_sequence - -def printseq(sample,allele,seq,output): - """Print out sequence""" - print >> output, '>{0}_{1}'.format(sample,allele) - for i in range(0,len(seq),70): - print >> output, ''.join(seq[i:i+70]) - -def main(): - parser = argparse.ArgumentParser(description='Given a table with minor and major alleles per position, it generates the minor and major allele sequences in FASTA format', epilog='Boris Rebolledo-Jaramillo (boris-at-bx.psu.edu)') - parser.add_argument('alleles', type=str, help='Table containing minor and major allele base per position. cols: [id, chr, pos, A, C, G, T, cvrg, plody, major, minor, freq_minor] ') - parser.add_argument('-l','--seq-length', type=int, metavar='INT', help='Background sequence length. Bases in an artifical all-N-sequence of length INT will be replaced by either the major or minor allele base accordingly') - parser.add_argument('-j','--major-seq', type=str, metavar='FILE', help='File to write major allele sequences in FASTA multiple alignment format.') - parser.add_argument('-d', '--minor-dir', type=str, metavar='DIR', default='.', help="Per sample minor allele sequences will be written to this directory (Default: current directory)") - parser.add_argument('-p', '--minor-prefix', type=str, metavar='STR', nargs='?', const='', default='', help=argparse.SUPPRESS) #Galaxy compatibility - args = parser.parse_args() - - - try: - table = [line.strip().split('\t') for line in list(open(args.alleles)) if "#" not in line] - samples = sorted(list(set([ line[0] for line in table ]))) - except: - sys.exit('\nERROR: Could not open %s\n' % args.alleles) - try: - major_out = open(args.major_seq, 'w+') - except: - sys.exit('\nCould not create %s\n' % args.major_seq) - - # Single file for all major allele sequences in FASTA multiple alignment - for sample in samples: - sequence = createseq(sample,'major',args.seq_length,table) - printseq(sample,'major',sequence,major_out) - major_out.close() - - # Sample specific minor allele sequence in FASTA format - try: - os.makedirs(args.minor_dir) - except: - pass - - for sample in samples: - if args.minor_prefix: # to fit Galaxy requirements - name = sample.replace('_','') - minor_name = "%s_%s_%s" % ('primary',args.minor_prefix,name+'-minor_visible_fasta') - else: # for non-Galaxy - minor_name = sample+'-minor.fa' - minor_out = open(os.path.join(args.minor_dir, minor_name), 'w+') - sequence = createseq(sample,'minor',args.seq_length,table) - printseq(sample,'minor',sequence,minor_out) - minor_out.close() - -if __name__ == "__main__": main() \ No newline at end of file diff -r 8f63bfc151e9 -r dc78c20610a1 getalleleseq/getalleleseq.xml --- a/getalleleseq/getalleleseq.xml Tue Jun 25 00:46:02 2013 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,106 +0,0 @@ - - Generate major and minor allele sequences from alleles table - getalleleseq.py - $alleles - -l $seq_length - -j $major_seq - -d $__new_file_path__ - -p $major_seq.id - - - - - - - - - - - - - - - - - - - -Given a set of sequencing reads, the major allele sequence of a sample is simply the sequence consisting of the most frequent nucleotide per position. -Replacing the major allele for the second most frequent allele at diploid positions generates the minor allele sequence. -(boris-at-bx.psu.edu) - - - ------ - -.. class:: infomark - -**What it does** - -It takes the table generated from the allele-count tool to derive a major and minor allele sequence per sample. -Since all sequences share the same length, there is no need to align them. Thus, all major allele sequences are output to a FASTA multiple alignment file. -Minor allele sequences are output to independent FASTA files per sample. - ------ - -.. class:: warningmark - -**Note** - -Please, follow the format described below for the input file: - ------ - -.. class:: infomark - -**Formats** - -**allele-count tool output format** - -Columns:: - - 1. sample id - 2. chromosome - 3. position - 4 counts for A's - 5. counts for C's - 6. counts for G's - 7. counts for T's - 8. Coverage - 9. Number of alleles passing a given criteria - 10. Major allele - 11. Minor allele - 12. Minor allele frequency in position - - - - -**FASTA multiple alignment** - -See http://www.bioperl.org/wiki/FASTA_multiple_alignment_format - ------ - -**Example** - -- For the following dataset:: - - S9 chrM 3 3 0 2 214 219 0 T A 0.013698630137 - S9 chrM 4 3 249 3 0 255 0 C N 0.0 - S9 chrM 5 245 1 1 0 247 1 A N 0.0 - S11 chrM 6 0 292 0 0 292 1 C . 0.0 - S7 chrM 6 0 254 0 0 254 1 C . 0.0 - S9 chrM 6 2 306 2 0 310 0 C N 0.0 - S11 chrM 7 281 0 3 0 284 0 A G 0.0105633802817 - S7 chrM 7 249 0 2 0 251 1 A G 0.00796812749004 - etc. for all covered positions per sample... - -- Running this tool with background sequence length 16569 will produce 4 files:: - - 1. Multiple alignment FASTA file containing the major allele sequences of samples S7, S9 and S11 - 2. minor allele sequence of sample S7 - 3. minor allele sequence of sample S9 - 4. minor allele sequence of sample S11 - - - \ No newline at end of file