Mercurial > repos > peterjc > seq_composition
view tools/seq_composition/seq_composition.py @ 2:240022d8c523 draft
planemo upload for repository https://github.com/peterjc/pico_galaxy/tools/seq_composition commit d9c784e64d63560ebec4de04853deed73f273dce
author | peterjc |
---|---|
date | Wed, 13 May 2015 10:47:40 -0400 |
parents | 06abd5621699 |
children | b7b9f5c5d682 |
line wrap: on
line source
#!/usr/bin/env python """Record sequence composition from FASTA, FASTQ or SFF files. This tool is a short Python script which requires Biopython 1.62 or later for SFF file support. If you use this tool in scientific work leading to a publication, please cite the Biopython application note: Cock et al 2009. Biopython: freely available Python tools for computational molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3. http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878. This script is copyright 2014 by Peter Cock, The James Hutton Institute (formerly the Scottish Crop Research Institute, SCRI), UK. All rights reserved. See accompanying text file for licence details (MIT license). Use -v or --version to get the version, -h or --help for help. """ import os import sys from optparse import OptionParser from Bio import SeqIO def sys_exit(msg, err=1): sys.stderr.write(msg.rstrip() + "\n") sys.exit(err) #Parse Command Line usage = """Example usage: $ python seq_composition.py -o my_output.tsv -q input1.fastq -q input2.fastq At least one input sequence file is required (using the -f, -q, or -s options). If the expected alphabet is given, the sequence composition is verfied against it. """ #TODO - Case senstivity? #TODO - GenBank / EMBL input? Needs the datatype defined... #TODO - Handle all the FASTQ datatype subclasses in the XML cheetah code? parser = OptionParser(usage=usage) parser.add_option('-f', '--fasta', dest='fasta', action="append", default=[], help='Input sequence filename in FASTA format') parser.add_option('-q', '--fastq', '--fastqsanger', '--fastqillumina', '--fastqsolexa', dest='fastq', action="append", default=[], help='Input sequence filename in FASTQ format') parser.add_option('-s', '--sff', dest='sff', action="append", default=[], help='Input sequence filename in SFF format') parser.add_option('-o', '--output', dest='output', default=None, help='Output filename (tabular)', metavar="FILE") parser.add_option("-v", "--version", dest="version", default=False, action="store_true", help="Show version and quit") options, args = parser.parse_args() if options.version: print("v0.0.1") sys.exit(0) if not (options.fasta or options.fastq or options.sff): sys_exit("Require an input filename") if not options.output: sys_exit("Require an output filename") file_count = 0 seq_count = 0 counts = dict() for format, filenames in [("fasta", options.fasta), ("fastq", options.fastq), ("sff-trim", options.sff), ]: for filename in filenames: file_count += 1 for record in SeqIO.parse(filename, format): seq_count += 1 for letter in record: try: counts[letter] += 1 except: counts[letter] = 1 total = sum(counts.values()) sys.stderr.write("Counted %i sequence letters from %i records from %i files\n" % (total, seq_count, file_count)) scale = 100.0 / total with open(options.output, "w") as handle: handle.write("Letter\tCount\tPercentage\n") for letter, count in sorted(counts.items()): handle.write("%s\t%i\t%0.2f\n" % (letter, count, count * scale))