Previous changeset 3:13bcc2f459b0 (2015-11-23) Next changeset 5:9d46c9ca7ceb (2015-11-23) |
Commit message:
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty |
added:
Makefile README.md align_families.py alignc.c alignc.so consensus consensus-time.py consensus.py consensus.pyc consensusc.c consensusc.so duplex.py duplex.sublime-project duplex.sublime-workspace galaxy/align_families.xml galaxy/duplex.xml galaxy/make_families.xml galaxy/tool_dependencies.xml loeb-2.0.sh make-barcodes.awk misc/00README.txt misc/ACCGACACAGACTAGGGATCAAAG.msa.qual.tsv misc/ACCGACACAGACTAGGGATCAAAG.msa.tsv misc/ACCGACACAGACTAGGGATCAAAG.tsv misc/bug1/CTGCGACACAATATTGGGCTCCCC.ab.2.family.msa.tsv misc/bug1/CTGCGACACAATATTGGGCTCCCC.ab.2.sscs.after.fa misc/bug1/CTGCGACACAATATTGGGCTCCCC.ab.2.sscs.before.fa misc/bug1/GAGAACTGAAACAGCAACTATCCG.ba.2.family.msa.tsv misc/bug1/GAGAACTGAAACAGCAACTATCCG.ba.2.sscs.after.fa misc/bug1/GAGAACTGAAACAGCAACTATCCG.ba.2.sscs.before.fa misc/bug1/GCCTGAAATGACGGTTGTTACATT.ab.1.family.msa.tsv misc/bug1/GCCTGAAATGACGGTTGTTACATT.ab.1.sscs.after.fa misc/bug1/GCCTGAAATGACGGTTGTTACATT.ab.1.sscs.before.fa misc/bug1/TAATACGATGACATTTCGCACCGA.ab.2.family.msa.tsv misc/bug1/TAATACGATGACATTTCGCACCGA.ab.2.sscs.after.fa misc/bug1/TAATACGATGACATTTCGCACCGA.ab.2.sscs.before.fa misc/bug1/TTTTAAGCGAAATTTACCCGTTAA.ab.2.family.msa.tsv misc/bug1/TTTTAAGCGAAATTTACCCGTTAA.ab.2.sscs.after.fa misc/bug1/TTTTAAGCGAAATTTACCCGTTAA.ab.2.sscs.before.fa misc/bug1/cmp.sh misc/bug1/cmp.txt misc/bug1/diff.family.msa.tsv misc/bug1/diff.sscs.after.fa misc/bug1/diff.sscs.before.fa misc/bug1/tmp.family.msa.tsv misc/bug1/tmp.sscs.after.fa misc/bug1/tmp.sscs.before.fa misc/family.align.fa misc/family.cons.fa misc/family.fa misc/family.msa.tsv misc/family2.align.fa misc/family2.cons.fa misc/family2.fa misc/family3.align.fa misc/family3.cons.fa misc/msa_sscs_matcher.py misc/read.fa misc/sscs_diff.py pipeline.sh planemo-template/cat.xml planemo-template/random_lines_two_pass.py planemo-template/randomlines.xml planemo-template/test-data/1.bed planemo-template/test-data/1_bed_random_lines_1_seed_asdf_out.bed seqtools.py seqtools.pyc seqtoolsc.c seqtoolsc.so swalign.py 
swalign.pyc swalignc.c swalignc.h swalignc.so test.fa test.fq test.py test.sam test.sscs.fa test_1.fa test_2.fa tests/families.cons.fa tests/families.cons.incl-sscs.fa tests/families.in.tsv tests/families.msa.tsv tests/gapqual.cons.fa tests/gapqual.msa.tsv tests/gaps-diffs.out.tsv tests/gaps.msa.tsv tests/qual.cons.fa tests/qual.msa.tsv tests/quirks.msa.tsv tests/run.sh tests/smoke.families.aligned.tsv tests/smoke.families.i0.tsv tests/smoke.families.tsv tests/smoke_1.fq tests/smoke_2.fq utils/get_msa.py utils/getreads.py utils/getreads.pyc utils/msa2fa.awk utils/outconv.awk utils/precheck.py utils/stats.py utils/subsample.py |
removed:
align_families.xml duplex.xml make_families.xml tool_dependencies.xml |
b |
diff -r 13bcc2f459b0 -r af383638de66 Makefile --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Makefile Mon Nov 23 18:44:23 2015 -0500 |
b |
@@ -0,0 +1,8 @@ +CFLAGS=-Wall + +all: + gcc -Wall -shared -fPIC alignc.c -o alignc.so + gcc -Wall -shared -fPIC swalignc.c -o swalignc.so -lm + gcc -Wall -shared -fPIC seqtoolsc.c -o seqtoolsc.so + gcc -Wall -shared -fPIC consensusc.c -o consensusc.so + |
b |
diff -r 13bcc2f459b0 -r af383638de66 README.md --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/README.md Mon Nov 23 18:44:23 2015 -0500 |
[ |
@@ -0,0 +1,95 @@ +Du Novo +=========== + +This is a simple pipeline to process duplex sequencing data without the use of a reference sequence. + +The pipeline was designed for use with the duplex method described in [Kennedy *et al.* 2014](https://dx.doi.org/10.1038/nprot.2014.170), but the assumptions are relatively minimal, so you should be able to apply it to variants of the protocol. + + +### Requirements + +The pipeline requires a Unix command line, and it must be able to find the `mafft` command on your [`PATH`](https://en.wikipedia.org/wiki/Search_path). + +All known requirements are below. Version numbers in parentheses are what the development environment uses. Version numbers in **bold** are known to be required. + +* [MAFFT](http://mafft.cbrc.jp/alignment/software/) (v7.123b) +* [Python](https://www.python.org/) (**2.7**) +* [gcc](https://gcc.gnu.org/) (4.8.4) +Standard unix tools: +* [bash](https://www.gnu.org/software/bash/bash.html) (4.0) +* [awk](https://www.gnu.org/software/gawk/) (4.0.1) +* [paste](https://www.gnu.org/software/coreutils/coreutils.html), [sort](https://www.gnu.org/software/coreutils/coreutils.html), [cat](https://www.gnu.org/software/coreutils/coreutils.html) (8.21) + + +### Installation + +`git clone` the source to any directory, or click "Download ZIP", unzip it, and place the "duplex-master" directory anywhere. + +You'll need to compile the C modules before using it. Do this in a terminal by `cd`ing to the source directory (where the file `Makefile` is) and run the command `make`. + + +### Usage + +This example shows how to go from raw duplex sequencing data to the final duplex consensus sequences. + +The example assumes you want to process duplex reads in the files `reads_1.fastq` and `reads_2.fastq`, and have `cd`'d to their directory. It also assumes you've placed the commands `align_families.py` and `duplex.py` on your `PATH`. 
Note that where it says `make-barcodes.awk`, you should replace that with the actual path to the script `make-barcodes.awk` included in this pipeline. + +1. Sort the reads into families based on their barcodes. + ```bash + $ cat reads_1.fastq | paste - - - - \ + | paste - <(cat reads_2.fastq | paste - - - -) \ + | awk -f make-barcodes.awk \ + | sort > families.tsv + ``` + +2. Do multiple sequence alignments of the read families. +`$ align_families.py families.tsv > families.msa.tsv` + +3. Build duplex consensus sequences from the aligned families. +`$ duplex.py families.msa.tsv > duplex.fa` + +See all options for a given command by giving it the `-h` flag. + + +### Details + +##### 1. Sort the reads into families based on their barcodes. + + $ cat reads_1.fastq | paste - - - - \ + | paste - <(cat reads_2.fastq | paste - - - -) \ + | awk -f make-barcodes.awk \ + | sort > families.tsv + +This command pipeline will transform each pair of reads into a one-line record, split the 12bp barcodes off them, and sort by their combined barcode. The end result is a file (named `families.tsv` above) listing read pairs, grouped by barcode. See `make-barcodes.awk` for the details on the formation of the barcodes and the format. + +Note: This step requires your FASTQ files to have exactly 4 lines per read (no multi-line sequences). Also, in the output, the read sequence does not include the barcode or the 5bp constant sequence after it. You can customize the length of the barcode or constant sequence by setting the awk constants `BAR_LEN` and `INVARIANT` (e.g. `awk -v BAR_LEN=10 -f make-barcodes.awk`). + + +##### 2. Do multiple sequence alignments of the read families. + +`$ align_families.py families.tsv > families.msa.tsv` + +This step aligns each family of reads, but it processes each strand separately. It can be parallelized with the `-p` option, but at the moment that will cause the output to only be generated at the end, instead of streaming it as it's generated. 
+ + +##### 3. Build duplex consensus sequences from the aligned families. + +`$ duplex.py families.msa.tsv > duplex.fa` + +This calls a consensus sequence from the multiple sequence alignments of the previous step. It does this in two steps: First, single-strand consensus sequences (SSCSs) are called from the family alignments, then duplex consensus sequences are called from pairs of SSCSs. + +When calling SSCSs, by default 3 reads are required to successfully create a consensus from each strand. Quality filtering is done at this step by excluding bases below a quality threshold. By default, no base with a PHRED quality less than 20 will contribute to the consensus. If no base passes the threshold or there is no majority base, an `N` will be inserted. + +The duplex consensus sequences are created by comparing the two SSCSs. For each base, if they agree, that base will be inserted. If they disagree, the IUPAC ambiguity code for the two bases will be used. Note that a disagreement between a base and a gap will result in an `N`. A planned feature is to use information from the raw reads contributing to the duplex to make a call in such a case, coding uncertainty into quality scores. + +The output of this step is the duplex consensus sequences in FASTA format. By default, it will only include full duplex consensuses, meaning if one of the two SSCSs is missing, that sequence will be omitted. But these sequences can be included with the `--incl-sscs` option, which will add lone SSCSs to the output. + +The reads will be printed in one interleaved file, with the naming format + + >{barcode}.{mate} {# reads in strand 1 family}/{# reads in strand 2 family} + >TTGCGCCAGGGCGAGGAAAATACT.1 8/13 + +But this isn't easy to work with. 
A better output is in development, but for now you can use the awk script `outconv.awk` to convert the interleaved output file into two standard forward/reverse paired files with a standard naming convention: + + $ awk -f utils/outconv.awk -v target=1 duplex.fa > duplex_1.fa + $ awk -f utils/outconv.awk -v target=2 duplex.fa > duplex_2.fa |
b |
diff -r 13bcc2f459b0 -r af383638de66 align_families.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/align_families.py Mon Nov 23 18:44:23 2015 -0500 |
[ |
b'@@ -0,0 +1,281 @@\n+#!/usr/bin/env python\n+from __future__ import division\n+import os\n+import sys\n+import time\n+import tempfile\n+import argparse\n+import subprocess\n+import collections\n+import multiprocessing\n+import distutils.spawn\n+import seqtools\n+\n+#TODO: Warn if it looks like the two input FASTQ files are the same (i.e. the _1 file was given\n+# twice). Can tell by whether the alpha and beta (first and last 12bp) portions of the barcodes\n+# are always identical. This would be a good thing to warn about, since it\'s an easy mistake\n+# to make, but it\'s not obvious that it happened. The pipeline won\'t fail, but will just\n+# produce pretty weird results.\n+\n+REQUIRED_COMMANDS = [\'mafft\']\n+OPT_DEFAULTS = {\'processes\':1}\n+DESCRIPTION = """Read in sorted FASTQ data and do multiple sequence alignments of each family."""\n+\n+\n+def main(argv):\n+\n+ parser = argparse.ArgumentParser(description=DESCRIPTION)\n+ parser.set_defaults(**OPT_DEFAULTS)\n+\n+ parser.add_argument(\'infile\', metavar=\'read-families.tsv\', nargs=\'?\',\n+ help=\'The input reads, sorted into families. One line per read pair, 8 tab-delimited columns: \'\n+ \'1. canonical barcode, 2. barcode order ("ab" for alpha+beta, "ba" for beta-alpha) 3. \'\n+ \'read 1 name, 4. read 1 sequence, 5. read 1 quality scores, 6. read 2 name, 7. read 2 \'\n+ \'sequence, 8. read 2 quality scores.\')\n+ parser.add_argument(\'-p\', \'--processes\', type=int,\n+ help=\'Number of worker subprocesses to use. Must be at least 1. 
Default: %(default)s.\')\n+\n+ args = parser.parse_args(argv[1:])\n+\n+ assert args.processes > 0, \'-p must be greater than zero\'\n+\n+ # Check for required commands.\n+ missing_commands = []\n+ for command in REQUIRED_COMMANDS:\n+ if not distutils.spawn.find_executable(command):\n+ missing_commands.append(command)\n+ if missing_commands:\n+ fail(\'Error: Missing commands: "\'+\'", "\'.join(missing_commands)+\'".\')\n+\n+ if args.infile:\n+ infile = open(args.infile)\n+ else:\n+ infile = sys.stdin\n+\n+ # Open all the worker processes.\n+ workers = open_workers(args.processes)\n+\n+ # Main loop.\n+ # This processes whole duplexes (pairs of strands) at a time for a future option to align the\n+ # whole duplex at a time.\n+ stats = {\'duplexes\':0, \'time\':0, \'pairs\':0, \'runs\':0, \'aligned_pairs\':0}\n+ current_worker_i = 0\n+ duplex = collections.OrderedDict()\n+ family = []\n+ barcode = None\n+ order = None\n+ for line in infile:\n+ fields = line.rstrip(\'\\r\\n\').split(\'\\t\')\n+ if len(fields) != 8:\n+ continue\n+ (this_barcode, this_order, name1, seq1, qual1, name2, seq2, qual2) = fields\n+ # If the barcode or order has changed, we\'re in a new family.\n+ # Process the reads we\'ve previously gathered as one family and start a new family.\n+ if this_barcode != barcode or this_order != order:\n+ duplex[order] = family\n+ # If the barcode is different, we\'re at the end of the whole duplex. Process the it and start\n+ # a new one. 
If the barcode is the same, we\'re in the same duplex, but we\'ve switched strands.\n+ if this_barcode != barcode:\n+ # sys.stderr.write(\'processing {}: {} orders ({})\\n\'.format(barcode, len(duplex),\n+ # \'/\'.join([str(len(duplex[order])) for order in duplex])))\n+ output, run_stats, current_worker_i = delegate(workers, stats, duplex, barcode)\n+ process_results(output, run_stats, stats)\n+ duplex = collections.OrderedDict()\n+ barcode = this_barcode\n+ order = this_order\n+ family = []\n+ pair = {\'name1\': name1, \'seq1\':seq1, \'qual1\':qual1, \'name2\':name2, \'seq2\':seq2, \'qual2\':qual2}\n+ family.append(pair)\n+ stats[\'pairs\'] += 1\n+ # Process the last family.\n+ duplex[order] = family\n+ # sys.stderr.write(\'processing {}: {} orders ({}) [last]\\n\'.format(barcode, len(duplex),\n+ # \'/\'.join([str(len(duplex[order])) for order in duplex])))\n+ output, run_stats, current_worker_i = delegate(workers, stats, duplex, barcode)\n+ '..b'art\n+ pairs = len(family)\n+ #logging.info(\'{} sec for {} read pairs.\'.format(elapsed, pairs))\n+ if pairs > 1:\n+ run_stats[\'time\'] += elapsed\n+ run_stats[\'runs\'] += 1\n+ run_stats[\'aligned_pairs\'] += pairs\n+ if alignment is None:\n+ pass #logging.warning(\'Error aligning family {}/{} (read {}).\'.format(barcode, order, mate))\n+ else:\n+ output += format_msa(alignment, barcode, order, mate)\n+ return output, run_stats\n+\n+\n+def align_family(family, mate):\n+ """Do a multiple sequence alignment of the reads in a family and their quality scores."""\n+ mate = str(mate)\n+ assert mate == \'1\' or mate == \'2\'\n+ # Do the multiple sequence alignment.\n+ seq_alignment = make_msa(family, mate)\n+ if seq_alignment is None:\n+ return None\n+ # Transfer the alignment to the quality scores.\n+ seqs = [read[\'seq\'] for read in seq_alignment]\n+ quals_raw = [pair[\'qual\'+mate] for pair in family]\n+ qual_alignment = seqtools.transfer_gaps_multi(quals_raw, seqs, gap_char_out=\' \')\n+ # Package them up in the output 
data structure.\n+ alignment = []\n+ for aligned_seq, aligned_qual in zip(seq_alignment, qual_alignment):\n+ alignment.append({\'name\':aligned_seq[\'name\'], \'seq\':aligned_seq[\'seq\'], \'qual\':aligned_qual})\n+ return alignment\n+\n+\n+def make_msa(family, mate):\n+ """Perform a multiple sequence alignment on a set of sequences and parse the result.\n+ Uses MAFFT."""\n+ mate = str(mate)\n+ assert mate == \'1\' or mate == \'2\'\n+ if len(family) == 0:\n+ return None\n+ elif len(family) == 1:\n+ # If there\'s only one read pair, there\'s no alignment to be done (and MAFFT won\'t accept it).\n+ return [{\'name\':family[0][\'name\'+mate], \'seq\':family[0][\'seq\'+mate]}]\n+ #TODO: Replace with tempfile.mkstemp()?\n+ with tempfile.NamedTemporaryFile(\'w\', delete=False, prefix=\'align.msa.\') as family_file:\n+ for pair in family:\n+ name = pair[\'name\'+mate]\n+ seq = pair[\'seq\'+mate]\n+ family_file.write(\'>\'+name+\'\\n\')\n+ family_file.write(seq+\'\\n\')\n+ with open(os.devnull, \'w\') as devnull:\n+ try:\n+ command = [\'mafft\', \'--nuc\', \'--quiet\', family_file.name]\n+ output = subprocess.check_output(command, stderr=devnull)\n+ except (OSError, subprocess.CalledProcessError):\n+ return None\n+ os.remove(family_file.name)\n+ return read_fasta(output, is_file=False, upper=True)\n+\n+\n+def read_fasta(fasta, is_file=True, upper=False):\n+ """Quick and dirty FASTA parser. Return the sequences and their names.\n+ Returns a list of sequences. 
Each is a dict of \'name\' and \'seq\'.\n+ Warning: Reads the entire contents of the file into memory at once."""\n+ sequences = []\n+ sequence = \'\'\n+ seq_name = None\n+ if is_file:\n+ with open(fasta) as fasta_file:\n+ fasta_lines = fasta_file.readlines()\n+ else:\n+ fasta_lines = fasta.splitlines()\n+ for line in fasta_lines:\n+ if line.startswith(\'>\'):\n+ if upper:\n+ sequence = sequence.upper()\n+ if sequence:\n+ sequences.append({\'name\':seq_name, \'seq\':sequence})\n+ sequence = \'\'\n+ seq_name = line.rstrip(\'\\r\\n\')[1:]\n+ continue\n+ sequence += line.strip()\n+ if upper:\n+ sequence = sequence.upper()\n+ if sequence:\n+ sequences.append({\'name\':seq_name, \'seq\':sequence})\n+ return sequences\n+\n+\n+def format_msa(align, barcode, order, mate, outfile=sys.stdout):\n+ output = \'\'\n+ for sequence in align:\n+ output += \'{bar}\\t{order}\\t{mate}\\t{name}\\t{seq}\\t{qual}\\n\'.format(bar=barcode, order=order,\n+ mate=mate, **sequence)\n+ return output\n+\n+\n+def process_results(output, run_stats, stats):\n+ """Process the outcome of a duplex run.\n+ Print the aligned output and sum the stats from the run with the running totals."""\n+ for key, value in run_stats.items():\n+ stats[key] += value\n+ if output:\n+ sys.stdout.write(output)\n+\n+\n+def fail(message):\n+ sys.stderr.write(message+"\\n")\n+ sys.exit(1)\n+\n+if __name__ == \'__main__\':\n+ sys.exit(main(sys.argv))\n' |
b |
diff -r 13bcc2f459b0 -r af383638de66 align_families.xml --- a/align_families.xml Mon Nov 23 18:07:11 2015 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
b |
@@ -1,59 +0,0 @@ -<?xml version="1.0"?> -<tool id="align_families" name="Align families" version="0.1"> - <description>from duplex sequencing data</description> - <requirements> - <requirement type="package" version="7.221">mafft</requirement> - <requirement type="package" version="0.1">duplex</requirement> - </requirements> - <command interpreter="python" detect_errors="exit_code">align_families.py $input > $output - </command> - <inputs> - <param name="input" type="data" format="tabular" label="Input reads" help="with barcodes, grouped by family"/> - </inputs> - <outputs> - <data name="output" format="tabular"/> - </outputs> - <tests> - <test> - <param name="input" value="smoke.families.tsv"/> - <output name="output" file="smoke.families.aligned.tsv"/> - </test> - </tests> - <help> - -**What it does** - -This is for processing duplex sequencing data. It does a multiple sequence alignment on each (single-stranded) family of reads. - ------ - -**Input** - -This expects the output format of the "Make families" tool. - ------ - -**Output** - -The output is a tabular file where each line corresponds to a (single) read. - -The columns are:: - - 1: barcode (both tags) - 2: tag order in barcode ("ab" or "ba") - 3: read mate ("1" or "2") - 4: read name - 5: read sequence, aligned ("-" for gaps) - 6: read quality scores, aligned (" " for gaps) - ------ - -**Alignments** - -The alignments are done using MAFFT, specifically the command -:: - - $ mafft --nuc --quiet family.fa > family.aligned.fa - - </help> -</tool> |
b |
diff -r 13bcc2f459b0 -r af383638de66 alignc.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/alignc.c Mon Nov 23 18:44:23 2015 -0500 |
[ |
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define NAIVE_TEST_WINDOW 6
#define NAIVE_TEST_THRES 0.80
#define NAIVE_TEST_MIN 2
#define NAIVE_WINDOW 10
#define NAIVE_THRES 0.80

// One gap to be inserted into one of the two sequences being aligned.
typedef struct Gap {
  int seq;          // Which sequence receives the gap (1 or 2).
  int coord;        // Index of the last base copied before the gap (see insert_gaps()).
  int length;       // Number of '-' characters to insert.
  struct Gap *next; // Next gap in the list, or 0 at the tip.
} Gap;

// Singly linked list of gaps, kept in insertion order.
typedef struct Gaps {
  int length;       // Number of Gap nodes in the list.
  struct Gap *root; // First gap, or 0 when the list is empty.
  struct Gap *tip;  // Last gap, or 0 when the list is empty.
} Gaps;

int _test_match(char *seq1, int start1, char *seq2, int start2);
void add_gap(Gaps *gaps, int seq, int coord, int length);
Gaps *make_gaps();
char *insert_gaps(Gaps *gaps, char *seq, int seq_num);
static void free_gaps(Gaps *gaps);


// Release a gap list created by make_gaps() together with every node added by add_gap().
static void free_gaps(Gaps *gaps) {
  if (gaps == 0) {
    return;
  }
  Gap *gap = gaps->root;
  while (gap) {
    Gap *next = gap->next;
    free(gap);
    gap = next;
  }
  free(gaps);
}


// A naive algorithm for aligning two sequences which are expected to be very similar to each other
// and already nearly aligned.
// Walks both sequences in lockstep. At a mismatch it slides forward in each sequence in turn until
// _test_match() accepts the bases that follow, then records the skipped stretch as a gap in the
// other sequence. Progress and the final gapped sequences are printed to stdout; nothing is
// returned.
void naive2(char *seq1, char *seq2) {
  Gaps *gaps = make_gaps();
  if (gaps == 0) {
    // Allocation failed; there is nothing to record gaps in.
    return;
  }
  int i = 0;
  int j = 0;
  while (seq1[i] != 0 && seq2[j] != 0) {
    // Match?
    printf("%c %c | i %d j %d\n", seq1[i], seq2[j], i, j);
    if (seq1[i] == seq2[j]) {
      i++;
      j++;
      continue;
    }
    printf("mismatch!\n");
    // Mismatch. Start adding gaps until the mismatches go away.
    int new_i = i;
    int new_j = j;
    int gap_seq = 0;
    int success;
    while (1) {
      if (seq1[new_i] == 0 && seq2[new_j] == 0) {
        break;
      }
      // Does skipping ahead in seq1 (i.e. a gap in seq2) restore the match?
      success = _test_match(seq1, new_i, seq2, j);
      if (success) {
        gap_seq = 2;
        break;
      }
      if (seq1[new_i] != 0) {
        new_i++;
      }
      // Does skipping ahead in seq2 (i.e. a gap in seq1) restore the match?
      success = _test_match(seq1, i, seq2, new_j);
      if (success) {
        gap_seq = 1;
        break;
      }
      if (seq2[new_j] != 0) {
        new_j++;
      }
    }
    // Which sequence are we putting the gap in?
    if (gap_seq == 0) {
      printf("No good gap found. new_i: %d, new_j: %d\n", new_i, new_j);
      // No good gap found.
    } else if (i == new_i && j == new_j) {
      printf("No gap required.\n");
    } else if (gap_seq == 1) {
      printf("%dbp gap in seq1 at base %d.\n", new_j-j, j);
      add_gap(gaps, 1, j, new_j-j);
      j = new_j;
    } else if (gap_seq == 2) {
      printf("%dbp gap in seq2 at base %d.\n", new_i-i, i);
      add_gap(gaps, 2, i, new_i-i);
      i = new_i;
    }
    i++;
    j++;
  }

  char *new_seq1 = insert_gaps(gaps, seq1, 1);
  char *new_seq2 = insert_gaps(gaps, seq2, 2);
  if (new_seq1 != 0 && new_seq2 != 0) {
    printf("alignment:\n%s\n%s\n", new_seq1, new_seq2);
  }
  // insert_gaps() hands back the input pointer itself when the gap list is empty, so only free
  // buffers it actually allocated.
  if (new_seq1 != seq1) {
    free(new_seq1);
  }
  if (new_seq2 != seq2) {
    free(new_seq2);
  }
  free_gaps(gaps);
}

// Check if the few bases starting at start1 and start2 in seq1 and seq2, respectively, align with
// few mismatches. Up to NAIVE_TEST_WINDOW-1 positions are compared (fewer if either sequence ends
// first). Returns nonzero only when more than NAIVE_TEST_MIN positions were compared and the
// fraction that matched is greater than NAIVE_TEST_THRES.
// NOTE(review): the loop bound is NAIVE_TEST_WINDOW-1, not NAIVE_TEST_WINDOW; confirm whether the
// shorter window is intended.
int _test_match(char *seq1, int start1, char *seq2, int start2) {
  int matches = 0;
  int total = 0;
  char base1, base2;
  int i;
  for (i = 0; i < NAIVE_TEST_WINDOW-1; i++) {
    base1 = seq1[start1+i];
    base2 = seq2[start2+i];
    if (base1 == 0 || base2 == 0) {
      break;
    }
    if (base1 == base2) {
      matches++;
    }
    total++;
  }
  return total > NAIVE_TEST_MIN && (double)matches/total > NAIVE_TEST_THRES;
}

// Allocate an empty gap list. Returns 0 if the allocation fails.
Gaps *make_gaps() {
  Gaps *gaps = malloc(sizeof(Gaps));
  if (gaps == 0) {
    return 0;
  }
  gaps->root = 0;
  gaps->tip = 0;
  gaps->length = 0;
  return gaps;
}

// Append a gap of `length` characters for sequence number `seq` at index `coord` to the end of
// the list. The list is left unchanged if `gaps` is 0 or the node cannot be allocated.
void add_gap(Gaps *gaps, int seq, int coord, int length) {
  if (gaps == 0) {
    return;
  }
  Gap *gap = malloc(sizeof(Gap));
  if (gap == 0) {
    return;
  }
  gap->next = 0;
  gap->seq = seq;
  gap->coord = coord;
  gap->length = length;
  if (gaps->root == 0) {
    gaps->root = gap;
  } else {
    gaps->tip->next = gap;
  }
  gaps->tip = gap;
  gaps->length++;
}

// Take gap information from the aligner and put them into the sequence string as "-" characters.
// Only gaps whose `seq` equals `seq_num` are applied, in list order. For each one, the bases up to
// and including index `coord` of the input are copied, followed by `length` dashes.
// Returns `seq` itself (not a copy) when the gap list is empty; otherwise a newly malloc'd,
// NUL-terminated string the caller must free, or 0 if the allocation fails.
char *insert_gaps(Gaps *gaps, char *seq, int seq_num) {
  if (gaps == 0 || gaps->root == 0) {
    return seq;
  }

  // How long should the new sequence be?
  int extra_len = 0;
  Gap *gap = gaps->root;
  while (gap) {
    if (gap->seq == seq_num && gap->length > 0) {
      extra_len += gap->length;
    }
    gap = gap->next;
  }

  int new_len = extra_len + (int)strlen(seq) + 1;
  char *new_seq = malloc(sizeof(char) * new_len);
  if (new_seq == 0) {
    return 0;
  }
  int i = 0;
  int j = 0;
  int k;
  gap = gaps->root;
  while (gap) {
    // Check that it's a gap in our sequence.
    if (gap->seq != seq_num) {
      gap = gap->next;
      continue;
    }
    // Copy verbatim all the sequence until the gap. Stop at the terminator so that a coordinate
    // beyond the end of the sequence cannot cause an out-of-bounds read.
    while (i <= gap->coord && seq[i] != 0) {
      new_seq[j] = seq[i];
      i++;
      j++;
    }
    // Add -'s the whole length of the gap. Count the dashes directly: the output index j is
    // already shifted by earlier gaps, so it cannot be compared against the input coordinate.
    for (k = 0; k < gap->length; k++) {
      new_seq[j] = '-';
      j++;
    }
    gap = gap->next;
  }
  // Fill in the end sequence.
  while (seq[i]) {
    new_seq[j] = seq[i];
    i++;
    j++;
  }
  new_seq[j] = 0;
  return new_seq;
}
b |
diff -r 13bcc2f459b0 -r af383638de66 alignc.so |
b |
Binary file alignc.so has changed |
b |
diff -r 13bcc2f459b0 -r af383638de66 consensus |
b |
Binary file consensus has changed |
b |
diff -r 13bcc2f459b0 -r af383638de66 consensus-time.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/consensus-time.py Mon Nov 23 18:44:23 2015 -0500 |
[ |
#!/usr/bin/env python
"""Time repeated calls to get_votes() from ./consensus.so.

Usage: consensus-time.py seqs.fa

Every line of the input that does not start with '>' is treated as one sequence.
Prints two tab-separated numbers, both in microseconds: the duration of the
first call, and the mean duration of all later calls.
"""
import sys
import time
import ctypes

first_elapsed = None
later_elapsed = []
for run in range(10000):
    # The input is re-read and re-marshalled on every run; only the call itself is timed.
    with open(sys.argv[1]) as fasta:
        reads = [row.strip() for row in fasta if not row.startswith('>')]
    longest = max([len(read) for read in reads] or [0])

    c_reads = (ctypes.c_char_p * len(reads))()
    for idx, read in enumerate(reads):
        c_reads[idx] = ctypes.c_char_p(read)

    library = ctypes.cdll.LoadLibrary('./consensus.so')
    began = time.time()
    library.get_votes(c_reads, len(c_reads), longest)
    # Seconds -> microseconds.
    micros = 1000 * 1000 * (time.time() - began)
    if run == 0:
        first_elapsed = micros
    else:
        later_elapsed.append(micros)

print('{:0.1f}\t{:0.1f}'.format(first_elapsed, sum(later_elapsed)/len(later_elapsed)))
b |
diff -r 13bcc2f459b0 -r af383638de66 consensus.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/consensus.py Mon Nov 23 18:44:23 2015 -0500 |
[ |
"""ctypes wrappers around the consensus-calling functions in consensusc.so.

The shared library is loaded from the directory containing this file at import
time. All three C functions are declared as returning char *, which ctypes
converts to a Python string.
"""
import os
import ctypes

script_dir = os.path.dirname(os.path.realpath(__file__))
consensus = ctypes.cdll.LoadLibrary(os.path.join(script_dir, 'consensusc.so'))
# NOTE(review): with restype c_char_p ctypes copies the returned C string into a Python string;
# nothing here frees the C-side buffer afterward. Confirm whether that leak matters for long runs.
consensus.get_consensus.restype = ctypes.c_char_p
consensus.get_consensus_duplex.restype = ctypes.c_char_p
consensus.build_consensus_duplex_simple.restype = ctypes.c_char_p


def _as_c_string_array(strings):
  """Return a new ctypes char *[] holding the given Python strings, or None if there are none.
  None is what ctypes passes as a C NULL pointer, which the C code checks for ("quals == 0")."""
  if not strings:
    return None
  array = (ctypes.c_char_p * len(strings))()
  for i, string in enumerate(strings):
    array[i] = ctypes.c_char_p(string)
  return array


def _common_length(seqs):
  """Return the length shared by every string in seqs (0 if seqs is empty).
  Raises an AssertionError if the strings are not all the same length."""
  seq_len = 0
  for i, seq in enumerate(seqs):
    if i == 0:
      seq_len = len(seq)
    else:
      assert seq_len == len(seq), 'All sequences in the alignment must be the same length.'
  return seq_len


# N.B.: The quality scores must be aligned with their accompanying sequences.
def get_consensus(align, quals=None, cons_thres=-1.0, qual_thres=' ', gapped=False):
  """Build a consensus sequence from one aligned family of reads.
  align:      List of aligned sequences, all the same length.
  quals:      Optional list of quality strings, one per sequence, aligned with the sequences.
              Omit (or give an empty list) to tally bases without using quality scores.
  cons_thres: Consensus threshold. -1.0 makes the C code use its default.
  qual_thres: Quality threshold, as a single character.
  gapped:     If true, keep gap characters in the returned consensus.
  Returns the consensus as a string."""
  quals = quals or []
  n_seqs = len(align)
  assert not quals or len(quals) == n_seqs, 'Different number of sequences and quals.'
  seq_len = _common_length(list(align) + list(quals))
  # The C function indexes align unconditionally, so it always gets a real (possibly empty) array.
  align_c = (ctypes.c_char_p * n_seqs)()
  for i, seq in enumerate(align):
    align_c[i] = ctypes.c_char_p(seq)
  quals_c = _as_c_string_array(quals)
  cons_thres_c = ctypes.c_double(cons_thres)
  qual_thres_c = ctypes.c_char(qual_thres)
  gapped_c = 1 if gapped else 0
  return consensus.get_consensus(align_c, quals_c, n_seqs, seq_len, cons_thres_c, qual_thres_c,
                                 gapped_c)


# N.B.: The quality scores must be aligned with their accompanying sequences.
def get_consensus_duplex(align1, align2, quals1=None, quals2=None, cons_thres=-1.0, qual_thres=' ',
                         method='iupac', gapped=False):
  """Build a duplex consensus directly from the two aligned single-strand families.
  align1, align2: Lists of aligned sequences for each strand. Every sequence in both lists must be
                  the same length.
  quals1, quals2: Optional quality strings for each strand. Give both or neither.
  cons_thres:     Consensus threshold. -1.0 makes the C code use its default.
  qual_thres:     Quality threshold, as a single character.
  method:         'iupac' or 'freq': how the C code combines the two strands.
  gapped:         If true, keep gap characters in the returned consensus.
  Returns the consensus as a string."""
  assert method in ('iupac', 'freq')
  quals1 = quals1 or []
  quals2 = quals2 or []
  n_seqs1 = len(align1)
  n_seqs2 = len(align2)
  assert (not quals1 and not quals2) or (quals1 and quals2)
  assert not quals1 or len(quals1) == n_seqs1
  assert not quals2 or len(quals2) == n_seqs2
  seq_len = _common_length(list(align1) + list(align2) + list(quals1) + list(quals2))
  # Each array must be sized by its own strand's read count: the two families can differ in size.
  align1_c = (ctypes.c_char_p * n_seqs1)()
  for i, seq in enumerate(align1):
    align1_c[i] = ctypes.c_char_p(seq)
  align2_c = (ctypes.c_char_p * n_seqs2)()
  for i, seq in enumerate(align2):
    align2_c[i] = ctypes.c_char_p(seq)
  quals1_c = _as_c_string_array(quals1)
  quals2_c = _as_c_string_array(quals2)
  cons_thres_c = ctypes.c_double(cons_thres)
  qual_thres_c = ctypes.c_char(qual_thres)
  gapped_c = 1 if gapped else 0
  # The C prototype takes "int gapped" before "char *method"; both must be passed, in that order.
  return consensus.get_consensus_duplex(align1_c, align2_c, quals1_c, quals2_c, n_seqs1, n_seqs2,
                                        seq_len, cons_thres_c, qual_thres_c, gapped_c,
                                        ctypes.c_char_p(method))


def build_consensus_duplex_simple(cons1, cons2, gapped=False):
  """Combine two single-strand consensus sequences of equal length into a duplex consensus.
  If gapped is true, gap characters are kept in the result. Returns the consensus as a string."""
  assert len(cons1) == len(cons2)
  cons1_c = ctypes.c_char_p(cons1)
  cons2_c = ctypes.c_char_p(cons2)
  gapped_c = 1 if gapped else 0
  return consensus.build_consensus_duplex_simple(cons1_c, cons2_c, gapped_c)
b |
diff -r 13bcc2f459b0 -r af383638de66 consensus.pyc |
b |
Binary file consensus.pyc has changed |
b |
diff -r 13bcc2f459b0 -r af383638de66 consensusc.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/consensusc.c Mon Nov 23 18:44:23 2015 -0500 |
[ |
b'@@ -0,0 +1,556 @@\n+#include <stdio.h>\n+#include <stdlib.h>\n+#include <string.h>\n+#include <ctype.h>\n+#include <limits.h>\n+\n+// N.B. This defines the valid bases, but it\'s also effectively defined in the switches in\n+// get_votes_simple(), get_votes_qual(), and get_base_prime(), and in the constant IUPAC_BASES.\n+#define N_BASES 6\n+const char *BASES = "ACGTN-";\n+/* A C G T N - A: 2 Compute IUPAC ambiguous base character by representing each base\n+A 4 6 10 14 22 26 C: 3 with a prime and multiplying. Then use a lookup table (an array\n+C 9 15 21 33 39 G: 5 where the index is the product of the two primes).\n+G 25 35 55 65 T: 7\n+T 49 77 91 N: 11\n+N 121 143 -: 13 1 2 3 4 5 6 7\n+- 169 01234567890123456789012345678901234567890123456789012345678901234567890*/\n+const char *IUPAC_BASES = "N...A.M..CR...WS.....YN..GN......N.K...N.........T.....N.........N....."\n+// 8 9 10 11 12 13 14\n+ "......N.............N.............................N..................."\n+// 15 16 17\n+ "..N.........................-";\n+#define THRES_DEFAULT 0.5\n+#define WIN_LEN 4\n+#define GAP_CHAR \' \'\n+\n+int **get_votes_simple(char *align[], int n_seqs, int seq_len);\n+int **get_votes_qual(char *align[], char *quals[], int n_seqs, int seq_len, char thres);\n+int init_gap_qual_window(int *window, char *quals, int seq_len);\n+char get_gap_qual(int *window);\n+int push_qual(int *window, int win_edge, char *quals, int seq_len);\n+void print_window(int *window, int win_edge);\n+int **init_votes(int seq_len);\n+void free_votes(int *votes[], int seq_len);\n+void print_votes(char *consensus, int *votes[], int seq_len);\n+char *rm_gaps(char *consensus, int cons_len);\n+char *build_consensus(int *votes[], int seq_len, double thres);\n+char *build_consensus_duplex(int *votes1[], int *votes2[], int seq_len, double thres);\n+char *build_consensus_duplex_simple(char *cons1, char *cons2, int gapped);\n+int get_base_prime(char base);\n+char *get_consensus(char *align[], char *quals[], int 
n_seqs, int seq_len, double thres,\n+ char qual_thres, int gapped);\n+char *get_consensus_duplex(char *align1[], char *align2[], char *quals1[], char *quals2[],\n+ int n_seqs1, int n_seqs2, int seq_len, double cons_thres,\n+ char qual_thres, int gapped, char *method);\n+\n+\n+// Tally the different bases at each position in an alignment.\n+// Returns an array of arrays: for each position in the alignment, an array of the number of times\n+// each base occurs at that position. The order of bases is as in the "BASES" constant.\n+int **get_votes_simple(char *align[], int n_seqs, int seq_len) {\n+ int **votes = init_votes(seq_len);\n+\n+ // Tally votes for each base.\n+ int i, j;\n+ for (i = 0; i < n_seqs; i++) {\n+ for (j = 0; j < seq_len; j++) {\n+ // N.B.: Could write this without hardcoded literals, but it\'s about 40% slower.\n+ switch (toupper(align[i][j])) {\n+ case \'A\':\n+ votes[j][0]++;\n+ break;\n+ case \'C\':\n+ votes[j][1]++;\n+ break;\n+ case \'G\':\n+ votes[j][2]++;\n+ break;\n+ case \'T\':\n+ votes[j][3]++;\n+ break;\n+ case \'N\':\n+ votes[j][4]++;\n+ break;\n+ case \'-\':\n+ votes[j][5]++;\n+ break;\n+ }\n+ }\n+ }\n+\n+ return votes;\n+}\n+\n+\n+int **get_votes_qual(char *align[], char *quals[], int n_seqs, int seq_len, char thres) {\n+ int **votes = init_votes(seq_len);\n+ int *window = malloc(sizeof(int) * WIN_LEN * 2);\n+ int win_edge;\n+\n+ // Tally votes for each base.\n+ char qual;\n+ int i, j;\n+ for (i = 0; i < n_seqs; i++) {\n+ win_edge = init_gap_qual_window(window, qu'..b'ase_prime1, base_prime2;\n+ while (cons1[i] != \'\\0\' && cons2[i] != \'\\0\') {\n+ base_prime1 = get_base_prime(cons1[i]);\n+ base_prime2 = get_base_prime(cons2[i]);\n+ cons[i] = IUPAC_BASES[base_prime1*base_prime2];\n+ i++;\n+ }\n+ cons[seq_len] = \'\\0\';\n+ if (gapped) {\n+ return cons;\n+ } else {\n+ return rm_gaps(cons, seq_len);\n+ }\n+}\n+\n+\n+int get_base_prime(char base) {\n+ switch (base) {\n+ case \'A\':\n+ return 2;\n+ case \'C\':\n+ return 3;\n+ case 
\'G\':\n+ return 5;\n+ case \'T\':\n+ return 7;\n+ case \'N\':\n+ return 11;\n+ case \'-\':\n+ return 13;\n+ default:\n+ return 0;\n+ }\n+}\n+\n+\n+// Convenience function to create a consensus in one step.\n+// Give 0 as "quals" to not use quality scores, and -1.0 as "cons_thres" to use the default\n+// consensus threshold when evaluating base votes.\n+char *get_consensus(char *align[], char *quals[], int n_seqs, int seq_len, double cons_thres,\n+ char qual_thres, int gapped) {\n+ if (cons_thres == -1.0) {\n+ cons_thres = THRES_DEFAULT;\n+ }\n+ int **votes;\n+ if (quals == 0) {\n+ votes = get_votes_simple(align, n_seqs, seq_len);\n+ } else {\n+ votes = get_votes_qual(align, quals, n_seqs, seq_len, qual_thres);\n+ }\n+ char *consensus_gapped = build_consensus(votes, seq_len, cons_thres);\n+ char *consensus;\n+ if (gapped) {\n+ consensus = consensus_gapped;\n+ } else {\n+ consensus = rm_gaps(consensus_gapped, seq_len);\n+ }\n+ free_votes(votes, seq_len);\n+ return consensus;\n+}\n+\n+\n+char *get_consensus_duplex(char *align1[], char *align2[], char *quals1[], char *quals2[],\n+ int n_seqs1, int n_seqs2, int seq_len, double cons_thres,\n+ char qual_thres, int gapped, char *method) {\n+ if (cons_thres == -1.0) {\n+ cons_thres = THRES_DEFAULT;\n+ }\n+ int **votes1;\n+ int **votes2;\n+ if (quals1 == 0 || quals2 == 0) {\n+ votes1 = get_votes_simple(align1, n_seqs1, seq_len);\n+ votes2 = get_votes_simple(align2, n_seqs2, seq_len);\n+ } else {\n+ votes1 = get_votes_qual(align1, quals1, n_seqs1, seq_len, qual_thres);\n+ votes2 = get_votes_qual(align2, quals2, n_seqs2, seq_len, qual_thres);\n+ }\n+ char *consensus_gapped;\n+ if (!strncmp(method, "freq", 4)) {\n+ consensus_gapped = build_consensus_duplex(votes1, votes2, seq_len, cons_thres);\n+ } else if (!strncmp(method, "iupac", 5)) {\n+ char *cons1 = build_consensus(votes1, seq_len, cons_thres);\n+ char *cons2 = build_consensus(votes2, seq_len, cons_thres);\n+ consensus_gapped = build_consensus_duplex_simple(cons1, cons2, 
1);\n+ } else {\n+ return "";\n+ }\n+ char *consensus;\n+ if (gapped) {\n+ consensus = consensus_gapped;\n+ } else {\n+ consensus = rm_gaps(consensus_gapped, seq_len);\n+ }\n+ free_votes(votes1, seq_len);\n+ free_votes(votes2, seq_len);\n+ return consensus;\n+}\n+\n+\n+void get_gap_quals(char *quals) {\n+ int seq_len = strlen(quals);\n+ int *window = malloc(sizeof(int) * WIN_LEN * 2);\n+ int win_edge = init_gap_qual_window(window, quals, seq_len);\n+ print_window(window, win_edge);\n+\n+ int i;\n+ char gap_qual;\n+ for (i = 0; i < seq_len; i++) {\n+ if (quals[i] == GAP_CHAR) {\n+ gap_qual = get_gap_qual(window);\n+ printf("gap %2d: %2d\\n", i, gap_qual);\n+ } else {\n+ win_edge = push_qual(window, win_edge, quals, seq_len);\n+ print_window(window, win_edge);\n+ }\n+ }\n+}\n+\n+\n+int main(int argc, char *argv[]) {\n+ char **align = malloc(sizeof(char *) * (argc-1));\n+\n+ int seq_len = INT_MAX;\n+ int i;\n+ for (i = 1; i < argc; i++) {\n+ if (strlen(argv[i]) < seq_len) {\n+ seq_len = strlen(argv[i]);\n+ }\n+ align[i-1] = argv[i];\n+ }\n+\n+ if (argc <= 1) {\n+ return 1;\n+ }\n+\n+ get_gap_quals(align[0]);\n+ return 0;\n+\n+ int **votes = get_votes_simple(align, argc-1, seq_len);\n+ char *consensus = build_consensus(votes, seq_len, THRES_DEFAULT);\n+ print_votes(consensus, votes, seq_len);\n+ printf("%s\\n", consensus);\n+ free_votes(votes, seq_len);\n+\n+ return 0;\n+}\n' |
b |
diff -r 13bcc2f459b0 -r af383638de66 consensusc.so |
b |
Binary file consensusc.so has changed |
b |
diff -r 13bcc2f459b0 -r af383638de66 duplex.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/duplex.py Mon Nov 23 18:44:23 2015 -0500 |
[ |
b'@@ -0,0 +1,333 @@\n+#!/usr/bin/env python\n+from __future__ import division\n+import os\n+import sys\n+import time\n+import logging\n+import tempfile\n+import argparse\n+import subprocess\n+import collections\n+import distutils.spawn\n+import consensus\n+import swalign\n+\n+SANGER_START = 33\n+SOLEXA_START = 64\n+REQUIRED_COMMANDS = [\'mafft\']\n+OPT_DEFAULTS = {\'min_reads\':3, \'processes\':1, \'qual\':20, \'qual_format\':\'sanger\'}\n+USAGE = "%(prog)s [options]"\n+DESCRIPTION = """Build consensus sequences from read aligned families. Prints duplex consensus\n+sequences in FASTA to stdout. The sequence ids are BARCODE.MATE, e.g. "CTCAGATAACATACCTTATATGCA.1",\n+where "BARCODE" is the input barcode, and "MATE" is "1" or "2" as an arbitrary designation of the\n+two reads in the pair. The id is followed by the count of the number of reads in the two families\n+(one from each strand) that make up the duplex, in the format READS1/READS2. If the duplex is\n+actually a single-strand consensus because the matching strand is missing, only one number is\n+listed."""\n+\n+\n+def main(argv):\n+\n+ parser = argparse.ArgumentParser(description=DESCRIPTION)\n+ parser.set_defaults(**OPT_DEFAULTS)\n+\n+ parser.add_argument(\'infile\', metavar=\'read-families.tsv\', nargs=\'?\',\n+ help=\'The output of align_families.py. 6 columns: 1. (canonical) barcode. 2. order ("ab" or \'\n+ \'"ba"). 3. mate ("1" or "2"). 4. read name. 5. aligned sequence. 6. aligned quality \'\n+ \'scores.\')\n+ parser.add_argument(\'-r\', \'--min-reads\', type=int,\n+ help=\'The minimum number of reads (from each strand) required to form a single-strand \'\n+ \'consensus. Strands with fewer reads will be skipped. Default: %(default)s.\')\n+ parser.add_argument(\'-q\', \'--qual\', type=int,\n+ help=\'Base quality threshold. Bases below this quality will not be counted. 
\'\n+ \'Default: %(default)s.\')\n+ parser.add_argument(\'-F\', \'--qual-format\', choices=(\'sanger\', \'solexa\'),\n+ help=\'FASTQ quality score format. Sanger scores are assumed to begin at \\\'{}\\\' ({}). Default: \'\n+ \'%(default)s.\'.format(SANGER_START, chr(SANGER_START)))\n+ parser.add_argument(\'--incl-sscs\', action=\'store_true\',\n+ help=\'When outputting duplex consensus sequences, include reads without a full duplex (missing \'\n+ \'one strand). The result will just be the single-strand consensus of the remaining read.\')\n+ parser.add_argument(\'-s\', \'--sscs-file\',\n+ help=\'Save single-strand consensus sequences in this file (FASTA format). Currently does not \'\n+ \'work when in parallel mode.\')\n+ parser.add_argument(\'-l\', \'--log\', metavar=\'LOG_FILE\', dest=\'stats_file\',\n+ help=\'Print statistics on the run to this file. Use "-" to print to stderr.\')\n+ parser.add_argument(\'-p\', \'--processes\', type=int,\n+ help=\'Number of processes to use. If > 1, launches this many worker subprocesses. Note: if \'\n+ \'this option is used, no output will be generated until the end of the entire run, so no \'\n+ \'streaming is possible. 
Default: %(default)s.\')\n+ parser.add_argument(\'-S\', \'--slurm\', action=\'store_true\',\n+ help=\'If --processes > 1, prepend sub-commands with "srun -C new".\')\n+\n+ args = parser.parse_args(argv[1:])\n+\n+ assert args.processes > 0, \'-p must be greater than zero\'\n+ # Make dict of process_family() parameters that don\'t change between families.\n+ static = {}\n+ static[\'processes\'] = args.processes\n+ static[\'incl_sscs\'] = args.incl_sscs\n+ static[\'min_reads\'] = args.min_reads\n+ if args.sscs_file:\n+ static[\'sscs_fh\'] = open(args.sscs_file, \'w\')\n+ if args.qual_format == \'sanger\':\n+ static[\'qual_thres\'] = chr(args.qual + SANGER_START)\n+ elif args.qual_format == \'solexa\':\n+ static[\'qual_thres\'] = chr(args.qual + SOLEXA_START)\n+ else:\n+ fail(\'Error: unrecognized --qual-format.\')\n+\n+ # Check for required commands.\n+ missing_commands = []\n+ if args.slurm:\n+ REQUIRED_COMMANDS.append(\'srun\')\n+ for command in REQUIRED_COMMANDS:\n+ if not distutils.spawn.find_executable(command):\n+ '..b'ily:\n+ worker[\'proc\'].stdin.write(line)\n+\n+\n+def close_workers(workers):\n+ for worker in workers:\n+ worker[\'outfile\'].close()\n+ worker[\'proc\'].stdin.close()\n+\n+\n+def compile_results(workers):\n+ for worker in workers:\n+ worker[\'proc\'].wait()\n+ with open(worker[\'outfile\'].name, \'r\') as outfile:\n+ for line in outfile:\n+ sys.stdout.write(line)\n+\n+\n+def delete_tempfiles(workers):\n+ for worker in workers:\n+ os.remove(worker[\'outfile\'].name)\n+ if worker[\'stats\']:\n+ os.remove(worker[\'stats\'])\n+\n+\n+def process_duplex(duplex, barcode, workers=None, stats=None, incl_sscs=False, sscs_fh=None,\n+ processes=1, min_reads=1, qual_thres=\' \'):\n+ stats[\'families\'] += 1\n+ # Are we the controller process or a worker?\n+ if processes > 1:\n+ i = stats[\'families\'] % len(workers)\n+ worker = workers[i]\n+ delegate(worker, duplex, barcode)\n+ return\n+ # We\'re a worker. 
Actually process the family.\n+ start = time.time()\n+ consensi = []\n+ reads_per_strand = []\n+ duplex_mate = None\n+ for (order, mate), family in duplex.items():\n+ reads = len(family)\n+ if reads < min_reads:\n+ continue\n+ # The mate number for the duplex consensus. It\'s arbitrary, but all that matters is that the\n+ # two mates have different numbers. This system ensures that:\n+ # Mate 1 is from the consensus of ab/1 and ba/2 families, while mate 2 is from ba/1 and ab/2.\n+ if (order == \'ab\' and mate == 1) or (order == \'ba\' and mate == 2):\n+ duplex_mate = 1\n+ else:\n+ duplex_mate = 2\n+ seqs = [read[\'seq\'] for read in family]\n+ quals = [read[\'qual\'] for read in family]\n+ consensi.append(consensus.get_consensus(seqs, quals, qual_thres=qual_thres))\n+ reads_per_strand.append(reads)\n+ assert len(consensi) <= 2\n+ if sscs_fh:\n+ for cons, (order, mate), reads in zip(consensi, duplex.keys(), reads_per_strand):\n+ sscs_fh.write(\'>{bar}.{order}.{mate} {reads}\\n\'.format(bar=barcode, order=order, mate=mate,\n+ reads=reads))\n+ sscs_fh.write(cons+\'\\n\')\n+ if len(consensi) == 1 and incl_sscs:\n+ print_duplex(consensi[0], barcode, duplex_mate, reads_per_strand)\n+ elif len(consensi) == 2:\n+ align = swalign.smith_waterman(*consensi)\n+ #TODO: log error & return if len(align.target) != len(align.query)\n+ cons = consensus.build_consensus_duplex_simple(align.target, align.query)\n+ print_duplex(cons, barcode, duplex_mate, reads_per_strand)\n+ elapsed = time.time() - start\n+ logging.info(\'{} sec for {} reads.\'.format(elapsed, sum(reads_per_strand)))\n+ if stats and len(consensi) > 0:\n+ stats[\'time\'] += elapsed\n+ stats[\'reads\'] += sum(reads_per_strand)\n+ stats[\'runs\'] += 1\n+\n+\n+def print_duplex(cons, barcode, mate, reads_per_strand, outfile=sys.stdout):\n+ header = \'>{bar}.{mate} {reads}\'.format(bar=barcode, mate=mate,\n+ reads=\'/\'.join(map(str, reads_per_strand)))\n+ outfile.write(header+\'\\n\')\n+ 
outfile.write(cons+\'\\n\')\n+\n+\n+def read_fasta(fasta, is_file=True):\n+ """Quick and dirty FASTA parser. Return the sequences and their names.\n+ Returns a list of sequences. Each is a dict of \'name\' and \'seq\'.\n+ Warning: Reads the entire contents of the file into memory at once."""\n+ sequences = []\n+ seq_lines = []\n+ seq_name = None\n+ if is_file:\n+ with open(fasta) as fasta_file:\n+ fasta_lines = fasta_file.readlines()\n+ else:\n+ fasta_lines = fasta.splitlines()\n+ for line in fasta_lines:\n+ if line.startswith(\'>\'):\n+ if seq_lines:\n+ sequences.append({\'name\':seq_name, \'seq\':\'\'.join(seq_lines)})\n+ seq_lines = []\n+ seq_name = line.rstrip(\'\\r\\n\')[1:]\n+ continue\n+ seq_lines.append(line.strip())\n+ if seq_lines:\n+ sequences.append({\'name\':seq_name, \'seq\':\'\'.join(seq_lines)})\n+ return sequences\n+\n+\n+def fail(message):\n+ sys.stderr.write(message+"\\n")\n+ sys.exit(1)\n+\n+if __name__ == \'__main__\':\n+ sys.exit(main(sys.argv))\n' |
b |
diff -r 13bcc2f459b0 -r af383638de66 duplex.sublime-project --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/duplex.sublime-project Mon Nov 23 18:44:23 2015 -0500 |
[ |
@@ -0,0 +1,17 @@ +{ + "build_systems": + [ + { + "file_regex": "^[ ]*File \"(...*?)\", line ([0-9]*)", + "name": "Anaconda Python Builder", + "selector": "source.python", + "shell_cmd": "python -u \"$file\"" + } + ], + "folders": + [ + { + "path": "/home/me/bx/code/duplex" + } + ] +} |
b |
diff -r 13bcc2f459b0 -r af383638de66 duplex.sublime-workspace --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/duplex.sublime-workspace Mon Nov 23 18:44:23 2015 -0500 |
[ |
b'@@ -0,0 +1,1134 @@\n+{\n+\t"auto_complete":\n+\t{\n+\t\t"selected_items":\n+\t\t[\n+\t\t\t[\n+\t\t\t\t"can",\n+\t\t\t\t"can_change_color\tfunction"\n+\t\t\t],\n+\t\t\t[\n+\t\t\t\t"NAI",\n+\t\t\t\t"NAIVE_TEST_THRES"\n+\t\t\t],\n+\t\t\t[\n+\t\t\t\t"curre",\n+\t\t\t\t"current_seq\tstatement"\n+\t\t\t],\n+\t\t\t[\n+\t\t\t\t"align",\n+\t\t\t\t"align_path\tstatement"\n+\t\t\t],\n+\t\t\t[\n+\t\t\t\t"Att",\n+\t\t\t\t"AttributeError\tclass"\n+\t\t\t],\n+\t\t\t[\n+\t\t\t\t"pipe",\n+\t\t\t\t"pipeline_family_choose_asm\tfunction"\n+\t\t\t],\n+\t\t\t[\n+\t\t\t\t"Obj",\n+\t\t\t\t"ObjectsById"\n+\t\t\t],\n+\t\t\t[\n+\t\t\t\t"get",\n+\t\t\t\t"getElementById"\n+\t\t\t],\n+\t\t\t[\n+\t\t\t\t"make",\n+\t\t\t\t"makeStatElem"\n+\t\t\t],\n+\t\t\t[\n+\t\t\t\t"timeTo",\n+\t\t\t\t"timeToNextReindeer"\n+\t\t\t],\n+\t\t\t[\n+\t\t\t\t"golden",\n+\t\t\t\t"goldenUpgrades"\n+\t\t\t],\n+\t\t\t[\n+\t\t\t\t"How",\n+\t\t\t\t"HowMuchPrestige"\n+\t\t\t],\n+\t\t\t[\n+\t\t\t\t"HTTP",\n+\t\t\t\t"HTTPLIB_PARAMS\tstatement"\n+\t\t\t],\n+\t\t\t[\n+\t\t\t\t"ISNV",\n+\t\t\t\t"ISNVFILE_DEFAULT\tstatement"\n+\t\t\t],\n+\t\t\t[\n+\t\t\t\t"DA",\n+\t\t\t\t"DATA_DIR_DEFAULT\tstatement"\n+\t\t\t],\n+\t\t\t[\n+\t\t\t\t"REQ",\n+\t\t\t\t"REQUIRED_PICARDS\tstatement"\n+\t\t\t],\n+\t\t\t[\n+\t\t\t\t"add_ar",\n+\t\t\t\t"add_argument_group\tfunction"\n+\t\t\t],\n+\t\t\t[\n+\t\t\t\t"sample_fam",\n+\t\t\t\t"sample_family_rows\tstatement"\n+\t\t\t],\n+\t\t\t[\n+\t\t\t\t"SAM",\n+\t\t\t\t"SAMPLES_FILE\tstatement"\n+\t\t\t],\n+\t\t\t[\n+\t\t\t\t"spi",\n+\t\t\t\t"spikein_str\tstatement"\n+\t\t\t],\n+\t\t\t[\n+\t\t\t\t"scri",\n+\t\t\t\t"scriptdir"\n+\t\t\t],\n+\t\t\t[\n+\t\t\t\t"IGNO",\n+\t\t\t\t"IGNORE_EXTS\tstatement"\n+\t\t\t],\n+\t\t\t[\n+\t\t\t\t"watch",\n+\t\t\t\t"watch_progs"\n+\t\t\t],\n+\t\t\t[\n+\t\t\t\t"IG",\n+\t\t\t\t"IGNORE_EXTS\tstatement"\n+\t\t\t],\n+\t\t\t[\n+\t\t\t\t"Dev",\n+\t\t\t\t"DeviceDefault"\n+\t\t\t],\n+\t\t\t[\n+\t\t\t\t"cac",\n+\t\t\t\t"cache_file"\n+\t\t\t],\n+\t\t\t[\n+\t\t\t\t"Asn",\n+\t\t\t\t"AsnMa
cCache"\n+\t\t\t],\n+\t\t\t[\n+\t\t\t\t"gate",\n+\t\t\t\t"gateway_ip"\n+\t\t\t],\n+\t\t\t[\n+\t\t\t\t"getE",\n+\t\t\t\t"getElementsByClassName"\n+\t\t\t],\n+\t\t\t[\n+\t\t\t\t"high",\n+\t\t\t\t"highlightTerms"\n+\t\t\t],\n+\t\t\t[\n+\t\t\t\t"medi",\n+\t\t\t\t"mediaElement"\n+\t\t\t],\n+\t\t\t[\n+\t\t\t\t"get_mp",\n+\t\t\t\t"get_mp3_name"\n+\t\t\t],\n+\t\t\t[\n+\t\t\t\t"pl",\n+\t\t\t\t"player_url"\n+\t\t\t],\n+\t\t\t[\n+\t\t\t\t"fa",\n+\t\t\t\t"fastq1"\n+\t\t\t],\n+\t\t\t[\n+\t\t\t\t"ba",\n+\t\t\t\t"backgrounded"\n+\t\t\t],\n+\t\t\t[\n+\t\t\t\t"human",\n+\t\t\t\t"human_time\tstatement"\n+\t\t\t],\n+\t\t\t[\n+\t\t\t\t"NotI",\n+\t\t\t\t"NotImplementedError\tclass"\n+\t\t\t],\n+\t\t\t[\n+\t\t\t\t"recu",\n+\t\t\t\t"recursively"\n+\t\t\t],\n+\t\t\t[\n+\t\t\t\t"SAMPLES",\n+\t\t\t\t"SAMPLES_HEADER\tstatement"\n+\t\t\t],\n+\t\t\t[\n+\t\t\t\t"wildcard",\n+\t\t\t\t"wildcard_match\tfunction"\n+\t\t\t],\n+\t\t\t[\n+\t\t\t\t"existing",\n+\t\t\t\t"existing_samples\tstatement"\n+\t\t\t],\n+\t\t\t[\n+\t\t\t\t"SAPL",\n+\t\t\t\t"SAMPLES_HEADER\tstatement"\n+\t\t\t],\n+\t\t\t[\n+\t\t\t\t"max",\n+\t\t\t\t"max_tz_diff"\n+\t\t\t],\n+\t\t\t[\n+\t\t\t\t"bio",\n+\t\t\t\t"bio_sample_id\tstatement"\n+\t\t\t],\n+\t\t\t[\n+\t\t\t\t"tech",\n+\t\t\t\t"technology"\n+\t\t\t],\n+\t\t\t[\n+\t\t\t\t"HEA",\n+\t\t\t\t"HEADER_JOINED\tstatement"\n+\t\t\t],\n+\t\t\t[\n+\t\t\t\t"Z_B",\n+\t\t\t\t"Z_BORDER"\n+\t\t\t],\n+\t\t\t[\n+\t\t\t\t"BA",\n+\t\t\t\t"BASE_SIZE"\n+\t\t\t],\n+\t\t\t[\n+\t\t\t\t"destroy",\n+\t\t\t\t"destroyRecursive"\n+\t\t\t],\n+\t\t\t[\n+\t\t\t\t"sam",\n+\t\t\t\t"sample_regex"\n+\t\t\t],\n+\t\t\t[\n+\t\t\t\t"pair",\n+\t\t\t\t"pair_regex"\n+\t\t\t],\n+\t\t\t[\n+\t\t\t\t"edge",\n+\t\t\t\t"edge_effect"\n+\t\t\t],\n+\t\t\t[\n+\t\t\t\t"XRA",\n+\t\t\t\t"XRANDR_DISP_REGEX"\n+\t\t\t],\n+\t\t\t[\n+\t\t\t\t"eh",\n+\t\t\t\t"exho"\n+\t\t\t],\n+\t\t\t[\n+\t\t\t\t"MIN",\n+\t\t\t\t"MINUS2_SRC"\n+\t\t\t],\n+\t\t\t[\n+\t\t\t\t"filename",\n+\t\t\t\t"filename_new"\n+\t\t\t],\n+\t\t\t[\n+\t\t\t\t"MUL",\n+\t\t\
t\t"MULTI_REPLACE"\n+\t\t\t],\n+\t\t\t[\n+\t\t\t\t"IMG",\n+\t\t\t\t"REGEX_IMGUR_SRC"\n+\t\t\t],\n+\t\t\t[\n+\t\t\t\t"lan_",\n+\t\t\t\t"lan_ip_current"\n+\t\t\t],\n+\t\t\t[\n+\t\t\t\t"BUFF",\n+\t\t\t\t"BUFFER_MAX"\n+\t\t\t],\n+\t\t\t[\n+\t\t\t\t"passed",\n+\t\t\t\t"passed_filters"\n+\t\t\t],\n+\t\t\t[\n+\t\t\t\t"getCo",\n+\t\t\t\t"getCoverageGap"\n+\t\t\t],\n+\t\t\t[\n+\t\t\t\t"GAME_",\n+\t\t\t\t"GAME_WIDTH"\n+\t\t\t],\n+\t\t\t[\n+\t\t\t\t"GAME",\n+\t\t\t\t"GAME_WIDTH"\n+\t\t\t],\n+\t\t\t[\n+\t\t\t\t"visi",\n+\t\t\t\t"visibility"\n+\t\t\t],\n+\t\t\t[\n+\t\t\t\t"source",\n+\t\t\t\t"source_file"\n+\t\t\t],\n+\t\t\t[\n+\t\t\t\t"like",\n+\t\t\t\t"likeliest_count"\n+\t\t\t],\n+\t\t\t[\n+\t\t\t\t"cso",\n+\t\t\t\t"csource_path_fixed"\n+\t\t\t],\n+\t\t\t[\n+\t\t\t\t"csou",\n+\t\t\t\t"csource_path"\n+\t\t\t],\n+\t\t\t[\n+\t\t\t\t"csour",\n+\t\t\t\t"csource_file"\n+\t\t\t],\n+\t\t\t[\n+\t\t\t\t"buffe",\n+\t\t\t\t"buffer_size"\n+\t\t\t],\n+\t\t\t[\n+\t\t\t\t"POP",\n+\t\t\t\t"POP_SIZE_MAX"\n+\t\t\t],\n+\t\t\t[\n+\t\t\t\t"B",\n+\t\t\t\t"BASE"\n+\t\t\t],\n+\t\t\t[\n+\t\t\t\t"fill",\n+\t\t\t\t"fill_test_data"\n+\t\t\t],\n+\t\t\t[\n+\t\t\t\t"binom",\n+\t\t\t\t"binomial_coefficient"\n+\t\t\t],\n+\t\t\t[\n+\t\t\t\t"alig",\n+\t\t\t\t"alignment1"\n+\t\t\t],\n+\t\t\t[\n+\t\t\t\t"ali",\n+\t\t\t\t"alignment2"\n+\t\t\t],\n+\t\t\t[\n+\t\t\t\t"choose",\n+\t\t\t\t"choose_sequence"\n+\t\t\t],\n+\t\t\t[\n+\t\t\t\t"back",\n+\t\t\t\t"background-color"\n+\t\t\t],\n+\t'..b'\t\t\t\t\t\t},\n+\t\t\t\t\t\t"selection":\n+\t\t\t\t\t\t[\n+\t\t\t\t\t\t\t[\n+\t\t\t\t\t\t\t\t3646,\n+\t\t\t\t\t\t\t\t3646\n+\t\t\t\t\t\t\t]\n+\t\t\t\t\t\t],\n+\t\t\t\t\t\t"settings":\n+\t\t\t\t\t\t{\n+\t\t\t\t\t\t\t"syntax": "Packages/Python/Python.tmLanguage",\n+\t\t\t\t\t\t\t"tab_size": 2,\n+\t\t\t\t\t\t\t"translate_tabs_to_spaces": true\n+\t\t\t\t\t\t},\n+\t\t\t\t\t\t"translation.x": 0.0,\n+\t\t\t\t\t\t"translation.y": 1194.0,\n+\t\t\t\t\t\t"zoom_level": 1.0\n+\t\t\t\t\t},\n+\t\t\t\t\t"stack_index": 3,\n+\t\t\t\t\t"type": 
"text"\n+\t\t\t\t},\n+\t\t\t\t{\n+\t\t\t\t\t"buffer": 1,\n+\t\t\t\t\t"file": "sscs.py",\n+\t\t\t\t\t"semi_transient": false,\n+\t\t\t\t\t"settings":\n+\t\t\t\t\t{\n+\t\t\t\t\t\t"buffer_size": 11029,\n+\t\t\t\t\t\t"regions":\n+\t\t\t\t\t\t{\n+\t\t\t\t\t\t},\n+\t\t\t\t\t\t"selection":\n+\t\t\t\t\t\t[\n+\t\t\t\t\t\t\t[\n+\t\t\t\t\t\t\t\t3494,\n+\t\t\t\t\t\t\t\t3494\n+\t\t\t\t\t\t\t]\n+\t\t\t\t\t\t],\n+\t\t\t\t\t\t"settings":\n+\t\t\t\t\t\t{\n+\t\t\t\t\t\t\t"syntax": "Packages/Python/Python.tmLanguage",\n+\t\t\t\t\t\t\t"tab_size": 2,\n+\t\t\t\t\t\t\t"translate_tabs_to_spaces": true\n+\t\t\t\t\t\t},\n+\t\t\t\t\t\t"translation.x": 0.0,\n+\t\t\t\t\t\t"translation.y": 1081.0,\n+\t\t\t\t\t\t"zoom_level": 1.0\n+\t\t\t\t\t},\n+\t\t\t\t\t"stack_index": 4,\n+\t\t\t\t\t"type": "text"\n+\t\t\t\t},\n+\t\t\t\t{\n+\t\t\t\t\t"buffer": 2,\n+\t\t\t\t\t"file": "stats.py",\n+\t\t\t\t\t"semi_transient": false,\n+\t\t\t\t\t"settings":\n+\t\t\t\t\t{\n+\t\t\t\t\t\t"buffer_size": 3713,\n+\t\t\t\t\t\t"regions":\n+\t\t\t\t\t\t{\n+\t\t\t\t\t\t},\n+\t\t\t\t\t\t"selection":\n+\t\t\t\t\t\t[\n+\t\t\t\t\t\t\t[\n+\t\t\t\t\t\t\t\t2502,\n+\t\t\t\t\t\t\t\t2502\n+\t\t\t\t\t\t\t]\n+\t\t\t\t\t\t],\n+\t\t\t\t\t\t"settings":\n+\t\t\t\t\t\t{\n+\t\t\t\t\t\t\t"syntax": "Packages/Python/Python.tmLanguage",\n+\t\t\t\t\t\t\t"tab_size": 2,\n+\t\t\t\t\t\t\t"translate_tabs_to_spaces": true\n+\t\t\t\t\t\t},\n+\t\t\t\t\t\t"translation.x": 0.0,\n+\t\t\t\t\t\t"translation.y": 918.0,\n+\t\t\t\t\t\t"zoom_level": 1.0\n+\t\t\t\t\t},\n+\t\t\t\t\t"stack_index": 0,\n+\t\t\t\t\t"type": "text"\n+\t\t\t\t},\n+\t\t\t\t{\n+\t\t\t\t\t"buffer": 3,\n+\t\t\t\t\t"file": "/home/me/bx/code/duplex/align.c",\n+\t\t\t\t\t"semi_transient": false,\n+\t\t\t\t\t"settings":\n+\t\t\t\t\t{\n+\t\t\t\t\t\t"buffer_size": 
7383,\n+\t\t\t\t\t\t"regions":\n+\t\t\t\t\t\t{\n+\t\t\t\t\t\t},\n+\t\t\t\t\t\t"selection":\n+\t\t\t\t\t\t[\n+\t\t\t\t\t\t\t[\n+\t\t\t\t\t\t\t\t409,\n+\t\t\t\t\t\t\t\t409\n+\t\t\t\t\t\t\t]\n+\t\t\t\t\t\t],\n+\t\t\t\t\t\t"settings":\n+\t\t\t\t\t\t{\n+\t\t\t\t\t\t\t"syntax": "Packages/C++/C.tmLanguage"\n+\t\t\t\t\t\t},\n+\t\t\t\t\t\t"translation.x": 0.0,\n+\t\t\t\t\t\t"translation.y": 357.0,\n+\t\t\t\t\t\t"zoom_level": 1.0\n+\t\t\t\t\t},\n+\t\t\t\t\t"stack_index": 1,\n+\t\t\t\t\t"type": "text"\n+\t\t\t\t},\n+\t\t\t\t{\n+\t\t\t\t\t"buffer": 4,\n+\t\t\t\t\t"file": "pipeline.sh",\n+\t\t\t\t\t"semi_transient": false,\n+\t\t\t\t\t"settings":\n+\t\t\t\t\t{\n+\t\t\t\t\t\t"buffer_size": 2314,\n+\t\t\t\t\t\t"regions":\n+\t\t\t\t\t\t{\n+\t\t\t\t\t\t},\n+\t\t\t\t\t\t"selection":\n+\t\t\t\t\t\t[\n+\t\t\t\t\t\t\t[\n+\t\t\t\t\t\t\t\t1134,\n+\t\t\t\t\t\t\t\t1134\n+\t\t\t\t\t\t\t]\n+\t\t\t\t\t\t],\n+\t\t\t\t\t\t"settings":\n+\t\t\t\t\t\t{\n+\t\t\t\t\t\t\t"syntax": "Packages/ShellScript/Shell-Unix-Generic.tmLanguage",\n+\t\t\t\t\t\t\t"tab_size": 2,\n+\t\t\t\t\t\t\t"translate_tabs_to_spaces": true\n+\t\t\t\t\t\t},\n+\t\t\t\t\t\t"translation.x": 0.0,\n+\t\t\t\t\t\t"translation.y": 0.0,\n+\t\t\t\t\t\t"zoom_level": 1.0\n+\t\t\t\t\t},\n+\t\t\t\t\t"stack_index": 2,\n+\t\t\t\t\t"type": "text"\n+\t\t\t\t}\n+\t\t\t]\n+\t\t}\n+\t],\n+\t"incremental_find":\n+\t{\n+\t\t"height": 25.0\n+\t},\n+\t"input":\n+\t{\n+\t\t"height": 0.0\n+\t},\n+\t"layout":\n+\t{\n+\t\t"cells":\n+\t\t[\n+\t\t\t[\n+\t\t\t\t0,\n+\t\t\t\t0,\n+\t\t\t\t1,\n+\t\t\t\t1\n+\t\t\t]\n+\t\t],\n+\t\t"cols":\n+\t\t[\n+\t\t\t0.0,\n+\t\t\t1.0\n+\t\t],\n+\t\t"rows":\n+\t\t[\n+\t\t\t0.0,\n+\t\t\t1.0\n+\t\t]\n+\t},\n+\t"menu_visible": true,\n+\t"output.anaconda_documentation":\n+\t{\n+\t\t"height": 153.0\n+\t},\n+\t"output.exec":\n+\t{\n+\t\t"height": 100.0\n+\t},\n+\t"output.find_results":\n+\t{\n+\t\t"height": 0.0\n+\t},\n+\t"output.unsaved_changes":\n+\t{\n+\t\t"height": 100.0\n+\t},\n+\t"pinned_build_system": "",\n+\t"project": 
"duplex.sublime-project",\n+\t"replace":\n+\t{\n+\t\t"height": 46.0\n+\t},\n+\t"save_all_on_build": true,\n+\t"select_file":\n+\t{\n+\t\t"height": 0.0,\n+\t\t"last_filter": "",\n+\t\t"selected_items":\n+\t\t[\n+\t\t\t[\n+\t\t\t\t"",\n+\t\t\t\t"~/annex/Work/PSU/Nekrutenko/code/heteroplasmy/pipeline-meta.sh"\n+\t\t\t]\n+\t\t],\n+\t\t"width": 0.0\n+\t},\n+\t"select_project":\n+\t{\n+\t\t"height": 500.0,\n+\t\t"last_filter": "",\n+\t\t"selected_items":\n+\t\t[\n+\t\t],\n+\t\t"width": 380.0\n+\t},\n+\t"select_symbol":\n+\t{\n+\t\t"height": 392.0,\n+\t\t"last_filter": "",\n+\t\t"selected_items":\n+\t\t[\n+\t\t\t[\n+\t\t\t\t"",\n+\t\t\t\t"fasta_format"\n+\t\t\t]\n+\t\t],\n+\t\t"width": 392.0\n+\t},\n+\t"selected_group": 0,\n+\t"settings":\n+\t{\n+\t},\n+\t"show_minimap": true,\n+\t"show_open_files": true,\n+\t"show_tabs": true,\n+\t"side_bar_visible": false,\n+\t"side_bar_width": 150.0,\n+\t"status_bar_visible": true,\n+\t"template_settings":\n+\t{\n+\t}\n+}\n' |
b |
diff -r 13bcc2f459b0 -r af383638de66 duplex.xml --- a/duplex.xml Mon Nov 23 18:07:11 2015 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
[ |
@@ -1,53 +0,0 @@ -<?xml version="1.0"?> -<tool id="duplex" name="Make consensus reads" version="0.1"> - <description>from duplex sequencing data</description> - <requirements> - <requirement type="package" version="0.1">duplex</requirement> - </requirements> - <command interpreter="python" detect_errors="exit_code"><![CDATA[ - duplex.py -r $min_reads -q $qual_thres -F $qual_format $input - #if $keep_sscs: - --sscs-file $sscs - #end if - > duplex.fa - && awk -f $__tool_directory__/utils/outconv.awk -v target=1 duplex.fa > $output1 - && awk -f $__tool_directory__/utils/outconv.awk -v target=2 duplex.fa > $output2 - ]]> - </command> - <inputs> - <param name="input" type="data" format="tabular" label="Aligned input reads" /> - <param name="min_reads" type="integer" value="3" min="1" label="Minimum reads per family" help="Single-strand families with fewer than this many reads will be skipped."/> - <param name="qual_thres" type="integer" value="25" min="1" label="Minimum base quality" help="Bases with a PHRED score less than this will not be counted in the consensus making."/> - <param name="qual_format" type="select" label="FASTQ format" help="Solexa should also work for Illumina 1.3+ and 1.5+, and Sanger should work for Illumina 1.8+"> - <option value="sanger" selected="true">Sanger (PHRED 0 = "!")</option> - <option value="solexa">Solexa (PHRED 0 = "@")</option> - </param> - <param name="keep_sscs" type="boolean" truevalue="true" falsevalue="" label="Output single-strand consensus sequences" /> - </inputs> - <outputs> - <data name="output1" format="fasta" label="$tool.name on $on_string (mate 1)"/> - <data name="output2" format="fasta" label="$tool.name on $on_string (mate 2)"/> - <data name="sscs" format="fasta" label="$tool.name on $on_string (SSCS)"> - <filter>keep_sscs</filter> - </data> - </outputs> - <help> - -**What it does** - -This is for processing duplex sequencing data. It creates single-strand and duplex consensus reads from aligned read families. 
- ------ - -**Input** - -This expects the output format of the "Align families" tool. - ------ - -**Output** - -This will output final, duplex consensus reads in two FASTA files (first and second reads in the pairs). Optionally, you can save the single-strand reads too, in a separate FASTA file. - - </help> -</tool> |
b |
diff -r 13bcc2f459b0 -r af383638de66 galaxy/align_families.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/galaxy/align_families.xml Mon Nov 23 18:44:23 2015 -0500 |
b |
@@ -0,0 +1,59 @@ +<?xml version="1.0"?> +<tool id="align_families" name="Align families" version="0.1"> + <description>from duplex sequencing data</description> + <requirements> + <requirement type="package" version="7.221">mafft</requirement> + <requirement type="package" version="0.1">duplex</requirement> + </requirements> + <command interpreter="python" detect_errors="exit_code">align_families.py $input > $output + </command> + <inputs> + <param name="input" type="data" format="tabular" label="Input reads" help="with barcodes, grouped by family"/> + </inputs> + <outputs> + <data name="output" format="tabular"/> + </outputs> + <tests> + <test> + <param name="input" value="smoke.families.tsv"/> + <output name="output" file="smoke.families.aligned.tsv"/> + </test> + </tests> + <help> + +**What it does** + +This is for processing duplex sequencing data. It does a multiple sequence alignment on each (single-stranded) family of reads. + +----- + +**Input** + +This expects the output format of the "Make families" tool. + +----- + +**Output** + +The output is a tabular file where each line corresponds to a (single) read. + +The columns are:: + + 1: barcode (both tags) + 2: tag order in barcode ("ab" or "ba") + 3: read mate ("1" or "2") + 4: read name + 5: read sequence, aligned ("-" for gaps) + 6: read quality scores, aligned (" " for gaps) + +----- + +**Alignments** + +The alignments are done using MAFFT, specifically the command +:: + + $ mafft --nuc --quiet family.fa > family.aligned.fa + + </help> +</tool> |
b |
diff -r 13bcc2f459b0 -r af383638de66 galaxy/duplex.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/galaxy/duplex.xml Mon Nov 23 18:44:23 2015 -0500 |
[ |
@@ -0,0 +1,53 @@ +<?xml version="1.0"?> +<tool id="duplex" name="Make consensus reads" version="0.1"> + <description>from duplex sequencing data</description> + <requirements> + <requirement type="package" version="0.1">duplex</requirement> + </requirements> + <command interpreter="python" detect_errors="exit_code"><![CDATA[ + duplex.py -r $min_reads -q $qual_thres -F $qual_format $input + #if $keep_sscs: + --sscs-file $sscs + #end if + > duplex.fa + && awk -f $__tool_directory__/utils/outconv.awk -v target=1 duplex.fa > $output1 + && awk -f $__tool_directory__/utils/outconv.awk -v target=2 duplex.fa > $output2 + ]]> + </command> + <inputs> + <param name="input" type="data" format="tabular" label="Aligned input reads" /> + <param name="min_reads" type="integer" value="3" min="1" label="Minimum reads per family" help="Single-strand families with fewer than this many reads will be skipped."/> + <param name="qual_thres" type="integer" value="25" min="1" label="Minimum base quality" help="Bases with a PHRED score less than this will not be counted in the consensus making."/> + <param name="qual_format" type="select" label="FASTQ format" help="Solexa should also work for Illumina 1.3+ and 1.5+, and Sanger should work for Illumina 1.8+"> + <option value="sanger" selected="true">Sanger (PHRED 0 = "!")</option> + <option value="solexa">Solexa (PHRED 0 = "@")</option> + </param> + <param name="keep_sscs" type="boolean" truevalue="true" falsevalue="" label="Output single-strand consensus sequences" /> + </inputs> + <outputs> + <data name="output1" format="fasta" label="$tool.name on $on_string (mate 1)"/> + <data name="output2" format="fasta" label="$tool.name on $on_string (mate 2)"/> + <data name="sscs" format="fasta" label="$tool.name on $on_string (SSCS)"> + <filter>keep_sscs</filter> + </data> + </outputs> + <help> + +**What it does** + +This is for processing duplex sequencing data. It creates single-strand and duplex consensus reads from aligned read families. 
+ +----- + +**Input** + +This expects the output format of the "Align families" tool. + +----- + +**Output** + +This will output final, duplex consensus reads in two FASTA files (first and second reads in the pairs). Optionally, you can save the single-strand reads too, in a separate FASTA file. + + </help> +</tool> |
b |
diff -r 13bcc2f459b0 -r af383638de66 galaxy/make_families.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/galaxy/make_families.xml Mon Nov 23 18:44:23 2015 -0500 |
b |
@@ -0,0 +1,83 @@ +<?xml version="1.0"?> +<tool id="make_families" name="Make families" version="0.1"> + <description>from duplex sequencing data</description> + <requirements> + <requirement type="package" version="0.1">duplex</requirement> + <!-- <requirement type="set_environment">DUPLEX_DIR</requirement> --> + </requirements> + <command>paste $fastq1 $fastq2 + | paste - - - - + | awk -f \$DUPLEX_DIR/make-barcodes.awk -v TAG_LEN=$taglen -v INVARIANT=$invariant + | sort + > $output + </command> + <inputs> + <param name="fastq1" type="data" format="fastq" label="Sequencing reads, mate 1"/> + <param name="fastq2" type="data" format="fastq" label="Sequencing reads, mate 2"/> + <param name="taglen" type="integer" value="12" min="0" label="Tag length" help="length of each random barcode on the ends of the fragments"/> + <param name="invariant" type="integer" value="5" min="0" label="Invariant sequence length" help="length of the sequence between the tag and actual sample sequence (the restriction site, normally)"/> + </inputs> + <outputs> + <data name="output" format="tabular"/> + </outputs> + <tests> + <test> + <param name="fastq1" value="smoke_1.fq"/> + <param name="fastq2" value="smoke_2.fq"/> + <param name="taglen" value="5"/> + <param name="invariant" value="1"/> + <output name="output" file="smoke.families.tsv"/> + </test> + <test> + <param name="fastq1" value="smoke_1.fq"/> + <param name="fastq2" value="smoke_2.fq"/> + <param name="taglen" value="5"/> + <param name="invariant" value="0"/> + <output name="output" file="smoke.families.i0.tsv"/> + </test> + </tests> + <help> + +**What it does** + +This tool is for processing raw duplex sequencing data, removing the barcodes and grouping by them into families of reads from the same fragment. + +----- + +**Output** + +The output will be a tabular file where each line corresponds to a pair of input reads. 
+ +The columns are:: + + 1: barcode (both tags joined and ordered) + 2: tag order in barcode ("ab" or "ba") + 3: read1 name + 4: read1 sequence (minus the tag and invariant sequences) + 5: read1 quality scores (minus the same tag and invariant) + 6: read2 name + 7: read2 sequence (minus the tag and invariant sequences) + 8: read2 quality scores (minus the same tag and invariant) + +----- + +**Barcode creation** + +For each pair, the tool will remove the tag at the beginning of each read and create a barcode by concatenating the two tags. The order of the tags is determined by a string comparison so that it will make an identical barcode from pairs of either order. The original tag order will be noted in the second column. + +Since pairs from opposite strands will have the same tags, but in the reverse order, this produces the same barcode for reads from the same fragment, regardless of strand. Then a simple sort will group all reads from the same strand together, separated into strands by the different "order" values. + +Examples:: + + +---------------+-----------------+ + | input tags | output | + +-------+-------+-------+---------+ + | read1 | read2 | order | barcode | + +-------+-------+-------+---------+ + | ATG | CCT | ab | ATGCCT | + +-------+-------+-------+---------+ + | CCT | ATG | ba | ATGCCT | + +-------+-------+-------+---------+ + + </help> +</tool> |
b |
diff -r 13bcc2f459b0 -r af383638de66 galaxy/tool_dependencies.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/galaxy/tool_dependencies.xml Mon Nov 23 18:44:23 2015 -0500 |
b |
@@ -0,0 +1,30 @@ +<?xml version="1.0"?> +<tool_dependency> + <package name="mafft" version="7.221"> + <repository changeset_revision="97adbeef2294" name="mafft" owner="rnateam" toolshed="https://testtoolshed.g2.bx.psu.edu" /> + </package> + <package name="duplex" version="0.1"> + <install version="1.0"> + <actions> + <action type="download_by_url">https://github.com/makrutenko/duplex/archive/master.tar.gz</action> + <action type="shell_command">make</action> + <action type="move_file"> + <source>*.so</source> + <destination>$INSTALL_DIR</destination> + </action> + <action type="move_file"> + <source>*.py</source> + <destination>$INSTALL_DIR</destination> + </action> + <action type="move_file"> + <source>*.awk</source> + <destination>$INSTALL_DIR</destination> + </action> + <action type="set_environment"> + <environment_variable action="set_to" name="DUPLEX_DIR">$INSTALL_DIR</environment_variable> + <environment_variable action="prepend_to" name="PATH">$INSTALL_DIR</environment_variable> + </action> + </actions> + </install> + </package> +</tool_dependency> |
b |
diff -r 13bcc2f459b0 -r af383638de66 loeb-2.0.sh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/loeb-2.0.sh Mon Nov 23 18:44:23 2015 -0500 |
[ |
b'@@ -0,0 +1,313 @@\n+#!/usr/bin/env bash\n+if [ x$BASH = x ] || [ ! $BASH_VERSINFO ] || [ $BASH_VERSINFO -lt 4 ]; then\n+ echo "Error: Must use bash version 4+." >&2\n+ exit 1\n+fi\n+set -ue\n+\n+BarcodeLen=${BarcodeLen:=12}\n+SpacerLen=${SpacerLen:=5}\n+StartClip=${StartClip:=5}\n+EndClip=${EndClip:=5}\n+BwaCmd=${BwaCmd:="bwa"}\n+SamtoolsCmd=${SamtoolsCmd:="samtools"}\n+PythonCmd=${PythonCmd:="python"}\n+JavaCmd=${JavaCmd:="java"}\n+PicardDir=${PicardDir:="$HOME/src/picard-tools-1.100"}\n+GatkDir=${GatkDir:="$HOME/src/GenomeAnalysisTK"}\n+\n+Usage="Usage: \\$ $(basename $0) [-d|-c|-a] ref.fa reads_1.fq reads_2.fq readlen [outdir]\n+Run the Loeb pipeline as it was published in the Kennedy et al. 2014 paper\n+(release 2.0). If -d (\\"duplex\\") is given, it will stop after producing the\n+final duplex reads (step 62). This is the default. If -c (\\"cleanup\\") is given,\n+it will skip producing the duplex reads, assuming it\'s already been done, and\n+just do the filtering, realignment, and trimming (steps 63-71). If -a (\\"all\\")\n+is given, it will do the whole pipeline (both halves). If it\'s not doing the\n+second part, Picard and GATK are not required. 
Otherwise, provide the paths to\n+the directories containing their .jars by setting \\$PicardDir and \\$GatkDir.\n+Dependencies:\n+Python >= 2.7 (and < 3.0)\n+BWA <= 0.6.2\n+Samtools <= 0.1.18\n+BioPython 1.62\n+PySAM 0.7.5\n+Picard 1.107\n+GATK 2.4-9\n+To just check your dependency versions, run \\$ $(basename $0) -v"\n+\n+function main {\n+\n+ script_dir=$(real_dir)\n+\n+ duplex=true\n+ cleanup=\'\'\n+ if [[ $# -ge 1 ]]; then\n+ if [[ $1 == \'-v\' ]]; then\n+ print_versions $script_dir\n+ exit\n+ elif [[ $1 == \'-d\' ]]; then\n+ duplex=true\n+ cleanup=\'\'\n+ elif [[ $1 == \'-c\' ]]; then\n+ duplex=\'\'\n+ cleanup=true\n+ elif [[ $1 == \'-a\' ]]; then\n+ duplex=true\n+ cleanup=true\n+ fi\n+ shift\n+ fi\n+ if [[ $# -lt 4 ]] || [[ $1 == \'-h\' ]]; then\n+ fail "$Usage"\n+ else\n+ ref="$1"\n+ fastq1="$2"\n+ fastq2="$3"\n+ readlen="$4"\n+ fi\n+ if [[ $# -ge 5 ]]; then\n+ outdir="$5"\n+ else\n+ outdir=.\n+ fi\n+\n+ if ! echo "$readlen" | grep -qE \'^[0-9]+$\'; then\n+ fail "ERROR: Invalid read length \\"$readlen\\"."\n+ fi\n+ if ! 
[[ -d $outdir ]]; then\n+ fail "ERROR: Invalid output directory \\"$outdir\\"."\n+ fi\n+\n+ print_versions $script_dir\n+\n+ echo "\n+Parameters:\n+ref: $ref\n+fastq1: $fastq1\n+fastq2: $fastq2\n+readlen: $readlen\n+"\n+\n+ refdict=$(echo "$ref" | sed -E \'s/\\.fa(sta)?$//\').dict\n+ rlenreal=$((readlen-BarcodeLen-SpacerLen))\n+ end_clip_start=$((rlenreal-EndClip+1))\n+\n+ if [[ $duplex ]]; then\n+ echo \'===== 56 =====\' && echo \'===== 56 =====\' >&2\n+ # Concatenate the 12-nt tag sequences from the paired reads and evaluate for tag quality\n+ $PythonCmd $script_dir/tag_to_header.py --infile1 $fastq1 --infile2 $fastq2 \\\n+ --outfile1 $outdir/read_1.fq.smi --outfile2 $outdir/read_2.fq.smi \\\n+ --barcode_length $BarcodeLen --spacer_length $SpacerLen\n+ echo \'===== 57 =====\' && echo \'===== 57 =====\' >&2\n+ # Align each read to the reference genome\n+ $BwaCmd aln $ref $outdir/read_1.fq.smi > $outdir/read_1.aln\n+ echo \'===== 57.2 =====\' && echo \'===== 57.2 =====\' >&2\n+ $BwaCmd aln $ref $outdir/read_2.fq.smi > $outdir/read_2.aln\n+ echo \'===== 58 =====\' && echo \'===== 58 =====\' >&2\n+ # Make a single paired-end .sam file\n+ $BwaCmd sampe -s $ref $outdir/read_1.aln $outdir/read_2.aln \\\n+ $outdir/read_1.fq.smi $outdir/read_2.fq.smi > $outdir/PE_reads.sam\n+ echo \'===== 59 =====\' && echo \'===== 59 =====\' >&2\n+ # Convert to .bam format and sort by position\n+ $SamtoolsCmd view -Sbu $outdir/PE_reads.sam | $SamtoolsCmd sort - $outdir/PE_reads.sort\n+ echo \'===== 60 =====\' && echo \'===== 60 =====\' >&2\n+ # Run the Python program \'ConsensusMaker.py\' to collapse PCR duplicates into SSCS\n+ $PythonCmd $script_dir/ConsensusMaker.py --tagfile $outdir/PE_reads.tagcounts \\\n+ --infile $outdir/PE_reads.sort.bam --ou'..b' -R $ref \\\n+ --cyclesToTrim "1-$StartClip,$end_clip_start-$rlenreal" --clipRepresentation SOFTCLIP_BASES\n+ fi\n+ echo \'===== DONE =====\' && echo \'===== DONE =====\' >&2\n+}\n+\n+\n+# Get the script\'s actual directory 
path\n+function real_dir {\n+ # Does readlink -f work? (It doesn\'t on BSD.)\n+ if readlink -f dummy >/dev/null 2>/dev/null; then\n+ dirname $(readlink -f ${BASH_SOURCE[0]})\n+ else\n+ # If readlink -f doesn\'t work (like on BSD).\n+ # Read the link destination from the output of ls -l and cd to it.\n+ # Have to cd to the link\'s directory first, to handle relative links.\n+ # With help from https://stackoverflow.com/a/246128/726773\n+ unset CDPATH\n+ local source="${BASH_SOURCE[0]}"\n+ while [[ -h "$source" ]]; do\n+ local dir="$(cd -P $(dirname "$source") && pwd)"\n+ local link="$(ls -l "$source" | awk \'{print $NF}\')"\n+ # absolute or relative path?\n+ if [[ "$link" == /* ]]; then\n+ source="$link"\n+ else\n+ source="$dir/$link"\n+ fi\n+ done\n+ dir="$(cd -P $(dirname "$source") && pwd)"\n+ echo "$dir"\n+ fi\n+}\n+\n+\n+function print_versions {\n+ script_dir="$1"\n+ echo -e \'VERSIONS\\trecommended\\tpresent\'\n+ # pipeline\n+ echo -en \'pipeline:\\te0897da\\t\\t\'\n+ if ! [[ -d $script_dir ]]; then\n+ echo \'MISSING\'\n+ elif ! which git >/dev/null 2>/dev/null; then\n+ echo \'ERROR 1\'\n+ else\n+ unset CDPATH\n+ cd $script_dir\n+ if ! 
git log >/dev/null 2>/dev/null; then\n+ echo \'ERROR 2\'\n+ else\n+ git log --oneline -n 1 | grep --color=never -Eo \'^\\S+\'\n+ fi\n+ cd - >/dev/null\n+ fi\n+ # Python\n+ echo -en \'Python:\\t\\t2.7\\t\\t\'\n+ if which $PythonCmd >/dev/null 2>/dev/null; then\n+ $PythonCmd --version 2>&1 | sed -E \'s/python\\s//I\'\n+ else\n+ echo \'MISSING\'\n+ fi\n+ # BWA\n+ echo -en \'BWA:\\t\\t0.6.2\\t\\t\'\n+ if which $BwaCmd >/dev/null 2>/dev/null; then\n+ $BwaCmd 2>&1 | sed -En \'s/^.*version.*\\s([0-9].*)$/\\1/Ip\'\n+ else\n+ echo \'MISSING\'\n+ fi\n+ # Samtools\n+ echo -en \'Samtools:\\t0.1.18\\t\\t\'\n+ if which $SamtoolsCmd >/dev/null 2>/dev/null; then\n+ $SamtoolsCmd 2>&1 | sed -En \'s/^.*version.*\\s([0-9].*)$/\\1/Ip\'\n+ else\n+ echo \'MISSING\'\n+ fi\n+ # PySAM\n+ echo -en \'PySAM:\\t\\t0.7.5\\t\\t\'\n+ if $PythonCmd -c \'import pysam\' 2>/dev/null; then\n+ $PythonCmd -c \'import pysam; print pysam.__version__\'\n+ elif which $PythonCmd >/dev/null 2>/dev/null; then\n+ echo \'MISSING\'\n+ else\n+ echo \'ERROR 1\'\n+ fi\n+ # BioPython\n+ echo -en \'BioPython:\\t1.62\\t\\t\'\n+ if $PythonCmd -c \'import Bio\' 2>/dev/null; then\n+ $PythonCmd -c \'import Bio; print Bio.__version__\'\n+ elif which $PythonCmd >/dev/null 2>/dev/null; then\n+ echo \'MISSING\'\n+ else\n+ echo \'ERROR 1\'\n+ fi\n+ if ! which $JavaCmd 2>/dev/null >/dev/null; then\n+ echo "ERROR: Java command \\"$JavaCmd\\" not found." >&2\n+ return\n+ fi\n+ # Picard\n+ echo -en \'Picard:\\t\\t1.107\\t\\t\'\n+ if [[ -f $PicardDir/picard.jar ]]; then\n+ $JavaCmd -jar $PicardDir/picard.jar AddOrReplaceReadGroups --version 2>&1 >/dev/null | sed -E \'s/\\(.*\\)//\'\n+ elif ! 
[[ -f $PicardDir/AddOrReplaceReadGroups.jar ]]; then\n+ echo \'MISSING\'\n+ elif [[ $($JavaCmd -jar $PicardDir/AddOrReplaceReadGroups.jar 2>&1 >/dev/null | sed -En \'s/^Version:?\\s//ip\') ]]; then\n+ $JavaCmd -jar $PicardDir/AddOrReplaceReadGroups.jar 2>&1 >/dev/null | sed -En \'s/^Version:?\\s//ip\'\n+ else\n+ echo \'ERROR 1\'\n+ fi\n+ # GATK\n+ echo -en \'GATK:\\t\\t2.4-9\\t\\t\'\n+ if ! [[ -f $GatkDir/GenomeAnalysisTK.jar ]]; then\n+ echo \'MISSING\'\n+ else\n+ set +e\n+ version=$($JavaCmd -jar $GatkDir/GenomeAnalysisTK.jar --version 2>/dev/null)\n+ exit_code=$?\n+ set -e\n+ if [[ $exit_code == 0 ]]; then\n+ echo $version\n+ else\n+ version=$($JavaCmd -jar $GatkDir/GenomeAnalysisTK.jar 2>&1 >/dev/null | sed -En \'s/^.*version\\s([0-9.-]+[0-9.]).*$/\\1/p\')\n+ if [[ $version ]]; then\n+ echo $version\n+ else\n+ echo \'ERROR 1\'\n+ fi\n+ fi\n+ fi\n+}\n+\n+\n+function fail {\n+ echo "$@" >&2\n+ exit 1\n+}\n+\n+\n+main "$@"\n' |
b |
diff -r 13bcc2f459b0 -r af383638de66 make-barcodes.awk --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/make-barcodes.awk Mon Nov 23 18:44:23 2015 -0500 |
b |
@@ -0,0 +1,59 @@ +# The awk code that transforms the one-line fastq record pair into the output that can be sorted +# by barcode. +# Input columns (the 4 FASTQ lines for both reads in a read pair): +# 1: read1 name +# 2: read2 name +# 3: read1 sequence +# 4: read2 sequence +# 5: read1 + line +# 6: read2 + line +# 7: read1 quality +# 8: read2 quality +# Output columns: +# 1: the barcode, put into a canonical form +# 2: the order of the barcode halves ("ab" or "ba") +# 3: read1 name +# 4: sequence of read 1, minus the 12bp barcode and 5bp invariant sequence +# 5: read1 quality scores, minus the same first 17bp +# 6: read2 name +# 7: sequence of read 2, minus the first 17bp +# 8: read2 quality scores, minus the first 17bp +# The canonical form of the barcode is composed of two concatenated tags, one from each read. +# By default, each tag is the first 12bp of the read. The tag from the first read is the "alpha" and +# the tag from the second is the "beta". The barcode is formed by concatenating them in an order +# determined by a string comparison of the two. The lesser tag is first (if they are equal, the +# beta is first, but then you have bigger problems). + +BEGIN { + FS = "\t" + OFS = "\t" + # The number of bases from the start of each read that form the two halves of the barcode. + # (this should be half the size of the full, canonical barcode). + if (TAG_LEN == "") { + TAG_LEN = 12 + } + # The number of bases in the read that are between the barcode and the start of the actual sample + # sequence (the restriction site in the Loeb 2014 protocol). 
+ if (INVARIANT == "") { + INVARIANT = 5 + } +} + +$3 && $4 { + alpha = substr($3, 1, TAG_LEN) + beta = substr($4, 1, TAG_LEN) + if (alpha < beta) { + barcode = alpha beta + order = "ab" + } else { + barcode = beta alpha + order = "ba" + } + name1 = $1 + name2 = $2 + seq1 = substr($3, TAG_LEN + INVARIANT + 1) + seq2 = substr($4, TAG_LEN + INVARIANT + 1) + qual1 = substr($7, TAG_LEN + INVARIANT + 1) + qual2 = substr($8, TAG_LEN + INVARIANT + 1) + print barcode, order, name1, seq1, qual1, name2, seq2, qual2 +} |
b |
diff -r 13bcc2f459b0 -r af383638de66 make_families.xml --- a/make_families.xml Mon Nov 23 18:07:11 2015 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
b |
@@ -1,83 +0,0 @@ -<?xml version="1.0"?> -<tool id="make_families" name="Make families" version="0.1"> - <description>from duplex sequencing data</description> - <requirements> - <requirement type="package" version="0.1">duplex</requirement> - <!-- <requirement type="set_environment">DUPLEX_DIR</requirement> --> - </requirements> - <command>paste $fastq1 $fastq2 - | paste - - - - - | awk -f \$DUPLEX_DIR/make-barcodes.awk -v TAG_LEN=$taglen -v INVARIANT=$invariant - | sort - > $output - </command> - <inputs> - <param name="fastq1" type="data" format="fastq" label="Sequencing reads, mate 1"/> - <param name="fastq2" type="data" format="fastq" label="Sequencing reads, mate 2"/> - <param name="taglen" type="integer" value="12" min="0" label="Tag length" help="length of each random barcode on the ends of the fragments"/> - <param name="invariant" type="integer" value="5" min="0" label="Invariant sequence length" help="length of the sequence between the tag and actual sample sequence (the restriction site, normally)"/> - </inputs> - <outputs> - <data name="output" format="tabular"/> - </outputs> - <tests> - <test> - <param name="fastq1" value="smoke_1.fq"/> - <param name="fastq2" value="smoke_2.fq"/> - <param name="taglen" value="5"/> - <param name="invariant" value="1"/> - <output name="output" file="smoke.families.tsv"/> - </test> - <test> - <param name="fastq1" value="smoke_1.fq"/> - <param name="fastq2" value="smoke_2.fq"/> - <param name="taglen" value="5"/> - <param name="invariant" value="0"/> - <output name="output" file="smoke.families.i0.tsv"/> - </test> - </tests> - <help> - -**What it does** - -This tool is for processing raw duplex sequencing data, removing the barcodes and grouping by them into families of reads from the same fragment. - ------ - -**Output** - -The output will be a tabular file where each line corresponds to a pair of input reads. 
- -The columns are:: - - 1: barcode (both tags joined and ordered) - 2: tag order in barcode ("ab" or "ba") - 3: read1 name - 4: read1 sequence (minus the tag and invariant sequences) - 5: read1 quality scores (minus the same tag and invariant) - 6: read2 name - 7: read2 sequence (minus the tag and invariant sequences) - 8: read2 quality scores (minus the same tag and invariant) - ------ - -**Barcode creation** - -For each pair, the tool will remove the tag at the beginning of each read and create a barcode by concatenating the two tags. The order of the tags is determined by a string comparison so that it will make an identical barcode from pairs of either order. The original tag order will be noted in the second column. - -Since pairs from opposite strands will have the same tags, but in the reverse order, this produces the same barcode for reads from the same fragment, regardless of strand. Then a simple sort will group all reads from the same strand together, separated into strands by the different "order" values. - -Examples:: - - +---------------+-----------------+ - | input tags | output | - +-------+-------+-------+---------+ - | read1 | read2 | order | barcode | - +-------+-------+-------+---------+ - | ATG | CCT | ab | ATGCCT | - +-------+-------+-------+---------+ - | CCT | ATG | ba | ATGCCT | - +-------+-------+-------+---------+ - - </help> -</tool> |
b |
diff -r 13bcc2f459b0 -r af383638de66 misc/00README.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/misc/00README.txt Mon Nov 23 18:44:23 2015 -0500 |
b |
@@ -0,0 +1,7 @@ +bug1/diff.family.msa.tsv +bug1/diff.sscs.after.fa +bug1/diff.sscs.before.fa +-------------------- +Files on the differences between the outputs of two versions of duplex.py. The diff.sscs.*.fa files are the SSCS's (from --sscs-file) which are present in both outputs, but have different sequences. diff.family.msa.tsv contains the MSA's which produced the SSCS's. +before = a0d599c +after = 665ebe2 |
b |
diff -r 13bcc2f459b0 -r af383638de66 misc/ACCGACACAGACTAGGGATCAAAG.msa.qual.tsv --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/misc/ACCGACACAGACTAGGGATCAAAG.msa.qual.tsv Mon Nov 23 18:44:23 2015 -0500 |
b |
b'@@ -0,0 +1,58 @@\n+>ab.1\n+AGGAGTCCGAGGAGGTTAGTTGTGGCAATAAAAATGATTAAGGATACTAGTATAAGAGATCAGGTTCGTCCTTTAGTGTTGTGTATGGTTATCATTTGTTTTGAGGTTAGTTTGATTAGTCATTGTTGGGTGGTGATTAGTCGGTTGTTGATGAGATATTTGGAGGTGGGGATCAATAGAGGGGGAAATAGAATGATCAGTACTGCGGCGGGTAGGCCTAGGATTGTGGGGGCAATGAATGAAGCGAACAGATTTTCGTTCATTTTGGTTNTNNGGGTTTGTTN\n+........................................................................................................................................................... ...... ...................... ....................... . . . ..... . ... .. .. ........ .. ....... . ... .. . .. .. \n+..................................................................................... .. ......................... .... ................ .......................... ......... ......... ................... .. .. . .. ......... .......... . . . ..... .. .. . \n+......... ............................................................................................................................................... .. ............................. .. .... ........ ......... ...... ........ . ...... .. .. ..C ... ...... CA. .. \n+................................................................................................................................................................................. . ....... ........................... . .. . .......... ........... . ... . . . . . . . \n+........................................... ................... ...... ........................................................................................... . ... ....... .... ..... .. ............... .................... ..... .. . .. ...... .. . . . .. ..... \n+................................................................................................................................................................... ......................... .. ............. ............ .. .. ... ...... .. ... . . \n+............ .... ..................... ................................................ 
..... .......... .............. . ................................... .................. ...... . .. .. ............. .. . .. .. ...... . . .. ...C. AC.. \n+.................................................................................................................................... ........... ............................... ...... . . ...... .... ..................... . ....................... . . . . ... \n+... ..... ....... ......................... ................................ .. .......................... ....... . ........... .......... ........... . ..... ... ........... ........................... ............ ..... .. . . . .. . . ... .. \n+>ab.2\n+GGACGTCTAAACCAAACCACTTTCACCGCTACACGACCGGGGGTATACTACGGTCAATGCTCTGAAATCTGTGGAGCAAACCACAGTTTCATGCCCATCGTCCTAGAATTAATTCCCCTAAAAATCTTTGAAATAGGGCCCGTATTTACCCTATAGCACCCCCTCTACCCCCTCTACCCCCTCTAGAGCCCACTGTAAAGCTAACTTANCATTAACCTTNTANNNCANAGATTAANCNAACCAACACCTNNNNNCANCNNANTNCCCNAACNACATANNACCNN---------\n+. ............................... .................................................................. ...... ........... ....... .. .... ........ ........ . ....T.... .. . .. ..... ... . ..... ... . . . . . . . ..... C. . .........\n+.................... ... ............. ...... . . .................................... .... . . . . ... .... . . .. . ...........T..... . . .....T...---------.. ........ .. . .. ... . . . A . . . . \n+....................................................................... ....... ........... ............................ ......... ............................ . .... .. . ... . .. ... . ... ... ... . .. .'..b'........ ...... ............ . .. .......... ........... . . .... . .... . ....... .. .C.......... ......... .. ......... . . ........ ... . .... . . . . .... .... ... .. .. .. .. . . . \n+ .. ..... ........................... ....... .......... ...T.............................. .... ...... ........ .... ..... . ......... .. ...... ........ .. ... ........ ...... . .. . . . .. 
.... .... \n+... ... . ........ ........ .. . .................... ..... ........... ........... ... . . ....... ..... .. ....... ....... .. . . . .......... . . . . .. . . . ... .. . . ... .. . .. \n+................................................... .............. .................................... .... ..................... . ........... .................. ....... .. . . . .. . ...................... . . . . ..... . . . .. \n+................. .................. .......... ...................... .................. ...... ... ......... ....... . .............. . ...... ........ . ............ .. .... .G ... ... .. ...... . . . ... .. . . . \n+ . . ... .. ........ .. ...... ....... . . .. ........ ..... ........ . ... ........ .. ........ .. ...... ..... .. ...... ............ ..... . C.. . ..... . . . . .. . . . ... .... ... . . . .T. . . . \n+. ....... .................................................................... .................................... ............. ..... . .. . ......... . ... .... . . .C ... .. ............ .. . . ... . . .. . . . \n+............................................................................................... .. ........ .... ...... ............... . ...... ...... . .. . .... . . . . .. .... .. .. ...T. . . . . . ... .. . . . . \n+. . ..... ...... ... .......... ............. .. ... . ... .... . .... . .. ... .. . ....... . . ...G . ...... . ....... ... . . .C.... ... . .. . .. ... .. .... .A . \n+... ... . ... ....... ... .............. .. .......... ....... .... . .. .. .. .... . ..... ... . ..... . . ... ... . .. .. .. . .. . . . . . . . G . . .. . T . . . ... \n+.................... ...... ....................... ... ............................ ........... ... ..... . ............... ... ... . .... ... . .. .. ... ... ...... ... ... . .. .... . . . . . . .. ... . \n+............ ........ ................ .... .T.............. ........ . .. ... .... . .. .G.... ....... . .... ...... .... ....... ...G..G. .. . . 
.T . .. . . . . .... . ... ... ...... .. . .... . . . \n+......... ................................. .................................. ............ .. . .. .. ... ............. . ... .. . . .. . ......... . .... ...... . C ... G .. ..... ....... C.. .. . \n+................... ......... ............................................ ............ ...................... ................. ...... ...... ........... ................. .. ...... ..... ... ....... ...... .. . .. . .. . . .. . ..... \n' |
b |
diff -r 13bcc2f459b0 -r af383638de66 misc/ACCGACACAGACTAGGGATCAAAG.msa.tsv --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/misc/ACCGACACAGACTAGGGATCAAAG.msa.tsv Mon Nov 23 18:44:23 2015 -0500 |
b |
b'@@ -0,0 +1,58 @@\n+>ab.1:\n+AGGAGTCCGAGGAGGTTAGTTGTGGCAATAAAAATGATTAAGGATACTAGTATAAGAGATCAGGTTCGTCCTTTAGTGTTGTGTATGGTTATCATTTGTTTTGAGGTTAGTTTGATTAGTCATTGTTGGGTGGTGATTAGTCGGTTGTTGATGAGATATTTGGAGGTGGGGATCAATAGAGGGGGAAATAGAATGATCAGTACTGCGGCGGGTAGGCCTAGGATTGTGGGGGCAATGAATGAAGCGAACAGATTTTCGTTCATTTTGGTTCTCAGGGTTTGTTT\n+......................................................................................................................................................................................................................................................T...............................G...G.\n+.........................................................................................................................................................................................T.........................................................C.....T......................AT.T........\n+.............................................................................................................................................................................................................................................................C......T.G......G..............\n+.................................................................................................................................................................................C............................................................................A.......G.....................\n+...........................................C..................................................................................................................................................................................................C.............................................\n+.........................................................................................................................................................................................................................................C..............
A.......................A...T.......\n+.........................................................................................................................C..................................................................G.....................................................T..........G......AC......C...AC..........\n+.......................................................................................................................................................................................T.......C...........................................................................T.....GT.........\n+.................C......................................................................................................................................................T.............................................................T..........C..A.T.......G..............GT.G..T.......G\n+>ab.2:\n+GGACGTCTAAACCAAACCACTTTCACCGCTACACGACCGGGGGTATACTACGGTCAATGCTCTGAAATCTGTGGAGCAAACCACAGTTTCATGCCCATCGTCCTAGAATTAATTCCCCTAAAAATCTTTGAAATAGGGCCCGTATTTACCCTATAGCACCCCCTCTACCCCCTCTACCCCCTCTAGAGCCCACTCTAAAGCTAACTTAGCATTAACCTTTTCCGTTAACGTTTAAGCAAACCAACACCTCTCTACAGTGATATGCCCCAACCACATCCTACCGC\n+...........................................................................................................................................................T......T....T.................T..........C..C..........C........C.....C................C..........C.......C.CA........AC.........\n+...............................................G.........................................A.....A.............................A...........T.................T.................G..G.G..CACT.TAAAGCTAACTTCT.ATTAACCTTT.A.GA.AAAG.TTAAG.GAAC...CGCCT.T.TACAT.TAAATGCCAC.C..AATTACTA.CGGATGGC.AA.\n+............................................................................................................................................................................C..............T...................T.............AA......T
A......G.........'..b'..................................................................A.......................C......................C..........................................................G.T......T...................A..........T.....T.A............A.CTC..A..C.....G.G..AG.....CT.G..GAG..AC.C.A..\n+............................................................T..............................................................C.......................................................................C.....................T....C.C........C...G.C....C.......GCG.A.G.GC..........AT...A.....C\n+.........T.................T....T.....................................................T...T............................................C....CT...............T.........C...G.......G.C...TG.....C..C.........T...T....T.T......G.............T....C.....G..G.GG.T..CGC.G.G.....GCT....G....G\n+.........................................................................................................................................................................................................................A.C..C.........G....T......C.G.A.....G......C...G...GG...T...A....G\n+.................................................................................................................................................................................C...........G.............................C..C.G.......G.G.........A.TG......G........G.CT.G...C...T.......\n+...C.....................................................T....................A........................T........................T.....................C..TT........T...A......G..T...A........................A....T.G........G.A....A.T.TC..AT.T.GTC.T..TAC..G.AT.CT....AT.G...AC........GC\n+...........................................................................................................................................................C.....................C.......C....................................CG..C....AG.C..G......C...TCTG........
.C.....C.CT.C...TGC...CC\n+..........................................................................................................................................................................T................C.........A........T......G..AT.....G...A..T..G.G.....G..C.........G.T...G.........GG.GT..G.....G\n+....................C............C......................C.............................A.A.......GC...G....C.....G.................C..C................TG...C....C..........CA....T.C.T....C....C.GAT.....C........A.....TA....TG..AT..T.AC.G.A.....T..G....C....G...GTG.AC.TG.G..TT...C.A...\n+.........C................................................................................G.....C...A......................................T..........G..T.TC.GG..T....T...C.GTT......TA..G.G.TC.....AT.....A..TG..AG...GTG..T....G......TG..G......A..GT.....G.T.G.G....G....A.T.T.TG......\n+................................................................................................C............................G.........T....................G......T....................................C...........CT..A..C......G......G...AG..T..C....C..G.......GC..G.....G.CGT..G......\n+..............................................T..............T...........A..G...........A.....A.G....A..................A........................G..G..G..........TC..A.....G...................................A..........T..G...........G..A...CG..T.CA..GGG......A.G.G...AG..TG.A.GC...GG\n+.........................................................................................................................................................C...C.............CG...C..C.....G.TGC...........C...............A.C..C.G...C.....C..ATG....AC.C...............GA..A.GT.CT....G.....\n+.........................................................................................................................................................................................................C...............A....G.G...T..TAC........C.A......
G....G..A............CC..........\n' |
b |
diff -r 13bcc2f459b0 -r af383638de66 misc/ACCGACACAGACTAGGGATCAAAG.tsv --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/misc/ACCGACACAGACTAGGGATCAAAG.tsv Mon Nov 23 18:44:23 2015 -0500 |
b |
b'@@ -0,0 +1,25 @@\n+ACCGACACAGACTAGGGATCAAAG\tab\t@M02286:57:000000000-AGCM5:1:1105:17191:19189 1:N:0:1\tAGGAGTCCGAGGAGGTTAGTTGTGGCAATAAAAATGATTAAGGATACTAGTATAAGAGATCAGGTTCGTCCTTTAGTGTTGTGTATGGTTATCATTTGTTTTGAGGTTAGTTTGATTAGTCATTGTTGGGTGGTGATTAGTCGGTTGTTGATGAGATATTTGGAGGTGGGGATCAATAGAGGGGGAAATAGAATGATCAGTACTGCGGCGGGTAGGCCTAGGATTGTGGGGGCAATGAATGAAGCGTACAGATTTTCGTTCATTTTGGTTCTCAGGGTGTGTGT\tGFGGGGGGDEEGGCFFGFGGGGGGGGGGGGGGGGGGGGGGGGGGGFGGGGGAFGGFFGGGGGGFFGDEGGGDGGGGDFFCGGGGGDGGFFAFGGFFGGGFGGGCFGFFGGEGGGFGFDGGGEGGCFFDDFG:CFGGEFGGFGGDFGGGGGGGGCD,ADF9FG,,@7EE777CEFCDFC>BBCEDG7*1?DF7>FCGC?CFGFCGFGG5=CC35/7*9++>FG6:**:*>5<)7536C*9@CB7B?D)3)06@)9C<FGB?)9-<F?)<B(-44-6(7>(9:1((\t@M02286:57:000000000-AGCM5:1:1105:17191:19189 2:N:0:1\tGGACGTCTAAACCAAACCACTTTCACCGCTACACGACCGGGGGTATACTACGGTCAATGCTCTGAAATCTGTGGAGCAAACCACAGTTTCATGCCCATCGTCCTAGAATTAATTCCCCTAAAAATCTTTGAAATAGGGCCCGTATTTACCCTATATCACCCCTTCTATCCCCTCTACCCCCTCTATAGCCCACTCTCAACCTAACTTAGCCTTAACCTTCTCCGTCAACGTTTAAGCAAACCCACACCTCTCTCCAGTGATCTCACCCAACCAACTCCTACCGC\t:,C@@8CFEEFGGGGGGGCFFAEFGGGGGE@FG,@FEED@C7=FF<FEGCFCCGGGFE9EFFG9F<EGGGGGA8FA<7=EF==FG<599@FGF9E<EEF:+?FCFDF,??<<D9F9DE<+D9<AF@F,E9,D>FF,3,DEEG:58C3DCBCDCCE,;,EF:BC8:C:,@D*=,AD*<CGGG4CF7,<+=6CCC4)4*2+*20(;>6+*2+)/*)2;3:*.2++./5,.*:)2(72-6)(.5((,8@?6704()972(-)))))))..3:(((-423))))).((\n+ACCGACACAGACTAGGGATCAAAG\tab\t@M02286:57:000000000-AGCM5:1:1105:8085:17960 
1:N:0:1\tAGGAGTCCGAGGAGGTTAGTTGTGGCAATAAAAATGATTAAGGATACTAGTATAAGAGATCAGGTTCGTCCTTTAGTGTTGTGTATGGTTATCATTTGTTTTGAGGTTAGTTTGATTAGTCATTGTTGGGTGGTGATTAGTCGGTTGTTGATGAGATATTTGGAGGTGGGGATCAATAGAGGGGGTAATAGAATGATCAGTACTGCGGCGGGTAGGCCTAGGATTGTGGGGGCAATGAATGAACCGAACTGATTTTCGTTCATTTTGGTTCTATGTGTTTGTTT\tGCCFECCECFGEGDGGGFGFGGGGGGFG@EEF<EE@E9@AEC@AGGGGGGGCFF9CFGFGGGGFGGGGGGGD9<C@CCFGC<FGF,CF,<EDFFFGGGF<AD??EGFF<D@<CA,CB?F,C9F?FFFEFGGGDGF<,FFFGFDFC7FFFGFGFGF9C;=AFFG,37B=:>EEEG,7EB;ECC6>**41>DC7>CEC9;<9D@DFF7:45C*/*::32A+9?2<7C8DFDCD/C>D9CCFFC=**.)7()).9)9*)7C:?;)44/95).442))..96(,2,8(\t@M02286:57:000000000-AGCM5:1:1105:8085:17960 2:N:0:1\tGGACGTCTAAACCAAACCACTTTCACCGCTACACGACCGGGGGTATAGTACGGTCAATGCTCTGAAATCTGTGGAGCAAACCACAGTTTAATGCCAATCGTCCTAGAATTAATTCCCCTAAAAATATTTGAAATAGGTCCCGTATTTACCCTATATCACCCCCTCTACCCCCTGTAGCGCCCACTGTAAAGCTAACTTCTCATTAACCTTTTAAGACAAAGCTTAAGAGAACTAACGCCTCTATACATTTAAATGCCACACATAATTACTACCGGATGGCCAAC\t:DFFEGCFCFC<<<DFEEFD,,,6CF,B@FGGGDB::BE7+F@=CEF,9,<,+C?:<CFF9?<EACEGGGGC<@EFGGF<EF8<=E?AF,BEFE,,5,,?4,:+A,??A,<FFF,,A,,A,7:,A,@?A@AFFFDC8>D9==<+3,@,@,@9@F9CE;EG?*BEE?;EF8*1**5=,,1;*)440<=+=8:+*11/=+4+0+)3;+1*;3+1+6++)92/*)*/+1)3*;)*)143)(,.),)./64-))))-9424)).(().)))).).)(((,((((((,(\n+ACCGACACAGACTAGGGATCAAAG\tab\t@M02286:57:000000000-AGCM5:1:1108:14879:20886 1:N:0:1\tAGGAGTCCGAGGAGGTTAGTTGTGGCAATAAAAATGATTAAGGATACTAGTATAAGAGATCAGGTTCGTCCTTTAGTGTTGTGTATGGTTATCATTTGTTTTGAGGTTAGTTTGATTAGTCATTGTTGGGTGGTGATTAGTCGGTTGTTGATGAGATATTTGGAGGTGGGGATCAATAGAGGGGGAAATAGAATGATCAGTACTGCGGCGGGTAGGCCTAGGATTGTGGGGGCAATGAATGAAGCGAACAGATCTTCGTTTAGTTTGGTGCTCAGGGTTTGTTT\tFCGGGGFG;+FFCGD@FFGFGFGGGGGGGGGGGGGGGGGGF<FGFGGGGGGGGGDDFCFGGGGFFGGGGGGGGG<FGFGGGGGG9EFGGGGGGCFFGED@F8DCFCEEEGGGFGEFFF?EFF9FGEGGGGGGDGGGFGGGGGGGGDGGGG@;F,A>,>DD9=9=DGECFGC@:CC8CAFCFFECG5*,2=F,9C;<,2=EFGD7DG*=DGEGEEGG46CFF?<*09<CDGGG>3)7*9:DFF64F9*)7>)797*9>?1))2)6<<B??)436>?0((.,<F((\t@M02286:57:000000000-AGCM5:1:1108:14879:20886 
2:N:0:1\tGGACGTCTAAACCAAACCACTTTCACCGCTACACGACCGGGGGTATACTACGGTCAATGCTCTGAAATCTGTGGAGCAAACCACAGTTTCATGCCCATCGTCCTAGAATTAATTCCCCTAAAAATCTTTGAAATAGGGCCCGTATTTACCCTATAGCACCCCCTCTACCCCCCCTACCCCCTCTAGATCCCACTCTAAAGCTAACTTTGCATTAACCTTTTAAGTTAACTATTAAGCGAACCAACACCTCTTTACATCGTCATGCCCCAACTACATAATCACAT\tCFFFGCFGGEFGGGGAFGEDGCFGG9FFGEGGGGGGDGGECEFEGGGGGCFGGGGGGGGD9EEF?FFFEC?,C?CCE@E,E=FGCDFCDEF,<FFGGGGGGGFEGF9FDGG9BDFGGGFF,,CAFGFGF>B3>FGAD=FCG8FCFFCCFGGGCCDEF9F>*>*358DB,>C***:*=CC*?)AD+1++16C7*;.+1+49:D47D7++/47::):4<9+*09*)1+6***60<+6**253)34()2,-)0444/..))((,-.)))5((-(,.)..))))),))\n+ACCGACACAGACTAGGGATCAAAG\tab\t@M02286:57:000000000-AGCM5:1:1116:18034:3824 1:N:0:1\tAGGAGTCCGAGGAGGTTAGTTGTGGCAATAAAAATGATTAAGGATACTAGTATAAGAGATCAGGTTCGTCC'..b'CF9FEEGD,C<@+@@:+A,@E@F,,@7>+3+3@,,,98,,6>,C=93@C834>6@6:@,,2>;D,BE8**/2,D+=C,86:;,*+0,2;//(8*7*/*8(*2.*;)(**)*11:*()17<*)****/))()02-(1.:66,440())*.)(,))(*(-())9((((,\n+ACCGACACAGACTAGGGATCAAAG\tba\t@M02286:57:000000000-AGCM5:1:2109:13003:15058 1:N:0:1\tGGACGTCTAAACCAAACCACTTTCACCGCTACACGACCGGGGGTATACTACGGTCAATGCTCTTAAATCTGTGGAGCAAACCACAGTTTCATGCCCATCGTCCTAGAATTAATTCCCCTAAAAATCTTTGAAATAGGGCCCGTATTTACCCTATAGCACCCCCTCTAACCCCTCTACCCCCTCTAGAGCCCACTGTAAAGCTAACTTAGAATTAACATTTTAAGTTAAAGCTTAAGAGCACAAACACATCTATACAGTGAAATACCCAAACTAAATACTACAGT\t,9C<FBEDFFFGGFDEFFAFFFGGFGEGDGGGFEEGG7:+@+7FGG9FGGGCCF,BFFEFGEF,EFCGFGGF,BBFFGF8,BAFGFFDFGCD9?F<FFG7FB:FGFG9<A?,FEFD<B@F@><B>>FA,,F,@DEGC,F:FEGGG@FGG<8,@<3@9,7,@*@<EE9,7D,166,,@,5**:E,<,@9@:::BFGC?994AC+;<?1+=,>+=?FE+<99;C6+5>9F+;+>>77+<**2C:*<FFG:09F*8>F*95*00)6*.36*(0)*)4<9A=*)))..\t@M02286:57:000000000-AGCM5:1:2109:13003:15058 
2:N:0:1\tAGGAGTCCGAGGAGGTTAGTTGTGGCAATAAAAATGATTAAGGATATTAGTATAAGAGATCTGGTTCGTCCTTAAGGGTTGTGTATGGATATCAATGGTTTAGAGGTTAGTTTGATTAGTAATTGTTGGGTGGTGATTAGTCGGTGGTGGAGGAGATATTTGTCGGAGGGGAGCAATAGAGGGGGAAATAGAATGATCAGTACTGCGGAGGGTAGGCCTTGGGTTGTGGGGGCAGTGAATGCGGGTACAAGGGGTTCGTTAAGTGTGGAGCTTGGAGGCTGTGG\t:<DCAF89C7@6+8BFCGD<6,<BFFC9FCGF@FEGDF,C<F9,,6CEF9FGAFAFCCEE<,CECE6@@B,8,,C?,:F8+88C<,54,5C,,,,<5A<E<,,,CB==99?,,C,,A<F?,,C<5C9B,6@?7+A9FFF9@4>>@:@8FC+,3@E,8,8,,6=,6++5:+8+*5*,,53,7,58C:*;1:C7++=:7+>9CDC=++**(3:5*+/93+*00*)/**1/7C8<(7*,<4*):*()()(0)0.(()2((0,)*())***)**(.(04)))((0(,(\n+ACCGACACAGACTAGGGATCAAAG\tba\t@M02286:57:000000000-AGCM5:1:2118:27229:11823 1:N:0:1\tGGACGTCTAAACCAAACCACTTTCACCGCTACACGACCGGGGGTATACTACGGTCAATGCTCTGAAATCTGTGGAGCAAACCACAGTTTCATGCCCATCGTCCTAGAATTAATTCCCCTAAAAATCTTTGAAATAGGGCCCGTATTTACCCTATAGCACCCCCTCTACCCCCTCTACCCCCTCTCGTGCCCCCTGTAAAGCTAACTTAGCATTCACCTTTTAAGTCAAAGATTAAGATAACACAAACCTCTTTACAGTGCAATTCCCCACATACATACTACCGT\t9FFFGGGGGGGGFCFGGGGCDDFEFGGGGCFGGFGGGGC7FEEGGGFG9FGDGGDFGGGGGGGFGEFGGGGDAECEEGGFGGGGGG9@FF9FEGFGFGG>DFC:F,E,CFDEFF,FGDF7>DGDFFGFG?FG9FG<=,3C7@+>AB=F,@>><<@,<;,>::FBE,>,,>F*5>>9<DBF:*=C,1*6*42:*/*3C++5?9+2,@C++5++3++>CA:9:<6C1+<C+3++<*2*<299F++*).*8476:7*0)0-***672*./9,1(024429*4:<F14\t@M02286:57:000000000-AGCM5:1:2118:27229:11823 2:N:0:1\tAGGAGTCCGAGGAGGTTAGTTGTGGCAATAAAAATGATTAAGGATACTAGTATAAGAGATCAGGTTCGTCCTTTAGTGTTGTGTATGGTTATCATTTGTTTTGAGGTTAGTTTGATTAGTCATTGTTGGGTGGTGATTAGTCGGTTGTTGATGCGATCTTTGGAGGTGGGGCGCAACAGCGGGGGGATGCGAATGATCAGTCCTGCGGCGGGTAGGCATCGGCTGGTGCGGGCACTGATGGAAGACACCAGATTTTCGTTCATGATGATGTTCTGGGTGTGTTT\t6:C6@FFFE+@CFDC6CFFGGGGGGCAFFGDCFGGGECFGGGF,6:CF9FEE@FFCGGFGDD@CGFEE@CCCA6FFEG,9BFGGGCGG?9?,5<,<,CE,9A,8B<4<<A9C<F<FAF@E,A,F<9,:A+4:+@4,97,,;,>@F>7@EG9,,,3383,@=CD,@=EDCE+3**3**6164;5B**2);)2/)96*++3+*2+9<CG53CGFC:77*+2785)*)0)2*2C81,1.))+)0)*((*().),.)**))-,((0)/)-:)*1.)/)..*2)(-.((\n+ACCGACACAGACTAGGGATCAAAG\tba\t@M02286:57:000000000-AGCM5:1:2119:25043:11501 
1:N:0:1\tGGACGTCTAAACCAAACCACTTTCACCGCTACACGACCGGGGGTATACTACGGTCAATGCTCTGAAATCTGTGGAGCAAACCACAGTTTCATGCCCATCGTCCTAGAATTAATTCCCCTAAAAATCTTTGAAATAGGGCCCGTATTTACCCTATAGCACCCCCTCTACCCCCTCCACCCCCTCTCGACCCCACTGTAAAGCTAACTTAACCTTAACCTCTTAAATTAAAGATTAAGCGAACCAAAACCCCTTTACAGTGAAATGCCCCAACTAAATACTACCGT\tEGGGGGGGCGGGGGGGGGFGGGGGGGGGGGGGGGDGGGGGGGGGGGGGGGGGFGGGGG9DEGGGFG9FGGGGFFGGGGGGFDFFDEEFFGGGFGGGDFGGGGGGFGGFFGCGAFF9AEFG,DFDFGGGCFG9DFGCD8FGGGG7FFG9F,DF7FE<F,,@*FF*33,,?<DC8C*5DFCGC8FC,*6*:CF1:B+++2<+?A2;@1;++?+ACC7FC7+0+12++3<9C0<<?F?4+0;08)***.:)*1.707D494*:*25==:5*(0*99,)*2<):*8(,\t@M02286:57:000000000-AGCM5:1:2119:25043:11501 2:N:0:1\tAGGAGTCCGAGGAGGTTAGTTGTGGCAATAAAAATGATTAAGGATACTAGTATAAGAGATCAGGTTCGTCCTTTAGTGTTGTGTATGGTTATCATTTGTTTTGAGGTTAGTTTGATTAGTCATTGTTGGGTGGTGATTAGTCGGTTGTTGATGAGATATTTGGAGGTGGGGATCAATAGAGGGGGAAATAGAATGATCAGTCCTGCGGCGGGTAGGCATAGGGTGGTGTGGTACATGCATGACGAGAACAGGTTTTGGTACATTTTGGTTCTCCGGGTTTGTTT\t:9CDFGFGG@:F@@GFFG@,E<FGGGE@C,;E96FFGCCFFF9CFFGD9E<AFCFFF?DDCFFGGDGGEFGFCA,6C?FEFFGGFFG,CEGGGGCFAEGGGGGEE<FGGF,EE9<?D?<FC?FGEEG8,CFFGC<,;EF?AE+@=CEEGGF9=;,;BFGGGGGFGEGDCCC6+7B,@DG<FC*8C7:92CGG+197=EFGG,*=F?BDD*:5)/73**2*<C*;(27F*1C0))0*9)/9=*)-/,(06))).0010(,+))).*=B>7:0-*)(-)1-*()..\n' |
b |
diff -r 13bcc2f459b0 -r af383638de66 misc/bug1/CTGCGACACAATATTGGGCTCCCC.ab.2.family.msa.tsv --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/misc/bug1/CTGCGACACAATATTGGGCTCCCC.ab.2.family.msa.tsv Mon Nov 23 18:44:23 2015 -0500 |
b |
@@ -0,0 +1,8 @@ +CTGCGACACAATATTGGGCTCCCC ab 2 @M02286:46:000000000-AEG11:1:1116:22967:7077 2:N:0:1 AAGAGCAACTCGGTCGCCGCATACACTATTCTCAGAATGACTTGGTTGAGTACTCACCAGTCACAGAAAAGCATCTTACGGATGGCATGACAGTAAGAGAATTATGCAGTGCTGCCATAACCATGAGTGATAACACTGCGGCCAACTTACTTCTGACAACGATCGGAGGACCGAAGGAGCTAACCGCTTTTTTGCACAACATGGGGGATCATGTAACTCGCCTTGATCGTTGGGAACCGGAGCTGAATGAAGCCATACCAAA-CGACGAGCGTGACCCCACGAGG FGGGGGGGGFGGGGGGGGGGGGGG<FDGGGGGGGGGGGGGGAFGEGGGGGGGGGGGGGGGGGGGGGGGGGGGFGGGGGGGGGGGGGGGGGGGGGG,EFGGGGFGGGGGGGFDGGGGGGDGGFEGGGGGGGC;;FGGGGFGGGCC8BCDFGFCDFFFC9EGGGGGGGG7DGGGEC*:CCF7FFGGFGDCCFGFGGFCG+8?7)95D>G>BFGAFFFAFEA:>BFF;*:>:@?F0(<2:1-399>:?)<<BA9A77?42132?? <?69>>>((-(((*.41(,((( +CTGCGACACAATATTGGGCTCCCC ab 2 @M02286:46:000000000-AEG11:1:1118:14605:8689 2:N:0:1 AAGAGCAACTCGGTCGCCGCATACACTATTCTCAGAATGACTTGGTTGAGTACTCACCAGTCACAGAAAAGCATCTTACGGATGGCATGACAGTAAGAGAATTATGCAGTGCTGCCATAACCATGAGTGATAACACTGCGGCCAACTTACTTCTGACAACGATCGGACGACCGAAGCAGCTAACCGCTTTTTTGCACAACATGGGGGATCATGTAACTCGCCTTGATCGTTGGGACCCGGAGCTGCATGAAGCCATACCCAC-CGACGCGCGTGACACCCCGCTT FAAFDE8FEDECB6@@CC@7FGEE<FDCEFFFGGGEGGGGGCEFGGGGC8FCCEFCEFGD9FFFGG,:@F8<A6EFGGF<FCCC<<BEAF=CEF9E9<,:,5?9?ADD9F@FFEGGG9?,BE,,CE,9CFG;FEAD9,8CEG6+3@37AD=;DF9A9+38+8D8=8A++3479@EG*,=41253,*4(6-0///;=6+*2*86**(5/;;;++;C1.A8)4>D:?B*).0(344(*(,-((42(.(.)5A)9?0<4<7?+5( (,(,,(((->18:0,((02-92 +CTGCGACACAATATTGGGCTCCCC ab 2 @M02286:46:000000000-AEG11:1:1118:21309:6959 2:N:0:1 AAGAGCAACTCGGTCGCCGCATACACTATTCTCAGAATGACTTGGTTGAGTACTCACCAGTCACAGAAAAGCATCTTACGGATGGCATGACAGTAAGAGAATTATGCAGTGCTGCCATAACCATGAGTGATAACACTGCGGCCAACTTACTTCTGACAACGATCGGAGGACCGAAGGAGCTAACCACTTTTTTGCACAACCTGGGGGATCATGTAACTCGCCTTGATCGTTGGGAACCGGAGCTGCATGAAGCCATCCCAAA-CGACGACCGTGACACCACCATT GGGGGGGGGGFGDCGGGGGGGGDGGGGGGGGFGGGFFFGGGCFGF@CCGFFGGGGGGGGFGGGGFGFCFDFFDEGE?@FFF7FC?FFGGGGGGGAF?FDFFGGGFGFEFGCEFGGGGGGGFEFFGGGGGFDCFGGGGFD@EGGGFEEEFE,EDDEFF5DD@FCFEE>CDCGGD>ED5CDFFGAFFGF@CEEFG4C:A:8?*//C5577?F;FACCFFF4D@EB33=675A1(7284<?9>FB9?B02)6<29???A(23+43 :<767(-(4C<((0)-()(()) 
+CTGCGACACAATATTGGGCTCCCC ab 2 @M02286:46:000000000-AEG11:1:2101:17733:13519 2:N:0:1 AAGGGCACCCGGGTGGCGGCAACCATAATTCTAAGATTGCTTGGGTGGGGTATTACTTAGCACAGGAAAAGAATCTAAGGAAGGGCAGACAGGAAAGGAATTAATGCATTCCTGCATAACCAAGGAGGGAAAAACCGGCGGCCAACTTCCTTCGACAAAGGTAGGGGGGACCAAAGGGGCAAACCGCTTTTTTCCACAAATGGGGGCATAATGTAACGCCCCTTG-TTGTTGGGGACCGGGCCCGAAAGGACCCAAACCAACACGACGACCCTATCACAAAACGG B9,,,,:,,,+8+++8C+++++,,,,,,<,,,,,,:9,,,,C,B,B,++++,,,5,,5,,,<E,,,C4E,,+,,,<5,,:,,4B+8+++,,,+,B,,,,:,75,,,,7,,5,,,83,5C3<<+,+,8+++,@,,++++3@*1*1,*,2,,1,,***4:C,,1,,3***28E**;/2***)*1*9*)/9:*7*1*0***)1**1095))))0*00/**))()-80) **)0.439>>4*-(,)((-(()((.((,4((-((4(),((-((-(()()/).))(((4- +CTGCGACACAATATTGGGCTCCCC ab 2 @M02286:46:000000000-AEG11:1:2103:23125:15471 2:N:0:1 AAGAGCAACTCGGTCGCCGCATACACTATTCTCAGAATGACTTGGTTGAGTACTCACCAGTCACAGAAAAGCATCTTACGGATGGCATGACAGTAAGAGAATTATGCAGTGCTGCCATAACCATGAGTGATAACACTGCGGCCAACTTACTTCTGACAACGATCGGAGGACCGAAGGAGCTAACCGCTTTTTTGCACAACATGGGGGATCATGTAACTCGCCTTGATCGTTGGGAACCGGAGCTGAATGCAGACATAACAAA-CGCCGAGCGTTACACCCCGATG FGGGGGGFGGGGGGGGGGDGGGGGGGGCGGGGGGGGGGGGGGGGGGGGGGGGGGGCGFGGFGGGGGGGGGGGGGGGGGGGGCEGGGGGGGGGGGG9FGGGGGGGGGGFGGGEBEFGFBFFGGGGGFGGFGGGGGGGGGFDEEEGFGDDFGGGGGG,@EEFEFFGGG6CDEGFEC8?*,79CFCFGGGGDGGFGGGGFGGGF4*8*6=7>FD+788FC7:37GEA@<8F?5:?46C),<(9B90??>?4*)1..406B).5)2 4<((49>07()--4/4(2((-( +CTGCGACACAATATTGGGCTCCCC ab 2 @M02286:46:000000000-AEG11:1:2104:14576:24265 2:N:0:1 AAGAGCAACTCGGTCGCCGCATACACTATTCTCAGAATGACTTGGTTGAGTACTCACCAGTCACAGAAAAGCATCTTACGGATGGCATGACATTAAGAGAATTCTGCAGTGCTGCCATAACCATGAGTGATAACACTGCGGCCAACTTACTTCTGACAAAGATTGGAGGAACGAAGTGTATAACCACTTTTTTGCACAACATGCGGGATCGTGTAACTCGCTTTGTTCGTTGCTCACCGGAAGCGATAGCGACCATGCCACC-CGTACCGCGGTCAACACCGTTT <<F,;C6CFGGCFG7C:BCD7C:9,,CCFG,,,<,,,<E,<FC6<:DFCAEF,,,66CC<9CFGGGDGCE6=ECFA<F<A7:@FGC4ED,<A,9,:,C,,4,,,,,5@A,?F,55EFFFGGG@9,4=,9E;@DGGF9,E+++++3A@,6,=;DD,6@,@=,6,,7,,+6+,0+4@8+,,,+2257,3@,2,219@+4+*/406**)*02?*/;C)=>8+).**/)1)1):;4(++./26()(((((0)(.,)(0())(.64( ()--()()(.((,0).(((.(( +CTGCGACACAATATTGGGCTCCCC ab 
2 @M02286:46:000000000-AEG11:1:2104:25265:19405 2:N:0:1 AAGAGCAACTCGGTCGCCGCATACACTATTCTCAGAATGACTTGGTTGAGTACTCACCAGTCACAGAAAAGCATCTTACGGATGGCATGACAGTAAGAGAATTATGCAGTGCTGCCATAACCATGAGTGATAACACTGCGGCCAACTTACTTCTGACAACGATCGGAGGACCGAACGAGCTAACCGCTTTTTTGCACAACCTGGGGGATCATGTAACTCGCCTTGATCGTTGGGAACCTGAGCTGAATGAAGCCCTACCAAC-CGACGAGCGTGACACCACGATG GFFGGGGGGFGGDEFGGGGGGGGGGGGGGGGFGGAD<FAFGDG9FGGCEGFFFGGCGGGCGGGGGGC@FEFGGFFGGGG?7FGGFGGGGGGGDGDFFFDFFFGDGGGFE<FEFDC@<FFEFFFGCCFAF9FCAFFGG?EFGGGGGGCFFGA@>EF;E?DFGGF?EE@+8@DD6E>*@C574=B:DEG>=*ADGBFGC=D4*;*;76C378;A6CACCDD59CC()+*.8*.)45*3>7((0,,54)/*)426))(.4:())( 4)--6073(8?((633(36((( +CTGCGACACAATATTGGGCTCCCC ab 2 @M02286:46:000000000-AEG11:1:2119:22759:6520 2:N:0:1 AAGAGCAACTCGGTCGCCGCATACACTATTCTCAGAATGACTTGGTTGAGTACTCACCAGTCACAGAAAAGCATCTTACGGATGGCATGACAGTAAGAGAATTATGCAGTGCTGCCATAACCATGAGTGATAACACTGCGGCCAACTTACTTCTGACCACGATCGGAGGACCGAAGGAGCTAACCGCTTTTTTGCACAACATGGGGGATCATGTAACTCGCCTTGATCGTTGGGAACCGGAGCTGACTGAAGCACTACCCAA-CGACTACCGTCACACCACGATT GGGFFCFGGGGEGGGGGGGGGGDFGGGGGGGGGGFGAEGGGFGGFFGGGGFGGGGGGGFGGGGGGGCFGDDFGGGGGGGGGGGEDFGGFGGGGGFGGFFGGGGGGFGFFGFCFFGFCD@?FDGGFFG4EFFFGGDGGFGGGEGGFFGFDA9EFGG=9,@F+8+@>E6@E68:E5*;7C>CCE@FFGD9?96:57DFGFCGBC8?3(:CD3;8:@:+8+;3CDE<+27:FF5,:5A,73*((170(4).*/4+,)(.:?B:<, 8<(-((((-((((*,7(4(((( |
b |
diff -r 13bcc2f459b0 -r af383638de66 misc/bug1/CTGCGACACAATATTGGGCTCCCC.ab.2.sscs.after.fa --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/misc/bug1/CTGCGACACAATATTGGGCTCCCC.ab.2.sscs.after.fa Mon Nov 23 18:44:23 2015 -0500 |
b |
@@ -0,0 +1,2 @@ +>CTGCGACACAATATTGGGCTCCCC.ab.2 8 +AAGAGCAACTCGGTCGCCGCATACACTATTCTCAGAATGACTTGGTTGAGTACTCACCAGTCACAGAAAAGCATCTTACGGATGGCATGACAGTAAGAGAATTATGCAGTGCTGCCATAACCATGAGTGATAACACTGCGGCCAACTTACTTCTGACAACGATCGGAGGACCGAAGGAGCTAACCGCTTTTTTGCACAACATNGGGGATCATGTAACTCGCCTTGATCGTTGGNAACCNGAGCTGAATGANGCCATACCCAACGNNGAGNNTGNCNNNNNNNNN |
b |
diff -r 13bcc2f459b0 -r af383638de66 misc/bug1/CTGCGACACAATATTGGGCTCCCC.ab.2.sscs.before.fa --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/misc/bug1/CTGCGACACAATATTGGGCTCCCC.ab.2.sscs.before.fa Mon Nov 23 18:44:23 2015 -0500 |
b |
@@ -0,0 +1,2 @@ +>CTGCGACACAATATTGGGCTCCCC.ab.2 8 +AAGAGCAACTCGGTCGCCGCATACACTATTCTCAGAATGACTTGGTTGAGTACTCACCAGTCACAGAAAAGCATCTTACGGATGGCATGACAGTAAGAGAATTATGCAGTGCTGCCATAACCATGAGTGATAACACTGCGGCCAACTTACTTCTGACAACGATCGGAGGACCGAAGGAGCTAACCGCTTTTTTGCACAACATNGGGGATCATGTAACTCGCCTTGATCGTTGGNAACCNGAGCTGAATGANGCCATACCCAANCGNNGAGNNTGNCNNNNNNNNN |
b |
diff -r 13bcc2f459b0 -r af383638de66 misc/bug1/GAGAACTGAAACAGCAACTATCCG.ba.2.family.msa.tsv --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/misc/bug1/GAGAACTGAAACAGCAACTATCCG.ba.2.family.msa.tsv Mon Nov 23 18:44:23 2015 -0500 |
b |
@@ -0,0 +1,3 @@ +GAGAACTGAAACAGCAACTATCCG ba 2 @M02286:46:000000000-AEG11:1:2107:12224:17165 2:N:0:1 AATAGCAAAGCAAGCAAGAGTTCTATTACTAAACACAGCATGACTCAAAAAACTTAGCAATTCTGAAGGAAAGTCCTTGGGGTCTTCTACCTTTCTCTTCTTTTTTGGAGGAGTAGAATGTTGAGAGTCAGCAGTAGCCTCATCATCACTAGCTGGCATTTCTTCTGAGCAAAACAGGTTTTCCTCATTAAAGGCATTACACCACTGATCCCTTTCATCAGTTCCATAGTGTGGAATCTAAAATATACAAACACTTAGCTTCAGTTGTTAACAGATTTATACAC- GCFD,FCFEFAE,,<,,,;6CEF,,<@E,CEG<FGF?CF8@FAEFCEG<E,CFF6C@EE9E<,CF9FGD8@,@CF9E,<69@7F,AE<CFGGDG9F@EFAEEFG,=FFGGFC=<E,CFEG,,,5AF84E9E@,AD;?FEGFFGGAF,=E=,,,,8,88DEGFGG;DBD@FFCD,@=D?2@,6?,6@?E7,+2,26=8A+@8;:D??2,5+B:*+9=*+5?A@+;?;9**+*313.*3*7@;**2*+*4*7*)/(*00***1*8:***/-*1***201)+19<1+ +GAGAACTGAAACAGCAACTATCCG ba 2 @M02286:46:000000000-AEG11:1:2113:9530:10512 2:N:0:1 AATAGCAAAGCAAGCAAGAGTTCTATTACTAAACACAGCATGACTCAAAAAACTTAGCAATTCTGAAGGAAAGTCCTTGGGGTCTTCTACCTTTCTCTTCTTTTTTGGAGGAGTAGAATGTTGAGAGTCAGCAGTAGACTCATCATCACTAGATGGCATTTCTTCTGAGCAAAACAGGTTTTCCTCATTAAAGGCATT-CACCACTGCTCCCATTCATCAGTTCCATAGGTTGGACTTTAAAATACATAAACAATTAGAATCAGTAGTTTAACACATTATACACT GGGGGGGGGGGGGGGG8FFGGGGCFGGGGGGFGGGGGGGGGFFGGGGEFGCECGFFFGEFG<FGGCEGGFGGGGDFCCFFGCEFGCCGGGGGGGGFGGFEGGGGFGGG8EFEGGGGGFGGGF9FF8:FGGCCA8F;9,>BD8FFDDFFDGGFDFAF8FGGA;FF;EDACF@FD88D?ED?DFDDEBEDFDC7D+?C+2 =@F7A::?A?+;?CCFC9EFF5;BEC@A<@*1>>)92***4:6*2*+4*2+*1A268*)5*058*174>4/**>*3().9<)79)) +GAGAACTGAAACAGCAACTATCCG ba 2 @M02286:46:000000000-AEG11:1:2114:17623:15531 2:N:0:1 AATAGCAAAGCAAGCAAGAGTTCTATTACTAAACACAGCATGACTCAAAAAACTTAGCAATTCTGAAGGAAAGTCCTTGGGGTCTTCTACCTTTCTCTTCTTTTTTGGAGGAGTAGAATGTTGAGAGTCAGCAGTAGCCTCATCATCACTAGATGGCATTTCTTCTGAGCAAAACAGGATCTCCTCATTAAAGGCATTCCACCACTGCTCCCATTCATCAGTTCCATAGGTTGGAATATAAAATACACAACCAATTAGAATCAGTAGTTTACCACCTTCTACAC- GGC<FCFFGFCFGDCDEGGGGFGFFGGCAGFGGGGFFFG@<AEFDGGGEFFGGDGGG9FFFF@FAFDF?F?EFEEFEFGDFFF:<FGGC<5ECFFGFGGGGGD<F@@F<=C@CFFF9FFFEDFCGGCFFGAFC8EFGFG9E8FECGA,>FF?EDDDGGGF=@FGGGG,EGC,DD?FFF+2,@=A,,=FFG?DGGDDFGCGGGF7DFF?A?*:6:8BE695:@?F5B@>@5;B8@*1**=;*;+;AA>96<(5(/8=:3*9**8.774@C*1*)./)6=(;?).( |
b |
diff -r 13bcc2f459b0 -r af383638de66 misc/bug1/GAGAACTGAAACAGCAACTATCCG.ba.2.sscs.after.fa --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/misc/bug1/GAGAACTGAAACAGCAACTATCCG.ba.2.sscs.after.fa Mon Nov 23 18:44:23 2015 -0500 |
b |
@@ -0,0 +1,2 @@ +>GAGAACTGAAACAGCAACTATCCG.ba.2 3 +AATAGCAAAGCAAGCAAGAGTTCTATTACTAAACACAGCATGACTCAAAAAACTTAGCAATTCTGAAGGAAAGTCCTTGGGGTCTTCTACCTTTCTCTTCTTTTTTGGAGGAGTAGAATGTTGAGAGTCAGCAGTAGCCTCATCATCACTAGATGGCATTTCTTCTGAGCAAAACAGGTTTTCCTCATTAAAGGCATTCCACCACTGCTCCCATTCATCAGTTCCATAGNTTGGNNNNTAAANTACANNANNANNTANNNNNNGNNNTTNNNCNNNNTNTACNNN |
b |
diff -r 13bcc2f459b0 -r af383638de66 misc/bug1/GAGAACTGAAACAGCAACTATCCG.ba.2.sscs.before.fa --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/misc/bug1/GAGAACTGAAACAGCAACTATCCG.ba.2.sscs.before.fa Mon Nov 23 18:44:23 2015 -0500 |
b |
@@ -0,0 +1,2 @@ +>GAGAACTGAAACAGCAACTATCCG.ba.2 3 +AATAGCAAAGCAAGCAAGAGTTCTATTACTAAACACAGCATGACTCAAAAAACTTAGCAATTCTGAAGGAAAGTCCTTGGGGTCTTCTACCTTTCTCTTCTTTTTTGGAGGAGTAGAATGTTGAGAGTCAGCAGTAGCCTCATCATCACTAGATGGCATTTCTTCTGAGCAAAACAGGTTTTCCTCATTAAAGGCATTNCACCACTGCTCCCATTCATCAGTTCCATAGNTTGGNNNNTAAANTACANNANNANNTANNNNNNGNNNTTNNNCNNNNTNTACNNN |
b |
diff -r 13bcc2f459b0 -r af383638de66 misc/bug1/GCCTGAAATGACGGTTGTTACATT.ab.1.family.msa.tsv --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/misc/bug1/GCCTGAAATGACGGTTGTTACATT.ab.1.family.msa.tsv Mon Nov 23 18:44:23 2015 -0500 |
b |
@@ -0,0 +1,2 @@ +GCCTGAAATGACGGTTGTTACATT ab 1 @M02286:46:000000000-AEG11:1:2107:14361:14714 1:N:0:1 TGTGAGGACTGAGGGGCCTGAAATGAGCCTTGGGACTGTGAATCAATGCCTGTTTCATGCCCTGAGTCTTCCATGTTCTTCTCCCCACCATCTTCATTTTTATCAGCATTTTCCTGGCTGTCTTCATCATCATCATCACTGTTTCTTAGCCAATCTAAAACTCCAATTCCCATAGCCACATTAAACTTCATTTTTTGATACACTGACAAACTAAACTCTTTGTCCAATCTCTCTTTCCACTCCACAATTCTGCTCTGAATACTTTGAGCAAACTCAGCCACAGG GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFGGGEGGGGGGGGGGGGGGGGGGGGGGGGGGGGFGFFFFGGFGGGFGGGGFGGGGD=EFGGGGFGGGGGGGGGGDECGFGGGGDGGCFGFGAFC8EGFGGGE+DAFGCFGGGGG95<@FGACFGFCAFGF@AFGFFG+=AFFFF7=2*?:F=AEF63CBF4;3 +GCCTGAAATGACGGTTGTTACATT ab 1 @M02286:46:000000000-AEG11:1:2113:19415:18691 1:N:0:1 TGTGAGGACTGAGGGGCCTGAAATGAGCCTTGGGACTGTGAATCAATGCCTGTTTCATGCCCTGAGTCTTCCATGTTCTTCTCCCCACCATCTTCATTTTTATCAGCATTTTCCTGGCTGTCTTCATCATCATCATCACTGTTTCTTAGCCAATCTAAAACTCCAATTCCCATAGCCACATTAAACTTCATTTTTTGATACACTGACAACCTAAACTCTTTGTCCAATCTCTCTTTCCACTCCACAATTCTGCTCTGAATCCTTTGAGCAACTTCAGCCACAGG GFGFGDF8FGGGGFCGGGGGG?FGFGGGGGGGGGGGGDEFGGGGFCFGGGGGEDGGGGGGG8ECEGFFFGCFGGGGGGCFFFFFGGGG7,FFFEFGGGGGGFFGGGCEGGGGEFFGCGGDGGGGG9EFFGDFGGGCGGGGGGGFGGEFFGGGFGFGF9AFCBFGCCBAEFCFGGGGG9=FGCF;@,87;,=,EEFGECFCCFEG,=D,@,+3:7EE:CFFGCC::E7A>:7CDGGG:<++2,*;?9*/+191:++=9=*=+,188=*)).)()00=/+?><(2) |
b |
diff -r 13bcc2f459b0 -r af383638de66 misc/bug1/GCCTGAAATGACGGTTGTTACATT.ab.1.sscs.after.fa --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/misc/bug1/GCCTGAAATGACGGTTGTTACATT.ab.1.sscs.after.fa Mon Nov 23 18:44:23 2015 -0500 |
b |
@@ -0,0 +1,2 @@ +>GCCTGAAATGACGGTTGTTACATT.ab.1 10 +TGTGAGGACTGAGGGGCCTGAAATGAGCCTTGGGACTGTGAATCAATGCCTGTTTCATGCCCTGAGTCTTCCATGTTCTTCTCCCCACCATCTTCATTTTTATCAGCATTTTCCTGGCTGTCTTCATCATCATCATCACTGTTTCTTAGCCAATCTAAAACTCCAATTCCCATAGCCACATTAAACTTCATTTTTTGATACACTGACAAACTAAACTCTTTGTCCAATCTCTCTTTCCACTCCACAATTCTGCTCTGAATACTTTGAGCAANCTCAGCCACAGNN |
b |
diff -r 13bcc2f459b0 -r af383638de66 misc/bug1/GCCTGAAATGACGGTTGTTACATT.ab.1.sscs.before.fa --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/misc/bug1/GCCTGAAATGACGGTTGTTACATT.ab.1.sscs.before.fa Mon Nov 23 18:44:23 2015 -0500 |
b |
@@ -0,0 +1,2 @@ +>GCCTGAAATGACGGTTGTTACATT.ab.1 10 +TGTGAGGACTGAGGGGCCTGAAATGAGCCTTGGGACTGTGAATCAATGCCTGTTTCATGCCCTGAGTCTTCCATGTTCTTCTCCCCACCATCTTCATTTTTATCAGCATTTTCCTGGCTGTCTTCATCATCATCATCACTGTTTCTTAGCCAATCTAAAACTCCAATTCCCATAGCCACATTAAACTTCATTTTTTGATACACTGACAAACTAAACTCTTTGTCCAATCTCTCTTTCCACTCCACAATTCTGCTCTGAATACTTTGAGCAANCTCAGCCACAGN |
b |
diff -r 13bcc2f459b0 -r af383638de66 misc/bug1/TAATACGATGACATTTCGCACCGA.ab.2.family.msa.tsv --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/misc/bug1/TAATACGATGACATTTCGCACCGA.ab.2.family.msa.tsv Mon Nov 23 18:44:23 2015 -0500 |
b |
@@ -0,0 +1,5 @@ +TAATACGATGACATTTCGCACCGA ab 2 @M02286:46:000000000-AEG11:1:1107:16019:3802 2:N:0:1 AAGGAGCGGGCGCTAGGGCGCTGGCAAGTGTAGCGGTCACGCTGCGCGTAACCACCACACCCGCCGCGCTTAATGCGCCGCTACAGGGCGCGTCCATTCGCCATTCAGGCTGCGCAACTGTTGGGAAGGGCGATCGGTGCGGGCCTCTTCGCTATTACGCCAGCTGGCGACAGGGGGCTGTGCTGCAGGGCGAGTCAGTTGGGTAACGCCCGGGTTTTCCCAGTCACGACGTGGTAAAACG-ACGGCCAGTGACTTTGACTACGAGTCACTATAGGACGAATTGG -FFGGGG:@EFFGGGGCDCFGGC77E<<CEFGFGGD?C7EFGGGGGGEGFGDGFFECDFC?CGDGEDGGGGG9EFCGGGECFCE<>D+8@FEC7CFGF@@CG:FGF9@AFFGGGG*<CC,7@<AF<:C:CCF7CECEB8C58EEC*;:?CCCGEG7FFFEGECFGGG5/:***5/*:*)2C7*0+0/+*2)**)0*2:*0*9)8C@F507)7>)81537*9<44*)*-0-5,()-6).443 9;(-((-(),*))-).)))..(,(-4).44).6)*)3((((.( +TAATACGATGACATTTCGCACCGA ab 2 @M02286:46:000000000-AEG11:1:1112:7443:21645 2:N:0:1 AAGGAGCGGGCGCTAGGGCGCTGGCAAGTGTAGCGGTCACGCTGCGCGTAACCACCACACCCGCCGCGCTTAATGCGCCGCTACAGGGCGCGTCCATTCGCCATTCAGGCTGCGCAACTGTTGGGAAGGGCGATCGGTGCGGGCCTCTTCGCTATTACGCCAGCTGGCGAAAGGGGGGTGTGCTGCAAGGCGGTTAAGTTGGGTAACGCCAGGGTTTTCCCAGTCACGCCGTTGTAAAACG-ACGGCCAGTGGAGTGTCAGTCGACTCACTCTAGGGCGTATTTG GGGGGGGGDGGGGGGGGGGGGGGGGGGGGGGGCFGGGGGGGGGGGGGGGGGGGGGGGGGGGEGGGGGEGGGGGGGGGGGGGEGGGGGGGGGGGGGDGGF@EGCCFFFFFGGFGGGCCGFFGGFFG7FFGGF7F7CFF*1?8EGGGGFCFCECC8?<C@8CEEE57+CCEE*:C=EDE*/855?FFF*<CF5C)1).*::7@4>766?37)0<>D3>D?<7*-0<?)4@0>B((.4*<462( ,-311(50).).(..-)..).4>>?2,(5-))))((((,((). 
+TAATACGATGACATTTCGCACCGA ab 2 @M02286:46:000000000-AEG11:1:2108:2493:15900 2:N:0:1 AAGGAGCGGGCGCTAGGGCGCTGGCAAGTGTAGCGGTCACGCTGAGCGTAACCACCACACCAGCCGCGCTTAATGCGCCGCTACAGGGCGCGTCCATTCGCCATTCAGGCTGCGCAACTGTTGGGAAGGGCGATCGGTGCGGGCATCTTAGCTATTACGCCAGCTGGCGCAAGGAGGAAGGGCTAGCAGGCGATTCAGTTGGGTAACGACAGGGATTTCACAGTCAAGGCGTTGTACAACG-ACGCCTAGGGACTTGTAATACGAGTCACTATAGGGCGAATTGG G@FFA,:FE@FFFDEEFF?EEDECCFDDFEGFFFEEGGGCC,,6+CCF+B,8,F+:,BB:B+B,?:+>=>:7?@,<C:7C7+@>FFGG::7:+@+5@EA,3<CEGAFFGG,6F7:*158F,@CCE,2=*=BF*43<7*4=B8CE**0+3+3<++++++22/8?2+<CE/2:5=5**2:*:*:C***)+0*))**)0*:***99:4<7/)))0)>*.(90/**0/)**1)(,-53(0)0)), 23(()(-8(74((/.<6*6))4)((.:-6*)-)*,(0.,(-+0 +TAATACGATGACATTTCGCACCGA ab 2 @M02286:46:000000000-AEG11:1:2111:24850:13036 2:N:0:1 AAGGAGCGGGCGCTAGGGCGCTGGCAACTGTAGCGGTCACGCTGCGCGTAACCACCACACCCGCCGCGCTTAACGCGCACCTACCTTGCCCGTCCATTAGCCATTCAGCCTGCGCAACTGTCGGGACGGGCGCTCGGTTCGTGCCCCTTCTCTATCACGTCACCTGCTGCACGGAGCATCTCCCTCACGCCCACTACGTTGGCTAGCGCCATGTTTTTACCTTCCGCGCCTGTGTAACAAGCACGCACTGGAGTTTGAAGTCCCCTCCATCGTAGCTCAATTGT- GG<<,,C@++6+@:@+;,E::F+FC,:,:6CF<E+6CF:,,,6:FCE47>+BB,48=DF?:F7CC+@:++33,,+8>C+6:,,D,,,,:,3<@<FG9B,,@,,,33,,,75>D*>7>F<,@,1::4*1**11*:<****2**//;*22;E+0+++++2**;;*<*0)+*+00***2*1++0*0***)1*1)1C)*9)/**1*)+**)*,1)**)2047*9)*.*(*())()(6)6*74-*),)(()()(.(.(4)-))4*.4)*.3,,()/((.77)))))-)) +TAATACGATGACATTTCGCACCGA ab 2 @M02286:46:000000000-AEG11:1:2118:11759:4034 2:N:0:1 AAGGAGCGGGCGCTAGGGCGCTGGCAAGTGTAGCGGTCACGCTGCGCGTAACCACCACACCCGCCGCGCTTAATGCGCCGCTACAGGGCGCGTCCATTCGCCATTCAGGCTGCGCAACTGTTGGGAAGGGCGATCGGTGCGGGCCTCTTCGCTATTACGCCAGCTGGCGAAAGGGGGGTGTGCTGCAAGGCGCTTAAGTTGGGTAACGCCAGGGTTTTCCCAGTCACGCCGTTGTAACACG-ACGGCCCGGGAATTGCAATACGACTCACTATCGGGCGAGTTGG GGGGGGGGGGGGGGEGGGGFFGEFGGGGGGGFAF@CFGEGGGGGGGGGEGDFGGCGGGGGGGGDF+BFEECFGFGGGGGGGGDEGG3>B:7>FFEECFGCCFGE;F;F:C>FFFG*CF:CCFGFEGFGG7@C5:;ECG@EE>58ED6++<C5838C7ECEC5*:6A@EGGE=CFG*:)298*:>@FFFF:<>*)1)/09@498(999/..)->().4.7<?0/442@3*0()-1946)<:: ;9>B99(4)(*-2),)).))4(-,.4)9-)*)0((4(((-()( |
b |
diff -r 13bcc2f459b0 -r af383638de66 misc/bug1/TAATACGATGACATTTCGCACCGA.ab.2.sscs.after.fa --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/misc/bug1/TAATACGATGACATTTCGCACCGA.ab.2.sscs.after.fa Mon Nov 23 18:44:23 2015 -0500 |
b |
@@ -0,0 +1,2 @@ +>TAATACGATGACATTTCGCACCGA.ab.2 5 +AAGGAGCGGGCGCTAGGGCGCTGGCAAGTGTAGCGGTCACGCTGCGCGTAACCACCACACCCGCCGCGCTTAATGCGCCGCTACAGGGCGCGTCCATTCGCCATTCAGGCTGCGCAACTGTTGGGAAGGGCGATCGGTGCGGGCCTCTTCGCTATTACGCCAGCTGGCGAAAGGGGGANGTGCTGCAAGGCGANNNNGTTGNGTAACNNNNGGGNTTTCCCANTCNCGNCGNNNNNANACGACGGNNNNNNNNNNNTNNNNNNACTNANNNNNNNNNNNNNNNN |
b |
diff -r 13bcc2f459b0 -r af383638de66 misc/bug1/TAATACGATGACATTTCGCACCGA.ab.2.sscs.before.fa --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/misc/bug1/TAATACGATGACATTTCGCACCGA.ab.2.sscs.before.fa Mon Nov 23 18:44:23 2015 -0500 |
b |
@@ -0,0 +1,2 @@ +>TAATACGATGACATTTCGCACCGA.ab.2 5 +AAGGAGCGGGCGCTAGGGCGCTGGCAAGTGTAGCGGTCACGCTGCGCGTAACCACCACACCCGCCGCGCTTAATGCGCCGCTACAGGGCGCGTCCATTCGCCATTCAGGCTGCGCAACTGTTGGGAAGGGCGATCGGTGCGGGCCTCTTCGCTATTACGCCAGCTGGCGAAAGGGGGANGTGCTGCAAGGCGANNNNGTTGNGTAACNNNNGGGNTTTCCCANTCNCGNCGNNNNNANACGNACGGNNNNNNNNNNNTNNNNNNACTNANNNNNNNNNNNNNNNN |
b |
diff -r 13bcc2f459b0 -r af383638de66 misc/bug1/TTTTAAGCGAAATTTACCCGTTAA.ab.2.family.msa.tsv --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/misc/bug1/TTTTAAGCGAAATTTACCCGTTAA.ab.2.family.msa.tsv Mon Nov 23 18:44:23 2015 -0500 |
b |
@@ -0,0 +1,9 @@ +TTTTAAGCGAAATTTACCCGTTAA ab 2 @M02286:46:000000000-AEG11:1:1111:10934:19340 2:N:0:1 TTGAGATCCAGTTCGATGTAACCCACTCGTGCACCCAACTGATCTTCAGCATCTTTTACTTTCACCAGCGTTTCTGGGTGAGCAAAAACAGGAAGGCAAAATGCCGCAAAAAAGGGAATAAGGGCGACACGGAAATGTTGAATACTCATACTCTTCCTTTTTCAATATTATTGAAGCATTTATCAGGGTTATTGTCTCATGAGCGGATACATATTTGAATGTATTTAGAAAAATAAACAAATAGGGGTTCCGCGCACATTTCCCCGAAAAGTGCACCCTGATGC- GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFGGGGGGFGGGGCFFGGFFGGGGDGGGGGGFFDCFGGGGEGGGGGGGGGGGGGGGFGGGGFFF;BFGGGGGDGGGGGGGGGGC9;AE7CEFG7ACFGGC@@9DFFFGAGFC>9B+59CGFA?DFGF8AE:++=5+9A5CC6:)@4)(;)7:@E<AEF@8@9<9@;36(2193:AA26<?BF0;((.):))((-4:96- +TTTTAAGCGAAATTTACCCGTTAA ab 2 @M02286:46:000000000-AEG11:1:1112:24472:13834 2:N:0:1 TTAAGATCCAGTTCGCTGTAACCCACTCGTGCACCCAACTGATCTTCAGCATCTTTTACTTGCACCAGCGTCTCTGGCTGAGCAAAAACAGGAAGGCAAAATGCCGCAAACAAGGGAATAAGGGCGACACGGACAGGTTGAATACTCATACTCTTCCTTTTTCACTATTATTGAAGCATCTATCAGGGTTATTGTCTCATTAGAGGATACATATTTCAGTGTATTTAGAATAATCAACAAATAGTCGTTCCGCACACTTTTCCCCGTACAGTGCCATCTGATGC- GC,CEFG<,,,6,,,+6,BFG<6CFCA<F@FC@<,BCD:CEE6,,6,,;CFF8<C,6,:69,<FFG<8@@8,,:C,A,,,:BB,,46BFG,,BF,?=F<AF<<,=+448+,,:A<=A=:BGA5,8FC::BFEG+C++CFDE,E=B9339,3,3@3>,EEF93A,+@<<D;,EC:@99@A,59;A+5*55;*4++2++2+:+2+,4*94A911+=*++;*289>=:*@=)+**1=*1*(**/4;;)).)/(5)((.((()))1)2(2(,-(.)).)-)()-.)6) +TTTTAAGCGAAATTTACCCGTTAA ab 2 @M02286:46:000000000-AEG11:1:1113:12129:21325 2:N:0:1 TTGAGATCCAGTTCGATGTAACCCACTCGTGCACCCAACTGATCTTCAGCATCTTTTACTTTCACCAGCGTTTCTGGGTGAGCAAAAACAGGAAGGCAAAATGCCGCAAAAAAGGGAATAAGGGCGACACGGAAATGTTGAATACTCATACTCTTCCTTTTTCAATATTATTGAAGCATTTATCAGGGTTATTGTCTCATGAGCGGATACATATTTGAATGTATTTAGAAAAATAAACAAATAGGGGTTCCGCGCACATTTCCCCGAAAAGTGCCACATGATGC- GGGGGGGGGGGGGGGGGGGGGGGGEGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFDFGGGGGGGGGGGEGGGGGGGDGGGGGGGGGGGGGGGGGGGGGGFGGGGFGGGGGGGGGGGGGFDGGGGGGGGGGGGG9EGGG9;DCFGGGGGG7C;EEGGGFFCFGGGFFGFGCFGGGGGFGGGFCA7>A4EFGFFAF6FFGFEFFFFFF?FFFFE3=;F3AABFFFFFFFBF6@8<DFFF>;<9>B>)42339927:A)67(5>AAA) 
+TTTTAAGCGAAATTTACCCGTTAA ab 2 @M02286:46:000000000-AEG11:1:1115:11255:23962 2:N:0:1 TTGAGATCCAGTTCGATGTAACCCACTCGTGCACCCAACTGATCTTCAGCATCTTTTACTTTCACCAGCGTTTCTGGGTGAGCAAAAACAGGAAGGCAAAATGCCGCAAAAAAGGGAATAAGGGCGACACGGAAATGTTGAATACTCATACTCTTCCTTTTTCAATATTATTGAAGCATTTATCAGGGTTATTGTCTCATGAGCGGATACATATTTGAATGTATTTAGAAAAATAAACAAATAGGGGTTCCGCGCACATTTCCTCGACAAGTGCCACCTGATGC- GGGGGGGGGFGGGGGGGGGGGGGGEEGGGGGGGGGGGGGGGFGGGFGGGGGGGGGFFGGGGGGGGGGGGGFEGGGGGGGGGFGGGGGGGGGFGGGGGGGGCFFCEGGGGGGGGGGGFFFGFA:FEGDFDEGGFGEGGDGGGGFFGGGGGGGCGFGGGGCFFGFGFFGEGGGAFCFGFGGGGGGGGGGGGF59FCFCGGFEE928@?79DFFFAFFCFA=CF6=8>8EF:7:=<FE@3=@22>>E8;)3:7ED0>>E;><=>A4).(((,((.4)69<((((69) +TTTTAAGCGAAATTTACCCGTTAA ab 2 @M02286:46:000000000-AEG11:1:2104:22219:19734 2:N:0:1 TTGAGATCCAGTTCGATGTAACCCACTCGTGCACCCAACTGATCTTCAGCATCTTTTACTTTCACCAGCGTTTCTGGGTGAGCCAAAACAGGAAGGCAAAATGCCGCAAAAAAGGGAATAAGGGCGACACGGAAATGTTGAATACTCATACTCTTCCTTTTTCAATATTATTGAAGCATTTATCAGGGTTATTGTCTCATGAGCGGATACATATTTGAATGTATTGAGTAAAATAAACAAATAGGGGTTCCGCACCCATTTCCCCGAAAAGTGCCAACTGATGC- GGGGGGGGGGGGGGGGGFGGGGGDFEGFFGFGECFGGFGGGGGCGGGGGGGGFFFGGGFGFGGGGGGGGGEGGFDAEFEECFF,B<EFGGFG?EG>,ABFGGGGFC:=F:FFG+=FAFFF9<<FC@:FEEGGG+CCAFF9EFGAADFFGGGGGEF;FFGGGGGGEFC9BGFGEC,@EGFGGGGGGDF>FG58EAGFGFCF;>4:*7*3*8+1++=@5C785=;985+0++4+*;7);C./))1.;6DFF;130),(,,.@>2A*47170>(?))4*)3(.8:A) +TTTTAAGCGAAATTTACCCGTTAA ab 2 @M02286:46:000000000-AEG11:1:2108:15124:17674 2:N:0:1 TTGAGATCCAGTTCGATGTAACCCACTCGTGCACCCAACTGATCTTCAGCATCTTTTACTTTCACCAGCGTTTCTGGGTGAGCAAAAACAGGAAGGCAAAATGCCGCAAAAAAGGGAATAAGGGCGACACGGAAATGTTGAATACTCATACTCTTCCTTTTTCAATATTATTGACGCATTTCTCAGGGTTATTGTCTCATGAGCGGATACATATTTGAATGTATTTAGAAAAATACACAAATAGGTGTTCCGCGCACATTTCCCCGAAAAGTGCAACCTGATTG- GGGGGGAGGFGFGFGFGFGGGGAFFFGFGFGGFGG>CFEE<FGGGGGGDCEFGGGCFGFGGAE<@EGFG7@:FGCFGGGG,BEFGGGGGGGGGG8@AEGFEFG9FFEGG@FC@FCB<<@B,C9<FEEGE@C@EC>DFG9CDFGGGADF9ADD,@DFFA=FDF6DFE;EFFF;;@,3+;CCG,+@9;F,;D?CGFB=EFGG+=4<295ED;BFBDC+*4=@81+=*+++219;;A;+/9<5=EF<@+)/59+624B>>>F>2A<*(46;(365-6*))0).3)0+ +TTTTAAGCGAAATTTACCCGTTAA ab 2 
@M02286:46:000000000-AEG11:1:2113:3515:13547 2:N:0:1 TTGAGATCCAGTTCGATGTAACCCACTCGTGCACCCAACTGATCTTCAGCATCTTTTACTTTCACCAGCGTTTCTGGGTGAGCAAAAACAGGAAGGCAAAATGCCGCAAAAAAGGGAATAAGGGCGACACGGAAATGTTGAATACTCATACTCTTCCTTTTTCAATATTATTGAAGCATTTATCAGGGTTATTGTCTCATTAGCGGATACATATTTGAATGTATGTAGAAAAAGAAACAAATAGGGTTTCCGCGCACATTTCCCCGACAAGTGCACCCTGATTT- GGGFEGGGGGGCFGBFFGGGGGGCCFF8<FGGGGFGGGGGGGGGGGGGGGGGGFA9EEEFGGGGGGGGGGBCEGGGFF@FGGGGDGGFDFGGGGGGGGGG8AEFFEGGGGFG@FE:FF?,AFEDCGGGGDE@BEGCFEGCDGDCFDGFGGGFFC;,BF=FD?DFGGGFGG7;ED77ACFCFCA+@ACFGGGCCCF9CF;75=+A>4C4?FFGEGGFFFFA8=@F*==C8*//()2(.3=<<<7345(/8>E@0>@FB69<2<224(-(-6?94)).:((,4<2) +TTTTAAGCGAAATTTACCCGTTAA ab 2 @M02286:46:000000000-AEG11:1:2116:16001:8442 2:N:0:1 TTGAGATCCAGTTCGATGTAACCCACTCGTGCACCCAACTGATCTTCAGCATCTTTTACTTTCACCAGCGTTTCTGGGTGAGCAAAAACAGGAAGGCAAAATGCCGCAAAAAAGGGCATAAGGGCGACACGGAAATGTTGAATACTCATACTCTTCCTTTTTCAATATTATTGAAGCCTTTATCAGGGTTATTGTCTCATGAGCGGATACATATTTTACTGTATTTAGAAAAATACACAAATAGGGTTTCCGCGCACATTTCCCCGTACAGTGCCACCTGATGC- GGFCE@FGGFGEFGGFGGGGGGGGGGGGG<<FGGGGG?FGGGGGGGFCFG<FGGFGGFGGGGGGGGGGGGGD@DEFFF<?,CFFGG<ECFGG8=FCCFFGFFFGGGGCCCC7+4:=+9==59E@FGEGGGGCEEFDE=;EFB==ECG,D,EFGGFFGGGFGGGG:=FGFDEGA+7,:,=+E7CF<,@8D75>=3@CDCFF,=99BECD>F=CFFC+*0,=BFFFGF@F)=9(35=)*;*6(718)6)//;C58)171(.6.)-*.((-(,4=)-).5(4)(66) +TTTTAAGCGAAATTTACCCGTTAA ab 2 @M02286:46:000000000-AEG11:1:2118:20541:18054 2:N:0:1 TTGAGATCCAGTTCGATGTAACCCACTCGTGCA-CCAACTGATCTTCAGCATCTTTTACTTTCACCAACGTTTCTGGGTTAGCAAAAACCGGAAGGCAAAATGCCGCCAAAAAGGGAATAAGGGCGACACGGCAATGTTGAATACTCCTACCCTTCCTTTTTCAATATTATTTAAGCATTTATCAGGGTTATTGTCTCATTAGCGGATACATATTTTAATTTATTTTGAAAAATAAACAAATAGGGGTTCCCCGCACATTTCCCCGAACAGTGCCACCTGATGCG GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG GGGG7@,6CC<FEGFFGGFCGGGC<FGGA6,EF,869BE,,,CEA,,,:,CEG?@,7=+>+CD=FFEEFGG:B+=FGFG:FG?FCF,,AFEBF==E:C++@,,3C9@C;;:,=>=,2,,,6>DEEG<@,,32=@ECGG;AE:CCFGC7@F+,*@AD5:AAFGFFAF,:+5):?>?AAD4@7=+*2=/87:/*+40+;>A14:.67E;<3))/679())*,2,314.4:6<01)--7,.*6952(.51-)6, |
b |
diff -r 13bcc2f459b0 -r af383638de66 misc/bug1/TTTTAAGCGAAATTTACCCGTTAA.ab.2.sscs.after.fa --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/misc/bug1/TTTTAAGCGAAATTTACCCGTTAA.ab.2.sscs.after.fa Mon Nov 23 18:44:23 2015 -0500 |
b |
@@ -0,0 +1,2 @@ +>TTTTAAGCGAAATTTACCCGTTAA.ab.2 9 +TTGAGATCCAGTTCGATGTAACCCACTCGTGCACCCAACTGATCTTCAGCATCTTTTACTTTCACCAGCGTTTCTGGGTGAGCAAAAACAGGAAGGCAAAATGCCGCAAAAAAGGGAATAAGGGCGACACGGAAATGTTGAATACTCATACTCTTCCTTTTTCAATATTATTGAAGCATTTATCAGGGTTATTGTCTCATGANCGGATACATATTTGAATGTATTTAGAAAAATAAACAAATAGGGGTTCCGCGCACATTTCCCCGAAAAGTGCNNCNNGATGNN |
b |
diff -r 13bcc2f459b0 -r af383638de66 misc/bug1/TTTTAAGCGAAATTTACCCGTTAA.ab.2.sscs.before.fa --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/misc/bug1/TTTTAAGCGAAATTTACCCGTTAA.ab.2.sscs.before.fa Mon Nov 23 18:44:23 2015 -0500 |
b |
@@ -0,0 +1,2 @@ +>TTTTAAGCGAAATTTACCCGTTAA.ab.2 9 +TTGAGATCCAGTTCGATGTAACCCACTCGTGCACCCAACTGATCTTCAGCATCTTTTACTTTCACCAGCGTTTCTGGGTGAGCAAAAACAGGAAGGCAAAATGCCGCAAAAAAGGGAATAAGGGCGACACGGAAATGTTGAATACTCATACTCTTCCTTTTTCAATATTATTGAAGCATTTATCAGGGTTATTGTCTCATGANCGGATACATATTTGAATGTATTTAGAAAAATAAACAAATAGGGGTTCCGCGCACATTTCCCCGAAAAGTGCNNCNNGATGN |
b |
diff -r 13bcc2f459b0 -r af383638de66 misc/bug1/cmp.sh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/misc/bug1/cmp.sh Mon Nov 23 18:44:23 2015 -0500 |
[ |
@@ -0,0 +1,43 @@ +#!/usr/bin/env bash +if [ x$BASH = x ] || [ ! $BASH_VERSINFO ] || [ $BASH_VERSINFO -lt 4 ]; then + echo "Error: Must use bash version 4+." >&2 + exit 1 +fi +set -ue + +TmpSscsBefore="tmp.sscs.before.fa" +TmpSscsAfter="tmp.sscs.after.fa" +TmpMsa="tmp.family.msa.tsv" + +Usage="Usage: \$ $(basename $0) diff.family.msa.tsv diff.sscs.before.fa diff.sscs.after.fa > cmp.txt" + +function main { + if [[ $# -lt 3 ]] || [[ $1 == '-h' ]]; then + fail "$Usage" + else + msa_input="$1" + sscs_before="$2" + sscs_after="$3" + fi + + lines=$(cat $sscs_before | wc -l) + choice=$(python -c "import random; print 2*random.randint(1, $lines/2)") + + echo $lines $choice >&2 + + head -n $choice $sscs_before | tail -n 2 > $TmpSscsBefore + head -n $choice $sscs_after | tail -n 2 > $TmpSscsAfter + cat $msa_input | ../msa_sscs_matcher.py $TmpSscsBefore > $TmpMsa + + tail -n +2 $TmpSscsBefore + tail -n +2 $TmpSscsAfter + echo + cut -f 5,6 $TmpMsa +} + +function fail { + echo "$@" >&2 + exit 1 +} + +main "$@" |
b |
diff -r 13bcc2f459b0 -r af383638de66 misc/bug1/cmp.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/misc/bug1/cmp.txt Mon Nov 23 18:44:23 2015 -0500 |
b |
@@ -0,0 +1,7 @@ +AAAAGTGCTCATCATTGGAAAACGTTCTTCGGGGCGAAAACTCTCAAGGATCTTACCGCTGTTGAGATCCAGTTCGATGTAACCCACTCGTGCACCCAACTGATCTTCAGCATCTTTTACTTTCACCAGCGTTTCTGGGTGAGCAAAAACAGGAAGGCAAAATGCCGCAAAAAAGGGAATAAGGGCGACACGGAAATGTTGAATACTCATACTCTTCCTTTTTCAATATTATTGAAGCATTTATCAGGGTTATTGTCTCATGNGCNNNNACATATTTGAATGTA +AAAAGTGCTCATCATTGGAAAACGTTCTTCGGGGCGAAAACTCTCAAGGATCTTACCGCTGTTGAGATCCAGTTCGATGTAACCCACTCGTGCACCCAACTGATCTTCAGCATCTTTTACTTTCACCAGCGTTTCTGGGTGAGCAAAAACAGGAAGGCAAAATGCCGCAAAAAAGGGAATAAGGGCGACACGGAAATGTTGAATACTCATACTCTTCCTTTTTCAATATTATTGAAGCATTTATCAGGGTTATTGTCTCATGNGCNNNNACATATTTGAATGTAN + +AAAAGTGCTCATCATTGGAAAACGTTCTTCGGGGCGAAAACTCTCAAGGATCTTACCGCTGTTGAGATCCAGTTCGATGTAACCCACTCGTGCACCCAACTGATCTTCAGCATCTTTTACTTTCACCAGCGTTTCTGGGTGAGCAAAAACAGGAAGGCAAAATGCCGCAAAAAAGGGAATAAGGGCGACACGGAAATGTTGAATACTCATACTCTTCCTTTTTCAATATTATTGAAGCATTTATCAGGGTTATTGTCTCATGAGCGGATACATATTTGAATGTT- GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFGGGGGGGGGGGGGGFGGGGGGDGDDFGGGG?GGGGGGFGGGGGGGGGGGGGGCCFGGGEEGGG?FGGGGGGGECEGGCC7CDCCFCGGGFGGD9CFGGGFGGGGGGGGGG7FFFGGGGGCFFFFFFGFFFG0,C?GFGDFAF<?-962.)(4>?AFBF>BDF<A>7* +AAAAGTGCTCATCATTGGAAAACGTTCTTCGGGGCGAAAACTCTCAAGGATCTTACCGCTGTTGAGATCCAGTTCGATGTAACCCACTCGTGCACCCAACTGATCTTCAGCATCTTTTACTTTCACCAGCGTTTCTGGGTGAGCAAAAACAGGAAGGCAAAATGCCGCAAAAAAGGGAATAAGGGCGACACGGAAATGTTGAATACTCATACTCTTCCTTTTTCAATATTATTGAAGCATTTATTAGGGTTATTGTCTCATGAGCGGTTACATATTTGAATGTT- GGGGGGGGGGGGGGGGGFFGGFGGGGGGGGCED@FGGGGGGGGGGGGG@FF@FGGGGGGGGGGGGGGGDFEGGFGGDEFGGGGGGFCEFFGGGGGGGGGGGGFCFCCE?FGGCGGAFGGGGFFGGGGGGGGFCFFFGGGGEFGGGGFG@FGGGGGGDCFFFGGGFGECC*3:3>EC7D:EFED?8CCEGDGGGGGDGF2:CFG9E7FC3:=CGFCCFC9AFGGGGCFGGFGFF9=>?E6C7@7/*9DF>FGG<?FF96*-68)25)4/4>)-6=<BFF?<BAF) 
+AAAAGTGCTCATCATTGGAAAACGTTCTTCGGGGCGAAAACTCTCAAGGATCTTACCGCTGTTGAGATCCAGTTCGATGTAACCCACTCGTGCACCCAACTGATCTTCAGCATC-TTTACTTTCACCAGCGTTTCTGGGTGAGCAAAAACAGGAAGGCAAAATGCCGCAAAAAAGGGAATAAGGGCGACACGGCAATGTTGAATACTCATACTCTTCCTTTTTCAATATTATTGAAGCATTTATCAGGGTTATTGTCTACTGAGCGGCTCCATATTTGAATGTAT GGGGGFFGDECFGGGGGGGFGGGGGGGGGGGGG>FGGGGGGGGFFGGGGCFFFGGGGGGGGGGGGFGGGGGGGFGGGGGGGGGECFCGGGCFG7FGGGGGGGGGGGFFFGGGGG FGGGAEFFFFFGGCBBFGGFGGGDDCGF8EG8FFFGGGFD<+<3DF,FFGGGGGGGDGEECCFFGGGFFC7:@CECC8*8CCFGGGGGGGGCFFGGGFGFGCFGFFFGFGFEGGGGGFGGGCGGGGFFF+*9<<FFGFF=2)10*1)9D()/(0).,*.849**6@AG<4 +AAAAGTGCTCATCATTGGAAAACGTTCTTCGGGGCGAAAACTCTCAAGGATCTTACCGCTGTTGAGATCCAGTTCGATGTAACCCACTCGTGCACCCAACTGATCTTCAGCATCTTTTACTTTCACCAGCGTTTCTGGGTGAGCAAAAACAGGAAGGCAAAATGCCGCAAAAAAGGGAATAAGGGCGACACGGAAATGTTGAATACTCATACTCTTCCTTTTTCAATATTATTGAAGCATTTATCAGGGTTATTGTCTCATGATCGGCTACATATTTGAATGTT- GGGGGGGGGGGGGGGGGGGFGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFGGG7@F8CEEFGGGGGEFFDGGGG5CGGGEFGGGG9@979EFCFECFGGFGFGFGGGFGGC7CFFGGFGFFFFGGGFFGGFFFFFGFF9<*:B9>>2*:7))1)4)=?AB?*65*.6) |
b |
diff -r 13bcc2f459b0 -r af383638de66 misc/bug1/diff.family.msa.tsv --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/misc/bug1/diff.family.msa.tsv Mon Nov 23 18:44:23 2015 -0500 |
b |
b'@@ -0,0 +1,26086 @@\n+AAAACAGCAAAACATGCTGTAGAT\tba\t2\t@M02286:46:000000000-AEG11:1:2101:18863:4476 2:N:0:1\tAAAATGCTTTATTTGTGAAATTTGTGATGCTATTGCTTTATTTGTAACCATTATAAGCTGCAATAAACAAGTTAACAACAACAATTGCATTCATTTTATGTTTCAGGTTCAGGGGGAGGTGTGGGAGGTTTTTTAAAGCAAGTAAAACCTCTACAAATGTGGTATGGCTGATTATGATCATGAACAGACTGTGAGGACTGAGGGGCCTGAAATGAGCCTTGGGACTGTGAATCAATGCCTGTTGCATCCC-CTGAGTCTTCCATGTTCTTCTCTCCACTTTCTTT\tGGGGGGGGGGGGCGGGGFDGGGFCFFGGGGGGGFCCFFFGGGGGFGFGGFGGGGGGGGGFGGGAFGFGGGGGGFGFGGGGGGGFGGFGFEGGFGGFGCGAFGGF<FGDFGGFGGDC>:CGGG>FGGGCCEGGGGG8EGDGGFGF;EDFGFCGGGGG8DGFGFE8F@DFDCGGGCEFCFGFGGDDFFFDG??DD7EAGF5+=+?D+*0CF*;01<@>BAFE4:>=FFFFC6;AC9*1C;@FFFE0*5+/08 2:2:349)/7<F??75)76))2))0/:)))))1)\n+AAAACAGCAAAACATGCTGTAGAT\tba\t2\t@M02286:46:000000000-AEG11:1:2103:10458:6445 2:N:0:1\tAAAATGCTTTATTTGTGAAATTTGTGATGCTATTGCTTTATTTGTAACCATTATAAGCTGCAATAAACAAGTTAACAACAACAATTGCATTCATTTTATGTTTCAGGTTCAGGGGGAGGTGTGGGAGGTTTTTTAAAGCAAGTACAACCTCTACAAATGTGGTATGGCTGATTATGATCATGAACAGACTGTGAGGACTGAGGGGCCTGAATTGAGTATTGGGACTGTGAATCAATGCCTGATTCATGCA-CTGCGTCTTCCATGTTCTTCTCCACAACATCCTT\tGGFGEGCCFEGG9FAGCF,6CCFFFGGGGAGGGGGGDFGGGGGGGGFGGFGGFGGGGGGGGFEFCE,CFFEGAFFFGCFGGGGEF9DDFFFGGG9AEAGEFE9EEGGF,A?AEEGG>BFFG,@CFF=F4AE7=C8A=,EF@GGG9;DCFFDDFGGG,=9EGE83;@,@EG?AE,=FFEF+8=D9E@D?@?D8,+?,3=FFFGFD+:C*@AC+?>FG*5596*5*-1;?/=76>69+>6.40(007?+=@* *83*(0.1=9>)9419)6:5?.61/6)(,6)(/)\n+AAAACAGCAAAACATGCTGTAGAT\tba\t2\t@M02286:46:000000000-AEG11:1:2107:21148:11093 2:N:0:1\tAAAATGCTTTATTTGTGAAATTTGTGATGCTATTGCTTTATTTGTAACCATTATAAGCTGCAATAAACAAGTTAACAACAACAATTGCATTCATTTTATGTTTCAGGTTCAGGGGGAGGTGTGGGAGGTTTTTTAAAGCAAGTAAAACCTCTACAAATGTGGTATGGCTGATTATGATCATGAACAGACTGTGAGGACTGAGGGGCCTGAAATGAGCCTTGGGACTGTGAATCAATGCCTGTTTCATGCC-CTGAGTCTTCCATGTTCTTCTCCCCTACATCTTT\tGGGGGGGGGGGGGGGGGGGGGCFEFGGGGGGGGGGGGEGGGGGFFGGGGFGFGGGGGGGGGFGGGACCFGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGAFGGGGGGGGFG,F7FGGGGGGGGGGGGCGGGGGFFGFFGGGGGEGGGCEF9FGGFGGGFFG8ADDGGGCGCGGGGFGGGFFEFGFGFGFGGGGCGFGGGFC;@DEE>7;;BCC6@CGF4A>5?DF@FF?EFFEFFFFFE*8@EC?>=: 
;>9(*9;E337;)+.<3C));;/31*,/29<4=+\n+AAAACAGCAAAACATGCTGTAGAT\tba\t2\t@M02286:46:000000000-AEG11:1:2110:11463:11919 2:N:0:1\tAAAATGCTTTATTTGTGAAATTTGTGATGCTATTGCTTTATTTGTAACCATTATAAGCTGCAATAAACAAGTTAACAACAACAATTGCATTCATTTTATGTTTCAGGTTCAGGGGGAGGTGTGGGAGGTTTTTTAAAGCAAGTAAAACATCTACAAATGTGGTATGGCTGATTATGATCATGAACAGACTGTGAGGACTGAGGGGCCTGAAATGAGCCTTGGGACTGTGAATCAATGCCTGTTTCATGAC-CTGAGTATTCACAGTTCTTCGTCCCACCATCTTC\tFGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFGFFGGGGGGGGGGGGGGGGGGGGGGGGFEGGGGGGGGGGGGGGGGGGG8FCGGGGGGGGFEFDGGFDGGGGGFGGFGFGDGGGGEGDG>@FCGCFD7BCFFFCFC@F=C@CFG,EE9DDFEGGGGFFGFGGG?FDGGGGGGGCDFGGGFGGGD685DFCDF?9F?;,59)43589D9B7+=@9AF3*77B*);@974*+:8*(0=*4*99?CE10 0=:((**.:)+/)//)).))((,/(*,8;)=F))\n+AAAACAGCAAAACATGCTGTAGAT\tba\t2\t@M02286:46:000000000-AEG11:1:2111:10042:23877 2:N:0:1\tAAAATGCTTTATTTGTGAAATTTGTGATGCTATTGCTTTATTTGTAACCATTATAAGCTGCAATAAACAAGTTAACAACAAAAATTGCATTCATTTTATGTTTCAGGTTCAGGGGGAGGTGTGGGAGGTTTTTTTAAGCAAGTAAAACCTCTAAAAATGTGTTATGGCTGATTATGATCATGAATAGACTGTGCGGACGGAGGGGCCTGAAATGAGCATTGGGACTGTGATTAAAGGCCGTTTTCAGGCCAGTGAGCATTCAGTGTCTTGTCTAGTCCCTCATT-\tFGFG@FAFGGGG9<FGGGGGG<FFFFGEFCFFG9FG,<FF9EF,CFFGFC<FGFGFCFFF@FCFCCCAEFCEAEC9@EGGC,B4<ECFFFG9<AF<,@EC@,:,ABEFC9<<AEEGCGG,AFC77=FF:F9C@++94,E>=@CE@FGGE,@E,,8>,>,8:,,CD;63+63@:D,DEF:EED9=,,:=F99+:7+3*+*@;***5**15*)*+0BA:DCB@8262*/*1)+).1)+*(/).(2/);)*(/,())+)().*.)-*).).))*)))**.).),()- \n+AAAACATAGTGGCCGCGAGTTCTT\tab\t1\t@M02286:46:000000000-AEG11:1:1102:22486:1531 1:N:0:1\tTGATTAAGCATTGGTAACTGTCAGACCAAGTTTACTCATATATACTTTAGATTGATTTAAAACTTCATTTTTAATTTAAAAGGATCTAGGTGAAGATCCTTTTTGATAATCTCATGACCAAAATCCCTTAACGTGAGTTTTCGTTCCACTGAGCGTCAGACCCCGTAGAAAAGATCAAAGGATCTTCTTGAGATCC-TTTTTTTCTGCGCGTAATCTGCTGCTTGCAAACAAAAAAACCACCGCTACCAGCGGTGGTTTGTTTGCCGGATCAAGAGCTACCAACT\tGGFGFGEFGCGGCFFGGGGGFEFGGGFGFFFGGGGGFGGGGGGGGGGGGGGGGGGGGGFFFGGGGGGGGGFGFGGGGDEEEE@FGGFFGFFGG?FFGGGGGGGGDEFGGGCF9FGF<FFFGDFGGGFC<EEFGFCFFFFFGFGGG8<ADF@FEFDGGGG:FGGGFGEGGGGGGGFFF?D;:,7=>;D,DFAFFGFG 
CFEFFGECCE:CGBEDE>@AC9:FFAC9?*BCFGE4C;5DC?<58CAFB>FFF:7BB:5,=F38).<=4<5:?BFF?2<:>?21231(\n+AAAACATAGTGGCCGCGAGTTCTT\tab\t1\t@M02286:4'..b';FFF?=EFAFFF)>E<77035AC59)30))*:E;AC9AB22596:EFB4)5(833@3??\n+TTGCTCAGCTCAAGTGTTATATCG\tba\t2\t@M02286:46:000000000-AEG11:1:2107:26502:21197 2:N:0:1\tAGGATGAAGCATGAAAATAGAAAATTATACAGGAAAGATCCACTTGTGTGGGTTGATTGCTACTGCTTCGATTGCTTTAGACTGTGGTTTGGACTTGATCTTTGTGAAGGAACCTTACTTCTGTGGTTTGACATATTTGGACAAACTACCTACAGAGATTTAAAGCTCTAAGGTAAATATAAAATTTTTAAGTGTATAATGTGTTAAACTACTGATTCTACTTGTTTGTGTATTTTAGCTTCCATCCTATGGAACTGATGAATGGCCGCAGTGTTGGAATGCTT-\t-68<E@<@@F<@FCC6,<,CCF9,CF<F,CFFF86,C<FECFGGGFGFFCECFCE8@F9EGGGFDC6C@F,CAFGGGGGA86E<@FFGGF,B,,:5A,C?B?FEF,,,:E,,,4=?F49FDE9F,=F+E7,;=F=,CA,=,E8,,9D,A?F9F8>,>,@@;E,4@DCDF8=CFE8E,@:EA,+7?=B,?=9,@=@++++=+?==:9,7:D5FFD+?CCF9+421B:00AF8*9?CA63*=E955*+195>8)8**..*21**:1)))//9)8>*B/5)+776+4 \n+TTGCTCAGCTCAAGTGTTATATCG\tba\t2\t@M02286:46:000000000-AEG11:1:2108:20398:11389 2:N:0:1\tAGGATGAAGCATGAAAATAGAAAATTATACAGGAAAGATCCACTTGTGTGGGTTGATTGCTACTGCTTCGATTGCTTTAGAATGTGGTTTGGACTTGATCTTTGTGAAGGAACCTTACTTCTGTGGTGTGACATAATTGGACAAACTACCTACAGAGATTTAAAGCTCTAAGGTAAATATAAAATTTTTAAGTGTATAATGTGTTAAACTACTGATTCTAATTGTTTGTGTATTTTAGATTCCAACCTATGGAACTGATGAATGGGAGCAGTGGTGGAATGCCT-\tEGGGGGGGGGGGGGGGGGGGGFEGGGGGGGGGGGGGGGGGGGGGGGFGGGGGGGGGGFDFFFFFGG9@FGGFGGC<FGGGG<FGGGGG7EFGGGBAAFFFGGGGGGFEFFGGDG?>EGGGGGGGFFGGGCDFCFCFGGG?CCFCFEFFGGGFFGFECDFGG=D8DGGGE,=CFGGGFDAFFFC,9F<=DFFD@A<DFGGGGF+=:9CDDBDFB6EFECGFFAGF=;CFFF5*?+;CCF@FFE**3*1)0>F@C@F4*4+>*6ACEB)<)-0*73=(62/>@C,5 \n+TTGCTCAGCTCAAGTGTTATATCG\tba\t2\t@M02286:46:000000000-AEG11:1:2115:5903:8910 
2:N:0:1\tAGGATGAAGCATGAAAATAGAAAATTATACAGGAAAGATCCACTTGTGTGGGTTGATTGCTACTGCTTCGATTGCTTTAGAATGTGGTTTGGACTTGATCTTTGTGAAGGAACCTTACTTCTGTGGTGTGACATAATTGGACAAACTACCTACAGAGATTTAAAGCTCTAAGGTAAATATAAAATTTTTAAGTGTATAATGTGTTACACTACTGATTCTAATTGTTTGTGTATTTTAGATTCCAACCTATGGAACTGATGAATGGGAGCAGTGGTGGAATGCCT-\tGGGGGGGGGGFGGGGGGGGGGCDFFGGGCFGFGFEGGGGGGGGCEFF<FECDFCFGGGGGGGGFGGGGGGCF,B9EEFGGGGGGFFFFFGEGGGCFFG9FFFFGGGFGFFGFGG@?E?FGGGGGFGGGGGGFFGDFFGGGGEG@EGAFFGFFCEFGGFGC9,D@EFFGGF=FFFCFDEGGCGG,,=CD8==9FG9=CFFFGF;EGD,;=?FF+3*0@CCCFEFFFFFF@DFFEF7CAD*>EE+;;5?F@A@587)*6<EF>6EEF22>*((*(7@@225356./ \n+TTGCTCAGCTCAAGTGTTATATCG\tba\t2\t@M02286:46:000000000-AEG11:1:2118:23131:17696 2:N:0:1\tAGGATGAAGCATGAAAATAGAAAATTATACAGGAAAGATCCACTTGTGTGGGTTGATTGCTACTGCTTCGATTGCTTTAGAATGTGGTTTGGACTTGATCTTTGTGAAGGAACCTTACTTCTGTGGTGTGACATAATTGGACAAACTACCTACAGAGATTTAAAGCTCTAAGGTAAATATAAAATTTTTAAGTGTATAATGTGTTAAACTACTGATTCTAATTGTTTGTGTATTTTAGATTCCAACCTATGGAACTGATGAATGGGAGCAGTGGTGGAATGCCT-\tGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFFGGGGGGGGAEGGGGGGDGGGGGGGGGGGGGGGGGGGCFFGGAFFGEDFGGGGGGGFGGGGGGGGGGFG8;DGGGEAFGDFGGGDGFDGFGGGG9DAFGFFGGEGFGFFFBGBFC;EFF=8F?DC=DFFFGF7FFEEFFCC?=C<EFEFC,>EEEE5)>))3*@*9=EE>?;<*6AB6>DD(3BB)*((587@F)9 \n+TTGCTCAGCTCAAGTGTTATATCG\tba\t2\t@M02286:46:000000000-AEG11:1:2119:15589:18585 2:N:0:1\tAGGATGAAGCATGAAAATAGAAAATTATACAGGAAAGATCCACTTGTGTGGGTTGATTGCTACTGCTTCGATTGCTTTAGAATGTGGTTTGGACTTGATCTTTGTGAAGGAACCTTACTTCTGTGGTGTGACATAATTGGACAAACTACCTACAGAGATTTAAAGCTCTAAGGTAAATATAAAATTTTTAAGTGTATAATGTGTTAAACTACTGATTCTAATTGTTTGTGTATTTTAGATTCCAACCTATGGAACTGATGAATGGGAGCAGTGGTGGAATGCCT-\tGGGGGGDGGGGGGFGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFGGGGGGGGGGGGGGGGGGGGGGGGGGGGFGGGGGFGGGGGGGGGGGGGGGGGGGGGGGGGGGGG<FFFGFGGGGGGGGGGGGGGGGGGCACFGFFGGGFGGGGGFGGFCDGGGGCDCEFFFGGGFGGFF=DEGGGGGGGGFFGGEFFFGGFCFE7=FGF+=DFBFFFFFFFC@5CFEFEGFGFF@EFEEFB7A;AAA;=>@9A@C595BF9AEE4AEF@@4:>:>:BE:>5>=@*)8 \n+TTGCTCAGCTCAAGTGTTATATCG\tba\t2\t@M02286:46:000000000-AEG11:1:2119:2379:17240 
2:N:0:1\tAGGATGAAGCATGAAAATAGAAAATTATACAGGAAAGATCCACTTGTGTGGGTTGATTGCTACTGCTTCGATTGCTTTAGAATGTGGTTTGGACTTGATCTTTGTGAAGGAACCTTACTTCTGTGGTGTGACATAATTGGACAAACTACCTACAGAGATTTAAAGCTCTAAGGTAAATATAAAATTTTTAAGTGTATAATGTGTTAAACTACTGATTCTAATTGTTTGTGTATTTTAGATTCCAACCTATGGAACTGATGAGTGGGAGCAGTGGTGGAATGCCT-\tGGGGGDGGGGGGGGFGGGGGFFGGGDFGGGGG?FDGGGGFF@<@@FFFFFGG@CFGGGDGGFFFGGCCFGF9FGG9FG<FGFDGFFG4F,CCDFF9EFDF9FGGGGCFGGGF?AFE?,CEEFFFG8=D==@E>9EC>BEFGFCFCDED@D@FGG8AD,;,,@==D9,6;=>FFGGFFDGGDGGGCDCDFD?88=FFFFGF7=+?F7D=+?D?+;FDFAFEGFFE<;7==FF@*C,5BC7E@**3;55)3<:@>>@3=BE=C)/.)/49>2=@=294@F:*)101 \n' |
b |
diff -r 13bcc2f459b0 -r af383638de66 misc/bug1/diff.sscs.after.fa --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/misc/bug1/diff.sscs.after.fa Mon Nov 23 18:44:23 2015 -0500 |
b |
b'@@ -0,0 +1,9426 @@\n+>AAAAAAACGGAACCACGTCACATT.ba.2 3\n+CCCGTATCGTAGTTATCTACACGACGGGGAGTCAGGCAACTATGGATGAACGAAATAGACAGATCGCTGAGATAGGTGCCTCACTGATTAAGCATTGGTAACTGTCAGACCAAGTTTACTCATATATACTTTAGATTGATTTAAAACTTCATTTTTAATTTAAAAGGATCTAGGTGAAGATCCTTTTTGATAATCTCATGACCANANTCCCTTAACGTGANTTTTCGTTCCNNTGAGCGTCAGACCCCGTANNNNNGNNCANAGGNTCTTCTTGAGNTCCTTTTT\n+>AAAAAAAGCATGCTGCGGAATGAC.ab.1 7\n+GAAAGCGCCACGCTTCCCGAAGGGAGAAAGGCGGACAGGTATCCGGTAAGCGGCAGGGTCGGAACAGGAGAGCGCACGAGGGAGCTTCCAGGGGGAAACGCCTGGTATCTTTATAGTCCTGTCGGGTTTCGCCACCTCTGACTTGAGCGTCGATTTTTGTGATGCTCGTCAGGGGGGCGGAGCCTATGGAAAAACGCCAGCAACGCGGCCTTTTTACGGTTCCTGGCCTTTTGCTGGCCTTTTGCTCACATGTTCTTTCCTGCNTTATCCNCTGATTCTGTGGNN\n+>AAAAAAATCTATCCTACCGTCATA.ba.2 5\n+GAAACTCCTTGCATTTTTTTAAATATGCCTTTCTCATCAGAGGAATATTCCCCCAGGCACTCCTTTCAAGACCTAGAAGGTCCATTAGCTGCAAAGATTCCTCTCTGTTTAAAACTTTATCCATCTTTGCAAAGCTTTTTGCAAAAGCCTAGGCCTCCAAAAAAGCCTCCTCACTACTTCTGGAATAGCTCAGAGGCCGNGGCGGCCTCGGCCTCTGCATNAATAAAAAAANTTAGTCANCCATGGGNNNNNNNNNGGGCNNNNNNNGGCNGNNTTNGGGNNGG\n+>AAAAACGACGCTCGTTCTCGAGAT.ba.2 5\n+ATCATAACATACTGTTTTTTCTTACTCCACACAGGCATAGAGTGTCTGCTATTAATAACTATGCTCAAAAATTGTGTACCTTTAGCTTTTTAATTTGTAAAGGGGTTAATAAGGAATATTTGATGTATAGTGCCTTGACTAGAGATCCATTTTCTGTTATTGAGGAAAGTTTGCCAGGTGGGTTAAAGGAGCATGATTTTAATCCAGAAGAAGCAGAGGAAACTAAACAAGTGTCCTGGAAGCTTGTAACAGAGTATGCAATGGAAACAAANTGTGATGATGTNN\n+>AAAAAGAAAACATAGCGCTGTGAA.ab.1 5\n+CAATAACCCTGATAAATGCTTCAATAATATTGAAAAAGGAAGAGTATGAGTATTCAACATTTCCGTGTCGCCCTTATTCCCTTTTTTGCGGCATTTTGCCTTCCTGTTTTTGCTCACCCAGAAACGCTGGTGAAAGTAAAAGATGCTGAAGATCAGTTGGGTGCACGAGTGGGTTACATCGAACTGGATCTCAACAGCGGTAAGATCCTTGAGAGTTTTCGCCCCGAAGAACGTTTTCCAATGATGAGCACTTTTAAAGTTCTGCTATGTGGCGCGGTATTATCN\n+>AAAAAGATATACAGGATTAATAAG.ab.2 7\n+AGCTCTTGATCCGGCAAACAAACCACCGCTGGTAGCGGTGGTTTTTTTGTTTGCAAGCAGCAGATTACGCGCAGAAAAAAAGGATCTCAAGAAGATCCTTTGATCTTTTCTACGGGGTCTGACGCTCAGTGGAACGAAAACTCACGTTAAGGGATTTTGGTCATGAGATTATCAAAAAGGATCTTCACCTAGATCCTTTTAAATTAAAAATGAAGTTTTAAATCAATCTAAAGTATATATGAGTAAACTTGGTCTGACAGTTACCAATGNTTAATCAGTGAGNNN\n+>AAAAAGCATGTGTAGCTAATAAAA.ab.1 
7\n+AAGAGTCCACTATTAAAGAACGTGGACTCCAACGTCAAAGGGCGAAAAACCGTCTATCAGGGCGATGGCCCACTACGTGAACCATCACCCTAATCAAGTTTTTTGGGGTCGAGGTGCCGTAAAGCACTAAATCGGAACCCTAAAGGGAGCCCCCGATTTAGAGCTTGACGGGGAAAGCCGGCGAACGTGGCGAGAAAGGAAGGGAAGAAAGCGAAAGGAGCGGGCGCTAGGGCGCTGGCAAGTGTAGCGGTCACGCTGCGCGTAACCANCACANCCGCCGCGCNN\n+>AAAAATGATAAAACAGGCATGAGA.ab.1 7\n+AATGTGGTATGGCTGATTATGATCATGAACAGACTGTGAGGACTGAGGGGCCTGAAATGAGCCTTGGGACTGTGAATCAATGCCTGTTTCATGCCCTGAGTCTTCCATGTTCTTCTCCCCACCATCTTCATTTTTATCAGCATTTTCCTGGCTGTCTTCATCATCATCATCACTGTTTCTTAGCCAATCTAAAACTCCAATTCCCATAGCCACATTAAACTTCATTTTTTGATACACTGACAAACTAAACTCTTTGTCCAATCTCTCTTTCCACTCCACAATTCNNN\n+>AAAACACGCTCCCGGACGTTGTAC.ab.2 5\n+TGCTTTATTTGTAACCATTATAAGCTGCAATAAACAAGTTAACAACAACAATTGCATTCATTTTATGTTTCAGGTTCAGGGGGAGGTGTGGGAGGTTTTTTAAAGCAAGTAAAACCTCTACAAATGTGGTATGGCTGATTATGATCATGAACAGACTGTGAGGACTGAGGGGCCTGAAATGAGCCTTGGGACTGTGAATCAATGCCTGTTTCATGCCCTGAGTCTTCCATGTTCTTCTCCCCACCATCTTCATTTTTATCAGCANTTTCCTGGNNGTCTTCATNN\n+>AAAACAGAGAATCGGAAGGAATAT.ab.1 5\n+GTAGCACCGCCTACATACCTCGCTCTGCTAATCCTGTTACCAGTGGCTGCTGCCAGTGGCGATAAGTCGTGTCTTACCGGGTTGGACTCAAGACGATAGTTACCGGATAAGGCGCAGCGGTCGGGCTGAACGGGGGGTTCGTGCACACAGCCCAGCTTGGAGCGAACGACCTACACCGAACTGAGATACCTACAGCGTGAGCTATGAGAAAGCGCCACGCTTCCCGAAGGGAGAAAGGCGGACAGGTATCCGGTAAGCGGCAGGGTCGGAANAGGAGAGCGCANN\n+>AAAACAGCAAAACATGCTGTAGAT.ba.2 5\n+AAAATGCTTTATTTGTGAAATTTGTGATGCTATTGCTTTATTTGTAACCATTATAAGCTGCAATAAACAAGTTAACAACAACAATTGCATTCATTTTATGTTTCAGGTTCAGGGGGAGGTGTGGGAGGTTTTTTAAAGCAAGTAAAACCTCTACAAATGTGGTATGGCTGATTATGATCATGAACAGACTGTGAGGACTGAGGGGCCTGAAATGAGCCTTGGGACTGTGAATCAATGCCTGTTTCATGCCCTGANNCTTNCATGNTNTTNTCNNNNCNANCTTN\n+>AAAACATACCAGTAGGACTTGAGA.ba.1 9\n+AGGAGGCACATTTTCCCCACCTGTGTAGGTTCCAAAATATCTAGTGTTTTCATTTTTACTTGGATCAGGAACCCAGCACTCCACTGGATAAGCATTATCCTTATCCAAAACAGCCTTGTGGTCAGTGTTCATCTGCTGACTGTCAACTGTAGCATTTTTTGGGGTTACAGTTTGAGCAGGATATTTGGTCCTGTAGTTTGCTAACACACCCTGCAGCTCCAAAGGTTCCCCACCAACAGCAAAAAAATGAAAATTTGACCCTTGAATGGGTTTTCCAGCACCATN\n+>AAAACATAGTGGCCGCGAGTTCTT.ab.1 
8\n+TGATTAAGCATTGGTAACTGTCAGACCAAGTTTACTCATATATACTTTAGATTGATTTAAAACTTCATTTTTAATTTAAAAGGATCTAGGTGA'..b'TTGCATAATGCTTTTCATGGTACTTATAGTGGCTGGGCTGTTCTTTTTTAATACATTTTAAACACATTTCAAAACTGTACTGAAATTCCAAGTACATCCCAAGCAATAACAACACATCATCACATTTTGTTTCCATTGCATACTCN\n+>TTTCCTGGCCCAAAGATCTTTTCA.ab.2 5\n+TGCAAAGCATGCATCTCAATTAGTCAGCAACCAGGTGTGGAAAGTCCCCAGGCTCCCCAGCAGGCAGAAGTATGCAAAGCATGCATCTCAATTAGTCAGCAACCATAGTCCCGCCCCTAACTCCGCCCATCCCGCCCCTAACTCCGCCCAGTTCCGCCCATTCTCCGCCCCATGGCTGACTAATTTTTTTTATTTATGCAGAGGCCGAGGCCGCCTCGGCCTCTGAGCTATTCCAGAAGTAGTGAGGAGGCTTTTTTNNNNGNCTNNNCNTTNNCNAANNNNNTN\n+>TTTCGACTCCCGTGAATGTGTCGA.ab.2 12\n+TGGAACAAGAGTCCACTATTAAAGAACGTGGACTCCAACGTCAAAGGGCGAAAAACCGTCTATCAGGGCGATGGCCCACTACGTGAACCATCACCCTAATCAAGTTTTTTGGGGTCGAGGTGCCGTAAAGCACTAAATCGGAACCCTAAAGGGAGCCCCCGATTTAGAGCTTGACGGGGAAAGCCGGCGAACGTGGCGAGNNAGGAAGGGAAGAAAGCGAANGGAGCGGGCGCNNNNGCGCTGNNANGTGTNGNNNTCACGCNNNNCNNNACNACCANNNNCNCN\n+>TTTCGCGTAAACTCCCCTTGTGAA.ba.1 5\n+AAAGAACGTGGACTCCAACGTCAAAGGGCGAAAAACCGTCTATCAGGGCGATGGCCCACTACGTGAACCATCACCCTAATCAAGTTTTTTGGGGTCGAGGTGCCGTAAAGCACTAAATCGGAACCCTAAAGGGAGCCCCCGATTTAGAGCTTGACGGGGAAAGCCGGCGAACGTGGCGAGAAAGGAAGGGAAGAAAGCGAAAGGAGCGGGCGCTAGGGCGCTGGCAAGTGTAGCGGTCACGCTGCGCGTAACCACCACACCCGCCGCGCTTAATGCGCCGCTANN\n+>TTTCTCACGCACCATTCTTTAAAG.ab.1 10\n+TTCTTCTGGATTAAAATCATGCTCCTTTAACCCACCTGGCAAACTTTCCTCAATAACAGAAAATGGATCTCTAGTCAAGGCACTATACATCAAATATTCCTTATTAACCCCTTTACAAATTAAAAAGCTAAAGGTACACAATTTTTGAGCATAGTTATTAATAGCAGACACTCTATGCCTGTGTGGAGTAAGAAAAAACAGTATGTTATGATTATAACTGTTATGCCTACTTATAAAGGTTACAGAATATTTTTCCATAATTTTCTTGTATAGCAGTGCAGCTT\n+>TTTGACATCAACAGAGTACGTTTC.ba.2 3\n+AGTTTGGCAAGGTTTTTAGAGGAAACTACTTGGACAGTAATTAATGCTCCTGTTAATTGGTATAACTCTTTACAAGATTACTACTCTACTTTGTCTCCCATTAGGCCTACAATGGTNAGACAAGTAGCCAACAGGGAAGGGTTGCAAATATCNTTTGNGCACACCTATGATAANATTGATGAANCANACAGTATTNNGCAAGTANCTGNGNGGTNGGNAGNNNAAANNNNANGTCNTAANNTNNNNNNNNNNNANTTNNNTNNAAANNNNNNNNNNNNTGGTNNN\n+>TTTGAGCAGATTGGTCACTTTTCG.ba.1 
4\n+GAGGTATGTAGGCGGTGCTACAGAGTTCTTGAAGTGGTGGCCTAACTACGGCTACACTAGAAGAACAGTATTTGGTATCTGCGCTCTGCTGAAGCCAGTTACCTTCGGAAAAAGAGTTGGTAGCTCTTGATCCGGCAAACAAACCACCGCTGGTAGCGGTGGTTTTTTTGTTTGCAAGCAGCAGATTACGCGCAGAAAAAAAGGATCTCAAGAAGATCCTTTGATCTTTTCTACGGGGTCTGACGCTCAGTGGAACGAAAACTCACGTTNAGGGATTTTGGTCAT\n+>TTTGCCAGATCCGCTTACCTCCTT.ab.1 4\n+TATCCCCTGATTCTGTGGATAACCGTATTACCGCCTTTGAGTGAGCTGATACCGCTCGCCGCAGCCGAACGACCGAGCGCAGCGAGTCAGTGAGCGAGGAAGCGGAAGAGCGCCCAATACGCAAACCGCCTCTCCCCGCGCGTTGGCCGATTCATTAATGCAGCTGGCACGACAGGTTTCCCGACTGGAAAGCGGGCAGTGAGCGCAACGCAATTAATGTGAGTTAGCTCACTCATTAGGCACCCCAGGCTTTACACTTTATGCTTCCGGCTCGTANGTTGTGNN\n+>TTTGCTAGGAAACGCTACCGTATT.ba.2 4\n+GTAGAATGTTGAGAGTCAGCAGTAGCCTCATCATCACTAGATGGCATTTCTTCTGAGCAAAACAGGTTTTCCTCATTAAAGGCATTCCACCACTGCTCCCATTCATCAGTTCCATAGGTTGGAATCTAAAATACACAAACAATTAGAATCAGTAGTTTAACACATTATACACTTAAAAATTTTATATTTACCTTAGAGCTTTAAATCTCTGTAGGTAGTTTGTCCAATTATGTCACACCACAGAAGTAAGGTTCCTTCACAAAGATCAAGTCCAANCNNCATTNN\n+>TTTGGAGACAGATGCCTACGCCGT.ab.1 6\n+TACAGGACCAAATATCCTGCTCAAACTGTAACCCCAAAAAATGCTACAGTTGACAGTCAGCAGATGAACACTGACCACAAGGCTGTTTTGGATAAGGATAATGCTTATCCAGTGGAGTGCTGGGTTCCTGATCCAAGTAAAAATGAAAACACTAGATATTTTGGAACCTACACAGGTGGGGAAAATGTGCCTCCTGTTTTGCACATTACTAACACAGCAACCACAGTGCTTCTTGATGAGCAGGGTGTTGGGCCCTTGTGCAAAGCTGACAGCTTGTATGTTTCT\n+>TTTGGCGAACGGGATTGCTTCACC.ba.2 5\n+AGCAGAGCGCAGATACCAAATACTGTTCTTCTAGTGTAGCCGTAGTTAGGCCACCACTTCAAGAACTCTGTAGCACCGCCTACATACCTCGCTCTGCTAATCCTGTTACCAGTGGCTGCTGCCAGTGGCGATAAGTCGTGTCTTACCGGGTTGGACTCAAGACGATAGTTACCGGATAAGGCGCAGCGGTCGGGCTGAACGGGGGGTTCGTGCACACAGCCCAGCNTNNNNCNNNNGNCNTANNCCNNNCTGNNANNNNNNCNNNNNNNNCTNTNNNNNNNCGN\n+>TTTGGGACAAATTTATTAGGGCTT.ab.1 4\n+AGTTATCTACACGACGGGGAGTCAGGCAACTATGGATGAACGAAATAGACAGATCGCTGAGATAGGTGCCTCACTGATTAAGCATTGGTAACTGTCAGACCAAGTTTACTCATATATACTTTAGATTGATTTAAAACTTCATTTTTAATTTAAAAGGATCTAGGTGAAGATCCTTTTTGATAATCTCATGACCAAAATCCCTTAACGTGAGTTTTCGTTCCACTGAGCGTCAGACCCCGTAGAAAAGATCAAAGGATCTTCTTGAGATCCTTTTTTTCTGCGCNN\n+>TTTTAAGCGAAATTTACCCGTTAA.ab.2 
9\n+TTGAGATCCAGTTCGATGTAACCCACTCGTGCACCCAACTGATCTTCAGCATCTTTTACTTTCACCAGCGTTTCTGGGTGAGCAAAAACAGGAAGGCAAAATGCCGCAAAAAAGGGAATAAGGGCGACACGGAAATGTTGAATACTCATACTCTTCCTTTTTCAATATTATTGAAGCATTTATCAGGGTTATTGTCTCATGANCGGATACATATTTGAATGTATTTAGAAAAATAAACAAATAGGGGTTCCGCGCACATTTCCCCGAAAAGTGCNNCNNGATGNN\n' |
b |
diff -r 13bcc2f459b0 -r af383638de66 misc/bug1/diff.sscs.before.fa --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/misc/bug1/diff.sscs.before.fa Mon Nov 23 18:44:23 2015 -0500 |
b |
b'@@ -0,0 +1,9426 @@\n+>AAAAAAACGGAACCACGTCACATT.ba.2 3\n+CCCGTATCGTAGTTATCTACACGACGGGGAGTCAGGCAACTATGGATGAACGAAATAGACAGATCGCTGAGATAGGTGCCTCACTGATTAAGCATTGGTAACTGTCAGACCAAGTTTACTCATATATACTTTAGATTGATTTAAAACTTCATTTTTAATTTAAAAGGATCTAGGTGAAGATCCTTTTTGATAATCTCATGACCNNANTCCCTTAACGTGANTTTTCGTTCCNNTGAGCGTCAGACCCCGTANNNNNGNNCANAGGNTCTTCTTGAGNTCCTTTTN\n+>AAAAAAAGCATGCTGCGGAATGAC.ab.1 7\n+GAAAGCGCCACGCTTCCCGAAGGGAGAAAGGCGGACAGGTATCCGGTAAGCGGCAGGGTCGGAACAGGAGAGCGCACGAGGGAGCTTCCAGGGGGAAACGCCTGGTATCTTTATAGTCCTGTCGGGTTTCGCCACCTCTGACTTGAGCGTCGATTTTTGTGATGCTCGTCAGGGGGGCGGAGCCTATGGAAAAACGCCAGCAACGCGGCCTTTTTACGGTTCCTGGCCTTTTGCTGGCCTTTTGCTCACATGTTCTTTCCTGCNTTATCCNCTGATTCTGTGGN\n+>AAAAAAATCTATCCTACCGTCATA.ba.2 5\n+GAAACTCCTTGCATTTTTTTAAATATGCCTTTCTCATCAGAGGAATATTCCCCCAGGCACTCCTTTCAAGACCTAGAAGGTCCATTAGCTGCAAAGATTCCTCTCTGTTTAAAACTTTATCCATCTTTGCAAAGCTTTTTGCAAAAGCCTAGGCCTCCAAAAAAGCCTCCTCACTACTTCTGGAATAGCTCAGAGGCCGNGGCGGCCTCGGCCTCTGCATNAATAAAAAAANTTAGTCANCCATGGGNNNNNNNNNGGGCNNNNNNNGGCNGNNTTNGGGNNGGN\n+>AAAAACGACGCTCGTTCTCGAGAT.ba.2 5\n+ATCATAACATACTGTTTTTTCTTACTCCACACAGGCATAGAGTGTCTGCTATTAATAACTATGCTCAAAAATTGTGTACCTTTAGCTTTTTAATTTGTAAAGGGGTTAATAAGGAATATTTGATGTATAGTGCCTTGACTAGAGATCCATTTTCTGTTATTGAGGAAAGTTTGCCAGGTGGGTTAAAGGAGCATGATTTTAATCCAGAAGAAGCAGAGGAAACTAAACAAGTGTCCTGGAAGCTTGTAACAGAGTATGCAATGGAAACAAANTGTGATGATGTN\n+>AAAAAGAAAACATAGCGCTGTGAA.ab.1 5\n+CAATAACCCTGATAAATGCTTCAATAATATTGAAAAAGGAAGAGTATGAGTATTCAACATTTCCGTGTCGCCCTTATTCCCTTTTTTGCGGCATTTTGCCTTCCTGTTTTTGCTCACCCAGAAACGCTGGTGAAAGTAAAAGATGCTGAAGATCAGTTGGGTGCACGAGTGGGTTACATCGAACTGGATCTCAACAGCGGTAAGATCCTTGAGAGTTTTCGCCCCGAAGAACGTTTTCCAATGATGAGCACTTTTAAAGTTCTGCTATGTGGCGCGGTATTATC\n+>AAAAAGATATACAGGATTAATAAG.ab.2 7\n+AGCTCTTGATCCGGCAAACAAACCACCGCTGGTAGCGGTGGTTTTTTTGTTTGCAAGCAGCAGATTACGCGCAGAAAAAAAGGATCTCAAGAAGATCCTTTGATCTTTTCTACGGGGTCTGACGCTCAGTGGAACGAAAACTCACGTTAAGGGATTTTGGTCATGAGATTATCAAAAAGGATCTTCACCTAGATCCTTTTAAATTAAAAATGAAGTTTTAAATCAATCTAAAGTATATATGAGTAAACTTGGTCTGACAGTTACCAATGNTTAATCAGTGAGNN\n+>AAAAAGCATGTGTAGCTAATAAAA.ab.1 
7\n+AAGAGTCCACTATTAAAGAACGTGGACTCCAACGTCAAAGGGCGAAAAACCGTCTATCAGGGCGATGGCCCACTACGTGAACCATCACCCTAATCAAGTTTTTTGGGGTCGAGGTGCCGTAAAGCACTAAATCGGAACCCTAAAGGGAGCCCCCGATTTAGAGCTTGACGGGGAAAGCCGGCGAACGTGGCGAGAAAGGAAGGGAAGAAAGCGAAAGGAGCGGGCGCTAGGGCGCTGGCAAGTGTAGCGGTCACGCTGCGCGTAACCANCACANCCGCCGCGCN\n+>AAAAATGATAAAACAGGCATGAGA.ab.1 7\n+AATGTGGTATGGCTGATTATGATCATGAACAGACTGTGAGGACTGAGGGGCCTGAAATGAGCCTTGGGACTGTGAATCAATGCCTGTTTCATGCCCTGAGTCTTCCATGTTCTTCTCCCCACCATCTTCATTTTTATCAGCATTTTCCTGGCTGTCTTCATCATCATCATCACTGTTTCTTAGCCAATCTAAAACTCCAATTCCCATAGCCACATTAAACTTCATTTTTTGATACACTGACAAACTAAACTCTTTGTCCAATCTCTCTTTCCACTCCACAATTC\n+>AAAACACGCTCCCGGACGTTGTAC.ab.2 5\n+TGCTTTATTTGTAACCATTATAAGCTGCAATAAACAAGTTAACAACAACAATTGCATTCATTTTATGTTTCAGGTTCAGGGGGAGGTGTGGGAGGTTTTTTAAAGCAAGTAAAACCTCTACAAATGTGGTATGGCTGATTATGATCATGAACAGACTGTGAGGACTGAGGGGCCTGAAATGAGCCTTGGGACTGTGAATCAATGCCTGTTTCATGCCCTGAGTCTTCCATGTTCTTCTCCCCACCATCTTCATTTTTATCAGCANTTTCCTGGNNGTCTTCATN\n+>AAAACAGAGAATCGGAAGGAATAT.ab.1 5\n+GTAGCACCGCCTACATACCTCGCTCTGCTAATCCTGTTACCAGTGGCTGCTGCCAGTGGCGATAAGTCGTGTCTTACCGGGTTGGACTCAAGACGATAGTTACCGGATAAGGCGCAGCGGTCGGGCTGAACGGGGGGTTCGTGCACACAGCCCAGCTTGGAGCGAACGACCTACACCGAACTGAGATACCTACAGCGTGAGCTATGAGAAAGCGCCACGCTTCCCGAAGGGAGAAAGGCGGACAGGTATCCGGTAAGCGGCAGGGTCGGAANAGGAGAGCGCAN\n+>AAAACAGCAAAACATGCTGTAGAT.ba.2 5\n+AAAATGCTTTATTTGTGAAATTTGTGATGCTATTGCTTTATTTGTAACCATTATAAGCTGCAATAAACAAGTTAACAACAACAATTGCATTCATTTTATGTTTCAGGTTCAGGGGGAGGTGTGGGAGGTTTTTTAAAGCAAGTAAAACCTCTACAAATGTGGTATGGCTGATTATGATCATGAACAGACTGTGAGGACTGAGGGGCCTGAAATGAGCCTTGGGACTGTGAATCAATGCCTGTTTCATGCCNCTGANNCTTNCATGNTNTTNTCNNNNCNANCTTN\n+>AAAACATACCAGTAGGACTTGAGA.ba.1 9\n+AGGAGGCACATTTTCCCCACCTGTGTAGGTTCCAAAATATCTAGTGTTTTCATTTTTACTTGGATCAGGAACCCAGCACTCCACTGGATAAGCATTATCCTTATCCAAAACAGCCTTGTGGTCAGTGTTCATCTGCTGACTGTCAACTGTAGCATTTTTTGGGGTTACAGTTTGAGCAGGATATTTGGTCCTGTAGTTTGCTAACACACCCTGCAGCTCCAAAGGTTCCCCACCAACAGCAAAAAAATGAAAATTTGACCCTTGAATGGGTTTTCCAGCACCAT\n+>AAAACATAGTGGCCGCGAGTTCTT.ab.1 
8\n+TGATTAAGCATTGGTAACTGTCAGACCAAGTTTACTCATATATACTTTAGATTGATTTAAAACTTCATTTTTAATTTAAAAGGATCTAGGTGAAGATCCTTT'..b'CAGCATTTGCATAATGCTTTTCATGGTACTTATAGTGGCTGGGCTGTTCTTTTTTAATACATTTTAAACACATTTCAAAACTGTACTGAAATTCCAAGTACATCCCAAGCAATAACAACACATCATCACATTTTGTTTCCATTGCATACTC\n+>TTTCCTGGCCCAAAGATCTTTTCA.ab.2 5\n+TGCAAAGCATGCATCTCAATTAGTCAGCAACCAGGTGTGGAAAGTCCCCAGGCTCCCCAGCAGGCAGAAGTATGCAAAGCATGCATCTCAATTAGTCAGCAACCATAGTCCCGCCCCTAACTCCGCCCATCCCGCCCCTAACTCCGCCCAGTTCCGCCCATTCTCCGCCCCATGGCTGACTAATTTTTTTTATTTATGCAGAGGCCGAGGCCGCCTCGGCCTCTGAGCTATTCCAGAAGTAGTGAGGAGGCTTTTTTNNNNGNCTNNNCNTTNNCNAANNNNNTT\n+>TTTCGACTCCCGTGAATGTGTCGA.ab.2 12\n+TGGAACAAGAGTCCACTATTAAAGAACGTGGACTCCAACGTCAAAGGGCGAAAAACCGTCTATCAGGGCGATGGCCCACTACGTGAACCATCACCCTAATCAAGTTTTTTGGGGTCGAGGTGCCGTAAAGCACTAAATCGGAACCCTAAAGGGAGCCCCCGATTTAGAGCTTGACGGGGAAAGCCGGCGAACGTGGCGAGNNAGGAAGGGAAGAAAGCGAANGGAGCGGGCGCNNNNGCGCTGNNANGTGTNGNNNTCACGCNNNNCNNNACNACCANNNNCNCC\n+>TTTCGCGTAAACTCCCCTTGTGAA.ba.1 5\n+AAAGAACGTGGACTCCAACGTCAAAGGGCGAAAAACCGTCTATCAGGGCGATGGCCCACTACGTGAACCATCACCCTAATCAAGTTTTTTGGGGTCGAGGTGCCGTAAAGCACTAAATCGGAACCCTAAAGGGAGCCCCCGATTTAGAGCTTGACGGGGAAAGCCGGCGAACGTGGCGAGAAAGGAAGGGAAGAAAGCGAAAGGAGCGGGCGCTAGGGCGCTGGCAAGTGTAGCGGTCACGCTGCGCGTAACCACCACACCCGCCGCGCTTAATGCGCCGCTAN\n+>TTTCTCACGCACCATTCTTTAAAG.ab.1 10\n+TTCTTCTGGATTAAAATCATGCTCCTTTAACCCACCTGGCAAACTTTCCTCAATAACAGAAAATGGATCTCTAGTCAAGGCACTATACATCAAATATTCCTTATTAACCCCTTTACAAATTAAAAAGCTAAAGGTACACAATTTTTGAGCATAGTTATTAATAGCAGACACTCTATGCCTGTGTGGAGTAAGAAAAAACAGTATGTTATGATTATAACTGTTATGCCTACTTATAAAGGTTACAGAATATTTTTCCATAATTTTCTTGTATAGCAGTGCAGCTTN\n+>TTTGACATCAACAGAGTACGTTTC.ba.2 3\n+AGTTTGGCAAGGTTTTTAGAGGAAACTACTTGGACAGTAATTAATGCTCCTGTTAATTGGTATAACTCTTTACAAGATTACTACTCTACTTTGTCTCCCATTAGGCCTACAATGGTNAGACAAGTAGCCAACAGGGAAGGGTTGCAAATATCNTTTGNGCACACCTATGATAANATTGATGAANCANACAGTATTNNGCAAGTANCTGNGNGGTNGGNAGNNNAAANNNNANGTCNTAANNTNNNNNNNNNNNANTTNNNTNNAAANNNNNNNNNNNNTGGTNN\n+>TTTGAGCAGATTGGTCACTTTTCG.ba.1 
4\n+GAGGTATGTAGGCGGTGCTACAGAGTTCTTGAAGTGGTGGCCTAACTACGGCTACACTAGAAGAACAGTATTTGGTATCTGCGCTCTGCTGAAGCCAGTTACCTTCGGAAAAAGAGTTGGTAGCTCTTGATCCGGCAAACAAACCACCGCTGGTAGCGGTGGTTTTTTTGTTTGCAAGCAGCAGATTACGCGCAGAAAAAAAGGATCTCAAGAAGATCCTTTGATCTTTTCTACGGGGTCTGACGCTCAGTGGAACGAAAACTCACGTTNAGGGATTTTGGTCAN\n+>TTTGCCAGATCCGCTTACCTCCTT.ab.1 4\n+TATCCCCTGATTCTGTGGATAACCGTATTACCGCCTTTGAGTGAGCTGATACCGCTCGCCGCAGCCGAACGACCGAGCGCAGCGAGTCAGTGAGCGAGGAAGCGGAAGAGCGCCCAATACGCAAACCGCCTCTCCCCGCGCGTTGGCCGATTCATTAATGCAGCTGGCACGACAGGTTTCCCGACTGGAAAGCGGGCAGTGAGCGCAACGCAATTAATGTGAGTTAGCTCACTCATTAGGCACCCCAGGCTTTACACTTTATGCTTCCGGCTCGTANGTTGTGN\n+>TTTGCTAGGAAACGCTACCGTATT.ba.2 4\n+GTAGAATGTTGAGAGTCAGCAGTAGCCTCATCATCACTAGATGGCATTTCTTCTGAGCAAAACAGGTTTTCCTCATTAAAGGCATTCCACCACTGCTCCCATTCATCAGTTCCATAGGTTGGAATCTAAAATACACAAACAATTAGAATCAGTAGTTTAACACATTATACACTTAAAAATTTTATATTTACCTTAGAGCTTTAAATCTCTGTAGGTAGTTTGTCCAATTATGTCACACCACAGAAGTAAGGTTCCTTCACAAAGATCAAGTCCAANCNNCATTN\n+>TTTGGAGACAGATGCCTACGCCGT.ab.1 6\n+TACAGGACCAAATATCCTGCTCAAACTGTAACCCCAAAAAATGCTACAGTTGACAGTCAGCAGATGAACACTGACCACAAGGCTGTTTTGGATAAGGATAATGCTTATCCAGTGGAGTGCTGGGTTCCTGATCCAAGTAAAAATGAAAACACTAGATATTTTGGAACCTACACAGGTGGGGAAAATGTGCCTCCTGTTTTGCACATTACTAACACAGCAACCACAGTGCTTCTTGATGAGCAGGGTGTTGGGCCCTTGTGCAAAGCTGACAGCTTGTATGTTTC\n+>TTTGGCGAACGGGATTGCTTCACC.ba.2 5\n+AGCAGAGCGCAGATACCAAATACTGTTCTTCTAGTGTAGCCGTAGTTAGGCCACCACTTCAAGAACTCTGTAGCACCGCCTACATACCTCGCTCTGCTAATCCTGTTACCAGTGGCTGCTGCCAGTGGCGATAAGTCGTGTCTTACCGGGTTGGACTCAAGACGATAGTTACCGGATAAGGCGCAGCGGTCGGGCTGAACGGGGGGTTCGTGCACACAGCCCAGCNTNNNNCNNNNGNCNTANNCCNNNCTGNNANNNNNNCNNNNNNNNCTNTNNNNNNNCGNN\n+>TTTGGGACAAATTTATTAGGGCTT.ab.1 4\n+AGTTATCTACACGACGGGGAGTCAGGCAACTATGGATGAACGAAATAGACAGATCGCTGAGATAGGTGCCTCACTGATTAAGCATTGGTAACTGTCAGACCAAGTTTACTCATATATACTTTAGATTGATTTAAAACTTCATTTTTAATTTAAAAGGATCTAGGTGAAGATCCTTTTTGATAATCTCATGACCAAAATCCCTTAACGTGAGTTTTCGTTCCACTGAGCGTCAGACCCCGTAGAAAAGATCAAAGGATCTTCTTGAGATCCTTTTTTTCTGCGCN\n+>TTTTAAGCGAAATTTACCCGTTAA.ab.2 
9\n+TTGAGATCCAGTTCGATGTAACCCACTCGTGCACCCAACTGATCTTCAGCATCTTTTACTTTCACCAGCGTTTCTGGGTGAGCAAAAACAGGAAGGCAAAATGCCGCAAAAAAGGGAATAAGGGCGACACGGAAATGTTGAATACTCATACTCTTCCTTTTTCAATATTATTGAAGCATTTATCAGGGTTATTGTCTCATGANCGGATACATATTTGAATGTATTTAGAAAAATAAACAAATAGGGGTTCCGCGCACATTTCCCCGAAAAGTGCNNCNNGATGN\n' |
b |
diff -r 13bcc2f459b0 -r af383638de66 misc/bug1/tmp.family.msa.tsv --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/misc/bug1/tmp.family.msa.tsv Mon Nov 23 18:44:23 2015 -0500 |
b |
@@ -0,0 +1,4 @@ +CTCGAGCTATACCACCTTAGACGT ba 1 @M02286:46:000000000-AEG11:1:1108:8496:6724 1:N:0:1 AAAAGTGCTCATCATTGGAAAACGTTCTTCGGGGCGAAAACTCTCAAGGATCTTACCGCTGTTGAGATCCAGTTCGATGTAACCCACTCGTGCACCCAACTGATCTTCAGCATCTTTTACTTTCACCAGCGTTTCTGGGTGAGCAAAAACAGGAAGGCAAAATGCCGCAAAAAAGGGAATAAGGGCGACACGGAAATGTTGAATACTCATACTCTTCCTTTTTCAATATTATTGAAGCATTTATCAGGGTTATTGTCTCATGAGCGGATACATATTTGAATGTT- GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFGGGGGGGGGGGGGGFGGGGGGDGDDFGGGG?GGGGGGFGGGGGGGGGGGGGGCCFGGGEEGGG?FGGGGGGGECEGGCC7CDCCFCGGGFGGD9CFGGGFGGGGGGGGGG7FFFGGGGGCFFFFFFGFFFG0,C?GFGDFAF<?-962.)(4>?AFBF>BDF<A>7* +CTCGAGCTATACCACCTTAGACGT ba 1 @M02286:46:000000000-AEG11:1:1118:6636:16568 1:N:0:1 AAAAGTGCTCATCATTGGAAAACGTTCTTCGGGGCGAAAACTCTCAAGGATCTTACCGCTGTTGAGATCCAGTTCGATGTAACCCACTCGTGCACCCAACTGATCTTCAGCATCTTTTACTTTCACCAGCGTTTCTGGGTGAGCAAAAACAGGAAGGCAAAATGCCGCAAAAAAGGGAATAAGGGCGACACGGAAATGTTGAATACTCATACTCTTCCTTTTTCAATATTATTGAAGCATTTATTAGGGTTATTGTCTCATGAGCGGTTACATATTTGAATGTT- GGGGGGGGGGGGGGGGGFFGGFGGGGGGGGCED@FGGGGGGGGGGGGG@FF@FGGGGGGGGGGGGGGGDFEGGFGGDEFGGGGGGFCEFFGGGGGGGGGGGGFCFCCE?FGGCGGAFGGGGFFGGGGGGGGFCFFFGGGGEFGGGGFG@FGGGGGGDCFFFGGGFGECC*3:3>EC7D:EFED?8CCEGDGGGGGDGF2:CFG9E7FC3:=CGFCCFC9AFGGGGCFGGFGFF9=>?E6C7@7/*9DF>FGG<?FF96*-68)25)4/4>)-6=<BFF?<BAF) +CTCGAGCTATACCACCTTAGACGT ba 1 @M02286:46:000000000-AEG11:1:2108:5062:14791 1:N:0:1 AAAAGTGCTCATCATTGGAAAACGTTCTTCGGGGCGAAAACTCTCAAGGATCTTACCGCTGTTGAGATCCAGTTCGATGTAACCCACTCGTGCACCCAACTGATCTTCAGCATC-TTTACTTTCACCAGCGTTTCTGGGTGAGCAAAAACAGGAAGGCAAAATGCCGCAAAAAAGGGAATAAGGGCGACACGGCAATGTTGAATACTCATACTCTTCCTTTTTCAATATTATTGAAGCATTTATCAGGGTTATTGTCTACTGAGCGGCTCCATATTTGAATGTAT GGGGGFFGDECFGGGGGGGFGGGGGGGGGGGGG>FGGGGGGGGFFGGGGCFFFGGGGGGGGGGGGFGGGGGGGFGGGGGGGGGECFCGGGCFG7FGGGGGGGGGGGFFFGGGGG FGGGAEFFFFFGGCBBFGGFGGGDDCGF8EG8FFFGGGFD<+<3DF,FFGGGGGGGDGEECCFFGGGFFC7:@CECC8*8CCFGGGGGGGGCFFGGGFGFGCFGFFFGFGFEGGGGGFGGGCGGGGFFF+*9<<FFGFF=2)10*1)9D()/(0).,*.849**6@AG<4 
+CTCGAGCTATACCACCTTAGACGT ba 1 @M02286:46:000000000-AEG11:1:2116:21186:17735 1:N:0:1 AAAAGTGCTCATCATTGGAAAACGTTCTTCGGGGCGAAAACTCTCAAGGATCTTACCGCTGTTGAGATCCAGTTCGATGTAACCCACTCGTGCACCCAACTGATCTTCAGCATCTTTTACTTTCACCAGCGTTTCTGGGTGAGCAAAAACAGGAAGGCAAAATGCCGCAAAAAAGGGAATAAGGGCGACACGGAAATGTTGAATACTCATACTCTTCCTTTTTCAATATTATTGAAGCATTTATCAGGGTTATTGTCTCATGATCGGCTACATATTTGAATGTT- GGGGGGGGGGGGGGGGGGGFGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFGGG7@F8CEEFGGGGGEFFDGGGG5CGGGEFGGGG9@979EFCFECFGGFGFGFGGGFGGC7CFFGGFGFFFFGGGFFGGFFFFFGFF9<*:B9>>2*:7))1)4)=?AB?*65*.6) |
b |
diff -r 13bcc2f459b0 -r af383638de66 misc/bug1/tmp.sscs.after.fa --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/misc/bug1/tmp.sscs.after.fa Mon Nov 23 18:44:23 2015 -0500 |
b |
@@ -0,0 +1,2 @@ +>CTCGAGCTATACCACCTTAGACGT.ba.1 4 +AAAAGTGCTCATCATTGGAAAACGTTCTTCGGGGCGAAAACTCTCAAGGATCTTACCGCTGTTGAGATCCAGTTCGATGTAACCCACTCGTGCACCCAACTGATCTTCAGCATCTTTTACTTTCACCAGCGTTTCTGGGTGAGCAAAAACAGGAAGGCAAAATGCCGCAAAAAAGGGAATAAGGGCGACACGGAAATGTTGAATACTCATACTCTTCCTTTTTCAATATTATTGAAGCATTTATCAGGGTTATTGTCTCATGNGCNNNNACATATTTGAATGTAN |
b |
diff -r 13bcc2f459b0 -r af383638de66 misc/bug1/tmp.sscs.before.fa --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/misc/bug1/tmp.sscs.before.fa Mon Nov 23 18:44:23 2015 -0500 |
b |
@@ -0,0 +1,2 @@ +>CTCGAGCTATACCACCTTAGACGT.ba.1 4 +AAAAGTGCTCATCATTGGAAAACGTTCTTCGGGGCGAAAACTCTCAAGGATCTTACCGCTGTTGAGATCCAGTTCGATGTAACCCACTCGTGCACCCAACTGATCTTCAGCATCTTTTACTTTCACCAGCGTTTCTGGGTGAGCAAAAACAGGAAGGCAAAATGCCGCAAAAAAGGGAATAAGGGCGACACGGAAATGTTGAATACTCATACTCTTCCTTTTTCAATATTATTGAAGCATTTATCAGGGTTATTGTCTCATGNGCNNNNACATATTTGAATGTA |
b |
diff -r 13bcc2f459b0 -r af383638de66 misc/family.align.fa --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/misc/family.align.fa Mon Nov 23 18:44:23 2015 -0500 |
b |
@@ -0,0 +1,20 @@ +>1 +cagcaccccctctaccccctctaccccctctagag +>2 +---------cagcaccccctctaccccctctagag +>3 +-agcaccccctctaccccctctaccccccctagag +>4 +-agcaccccctctaccccctctaccccctctagcg +>5 +-aacacacctttcac--------ccctctccagag +>6 +-agcaccccctctaccccctctaccccctctagaa +>7 +-agcaccccctctaccccctctaccccctctaaag +>8 +-agcaccccctctaccccctctaccccctctcgag +>9 +-agccccccctctaccccctctaccccctctagag +>10 +-agcaccccctctaccccctctaccccctctacag |
b |
diff -r 13bcc2f459b0 -r af383638de66 misc/family.cons.fa --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/misc/family.cons.fa Mon Nov 23 18:44:23 2015 -0500 |
b |
@@ -0,0 +1,2 @@ +>EMBOSS_001 +nAGCACCCCCTCTACCCCCTCTACCCCCTCTAGAG |
b |
diff -r 13bcc2f459b0 -r af383638de66 misc/family.fa --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/misc/family.fa Mon Nov 23 18:44:23 2015 -0500 |
b |
@@ -0,0 +1,20 @@ +>1 +CAGCACCCCCTCTACCCCCTCTACCCCCTCTAGAG +>2 +CAGCACCCCCTCTACCCCCTCTAGAG +>3 +AGCACCCCCTCTACCCCCTCTACCCCCCCTAGAG +>4 +AGCACCCCCTCTACCCCCTCTACCCCCTCTAGCG +>5 +AACACACCTTTCACCCCTCTCCAGAG +>6 +AGCACCCCCTCTACCCCCTCTACCCCCTCTAGAA +>7 +AGCACCCCCTCTACCCCCTCTACCCCCTCTAAAG +>8 +AGCACCCCCTCTACCCCCTCTACCCCCTCTCGAG +>9 +AGCCCCCCCTCTACCCCCTCTACCCCCTCTAGAG +>10 +AGCACCCCCTCTACCCCCTCTACCCCCTCTACAG |
b |
diff -r 13bcc2f459b0 -r af383638de66 misc/family.msa.tsv --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/misc/family.msa.tsv Mon Nov 23 18:44:23 2015 -0500 |
b |
@@ -0,0 +1,11 @@ +ACCGGACAACGA CONSENSUS nAGCACCCCCTCTACCCCCTCTACCCCCTCTAGAG +ACCGGACAACGA 1 cagcaccccctctaccccctctaccccctctagag +ACCGGACAACGA 2 ---------cagcaccccctctaccccctctagag +ACCGGACAACGA 3 -agcaccccctctaccccctctaccccccctagag +ACCGGACAACGA 4 -agcaccccctctaccccctctaccccctctagcg +ACCGGACAACGA 5 -aacacacctttcac--------ccctctccagag +ACCGGACAACGA 6 -agcaccccctctaccccctctaccccctctagaa +ACCGGACAACGA 7 -agcaccccctctaccccctctaccccctctaaag +ACCGGACAACGA 8 -agcaccccctctaccccctctaccccctctcgag +ACCGGACAACGA 9 -agccccccctctaccccctctaccccctctagag +ACCGGACAACGA 10 -agcaccccctctaccccctctaccccctctacag |
b |
diff -r 13bcc2f459b0 -r af383638de66 misc/family2.align.fa --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/misc/family2.align.fa Mon Nov 23 18:44:23 2015 -0500 |
b |
@@ -0,0 +1,8 @@ +>7 +ttagcctagccacacccccacgggaaacagcagtgattaacc +>8 +---------ccacacccc-----gaaacagcagtgatt---- +>9 +------tagccacacccccacgggaaac-------------- +>10 +ttagcctagccacacccccacgg------------------- |
b |
diff -r 13bcc2f459b0 -r af383638de66 misc/family2.cons.fa --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/misc/family2.cons.fa Mon Nov 23 18:44:23 2015 -0500 |
b |
@@ -0,0 +1,2 @@ +>EMBOSS_001 +ttagccTAGCCACACCCCCACGGGAAACagcagtgattnnnn |
b |
diff -r 13bcc2f459b0 -r af383638de66 misc/family2.fa --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/misc/family2.fa Mon Nov 23 18:44:23 2015 -0500 |
b |
@@ -0,0 +1,8 @@ +>1 +TTAGCCTAGCCACACCCCCACGGGAAACAGCAGTGATTAACC +>2 +CCACACCCCGAAACAGCAGTGATT +>3 +TAGCCACACCCCCACGGGAAAC +>4 +TTAGCCTAGCCACACCCCCACGG |
b |
diff -r 13bcc2f459b0 -r af383638de66 misc/family3.align.fa --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/misc/family3.align.fa Mon Nov 23 18:44:23 2015 -0500 |
b |
@@ -0,0 +1,10 @@ +>1 +TTAGCCT-GCCACACCCC-ACGG-AA-CAGCAGTGACTGATA +>2 +TTAGCCTAGCCACACCCCCACGGGAAACAGCAGT----GATT +>3 +TTAGCCTAGCCACACCCCCACGGGAAACAGCAGTGATT---- +>4 +TTA---TAGCCACACCCCCACGGGAAACA-CAGTGATT---- +>5 +TTA---TAGCCACACCCCCACGGGAAACA-CAGTGACTGATA \ No newline at end of file |
b |
diff -r 13bcc2f459b0 -r af383638de66 misc/family3.cons.fa --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/misc/family3.cons.fa Mon Nov 23 18:44:23 2015 -0500 |
b |
@@ -0,0 +1,2 @@ +>EMBOSS_001 +TTAGCCTAGCCACACCCCCACGGGAAACAGCAGTGAnTGATn |
b |
diff -r 13bcc2f459b0 -r af383638de66 misc/msa_sscs_matcher.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/misc/msa_sscs_matcher.py Mon Nov 23 18:44:23 2015 -0500 |
[ |
@@ -0,0 +1,37 @@ +#!/usr/bin/env python +from __future__ import division +import sys +import argparse + +OPT_DEFAULTS = {} +USAGE = "gunzip -c families.msa.tsv.gz | %(prog)s sscs.set.fa" +DESCRIPTION = """Find the input MSA's which produced a given set of SSCS's. Pipe the full set of +MSA's to stdin and it will filter them to the matching MSA's on stdout.""" + + +def main(argv): + + parser = argparse.ArgumentParser(usage=USAGE, description=DESCRIPTION) + parser.set_defaults(**OPT_DEFAULTS) + + parser.add_argument('sscs', metavar='sscs.set.fa', + help='A set of SSCS\'s, as output from the duplex.py script with the --sscs-file option.') + + args = parser.parse_args(argv[1:]) + + sscs = set() + with open(args.sscs) as sscs_file: + for line in sscs_file: + if line.startswith('>'): + name = line.lstrip('>').split()[0] + sscs.add(name) + + for line in sys.stdin: + barcode, order, mate, rname, seq, qual = line.rstrip('\r\n').split('\t') + name = '.'.join((barcode, order, mate)) + if name in sscs: + sys.stdout.write(line) + + +if __name__ == '__main__': + sys.exit(main(sys.argv)) |
b |
diff -r 13bcc2f459b0 -r af383638de66 misc/read.fa --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/misc/read.fa Mon Nov 23 18:44:23 2015 -0500 |
b |
@@ -0,0 +1,3 @@ +>AAAAACAAGGATCTAAAACCAGAT.1 +TGACTGGACGTCTAAACCAAACCACTTTCACCGCTACACGACCGGGGGTATACTACGGTCAATGCTCTGAAATCTGTGGAGCAAACCACAGTTTCATGCCCATCGTCCTAGAATTAATTCCCCTAAAAATCTTTGAAATAGGGCCCGTATTTACCCTATAGCACCCCCTCTACCCCCTCTACCCCCTCTAGAGCCCACTGTAAAGCTAACTTAGCATTACCCTTTTAAGTTAAAGATTAAGAGAACCAACACCTCTTTACAGTGAAATGCCCCAACTAAATACTACCGT + |
b |
diff -r 13bcc2f459b0 -r af383638de66 misc/sscs_diff.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/misc/sscs_diff.py Mon Nov 23 18:44:23 2015 -0500 |
[ |
@@ -0,0 +1,74 @@ +#!/usr/bin/env python +from __future__ import division +import sys +import argparse +import subprocess + +OPT_DEFAULTS = {} +USAGE = "%(prog)s [options]" +DESCRIPTION = """Find differences between the SSCS produced by one version of the pipeline and +another, when working on the same input MSA's.""" +EPILOG = """Warning: This injects raw command-line arguments into shell commands and executes them. +""" + + +def main(argv): + + parser = argparse.ArgumentParser(description=DESCRIPTION, epilog=EPILOG) + parser.set_defaults(**OPT_DEFAULTS) + + parser.add_argument('sscs_before', metavar='sscs.all.before.fa', + help='SSCSs from earlier version (can be gzipped).') + parser.add_argument('sscs_after', metavar='sscs.all.after.fa', + help='SSCSs from later version (can be gzipped).') + parser.add_argument('-b', '--before', metavar='sscs.all.before.diffs.fa', required=True, + help='Output SSCSs from earlier version that differ from the SSCS in the later version here.') + parser.add_argument('-a', '--after', metavar='sscs.all.after.diffs.fa', required=True, + help='Output SSCSs from later version that differ from the SSCS in the earlier version here.') + + args = parser.parse_args(argv[1:]) + + sscs_before = {} + if args.sscs_before.endswith('.gz'): + command = 'gunzip -c {} | paste - - | sort'.format(args.sscs_before) + else: + command = 'cat {} | paste - - | sort'.format(args.sscs_before) + process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE) + for line in process.stdout: + fields = line.rstrip('\r\n').split('\t') + name = fields[0].lstrip('>').split()[0] + seq = fields[1] + sscs_before[name] = seq + + before_fh = open(args.before, 'w') + after_fh = open(args.after, 'w') + diffs = {} + if args.sscs_after.endswith('.gz'): + command = 'gunzip -c {} | paste - - | sort'.format(args.sscs_after) + else: + command = 'cat {} | paste - - | sort'.format(args.sscs_after) + process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE) + for 
line in process.stdout: + fields = line.rstrip('\r\n').split('\t') + header = fields[0].lstrip('>') + name, fam_size = header.split() + seq_after = fields[1] + if name in sscs_before: + seq_before = sscs_before[name] + if seq_before != seq_after: + diffs[name] = (seq_before, seq_after) + before_fh.write('>{} {}\n'.format(name, fam_size)) + before_fh.write(seq_before+'\n') + after_fh.write('>{} {}\n'.format(name, fam_size)) + after_fh.write(seq_after+'\n') + before_fh.close() + after_fh.close() + + +def fail(message): + sys.stderr.write(message+"\n") + sys.exit(1) + + +if __name__ == '__main__': + sys.exit(main(sys.argv)) |
b |
diff -r 13bcc2f459b0 -r af383638de66 pipeline.sh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pipeline.sh Mon Nov 23 18:44:23 2015 -0500 |
[ |
#!/usr/bin/env bash
# Run the duplex pipeline: paired fastq files in, single-strand consensus
# sequences (SSCS) out.
if [ -z "${BASH:-}" ] || [ -z "${BASH_VERSINFO:-}" ] || [ "$BASH_VERSINFO" -lt 4 ]; then
  echo "Error: Must use bash version 4+." >&2
  exit 1
fi
# pipefail: without it, a failure in any stage before duplex.py would be hidden
# by the exit status of the last command in the pipeline.
set -ue
set -o pipefail

# At the moment this isn't really a production version of the pipeline.
# It mainly just documents how the commands are used.
# Arguments: <reads_1.fastq> <reads_2.fastq> <output sscs file>
function main {
  if [ "$#" -ne 3 ]; then
    fail "Usage: \$ $(basename "$0") reads_1.fq reads_2.fq sscs.out.fa"
  fi
  fastq1="$1"
  fastq2="$2"
  sscs="$3"
  # This transforms the input fastq's into a format that can be sorted by family with the "sort"
  # command. Mainly, it puts all the data for both read pairs on one line, and adds a column with
  # the barcode.
  # Warning: It assumes the fastq's have 4 lines per read!
  # NOTE(review): make-barcodes.awk is resolved relative to the current directory, and
  # align_families.py/duplex.py via $PATH -- confirm this matches how the script is invoked.
  paste "$fastq1" "$fastq2" \
    | paste - - - - \
    | awk -f make-barcodes.awk \
    | sort \
    | align_families.py \
    | duplex.py \
    > "$sscs"
}

# Print all arguments to stderr and exit with status 1.
function fail {
  echo "$@" >&2
  exit 1
}

main "$@"
b |
diff -r 13bcc2f459b0 -r af383638de66 planemo-template/cat.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/planemo-template/cat.xml Mon Nov 23 18:44:23 2015 -0500 |
b |
@@ -0,0 +1,23 @@ +<tool id="cat" name="Concatenate datasets (for test workflows)" version="1.0"> + <description>tail-to-head</description> + <command> + cat $input1 #for $q in $queries# ${q.input2} #end for# > $out_file1 + </command> + <inputs> + <param name="input1" type="data" label="Concatenate Dataset"/> + <repeat name="queries" title="Dataset"> + <param name="input2" type="data" label="Select" /> + </repeat> + </inputs> + <outputs> + <data name="out_file1" format="input" metadata_source="input1"/> + </outputs> + <tests> + <test> + <param name="input1" value="1.bed"/> + <output name="out_file1" file="1.bed"/> + </test> + </tests> + <help> + </help> +</tool> |
b |
diff -r 13bcc2f459b0 -r af383638de66 planemo-template/random_lines_two_pass.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/planemo-template/random_lines_two_pass.py Mon Nov 23 18:44:23 2015 -0500 |
[ |
#!/usr/bin/env python
#Dan Blankenberg
#Selects N random lines from a file and outputs to another file, maintaining original line order
#allows specifying a seed
#does two passes to determine line offsets/count, and then to output contents

import optparse
import random


def get_random_by_subtraction( line_offsets, num_lines ):
    """Delete random entries from line_offsets (in place) until num_lines remain.

    Returns the same list object; the surviving offsets keep their order.
    """
    while len( line_offsets ) > num_lines:
        del line_offsets[ random.randint( 0, len( line_offsets ) - 1 ) ]
    return line_offsets


def get_random_by_sample( line_offsets, num_lines ):
    """Return a new, sorted list of num_lines offsets sampled without replacement."""
    line_offsets = random.sample( line_offsets, num_lines )
    line_offsets.sort()
    return line_offsets


def get_random( line_offsets, num_lines ):
    """Pick num_lines of the offsets at random, preserving their order.

    Deletion is used when more than half of the offsets are kept, sampling
    otherwise.
    """
    #same test as "num_lines > len / 2", written without division so it does
    #not depend on Python 2 vs. 3 division semantics
    if num_lines * 2 > len( line_offsets ):
        return get_random_by_subtraction( line_offsets, num_lines )
    else:
        return get_random_by_sample( line_offsets, num_lines )


def __main__():
    """Command line entry point: INPUT OUTPUT NUM_LINES [-s SEED]."""
    #Parse Command Line
    parser = optparse.OptionParser()
    parser.add_option( '-s', '--seed', dest='seed', action='store', type="string", default=None, help='Set the random seed.' )
    (options, args) = parser.parse_args()

    assert len( args ) == 3, "Invalid command line specified."

    #both files are closed even if one of the checks below fails
    with open( args[0], 'rb' ) as in_file, open( args[1], 'wb' ) as out_file:
        num_lines = int( args[2] )
        assert num_lines > 0, "You must select at least one line."

        if options.seed is not None:
            random.seed( options.seed )

        #get line offsets
        line_offsets = []
        teller = in_file.tell
        readliner = in_file.readline
        appender = line_offsets.append
        while True:
            offset = teller()
            if readliner():
                appender( offset )
            else:
                break

        total_lines = len( line_offsets )
        assert num_lines <= total_lines, "Error: asked to select more lines (%i) than there were in the file (%i)." % ( num_lines, total_lines )

        #get random line offsets
        line_offsets = get_random( line_offsets, num_lines )

        #write out random lines
        seeker = in_file.seek
        writer = out_file.write
        for line_offset in line_offsets:
            seeker( line_offset )
            writer( readliner() )

    #parenthesized single-argument form works as a statement (py2) and a call (py3)
    print( "Kept %i of %i total lines." % ( num_lines, total_lines ) )
    if options.seed is not None:
        print( 'Used random seed of "%s".' % options.seed )


if __name__=="__main__": __main__()
b |
diff -r 13bcc2f459b0 -r af383638de66 planemo-template/randomlines.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/planemo-template/randomlines.xml Mon Nov 23 18:44:23 2015 -0500 |
b |
@@ -0,0 +1,66 @@ +<tool id="random_lines1" name="Select random lines" version="2.0.1"> + <description>from a file</description> + <command interpreter="python">random_lines_two_pass.py "${input}" "${out_file1}" "${num_lines}" + #if str( $seed_source.seed_source_selector ) == "set_seed": + --seed "${seed_source.seed}" + #end if + </command> + <inputs> + <param name="num_lines" size="5" type="integer" value="1" label="Randomly select" help="lines"/> + <param format="txt" name="input" type="data" label="from"/> + <conditional name="seed_source"> + <param name="seed_source_selector" type="select" label="Set a random seed"> + <option value="no_seed" selected="True">Don't set seed</option> + <option value="set_seed">Set seed</option> + </param> + <when value="no_seed"> + <!-- Do nothing here --> + </when> + <when value="set_seed"> + <param name="seed" type="text" label="Random seed" /> + </when> + </conditional> + </inputs> + <outputs> + <data format="input" name="out_file1" metadata_source="input"/> + </outputs> + <tests> + <test> + <param name="num_lines" value="65"/> + <param name="input" value="1.bed"/> + <param name="seed_source_selector" value="no_seed"/> + <output name="out_file1" file="1.bed"/> + </test> + <test> + <param name="num_lines" value="1"/> + <param name="input" value="1.bed"/> + <param name="seed_source_selector" value="set_seed"/> + <param name="seed" value="asdf"/> + <output name="out_file1" file="1_bed_random_lines_1_seed_asdf_out.bed"/> + </test> + </tests> + <help> + +**What it does** + +This tool selects N random lines from a file, with no repeats, and preserving ordering. 
+ +----- + +**Example** + +Input File:: + + chr7 56632 56652 D17003_CTCF_R6 310 + + chr7 56736 56756 D17003_CTCF_R7 354 + + chr7 56761 56781 D17003_CTCF_R4 220 + + chr7 56772 56792 D17003_CTCF_R7 372 + + chr7 56775 56795 D17003_CTCF_R4 207 + + +Selecting 2 random lines might return this:: + + chr7 56736 56756 D17003_CTCF_R7 354 + + chr7 56775 56795 D17003_CTCF_R4 207 + + + </help> +</tool> |
b |
diff -r 13bcc2f459b0 -r af383638de66 planemo-template/test-data/1.bed --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/planemo-template/test-data/1.bed Mon Nov 23 18:44:23 2015 -0500 |
b |
@@ -0,0 +1,65 @@ +chr1 147962192 147962580 CCDS989.1_cds_0_0_chr1_147962193_r 0 - +chr1 147984545 147984630 CCDS990.1_cds_0_0_chr1_147984546_f 0 + +chr1 148078400 148078582 CCDS993.1_cds_0_0_chr1_148078401_r 0 - +chr1 148185136 148185276 CCDS996.1_cds_0_0_chr1_148185137_f 0 + +chr10 55251623 55253124 CCDS7248.1_cds_0_0_chr10_55251624_r 0 - +chr11 116124407 116124501 CCDS8374.1_cds_0_0_chr11_116124408_r 0 - +chr11 116206508 116206563 CCDS8377.1_cds_0_0_chr11_116206509_f 0 + +chr11 116211733 116212337 CCDS8378.1_cds_0_0_chr11_116211734_r 0 - +chr11 1812377 1812407 CCDS7726.1_cds_0_0_chr11_1812378_f 0 + +chr12 38440094 38440321 CCDS8736.1_cds_0_0_chr12_38440095_r 0 - +chr13 112381694 112381953 CCDS9526.1_cds_0_0_chr13_112381695_f 0 + +chr14 98710240 98712285 CCDS9949.1_cds_0_0_chr14_98710241_r 0 - +chr15 41486872 41487060 CCDS10096.1_cds_0_0_chr15_41486873_r 0 - +chr15 41673708 41673857 CCDS10097.1_cds_0_0_chr15_41673709_f 0 + +chr15 41679161 41679250 CCDS10098.1_cds_0_0_chr15_41679162_r 0 - +chr15 41826029 41826196 CCDS10101.1_cds_0_0_chr15_41826030_f 0 + +chr16 142908 143003 CCDS10397.1_cds_0_0_chr16_142909_f 0 + +chr16 179963 180135 CCDS10401.1_cds_0_0_chr16_179964_r 0 - +chr16 244413 244681 CCDS10402.1_cds_0_0_chr16_244414_f 0 + +chr16 259268 259383 CCDS10403.1_cds_0_0_chr16_259269_r 0 - +chr18 23786114 23786321 CCDS11891.1_cds_0_0_chr18_23786115_r 0 - +chr18 59406881 59407046 CCDS11985.1_cds_0_0_chr18_59406882_f 0 + +chr18 59455932 59456337 CCDS11986.1_cds_0_0_chr18_59455933_r 0 - +chr18 59600586 59600754 CCDS11988.1_cds_0_0_chr18_59600587_f 0 + +chr19 59068595 59069564 CCDS12866.1_cds_0_0_chr19_59068596_f 0 + +chr19 59236026 59236146 CCDS12872.1_cds_0_0_chr19_59236027_r 0 - +chr19 59297998 59298008 CCDS12877.1_cds_0_0_chr19_59297999_f 0 + +chr19 59302168 59302288 CCDS12878.1_cds_0_0_chr19_59302169_r 0 - +chr2 118288583 118288668 CCDS2120.1_cds_0_0_chr2_118288584_f 0 + +chr2 118394148 118394202 CCDS2121.1_cds_0_0_chr2_118394149_r 0 - +chr2 220190202 220190242 
CCDS2441.1_cds_0_0_chr2_220190203_f 0 + +chr2 220229609 220230869 CCDS2443.1_cds_0_0_chr2_220229610_r 0 - +chr20 33330413 33330423 CCDS13249.1_cds_0_0_chr20_33330414_r 0 - +chr20 33513606 33513792 CCDS13255.1_cds_0_0_chr20_33513607_f 0 + +chr20 33579500 33579527 CCDS13256.1_cds_0_0_chr20_33579501_r 0 - +chr20 33593260 33593348 CCDS13257.1_cds_0_0_chr20_33593261_f 0 + +chr21 32707032 32707192 CCDS13614.1_cds_0_0_chr21_32707033_f 0 + +chr21 32869641 32870022 CCDS13615.1_cds_0_0_chr21_32869642_r 0 - +chr21 33321040 33322012 CCDS13620.1_cds_0_0_chr21_33321041_f 0 + +chr21 33744994 33745040 CCDS13625.1_cds_0_0_chr21_33744995_r 0 - +chr22 30120223 30120265 CCDS13897.1_cds_0_0_chr22_30120224_f 0 + +chr22 30160419 30160661 CCDS13898.1_cds_0_0_chr22_30160420_r 0 - +chr22 30665273 30665360 CCDS13901.1_cds_0_0_chr22_30665274_f 0 + +chr22 30939054 30939266 CCDS13903.1_cds_0_0_chr22_30939055_r 0 - +chr5 131424298 131424460 CCDS4149.1_cds_0_0_chr5_131424299_f 0 + +chr5 131556601 131556672 CCDS4151.1_cds_0_0_chr5_131556602_r 0 - +chr5 131621326 131621419 CCDS4152.1_cds_0_0_chr5_131621327_f 0 + +chr5 131847541 131847666 CCDS4155.1_cds_0_0_chr5_131847542_r 0 - +chr6 108299600 108299744 CCDS5061.1_cds_0_0_chr6_108299601_r 0 - +chr6 108594662 108594687 CCDS5063.1_cds_0_0_chr6_108594663_f 0 + +chr6 108640045 108640151 CCDS5064.1_cds_0_0_chr6_108640046_r 0 - +chr6 108722976 108723115 CCDS5067.1_cds_0_0_chr6_108722977_f 0 + +chr7 113660517 113660685 CCDS5760.1_cds_0_0_chr7_113660518_f 0 + +chr7 116512159 116512389 CCDS5771.1_cds_0_0_chr7_116512160_r 0 - +chr7 116714099 116714152 CCDS5773.1_cds_0_0_chr7_116714100_f 0 + +chr7 116945541 116945787 CCDS5774.1_cds_0_0_chr7_116945542_r 0 - +chr8 118881131 118881317 CCDS6324.1_cds_0_0_chr8_118881132_r 0 - +chr9 128764156 128764189 CCDS6914.1_cds_0_0_chr9_128764157_f 0 + +chr9 128787519 128789136 CCDS6915.1_cds_0_0_chr9_128787520_r 0 - +chr9 128882427 128882523 CCDS6917.1_cds_0_0_chr9_128882428_f 0 + +chr9 128937229 128937445 
CCDS6919.1_cds_0_0_chr9_128937230_r 0 - +chrX 122745047 122745924 CCDS14606.1_cds_0_0_chrX_122745048_f 0 + +chrX 152648964 152649196 CCDS14733.1_cds_0_0_chrX_152648965_r 0 - +chrX 152691446 152691471 CCDS14735.1_cds_0_0_chrX_152691447_f 0 + +chrX 152694029 152694263 CCDS14736.1_cds_0_0_chrX_152694030_r 0 - |
b |
diff -r 13bcc2f459b0 -r af383638de66 planemo-template/test-data/1_bed_random_lines_1_seed_asdf_out.bed --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/planemo-template/test-data/1_bed_random_lines_1_seed_asdf_out.bed Mon Nov 23 18:44:23 2015 -0500 |
b |
@@ -0,0 +1,1 @@ +chr5 131424298 131424460 CCDS4149.1_cds_0_0_chr5_131424299_f 0 + |
b |
diff -r 13bcc2f459b0 -r af383638de66 seqtools.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/seqtools.py Mon Nov 23 18:44:23 2015 -0500 |
[ |
import os
import ctypes

# Thin ctypes bindings for the C helpers compiled into seqtoolsc.so, which is
# expected to sit next to this module.
script_dir = os.path.dirname(os.path.realpath(__file__))
seqtools = ctypes.cdll.LoadLibrary(os.path.join(script_dir, 'seqtoolsc.so'))
seqtools.get_revcomp.restype = ctypes.c_char_p
seqtools.transfer_gaps.restype = ctypes.c_char_p


def _writable_copy(seq):
    """Return a NUL-terminated ctypes buffer holding a private copy of seq.

    A Python string must never be modified in place, and the C routines are
    free to normalise the buffer they are handed, so they get a copy instead
    of a pointer into the interpreter's own string storage.
    """
    return ctypes.create_string_buffer(seq)


def get_revcomp(seq):
    """Return the reverse complement of seq, as computed by the C library."""
    return seqtools.get_revcomp(seq)


def get_diffs_frac_simple(consensus, family):
    """Return, per family member, the fraction of positions differing from consensus.

    The result is a ctypes array of len(family) doubles.
    """
    c_consensus = _writable_copy(consensus)
    c_family = (ctypes.c_char_p * len(family))()
    for i, seq in enumerate(family):
        c_family[i] = ctypes.c_char_p(seq)
    # The return type depends on the family size, so it is set per call.
    seqtools.get_diffs_frac_simple.restype = ctypes.POINTER(ctypes.c_double * len(c_family))
    diffs = seqtools.get_diffs_frac_simple(c_consensus, c_family, len(c_family))
    return diffs.contents


def get_diffs_frac_binned(consensus, family, bins):
    """Return per-sequence, per-bin difference fractions against consensus.

    Returns a list with one ctypes array of `bins` doubles per family member,
    or None when the family members do not all have the same length.
    Raises ValueError when bins is not a positive number.
    """
    if bins < 1:
        # The C side divides by the bin count and sizes arrays with it.
        raise ValueError('bins must be at least 1, got %r' % (bins,))
    seq_len = None
    c_consensus = _writable_copy(consensus)
    c_family = (ctypes.c_char_p * len(family))()
    for i, seq in enumerate(family):
        if seq_len:
            if seq_len != len(seq):
                return None
        else:
            seq_len = len(seq)
        c_family[i] = ctypes.c_char_p(seq)
    double_array_pointer = ctypes.POINTER(ctypes.c_double * bins)
    seqtools.get_diffs_frac_binned.restype = ctypes.POINTER(double_array_pointer * len(c_family))
    diffs_binned_c = seqtools.get_diffs_frac_binned(c_consensus, c_family, len(c_family), seq_len, bins)
    diffs_binned = []
    for diffs_c in diffs_binned_c.contents:
        diffs_binned.append(diffs_c.contents)
    return diffs_binned


def transfer_gaps(aligned, seq, gap_char_in='-', gap_char_out='-'):
    """Return seq with gaps inserted wherever `aligned` has gap_char_in.

    gap_char_out is the character written for each inserted gap.
    """
    gap_char_in_c = ctypes.c_char(gap_char_in)
    gap_char_out_c = ctypes.c_char(gap_char_out)
    return seqtools.transfer_gaps(aligned, seq, gap_char_in_c, gap_char_out_c)


def transfer_gaps_multi(seqs, aligned, gap_char_in='-', gap_char_out='-'):
    """Apply transfer_gaps() pairwise to seqs[i] / aligned[i]; return a list of strings.

    Note the argument order: ungapped sequences first, gapped templates second.
    """
    gap_char_in_c = ctypes.c_char(gap_char_in)
    gap_char_out_c = ctypes.c_char(gap_char_out)
    n_seqs = len(seqs)
    assert n_seqs == len(aligned), 'Error: Unequal number of gapped and ungapped sequences.'
    seqs_c = (ctypes.c_char_p * n_seqs)()
    for i, seq in enumerate(seqs):
        seqs_c[i] = ctypes.c_char_p(seq)
    aligned_c = (ctypes.c_char_p * n_seqs)()
    for i, seq in enumerate(aligned):
        aligned_c[i] = ctypes.c_char_p(seq)
    seqtools.transfer_gaps_multi.restype = ctypes.POINTER(ctypes.c_char_p * n_seqs)
    # The C function takes the gapped templates before the ungapped inputs.
    output_c = seqtools.transfer_gaps_multi(n_seqs, aligned_c, seqs_c, gap_char_in_c, gap_char_out_c)
    output = []
    for seq in output_c.contents:
        output.append(seq)
    return output
b |
diff -r 13bcc2f459b0 -r af383638de66 seqtools.pyc |
b |
Binary file seqtools.pyc has changed |
b |
diff -r 13bcc2f459b0 -r af383638de66 seqtoolsc.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/seqtoolsc.c Mon Nov 23 18:44:23 2015 -0500 |
[ |
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <math.h>

// Complement lookup table, indexed by (character - TRANS_OFFSET).
// S (G or C) and W (A or T) are their own complements under the IUPAC codes.
//             ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz
#define TRANS "TVGHEFCDIJMLKNOPQYSAABWXRZ[\\]^_`tvghefcdijmlknopqysaabwxrz"
#define TRANS_OFFSET 65
// Index of the LAST entry of TRANS (the table holds TRANS_LEN + 1 characters).
#define TRANS_LEN 57

char* get_revcomp(char *input);
char get_char_comp(char c);
int *get_diffs_simple(char *cons, char *seqs[], int n_seqs);
double *get_diffs_frac_simple(char *cons, char *seqs[], int n_seqs);
int **get_diffs_binned(char *cons, char *seqs[], int n_seqs, int seq_len, int bins);
double **get_diffs_frac_binned(char *cons, char *seqs[], int n_seqs, int seq_len, int bins);
char *transfer_gaps(char *gapped_seq, char *inseq, char gap_char1, char gap_char2);
char **transfer_gaps_multi(int n_seqs, char *gapped_seqs[], char *inseqs[], char gap_char1, char gap_char2);


// Allocate room for "count" items of "size" bytes, always requesting at least one item so that a
// NULL result means failure rather than an empty request.
static void *alloc_array(int count, size_t size) {
  size_t n = count > 0 ? (size_t)count : 1;
  return malloc(n * size);
}


// Free an array of "n_rows" int rows, then the array itself. Rows may be NULL.
static void free_int_rows(int **rows, int n_rows) {
  int i;
  if (rows == NULL) {
    return;
  }
  for (i = 0; i < n_rows; i++) {
    free(rows[i]);
  }
  free(rows);
}


// Free an array of "n_rows" double rows, then the array itself. Rows may be NULL.
static void free_double_rows(double **rows, int n_rows) {
  int i;
  if (rows == NULL) {
    return;
  }
  for (i = 0; i < n_rows; i++) {
    free(rows[i]);
  }
  free(rows);
}


// Bin width: seq_len/bins rounded to nearest (halves round up), never less than 1 so it can be
// used as a divisor. Requires bins > 0.
static int rounded_bin_size(int seq_len, int bins) {
  long long size = (2LL * seq_len + bins) / (2LL * bins);
  return size < 1 ? 1 : (int)size;
}


// Compare one alignment column. Returns 1 if the column starts a new difference, else 0.
// "in_gap" carries the state between columns: once a gap has been seen (in either sequence), further
// mismatches are not counted until a column without a gap is reached.
static int column_opens_diff(char cons_c, char seq_c, int *in_gap) {
  int has_gap = (cons_c == '-' || seq_c == '-');
  int differs = toupper((unsigned char)seq_c) != toupper((unsigned char)cons_c);
  int counted = 0;
  if (!has_gap) {
    *in_gap = 0;
  }
  if (differs && !*in_gap) {
    counted = 1;
  }
  if (has_gap) {
    *in_gap = 1;
  }
  return counted;
}


// Return the reverse complement of a sequence.
// Makes a new copy of the string, so the original is not modified.
// The caller owns the returned buffer. Returns NULL if memory cannot be allocated.
char* get_revcomp(char *input) {
  size_t length = strlen(input);
  char *output = malloc(length + 1);
  size_t i;
  if (output == NULL) {
    return NULL;
  }
  for (i = 0; i < length; i++) {
    output[length - 1 - i] = get_char_comp(input[i]);
  }
  output[length] = '\0';
  return output;
}


// Return the complement of a base.
// Uses a simple lookup table: a string with the complements of all possible sequence characters.
// Characters outside the table are returned unchanged.
char get_char_comp(char c) {
  int i = c - TRANS_OFFSET;
  if (i < 0 || i > TRANS_LEN) {
    return c;
  } else {
    return TRANS[i];
  }
}


/* Take an existing alignment and consensus and compute the number of differences between each
 * sequence and the consensus. The comparison ignores case and leaves both inputs unmodified.
 * Returns a malloc'd array of n_seqs counts (caller frees), or NULL on allocation failure.
 * Known bugs:
 * 1. Counts no differences in the following sequences:
 *   consensus: GA---CA
 *   seq 1:     GA----A
 *   seq 2:     GA--ACA
 */
int *get_diffs_simple(char *cons, char *seqs[], int n_seqs) {
  int *diffs = alloc_array(n_seqs, sizeof(int));
  int i, j;
  if (diffs == NULL) {
    return NULL;
  }
  // Loop through the sequences in the alignment.
  for (i = 0; i < n_seqs; i++) {
    // Each sequence starts outside a gap.
    int in_gap = 0;
    diffs[i] = 0;
    // Compare each base of the sequence to the consensus.
    for (j = 0; seqs[i][j] != 0 && cons[j] != 0; j++) {
      diffs[i] += column_opens_diff(cons[j], seqs[i][j], &in_gap);
    }
  }
  return diffs;
}


// Convert the output of get_diffs_simple() from raw diff counts to fractions of the total sequence
// lengths (the number of columns shared by the sequence and the consensus).
// A zero-length comparison yields 0/0, i.e. NaN under IEEE-754 arithmetic.
// Returns a malloc'd array of n_seqs fractions (caller frees), or NULL on allocation failure.
//TODO: Don't count gaps in sequence length.
double *get_diffs_frac_simple(char *cons, char *seqs[], int n_seqs) {
  int *diffs = get_diffs_simple(cons, seqs, n_seqs);
  double *fracs;
  int i;
  if (diffs == NULL) {
    return NULL;
  }
  fracs = alloc_array(n_seqs, sizeof(double));
  if (fracs == NULL) {
    free(diffs);
    return NULL;
  }
  for (i = 0; i < n_seqs; i++) {
    int j = 0;
    while (seqs[i][j] != 0 && cons[j] != 0) {
      j++;
    }
    fracs[i] = (double)diffs[i]/j;
  }
  // The raw counts are only an intermediate result.
  free(diffs);
  return fracs;
}


/* Take an existing alignment and consensus and compute the number of differences between each
 * sequence and the consensus. Break each sequence into bins and tally the differences in each bin.
 * The comparison ignores case and leaves both inputs unmodified. Columns beyond the last bin are
 * ignored.
 * Returns a malloc'd array of n_seqs rows of "bins" counts (caller frees every row and the array),
 * or NULL if bins < 1 or memory cannot be allocated.
 * Known bugs:
 * 1. counts no differences in the following sequences:
 *   consensus: GA---CA
 *   seq 1:     GA----A
 *   seq 2:     GA--ACA
 */
int **get_diffs_binned(char *cons, char *seqs[], int n_seqs, int seq_len, int bins) {
  int bin_size;
  int **diffs;
  int i, j;
  if (bins < 1) {
    return NULL;
  }
  bin_size = rounded_bin_size(seq_len, bins);
  // Initialize the diffs 2d array (zeroed, so partially built arrays can be freed safely).
  diffs = calloc(n_seqs > 0 ? (size_t)n_seqs : 1, sizeof(int*));
  if (diffs == NULL) {
    return NULL;
  }
  for (i = 0; i < n_seqs; i++) {
    diffs[i] = calloc((size_t)bins, sizeof(int));
    if (diffs[i] == NULL) {
      free_int_rows(diffs, n_seqs);
      return NULL;
    }
  }
  // Loop through the sequences in the alignment.
  for (i = 0; i < n_seqs; i++) {
    // Each sequence starts outside a gap.
    int in_gap = 0;
    // Compare each base of the sequence to the consensus.
    for (j = 0; seqs[i][j] != 0 && cons[j] != 0; j++) {
      int bin = j/bin_size;
      if (bin >= bins) {
        break;
      }
      diffs[i][bin] += column_opens_diff(cons[j], seqs[i][j], &in_gap);
    }
  }
  return diffs;
}


// Convert the output of get_diffs_binned() from raw diff counts to fractions of the total bin
// lengths. A bin that no column falls into yields 0/0, i.e. NaN under IEEE-754 arithmetic.
// Returns a malloc'd array of n_seqs rows of "bins" fractions (caller frees every row and the
// array), or NULL if bins < 1 or memory cannot be allocated.
//TODO: Don't count gaps in bin length.
double **get_diffs_frac_binned(char *cons, char *seqs[], int n_seqs, int seq_len, int bins) {
  int bin_size;
  int **diffs;
  double **fracs;
  int *bin_lengths;
  int i;
  if (bins < 1) {
    return NULL;
  }
  bin_size = rounded_bin_size(seq_len, bins);
  diffs = get_diffs_binned(cons, seqs, n_seqs, seq_len, bins);
  if (diffs == NULL) {
    return NULL;
  }
  fracs = calloc(n_seqs > 0 ? (size_t)n_seqs : 1, sizeof(double*));
  bin_lengths = malloc(sizeof(int) * (size_t)bins);
  if (fracs == NULL || bin_lengths == NULL) {
    free(bin_lengths);
    free_double_rows(fracs, 0);
    free_int_rows(diffs, n_seqs);
    return NULL;
  }
  for (i = 0; i < n_seqs; i++) {
    int bin;
    int j;
    fracs[i] = malloc(sizeof(double) * (size_t)bins);
    if (fracs[i] == NULL) {
      free(bin_lengths);
      free_double_rows(fracs, n_seqs);
      free_int_rows(diffs, n_seqs);
      return NULL;
    }
    // Reset the array of lengths of the bins.
    for (bin = 0; bin < bins; bin++) {
      bin_lengths[bin] = 0;
    }
    // Tally size of each bin.
    for (j = 0; seqs[i][j] != 0 && cons[j] != 0; j++) {
      bin = j/bin_size;
      if (bin >= bins) {
        break;
      }
      bin_lengths[bin]++;
    }
    // For each bin, calculate the diff fraction = diffs / bin_length.
    for (bin = 0; bin < bins; bin++) {
      fracs[i][bin] = (double)diffs[i][bin]/bin_lengths[bin];
    }
  }
  // The raw counts and the scratch array are only intermediate results.
  free(bin_lengths);
  free_int_rows(diffs, n_seqs);
  return fracs;
}


// Take an input sequence and insert gaps according to another, already-aligned sequence with gaps.
// Input strings must be null-terminated. "gap_char1" is the character used for gaps in
// "gapped_seq", and "gap_char2" is the gap character written to the output. A gap character of 0
// means '-'.
// N.B.: The ungapped length of "gapped_seq" should be equal to the length of "inseq". If "inseq"
// runs out early, the remaining positions are filled with "gap_char2" instead of reading past its
// end; surplus characters of "inseq" are ignored.
// Returns a malloc'd string as long as "gapped_seq" (caller frees), or NULL on allocation failure.
char *transfer_gaps(char *gapped_seq, char *inseq, char gap_char1, char gap_char2) {
  size_t gapped_len;
  size_t g, i;
  char *outseq;
  if (gap_char1 == 0) {
    gap_char1 = '-';
  }
  if (gap_char2 == 0) {
    gap_char2 = '-';
  }
  gapped_len = strlen(gapped_seq);
  outseq = malloc(gapped_len + 1);
  if (outseq == NULL) {
    return NULL;
  }

  // Transfer characters from inseq to outseq, except when gapped_seq has a gap at that spot
  // (insert a gap there instead).
  for (g = 0, i = 0; g < gapped_len; g++) {
    if (gapped_seq[g] == gap_char1 || inseq[i] == '\0') {
      outseq[g] = gap_char2;
    } else {
      outseq[g] = inseq[i];
      i++;
    }
  }
  outseq[gapped_len] = '\0';

  return outseq;
}


// Wrapper for transfer_gaps() when operating on a set of sequences at once.
// Returns a malloc'd array of n_seqs malloc'd strings (caller frees each string and the array), or
// NULL if the array itself cannot be allocated.
char **transfer_gaps_multi(int n_seqs, char *gapped_seqs[], char *inseqs[], char gap_char1,
                           char gap_char2) {
  char **outseqs = alloc_array(n_seqs, sizeof(char *));
  int i;
  if (outseqs == NULL) {
    return NULL;
  }
  for (i = 0; i < n_seqs; i++) {
    outseqs[i] = transfer_gaps(gapped_seqs[i], inseqs[i], gap_char1, gap_char2);
  }
  return outseqs;
}
b |
diff -r 13bcc2f459b0 -r af383638de66 seqtoolsc.so |
b |
Binary file seqtoolsc.so has changed |
b |
diff -r 13bcc2f459b0 -r af383638de66 swalign.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/swalign.py Mon Nov 23 18:44:23 2015 -0500 |
[ |
import os
import ctypes

# Thin ctypes bindings for the Smith-Waterman aligner compiled into
# swalignc.so, which is expected to sit next to this module.
script_dir = os.path.dirname(os.path.realpath(__file__))
swalign = ctypes.cdll.LoadLibrary(os.path.join(script_dir, 'swalignc.so'))


# C struct for ctypes
# Mirrors seq_pair_t: two sequences and their lengths.
class SeqPairC(ctypes.Structure):
    _fields_ = [
        ('a', ctypes.c_char_p),
        ('alen', ctypes.c_uint),
        ('b', ctypes.c_char_p),
        ('blen', ctypes.c_uint),
    ]


# C struct for ctypes
# Mirrors align_t: the aligned pair plus coordinates, match count and score.
class AlignC(ctypes.Structure):
    _fields_ = [
        ('seqs', ctypes.POINTER(SeqPairC)),
        ('start_a', ctypes.c_int),
        ('start_b', ctypes.c_int),
        ('end_a', ctypes.c_int),
        ('end_b', ctypes.c_int),
        ('matches', ctypes.c_int),
        ('score', ctypes.c_double),
    ]


# The Python version
class Align(object):
    """Plain-Python copy of an AlignC result ("a" is the target, "b" the query)."""

    def __init__(self, align_c):
        self.target = align_c.seqs.contents.a
        self.query = align_c.seqs.contents.b
        self.start_target = align_c.start_a
        self.start_query = align_c.start_b
        self.end_target = align_c.end_a
        self.end_query = align_c.end_b
        self.matches = align_c.matches
        self.score = align_c.score


# Initialize functions (define types).
swalign.smith_waterman.restype = ctypes.POINTER(AlignC)
swalign.revcomp.restype = ctypes.c_char_p


def smith_waterman(target, query):
    """Align query against target with the C implementation; return an Align.

    The second argument passed to C (1) is its "local" flag.
    """
    seq_pair = SeqPairC(target, len(target), query, len(query))
    align_c = swalign.smith_waterman(ctypes.pointer(seq_pair), 1).contents
    return Align(align_c)


def revcomp(seq):
    """Reverse-complement seq and return the result.

    WARNING: This will alter the input string in-place! The C routine rewrites
    the buffer it is given, so the string object passed in changes as well.
    Prefer the return value over relying on that side effect.
    """
    return swalign.revcomp(seq)
b |
diff -r 13bcc2f459b0 -r af383638de66 swalign.pyc |
b |
Binary file swalign.pyc has changed |
b |
diff -r 13bcc2f459b0 -r af383638de66 swalignc.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/swalignc.c Mon Nov 23 18:44:23 2015 -0500 |
[ |
b'@@ -0,0 +1,351 @@\n+/*\n+ * Copyright (c) 2010 Nicolaus Lance Hepler\n+ * \n+ * Permission is hereby granted, free of charge, to any person\n+ * obtaining a copy of this software and associated documentation\n+ * files (the "Software"), to deal in the Software without\n+ * restriction, including without limitation the rights to use,\n+ * copy, modify, merge, publish, distribute, sublicense, and/or sell\n+ * copies of the Software, and to permit persons to whom the\n+ * Software is furnished to do so, subject to the following\n+ * conditions:\n+ * \n+ * The above copyright notice and this permission notice shall be\n+ * included in all copies or substantial portions of the Software.\n+ * \n+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,\n+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES\n+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND\n+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT\n+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,\n+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR\n+ * OTHER DEALINGS IN THE SOFTWARE.\n+ */\n+// All double-commented comments below are from Nicolaus Lance Hepler.\n+\n+#include "swalignc.h"\n+\n+// /* reverse a string in place, return str */\n+static char* reverse(char *str) {\n+ char *left = str;\n+ char *right = left + strlen(str) - 1;\n+ char tmp;\n+\n+ while (left < right) {\n+ tmp = *left;\n+ *(left++) = *right;\n+ *(right--) = tmp;\n+ }\n+\n+ return str;\n+}\n+\n+// Return the reverse complement of a sequence.\n+char* revcomp(char *str) {\n+ char *left = str;\n+ char *right = left + strlen(str) - 1;\n+ char tmp;\n+\n+ while (left < right) {\n+ tmp = get_char_comp(*left);\n+ *(left++) = get_char_comp(*right);\n+ *(right--) = tmp;\n+ }\n+\n+ return str;\n+}\n+\n+// Return the complement of a base.\n+// Uses a simple lookup table: a string with the complements of all 
possible sequence characters.\n+static char get_char_comp(char c) {\n+ int i = c - TRANS_OFFSET;\n+ if (i < 0 || i > 57) {\n+ return c;\n+ } else {\n+ return TRANS[i];\n+ }\n+}\n+\n+// // works globally\n+// Note: Currently the "local" flag isn\'t functional. It seems to always do a local alignment.\n+static align_t *traceback(seq_pair_t *problem, matrix_t *S, bool local) {\n+ align_t *result = malloc(sizeof(align_t));\n+ seq_pair_t *seqs = malloc(sizeof(seq_pair_t));\n+ unsigned int i = S->m - 1;\n+ unsigned int j = S->n - 1;\n+ unsigned int k = 0;\n+ // Create output strings. Allocate maximum potential length.\n+ char c[S->m + S->n + 1];\n+ char d[S->m + S->n + 1];\n+\n+ memset(c, \'\\0\', sizeof(c));\n+ memset(d, \'\\0\', sizeof(d));\n+\n+ // This wasn\'t finished by NLH. Not functioning correctly yet.\n+ // It seems the purpose is to start the traceback from the place where the score reaches its\n+ // maximum instead of the very end (set i and j to those coordinates).\n+ if (local == true) {\n+ unsigned int l, m;\n+ double max = FLT_MIN;\n+\n+ for (l = 0; l < S->m; l++) {\n+ for (m = 0; m < S->n; m++) {\n+ if (S->mat[l][m].score > max) {\n+ i = l;\n+ j = m;\n+ max = S->mat[l][m].score;\n+ } \n+ } \n+ }\n+ }\n+\n+ double score = DBL_MIN;\n+ int matches = 0;\n+ int start_a = 0;\n+ int start_b = 0;\n+ int end_a = 0;\n+ int end_b = 0;\n+ bool move_i = false;\n+ bool move_j = false;\n+ // Walk back through the matrix from the end, taking the path determined by the "prev" values of\n+ // each cell. Assemble the sequence along the way.\n+ if (S->mat[i][j].prev[0] != 0 && S->mat[i][j].prev[1] != 0) {\n+ while (i > 0 || j > 0) {\n+ unsigned int new_i = S->mat[i][j].prev[0];\n+ unsigned int new_j = S->mat[i][j].prev[1];\n+ \n+ // If we\'ve moved in the i axis, add the new base to the sequence. 
Otherwise, it\'s a gap.\n+ if (new_i < i) {\n+ *(c+k) = *(problem->a+i-1);\n+ move_i = true;\n+ } '..b'; \n+}\n+\n+static matrix_t *create_matrix(unsigned int m, unsigned int n) {\n+ matrix_t *S = malloc(sizeof(matrix_t));\n+ unsigned int i;\n+\n+ S->m = m;\n+ S->n = n;\n+\n+ S->mat = malloc(sizeof(entry_t) * m * n);\n+\n+ for (i = 0; i < m; i++) {\n+ S->mat[i] = malloc(sizeof(entry_t) * n);\n+ }\n+\n+ return S;\n+}\n+\n+void destroy_matrix(matrix_t *S) {\n+ unsigned int i;\n+ for (i = 0; i < S->m; i++) {\n+ free(S->mat[i]);\n+ }\n+ free(S->mat);\n+ free(S);\n+ return;\n+}\n+\n+// Print a visual representation of the path through the matrix.\n+void print_matrix(matrix_t *matrix, seq_pair_t *seq_pair) {\n+ int i, j;\n+ for (i = 0; i < matrix->m; i++) {\n+ if (i == 0) {\n+ printf("\\t\\t");\n+ for (j = 0; j < seq_pair->blen; j++) {\n+ printf("%c\\t", seq_pair->b[j]);\n+ }\n+ printf("\\n");\n+ printf(" ");\n+ for (j = 0; j < matrix->n; j++) {\n+ printf("%d\\t", j);\n+ }\n+ printf("\\n");\n+ }\n+ if (i == 0) {\n+ printf(" 0 ");\n+ } else {\n+ printf("%c %4d ", seq_pair->a[i-1], i);\n+ }\n+ for (j = 0; j < matrix->n; j++) {\n+ printf("%d,%d|%0.0f\\t", matrix->mat[i][j].prev[0], matrix->mat[i][j].prev[1], matrix->mat[i][j].score);\n+ }\n+ printf("\\n");\n+ }\n+}\n+\n+void destroy_seq_pair(seq_pair_t *pair) {\n+ free(pair->a);\n+ free(pair->b);\n+ free(pair);\n+ return;\n+}\n+\n+align_t *smith_waterman(seq_pair_t *problem, bool local) {\n+ unsigned int m = problem->alen + 1;\n+ unsigned int n = problem->blen + 1;\n+ matrix_t *S = create_matrix(m, n);\n+ align_t *result;\n+ unsigned int i, j, k, l;\n+\n+ S->mat[0][0].score = 0;\n+ S->mat[0][0].prev[0] = 0;\n+ S->mat[0][0].prev[1] = 0;\n+\n+ for (i = 1; i <= problem->alen; i++) {\n+ S->mat[i][0].score = 0.0;\n+ S->mat[i][0].prev[0] = i-1;\n+ S->mat[i][0].prev[1] = 0;\n+ }\n+\n+ for (j = 1; j <= problem->blen; j++) {\n+ S->mat[0][j].score = 0.0;\n+ S->mat[0][j].prev[0] = 0;\n+ S->mat[0][j].prev[1] = j-1;\n+ }\n+\n+ for (i = 
1; i <= problem->alen; i++) {\n+ for (j = 1; j <= problem->blen; j++) {\n+ int nw_score = (strncmp(problem->a+(i-1), problem->b+(j-1), 1) == 0) ? MATCH : MISMATCH;\n+\n+ S->mat[i][j].score = DBL_MIN;\n+ S->mat[i][j].prev[0] = 0;\n+ S->mat[i][j].prev[1] = 0;\n+\n+ for (k = 0; k <= 1; k++) {\n+ for (l = 0; l <= 1; l++) {\n+ int val;\n+\n+ if (k == 0 && l == 0) {\n+ continue;\n+ } else if (k > 0 && l > 0) {\n+ val = nw_score; \n+ } else if (k > 0 || l > 0) {\n+ if ((i == problem->alen && k == 0) ||\n+ (j == problem->blen && l == 0))\n+ val = 0.0;\n+ else\n+ val = GAP;\n+ } else {\n+ // do nothing..\n+ }\n+\n+ val += S->mat[i-k][j-l].score;\n+\n+ if (val > S->mat[i][j].score) {\n+ S->mat[i][j].score = val;\n+ S->mat[i][j].prev[0] = i-k;\n+ S->mat[i][j].prev[1] = j-l;\n+ }\n+ }\n+ }\n+ }\n+ }\n+\n+ result = traceback(problem, S, local);\n+\n+ // print_matrix(S, problem);\n+\n+ destroy_matrix(S);\n+\n+ return result;\n+}\n+\n+void print_alignment(align_t *result, int target_len, int query_len) {\n+ printf("Score: %0.0f Matches: %d\\n", result->score, result->matches);\n+ printf("Target: %3d %s %-3d\\n", result->start_a, result->seqs->a, result->end_a);\n+ printf("Query: %3d %s %-3d\\n", result->start_b, result->seqs->b, result->end_b);\n+}\n+\n+int main(int argc, const char **argv) {\n+\n+ if (argc != 3) {\n+ printf("usage: swalign TARGET_SEQ QUERY_SEQ\\n");\n+ exit(1);\n+ }\n+\n+ {\n+ seq_pair_t problem;\n+ align_t *result;\n+ char c[strlen(argv[1])], d[strlen(argv[2])];\n+ \n+ strcpy(c, argv[1]);\n+ strcpy(d, argv[2]);\n+ \n+ problem.a = c;\n+ problem.alen = strlen(problem.a);\n+ problem.b = d;\n+ problem.blen = strlen(problem.b);\n+ \n+ result = smith_waterman(&problem, false);\n+ \n+ print_alignment(result, problem.alen, problem.blen);\n+ }\n+\n+ exit(0);\n+} \n' |
b |
diff -r 13bcc2f459b0 -r af383638de66 swalignc.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/swalignc.h Mon Nov 23 18:44:23 2015 -0500 |
[ |
/*
 * Copyright (c) 2010 Nicolaus Lance Hepler
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use,
 * copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following
 * conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

// Include guard: this header defines types, so it must be safe to include more than once.
#ifndef SWALIGNC_H
#define SWALIGNC_H

#include <float.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// Alignment scoring constants.
#define GAP -1.0
#define MATCH 2.0
#define MISMATCH -0.5
// Complement lookup table, indexed by (character - TRANS_OFFSET).
// S (G or C) and W (A or T) are their own complements under the IUPAC codes.
//             ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz
#define TRANS "TVGHEFCDIJMLKNOPQYSAABWXRZ[\\]^_`tvghefcdijmlknopqysaabwxrz"
#define TRANS_OFFSET 65

// Only define our own bool when the compiler does not already provide one: <stdbool.h> may have
// been included first, and bool/true/false are keywords in C++ and from C23 on.
#if !defined(__cplusplus) && !defined(__bool_true_false_are_defined) && \
    !(defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202311L)
typedef enum { false, true } bool;
#endif

// A pair of sequences and their lengths.
typedef struct {
  char *a;
  unsigned int alen;
  char *b;
  unsigned int blen;
} seq_pair_t;

// An entry is a cell in the matrix.
// prev holds the coordinates of the previous cell in the matrix.
typedef struct {
  double score;
  unsigned int prev[2];
} entry_t;

// The scoring matrix: m rows by n columns of entries.
typedef struct {
  unsigned int m;
  unsigned int n;
  entry_t **mat;
} matrix_t;

// An alignment result: the aligned sequences, their start/end coordinates, the number of matches
// and the score.
typedef struct {
  seq_pair_t *seqs;
  int start_a;
  int start_b;
  int end_a;
  int end_b;
  int matches;
  double score;
} align_t;

static char* reverse(char *str);

static char get_char_comp(char c);

char* revcomp(char *str);

static align_t *traceback(seq_pair_t *problem, matrix_t *S, bool local);

static matrix_t *create_matrix(unsigned int m, unsigned int n);

void destroy_matrix(matrix_t *S);

void print_matrix(matrix_t *matrix, seq_pair_t *seq_pair);

void destroy_seq_pair(seq_pair_t *pair);

align_t *smith_waterman(seq_pair_t *problem, bool local);

void print_alignment(align_t *result, int target_len, int query_len);

#endif /* SWALIGNC_H */
b |
diff -r 13bcc2f459b0 -r af383638de66 swalignc.so |
b |
Binary file swalignc.so has changed |
b |
diff -r 13bcc2f459b0 -r af383638de66 test.fa --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test.fa Mon Nov 23 18:44:23 2015 -0500 |
b |
@@ -0,0 +1,7 @@ +>one (1) +GATTACA +>two +TAAGTGTT +ACCA +>three +GGGGAAACCT |
b |
diff -r 13bcc2f459b0 -r af383638de66 test.fq --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test.fq Mon Nov 23 18:44:23 2015 -0500 |
b |
@@ -0,0 +1,15 @@ +@1 (one) +GATTACA ++1 +asdlknn +@2 +TAAGTGTT +ACCA ++ +sdlkncsa +aknc +@3 +GGGGAAACCT ++three +aslknaoija + |
b |
diff -r 13bcc2f459b0 -r af383638de66 test.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test.py Mon Nov 23 18:44:23 2015 -0500 |
[ |
#!/usr/bin/env python
from __future__ import division
from __future__ import print_function
import sys
import argparse
import subprocess

OPT_DEFAULTS = {}
USAGE = "%(prog)s [options]"
DESCRIPTION = """"""


def main(argv):
    """Parse two file arguments and start a `cat` process for each.

    argv is the full argument vector, program name included. The children are
    always cleaned up before returning. Returns None.
    """

    parser = argparse.ArgumentParser(description=DESCRIPTION)
    parser.set_defaults(**OPT_DEFAULTS)

    parser.add_argument('file1')
    parser.add_argument('file2')

    args = parser.parse_args(argv[1:])

    # Shell pipeline kept here for reference:
    #   cat $fastq1 | paste - - - - \
    #     | paste - <(cat $fastq2 | paste - - - -) \
    #     | awk -f make-barcodes.awk \
    #     | sort \
    #     | align_families.py \
    #     | duplex.py \
    #     > $sscs

    procs = []
    try:
        for path in (args.file1, args.file2):
            procs.append(subprocess.Popen(['cat', path], stdout=subprocess.PIPE))
    finally:
        # Nothing consumes the output yet, so close each pipe and reap each
        # child instead of leaking descriptors and zombie processes.
        for proc in procs:
            proc.stdout.close()
            proc.wait()


def fail(message):
    """Write message to stderr and exit with status 1."""
    sys.stderr.write(message+"\n")
    sys.exit(1)

if __name__ == '__main__':
    sys.exit(main(sys.argv))
b |
diff -r 13bcc2f459b0 -r af383638de66 test.sam --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test.sam Mon Nov 23 18:44:23 2015 -0500 |
b |
@@ -0,0 +1,4 @@ +@SQ thing +read1 1 2 3 4 5 6 7 8 GATTACA 10 +read2 1 2 3 4 5 6 7 8 GGC 10 +read3 more info 1 2 3 4 5 6 7 8 TCTAATG 10 \ No newline at end of file |
b |
diff -r 13bcc2f459b0 -r af383638de66 test.sscs.fa --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test.sscs.fa Mon Nov 23 18:44:23 2015 -0500 |
b |
@@ -0,0 +1,16 @@ +>ACCGACACAGACTAGGGATCAAAG.ab.1 4 +TAAGGATACTAGTATAAGAG +>ACCGACACAGACTAGGGATCAAAG.ba.2 3 +TAAGGATACTAGTATAAGAG +>ACCGACACAGACTAGGGATCAAAG.ab.2 4 +AGAGTCAGGTTCGTCTTTAG +>ACCGACACAGACTAGGGATCAAAG.ba.1 3 +AGAGTCAGGTTCGTCTTTAG +>ATGATTAAGGCTACTAGTATAAGC.ab.1 3 +TCTATCATTATGTTTTGAGG +>ATGATTAAGGCTACTAGTATAAGC.ab.2 3 +GCCCCCTCTACCCCCTCTAG +>TTGTTGATGAGATATTTGGAGGTA.ba.1 3 +GGTGATTAGTCGGTTGTTGA +>TTGTTGATGAGATATTTGGAGGTA.ba.2 3 +ACTTTACAATGCAATGCCCA |
b |
diff -r 13bcc2f459b0 -r af383638de66 test_1.fa --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test_1.fa Mon Nov 23 18:44:23 2015 -0500 |
b |
@@ -0,0 +1,6 @@ +>one/1 +GATTACAG +>two/1 +TAAGTGTT +>three/1 +GGGGAAAC |
b |
diff -r 13bcc2f459b0 -r af383638de66 test_2.fa --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test_2.fa Mon Nov 23 18:44:23 2015 -0500 |
b |
@@ -0,0 +1,6 @@ +>one/2 +TTACAGAT +>two/2 +GTGTTTAA +>three/2 +AACGGGGA |
b |
diff -r 13bcc2f459b0 -r af383638de66 tests/families.cons.fa --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tests/families.cons.fa Mon Nov 23 18:44:23 2015 -0500 |
b |
@@ -0,0 +1,4 @@ +>ACCGACACAGACTAGGGATCAAAG.1 4/3 +TAAGGATACTAGTATAAGAG +>ACCGACACAGACTAGGGATCAAAG.2 4/3 +AGAGTCAGGTTCGTCTTTAG |
b |
diff -r 13bcc2f459b0 -r af383638de66 tests/families.cons.incl-sscs.fa --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tests/families.cons.incl-sscs.fa Mon Nov 23 18:44:23 2015 -0500 |
b |
@@ -0,0 +1,12 @@ +>ACCGACACAGACTAGGGATCAAAG.1 4/3 +TAAGGATACTAGTATAAGAG +>ACCGACACAGACTAGGGATCAAAG.2 4/3 +AGAGTCAGGTTCGTCTTTAG +>ATGATTAAGGCTACTAGTATAAGC.1 3 +TCTATCATTATGTTTTGAGG +>ATGATTAAGGCTACTAGTATAAGC.2 3 +GCCCCCTCTACCCCCTCTAG +>TTGTTGATGAGATATTTGGAGGTA.2 3 +GGTGATTAGTCGGTTGTTGA +>TTGTTGATGAGATATTTGGAGGTA.1 3 +ACTTTACAATGCAATGCCCA |
b |
diff -r 13bcc2f459b0 -r af383638de66 tests/families.in.tsv --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tests/families.in.tsv Mon Nov 23 18:44:23 2015 -0500 |
b |
@@ -0,0 +1,16 @@ +ACCGACACAGACTAGGGATCAAAG ab pair1.ab.1 TAAGGATACTAGTATAAGAG AAAAAAAAAAAAAAAAAAAA pair1.ab.2 AGAGTCAGGTTCGTCTTTAG AAAAAAAAAAAAAAAAAAAA +ACCGACACAGACTAGGGATCAAAG ab pair2.ab.1 TAAGGATACTAGTATAAGAG AAAAAAAAAAAAAAAAAAAA pair2.ab.2 AGAGTCAGGTTCGTCTTTAG AAAAAAAAAAAAAAAAAAAA +ACCGACACAGACTAGGGATCAAAG ab pair3.ab.1 TAAGGATACTAGATAAGAGC AAAAAAAAAAAAAAAAAAAA pair3.ab.2 AGAGTCACGTTTCGTCTTTA AAAAAAAAAAAAAAAAAAAA +ACCGACACAGACTAGGGATCAAAG ab pair4.ab.1 TAAGGCTACTAGTATAAGAG AAAAAAAAAAAAAAAAAAAA pair4.ab.2 AGAGTCAGGTTCGTCTTTAG AAAAAAAAAAAAAAAAAAAA +ACCGACACAGACTAGGGATCAAAG ba pair5.ba.1 AGAGTCAGGTTCGTCTTTAG AAAAAAAAAAAAAAAAAAAA pair5.ba.2 TAAGGCTACTAGTATAAGAG AAAAAAAAAAAAAAAAAAAA +ACCGACACAGACTAGGGATCAAAG ba pair6.ba.1 AGAGTCAGGTTCGTCTTTAG AAAAAAAAAAAAAAAAAAAA pair6.ba.2 TAAGGATACTAGTATAAGAG AAAAAAAAAAAAAAAAAAAA +ACCGACACAGACTAGGGATCAAAG ba pair7.ba.1 AGAGTCAGGTTCGTCTTTAG AAAAAAAAAAAAAAAAAAAA pair7.ba.2 TAAGGATACTAGTAGAAGAG AAAAAAAAAAAAAAAAAAAA +ATGATTAAGGCTACTAGTATAAGC ab pair8.ab.1 TCTATCATTATGTTTTGAGG AAAAAAAAAAAAAAAAAAAA pair8.ab.2 GCCCCCTCTACCCCCTCTAG AAAAAAAAAAAAAAAAAAAA +ATGATTAAGGCTACTAGTATAAGC ab pair9.ab.1 TCTATCATTATGTCTTGAGG AAAAAAAAAAAAAAAAAAAA pair9.ab.2 GCCCCCTCTACCCCCTCTAG AAAAAAAAAAAAAAAAAAAA +ATGATTAAGGCTACTAGTATAAGC ab pair10.ab.1 TCTATCATTATGTTTTGAGG AAAAAAAAAAAAAAAAAAAA pair10.ab.2 GCCCCTCTACCCCCTCTAGC AAAAAAAAAAAAAAAAAAAA +TCTTAATAAGAACCAACACACTGT ab pair11.ab.1 TCGGTTGTTGATGAGATATT AAAAAAAAAAAAAAAAAAAA pair11.ab.2 GATTAAGAGAACCAACACCT AAAAAAAAAAAAAAAAAAAA +TTGTTGATGAGATATTTGGAGGTA ba pair12.ab.1 GGTGATTAGTCGGTTGTTGA AAAAAAAAAAAAAAAAAAAA pair12.ab.2 ACTTTACAATGCAATGCCCA AAAAAAAAAAAAAAAAAAAA +TTGTTGATGAGATATTTGGAGGTA ba pair13.ab.1 GGTGATTAGTCGGATGTTGA AAAAAAAAAAAAAAAAAAAA pair13.ab.2 ACTTTACCATGCAATGCCCA AAAAAAAAAAAAAAAAAAAA +TTGTTGATGAGATATTTGGAGGTA ba pair14.ab.1 GGTGACTAGTCGGTTGTTGA AAAAAAAAAAAAAAAAAAAA pair14.ab.2 ACTTTACAATGCAATGCACA AAAAAAAAAAAAAAAAAAAA +GACTAGGGATCAAAACCGACACAG ba pair15.ba.1 TCAATGCTCTGAAATCTGTG AAAAAAAAAAAAAAAAAAAA 
pair15.ba.2 GTTGATGAGATATTTGGAGG AAAAAAAAAAAAAAAAAAAA +GACTAGGGATCAAAACCGACACAG ab pair16.ab.1 GTTGATGAGATACTTGGAGG AAAAAAAAAAAAAAAAAAAA pair16.ab.2 TCAATGCTCTGAAATCTGTG AAAAAAAAAAAAAAAAAAAA |
b |
diff -r 13bcc2f459b0 -r af383638de66 tests/families.msa.tsv --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tests/families.msa.tsv Mon Nov 23 18:44:23 2015 -0500 |
b |
@@ -0,0 +1,32 @@ +ACCGACACAGACTAGGGATCAAAG ab 1 pair1.ab.1 TAAGGATACTAGTATAAGAG- AAAAAAAAAAAAAAAAAAAA +ACCGACACAGACTAGGGATCAAAG ab 1 pair2.ab.1 TAAGGATACTAGTATAAGAG- AAAAAAAAAAAAAAAAAAAA +ACCGACACAGACTAGGGATCAAAG ab 1 pair3.ab.1 TAAGGATACTAG-ATAAGAGC AAAAAAAAAAAA AAAAAAAA +ACCGACACAGACTAGGGATCAAAG ab 1 pair4.ab.1 TAAGGCTACTAGTATAAGAG- AAAAAAAAAAAAAAAAAAAA +ACCGACACAGACTAGGGATCAAAG ba 2 pair5.ba.2 TAAGGCTACTAGTATAAGAG AAAAAAAAAAAAAAAAAAAA +ACCGACACAGACTAGGGATCAAAG ba 2 pair6.ba.2 TAAGGATACTAGTATAAGAG AAAAAAAAAAAAAAAAAAAA +ACCGACACAGACTAGGGATCAAAG ba 2 pair7.ba.2 TAAGGATACTAGTAGAAGAG AAAAAAAAAAAAAAAAAAAA +ACCGACACAGACTAGGGATCAAAG ab 2 pair1.ab.2 AGAGTCA-GGTTCGTCTTTAG AAAAAAA AAAAAAAAAAAAA +ACCGACACAGACTAGGGATCAAAG ab 2 pair2.ab.2 AGAGTCA-GGTTCGTCTTTAG AAAAAAA AAAAAAAAAAAAA +ACCGACACAGACTAGGGATCAAAG ab 2 pair3.ab.2 AGAGTCACGTTTCGTCTTTA- AAAAAAAAAAAAAAAAAAAA +ACCGACACAGACTAGGGATCAAAG ab 2 pair4.ab.2 AGAGTCA-GGTTCGTCTTTAG AAAAAAA AAAAAAAAAAAAA +ACCGACACAGACTAGGGATCAAAG ba 1 pair5.ba.1 AGAGTCAGGTTCGTCTTTAG AAAAAAAAAAAAAAAAAAAA +ACCGACACAGACTAGGGATCAAAG ba 1 pair6.ba.1 AGAGTCAGGTTCGTCTTTAG AAAAAAAAAAAAAAAAAAAA +ACCGACACAGACTAGGGATCAAAG ba 1 pair7.ba.1 AGAGTCAGGTTCGTCTTTAG AAAAAAAAAAAAAAAAAAAA +ATGATTAAGGCTACTAGTATAAGC ab 1 pair8.ab.1 TCTATCATTATGTTTTGAGG AAAAAAAAAAAAAAAAAAAA +ATGATTAAGGCTACTAGTATAAGC ab 1 pair9.ab.1 TCTATCATTATGTCTTGAGG AAAAAAAAAAAAAAAAAAAA +ATGATTAAGGCTACTAGTATAAGC ab 1 pair10.ab.1 TCTATCATTATGTTTTGAGG AAAAAAAAAAAAAAAAAAAA +ATGATTAAGGCTACTAGTATAAGC ab 2 pair8.ab.2 GCCCCCTCTACCCCCTCTAG- AAAAAAAAAAAAAAAAAAAA +ATGATTAAGGCTACTAGTATAAGC ab 2 pair9.ab.2 GCCCCCTCTACCCCCTCTAG- AAAAAAAAAAAAAAAAAAAA +ATGATTAAGGCTACTAGTATAAGC ab 2 pair10.ab.2 -GCCCCTCTACCCCCTCTAGC AAAAAAAAAAAAAAAAAAAA +TCTTAATAAGAACCAACACACTGT ab 1 pair11.ab.1 TCGGTTGTTGATGAGATATT AAAAAAAAAAAAAAAAAAAA +TCTTAATAAGAACCAACACACTGT ab 2 pair11.ab.2 GATTAAGAGAACCAACACCT AAAAAAAAAAAAAAAAAAAA +TTGTTGATGAGATATTTGGAGGTA ba 1 pair12.ab.1 GGTGATTAGTCGGTTGTTGA AAAAAAAAAAAAAAAAAAAA +TTGTTGATGAGATATTTGGAGGTA ba 1 
pair13.ab.1 GGTGATTAGTCGGATGTTGA AAAAAAAAAAAAAAAAAAAA +TTGTTGATGAGATATTTGGAGGTA ba 1 pair14.ab.1 GGTGACTAGTCGGTTGTTGA AAAAAAAAAAAAAAAAAAAA +TTGTTGATGAGATATTTGGAGGTA ba 2 pair12.ab.2 ACTTTACAATGCAATGCCCA AAAAAAAAAAAAAAAAAAAA +TTGTTGATGAGATATTTGGAGGTA ba 2 pair13.ab.2 ACTTTACCATGCAATGCCCA AAAAAAAAAAAAAAAAAAAA +TTGTTGATGAGATATTTGGAGGTA ba 2 pair14.ab.2 ACTTTACAATGCAATGCACA AAAAAAAAAAAAAAAAAAAA +GACTAGGGATCAAAACCGACACAG ba 1 pair15.ba.1 TCAATGCTCTGAAATCTGTG AAAAAAAAAAAAAAAAAAAA +GACTAGGGATCAAAACCGACACAG ab 2 pair16.ab.2 TCAATGCTCTGAAATCTGTG AAAAAAAAAAAAAAAAAAAA +GACTAGGGATCAAAACCGACACAG ba 2 pair15.ba.2 GTTGATGAGATATTTGGAGG AAAAAAAAAAAAAAAAAAAA +GACTAGGGATCAAAACCGACACAG ab 1 pair16.ab.1 GTTGATGAGATACTTGGAGG AAAAAAAAAAAAAAAAAAAA |
b |
diff -r 13bcc2f459b0 -r af383638de66 tests/gapqual.cons.fa --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tests/gapqual.cons.fa Mon Nov 23 18:44:23 2015 -0500 |
b |
@@ -0,0 +1,4 @@ +>ACCGGACAACGACACCCTGCATAA.1 5 +TGGGAGAACACAACTAAACTCGGGAAGT +>ACCGGACAACGACACCCTGCATAA.2 4 +CNCATCACCAGGAACAACTCTGCTGTACTT |
b |
diff -r 13bcc2f459b0 -r af383638de66 tests/gapqual.msa.tsv --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tests/gapqual.msa.tsv Mon Nov 23 18:44:23 2015 -0500 |
b |
@@ -0,0 +1,9 @@ +ACCGGACAACGACACCCTGCATAA ab 1 read1.1 --G-GAGAACACAACTAAACTCGGGAAGT- 0 ?@AAAAAAAAAAAAAAAAAAAAAAA +ACCGGACAACGACACCCTGCATAA ab 1 read2.1 --G-GAGAACACAACTAAACTCGGGAAGT- 0 ?@AAAAAAAAAAAAAAAAAAAAAAA +ACCGGACAACGACACCCTGCATAA ab 1 read3.1 --G-GAGAACACAACTAAACTCGG-AAGTA 0 ?@AAAAAAAAAAAAAAA;;1 1;;AA +ACCGGACAACGACACCCTGCATAA ab 1 read4.1 TGG-GAG-ACACAACT-AACTCGG-AAGTA AAA AAA AAAAAAAA AAAA;;1 1;;AA +ACCGGACAACGACACCCTGCATAA ab 1 read5.1 TGGAGCGAAC-CAACTGAA-TCGG-AAGT- AAAAAAAAAA AAAAAAAA AAAA AAAA +ACCGGACAACGACACCCTGCATAA ab 2 read1.2 CGCGTCACCAGGAACAACTCTGCTGTACTT AAA1AAAAAAAAAAAAAAAAAAAAAAAAAA +ACCGGACAACGACACCCTGCATAA ab 2 read2.2 CGCGTCACCAGGAACAACTCTGCTGTACTT AAAAAAAAAAAAAAAAAAAAAAAAAAAAAA +ACCGGACAACGACACCCTGCATAA ab 2 read3.2 CACATCACCAGGAACAACTCTGCTGTACTT AAAAAAAAAAAAAAAAAAAAAAAAAAAAAA +ACCGGACAACGACACCCTGCATAA ab 2 read4.2 CACATCACCAGGAACAACTCTGCTGTACTT AAAAAAAAAAAAAAAAAAAAAAAAAAAAAA \ No newline at end of file |
b |
diff -r 13bcc2f459b0 -r af383638de66 tests/gaps-diffs.out.tsv --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tests/gaps-diffs.out.tsv Mon Nov 23 18:44:23 2015 -0500 |
b |
@@ -0,0 +1,11 @@ +ACCGGA 0.0 CAGCACCCCCTCTACCC------CCCCCTCTAGAG +ACCGGA 0.0 CAGCACCCCCTCTACCC-----ACCCCCTCTAGAG +ACCGGA 0.0 CAGCACCCCCTCTACCC-------CCCCTCTAGAG +ACCGGA 0.0286 CAGCACCCCCTCTACC-------CCCCCTCTAGAG +ACCGGA 0.0286 CAGCACCCCCTCTACC------CCCCCCTCTAGAG +ACCGGA 0.0286 CAGCACCCCCTCTACCCC------CCCCTCTAGAG +ACCGGA 0.0 CAGCACCCCCTCTACCC------CCCCCTCTAGAG +ACCGGA 0.0286 CAGCACCCCCTCTACCCCCTCTA----CTCTAGAG +ACCGGA 0.0571 CAGCACCCCCTCTACCCCCTCTAC----TCTAGAG +ACCGGA 0.0571 CAGCACCCCCTCTACCCCCTCTACC----CTAGAG +ACCGGA 0.0286 CAGCACCCCCTCTACCCCCTCTACCCCCTCTAGAG |
b |
diff -r 13bcc2f459b0 -r af383638de66 tests/gaps.msa.tsv --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tests/gaps.msa.tsv Mon Nov 23 18:44:23 2015 -0500 |
b |
@@ -0,0 +1,12 @@ +ACCGGA CONSENSUS CAGCACCCCCTCTACCC------CCCCCTCTAGAG +ACCGGA 0 cagcaccccctctaccc------ccccctctagag +ACCGGA 1 cagcaccccctctaccc-----accccctctagag +ACCGGA 2 cagcaccccctctaccc-------cccctctagag +ACCGGA 3 cagcaccccctctacc-------ccccctctagag +ACCGGA 4 cagcaccccctctacc------cccccctctagag +ACCGGA 5 cagcaccccctctacccc------cccctctagag +ACCGGA 0 cagcaccccctctaccc------ccccctctagag +ACCGGA 6 cagcaccccctctaccccctcta----ctctagag +ACCGGA 7 cagcaccccctctaccccctctac----tctagag +ACCGGA 8 cagcaccccctctaccccctctacc----ctagag +ACCGGA 9 cagcaccccctctaccccctctaccccctctagag |
b |
diff -r 13bcc2f459b0 -r af383638de66 tests/qual.cons.fa --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tests/qual.cons.fa Mon Nov 23 18:44:23 2015 -0500 |
b |
@@ -0,0 +1,4 @@ +>ACCGGACAACGACACCCTGCATAA.1 4 +TGCAGAGAACACAACTAAACTCGGGAAGTA +>ACCGGACAACGACACCCTGCATAA.2 4 +CGCATCACCAGGAACAACTCTGCTGTACTT |
b |
diff -r 13bcc2f459b0 -r af383638de66 tests/qual.msa.tsv --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tests/qual.msa.tsv Mon Nov 23 18:44:23 2015 -0500 |
b |
@@ -0,0 +1,8 @@ +ACCGGACAACGACACCCTGCATAA ab 1 read1.1 TGCAGAGAACACAACTAAACTCGGGAAGTA EEEEEEEEEEEEEEEEEEEEEEEEEEEEEE +ACCGGACAACGACACCCTGCATAA ab 1 read2.1 TGCAGAGAACACAACTAAACTCGGGAAGTA EEEEEEEEEEEEEEEEEEEEEEEEEEEEEE +ACCGGACAACGACACCCTGCATAA ab 1 read3.1 TGGAGAGAACACAACTGAACTCGGGAAGTA EEEEEEEEEEEEEEEE1EEEEEEEEEEEEE +ACCGGACAACGACACCCTGCATAA ab 1 read4.1 TGCAGCGAACACAACTGAACTCGG-AAGTA EEEEEEEEEEEEEEEEEEEEEEEE EEEEE +ACCGGACAACGACACCCTGCATAA ab 2 read1.2 CGCATCACCAGGAACAACTCTGCTGTACTT EEEEEEEEEEEEEEEEEEEEEEEEEEEEEE +ACCGGACAACGACACCCTGCATAA ab 2 read2.2 CGCATCACCAGGAACAACTCTGCTGTACTT EEEEEEEEEEEEEEEEEEEEEEEEEEEEEE +ACCGGACAACGACACCCTGCATAA ab 2 read3.2 CGCATCACCAGGAACAACTCTGCTGTACTT EEEEEEEEEEEEEEEEEEEEEEEEEEEEEE +ACCGGACAACGACACCCTGCATAA ab 2 read4.2 CGCATCACCAGGAACAACTCTGCTGTACTT EEEEEEEEEEEEEEEEEEEEEEEEEEEEEE \ No newline at end of file |
b |
diff -r 13bcc2f459b0 -r af383638de66 tests/quirks.msa.tsv --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tests/quirks.msa.tsv Mon Nov 23 18:44:23 2015 -0500 |
b |
b'@@ -0,0 +1,16 @@\n+CTGCGACACAATATTGGGCTCCCC\tab\t1\t@M02286:46:000000000-AEG11:1:1116:22967:7077 1:N:0:1\tGATACGGGAGGGCTTACCATCTGGCCCCAGTGCTGCAATGATACCGCGAGACCCACGCTCACCGGCTCCAGATTTATCAGCAATAAACCAGCCAGCCGGAAGGGCCGAGCGCAGAAGTGGTCCTGCAACTTTATCCGCCTCCATCCAGTCTATTAATTGTTGCCGGGAAGCTAGAGTAAGTAGTTCGCCAGTTAATAGTTTGCGCAACGTTGTTGCCATTGCTACAGGCATCGTGGTGTCACGCTCGTCGTTTGGTATGGCTTCATTCAGCTCCGGTTCCCAAC\tGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGCFGGGGGGDGGGGGGGFGGGGGGGGGGGGGFGGGGGGGGGGGGGGGG?FGGGFGGGFGGGFF7BFGDG@EEGGGGGGGCFEG@FGGGGGGGDFFFFGGGGGGGDEGGGGGGGGGGGGGGGFGG76CGGGGD4?DFGFFGGGFFDFFGEDEGFB?EB*-*3;028?)9)46FFAF706>FFFFF::\n+CTGCGACACAATATTGGGCTCCCC\tab\t1\t@M02286:46:000000000-AEG11:1:1118:14605:8689 1:N:0:1\tGATACGGGAGGGCTTACCATCTGGCCCCAGTGCTGCAATGATACCGCGAGACCCACGCTCACCGGCTCCAGATTTATCAGCAATAAACCAGCCAGCCGGAAGGGCCGAGCGCAGAAGTGGTCCTGCAACTTTATCCGCCTCCATCCAGTCTATTAATTGTTGCCGGGAAGCTAGAGTAAGTAGTTCGCCAGTTAATAGTTTGCGCAACGTTGTTGCCATTGCTACAGGCATCGTGGTGTCACGCTCGTCGTTTGGTATGGCTTCATTCAGCTCCGGTTCCCAAC\tFGGGGGCEEEEGGGGGGGGEGGGFGGGEGGGGGGGGGGGGGGGAFGGGDGGGGGGGGGGGGGGGEEDEGAFFDGGGFGGFGGFGGCGGGGGGGGGFGECGCFEGGGGBC7BFGEFGGDGGGGGGGGCGGFCGGGGGEGGGGGGFCFG,7EDCFCFGGF9FGGGFGGGEC7FF@CGGCFCE;BFCGGGG8CEFFBFFFFFFGGGGC7EEEEFGDGCF7C7=CGGGG4<<C<C@EGFD@E0<>?DFGDG55>C335B5=?F?058C4F9FAFF9?::2>9D?24?2\n+CTGCGACACAATATTGGGCTCCCC\tab\t1\t@M02286:46:000000000-AEG11:1:1118:21309:6959 
1:N:0:1\tGATACGGGAGGGCTTACCATCTGGCCCCAGTGCTGCAATGATACCGCGAGACCCACGCTCACCGGCTCCAGATTTATCAGCAATAAACCAGCCAGCCGGAAGGGCCGAGCGCAGAAGTGGTCCTGCAACTTTATCCGCCTCCATCCAGTCTATTAATTGTTGCCGGGAAGCTAGAGTAAGTAGTTCGCCAGTTAATAGTTTGCGCAACGTTGTTGCCATTGCTACAGGCATCGTGGTGTCACGCTCGTCGTTTGGTATGGCTTCATTCAGCTCCGGTTCCCAAC\tGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFGGGGGGGGGGFGGGGGGGGGGGGGGGGGGGGGGGGGGGGDGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFGGGGGGGGGGGGGGGGGGGGGFFGGGGGFGGGGGGGFFFGGGGGDGGGFCFGCFGGF9>DCGFGGGGG;DFFGFGFDCFGFGEGFE@CD0)7@*9F??F?F9@2:>F4)4<<;>FFF>A?8\n+CTGCGACACAATATTGGGCTCCCC\tab\t1\t@M02286:46:000000000-AEG11:1:2101:17733:13519 1:N:0:1\tGATACGGGAGGGCTTACCATCTGGCCCCAGTGCTGCAATGATACCGCGAGACCCACGCTCACCGGCTCCAGATTTATCAGCAATAAACCAGCCAGCCGGAAGGGCCGAGCGCAGAAGTGGTCCTGCAACTTTATCCGCCTCCATCCAGTCTATTAATTGTTGCCGGGAAGCTAGAGTAAGTAGTTCGCCAGTTAATAGTTTGCGCAACGTTGTTGCCATTGCTACAGGCATCGTGGTGTCACGCTCGTCGTTTGGTATGGCTTCATTCAGCTACGGTTCCCAAC\tGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFGGGGGGGGGGGGGGGGGGGGGFGEGGGDGGGGGGGGGGGFGGGGGGGGGGGGGGGGFGGGGGGGFFGGGGGGFGCGCFGFGGGGGGGGGGGGFGGGFGGFGGGG=;;FEEGGEFFGGGGCEGGGGGGEGGEGGGGGFFGGGGGGGGF6>CFFFFGGG77>@CFFFFCE75;>>575BGBFFFFF??F9<F>FB4*434F?6>?BF0\n+CTGCGACACAATATTGGGCTCCCC\tab\t1\t@M02286:46:000000000-AEG11:1:2103:23125:15471 1:N:0:1\tGATACGGGAGGGCTTACCATCTGGCCCCAGTGCTGCAATGATACCGCGAGACCCACGCTCACCGGCTCCAGATTTATCAGCAATAAACCAGCCAGCCGGAAGGGCCGAGCGCAGAAGTGGTCCTGCAACTTTATCCGCCTCCATCCAGTCTATTAATTGTTGCCGGGAAGCTAGAGTAAGTAGTTCGCCAGTTAATAGTTTGCGCAACGTTGTTGCCATTGCTACAGGCATCGTGGTGTCACGCTCGTCGTTTGGTATGGCTTCATTCAGCTCCGGTTCCCAAC\tGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGCFGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGDGGGGGGFGGGCCFGFGGGFGGGDGGFGGFGGGGGGGGGGFGFCFFGGGGGGGGGGGCEBFGG?CFFFGGGGGGGGCFGFGFCFGGGGFGGGGGGFFGFGGGGEFGFG?E@FGFGFEEC>F?G4@FFF@FF:49<F?AFAF?515>:B@??:\n+CTGCGACACAATATTGGGCTCCCC\tab\t1\t@M02286:46:000000000-AEG11:1:2104:14576:24265 
1:N:0:1\tGATACGGGAGGGCTTACCATCTGGCCCCAGTGCTGCAATGATACCGCGAGACCCACGCTCACCGGCTCCAGATTTATCAGCAATAAACCAGCCAGCCGGAAGGGCCGAGCGCAGAAGTGGTCCTGCAACTTTCTCCGCCTCCATCCAGTCTATTAATTGTTGCCGGGAAGCTAGCGTAAGTAGTTCGCCAGTTACTAGTTCACGCAACGTTCTTGCCATTGCTACAGCCATCGTGGTGTGTCGCTCGTCGTTTGGTATGGCTTCATTTAGCGCCGGTTCCCCAC\tF9FAFCFE7:EGGFGGFFGF@C9,8FEDEGGGFF9CEFFGGGGFCEGC@C@FGGGGGGGGGFGEGEGCG8E999CFGGCFFFFFGF9FGF@CGGFGDFDBFCEEGDD773=FE7:FCFGGGGGGGGBFFGFE,37DBEGG@FCC;DD9FC;@;=@2DF,<@>AEEFG*=C,CEC,*=*=CF9,3BC8EGE*52;+5;9+2++2**=CEEE**/0+07+3C9907CF6+1*<*557)1*)*)38G/))9C)<*5)11**)*0)00-6)**175(4759D(6,).-\n+CTGCGACACAATATTGGGCTCCCC\tab\t1\t@M02286:46:000000000-AEG'..b').0(344(*(,-((42(.(.)5A)9?0<4<7?+5( (,(,,(((->18:0,((02-92\n+CTGCGACACAATATTGGGCTCCCC\tab\t2\t@M02286:46:000000000-AEG11:1:1118:21309:6959 2:N:0:1\tAAGAGCAACTCGGTCGCCGCATACACTATTCTCAGAATGACTTGGTTGAGTACTCACCAGTCACAGAAAAGCATCTTACGGATGGCATGACAGTAAGAGAATTATGCAGTGCTGCCATAACCATGAGTGATAACACTGCGGCCAACTTACTTCTGACAACGATCGGAGGACCGAAGGAGCTAACCACTTTTTTGCACAACCTGGGGGATCATGTAACTCGCCTTGATCGTTGGGAACCGGAGCTGCATGAAGCCATCCCAAA-CGACGACCGTGACACCACCATT\tGGGGGGGGGGFGDCGGGGGGGGDGGGGGGGGFGGGFFFGGGCFGF@CCGFFGGGGGGGGFGGGGFGFCFDFFDEGE?@FFF7FC?FFGGGGGGGAF?FDFFGGGFGFEFGCEFGGGGGGGFEFFGGGGGFDCFGGGGFD@EGGGFEEEFE,EDDEFF5DD@FCFEE>CDCGGD>ED5CDFFGAFFGF@CEEFG4C:A:8?*//C5577?F;FACCFFF4D@EB33=675A1(7284<?9>FB9?B02)6<29???A(23+43 :<767(-(4C<((0)-()(())\n+CTGCGACACAATATTGGGCTCCCC\tab\t2\t@M02286:46:000000000-AEG11:1:2101:17733:13519 2:N:0:1\tAAGGGCACCCGGGTGGCGGCAACCATAATTCTAAGATTGCTTGGGTGGGGTATTACTTAGCACAGGAAAAGAATCTAAGGAAGGGCAGACAGGAAAGGAATTAATGCATTCCTGCATAACCAAGGAGGGAAAAACCGGCGGCCAACTTCCTTCGACAAAGGTAGGGGGGACCAAAGGGGCAAACCGCTTTTTTCCACAAATGGGGGCATAATGTAACGCCCCTTG-TTGTTGGGGACCGGGCCCGAAAGGACCCAAACCAACACGACGACCCTATCACAAAACGG\tB9,,,,:,,,+8+++8C+++++,,,,,,<,,,,,,:9,,,,C,B,B,++++,,,5,,5,,,<E,,,C4E,,+,,,<5,,:,,4B+8+++,,,+,B,,,,:,75,,,,7,,5,,,83,5C3<<+,+,8+++,@,,++++3@*1*1,*,2,,1,,***4:C,,1,,3***28E**;/2***)*1*9*)/9:*7*1*0***)1**1095))))0*00/**))()-80) 
**)0.439>>4*-(,)((-(()((.((,4((-((4(),((-((-(()()/).))(((4-\n+CTGCGACACAATATTGGGCTCCCC\tab\t2\t@M02286:46:000000000-AEG11:1:2103:23125:15471 2:N:0:1\tAAGAGCAACTCGGTCGCCGCATACACTATTCTCAGAATGACTTGGTTGAGTACTCACCAGTCACAGAAAAGCATCTTACGGATGGCATGACAGTAAGAGAATTATGCAGTGCTGCCATAACCATGAGTGATAACACTGCGGCCAACTTACTTCTGACAACGATCGGAGGACCGAAGGAGCTAACCGCTTTTTTGCACAACATGGGGGATCATGTAACTCGCCTTGATCGTTGGGAACCGGAGCTGAATGCAGACATAACAAA-CGCCGAGCGTTACACCCCGATG\tFGGGGGGFGGGGGGGGGGDGGGGGGGGCGGGGGGGGGGGGGGGGGGGGGGGGGGGCGFGGFGGGGGGGGGGGGGGGGGGGGCEGGGGGGGGGGGG9FGGGGGGGGGGFGGGEBEFGFBFFGGGGGFGGFGGGGGGGGGFDEEEGFGDDFGGGGGG,@EEFEFFGGG6CDEGFEC8?*,79CFCFGGGGDGGFGGGGFGGGF4*8*6=7>FD+788FC7:37GEA@<8F?5:?46C),<(9B90??>?4*)1..406B).5)2 4<((49>07()--4/4(2((-(\n+CTGCGACACAATATTGGGCTCCCC\tab\t2\t@M02286:46:000000000-AEG11:1:2104:14576:24265 2:N:0:1\tAAGAGCAACTCGGTCGCCGCATACACTATTCTCAGAATGACTTGGTTGAGTACTCACCAGTCACAGAAAAGCATCTTACGGATGGCATGACATTAAGAGAATTCTGCAGTGCTGCCATAACCATGAGTGATAACACTGCGGCCAACTTACTTCTGACAAAGATTGGAGGAACGAAGTGTATAACCACTTTTTTGCACAACATGCGGGATCGTGTAACTCGCTTTGTTCGTTGCTCACCGGAAGCGATAGCGACCATGCCACC-CGTACCGCGGTCAACACCGTTT\t<<F,;C6CFGGCFG7C:BCD7C:9,,CCFG,,,<,,,<E,<FC6<:DFCAEF,,,66CC<9CFGGGDGCE6=ECFA<F<A7:@FGC4ED,<A,9,:,C,,4,,,,,5@A,?F,55EFFFGGG@9,4=,9E;@DGGF9,E+++++3A@,6,=;DD,6@,@=,6,,7,,+6+,0+4@8+,,,+2257,3@,2,219@+4+*/406**)*02?*/;C)=>8+).**/)1)1):;4(++./26()(((((0)(.,)(0())(.64( ()--()()(.((,0).(((.((\n+CTGCGACACAATATTGGGCTCCCC\tab\t2\t@M02286:46:000000000-AEG11:1:2104:25265:19405 
2:N:0:1\tAAGAGCAACTCGGTCGCCGCATACACTATTCTCAGAATGACTTGGTTGAGTACTCACCAGTCACAGAAAAGCATCTTACGGATGGCATGACAGTAAGAGAATTATGCAGTGCTGCCATAACCATGAGTGATAACACTGCGGCCAACTTACTTCTGACAACGATCGGAGGACCGAACGAGCTAACCGCTTTTTTGCACAACCTGGGGGATCATGTAACTCGCCTTGATCGTTGGGAACCTGAGCTGAATGAAGCCCTACCAAC-CGACGAGCGTGACACCACGATG\tGFFGGGGGGFGGDEFGGGGGGGGGGGGGGGGFGGAD<FAFGDG9FGGCEGFFFGGCGGGCGGGGGGC@FEFGGFFGGGG?7FGGFGGGGGGGDGDFFFDFFFGDGGGFE<FEFDC@<FFEFFFGCCFAF9FCAFFGG?EFGGGGGGCFFGA@>EF;E?DFGGF?EE@+8@DD6E>*@C574=B:DEG>=*ADGBFGC=D4*;*;76C378;A6CACCDD59CC()+*.8*.)45*3>7((0,,54)/*)426))(.4:())( 4)--6073(8?((633(36(((\n+CTGCGACACAATATTGGGCTCCCC\tab\t2\t@M02286:46:000000000-AEG11:1:2119:22759:6520 2:N:0:1\tAAGAGCAACTCGGTCGCCGCATACACTATTCTCAGAATGACTTGGTTGAGTACTCACCAGTCACAGAAAAGCATCTTACGGATGGCATGACAGTAAGAGAATTATGCAGTGCTGCCATAACCATGAGTGATAACACTGCGGCCAACTTACTTCTGACCACGATCGGAGGACCGAAGGAGCTAACCGCTTTTTTGCACAACATGGGGGATCATGTAACTCGCCTTGATCGTTGGGAACCGGAGCTGACTGAAGCACTACCCAA-CGACTACCGTCACACCACGATT\tGGGFFCFGGGGEGGGGGGGGGGDFGGGGGGGGGGFGAEGGGFGGFFGGGGFGGGGGGGFGGGGGGGCFGDDFGGGGGGGGGGGEDFGGFGGGGGFGGFFGGGGGGFGFFGFCFFGFCD@?FDGGFFG4EFFFGGDGGFGGGEGGFFGFDA9EFGG=9,@F+8+@>E6@E68:E5*;7C>CCE@FFGD9?96:57DFGFCGBC8?3(:CD3;8:@:+8+;3CDE<+27:FF5,:5A,73*((170(4).*/4+,)(.:?B:<, 8<(-((((-((((*,7(4((((\n' |
b |
diff -r 13bcc2f459b0 -r af383638de66 tests/run.sh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tests/run.sh Mon Nov 23 18:44:23 2015 -0500 |
[ |
#!/usr/bin/env bash
# Functional test runner: each test runs one of the pipeline scripts on a fixture
# in this directory and diffs its output against the expected file.
if [ "x$BASH" = x ] || [ -z "$BASH_VERSINFO" ] || [ "$BASH_VERSINFO" -lt 4 ]; then
  echo "Error: Must use bash version 4+." >&2
  exit 1
fi
# get the name of the test directory
# ($0 is quoted so that paths containing spaces survive word splitting)
dirname=$(dirname "$0")

USAGE="Usage: \$ $(basename "$0") [options] [test1 [test2]]"


# Parse options, then run each test named on the command line (or all of them
# if none was named). With -q, the tests' stderr is discarded.
function main {

  do_all=true
  verbose=true
  # Run the requested tests
  for arg in "$@"; do
    # Check for options
    #TODO: option to keep test data at end instead of removing it.
    if [[ ${arg:0:1} == '-' ]]; then
      case "$arg" in
        -h)
          echo "$USAGE" >&2
          echo "Currently valid tests:" >&2
          list_tests >&2
          exit 1;;
        -q)
          verbose='';;
        -v)
          verbose=true;;
        *)
          echo "Unrecognized option \"$arg\"." >&2;;
      esac
      continue
    fi
    # Execute valid tests (if they're existing functions).
    # "$arg" is quoted so an argument containing whitespace is looked up as one name.
    if [[ $(type -t "$arg") == function ]]; then
      do_all=''
      if [[ $verbose ]]; then
        "$arg"
      else
        "$arg" 2>/dev/null
      fi
    else
      echo "Unrecognized test \"$arg\"." >&2
      do_all=''
    fi
  done

  # If no tests were specified in arguments, do all tests.
  if [[ $do_all ]]; then
    if [[ $verbose ]]; then
      all
    else
      all 2>/dev/null
    fi
  fi
}

# Print a message to stderr and abort with status 1.
function fail {
  echo "$@" >&2
  exit 1
}

# Print the names of all functions defined after $initial_declarations was captured,
# i.e. the tests.
function list_tests {
  while read -r declare f test; do
    # Filter out functions that aren't tests.
    # -x requires a whole-line match, so a test whose name is a prefix of a helper's
    # name is not filtered out by mistake.
    if echo "$initial_declarations" | grep -qxF 'declare -f '"$test"; then
      continue
    else
      echo "$test"
    fi
  done < <(declare -F)
}

# Capture a list of all functions defined before the tests, to tell which are actual functions
# and which are tests.
initial_declarations=$(declare -F)

########## Functional tests ##########

# Do all tests.
# NOTE(review): duplex_gapqual is defined below but not run here -- confirm whether
# that is intentional.
function all {
  align
  align_p3
  duplex
  duplex_qual
  stats_diffs
}

# align_families.py
function align {
  echo -e "\talign_families.py ::: families.in.tsv:"
  python "$dirname/../align_families.py" "$dirname/families.in.tsv" | diff -s - "$dirname/families.msa.tsv"
}

# align_families.py with 3 processes
function align_p3 {
  echo -e "\talign_families.py ::: families.in.tsv:"
  python "$dirname/../align_families.py" -p 3 "$dirname/families.in.tsv" | diff -s - "$dirname/families.msa.tsv"
}

# duplex.py defaults on toy data
function duplex {
  echo -e "\tduplex.py ::: families.msa.tsv:"
  python "$dirname/../duplex.py" "$dirname/families.msa.tsv" | diff -s - "$dirname/families.cons.fa"
  python "$dirname/../duplex.py" --incl-sscs "$dirname/families.msa.tsv" | diff -s - "$dirname/families.cons.incl-sscs.fa"
}

# duplex.py quality score consideration
function duplex_qual {
  echo -e "\tduplex.py ::: qual.msa.tsv:"
  python "$dirname/../duplex.py" --incl-sscs -q 20 "$dirname/qual.msa.tsv" | diff -s - "$dirname/qual.cons.fa"
}

# duplex.py quality score consideration on gapped alignments
function duplex_gapqual {
  echo -e "\tduplex.py ::: gapqual.msa.tsv:"
  python "$dirname/../duplex.py" --incl-sscs -q 25 "$dirname/gapqual.msa.tsv" | diff -s - "$dirname/gapqual.cons.fa"
}

# stats.py diffs subcommand
function stats_diffs {
  echo -e "\tstats.py diffs ::: gaps.msa.tsv:"
  python "$dirname/../stats.py" diffs "$dirname/gaps.msa.tsv" | diff -s - "$dirname/gaps-diffs.out.tsv"
}

main "$@"
b |
diff -r 13bcc2f459b0 -r af383638de66 tests/smoke.families.aligned.tsv --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tests/smoke.families.aligned.tsv Mon Nov 23 18:44:23 2015 -0500 |
b |
@@ -0,0 +1,6 @@ +GATTATTT ab 1 @one/1 CATTGGTCAATT nnacaeagasda +GATTATTT ab 2 @one/2 GTGTGATTAACC nnacaeagasda +TAAGCCCT ab 1 @two/1 GTTACGAAGTGG csacaeagadss +TAAGCCCT ab 2 @two/2 ACCCACCTCTTG csacaeagadss +GTGTGCGG ba 1 @three/1 ATCTTTGGGCTA aocasdefadva +GTGTGCGG ba 2 @three/2 CTCTTCATCAAT aocasdefadva |
b |
diff -r 13bcc2f459b0 -r af383638de66 tests/smoke.families.i0.tsv --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tests/smoke.families.i0.tsv Mon Nov 23 18:44:23 2015 -0500 |
b |
@@ -0,0 +1,3 @@ +GATTATTT ab @one/1 ACATTGGTCAATT knnacaeagasda @one/2 CGTGTGATTAACC knnacaeagasda +TAAGCCCT ab @two/1 TGTTACGAAGTGG ncsacaeagadss @two/2 AACCCACCTCTTG ncsacaeagadss +GTGTGCGG ba @three/1 CATCTTTGGGCTA naocasdefadva @three/2 ACTCTTCATCAAT naocasdefadva |
b |
diff -r 13bcc2f459b0 -r af383638de66 tests/smoke.families.tsv --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tests/smoke.families.tsv Mon Nov 23 18:44:23 2015 -0500 |
b |
@@ -0,0 +1,3 @@ +GATTATTT ab @one/1 CATTGGTCAATT nnacaeagasda @one/2 GTGTGATTAACC nnacaeagasda +TAAGCCCT ab @two/1 GTTACGAAGTGG csacaeagadss @two/2 ACCCACCTCTTG csacaeagadss +GTGTGCGG ba @three/1 ATCTTTGGGCTA aocasdefadva @three/2 CTCTTCATCAAT aocasdefadva |
b |
diff -r 13bcc2f459b0 -r af383638de66 tests/smoke_1.fq --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tests/smoke_1.fq Mon Nov 23 18:44:23 2015 -0500 |
b |
@@ -0,0 +1,13 @@ +@one/1 +GATTACATTGGTCAATT ++ +asdlknnacaeagasda +@two/1 +TAAGTGTTACGAAGTGG ++ +sdlkncsacaeagadss +@three/1 +GCGGCATCTTTGGGCTA ++ +aslknaocasdefadva + |
b |
diff -r 13bcc2f459b0 -r af383638de66 tests/smoke_2.fq --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tests/smoke_2.fq Mon Nov 23 18:44:23 2015 -0500 |
b |
@@ -0,0 +1,13 @@ +@one/2 +ATTTCGTGTGATTAACC ++ +asdlknnacaeagasda +@two/2 +CCCTAACCCACCTCTTG ++ +sdlkncsacaeagadss +@three/2 +GTGTACTCTTCATCAAT ++ +aslknaocasdefadva + |
b |
diff -r 13bcc2f459b0 -r af383638de66 tool_dependencies.xml --- a/tool_dependencies.xml Mon Nov 23 18:07:11 2015 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
b |
@@ -1,18 +0,0 @@ -<?xml version="1.0"?> -<tool_dependency> - <package name="mafft" version="7.221"> - <repository changeset_revision="97adbeef2294" name="mafft" owner="rnateam" toolshed="https://testtoolshed.g2.bx.psu.edu" /> - </package> - <package name="duplex" version="0.1"> - <install version="1.0"> - <actions> - <action type="download_by_url">https://github.com/makrutenko/duplex/archive/master.tar.gz</action> - <action type="shell_command">make</action> - <action type="set_environment"> - <environment_variable action="set_to" name="DUPLEX_DIR">$INSTALL_DIR</environment_variable> - <environment_variable action="prepend_to" name="PATH">$INSTALL_DIR</environment_variable> - </action> - </actions> - </install> - </package> -</tool_dependency> |
b |
diff -r 13bcc2f459b0 -r af383638de66 utils/get_msa.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/utils/get_msa.py Mon Nov 23 18:44:23 2015 -0500 |
[ |
#!/usr/bin/env python
"""Align a set of sequences with MAFFT and print them against their consensus.

The first output line is the consensus; every following line is one aligned
sequence in which bases identical to the consensus are replaced by "." and bases
below the quality threshold by a space.
"""
from __future__ import division
import os
import sys
import argparse
import tempfile
import subprocess
# The project modules live one directory above this script.
sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))
import consensus
import seqtools

OPT_DEFAULTS = {'format':'plain', 'qual':20, 'qual_format':'sanger'}
USAGE = "%(prog)s [options]"
DESCRIPTION = """"""


def main(argv):
    """Parse arguments, read and align the sequences, and print the alignment.

    Args:
        argv: The full argument vector, including the program name at index 0.

    Exits via fail() on invalid argument combinations or if the alignment fails.
    """
    parser = argparse.ArgumentParser(description=DESCRIPTION)
    parser.set_defaults(**OPT_DEFAULTS)

    parser.add_argument('seqs', metavar='sequence', nargs='*',
        help='The alignment.')
    parser.add_argument('-i', '--input',
        help='Provide the sequences in this input file instead of as command-line arguments. '
             'Give "-" to use stdin.')
    parser.add_argument('-f', '--format', choices=('plain', 'duplex'),
        help='Input format. "plain" is a simple list of the sequences, one on each line. "duplex" is '
             'the 8-column format of the family-sorted read data from the duplex pipeline. It must be '
             'the read pairs from a single alpha/beta barcode combination (both the alpha-beta and '
             'beta-alpha strands). If "duplex" is given, you must also specify which of the four '
             'possible alignments to output with --mate and --order.')
    parser.add_argument('-m', '--mate', type=int, choices=(1, 2))
    parser.add_argument('-o', '--order', choices=('ab', 'ba'))
    parser.add_argument('-F', '--qual-format', choices=('sanger',))
    parser.add_argument('-q', '--qual', type=int,
        help='Quality threshold: Default: %(default)s')

    args = parser.parse_args(argv[1:])

    # Convert the numeric threshold to its quality character (offset 33 for "sanger").
    qual_thres = ' '
    if args.qual_format == 'sanger':
        qual_thres = chr(args.qual + 33)
    else:
        fail('Error: Unsupported FASTQ quality format "{}".'.format(args.qual_format))
    # Check arguments.
    if not (args.seqs or args.input):
        fail('Error: You must provide sequences either in a file with --input or as arguments.')
    elif args.seqs and args.input:
        fail('Error: You cannot provide sequences in both a file and command-line arguments.')
    if args.format == 'duplex' and not (args.mate and args.order):
        fail('Error: If the --format is duplex, you must specify a --mate and --order.')

    # Read input.
    quals = []
    if args.input:
        if args.format == 'plain':
            if args.input == '-':
                seqs = [line.strip() for line in sys.stdin]
            else:
                with open(args.input) as infile:
                    seqs = [line.strip() for line in infile]
        elif args.format == 'duplex':
            if args.input == '-':
                (seqs, quals) = parse_duplex(sys.stdin, args.mate, args.order)
            else:
                with open(args.input) as infile:
                    (seqs, quals) = parse_duplex(infile, args.mate, args.order)
    else:
        seqs = args.seqs

    align = make_msa(seqs)
    # make_msa() returns None when MAFFT is missing or exits with an error; stop with a
    # clear message instead of crashing further down.
    if align is None:
        fail('Error: Could not align the sequences with MAFFT.')
    if quals:
        quals = seqtools.transfer_gaps_multi(quals, align, gap_char_out=' ')
    cons = consensus.get_consensus(align, quals, qual_thres=qual_thres, gapped=True)

    output = format_alignment(cons, align, quals, qual_thres=ord(qual_thres))

    for seq in output:
        # Parenthesized single argument: valid as both a Python 2 statement and a
        # Python 3 function call.
        print(seq)


def parse_duplex(infile, mate, order):
    """Extract the sequences and qualities of one mate of one strand order.

    Args:
        infile: Iterable of lines with 8 tab-separated columns:
            barcode, order, name1, seq1, qual1, name2, seq2, qual2.
        mate: 1 or 2, selecting seq1/qual1 or seq2/qual2.
        order: Only lines whose order column equals this value are used.

    Returns:
        A tuple (seqs, quals) of parallel lists.

    Raises:
        ValueError: If a non-blank line does not have exactly 8 columns.
    """
    seqs = []
    quals = []
    for line in infile:
        line = line.rstrip('\r\n')
        # Tolerate blank lines (e.g. a trailing newline at the end of the file).
        if not line:
            continue
        (bar, this_order, name1, seq1, qual1, name2, seq2, qual2) = line.split('\t')
        if this_order == order:
            if mate == 1:
                seqs.append(seq1)
                quals.append(qual1)
            elif mate == 2:
                seqs.append(seq2)
                quals.append(qual2)
    return seqs, quals


def make_msa(seqs):
    """Perform a multiple sequence alignment on a set of sequences.
    Uses MAFFT.

    The sequences are written to a temporary FASTA file (named 1, 2, 3, ...), which is
    always removed again, whether or not the alignment succeeds.

    Returns:
        The list of aligned sequences, or None if MAFFT could not be run or exited
        with an error.
    """
    #TODO: Replace with tempfile.mkstemp()?
    family_file = tempfile.NamedTemporaryFile('w', delete=False, prefix='msa.')
    try:
        with family_file:
            for i, seq in enumerate(seqs, 1):
                family_file.write('>{}\n'.format(i))
                family_file.write(seq+'\n')
        with open(os.devnull, 'w') as devnull:
            try:
                command = ['mafft', '--nuc', '--quiet', family_file.name]
                # universal_newlines=True returns text rather than bytes, which
                # read_fasta() requires.
                output = subprocess.check_output(command, stderr=devnull, universal_newlines=True)
            except (OSError, subprocess.CalledProcessError):
                return None
    finally:
        # Runs on success, on the early "return None" and on unexpected errors alike.
        os.remove(family_file.name)
    return read_fasta(output)


def read_fasta(fasta):
    """Quick and dirty FASTA parser. Return only the list of sequences (no names).
    Warning: Reads the entire contents of the file into memory at once.

    Args:
        fasta: The FASTA-formatted text as a single string.

    Returns:
        A list with one string per record; records with no sequence are omitted.
    """
    sequences = []
    chunks = []
    for line in fasta.splitlines():
        if line.startswith('>'):
            if chunks:
                sequences.append(''.join(chunks))
            chunks = []
            continue
        chunk = line.strip()
        # Only keep non-empty pieces so that "chunks" is empty exactly when the
        # record has no sequence.
        if chunk:
            chunks.append(chunk)
    if chunks:
        sequences.append(''.join(chunks))
    return sequences


def format_alignment(cons, seqs, quals=(), qual_thres=32, id_char='.'):
    """Render aligned sequences relative to their consensus.

    Args:
        cons: The consensus sequence, at least as long as each aligned sequence.
        seqs: The aligned sequences.
        quals: Optional quality strings parallel to seqs (same lengths). If empty,
            no quality masking is done.
        qual_thres: Ordinal of the lowest acceptable quality character.
        id_char: Character printed where a base equals the consensus.

    Returns:
        A list of strings: the upper-cased consensus followed by one line per sequence,
        where a non-gap base below the quality threshold becomes a space, a base equal
        to the consensus becomes id_char, and anything else is the upper-cased base.
    """
    # Compare against the same upper-cased consensus that is output, so a lower-case
    # consensus does not make every base look like a mismatch.
    cons = cons.upper()
    output = [cons]
    for i, seq in enumerate(seqs):
        out_bases = []
        for j, seq_base in enumerate(seq.upper()):
            if quals and seq_base != '-' and ord(quals[i][j]) < qual_thres:
                out_bases.append(' ')
            elif cons[j] == seq_base:
                out_bases.append(id_char)
            else:
                out_bases.append(seq_base)
        output.append(''.join(out_bases))
    return output


def fail(message):
    """Write ``message`` to stderr and exit with status 1."""
    sys.stderr.write(message+"\n")
    sys.exit(1)


if __name__ == '__main__':
    sys.exit(main(sys.argv))
b |
diff -r 13bcc2f459b0 -r af383638de66 utils/getreads.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/utils/getreads.py Mon Nov 23 18:44:23 2015 -0500 |
[ |
"""A simple parser for FASTA, FASTQ, SAM, etc. Create generators that just return the read name and
sequence.
All format parsers follow this API:
    with open('sequence.fasta') as fasta:
        for read in getreads.getparser(fasta, filetype='fasta'):
            print("There is a sequence with this FASTA identifier: "+read.id)
            print("Its sequence is "+read.seq)
The properties of Read are:
    name: The entire FASTA header line, SAM column 1, etc.
    id:   The first whitespace-delimited part of the name.
    seq:  The sequence.
    qual: The quality scores (unless the format is FASTA).
"""


def getparser(filehandle, filetype='fasta'):
    """Return an iterable reader for `filehandle`.

    `filetype` must be one of 'fasta', 'fastq', 'sam', or 'tsv'.
    Raises ValueError for any other filetype."""
    if filetype == 'fasta':
        return FastaReader(filehandle)
    elif filetype == 'fastq':
        return FastqReader(filehandle)
    elif filetype == 'sam':
        return SamReader(filehandle)
    elif filetype == 'tsv':
        return TsvReader(filehandle)
    else:
        raise ValueError('Illegal argument: filetype=\'{}\''.format(filetype))


class FormatError(Exception):
    """Raised when the input does not follow the expected file format."""
    def __init__(self, message=None):
        if message:
            Exception.__init__(self, message)


class Read(object):
    """A single sequence record. All attributes are strings and default to ''."""
    def __init__(self, name='', seq='', id_='', qual=''):
        self.name = name
        self.seq = seq
        self.id = id_
        self.qual = qual


def _new_read(name):
    """Create a Read with `name`, deriving its id from the first whitespace-delimited word.
    The id stays '' when the name is empty or consists only of whitespace."""
    read = Read(name=name)
    words = name.split()
    if words:
        read.id = words[0]
    return read


class Reader(object):
    """Base class for all other parsers.
    Subclasses implement parser() as a generator of Read objects."""
    def __init__(self, filehandle):
        self.filehandle = filehandle

    def __iter__(self):
        return self.parser()

    def parser(self):
        raise NotImplementedError('Reader subclasses must implement parser().')


class TsvReader(Reader):
    """A parser for a simple tab-delimited format.
    Column 1: name
    Column 2: sequence
    Column 3: quality scores (optional)
    Lines with fewer than 2 columns are skipped."""
    def parser(self):
        for line in self.filehandle:
            fields = line.rstrip('\r\n').split('\t')
            if len(fields) < 2:
                continue
            read = _new_read(fields[0])
            read.seq = fields[1]
            if len(fields) >= 3:
                read.qual = fields[2]
            yield read


class SamReader(Reader):
    """A simple SAM parser.
    Assumptions:
    Lines whose first field starts with "@" and is 3 characters long (e.g. "@SQ") are headers.
    All others are alignments.
    All alignment lines have 11 or more fields. Other lines will be skipped.
    """
    def parser(self):
        for line in self.filehandle:
            fields = line.split('\t')
            if len(fields) < 11:
                continue
            # Skip headers.
            if fields[0].startswith('@') and len(fields[0]) == 3:
                continue
            read = _new_read(fields[0])
            read.seq = fields[9]
            # Only the 11th column can carry the line ending if there are exactly 11 fields.
            read.qual = fields[10].rstrip('\r\n')
            yield read


class FastaReader(Reader):
    """A simple FASTA parser that reads one sequence at a time into memory.
    Records without any sequence are not returned."""
    def parser(self):
        read = Read()
        for line_raw in self.filehandle:
            line = line_raw.strip()
            # Allow empty lines.
            if not line:
                continue
            if line.startswith('>'):
                if read.seq:
                    yield read
                read = _new_read(line[1:])  # remove ">"
            else:
                read.seq += line
        # End of input: emit the final record. Falling off the end finishes the generator;
        # an explicit "raise StopIteration" here would become a RuntimeError under PEP 479.
        if read.seq:
            yield read


class FastqReader(Reader):
    """A simple FASTQ parser. Can handle multi-line sequences, though.
    Raises FormatError when a record does not start with an "@" header line."""
    def parser(self):
        read = Read()
        state = 'header'
        for line_raw in self.filehandle:
            line = line_raw.strip()
            # Allow empty lines.
            if not line:
                continue
            if state == 'header':
                if not line.startswith('@'):
                    raise FormatError('line state = "header" but line does not start with "@"')
                if read.seq:
                    yield read
                read = _new_read(line[1:])  # remove '@'
                state = 'sequence'
            elif state == 'sequence':
                if line.startswith('+'):
                    state = 'plus'
                else:
                    read.seq += line
            elif state == 'plus' or state == 'quality':
                state = 'quality'
                # Never take more quality characters than there are bases.
                togo = len(read.seq) - len(read.qual)
                read.qual += line[:togo]
                # The end of the quality lines is when we have a quality string as long as the
                # sequence. Going by length (not a leading "@") is what allows quality lines that
                # begin with "@".
                if len(read.qual) >= len(read.seq):
                    state = 'header'
        # End of input: emit the final record (see the PEP 479 note in FastaReader).
        if read.seq:
            yield read
b |
diff -r 13bcc2f459b0 -r af383638de66 utils/getreads.pyc |
b |
Binary file utils/getreads.pyc has changed |
b |
diff -r 13bcc2f459b0 -r af383638de66 utils/msa2fa.awk --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/utils/msa2fa.awk Mon Nov 23 18:44:23 2015 -0500 |
b |
# A quick script to convert the .msa.tsv output of sscs.py back into FASTA format.
#
# Input columns (tab-delimited): $1 is used as the record name, $2 is compared against the
# literal "CONSENSUS", and $3 is printed as the sequence.
# Output: for every CONSENSUS row, a ">name.mate:pairs" header followed by the sequence.

BEGIN {
  FS = "\t";
  OFS = "\t";
}

$2 == "CONSENSUS" {
  # A CONSENSUS row whose $1 repeats the previous CONSENSUS row's $1 is labeled mate 2;
  # otherwise it is labeled mate 1.
  if ($1 == last) {
    mate = 2;
  } else {
    mate = 1;
  }
  # `pairs` is the number of non-CONSENSUS rows seen since the previous CONSENSUS row.
  # NOTE(review): if each CONSENSUS row comes BEFORE the reads of its own family, this count
  # belongs to the previous family (and is empty/0 for the first one) -- confirm the row order
  # of the .msa.tsv format.
  printf(">%s.%d:%d\n", $1, mate, pairs);
  print $3;
  pairs = 0;
  last = $1;
}

$2 != "CONSENSUS" {
  pairs++;
}
b |
diff -r 13bcc2f459b0 -r af383638de66 utils/outconv.awk --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/utils/outconv.awk Mon Nov 23 18:44:23 2015 -0500 |
[ |
# Filter a FASTA-like file down to the records of one mate.
#
# `target` is never assigned in this script, so it must be supplied by the caller
# (e.g. awk -v target=1 -f outconv.awk input.fa).
#
# Header lines (starting with ">") are split on "."; the text after the first "." is split on
# whitespace into the mate (first word) and a second word. For matching records the header is
# rewritten as "<text before the first '.'> <second word>".

substr($0, 1, 1) == ">" {
  header = $0
  split(header, fields1, ".")
  split(fields1[2], fields2)
  mate = fields2[1]
  if (mate == target) {
    print fields1[1]" "fields2[2]
  }
  next
}
{
  # Non-header (sequence) lines belong to the most recent header. Emit them only for the
  # requested mate. The body of this branch was empty, so no sequence line was ever printed.
  if (mate == target) {
    print
  }
}
b |
diff -r 13bcc2f459b0 -r af383638de66 utils/precheck.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/utils/precheck.py Mon Nov 23 18:44:23 2015 -0500 |
[ |
@@ -0,0 +1,123 @@ +#!/usr/bin/env python +from __future__ import division +import sys +import argparse +import getreads + +OPT_DEFAULTS = {'tag_len':12, 'const_len':5, 'min_reads':3, 'human':True} +USAGE = "%(prog)s [options]" +DESCRIPTION = """Print statistics on the raw duplex sequencing reads.""" +EPILOG = """Warning: This tracks all barcodes in a dict, so it can take a lot of memory. A guideline +is about 200 bytes per (12bp) tag. For example, it took about 800MB for a 10GB, 32 million read +dataset with an average of 4 pairs per barcode.""" + + +def main(argv): + + parser = argparse.ArgumentParser(description=DESCRIPTION, epilog=EPILOG) + parser.set_defaults(**OPT_DEFAULTS) + + parser.add_argument('infile1', metavar='reads_1.fq', + help='The first mates in the read pairs.') + parser.add_argument('infile2', metavar='reads_2.fq', + help='The second mates in the read pairs.') + parser.add_argument('-t', '--tag-length', dest='tag_len', type=int) + parser.add_argument('-c', '--constant-length', dest='const_len', type=int) + parser.add_argument('-C', '--computer', dest='human', action='store_false', + help='Print results in computer-readable format. This will be a tab-delimited version of the ' + 'output, in the same order, but with two columns: stat name and value.') + parser.add_argument('-m', '--min-reads', type=int, + help='The minimum number of reads required in each single-stranded family. 
Default: ' + '%(default)s') + parser.add_argument('-v', '--validate', action='store_true', + help='Check the id\'s of the reads to make sure the correct reads are mated into pairs (the ' + 'id\'s of mates must be identical).') + + args = parser.parse_args(argv[1:]) + + with open(args.infile1) as infileh1: + with open(args.infile2) as infileh2: + barcodes = read_files(infileh1, infileh2, tag_len=args.tag_len, validate=args.validate) + + stats = get_stats(barcodes, tag_len=args.tag_len, min_reads=args.min_reads) + print_stats(stats, min_reads=args.min_reads, human=args.human) + + +def read_files(infileh1, infileh2, tag_len=12, validate=False): + reader1 = getreads.getparser(infileh1, filetype='fastq').parser() + reader2 = getreads.getparser(infileh2, filetype='fastq').parser() + barcodes = {} + while True: + try: + read1 = reader1.next() + read2 = reader2.next() + except StopIteration: + break + if validate and read1.id != read2.id: + raise getreads.FormatError('Read pair mismatch: "{}" and "{}"'.format(read1.id, read2.id)) + alpha = read1.seq[:tag_len] + beta = read2.seq[:tag_len] + barcode = alpha + beta + if barcode in barcodes: + barcodes[barcode] += 1 + else: + barcodes[barcode] = 1 + return barcodes + + +def get_stats(barcodes, tag_len=12, min_reads=3): + passed_sscs = 0 + duplexes = 0 + passed_duplexes = 0 + singletons = 0 + total_pairs = 0 + for barcode, count in barcodes.items(): + total_pairs += count + if count == 1: + singletons += 1 + if count >= min_reads: + passed_sscs += 1 + alpha = barcode[:tag_len] + beta = barcode[tag_len:] + reverse = beta + alpha + if reverse in barcodes: + duplexes += 1 + if count >= min_reads and barcodes[reverse] >= min_reads: + passed_duplexes += 1 + # Each full duplex ends up being counted twice. Halve it to get the real total. 
+ stats = { + 'pairs':total_pairs, + 'barcodes':len(barcodes), + 'avg_pairs':total_pairs/len(barcodes), + 'singletons':singletons, + 'duplexes':duplexes//2, + 'passed_sscs':passed_sscs*2, + 'passed_duplexes':passed_duplexes, + } + return stats + + +def print_stats(stats, min_reads=3, human=True): + all_stats = stats.copy() + all_stats.update({'min_reads':min_reads}) + if human: + print """Total read pairs:\t{pairs} +Unique barcodes:\t{barcodes} +Avg # of read pairs per barcode:\t{avg_pairs} +Singletons:\t{singletons} +Barcodes with reverse (other strand) present:\t{duplexes} +Passing threshold of {min_reads} reads per single-strand consensus: +\tSingle-strand consensus sequences:\t{passed_sscs} +\tDuplex consensus sequences:\t{passed_duplexes}""".format(**all_stats) + else: + for stat in ('pairs', 'barcodes', 'avg_pairs', 'singletons', 'duplexes', 'min_reads', + 'passed_sscs', 'passed_duplexes'): + print '{}\t{}'.format(stat, all_stats[stat]) + + +def fail(message): + sys.stderr.write(message+"\n") + sys.exit(1) + +if __name__ == '__main__': + sys.exit(main(sys.argv)) |
b |
diff -r 13bcc2f459b0 -r af383638de66 utils/stats.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/utils/stats.py Mon Nov 23 18:44:23 2015 -0500 |
[ |
@@ -0,0 +1,154 @@ +#!/usr/bin/env python +from __future__ import division +import os +import sys +import math +import argparse +sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) +import seqtools +import swalign + +INF = float('inf') +STATS = ('diffs', 'diffs-binned', 'seqlen', 'strand') +OPT_DEFAULTS = {'bins':10, 'probes':'', 'thres':0.75} +USAGE = "%(prog)s [options]" +DESCRIPTION = """""" + + +def main(argv): + + parser = argparse.ArgumentParser(description=DESCRIPTION) + parser.set_defaults(**OPT_DEFAULTS) + + parser.add_argument('stats', + help='The type of statistics to compute and print. Give a comma-separated list of stat names, ' + 'choosing from "{}".'.format('", "'.join(STATS))) + parser.add_argument('infile', metavar='read-families.msa.tsv', nargs='?', + help='The --msa output of sscs.py. Will read from stdin if not provided.') + parser.add_argument('-b', '--bins', type=int, + help='The number of bins to segment reads into when doing "diffs-binned".') + parser.add_argument('-p', '--probes', + help='Sequence excerpts from the sense strand. Required for "strand" statistic. ' + 'Comma-separated.') + parser.add_argument('-t', '--thres', type=int, + help='Alignment identity threshold (in fraction, not decimal). Default: %(default)s') + + args = parser.parse_args(argv[1:]) + + stats = args.stats.split(',') + for stat in stats: + if stat not in STATS: + fail('Error: invalid statistic "{}". 
Must choose one of "{}".'.format(stat, '", "'.join(STATS))) + if 'strand' in stats and not args.probes: + fail('Error: must provide a probe if requesting "strand" statistic.') + + if args.infile: + infile = open(args.infile) + else: + infile = sys.stdin + + family = [] + consensus = None + barcode = None + for line in infile: + fields = line.rstrip('\r\n').split('\t') + if len(fields) != 3: + continue + (this_barcode, name, seq) = fields + if fields[1] == 'CONSENSUS': + if family and consensus: + process_family(stats, barcode, consensus, family, args) + barcode = this_barcode + consensus = seq + family = [] + else: + family.append(seq) + if family and consensus: + process_family(stats, barcode, consensus, family, args) + + if infile is not sys.stdin: + infile.close() + + +#TODO: Maybe print the number of N's in the consensus? +def process_family(stats, barcode, consensus, family, args): + # Compute stats requiring the whole family at once. + for stat in stats: + if stat == 'diffs': + diffs = seqtools.get_diffs_frac_simple(consensus, family) + elif stat == 'diffs-binned': + diffs_binned = seqtools.get_diffs_frac_binned(consensus, family, args.bins) + elif stat == 'strand': + probes = args.probes.split(',') + strand = get_strand(consensus, probes, args.thres) + # Print the requested stats for each read. + # Columns: barcode, [stat columns], read sequence. 
+ for (i, read) in enumerate(family): + sys.stdout.write(barcode+'\t') + for stat in stats: + if stat == 'diffs': + sys.stdout.write('{}\t'.format(round_sig_figs(diffs[i], 3))) + elif stat == 'diffs-binned': + if diffs_binned is None: + sys.stdout.write('\t' * args.bins) + else: + for diff in diffs_binned[i]: + sys.stdout.write(str(round_sig_figs(diff, 3))+'\t') + elif stat == 'seqlen': + sys.stdout.write('{}\t'.format(len(read))) + elif stat == 'strand': + sys.stdout.write('{}\t'.format(strand)) + print read.upper() + + +def get_strand(seq, probes, thres): + """Determine which strand the sequence comes from by trying to align probes from the sense strand. + Returns 'sense', 'anti', or None. + Algorithm: This tries each probe in both directions. + If at least one of the alignments has an identity above the threshold, a vote is cast for the + direction with a higher identity. + If the votes that were cast are unanimous for one direction, that strand is returned. + Else, return None.""" + votes = [] + for probe in probes: + alignment = swalign.smith_waterman(seq, probe) + sense_id = alignment.matches/len(probe) + alignment = swalign.smith_waterman(seq, seqtools.get_revcomp(probe)) + anti_id = alignment.matches/len(probe) + # print '{}: sense: {}, anti: {}'.format(probe, sense_id, anti_id) + if sense_id > thres or anti_id > thres: + if sense_id > anti_id: + votes.append('sense') + else: + votes.append('anti') + strand = None + for vote in votes: + if strand: + if strand != vote: + return None + else: + strand = vote + return strand + + +def round_sig_figs(n, figs): + if n == 0: + return n + elif n < 0: + n = -n + sign = -1 + elif n > 0: + sign = 1 + elif math.isnan(n) or n == INF: + return n + magnitude = int(math.floor(math.log10(n))) + return sign * round(n, figs - 1 - magnitude) + + +def fail(message): + sys.stderr.write(message+"\n") + sys.exit(1) + + +if __name__ == '__main__': + sys.exit(main(sys.argv)) |
b |
diff -r 13bcc2f459b0 -r af383638de66 utils/subsample.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/utils/subsample.py Mon Nov 23 18:44:23 2015 -0500 |
[ |
@@ -0,0 +1,55 @@ +#!/usr/bin/env python +from __future__ import division +import sys +import random +import argparse + +OPT_DEFAULTS = {'fraction':0.1, 'seed':1} +USAGE = "%(prog)s [options]" +DESCRIPTION = """""" + +def main(argv): + + parser = argparse.ArgumentParser(description=DESCRIPTION) + parser.set_defaults(**OPT_DEFAULTS) + + parser.add_argument('infile', metavar='read-families.tsv', nargs='?', + help='The input reads, sorted into families.') + parser.add_argument('-f', '--fraction', type=float, + help='Fraction of families to output. Default: %(default)s') + parser.add_argument('-s', '--seed', type=int, + help='Random number generator seed. Default: %(default)s') + + args = parser.parse_args(argv[1:]) + + random.seed(args.seed) + + if args.infile: + infile = open(args.infile) + else: + infile = sys.stdin + + family = [] + last_barcode = None + for line in infile: + fields = line.rstrip('\r\n').split('\t') + if not fields: + continue + barcode = fields[0] + if barcode != last_barcode: + if random.random() <= args.fraction: + sys.stdout.write(''.join(family)) + family = [] + family.append(line) + last_barcode = barcode + + if infile is not sys.stdin: + infile.close() + + +def fail(message): + sys.stderr.write(message+"\n") + sys.exit(1) + +if __name__ == '__main__': + sys.exit(main(sys.argv)) |