Mercurial > repos > devteam > fasta_concatenate_by_species
changeset 0:a63b082a26eb
Imported from capsule None
author | devteam |
---|---|
date | Mon, 19 May 2014 10:59:37 -0400 |
parents | |
children | d9f0a11824e9 |
files | fasta_concatenate_by_species.py fasta_concatenate_by_species.xml test-data/cf_maf2fasta.dat test-data/fasta_concatenate_out.fasta utils/__init__.py utils/maf_utilities.py utils/odict.py |
diffstat | 6 files changed, 947 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fasta_concatenate_by_species.py Mon May 19 10:59:37 2014 -0400 @@ -0,0 +1,39 @@ +#!/usr/bin/env python +#Dan Blankenberg +""" +Takes a Multiple Alignment FASTA file and concatenates +sequences for each species, resulting in one sequence +alignment per species. +""" + +import sys, tempfile +from utils.maf_utilities import iter_fasta_alignment +from utils.odict import odict + +def __main__(): + input_filename = sys.argv[1] + output_filename = sys.argv[2] + species = odict() + cur_size = 0 + for components in iter_fasta_alignment( input_filename ): + species_not_written = species.keys() + for component in components: + if component.species not in species: + species[component.species] = tempfile.TemporaryFile() + species[component.species].write( "-" * cur_size ) + species[component.species].write( component.text ) + try: + species_not_written.remove( component.species ) + except ValueError: + #this is a new species + pass + for spec in species_not_written: + species[spec].write( "-" * len( components[0].text ) ) + cur_size += len( components[0].text ) + out = open( output_filename, 'wb' ) + for spec, f in species.iteritems(): + f.seek( 0 ) + out.write( ">%s\n%s\n" % ( spec, f.read() ) ) + out.close() + +if __name__ == "__main__" : __main__()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fasta_concatenate_by_species.xml Mon May 19 10:59:37 2014 -0400 @@ -0,0 +1,72 @@ +<tool id="fasta_concatenate0" name="Concatenate" version="0.0.0"> + <description>FASTA alignment by species</description> + <command interpreter="python">fasta_concatenate_by_species.py $input1 $out_file1</command> + <inputs> + <param name="input1" type="data" format="fasta" label="FASTA alignment"/> + </inputs> + <outputs> + <data name="out_file1" format="fasta"/> + </outputs> + <tests> + <test> + <param name="input1" value="cf_maf2fasta.dat" /> + <output name="out_file1" file="fasta_concatenate_out.fasta" /> + </test> + </tests> + <help> + +**What it does** + +This tools attempts to parse FASTA headers to determine the species for each sequence in a multiple FASTA alignment. +It then linearly concatenates the sequences for each species in the file, creating one sequence per determined species. + +------- + +**Example** + +Starting FASTA:: + + >hg18.chr1(+):10016339-10016341|hg18_0 + GT + >panTro2.chr1(+):10195380-10195382|panTro2_0 + GT + >rheMac2.chr1(+):13119747-13119749|rheMac2_0 + GT + >mm8.chr4(-):148269679-148269681|mm8_0 + GT + >canFam2.chr5(+):66213635-66213637|canFam2_0 + GT + + >hg18.chr1(-):100323677-100323679|hg18_1 + GT + >panTro2.chr1(-):101678671-101678673|panTro2_1 + GT + >rheMac2.chr1(-):103154011-103154013|rheMac2_1 + GT + >mm8.chr3(+):116620616-116620618|mm8_1 + GT + >canFam2.chr6(+):52954092-52954094|canFam2_1 + GT + + + +becomes:: + + >hg18 + GTGT + >panTro2 + GTGT + >rheMac2 + GTGT + >mm8 + GTGT + >canFam2 + GTGT + + +.. class:: warningmark + + This tool will only work properly on files with Galaxy style FASTA headers. + +</help> +</tool> \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/cf_maf2fasta.dat Mon May 19 10:59:37 2014 -0400 @@ -0,0 +1,134 @@ +>hg17.chr7(+):127471195-127471526|hg17_0 +gtttgccatcttttgctgctctagggaatccagcagctgtcaccatgtaaacaagcccaggctagaccaGTTACCCTCATC---ATCTTAGCTGATAGCCAGCCAGCCACCACAGGCAtgagtcaggccatattgctggacccacagaattatgagctaaataaatagtcttgggttaagccactaagttttaggcatagtgtgttatgtaTCTCACAAACATATAAGACTGTGTGTTTGTTGACTGGAGGAAGAGATGCTATAAAGACCACCTTTTAAAACTTCCCAAATACTGCCACTGATGTCCTGATGGAGG-------------------------------------------------------TATGAA---------AACATCCACTAA +>panTro1.chr6(+):129885076-129885407|panTro1_0 +gtttgccatcttttgctgctcttgggaatccagcagctgtcaccatgtaaacaagcccaggctagaccaGTTACCCTCATC---ATCTTAGCTGATAGCCAGCCAGCCACCACAGGCAtgagtcaggccatattgctggacccacagaattatgagctaaataaatagtcttgggttaagccactaagttttaggcatagtgtgttatgtaTCTCACAAACATATAAGACTGTGTGTTTGTTGACTGGAGGAAGAGATGCTATAAAGACCACCTTTTGAAACTTCCCAAATACTGCCACTGATGTCCTGATGGAGG-------------------------------------------------------TATGAA---------AACATCCACTAA +>rheMac2.chr3(+):165787989-165788319|rheMac2_0 +gcttgccatcttttgatgctcttgggaatccagcagctgtcaccat-taaacaagcccaggctagaccaGTTACCCTCATC---ATCTTAGCTGATAGCCAGCCAGCCACCATAGGCAtgagtcaggccatagtgctggacccacagaattatgagctaaataagtagtgttgggttaagtcactaagttttaggcatagtgtgttatgtagcTCACAAACATATAAGACTGTGTGTTTTTTGACTGGAGGAAGAGATGCCATAAAGACCACCTTTTGAAACTTCTCAAATACTGCCATTGATGTGCTGATGGAGG-------------------------------------------------------TATGAA---------AACATCCACTAA +>rn3.chr4(+):56178191-56178473|rn3_0 +CTTCACTCTCATTTGCTGTT----------------CTGTCACTATGGAGACAAACACAGGCTAGCCCAGTTACTATCTTGATCACAGCAGCTGT----CAGCTAGCTGCCACTCACAGGAATAAGGCCATACCATT-GATCCACTGAACCTTGATCTAGGAATTTGGC----------------------TGGGGCCAGTTTGCGGTGTCACTCATGA--CTCTAAGATTGTGTGTTTG----CTCCAGGAAGAGACGGCAAGAGGATTACCTTTAAAAGGTTCGG-AGTCTAGCTGTAGACAGCCCAATGGG---------------------------------------------------------TATAAC---------AATACTCACTAA +>mm7.chr6(+):28984529-28984886|mm7_0 +CTCCACTCTCGTTTGCTGTT----------------CTGTCACCATGGAAACAAACG-AGGGTGGTCCAGTTACTATCTTG---ACTGCAGCTGG----CAGTCAGTTGCCACT--CAGGAATAAGGCTATGCCATT-GATCCACTGAACCGTGATCTGGAAACCTGGCTGTTGTTT-------CAAGCCTTGGGGCCAGTTTGCGGTGTTACTCATGA--CTCTAAGATCGTGTGCTTG----CTGCAGGAAGAGACAGCAAGGGGGTTACATTTAAAAAGCCCCC-AGTTTAGCTATAGGCAGGCCAACAGGTGTAAAAATACTCACTAGTAATGGGCTGAACTCATGGAGGTAGCATTAGTGAGACACTGTAACTGTTTTTTTAAAAATCACTAA + +>hg17.chr7(+):127471526-127471584|hg17_1 +AATTTGTGGTTTATTCATTTTTCATTATTTTGTTTAAGGAGGTCTATAGTGGAAGAGG +>mm7.chr6(+):28984886-28984940|mm7_1 +----AACGTTTCATTGATTGCTCATCATTTAAAAAAAGAAATTCCTCAGTGGAAGAGG +>rheMac2.chr3(+):165788319-165788377|rheMac2_1 +AATTTGTGGTTTATTTATTTTTCATTATTTTGTTTAAGGAGGTCTATAGTGGAAGAGG +>panTro1.chr6(+):129885407-129885465|panTro1_1 +AATTTGTGGTTTATTCGTTTTTCATTATTTTGTTTAAGGAGGTCTATAGTGGAAGAGG + +>hg17.chr7(+):127471584-127471688|hg17_2 +GAGATATTT-GGggaaatttt-gtatagactagctt--tcacgatgttagggaattattattgtgtgataatggtcttgcagttac-acagaaattcttcctta-ttttt +>panTro1.chr6(+):129885465-129885569|panTro1_2 +GAGACATTT-GGggaaatttt-gtatagactagctt--tcacgatgttagggagttattattgtgtgataatggtcttgcagttac-acagaaattcttcctta-ttttt +>rheMac2.chr3(+):165788377-165788482|rheMac2_2 +GAGATATTT-GGggaaatttg-gtatagactagctt--tcatgatgtaagggagttatttttgtgtgataatggccctacagttac-acagaaattcttccttatttttt +>canFam2.chr14(-):11090703-11090811|canFam2_2 +gagatattt-gggggaatttgaatgtagtgttgctcttttgtgatgctaagaaattataattgtctgatgatagtctcgtggttatgggggaaatgcttcctta-ttttt +>bosTau2.chr4(-):50243931-50244034|bosTau2_2 +-agacattg-ggtaaaattcaaatgcagactagctc----atgatgttaaagaattactcttgtgtggtaatggtcttgtgatagagatagaaatgcttcctta-ttttt +>rn3.chr4(+):56182200-56182295|rn3_2 +----TATTTGGGGGAAATATG-ATGTGCA----CTT--CCATGATCTTAAAGAATTGCTACTGTTTGATAGTGATCTTATGGTTAA-ATAAAAAAAAT--CTTA-GTTGT +>dasNov1.scaffold_256527(+):298-392|dasNov1_2 +GAGACATTT-GGAGAAATTTG-----------Aatt--tcatgatgttaaggaattacttttgtatgatgatggtcttgtggctat-gtagaatttcttccgtg-tttta + +>hg17.chr7(+):127471688-127471871|hg17_3 +tgggaagcaccaaagta-------gggataaaatgtcatgatgtgtgcaatacactttaaaatgtttttgccaaaa----------taattaa-------------------------tgaagc--aaatatg---gaaaataataattattaaatctaggt-----gatgggtatattgtagttcactatagtattgcacacttttctgtatgtttaaatttttcattta--------------------------aaaa- +>panTro1.chr6(+):129885569-129885752|panTro1_3 +tgggaaacaccaaagta-------gggataaaatgtcatgatgtgtgcaatacgctttaaaatatttttgccaaaa----------taattaa-------------------------tgaagc--aaatatg---gaaaataataattattaaatctaggt-----gatgggtatattgtagttcactatagtattgcacacttttctgtatgtttaaaattttcattta--------------------------aaaa- +>rheMac2.chr3(+):165788482-165788684|rheMac2_3 +tgggaagcacaaaagta-------gggataaaatgtcatgatgtgtacaatatgctttaaaatatttttgccaaaa----------taattaa-------------------------tgaagc--aaatatg---gaaaataataactgttaaatctaggt-----gttgggtatattgcagttcattatgttattgcacacttttctgtgtgtttaaaattttcatttaaaaatatgttttaaaaatg-------aaaa- +>rn3.chr4(+):56182295-56182489|rn3_3 +TAGAAAATACTCAAATATTTAGGGGCGTGACAATGTCACAGTGTCTGCAATTTGCTTTAAAGATTTTT-----AAA----------TATTTAAAAAAGTTTTAATAATTTTGAAAAACTGAAGCTACACTATG---GGAAGTGGTAATTGTTACATATGGGT-----AATAAGTAT-----AATTCGTTATATTAT-------TTTTC------TTAGAATTTTTCATTTG--------------------------AAAA- +>bosTau2.chr4(-):50243792-50243930|bosTau2_3 +agataaacacttaagtattta---aggatgaaacgccctgatgtttgtaatttgctttagaatattttagccaaaa----------gaattaa-------------------------tgatgc--aaatatg--caaaaagagta--cgttaaacctaa-----------------------------------------------------atttgCGATTttcattta--------------------------aaaa- +>canFam2.chr14(-):11090345-11090505|canFam2_3 +agacacaaactgaagtattta---aggatgaaatgtcatgatgtttgcaattggctttaaaatattttagccaaaa-----------agtaaa-------------------------tgaagc--AAATATG--GGAAGACAATAATCATTAAATCTAGGT-----GATGCATAC---------------------------TTTTCCATATGTTTGAAATTTTCATTTA--------------------------AAAA- +>dasNov1.scaffold_256527(+):393-625|dasNov1_3 +agacgcatgctgaagcatgta---aggataaaatgtcgtggtgtttgtaatttattctaaaacattttagccaaaaacaaataaataaataaa-------------------------tgaagc--aaatatgggggaaatgtttaattgttaaatctagatttaacacggtatataccgtgcttcattatactagtctctacttttccatgtgtttgaaattttCATTAAAATGTTTGTTTGTTGTCTGTTTTAATGAAAT + +>hg17.chr7(+):127471871-127471910|hg17_4 +actttgagctagacaccaggctatgagcta-ggagcatag +>rheMac2.chr3(+):165788684-165788723|rheMac2_4 +actttgagctagataccaggttatgagcta-ggagcatag +>panTro1.chr6(+):129885752-129885791|panTro1_4 +actttgagctagacaccaggctatgagcta-ggagcatag +>bosTau2.chr4(-):50243734-50243773|bosTau2_4 +tcttcgtgcaacgcacggggctatcaatgt-gggatacag +>canFam2.chr14(-):11090081-11090120|canFam2_4 +ACATCAtgctagatcctggactatgagctg-ggtatatag +>dasNov1.scaffold_256527(+):625-665|dasNov1_4 +CCTTTGTGCTAGCCACTGGGATGAAAGCTAGGGAACACAG + +>hg17.chr7(+):127471910-127472074|hg17_5 +caatgaccaa----------------------------------------------------------------------------------------------atagactcctaccaa-ctc-aaagaatgcacattctCTG-GGAAACATGTTTCCATTAGGAAGCCTCGAATGCAATGTGACTGTGGTCTCCAGGACCTG-TGTGATCCTGGCTTTTCCTGTTCCCTCCG---CATCATCACTGCAGGTGTGTTTTCCCAAG +>panTro1.chr6(+):129885791-129885955|panTro1_5 +caatgaccaa----------------------------------------------------------------------------------------------atagactcctaccaa-ctc-aaagaatgcacattctCTG-GGAAACATGTTTCCATTAGGAAGCCTCGAATGCAATGTGACTGTGGTCTCCAGGACATG-TGTGATCCTGGCTTTTCCTGTTCCCTCTG---CATCATCACTGCAGGTGTATTTTCCCAAG +>rheMac2.chr3(+):165788723-165788885|rheMac2_5 +caatgaccaa----------------------------------------------------------------------------------------------atagacccctaccga-ctc-aaagaatgtacattctTTG-GGAAACATGTTTCCATCAGAAAATCTCAAATGCAATGTGACTGGGGTCTCCAGGACCTG-TGTGAGCCTGGCTTTTCCTGTTCCCTCCA---CATCATCACTGCAGGTGTATTTTCCC--G +>mm7.chr6(+):28990714-28990875|mm7_5 +caaaaaccaa------------------------------------------------------------------------------------------------aaaaACCTATAGC-CTC-ACAGGGTGGGTTGTCTTTG-AGGAACATGCATCCGCTAGAAAGTCCCAAGTACACTATGACAGTTG--CCCAGGCCCCGCCTTAAACCTGGTTTTCCTGGTTTCTTTCA---CATCATTACCACGAATATATTTCCTCAAG +>rn3.chr4(+):56183448-56183705|rn3_5 +--ATGACCAATATACACTGTTTACATGTATAGCATTGTGAATGGAGACATAAAAAGATAATCTAGCTTTGTGCTAGGTAGGTGCTGAGCTCTTAACAGTGCTGGGCAGAAACCTATAAC-CTC-ACAGGGTGGGTTGTCTTTG-AGGAGCGTGCTAACCCTAGGAAGTCTCAAATACAATGTGATGGTTGCCCCCAGGCACCACCTTGAACCTGGTCTTCCTGGTTTCTTTCA---CACCATTACCACAAATACATTTTCTCAGG +>bosTau2.chr4(-):50243566-50243734|bosTau2_5 +atgtgaacaa---------------------------------------------------------------------------------------------aacggacccgtgtgggactcggcggagcacacagattttgcgggagCACGTTCCCGTTAGGAAGTCTCTGATGCAATACGACCGGTGCCTTCAGGACCTG-TG--AGGCTGACTTTCCTTA-CCCCTCCACACCATCATCAAGGCAGGTGTGATTTTCCAGG +>canFam2.chr14(-):11089913-11090081|canFam2_5 +cagtgaacaa---------------------------------------------------------------------------------------------aacagagccctgcagt-cttgatggagcacacaacctttg-gggaaCATGTTTCCATAAGAAAGTCTCCAATGTGATCTGA-TGGTGCCGCCAGGACCTA-TGTCAGCCTACCGTTCCATGTCCCCTCCACACCATCATCACTGCAGGTGTGTTTTCCCACA +>dasNov1.scaffold_256527(+):665-786|dasNov1_5 +CAGTGAGCAA-----------------------------------------------------------------------------------------------CAGCCTGGCTCCGT-CC--GGGGGCCGCTCAGCAGCTC-GGGAGCGTGGAGACG---GGAAGTCTGTCACGCGATGCG-----------CTGGGCCCG------------CTGTTCCCGCCCCCCTCC---CCCC----------------TTTCCCAAG + +>hg17.chr7(+):127472074-127472258|hg17_6 +TTTTAAA------CATTTACCTTCCCAGTGGCCTTGCGTCTAGAGGAATCCCTGTATAGTGGT-ACATGAATATAACACATAACAAA-AATCATCTCTATGGTGTGTGTTGTTCCTGGGGTTCAattcagcaaatttt-ccc-tgggcacccatgtgttcttggcactggaaaagtaccgggactgaaacagtt +>panTro1.chr6(+):129885955-129886139|panTro1_6 +TTTTAAA------CATTTACCTTCCCAGTGGCCTTGCGTCTAGAGGAATCCCTGTATAGTGGT-ACATGAATATAACACATAACAAA-AATCATCTCTATGGTGTGTGTTGTTCCTGGGGTTCAattcagcaaatttt-tcc-tgggcacccatgtgttcttggcactggaaaagtaccgggactgaaacagtt +>rheMac2.chr3(+):165788885-165789069|rheMac2_6 +TTTTAAA------CATTTACTCTCCCAGTAGCCTTGCATCTCGAGGAATCCCTGTATAGTGGT-ACATGAATATAACACATAACAAA-AATCATCTGTACGGTGTGTGTTGTTCCTGGGGTTCAattcagcaaatttt-tcc-tgggcacccctgtgttcttggcactggaaaagtaccaggacttaaatagta +>mm7.chr6(+):28990875-28991025|mm7_6 +TTTAAAGAAAGTACCCCCTCCTTTCCAGT-GCCTCAAATCTAGAAGAATATTCATAGTGAAGT-GC------------------------ACAGCCGGGTGGTGCATGGTA-ATCTGGAAGTCACCTCTGCAAATCTT-TCC----------------TGTTGGTGCTGTGAAGGCACCAGGACTTCAAGAGTA +>rn3.chr4(+):56183705-56183879|rn3_6 +TTTAAAAGAAGT-CCCACTCCTTTCCAGT-GCCCTAGATCTAGAAGCACATTCATAATGATGT-ACAC-----TAACCC----------GACAGCTGTGTGGTATATGGTA-TCCCGGAAGTCACCTCAGCAAACCTT-TCCCGGGGAACCTACATGGTGTTGGTGCTGTGAAGGTACCAGGTTGTCAAGGGTA +>canFam2.chr14(-):11089743-11089913|canFam2_6 +TTTTAAA------TATCTGC-TTCCCGGTGGCCTTGAGTCTAGAGGAGTCCCCCCACTATGGTGGCACTAATACTGAAGGTCAGAAATAATCAGTTCTGTGGTGCATGTTGCCCCTGAGGTTCTGTTCGGGAAACTTC-TTC-TGAGCAC----ATGCACCTGGCACTGCAAACGTACCAGGA----------- +>dasNov1.scaffold_256527(+):786-964|dasNov1_6 +TTTTAAA------AATTTACCTTCCCAGTGGCGGTGAATCCGGAGGAATACGGAAACTGGGGC-GCACTACCATGACACGTGTCAAA-AATCAGTTCCGTGGTCCGTGGAGGGCCTGGGGTTC------GAAAATCTTGTCC-CGAGCACCCCCGTGCGCCTGGCACCGCGACAGTGACAGGACTGAAGCGTG- + +>hg17.chr7(+):127472258-127472280|hg17_7 +gatggccca-atccctgtcctct- +>panTro1.chr6(+):129886139-129886161|panTro1_7 +gatggccca-atccctgtcctct- +>rheMac2.chr3(+):165789069-165789091|rheMac2_7 +gatggccca-atccctgtcctct- +>mm7.chr6(+):28991025-28991048|mm7_7 +AATGGCAGAGGGCTCTGTTCTCT- +>rn3.chr4(+):56183879-56183902|rn3_7 +AATGGCAGAGGCCCCTGTTCTCT- +>canFam2.chr14(-):11089526-11089548|canFam2_7 +GGAGACTTG-ATGCCTGCCTTCC- +>dasNov1.scaffold_256527(+):964-987|dasNov1_7 +GACGGCCAG-ACCTCTGCCCTCGG + +>hg17.chr7(+):127472280-127472681|hg17_8 +taaaacctaagggaggagaTGGAAAG-GGGCACCCAACCCAGACTGAGAGACAGGAATTAGCTGCAAGGGGAACTAGGAAAAGCTTCTTTA---AGGATG--GAGAGGCCCTA-GTGGAATGGGGAGATTCTTCCGGGAGAAGCGATGGATGCACAGTTGGGCATCCCCACAGACGGACTGGAAAGAAAAAAGGCCTGGAGGAATCA------ATGTGC-AATGTATGTGTGTTCCCTGGTTcaagggctgg-gaactttctcta--aagggccaggtagaaaacattttaggctttctaagccaagg---caaaattgaggat-attacatgggtacttatacaacaagaataaacaatt---tacacaa-ttttttgttgacagaattcaaaa---ctttat----agacac---agaaatgcaaatttcctgt +>panTro1.chr6(+):129886161-129886562|panTro1_8 +taaaacctaagggaggagaTGGAAAG-GGGCACCCAACCCAGACTGAGAGACAGGAATTAGCTGCAAGGGGAACTAGGAAAAGCTTCTTTA---AGGATG--GAGAGACCCTA-GTGGAATGGGGAGATTCTTCCGGGAGAAGCGATGGATGCGCAGTTGGGCATCCCCACAGACGGACTGGAAAGAAAAAAGGCCTGGAGGAATCA------ATGTGC-AATGTATGTGTGTTCCCTGGTTcaagggctgg-gaactttctcta--aagggccaggtagaaaacattttaggctttctaagccaagg---caaaattgaggat-attacatgggtacttatacaacaagaataaacaatt---tacacaa-ttttttgttgacagaattcaaaa---ctttat----agacac---agaaatgtaaatttcctgt +>rheMac2.chr3(+):165789091-165789492|rheMac2_8 +taaaacctaatggaggagatggaATG-GGTCACCCAACCCGGACTGAGAGACAGGAATTAGCTGCAAGGGTAACCAGGACAAGCTTCTCTA---ATGATG--GAGAGACCCTA-GTGGAATGGGGAGATTCTTCTGGGAGAAGCGATGGATTCGTAGTTGGGCATCCCCACAGAGGGACTGGAAAGAAAAAAGACCTGGAGGAACCA------ATGTGC-AATGTATGTGTGTTTCCTGGTTcaagggctggcaaactttctcta--aagggccagatagaaaacattttaggctttgtaagccaagg---caaaatcgaggag-attacatgggtacttatacaacaagaataaacaatt---tccacaa--tttttattcacagaattcaaaa---ctttat----agacac---agaaatgtaaatttcctgt +>rn3.chr4(+):56183902-56184219|rn3_8 +------------------------------------GTCCATAGTCAAAG------------------------------AAGCCTCTCAG---ATGGAG--AGCAGGGCCTATGCAAAAGAGGGGGCTTCTGTAGGCAGAAGGGATGGACTAGCCTCCGGACATAGCCATAGAGAGGCTGGCAGGACTGAGACCCAGGAGAAGCCAGCGCAGGTGTGCGGGCGTGTGTATATTTCATAGTTTGCAGGTTGG----------------------------CAAACAATTCCTGCTTTGCAGGCCAAGA---GGAAACTGAAGGTGACCCCGTGAGTGCTTAC---ACAAGAGAAAACAAG-------ACAA-TTTTTGGTTGACCAAATTCAGAA---CTTTATTTGAGGATGC---TAAAGTTTAAATTTCTTTT +>canFam2.chr14(-):11089143-11089523|canFam2_8 +TACAGCCTGTGGGCAGAGGTGGGAAGAGGTCACGCAAGCCAGTTGGAATGAGGGGAGTTGGCTGGAAAGGTGACCAGGACAAGCTACTTCAACCAGGAAG--AAGAGACCCCG-GTG----------------CTTGGAGAAGGCCTGATTGAGCAGTCCTGCATGCCCGCCCAC-GACTGGCAGGAATAAAGACCCAGAAGAGCTA------ACGTGC-AATGTA------TTTTCTAGTTCCAgggttggcaaactttctctct-aagggtgggatgataaacattttaggcttttcagaccaaga---ggcgacatcagag-ggtatgtaggt---------acaagagggaaaagttgcccccggaa-ttttttg--gataaaattcaaaa---ctttacttagggatgc---caaaatgtaaacttcatat +>dasNov1.scaffold_256527(+):987-1401|dasNov1_8 +CTAAATCTCGCGGAGAAGGTGGAACA-GGTTACCCAAACCCGACCGAG-GAGGCGAGTTG---GAAACGGCGACTGGGACAAGCTCCCTCA---GAGACGGAGAGAGACCCCA-GTGGAAGGGGGGAGAGGCTCTTAGGGAAACGATGGGGGGACCCGCCCGCACCCGCACAGAGGCGCTGGCAGGCACAGCGGCCCCGAGGAGCCC------AGGAGC-AGGGC-TGTGT-TCCCCTGCATcaggggttggcaaactttttctgcaaagggccagatagtaaatattttaggctttgcaaaccaagaagtagaaagggaggcc-attatgtacgtatttatatagcaagagagaacattt---cccacaatttttttattgacagaatttaaaacttctttattgatgaacaccaaagaaacttgaatttcatat + +>hg17.chr7(+):127472681-127472715|hg17_9 +aattttcccat---gagaactattcttcttttgtttt +>rheMac2.chr3(+):165789492-165789526|rheMac2_9 +aattttcacat---aagaactattcttcttttgtttt +>panTro1.chr6(+):129886562-129886596|panTro1_9 +aattttcccgt---gagaactattcttcttttgtttt +>canFam2.chr14(-):11089108-11089143|canFam2_9 +aatggtcatgt--ccataactattcttcttttatttt +>dasNov1.scaffold_256527(+):1401-1433|dasNov1_9 +aattttcacatatcacgaagtatttttttttt----- +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/fasta_concatenate_out.fasta Mon May 19 10:59:37 2014 -0400 @@ -0,0 +1,16 @@ +>hg17 +gtttgccatcttttgctgctctagggaatccagcagctgtcaccatgtaaacaagcccaggctagaccaGTTACCCTCATC---ATCTTAGCTGATAGCCAGCCAGCCACCACAGGCAtgagtcaggccatattgctggacccacagaattatgagctaaataaatagtcttgggttaagccactaagttttaggcatagtgtgttatgtaTCTCACAAACATATAAGACTGTGTGTTTGTTGACTGGAGGAAGAGATGCTATAAAGACCACCTTTTAAAACTTCCCAAATACTGCCACTGATGTCCTGATGGAGG-------------------------------------------------------TATGAA---------AACATCCACTAAAATTTGTGGTTTATTCATTTTTCATTATTTTGTTTAAGGAGGTCTATAGTGGAAGAGGGAGATATTT-GGggaaatttt-gtatagactagctt--tcacgatgttagggaattattattgtgtgataatggtcttgcagttac-acagaaattcttcctta-ttttttgggaagcaccaaagta-------gggataaaatgtcatgatgtgtgcaatacactttaaaatgtttttgccaaaa----------taattaa-------------------------tgaagc--aaatatg---gaaaataataattattaaatctaggt-----gatgggtatattgtagttcactatagtattgcacacttttctgtatgtttaaatttttcattta--------------------------aaaa-actttgagctagacaccaggctatgagcta-ggagcatagcaatgaccaa----------------------------------------------------------------------------------------------atagactcctaccaa-ctc-aaagaatgcacattctCTG-GGAAACATGTTTCCATTAGGAAGCCTCGAATGCAATGTGACTGTGGTCTCCAGGACCTG-TGTGATCCTGGCTTTTCCTGTTCCCTCCG---CATCATCACTGCAGGTGTGTTTTCCCAAGTTTTAAA------CATTTACCTTCCCAGTGGCCTTGCGTCTAGAGGAATCCCTGTATAGTGGT-ACATGAATATAACACATAACAAA-AATCATCTCTATGGTGTGTGTTGTTCCTGGGGTTCAattcagcaaatttt-ccc-tgggcacccatgtgttcttggcactggaaaagtaccgggactgaaacagttgatggccca-atccctgtcctct-taaaacctaagggaggagaTGGAAAG-GGGCACCCAACCCAGACTGAGAGACAGGAATTAGCTGCAAGGGGAACTAGGAAAAGCTTCTTTA---AGGATG--GAGAGGCCCTA-GTGGAATGGGGAGATTCTTCCGGGAGAAGCGATGGATGCACAGTTGGGCATCCCCACAGACGGACTGGAAAGAAAAAAGGCCTGGAGGAATCA------ATGTGC-AATGTATGTGTGTTCCCTGGTTcaagggctgg-gaactttctcta--aagggccaggtagaaaacattttaggctttctaagccaagg---caaaattgaggat-attacatgggtacttatacaacaagaataaacaatt---tacacaa-ttttttgttgacagaattcaaaa---ctttat----agacac---agaaatgcaaatttcctgtaattttcccat---gagaactattcttcttttgtttt +>panTro1 +gtttgccatcttttgctgctcttgggaatccagcagctgtcaccatgtaaacaagcccaggctagaccaGTTACCCTCATC---ATCTTAGCTGATAGCCAGCCAGCCACCACAGGCAtgagtcaggccatattgctggacccacagaattatgagctaaataaatagtcttgggttaagccactaagttttaggcatagtgtgttatgtaTCTCACAAACATATAAGACTGTGTGTTTGTTGACTGGAGGAAGAGATGCTATAAAGACCACCTTTTGAAACTTCCCAAATACTGCCACTGATGTCCTGATGGAGG-------------------------------------------------------TATGAA---------AACATCCACTAAAATTTGTGGTTTATTCGTTTTTCATTATTTTGTTTAAGGAGGTCTATAGTGGAAGAGGGAGACATTT-GGggaaatttt-gtatagactagctt--tcacgatgttagggagttattattgtgtgataatggtcttgcagttac-acagaaattcttcctta-ttttttgggaaacaccaaagta-------gggataaaatgtcatgatgtgtgcaatacgctttaaaatatttttgccaaaa----------taattaa-------------------------tgaagc--aaatatg---gaaaataataattattaaatctaggt-----gatgggtatattgtagttcactatagtattgcacacttttctgtatgtttaaaattttcattta--------------------------aaaa-actttgagctagacaccaggctatgagcta-ggagcatagcaatgaccaa----------------------------------------------------------------------------------------------atagactcctaccaa-ctc-aaagaatgcacattctCTG-GGAAACATGTTTCCATTAGGAAGCCTCGAATGCAATGTGACTGTGGTCTCCAGGACATG-TGTGATCCTGGCTTTTCCTGTTCCCTCTG---CATCATCACTGCAGGTGTATTTTCCCAAGTTTTAAA------CATTTACCTTCCCAGTGGCCTTGCGTCTAGAGGAATCCCTGTATAGTGGT-ACATGAATATAACACATAACAAA-AATCATCTCTATGGTGTGTGTTGTTCCTGGGGTTCAattcagcaaatttt-tcc-tgggcacccatgtgttcttggcactggaaaagtaccgggactgaaacagttgatggccca-atccctgtcctct-taaaacctaagggaggagaTGGAAAG-GGGCACCCAACCCAGACTGAGAGACAGGAATTAGCTGCAAGGGGAACTAGGAAAAGCTTCTTTA---AGGATG--GAGAGACCCTA-GTGGAATGGGGAGATTCTTCCGGGAGAAGCGATGGATGCGCAGTTGGGCATCCCCACAGACGGACTGGAAAGAAAAAAGGCCTGGAGGAATCA------ATGTGC-AATGTATGTGTGTTCCCTGGTTcaagggctgg-gaactttctcta--aagggccaggtagaaaacattttaggctttctaagccaagg---caaaattgaggat-attacatgggtacttatacaacaagaataaacaatt---tacacaa-ttttttgttgacagaattcaaaa---ctttat----agacac---agaaatgtaaatttcctgtaattttcccgt---gagaactattcttcttttgtttt +>rheMac2 +gcttgccatcttttgatgctcttgggaatccagcagctgtcaccat-taaacaagcccaggctagaccaGTTACCCTCATC---ATCTTAGCTGATAGCCAGCCAGCCACCATAGGCAtgagtcaggccatagtgctggacccacagaattatgagctaaataagtagtgttgggttaagtcactaagttttaggcatagtgtgttatgtagcTCACAAACATATAAGACTGTGTGTTTTTTGACTGGAGGAAGAGATGCCATAAAGACCACCTTTTGAAACTTCTCAAATACTGCCATTGATGTGCTGATGGAGG-------------------------------------------------------TATGAA---------AACATCCACTAAAATTTGTGGTTTATTTATTTTTCATTATTTTGTTTAAGGAGGTCTATAGTGGAAGAGGGAGATATTT-GGggaaatttg-gtatagactagctt--tcatgatgtaagggagttatttttgtgtgataatggccctacagttac-acagaaattcttccttatttttttgggaagcacaaaagta-------gggataaaatgtcatgatgtgtacaatatgctttaaaatatttttgccaaaa----------taattaa-------------------------tgaagc--aaatatg---gaaaataataactgttaaatctaggt-----gttgggtatattgcagttcattatgttattgcacacttttctgtgtgtttaaaattttcatttaaaaatatgttttaaaaatg-------aaaa-actttgagctagataccaggttatgagcta-ggagcatagcaatgaccaa----------------------------------------------------------------------------------------------atagacccctaccga-ctc-aaagaatgtacattctTTG-GGAAACATGTTTCCATCAGAAAATCTCAAATGCAATGTGACTGGGGTCTCCAGGACCTG-TGTGAGCCTGGCTTTTCCTGTTCCCTCCA---CATCATCACTGCAGGTGTATTTTCCC--GTTTTAAA------CATTTACTCTCCCAGTAGCCTTGCATCTCGAGGAATCCCTGTATAGTGGT-ACATGAATATAACACATAACAAA-AATCATCTGTACGGTGTGTGTTGTTCCTGGGGTTCAattcagcaaatttt-tcc-tgggcacccctgtgttcttggcactggaaaagtaccaggacttaaatagtagatggccca-atccctgtcctct-taaaacctaatggaggagatggaATG-GGTCACCCAACCCGGACTGAGAGACAGGAATTAGCTGCAAGGGTAACCAGGACAAGCTTCTCTA---ATGATG--GAGAGACCCTA-GTGGAATGGGGAGATTCTTCTGGGAGAAGCGATGGATTCGTAGTTGGGCATCCCCACAGAGGGACTGGAAAGAAAAAAGACCTGGAGGAACCA------ATGTGC-AATGTATGTGTGTTTCCTGGTTcaagggctggcaaactttctcta--aagggccagatagaaaacattttaggctttgtaagccaagg---caaaatcgaggag-attacatgggtacttatacaacaagaataaacaatt---tccacaa--tttttattcacagaattcaaaa---ctttat----agacac---agaaatgtaaatttcctgtaattttcacat---aagaactattcttcttttgtttt +>rn3 +CTTCACTCTCATTTGCTGTT----------------CTGTCACTATGGAGACAAACACAGGCTAGCCCAGTTACTATCTTGATCACAGCAGCTGT----CAGCTAGCTGCCACTCACAGGAATAAGGCCATACCATT-GATCCACTGAACCTTGATCTAGGAATTTGGC----------------------TGGGGCCAGTTTGCGGTGTCACTCATGA--CTCTAAGATTGTGTGTTTG----CTCCAGGAAGAGACGGCAAGAGGATTACCTTTAAAAGGTTCGG-AGTCTAGCTGTAGACAGCCCAATGGG---------------------------------------------------------TATAAC---------AATACTCACTAA--------------------------------------------------------------TATTTGGGGGAAATATG-ATGTGCA----CTT--CCATGATCTTAAAGAATTGCTACTGTTTGATAGTGATCTTATGGTTAA-ATAAAAAAAAT--CTTA-GTTGTTAGAAAATACTCAAATATTTAGGGGCGTGACAATGTCACAGTGTCTGCAATTTGCTTTAAAGATTTTT-----AAA----------TATTTAAAAAAGTTTTAATAATTTTGAAAAACTGAAGCTACACTATG---GGAAGTGGTAATTGTTACATATGGGT-----AATAAGTAT-----AATTCGTTATATTAT-------TTTTC------TTAGAATTTTTCATTTG--------------------------AAAA-------------------------------------------ATGACCAATATACACTGTTTACATGTATAGCATTGTGAATGGAGACATAAAAAGATAATCTAGCTTTGTGCTAGGTAGGTGCTGAGCTCTTAACAGTGCTGGGCAGAAACCTATAAC-CTC-ACAGGGTGGGTTGTCTTTG-AGGAGCGTGCTAACCCTAGGAAGTCTCAAATACAATGTGATGGTTGCCCCCAGGCACCACCTTGAACCTGGTCTTCCTGGTTTCTTTCA---CACCATTACCACAAATACATTTTCTCAGGTTTAAAAGAAGT-CCCACTCCTTTCCAGT-GCCCTAGATCTAGAAGCACATTCATAATGATGT-ACAC-----TAACCC----------GACAGCTGTGTGGTATATGGTA-TCCCGGAAGTCACCTCAGCAAACCTT-TCCCGGGGAACCTACATGGTGTTGGTGCTGTGAAGGTACCAGGTTGTCAAGGGTAAATGGCAGAGGCCCCTGTTCTCT-------------------------------------GTCCATAGTCAAAG------------------------------AAGCCTCTCAG---ATGGAG--AGCAGGGCCTATGCAAAAGAGGGGGCTTCTGTAGGCAGAAGGGATGGACTAGCCTCCGGACATAGCCATAGAGAGGCTGGCAGGACTGAGACCCAGGAGAAGCCAGCGCAGGTGTGCGGGCGTGTGTATATTTCATAGTTTGCAGGTTGG----------------------------CAAACAATTCCTGCTTTGCAGGCCAAGA---GGAAACTGAAGGTGACCCCGTGAGTGCTTAC---ACAAGAGAAAACAAG-------ACAA-TTTTTGGTTGACCAAATTCAGAA---CTTTATTTGAGGATGC---TAAAGTTTAAATTTCTTTT------------------------------------- +>mm7 +CTCCACTCTCGTTTGCTGTT----------------CTGTCACCATGGAAACAAACG-AGGGTGGTCCAGTTACTATCTTG---ACTGCAGCTGG----CAGTCAGTTGCCACT--CAGGAATAAGGCTATGCCATT-GATCCACTGAACCGTGATCTGGAAACCTGGCTGTTGTTT-------CAAGCCTTGGGGCCAGTTTGCGGTGTTACTCATGA--CTCTAAGATCGTGTGCTTG----CTGCAGGAAGAGACAGCAAGGGGGTTACATTTAAAAAGCCCCC-AGTTTAGCTATAGGCAGGCCAACAGGTGTAAAAATACTCACTAGTAATGGGCTGAACTCATGGAGGTAGCATTAGTGAGACACTGTAACTGTTTTTTTAAAAATCACTAA----AACGTTTCATTGATTGCTCATCATTTAAAAAAAGAAATTCCTCAGTGGAAGAGG----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------caaaaaccaa------------------------------------------------------------------------------------------------aaaaACCTATAGC-CTC-ACAGGGTGGGTTGTCTTTG-AGGAACATGCATCCGCTAGAAAGTCCCAAGTACACTATGACAGTTG--CCCAGGCCCCGCCTTAAACCTGGTTTTCCTGGTTTCTTTCA---CATCATTACCACGAATATATTTCCTCAAGTTTAAAGAAAGTACCCCCTCCTTTCCAGT-GCCTCAAATCTAGAAGAATATTCATAGTGAAGT-GC------------------------ACAGCCGGGTGGTGCATGGTA-ATCTGGAAGTCACCTCTGCAAATCTT-TCC----------------TGTTGGTGCTGTGAAGGCACCAGGACTTCAAGAGTAAATGGCAGAGGGCTCTGTTCTCT------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ +>canFam2 +------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------gagatattt-gggggaatttgaatgtagtgttgctcttttgtgatgctaagaaattataattgtctgatgatagtctcgtggttatgggggaaatgcttcctta-tttttagacacaaactgaagtattta---aggatgaaatgtcatgatgtttgcaattggctttaaaatattttagccaaaa-----------agtaaa-------------------------tgaagc--AAATATG--GGAAGACAATAATCATTAAATCTAGGT-----GATGCATAC---------------------------TTTTCCATATGTTTGAAATTTTCATTTA--------------------------AAAA-ACATCAtgctagatcctggactatgagctg-ggtatatagcagtgaacaa---------------------------------------------------------------------------------------------aacagagccctgcagt-cttgatggagcacacaacctttg-gggaaCATGTTTCCATAAGAAAGTCTCCAATGTGATCTGA-TGGTGCCGCCAGGACCTA-TGTCAGCCTACCGTTCCATGTCCCCTCCACACCATCATCACTGCAGGTGTGTTTTCCCACATTTTAAA------TATCTGC-TTCCCGGTGGCCTTGAGTCTAGAGGAGTCCCCCCACTATGGTGGCACTAATACTGAAGGTCAGAAATAATCAGTTCTGTGGTGCATGTTGCCCCTGAGGTTCTGTTCGGGAAACTTC-TTC-TGAGCAC----ATGCACCTGGCACTGCAAACGTACCAGGA-----------GGAGACTTG-ATGCCTGCCTTCC-TACAGCCTGTGGGCAGAGGTGGGAAGAGGTCACGCAAGCCAGTTGGAATGAGGGGAGTTGGCTGGAAAGGTGACCAGGACAAGCTACTTCAACCAGGAAG--AAGAGACCCCG-GTG----------------CTTGGAGAAGGCCTGATTGAGCAGTCCTGCATGCCCGCCCAC-GACTGGCAGGAATAAAGACCCAGAAGAGCTA------ACGTGC-AATGTA------TTTTCTAGTTCCAgggttggcaaactttctctct-aagggtgggatgataaacattttaggcttttcagaccaaga---ggcgacatcagag-ggtatgtaggt---------acaagagggaaaagttgcccccggaa-ttttttg--gataaaattcaaaa---ctttacttagggatgc---caaaatgtaaacttcatataatggtcatgt--ccataactattcttcttttatttt +>bosTau2 +-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------agacattg-ggtaaaattcaaatgcagactagctc----atgatgttaaagaattactcttgtgtggtaatggtcttgtgatagagatagaaatgcttcctta-tttttagataaacacttaagtattta---aggatgaaacgccctgatgtttgtaatttgctttagaatattttagccaaaa----------gaattaa-------------------------tgatgc--aaatatg--caaaaagagta--cgttaaacctaa-----------------------------------------------------atttgCGATTttcattta--------------------------aaaa-tcttcgtgcaacgcacggggctatcaatgt-gggatacagatgtgaacaa---------------------------------------------------------------------------------------------aacggacccgtgtgggactcggcggagcacacagattttgcgggagCACGTTCCCGTTAGGAAGTCTCTGATGCAATACGACCGGTGCCTTCAGGACCTG-TG--AGGCTGACTTTCCTTA-CCCCTCCACACCATCATCAAGGCAGGTGTGATTTTCCAGG------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +>dasNov1 +------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------GAGACATTT-GGAGAAATTTG-----------Aatt--tcatgatgttaaggaattacttttgtatgatgatggtcttgtggctat-gtagaatttcttccgtg-ttttaagacgcatgctgaagcatgta---aggataaaatgtcgtggtgtttgtaatttattctaaaacattttagccaaaaacaaataaataaataaa-------------------------tgaagc--aaatatgggggaaatgtttaattgttaaatctagatttaacacggtatataccgtgcttcattatactagtctctacttttccatgtgtttgaaattttCATTAAAATGTTTGTTTGTTGTCTGTTTTAATGAAATCCTTTGTGCTAGCCACTGGGATGAAAGCTAGGGAACACAGCAGTGAGCAA-----------------------------------------------------------------------------------------------CAGCCTGGCTCCGT-CC--GGGGGCCGCTCAGCAGCTC-GGGAGCGTGGAGACG---GGAAGTCTGTCACGCGATGCG-----------CTGGGCCCG------------CTGTTCCCGCCCCCCTCC---CCCC----------------TTTCCCAAGTTTTAAA------AATTTACCTTCCCAGTGGCGGTGAATCCGGAGGAATACGGAAACTGGGGC-GCACTACCATGACACGTGTCAAA-AATCAGTTCCGTGGTCCGTGGAGGGCCTGGGGTTC------GAAAATCTTGTCC-CGAGCACCCCCGTGCGCCTGGCACCGCGACAGTGACAGGACTGAAGCGTG-GACGGCCAG-ACCTCTGCCCTCGGCTAAATCTCGCGGAGAAGGTGGAACA-GGTTACCCAAACCCGACCGAG-GAGGCGAGTTG---GAAACGGCGACTGGGACAAGCTCCCTCA---GAGACGGAGAGAGACCCCA-GTGGAAGGGGGGAGAGGCTCTTAGGGAAACGATGGGGGGACCCGCCCGCACCCGCACAGAGGCGCTGGCAGGCACAGCGGCCCCGAGGAGCCC------AGGAGC-AGGGC-TGTGT-TCCCCTGCATcaggggttggcaaactttttctgcaaagggccagatagtaaatattttaggctttgcaaaccaagaagtagaaagggaggcc-attatgtacgtatttatatagcaagagagaacattt---cccacaatttttttattgacagaatttaaaacttctttattgatgaacaccaaagaaacttgaatttcatataattttcacatatcacgaagtatttttttttt-----
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/utils/maf_utilities.py Mon May 19 10:59:37 2014 -0400 @@ -0,0 +1,601 @@ +#!/usr/bin/env python +""" +Provides wrappers and utilities for working with MAF files and alignments. +""" +#Dan Blankenberg +import pkg_resources; pkg_resources.require( "bx-python" ) +import bx.align.maf +import bx.intervals +import bx.interval_index_file +import sys, os, string, tempfile +import logging +from copy import deepcopy + +assert sys.version_info[:2] >= ( 2, 4 ) + +log = logging.getLogger(__name__) + + +GAP_CHARS = [ '-' ] +SRC_SPLIT_CHAR = '.' + +def src_split( src ): + fields = src.split( SRC_SPLIT_CHAR, 1 ) + spec = fields.pop( 0 ) + if fields: + chrom = fields.pop( 0 ) + else: + chrom = spec + return spec, chrom + +def src_merge( spec, chrom, contig = None ): + if None in [ spec, chrom ]: + spec = chrom = spec or chrom + return bx.align.maf.src_merge( spec, chrom, contig ) + +def get_species_in_block( block ): + species = [] + for c in block.components: + spec, chrom = src_split( c.src ) + if spec not in species: + species.append( spec ) + return species + +def tool_fail( msg = "Unknown Error" ): + print >> sys.stderr, "Fatal Error: %s" % msg + sys.exit() + +#an object corresponding to a reference layered alignment +class RegionAlignment( object ): + + DNA_COMPLEMENT = string.maketrans( "ACGTacgt", "TGCAtgca" ) + MAX_SEQUENCE_SIZE = sys.maxint #Maximum length of sequence allowed + + def __init__( self, size, species = [] ): + assert size <= self.MAX_SEQUENCE_SIZE, "Maximum length allowed for an individual sequence has been exceeded (%i > %i)." % ( size, self.MAX_SEQUENCE_SIZE ) + self.size = size + self.sequences = {} + if not isinstance( species, list ): + species = [species] + for spec in species: + self.add_species( spec ) + + #add a species to the alignment + def add_species( self, species ): + #make temporary sequence files + self.sequences[species] = tempfile.TemporaryFile() + self.sequences[species].write( "-" * self.size ) + + #returns the names for species found in alignment, skipping names as requested + def get_species_names( self, skip = [] ): + if not isinstance( skip, list ): skip = [skip] + names = self.sequences.keys() + for name in skip: + try: names.remove( name ) + except: pass + return names + + #returns the sequence for a species + def get_sequence( self, species ): + self.sequences[species].seek( 0 ) + return self.sequences[species].read() + + #returns the reverse complement of the sequence for a species + def get_sequence_reverse_complement( self, species ): + complement = [base for base in self.get_sequence( species ).translate( self.DNA_COMPLEMENT )] + complement.reverse() + return "".join( complement ) + + #sets a position for a species + def set_position( self, index, species, base ): + if len( base ) != 1: raise Exception( "A genomic position can only have a length of 1." ) + return self.set_range( index, species, base ) + #sets a range for a species + def set_range( self, index, species, bases ): + if index >= self.size or index < 0: raise Exception( "Your index (%i) is out of range (0 - %i)." % ( index, self.size - 1 ) ) + if len( bases ) == 0: raise Exception( "A set of genomic positions can only have a positive length." ) + if species not in self.sequences.keys(): self.add_species( species ) + self.sequences[species].seek( index ) + self.sequences[species].write( bases ) + + #Flush temp file of specified species, or all species + def flush( self, species = None ): + if species is None: + species = self.sequences.keys() + elif not isinstance( species, list ): + species = [species] + for spec in species: + self.sequences[spec].flush() + +class GenomicRegionAlignment( RegionAlignment ): + + def __init__( self, start, end, species = [] ): + RegionAlignment.__init__( self, end - start, species ) + self.start = start + self.end = end + +class SplicedAlignment( object ): + + DNA_COMPLEMENT = string.maketrans( "ACGTacgt", "TGCAtgca" ) + + def __init__( self, exon_starts, exon_ends, species = [] ): + if not isinstance( exon_starts, list ): + exon_starts = [exon_starts] + if not isinstance( exon_ends, list ): + exon_ends = [exon_ends] + assert len( exon_starts ) == len( exon_ends ), "The number of starts does not match the number of sizes." + self.exons = [] + for i in range( len( exon_starts ) ): + self.exons.append( GenomicRegionAlignment( exon_starts[i], exon_ends[i], species ) ) + + #returns the names for species found in alignment, skipping names as requested + def get_species_names( self, skip = [] ): + if not isinstance( skip, list ): skip = [skip] + names = [] + for exon in self.exons: + for name in exon.get_species_names( skip = skip ): + if name not in names: + names.append( name ) + return names + + #returns the sequence for a species + def get_sequence( self, species ): + sequence = tempfile.TemporaryFile() + for exon in self.exons: + if species in exon.get_species_names(): + sequence.write( exon.get_sequence( species ) ) + else: + sequence.write( "-" * exon.size ) + sequence.seek( 0 ) + return sequence.read() + + #returns the reverse complement of the sequence for a species + def get_sequence_reverse_complement( self, species ): + complement = [base for base in self.get_sequence( species ).translate( self.DNA_COMPLEMENT )] + complement.reverse() + return "".join( complement ) + + #Start and end of coding region + @property + def start( self ): + return self.exons[0].start + @property + def end( self ): + return self.exons[-1].end + +#Open a MAF index using a UID +def maf_index_by_uid( maf_uid, index_location_file ): + for line in open( index_location_file ): + try: + #read each line, if not enough fields, go to next line + if line[0:1] == "#" : continue + fields = line.split('\t') + if maf_uid == fields[1]: + try: + maf_files = fields[4].replace( "\n", "" ).replace( "\r", "" ).split( "," ) + return bx.align.maf.MultiIndexed( maf_files, keep_open = True, parse_e_rows = False ) + except Exception, e: + raise Exception( 'MAF UID (%s) found, but configuration appears to be malformed: %s' % ( maf_uid, e ) ) + except: + pass + return None + +#return ( index, temp_index_filename ) for user maf, if available, or build one and return it, return None when no tempfile is created +def open_or_build_maf_index( maf_file, index_filename, species = None ): + try: + return ( bx.align.maf.Indexed( maf_file, index_filename = index_filename, keep_open = True, parse_e_rows = False ), None ) + except: + return build_maf_index( maf_file, species = species ) + +def build_maf_index_species_chromosomes( filename, index_species = None ): + species = [] + species_chromosomes = {} + indexes = bx.interval_index_file.Indexes() + blocks = 0 + try: + maf_reader = bx.align.maf.Reader( open( filename ) ) + while True: + pos = maf_reader.file.tell() + block = maf_reader.next() + if block is None: + break + blocks += 1 + for c in block.components: + spec = c.src + chrom = None + if "." in spec: + spec, chrom = spec.split( ".", 1 ) + if spec not in species: + species.append( spec ) + species_chromosomes[spec] = [] + if chrom and chrom not in species_chromosomes[spec]: + species_chromosomes[spec].append( chrom ) + if index_species is None or spec in index_species: + forward_strand_start = c.forward_strand_start + forward_strand_end = c.forward_strand_end + try: + forward_strand_start = int( forward_strand_start ) + forward_strand_end = int( forward_strand_end ) + except ValueError: + continue #start and end are not integers, can't add component to index, goto next component + #this likely only occurs when parse_e_rows is True? + #could a species exist as only e rows? should the + if forward_strand_end > forward_strand_start: + #require positive length; i.e. certain lines have start = end = 0 and cannot be indexed + indexes.add( c.src, forward_strand_start, forward_strand_end, pos, max=c.src_size ) + except Exception, e: + #most likely a bad MAF + log.debug( 'Building MAF index on %s failed: %s' % ( filename, e ) ) + return ( None, [], {}, 0 ) + return ( indexes, species, species_chromosomes, blocks ) + +#builds and returns ( index, index_filename ) for specified maf_file +def build_maf_index( maf_file, species = None ): + indexes, found_species, species_chromosomes, blocks = build_maf_index_species_chromosomes( maf_file, species ) + if indexes is not None: + fd, index_filename = tempfile.mkstemp() + out = os.fdopen( fd, 'w' ) + indexes.write( out ) + out.close() + return ( bx.align.maf.Indexed( maf_file, index_filename = index_filename, keep_open = True, parse_e_rows = False ), index_filename ) + return ( None, None ) + +def component_overlaps_region( c, region ): + if c is None: return False + start, end = c.get_forward_strand_start(), c.get_forward_strand_end() + if region.start >= end or region.end <= start: + return False + return True + +def chop_block_by_region( block, src, region, species = None, mincols = 0 ): + # This chopping method was designed to maintain consistency with how start/end padding gaps have been working in Galaxy thus far: + # behavior as seen when forcing blocks to be '+' relative to src sequence (ref) and using block.slice_by_component( ref, slice_start, slice_end ) + # whether-or-not this is the 'correct' behavior is questionable, but this will at least maintain consistency + # comments welcome + slice_start = block.text_size #max for the min() + slice_end = 0 #min for the max() + old_score = block.score #save old score for later use + # We no longer assume only one occurance of src per block, so we need to check them all + for c in iter_components_by_src( block, src ): + if component_overlaps_region( c, region ): + if c.text is not None: + rev_strand = False + if c.strand == "-": + #We want our coord_to_col coordinates to be returned from positive stranded component + rev_strand = True + c = c.reverse_complement() + start = max( region.start, c.start ) + end = min( region.end, c.end ) + start = c.coord_to_col( start ) + end = c.coord_to_col( end ) + if rev_strand: + #need to orient slice coordinates to the original block direction + slice_len = end - start + end = len( c.text ) - start + start = end - slice_len + slice_start = min( start, slice_start ) + slice_end = max( end, slice_end ) + + if slice_start < slice_end: + block = block.slice( slice_start, slice_end ) + if block.text_size > mincols: + # restore old score, may not be accurate, but it is better than 0 for everything? + block.score = old_score + if species is not None: + block = block.limit_to_species( species ) + block.remove_all_gap_columns() + return block + return None + +def orient_block_by_region( block, src, region, force_strand = None ): + #loop through components matching src, + #make sure each of these components overlap region + #cache strand for each of overlaping regions + #if force_strand / region.strand not in strand cache, reverse complement + ### we could have 2 sequences with same src, overlapping region, on different strands, this would cause no reverse_complementing + strands = [ c.strand for c in iter_components_by_src( block, src ) if component_overlaps_region( c, region ) ] + if strands and ( force_strand is None and region.strand not in strands ) or ( force_strand is not None and force_strand not in strands ): + block = block.reverse_complement() + return block + +def get_oriented_chopped_blocks_for_region( index, src, region, species = None, mincols = 0, force_strand = None ): + for block, idx, offset in get_oriented_chopped_blocks_with_index_offset_for_region( index, src, region, species, mincols, force_strand ): + yield block +def get_oriented_chopped_blocks_with_index_offset_for_region( index, src, region, species = None, mincols = 0, force_strand = None ): + for block, idx, offset in get_chopped_blocks_with_index_offset_for_region( index, src, region, species, mincols ): + yield orient_block_by_region( block, src, region, force_strand ), idx, offset + +#split a block with multiple occurances of src into one block per src +def iter_blocks_split_by_src( block, src ): + for src_c in iter_components_by_src( block, src ): + new_block = bx.align.Alignment( score=block.score, attributes=deepcopy( block.attributes ) ) + new_block.text_size = block.text_size + for c in block.components: + if c == src_c or c.src != src: + new_block.add_component( deepcopy( c ) ) #components have reference to alignment, dont want to loose reference to original alignment block in original components + yield new_block + +#split a block into multiple blocks with all combinations of a species appearing only once per block +def iter_blocks_split_by_species( block, species = None ): + def __split_components_by_species( components_by_species, new_block ): + if components_by_species: + #more species with components to add to this block + components_by_species = deepcopy( components_by_species ) + spec_comps = components_by_species.pop( 0 ) + for c in spec_comps: + newer_block = deepcopy( new_block ) + newer_block.add_component( deepcopy( c ) ) + for value in __split_components_by_species( components_by_species, newer_block ): + yield value + else: + #no more components to add, yield this block + yield new_block + + #divide components by species + spec_dict = {} + if not species: + species = [] + for c in block.components: + spec, chrom = src_split( c.src ) + if spec not in spec_dict: + spec_dict[ spec ] = [] + species.append( spec ) + spec_dict[ spec ].append( c ) + else: + for spec in species: + spec_dict[ spec ] = [] + for c in iter_components_by_src_start( block, spec ): + spec_dict[ spec ].append( c ) + + empty_block = bx.align.Alignment( score=block.score, attributes=deepcopy( block.attributes ) ) #should we copy attributes? + empty_block.text_size = block.text_size + #call recursive function to split into each combo of spec/blocks + for value in __split_components_by_species( spec_dict.values(), empty_block ): + sort_block_components_by_block( value, block ) #restore original component order + yield value + + +#generator yielding only chopped and valid blocks for a specified region +def get_chopped_blocks_for_region( index, src, region, species = None, mincols = 0 ): + for block, idx, offset in get_chopped_blocks_with_index_offset_for_region( index, src, region, species, mincols ): + yield block +def get_chopped_blocks_with_index_offset_for_region( index, src, region, species = None, mincols = 0 ): + for block, idx, offset in index.get_as_iterator_with_index_and_offset( src, region.start, region.end ): + block = chop_block_by_region( block, src, region, species, mincols ) + if block is not None: + yield block, idx, offset + +#returns a filled region alignment for specified regions +def get_region_alignment( index, primary_species, chrom, start, end, strand = '+', species = None, mincols = 0, overwrite_with_gaps = True ): + if species is not None: alignment = RegionAlignment( end - start, species ) + else: alignment = RegionAlignment( end - start, primary_species ) + return fill_region_alignment( alignment, index, primary_species, chrom, start, end, strand, species, mincols, overwrite_with_gaps ) + +#reduces a block to only positions exisiting in the src provided +def reduce_block_by_primary_genome( block, species, chromosome, region_start ): + #returns ( startIndex, {species:texts} + #where texts' contents are reduced to only positions existing in the primary genome + src = "%s.%s" % ( species, chromosome ) + ref = block.get_component_by_src( src ) + start_offset = ref.start - region_start + species_texts = {} + for c in block.components: + species_texts[ c.src.split( '.' )[0] ] = list( c.text ) + #remove locations which are gaps in the primary species, starting from the downstream end + for i in range( len( species_texts[ species ] ) - 1, -1, -1 ): + if species_texts[ species ][i] == '-': + for text in species_texts.values(): + text.pop( i ) + for spec, text in species_texts.items(): + species_texts[spec] = ''.join( text ) + return ( start_offset, species_texts ) + +#fills a region alignment +def fill_region_alignment( alignment, index, primary_species, chrom, start, end, strand = '+', species = None, mincols = 0, overwrite_with_gaps = True ): + region = bx.intervals.Interval( start, end ) + region.chrom = chrom + region.strand = strand + primary_src = "%s.%s" % ( primary_species, chrom ) + + #Order blocks overlaping this position by score, lowest first + blocks = [] + for block, idx, offset in index.get_as_iterator_with_index_and_offset( primary_src, start, end ): + score = float( block.score ) + for i in range( 0, len( blocks ) ): + if score < blocks[i][0]: + blocks.insert( i, ( score, idx, offset ) ) + break + else: + blocks.append( ( score, idx, offset ) ) + + #gap_chars_tuple = tuple( GAP_CHARS ) + gap_chars_str = ''.join( GAP_CHARS ) + #Loop through ordered blocks and layer by increasing score + for block_dict in blocks: + for block in iter_blocks_split_by_species( block_dict[1].get_at_offset( block_dict[2] ) ): #need to handle each occurance of sequence in block seperately + if component_overlaps_region( block.get_component_by_src( primary_src ), region ): + block = chop_block_by_region( block, primary_src, region, species, mincols ) #chop block + block = orient_block_by_region( block, primary_src, region ) #orient block + start_offset, species_texts = reduce_block_by_primary_genome( block, primary_species, chrom, start ) + for spec, text in species_texts.items(): + #we should trim gaps from both sides, since these are not positions in this species genome (sequence) + text = text.rstrip( gap_chars_str ) + gap_offset = 0 + while True in [ text.startswith( gap_char ) for gap_char in GAP_CHARS ]: #python2.4 doesn't accept a tuple for .startswith() + #while text.startswith( gap_chars_tuple ): + gap_offset += 1 + text = text[1:] + if not text: + break + if text: + if overwrite_with_gaps: + alignment.set_range( start_offset + gap_offset, spec, text ) + else: + for i, char in enumerate( text ): + if char not in GAP_CHARS: + alignment.set_position( start_offset + gap_offset + i, spec, char ) + return alignment + +#returns a filled spliced region alignment for specified region with start and end lists +def get_spliced_region_alignment( index, primary_species, chrom, starts, ends, strand = '+', species = None, mincols = 0, overwrite_with_gaps = True ): + #create spliced alignment object + if species is not None: alignment = SplicedAlignment( starts, ends, species ) + else: alignment = SplicedAlignment( starts, ends, [primary_species] ) + for exon in alignment.exons: + fill_region_alignment( exon, index, primary_species, chrom, exon.start, exon.end, strand, species, mincols, overwrite_with_gaps ) + return alignment + +#loop through string array, only return non-commented lines +def line_enumerator( lines, comment_start = '#' ): + i = 0 + for line in lines: + if not line.startswith( comment_start ): + i += 1 + yield ( i, line ) + +#read a GeneBed file, return list of starts, ends, raw fields +def get_starts_ends_fields_from_gene_bed( line ): + #Starts and ends for exons + starts = [] + ends = [] + + fields = line.split() + #Requires atleast 12 BED columns + if len(fields) < 12: + raise Exception( "Not a proper 12 column BED line (%s)." % line ) + chrom = fields[0] + tx_start = int( fields[1] ) + tx_end = int( fields[2] ) + name = fields[3] + strand = fields[5] + if strand != '-': strand='+' #Default strand is + + cds_start = int( fields[6] ) + cds_end = int( fields[7] ) + + #Calculate and store starts and ends of coding exons + region_start, region_end = cds_start, cds_end + exon_starts = map( int, fields[11].rstrip( ',\n' ).split( ',' ) ) + exon_starts = map( ( lambda x: x + tx_start ), exon_starts ) + exon_ends = map( int, fields[10].rstrip( ',' ).split( ',' ) ) + exon_ends = map( ( lambda x, y: x + y ), exon_starts, exon_ends ); + for start, end in zip( exon_starts, exon_ends ): + start = max( start, region_start ) + end = min( end, region_end ) + if start < end: + starts.append( start ) + ends.append( end ) + return ( starts, ends, fields ) + +def iter_components_by_src( block, src ): + for c in block.components: + if c.src == src: + yield c + +def get_components_by_src( block, src ): + return [ value for value in iter_components_by_src( block, src ) ] + +def iter_components_by_src_start( block, src ): + for c in block.components: + if c.src.startswith( src ): + yield c + +def get_components_by_src_start( block, src ): + return [ value for value in iter_components_by_src_start( block, src ) ] + +def sort_block_components_by_block( block1, block2 ): + #orders the components in block1 by the index of the component in block2 + #block1 must be a subset of block2 + #occurs in-place + return block1.components.sort( cmp = lambda x, y: block2.components.index( x ) - block2.components.index( y ) ) + +def get_species_in_maf( maf_filename ): + species = [] + for block in bx.align.maf.Reader( open( maf_filename ) ): + for spec in get_species_in_block( block ): + if spec not in species: + species.append( spec ) + return species + +def parse_species_option( species ): + if species: + species = species.split( ',' ) + if 'None' not in species: + return species + return None #provided species was '', None, or had 'None' in it + +def remove_temp_index_file( index_filename ): + try: os.unlink( index_filename ) + except: pass + +#Below are methods to deal with FASTA files + +def get_fasta_header( component, attributes = {}, suffix = None ): + header = ">%s(%s):%i-%i|" % ( component.src, component.strand, component.get_forward_strand_start(), component.get_forward_strand_end() ) + for key, value in attributes.iteritems(): + header = "%s%s=%s|" % ( header, key, value ) + if suffix: + header = "%s%s" % ( header, suffix ) + else: + header = "%s%s" % ( header, src_split( component.src )[ 0 ] ) + return header + +def get_attributes_from_fasta_header( header ): + if not header: return {} + attributes = {} + header = header.lstrip( '>' ) + header = header.strip() + fields = header.split( '|' ) + try: + region = fields[0] + region = region.split( '(', 1 ) + temp = region[0].split( '.', 1 ) + attributes['species'] = temp[0] + if len( temp ) == 2: + attributes['chrom'] = temp[1] + else: + attributes['chrom'] = temp[0] + region = region[1].split( ')', 1 ) + attributes['strand'] = region[0] + region = region[1].lstrip( ':' ).split( '-' ) + attributes['start'] = int( region[0] ) + attributes['end'] = int( region[1] ) + except: + #fields 0 is not a region coordinate + pass + if len( fields ) > 2: + for i in xrange( 1, len( fields ) - 1 ): + prop = fields[i].split( '=', 1 ) + if len( prop ) == 2: + attributes[ prop[0] ] = prop[1] + if len( fields ) > 1: + attributes['__suffix__'] = fields[-1] + return attributes + +def iter_fasta_alignment( filename ): + class fastaComponent: + def __init__( self, species, text = "" ): + self.species = species + self.text = text + def extend( self, text ): + self.text = self.text + text.replace( '\n', '' ).replace( '\r', '' ).strip() + #yields a list of fastaComponents for a FASTA file + f = open( filename, 'rb' ) + components = [] + #cur_component = None + while True: + line = f.readline() + if not line: + if components: + yield components + return + line = line.strip() + if not line: + if components: + yield components + components = [] + elif line.startswith( '>' ): + attributes = get_attributes_from_fasta_header( line ) + components.append( fastaComponent( attributes['species'] ) ) + elif components: + components[-1].extend( line ) +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/utils/odict.py Mon May 19 10:59:37 2014 -0400 @@ -0,0 +1,85 @@ +""" +Ordered dictionary implementation. +""" + +from UserDict import UserDict + +class odict(UserDict): + """ + http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/107747 + + This dictionary class extends UserDict to record the order in which items are + added. Calling keys(), values(), items(), etc. will return results in this + order. + """ + def __init__( self, dict = None ): + self._keys = [] + UserDict.__init__( self, dict ) + + def __delitem__( self, key ): + UserDict.__delitem__( self, key ) + self._keys.remove( key ) + + def __setitem__( self, key, item ): + UserDict.__setitem__( self, key, item ) + if key not in self._keys: + self._keys.append( key ) + + def clear( self ): + UserDict.clear( self ) + self._keys = [] + + def copy(self): + new = odict() + new.update( self ) + return new + + def items( self ): + return zip( self._keys, self.values() ) + + def keys( self ): + return self._keys[:] + + def popitem( self ): + try: + key = self._keys[-1] + except IndexError: + raise KeyError( 'dictionary is empty' ) + val = self[ key ] + del self[ key ] + return ( key, val ) + + def setdefault( self, key, failobj=None ): + if key not in self._keys: + self._keys.append( key ) + return UserDict.setdefault( self, key, failobj ) + + def update( self, dict ): + for ( key, val ) in dict.items(): + self.__setitem__( key, val ) + + def values( self ): + return map( self.get, self._keys ) + + def iterkeys( self ): + return iter( self._keys ) + + def itervalues( self ): + for key in self._keys: + yield self.get( key ) + + def iteritems( self ): + for key in self._keys: + yield key, self.get( key ) + + def __iter__( self ): + for key in self._keys: + yield key + + def reverse( self ): + self._keys.reverse() + + def insert( self, index, key, item ): + if key not in self._keys: + self._keys.insert( index, key ) + UserDict.__setitem__( self, key, item )