# HG changeset patch
# User jjohnson
# Date 1387323913 18000
# Node ID 41a666a3d8a5f9495bc741f3c093e06ba7e8b365
# Uploaded
# diff -r 000000000000 -r 41a666a3d8a5 snpeff_to_peptides.py
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/snpeff_to_peptides.py Tue Dec 17 18:45:13 2013 -0500
# (script body below is shown without the diff '+' markers)
#!/usr/bin/env python
"""
#
#------------------------------------------------------------------------------
#                         University of Minnesota
#         Copyright 2013, Regents of the University of Minnesota
#------------------------------------------------------------------------------
# Author:
#
#  James E Johnson
#
#------------------------------------------------------------------------------

This tool takes a SnpEff VCF file and an Ensembl pep.all.fa file
( e.g. Homo_sapiens.GRCh37.73.pep.all.fa ).
It outputs a peptide fasta file with the variant peptide sequences that result
from NON_SYNONYMOUS_CODING effects.
"""

import logging
import optparse
import os.path
import re
import sys
import tempfile
from optparse import OptionParser

## dictionary for Amino Acid Abbreviations (three-letter code -> one-letter code)
aa_abbrev_dict = {
    'Phe': 'F', 'Leu': 'L', 'Ser': 'S', 'Tyr': 'Y', 'Cys': 'C',
    'Trp': 'W', 'Pro': 'P', 'His': 'H', 'Gln': 'Q', 'Arg': 'R',
    'Ile': 'I', 'Met': 'M', 'Thr': 'T', 'Asn': 'N', 'Lys': 'K',
    'Val': 'V', 'Ala': 'A', 'Asp': 'D', 'Glu': 'E', 'Gly': 'G',
}

## Amino_Acid_Change notations
#  G528R
#  p.Gly528Arg/c.1582G>C
_AA_CHANGE_RE = re.compile(r'([A-Z])(\d+)([A-Z])')
_AA_HGVS_RE = re.compile(
    r'p\.([A-Z][a-z][a-z])(\d+)([A-Z][a-z][a-z])(/c\.(\d+)([ACGTN])>([ACGTN]))')


def get_sequence(id, seq_file):
    """Look up a peptide entry in a tabular (tab-separated) protein file.

    The first line of *seq_file* that contains *id* anywhere in its text is
    used.  Its last tab-separated column is taken as the sequence and all
    preceding columns, joined with single spaces, as the peptide ID line.

    Returns a ``(pep_id, pep_seq)`` tuple.  When no line matches, or the file
    cannot be read (the error is reported on stderr), ``(None, None)`` is
    returned so that callers can always unpack the result.
    """
    try:
        with open(seq_file, 'r') as fh:
            for line in fh:
                if id in line:
                    fields = line.rstrip('\r\n').split('\t')
                    return (' '.join(fields[:-1]), fields[-1].rstrip())
    except EnvironmentError as e:
        sys.stderr.write("failed: %s\n" % e)
    return (None, None)


def fasta_to_tabular(fasta_file, tabular_file):
    """Convert a fasta file into a two-column, tab-separated file.

    Each fasta record becomes one output row: the header line (without the
    leading '>', embedded tabs replaced by spaces), a tab, and the sequence
    lines concatenated.  Blank lines and lines starting with '#' are ignored,
    as are sequence lines that appear before the first header.  An empty input
    produces an empty output file.
    """
    with open(fasta_file, 'r') as in_fh:
        with open(tabular_file, 'w') as out_fh:
            in_record = False
            for line in in_fh:
                line = line.rstrip('\r\n')
                if not line or line.startswith('#'):
                    continue
                if line.startswith('>'):
                    if in_record:
                        # terminate the previous row
                        out_fh.write('\n')
                    # Don't want any existing tabs to trigger extra columns
                    out_fh.write(line[1:].replace('\t', ' '))
                    out_fh.write('\t')
                    in_record = True
                elif in_record:
                    out_fh.write(line)
            if in_record:
                out_fh.write('\n')


def _is_tabular_fasta(path, max_records=10):
    """Return True if *path* is already in tabular form, False if it is fasta.

    Blank lines and '#' comment lines are skipped.  A line starting with '>'
    means fasta.  Any other line must have at least two tab-separated columns
    with a letters-only last column; up to *max_records* + 1 such lines are
    checked.  Raises ValueError when a line is neither a fasta header nor a
    valid tabular row.  An empty file is reported as not tabular.
    """
    checked = 0
    with open(path, 'r') as fh:
        for line in fh:
            line = line.rstrip('\r\n')
            if not line or line.startswith('#'):
                continue
            if line.startswith('>'):
                return False
            fields = line.split('\t')
            if len(fields) < 2 or not re.match(r'^[A-Za-z]+$', fields[-1]):
                raise ValueError("%s does not appear to be a fasta file" % path)
            checked += 1
            if checked > max_records:
                break
    return checked > 0


def _parse_aa_change(aa_change):
    """Parse an Amino_Acid_Change value such as 'G528R' or 'p.Gly528Arg/c.1582G>C'.

    Returns ``(aa_pos, alt_aa, sap)`` where *aa_pos* is the 1-based position,
    *alt_aa* the one-letter alternate amino acid and *sap* the label written to
    the fasta header.  Returns None when the value matches neither notation,
    uses an unknown three-letter code, or has a position below 1.
    """
    m = _AA_CHANGE_RE.match(aa_change)
    if m:
        aa_pos = int(m.group(2))
        alt_aa = m.group(3)
        sap = aa_change
    else:
        m = _AA_HGVS_RE.match(aa_change)
        if not m:
            return None
        ref_aa = aa_abbrev_dict.get(m.group(1))
        alt_aa = aa_abbrev_dict.get(m.group(3))
        if ref_aa is None or alt_aa is None:
            return None
        aa_pos = int(m.group(2))
        sap = "%s%d%s" % (ref_aa, aa_pos, alt_aa)
    if aa_pos < 1:
        return None
    return (aa_pos, alt_aa, sap)


def _effect_to_fasta(effect, chrom, pos, tabular_fasta, pep_cache,
                     leading_aa_num, trailing_aa_num, debug):
    """Build the fasta text for one EFF sub-entry, or return None to skip it.

    Only NON_SYNONYMOUS_CODING effects that name a transcript found in
    *tabular_fasta* produce output.  *pep_cache* memoises get_sequence()
    results per transcript so the protein file is scanned once per transcript.
    """
    # This is the current format of the EFF entry:
    #   EFF=missense(MODERATE|MISSENSE|Ggg/Cgg|G528R|802|SCNN1D|protein_coding|CODING|ENST00000379116|12|1)
    # If this becomes variable, will need to dynamically pattern this on the
    # definition in the vcf header.
    eff, _sep, effs = effect.rstrip(')').partition('(')
    if eff != 'NON_SYNONYMOUS_CODING':
        return None
    eff_fields = effs.split('|')
    if len(eff_fields) < 10:
        return None
    codon_change = eff_fields[2]
    aa_change = eff_fields[3]
    transcript = eff_fields[8]
    if not transcript:
        return None
    parsed = _parse_aa_change(aa_change)
    if parsed is None:
        return None
    (aa_pos, alt_aa, sap) = parsed
    if transcript not in pep_cache:
        pep_cache[transcript] = get_sequence(transcript, tabular_fasta)
    (pep_id, pep_seq) = pep_cache[transcript]
    if not pep_seq:
        return None
    aa_offset = aa_pos - 1
    if aa_offset >= len(pep_seq):
        # The annotated position lies beyond this protein sequence, so there is
        # no residue to substitute; skip rather than append the alternate AA.
        return None
    # A window size of None or 0 means "do not trim on that side".
    start_pos = max(aa_offset - leading_aa_num, 0) if leading_aa_num else 0
    end_pos = min(aa_offset + trailing_aa_num + 1, len(pep_seq)) if trailing_aa_num else len(pep_seq)
    # transform sequence
    alt_seq = pep_seq[start_pos:aa_offset] + alt_aa + pep_seq[aa_offset + 1:end_pos]
    # >ENSP00000363782 pep:known ... becomes >ENSP00000363782 pep:sap ... snp_location:1:22778472 codon_change:Gtg/Atg sap:V885M
    pep_id = re.sub('pep:[a-z]*', 'pep:sap', pep_id)
    parts = [">%s snp_location:%s:%s codon_change:%s sap:%s\n" % (pep_id, chrom, pos, codon_change, sap)]
    if debug:
        # also emit the unmodified window ahead of the variant sequence
        parts.append(pep_seq[start_pos:end_pos] + '\n')
    parts.append(alt_seq + '\n')
    return ''.join(parts)


def _write_variant_peptides(vcf_lines, out, tabular_fasta,
                            leading_aa_num=None, trailing_aa_num=None, debug=False):
    """Write a fasta entry to *out* for every usable EFF effect in *vcf_lines*.

    Header lines (starting with '#') and lines with fewer than 8 tab-separated
    columns are skipped.  A failure while handling one effect is reported on
    stderr and does not stop processing of the remaining effects.
    """
    pep_cache = {}
    for line in vcf_lines:
        if line.startswith('#'):
            continue
        fields = line.rstrip('\r\n').split('\t')
        if len(fields) < 8:
            continue
        chrom = fields[0]
        pos = fields[1]
        info = fields[7]
        for info_item in info.split(';'):
            (key, sep, val) = info_item.partition('=')
            if not sep or key != 'EFF':
                continue
            for effect in val.split(','):
                try:
                    entry = _effect_to_fasta(effect, chrom, pos, tabular_fasta, pep_cache,
                                             leading_aa_num, trailing_aa_num, debug)
                except Exception as e:
                    # best effort: report and carry on with the next effect
                    sys.stderr.write("failed: %s\n" % e)
                    continue
                if entry:
                    out.write(entry)


def _open_or_exit(path, mode, default, exit_code):
    """Open *path* with *mode*, or return *default* when *path* is None.

    On failure the error is written to stderr and the process exits with
    *exit_code*.
    """
    if path is None:
        return default
    try:
        return open(os.path.abspath(path), mode)
    except EnvironmentError as e:
        sys.stderr.write("failed: %s\n" % e)
        sys.exit(exit_code)


def main(argv=None):
    """Command line entry point.

    *argv* is the list of command line arguments without the program name;
    None means use ``sys.argv[1:]``.

    Exit codes: 4 - protein file missing, unreadable or unrecognised;
    2 - input VCF cannot be opened; 3 - output cannot be opened;
    1 - failure while processing the VCF.
    """
    #Parse Command Line
    parser = optparse.OptionParser()
    parser.add_option( '-i', '--input', dest='input', help='The input snpeff vcf file with HGVS annotations (else read from stdin)' )
    parser.add_option( '-o', '--output', dest='output', help='The output fasta (else write to stdout)' )
    parser.add_option( '-p', '--protein_fasta', dest='protein_fasta', default=None, help='The Esembl protein fasta in tabular format' )
    parser.add_option( '-l', '--leading_aa_num', dest='leading_aa_num', type='int', default=None, help='leading number of AAs to output' )
    parser.add_option( '-t', '--trailing_aa_num', dest='trailing_aa_num', type='int', default=None, help='trailing number of AAs to output' )
    parser.add_option( '-d', '--debug', dest='debug', action='store_true', default=False, help='Turn on wrapper debugging to stdout' )
    (options, args) = parser.parse_args(argv)

    # need protein_fasta file
    if options.protein_fasta is None:
        sys.stderr.write("Ensembl protein_fasta tabular file required\n")
        sys.exit(4)
    fastaFile = options.protein_fasta
    # determine if fasta is already in tabular format
    try:
        is_tabular = _is_tabular_fasta(fastaFile)
    except (ValueError, EnvironmentError) as e:
        sys.stderr.write("failed: %s\n" % e)
        sys.exit(4)

    temp_tabular = None
    inputFile = None
    outputFile = None
    try:
        if not is_tabular:
            # mkstemp creates the file and leaves it in place, so the name
            # cannot be claimed by another process before it is written.
            (fd, temp_tabular) = tempfile.mkstemp(prefix='pep_fasta_', suffix=".tab", dir=os.getcwd())
            os.close(fd)
            fasta_to_tabular(options.protein_fasta, temp_tabular)
            fastaFile = temp_tabular
        # vcf input
        inputFile = _open_or_exit(options.input, 'r', sys.stdin, 2)
        # output
        outputFile = _open_or_exit(options.output, 'w', sys.stdout, 3)
        try:
            _write_variant_peptides(inputFile, outputFile, fastaFile,
                                    options.leading_aa_num, options.trailing_aa_num,
                                    options.debug)
        except Exception as e:
            sys.stderr.write("failed: %s\n" % e)
            sys.exit(1)
    finally:
        # close only the files opened here, never the standard streams
        if inputFile is not None and inputFile is not sys.stdin:
            inputFile.close()
        if outputFile is not None and outputFile is not sys.stdout:
            outputFile.close()
        if temp_tabular is not None and os.path.exists(temp_tabular):
            os.remove(temp_tabular)


def __main__():
    """Original entry point name, kept for compatibility; delegates to main()."""
    main()


if __name__ == "__main__":
    __main__()

# ------------------------------------------------------------------------------
# The changeset continues with a second file, snpeff_to_peptides.xml, which is
# not Python.  Its visible text is carried over below as comments (runs of
# empty '+' lines are omitted).
#
# diff -r 000000000000 -r 41a666a3d8a5 snpeff_to_peptides.xml
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/snpeff_to_peptides.xml Tue Dec 17 18:45:13 2013 -0500
# @@ -0,0 +1,76 @@
# + to create a Search DB fasta for variant SAP peptides
# + snpeff_to_peptides.py --input "$snpeff_vcf" --protein_fasta "$all_pep_fasta" --output "$peptide_variant_fasta"
# + #if $leading_aa_num:
# + --leading_aa_num $leading_aa_num
# + #end if
# + #if $trailing_aa_num:
# + --trailing_aa_num $trailing_aa_num
# + #end if
# +
# +**SnpEff to Peptide Fasta**
# +
# +This generates a fasta file of peptide sequences with SAPs ( Single Amino acid Polymorphisms )
# +from the NON_SYNONYMOUS_CODING EFF annotations from the SnpEff_ application.
# +The SnpEff VCF may be filtered or annotated using SnpSift.
# +
# +The following is appended to the fasta ID line: snp_location:chr:position codon_change:nnn/nnn sap:AposA
# +
# +For VCF entry::
# +
# + chr1 22846709 . G A 9.31 .
DP=2;VDB=0.0174;AF1=1;AC1=2;DP4=0,0,1,1;MQ=20;FQ=-33;EFF=NON_SYNONYMOUS_CODING(MODERATE|MISSENSE|Gtg/Atg|V885M|1127|ZBTB40|protein_coding|CODING|ENST00000374651|12|1) PL 40,6,0 + +The peptide fasta entry that matches transcript ID: ENST00000374651 would be:: + + >ENSP00000363782 pep:known chromosome:GRCh37:1:22778472:22853855:1 gene:ENSG00000184677 transcript:ENST00000374651 gene_biotype:protein_coding transcript_biotype:protein_coding + +The ID line of the output peptide fasta entry would be:: + + >ENSP00000363782 pep:sap chromosome:GRCh37:1:22778472:22853855:1 gene:ENSG00000184677 transcript:ENST00000374651 gene_biotype:protein_coding transcript_biotype:protein_coding snp_location:chr1:22846709 codon_change:Gtg/Atg sap:V885M + + +.. _SnpEff: http://snpeff.sourceforge.net/index.html + +**Citation** + +SnpEff citation: +"A program for annotating and predicting the effects of single nucleotide polymorphisms, SnpEff: SNPs in the genome of Drosophila melanogaster strain w1118; iso-2; iso-3.", Cingolani P, Platts A, Wang le L, Coon M, Nguyen T, Wang L, Land SJ, Lu X, Ruden DM. Fly (Austin). 2012 Apr-Jun;6(2):80-92. PMID: 22728672 [PubMed - in process] + +SnpSift citation: +"Using Drosophila melanogaster as a model for genotoxic chemical mutational studies with a new program, SnpSift", Cingolani, P., et al., Frontiers in Genetics, 3, 2012. 
+ + + diff -r 000000000000 -r 41a666a3d8a5 test-data/all_pep.fa --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/all_pep.fa Tue Dec 17 18:45:13 2013 -0500 @@ -0,0 +1,47 @@ +>ENSP00000363782 pep:known chromosome:GRCh37:1:22778472:22853855:1 gene:ENSG00000184677 transcript:ENST00000374651 gene_biotype:protein_coding transcript_biotype:protein_coding +MELPNYSRQLLQQLYTLCKEQQFCDCTISIGTIYFRAHKLVLAAASLLFKTLLDNTDTISIDASVVSPEE +FALLLEMMYTGKLPVGKHNFSKIISLADSLQMFDVAVSCKNLLTSLVNCSVQGQVVRDVSAPSSETFRKE +PEKPQVEILSSEGAGEPHSSPELAATPGGPVKAETEEAAHSVSQEMSVNSPTAQESQRNAETPAETPTTA +EACSPSPAVQTFSEAKKTSTEPGCERKHYQLNFLLENEGVFSDALMVTQDVLKKLEMCSEIKGPQKEVIL +NCCEGRTPKETIENLLHRMTEEKTLTAEGLVKLLQAVKTTFPNLGLLLEKLQKSATLPSTTVQPSPDDYG +TELLRRYHENLSEIFTDNQILLKMISHMTSLAPGEREVMEKLVKRDSGSGGFNSLISAVLEKQTLSATAI +WQLLLVVQETKTCPLDLLMEEIRREPGADAFFRAVTTPEHATLETILRHNQLILEAIQQKIEYKLFTSEE +EHLAETVKEILSIPSETASPEASLRAVLSRAMEKSVPAIEICHLLCSVHKSFPGLQPVMQELAYIGVLTK +EDGEKETWKVSNKFHLEANNKEDEKAAKEDSQPGEQNDQGETGSLPGQQEKEASASPDPAKKSFICKACD +KSFHFYCRLKVHMKRCRVAKSKQVQCKECSETKDSKKELDKHQLEAHGAGGEPDAPKKKKKRLPVTCDLC +GREFAHASGMQYHKLTEHFDEKPFSCEECGAKFAANSTLKNHLRLHTGDRPFMCKHCLMTFTQASALAYH +TKKKHSEGKMYACQYCDAVFAQSIELSRHVRTHTGDKPYVCRDCGKGFRQANGLSIHLHTFHNIEDPYDC +KKCRMSFPTLQDHRKHIHEVHSKEYHPCPTCGKIFSAPSMLERHVVTHVGGKPFSCGICNKAYQQLSGLW +YHNRTHHPDVFAAQNHRSSKFSSLQCSSCDKTFPNTIEHKKHIKAEHADMKFHECDQCKELFPTPALLQV +HVKCQHSGSQPFRCLYCAATFRFPGALQHHVTTEHFKQSETTFPCELCGELFTSQAQLDSHLESEHPKVM +STETQAAASQMAQVIQTPEPVAPTEQVITLEETQLAGSQVFVTLPDSQASQASSELVAVTVEDLLDGTVT +LICGEAK +>ENSP00000346634 pep:known chromosome:GRCh37:1:36690017:36770958:1 gene:ENSG00000054118 transcript:ENST00000354618 gene_biotype:protein_coding transcript_biotype:protein_coding +MSKTNKSKSGSRSSRSRSASRSRSRSFSKSRSRSRSLSRSRKRRLSSRSRSRSYSPAHNRERNHPRVYQN +RDFRGHNRGYRRPYYFRGRNRGFYPWGQYNRGGYGNYRSNWQNYRQAYSPRRGRSRSRSPKRRSPSPRSR +SHSRNSDKSSSDRSRRSSSSRSSSNHSRVESSKRKSAKEKKSSSKDSRPSQAAGDNQGDEAKEQTFSGGT +SQDTKASESSKPWPDATYGTGSASRASAVSELSPRERSPALKSPLQSVVVRRRSPRPSPVPKPSPPLSST 
+SQMGSTLPSGAGYQSGTHQGQFDHGSGSLSPSKKSPVGKSPPSTGSTYGSSQKEESAASGGAAYTKRYLE +EQKTENGKDKEQKQTNTDKEKIKEKGSFSDTGLGDGKMKSDSFAPKTDSEKPFRGSQSPKRYKLRDDFEK +KMADFHKEEMDDQDKDKAKGRKESEFDDEPKFMSKVIGANKNQEEEKSGKWEGLVYAPPGKEKQRKTEEL +EEESFPERSKKEDRGKRSEGGHRGFVPEKNFRVTAYKAVQEKSSSPPPRKTSESRDKLGAKGDFPTGKSS +FSITREAQVNVRMDSFDEDLARPSGLLAQERKLCRDLVHSNKKEQEFRSIFQHIQSAQSQRSPSELFAQH +IVTIVHHVKEHHFGSSGMTLHERFTKYLKRGTEQEAAKNKKSPEIHRRIDISPSTFRKHGLAHDEMKSPR +EPGYKAEGKYKDDPVDLRLDIERRKKHKERDLKRGKSRESVDSRDSSHSRERSAEKTEKTHKGSKKQKKH +RRARDRSRSSSSSSQSSHSYKAEEYTEETEEREESTTGFDKSRLGTKDFVGPSERGGGRARGTFQFRARG +RGWGRGNYSGNNNNNSNNDFQKRNREEEWDPEYTPKSKKYYLHDDREGEGSDKWVSRGRGRGAFPRGRGR +FMFRKSSTSPKWAHDKFSGEEGEIEDDESGTENREEKDNIQPTTE +>ENSP00000318415 pep:known chromosome:GRCh37:1:89445139:89458455:-1 gene:ENSG00000213516 transcript:ENST00000321792 gene_biotype:protein_coding transcript_biotype:protein_coding +MVEADRPGKLFIGGLNTETNEKALETVFGKYGRIVEVLLIKDRETNKSRGFAFVTFESPADAKDAARDMN +GKSLDGKAIKVEQATKPSFERGRHGPPPPPRSRGPPRGFGAGRGGSGGTRGPPSRGGHMDDGGYSMNFNM +SSSRGPLPVKRGPPPRSGGPSPKRSAPSGLVRSSSGMGGRAPLSRGRDSYGGPPRREPLPSRRDVYLSPR +DDGYSTKDSYSSRDYPSSRDTRDYAPPPRDYTYRDYGHSSSRDDYPSRGYGDRDGYGRDRDYSDHPSGGS +YRDSYESYGNSRSAPLTRGPPPSYGGSSRYDDYSSSRDGYGGSRDSYSSSRSDLYSSCDRVGRQERGLPP +SVERGYPSSRDSYSSSSRGAPRGAGPGGSRSDRGGGRSRY +>ENSP00000446099 pep:known chromosome:GRCh37:1:89445142:89458643:-1 gene:ENSG00000213516 transcript:ENST00000399794 gene_biotype:protein_coding transcript_biotype:protein_coding +MVEADRPGKLFIGGLNTETNEKALETVFGKYGRIVEVLLIKDRETNKSRGFAFVTFESPADAKDAARDMN +GKSLDGKAIKVEQATKPSFERGRHGPPPPPRSRGPPRGFGAGRGGSGGTRGPPSRGGHMDDGGYSMNFNM +SSSRGPLPVKRGPPPRSGGPSPKRSAPSGLVRSSSGMGGRAPLSRGRDSYGGPPRREPLPSRRDVYLSPR +DDGYSTKDSYSSRDYPSSRDTRDYAPPPRDYTYRDYGHSSSRDDYPSRGYGDRDGYGRDRDYSDHPSGGS +YRDSYESYGNSRSAPLTRGPPPSYGGSSRYDDYSSSRDGYGGSRDSYSSSRSDLYSSCDRVGRQERGLPP +SVERGYPSSRDSYSSSSRGAPRGAGPGGSRSDRGGGRSRY diff -r 000000000000 -r 41a666a3d8a5 test-data/all_pep.tabular --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/all_pep.tabular 
Tue Dec 17 18:45:13 2013 -0500 @@ -0,0 +1,4 @@ +ENSP00000363782 pep:known chromosome:GRCh37:1:22778472:22853855:1 gene:ENSG00000184677 transcript:ENST00000374651 gene_biotype:protein_coding transcript_biotype:protein_coding MELPNYSRQLLQQLYTLCKEQQFCDCTISIGTIYFRAHKLVLAAASLLFKTLLDNTDTISIDASVVSPEEFALLLEMMYTGKLPVGKHNFSKIISLADSLQMFDVAVSCKNLLTSLVNCSVQGQVVRDVSAPSSETFRKEPEKPQVEILSSEGAGEPHSSPELAATPGGPVKAETEEAAHSVSQEMSVNSPTAQESQRNAETPAETPTTAEACSPSPAVQTFSEAKKTSTEPGCERKHYQLNFLLENEGVFSDALMVTQDVLKKLEMCSEIKGPQKEVILNCCEGRTPKETIENLLHRMTEEKTLTAEGLVKLLQAVKTTFPNLGLLLEKLQKSATLPSTTVQPSPDDYGTELLRRYHENLSEIFTDNQILLKMISHMTSLAPGEREVMEKLVKRDSGSGGFNSLISAVLEKQTLSATAIWQLLLVVQETKTCPLDLLMEEIRREPGADAFFRAVTTPEHATLETILRHNQLILEAIQQKIEYKLFTSEEEHLAETVKEILSIPSETASPEASLRAVLSRAMEKSVPAIEICHLLCSVHKSFPGLQPVMQELAYIGVLTKEDGEKETWKVSNKFHLEANNKEDEKAAKEDSQPGEQNDQGETGSLPGQQEKEASASPDPAKKSFICKACDKSFHFYCRLKVHMKRCRVAKSKQVQCKECSETKDSKKELDKHQLEAHGAGGEPDAPKKKKKRLPVTCDLCGREFAHASGMQYHKLTEHFDEKPFSCEECGAKFAANSTLKNHLRLHTGDRPFMCKHCLMTFTQASALAYHTKKKHSEGKMYACQYCDAVFAQSIELSRHVRTHTGDKPYVCRDCGKGFRQANGLSIHLHTFHNIEDPYDCKKCRMSFPTLQDHRKHIHEVHSKEYHPCPTCGKIFSAPSMLERHVVTHVGGKPFSCGICNKAYQQLSGLWYHNRTHHPDVFAAQNHRSSKFSSLQCSSCDKTFPNTIEHKKHIKAEHADMKFHECDQCKELFPTPALLQVHVKCQHSGSQPFRCLYCAATFRFPGALQHHVTTEHFKQSETTFPCELCGELFTSQAQLDSHLESEHPKVMSTETQAAASQMAQVIQTPEPVAPTEQVITLEETQLAGSQVFVTLPDSQASQASSELVAVTVEDLLDGTVTLICGEAK +ENSP00000346634 pep:known chromosome:GRCh37:1:36690017:36770958:1 gene:ENSG00000054118 transcript:ENST00000354618 gene_biotype:protein_coding transcript_biotype:protein_coding 
MSKTNKSKSGSRSSRSRSASRSRSRSFSKSRSRSRSLSRSRKRRLSSRSRSRSYSPAHNRERNHPRVYQNRDFRGHNRGYRRPYYFRGRNRGFYPWGQYNRGGYGNYRSNWQNYRQAYSPRRGRSRSRSPKRRSPSPRSRSHSRNSDKSSSDRSRRSSSSRSSSNHSRVESSKRKSAKEKKSSSKDSRPSQAAGDNQGDEAKEQTFSGGTSQDTKASESSKPWPDATYGTGSASRASAVSELSPRERSPALKSPLQSVVVRRRSPRPSPVPKPSPPLSSTSQMGSTLPSGAGYQSGTHQGQFDHGSGSLSPSKKSPVGKSPPSTGSTYGSSQKEESAASGGAAYTKRYLEEQKTENGKDKEQKQTNTDKEKIKEKGSFSDTGLGDGKMKSDSFAPKTDSEKPFRGSQSPKRYKLRDDFEKKMADFHKEEMDDQDKDKAKGRKESEFDDEPKFMSKVIGANKNQEEEKSGKWEGLVYAPPGKEKQRKTEELEEESFPERSKKEDRGKRSEGGHRGFVPEKNFRVTAYKAVQEKSSSPPPRKTSESRDKLGAKGDFPTGKSSFSITREAQVNVRMDSFDEDLARPSGLLAQERKLCRDLVHSNKKEQEFRSIFQHIQSAQSQRSPSELFAQHIVTIVHHVKEHHFGSSGMTLHERFTKYLKRGTEQEAAKNKKSPEIHRRIDISPSTFRKHGLAHDEMKSPREPGYKAEGKYKDDPVDLRLDIERRKKHKERDLKRGKSRESVDSRDSSHSRERSAEKTEKTHKGSKKQKKHRRARDRSRSSSSSSQSSHSYKAEEYTEETEEREESTTGFDKSRLGTKDFVGPSERGGGRARGTFQFRARGRGWGRGNYSGNNNNNSNNDFQKRNREEEWDPEYTPKSKKYYLHDDREGEGSDKWVSRGRGRGAFPRGRGRFMFRKSSTSPKWAHDKFSGEEGEIEDDESGTENREEKDNIQPTTE +ENSP00000318415 pep:known chromosome:GRCh37:1:89445139:89458455:-1 gene:ENSG00000213516 transcript:ENST00000321792 gene_biotype:protein_coding transcript_biotype:protein_coding MVEADRPGKLFIGGLNTETNEKALETVFGKYGRIVEVLLIKDRETNKSRGFAFVTFESPADAKDAARDMNGKSLDGKAIKVEQATKPSFERGRHGPPPPPRSRGPPRGFGAGRGGSGGTRGPPSRGGHMDDGGYSMNFNMSSSRGPLPVKRGPPPRSGGPSPKRSAPSGLVRSSSGMGGRAPLSRGRDSYGGPPRREPLPSRRDVYLSPRDDGYSTKDSYSSRDYPSSRDTRDYAPPPRDYTYRDYGHSSSRDDYPSRGYGDRDGYGRDRDYSDHPSGGSYRDSYESYGNSRSAPLTRGPPPSYGGSSRYDDYSSSRDGYGGSRDSYSSSRSDLYSSCDRVGRQERGLPPSVERGYPSSRDSYSSSSRGAPRGAGPGGSRSDRGGGRSRY +ENSP00000446099 pep:known chromosome:GRCh37:1:89445142:89458643:-1 gene:ENSG00000213516 transcript:ENST00000399794 gene_biotype:protein_coding transcript_biotype:protein_coding 
MVEADRPGKLFIGGLNTETNEKALETVFGKYGRIVEVLLIKDRETNKSRGFAFVTFESPADAKDAARDMNGKSLDGKAIKVEQATKPSFERGRHGPPPPPRSRGPPRGFGAGRGGSGGTRGPPSRGGHMDDGGYSMNFNMSSSRGPLPVKRGPPPRSGGPSPKRSAPSGLVRSSSGMGGRAPLSRGRDSYGGPPRREPLPSRRDVYLSPRDDGYSTKDSYSSRDYPSSRDTRDYAPPPRDYTYRDYGHSSSRDDYPSRGYGDRDGYGRDRDYSDHPSGGSYRDSYESYGNSRSAPLTRGPPPSYGGSSRYDDYSSSRDGYGGSRDSYSSSRSDLYSSCDRVGRQERGLPPSVERGYPSSRDSYSSSSRGAPRGAGPGGSRSDRGGGRSRY diff -r 000000000000 -r 41a666a3d8a5 test-data/peptides_10_10.fa --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/peptides_10_10.fa Tue Dec 17 18:45:13 2013 -0500 @@ -0,0 +1,12 @@ +>ENSP00000363782 pep:sap chromosome:GRCh37:1:22778472:22853855:1 gene:ENSG00000184677 transcript:ENST00000374651 gene_biotype:protein_coding transcript_biotype:protein_coding snp_location:chr1:22846709 codon_change:Gtg/Atg sap:V885M +FSAPSMLERHMVTHVGGKPFS +>ENSP00000346634 pep:sap chromosome:GRCh37:1:36690017:36770958:1 gene:ENSG00000054118 transcript:ENST00000354618 gene_biotype:protein_coding transcript_biotype:protein_coding snp_location:chr1:36752433 codon_change:gCc/gTc sap:A201V +QAAGDNQGDEVKEQTFSGGTS +>ENSP00000318415 pep:sap chromosome:GRCh37:1:89445139:89458455:-1 gene:ENSG00000213516 transcript:ENST00000321792 gene_biotype:protein_coding transcript_biotype:protein_coding snp_location:chr1:89449390 codon_change:atA/atG sap:I40M +KYGRIVEVLLMKDRETNKSRG +>ENSP00000446099 pep:sap chromosome:GRCh37:1:89445142:89458643:-1 gene:ENSG00000213516 transcript:ENST00000399794 gene_biotype:protein_coding transcript_biotype:protein_coding snp_location:chr1:89449390 codon_change:atA/atG sap:I40M +KYGRIVEVLLMKDRETNKSRG +>ENSP00000318415 pep:sap chromosome:GRCh37:1:89445139:89458455:-1 gene:ENSG00000213516 transcript:ENST00000321792 gene_biotype:protein_coding transcript_biotype:protein_coding snp_location:chr1:89449434 codon_change:Aca/Gca sap:T26A +NTETNEKALEAVFGKYGRIVE +>ENSP00000446099 pep:sap chromosome:GRCh37:1:89445142:89458643:-1 gene:ENSG00000213516 transcript:ENST00000399794 
gene_biotype:protein_coding transcript_biotype:protein_coding snp_location:chr1:89449434 codon_change:Aca/Gca sap:T26A +NTETNEKALEAVFGKYGRIVE diff -r 000000000000 -r 41a666a3d8a5 test-data/snpeff.vcf --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/snpeff.vcf Tue Dec 17 18:45:13 2013 -0500 @@ -0,0 +1,35 @@ +##fileformat=VCFv4.1 +##samtoolsVersion=0.1.18 (r982:295) +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##SnpEffVersion="3.4 (build 2013-11-27), by Pablo Cingolani" +##SnpEffCmd="SnpEff -i vcf -o vcf -upDownStreamLen 5000 -spliceSiteSize 1 -snp -no-downstream -no-intergenic -no-intron -no-upstream -no-utr -stats /galaxy/PRODUCTION/database/files/000/376/dataset_376197.dat GRCh37.73 /galaxy/PRODUCTION/database/files/000/376/dataset_376194.dat " +##INFO= +##SnpEffCmd="SnpEff -i vcf -o vcf -upDownStreamLen 5000 -spliceSiteSize 1 -geneId -no-downstream -no-intergenic -no-intron -no-upstream -no-utr -stats /galaxy/PRODUCTION/database/files/000/376/dataset_376199.dat GRCh37.73 /galaxy/PRODUCTION/database/files/000/376/dataset_376196.dat " +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT /galaxy/PRODUCTION/database/tmp/tmp-SAMTOOLS-BmXdVn/bam_input_0.bam +chr1 22846709 . G A 9.31 . DP=2;VDB=0.0174;AF1=1;AC1=2;DP4=0,0,1,1;MQ=20;FQ=-33;EFF=NON_SYNONYMOUS_CODING(MODERATE|MISSENSE|Gtg/Atg|V885M|1127|ZBTB40|protein_coding|CODING|ENST00000374651|12|1) PL 40,6,0 +chr1 36752433 . C T 6.02 . DP=2;VDB=0.0099;AF1=1;AC1=2;DP4=0,0,2,0;MQ=20;FQ=-33;EFF=NON_SYNONYMOUS_CODING(MODERATE|MISSENSE|gCc/gTc|A201V|955|THRAP3|protein_coding|CODING|ENST00000354618|4|1) PL 36,6,0 +chr1 89449390 . T C 15.1 . 
DP=3;VDB=0.0214;AF1=1;AC1=2;DP4=0,0,1,2;MQ=20;FQ=-36;EFF=EXON(MODIFIER|||||RBMXL1|processed_transcript|CODING|ENST00000413769|3|1),NON_SYNONYMOUS_CODING(MODERATE|MISSENSE|atA/atG|I40M|390|RBMXL1|protein_coding|CODING|ENST00000321792|2|1),NON_SYNONYMOUS_CODING(MODERATE|MISSENSE|atA/atG|I40M|390|RBMXL1|protein_coding|CODING|ENST00000399794|3|1);EFF=EXON(MODIFIER|||||ENSG00000213516|processed_transcript|CODING|ENST00000413769|3|1) PL 47,9,0 +chr1 89449434 . T C 9.31 . DP=2;VDB=0.0120;AF1=1;AC1=2;DP4=0,0,1,1;MQ=20;FQ=-33;EFF=EXON(MODIFIER|||||RBMXL1|processed_transcript|CODING|ENST00000413769|3|1),NON_SYNONYMOUS_CODING(MODERATE|MISSENSE|Aca/Gca|T26A|390|RBMXL1|protein_coding|CODING|ENST00000321792|2|1),NON_SYNONYMOUS_CODING(MODERATE|MISSENSE|Aca/Gca|T26A|390|RBMXL1|protein_coding|CODING|ENST00000399794|3|1);EFF=EXON(MODIFIER|||||ENSG00000213516|processed_transcript|CODING|ENST00000413769|3|1) PL 40,6,0