Mercurial > repos > jjohnson > snpeff_to_peptides
comparison snpeff_to_peptides.py @ 0:41a666a3d8a5 draft default tip
Uploaded
| author | jjohnson |
|---|---|
| date | Tue, 17 Dec 2013 18:45:13 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:41a666a3d8a5 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 """ | |
| 3 # | |
| 4 #------------------------------------------------------------------------------ | |
| 5 # University of Minnesota | |
| 6 # Copyright 2013, Regents of the University of Minnesota | |
| 7 #------------------------------------------------------------------------------ | |
| 8 # Author: | |
| 9 # | |
| 10 # James E Johnson | |
| 11 # | |
| 12 #------------------------------------------------------------------------------ | |
| 13 """ | |
| 14 | |
| 15 | |
| 16 """ | |
| 17 This tool takes a SnpEff VCF file and an Ensembl pep.all.fa file ( e.g. Homo_sapiens.GRCh37.73.pep.all.fa ) | |
| 18 It outputs a peptide fasta file with the variant peptide sequence that result from NON_SYNONYMOUS_CODING effects | |
| 19 | |
| 20 """ | |
| 21 | |
| 22 import sys,re,os.path | |
| 23 import tempfile | |
| 24 import optparse | |
| 25 from optparse import OptionParser | |
| 26 import logging | |
| 27 | |
| 28 ## dictionary for Amino Acid Abbreviations | |
| 29 aa_abbrev_dict = dict() | |
| 30 aa_abbrev_dict['Phe'] = 'F' | |
| 31 aa_abbrev_dict['Leu'] = 'L' | |
| 32 aa_abbrev_dict['Ser'] = 'S' | |
| 33 aa_abbrev_dict['Tyr'] = 'Y' | |
| 34 aa_abbrev_dict['Cys'] = 'C' | |
| 35 aa_abbrev_dict['Trp'] = 'W' | |
| 36 aa_abbrev_dict['Pro'] = 'P' | |
| 37 aa_abbrev_dict['His'] = 'H' | |
| 38 aa_abbrev_dict['Gln'] = 'Q' | |
| 39 aa_abbrev_dict['Arg'] = 'R' | |
| 40 aa_abbrev_dict['Ile'] = 'I' | |
| 41 aa_abbrev_dict['Met'] = 'M' | |
| 42 aa_abbrev_dict['Thr'] = 'T' | |
| 43 aa_abbrev_dict['Asn'] = 'N' | |
| 44 aa_abbrev_dict['Lys'] = 'K' | |
| 45 aa_abbrev_dict['Val'] = 'V' | |
| 46 aa_abbrev_dict['Ala'] = 'A' | |
| 47 aa_abbrev_dict['Asp'] = 'D' | |
| 48 aa_abbrev_dict['Glu'] = 'E' | |
| 49 aa_abbrev_dict['Gly'] = 'G' | |
| 50 | |
| 51 ## Get the peptide ID and sequence a given ID | |
| 52 def get_sequence(id,seq_file): | |
| 53 fh = open(seq_file, 'r') | |
| 54 try: | |
| 55 for (ln,line) in enumerate(fh): | |
| 56 if line.find(id) >= 0: | |
| 57 fields = line.split('\t') | |
| 58 return ( ' '.join(fields[0:-1]),fields[-1].rstrip() if fields and len(fields) > 0 else None ) | |
| 59 except Exception, e: | |
| 60 print >> sys.stderr, "failed: %s" % e | |
| 61 finally: | |
| 62 fh.close() | |
| 63 | |
| 64 def fasta_to_tabular(fasta_file,tabular_file): | |
| 65 inFile = open(fasta_file,'r') | |
| 66 outFile = open(tabular_file,'w') | |
| 67 for i, line in enumerate( inFile ): | |
| 68 line = line.rstrip( '\r\n' ) | |
| 69 if not line or line.startswith( '#' ): | |
| 70 continue | |
| 71 if line.startswith( '>' ): | |
| 72 #Don't want any existing tabs to trigger extra columns: | |
| 73 line = line.replace('\t', ' ') | |
| 74 if i > 0: | |
| 75 outFile.write('\n') | |
| 76 outFile.write(line[1:]) | |
| 77 outFile.write('\t') | |
| 78 else: | |
| 79 outFile.write(line) | |
| 80 if i > 0: | |
| 81 outFile.write('\n') | |
| 82 if inFile: | |
| 83 inFile.close() | |
| 84 if outFile: | |
| 85 outFile.close() | |
| 86 | |
| 87 def __main__(): | |
| 88 #Parse Command Line | |
| 89 parser = optparse.OptionParser() | |
| 90 parser.add_option( '-i', '--input', dest='input', help='The input snpeff vcf file with HGVS annotations (else read from stdin)' ) | |
| 91 parser.add_option( '-o', '--output', dest='output', help='The output fasta (else write to stdout)' ) | |
| 92 parser.add_option( '-p', '--protein_fasta', dest='protein_fasta', default=None, help='The Esembl protein fasta in tabular format' ) | |
| 93 parser.add_option( '-l', '--leading_aa_num', dest='leading_aa_num', type='int', default=None, help='leading number of AAs to output' ) | |
| 94 parser.add_option( '-t', '--trailing_aa_num', dest='trailing_aa_num', type='int', default=None, help='trailing number of AAs to output' ) | |
| 95 parser.add_option( '-d', '--debug', dest='debug', action='store_true', default=False, help='Turn on wrapper debugging to stdout' ) | |
| 96 (options, args) = parser.parse_args() | |
| 97 | |
| 98 # need protein_fasta file | |
| 99 fastaFile = options.protein_fasta | |
| 100 if options.protein_fasta == None: | |
| 101 print >> sys.stderr, "Ensembl protein_fasta tabular file required" | |
| 102 exit(4) | |
| 103 else: | |
| 104 # determine if fasta is already in tabular format | |
| 105 is_tabular = False | |
| 106 standard_aa = '^[AC-IK-WY]+$' | |
| 107 standard_na = '^[ACGTN]+$' | |
| 108 inFile = open(fastaFile,'r') | |
| 109 try: | |
| 110 nseq = 0 | |
| 111 for i, line in enumerate( inFile ): | |
| 112 line = line.rstrip( '\r\n' ) | |
| 113 if not line or line.startswith( '#' ): | |
| 114 continue | |
| 115 fields = line.split('\t') | |
| 116 if len(fields) < 2: | |
| 117 is_tabular = False | |
| 118 if line[0] != '>': | |
| 119 print >> sys.stderr, "failed: %s does not appear to be a fasta file" % fastaFile | |
| 120 exit(4) | |
| 121 break | |
| 122 if re.match('^[A-Z]+$',fields[-1].upper()): | |
| 123 is_tabular = True | |
| 124 nseq += 1 | |
| 125 else: | |
| 126 if line[0] != '>': | |
| 127 print >> sys.stderr, "failed: %s does not appear to be a fasta file" % fastaFile | |
| 128 exit(4) | |
| 129 if nseq > 10: | |
| 130 break | |
| 131 finally: | |
| 132 if inFile: | |
| 133 inFile.close() | |
| 134 if not is_tabular: | |
| 135 fastaFile = tempfile.NamedTemporaryFile(prefix='pep_fasta_',suffix=".tab",dir=os.getcwd()).name | |
| 136 fasta_to_tabular(options.protein_fasta,fastaFile) | |
| 137 # vcf input | |
| 138 if options.input != None: | |
| 139 try: | |
| 140 inputPath = os.path.abspath(options.input) | |
| 141 inputFile = open(inputPath, 'r') | |
| 142 except Exception, e: | |
| 143 print >> sys.stderr, "failed: %s" % e | |
| 144 exit(2) | |
| 145 else: | |
| 146 inputFile = sys.stdin | |
| 147 # output | |
| 148 if options.output != None: | |
| 149 try: | |
| 150 outputPath = os.path.abspath(options.output) | |
| 151 outputFile = open(outputPath, 'w') | |
| 152 except Exception, e: | |
| 153 print >> sys.stderr, "failed: %s" % e | |
| 154 exit(3) | |
| 155 else: | |
| 156 outputFile = sys.stdout | |
| 157 ## Amino_Acid_Change notations | |
| 158 # G528R | |
| 159 # p.Gly528Arg/c.1582G>C | |
| 160 aa_change_regex = '([A-Z])(\d+)([A-Z])' # G528R | |
| 161 aa_hgvs_regex = 'p\.([A-Z][a-z][a-z])(\d+)([A-Z][a-z][a-z])(/c\.(\d+)([ACGTN])>([ACGTN]))' # p.Gly528Arg/c.1582G>C | |
| 162 # Save VCF file header, not currently used | |
| 163 vcf_header = [] | |
| 164 reading_entries = False | |
| 165 try: | |
| 166 for linenum,line in enumerate(inputFile): | |
| 167 ## print >> sys.stderr, "%d: %s\n" % (linenum,line) | |
| 168 if line.startswith('##'): | |
| 169 vcf_header.append(line) | |
| 170 # May need to check SnpEff version in the header, the EFF info changed between versions 2 and 3 | |
| 171 ##SnpEffVersion | |
| 172 elif line.startswith('#CHROM'): | |
| 173 reading_entries = True | |
| 174 else: | |
| 175 fields = line.split('\t') | |
| 176 # This is the current format of the EFF entry: | |
| 177 # EFF=missense(MODERATE|MISSENSE|Ggg/Cgg|G528R|802|SCNN1D|protein_coding|CODING|ENST00000379116|12|1);OICR=(ENST00000379116|1808) | |
| 178 # If this becomes variable, will need to dynamically pattern this on the defintion in the vcf header: | |
| 179 ##INFO=<ID=EFF,Number=.,Type=String,Description="Predicted effects for this variant.Format: 'Effect ( Effect_Impact | Functional_Class | Codon_Change | Amino_Acid_Change| Amino_Acid_length | Gene_Name | Transcript_BioType | Gene_Coding | Transcript_ID | Exon_Rank | Genotype_Number [ | ERRORS | WARNINGS ] )' "> | |
| 180 (chrom,pos,id,ref,alts,qual,filter,info) = fields[0:8] | |
| 181 for info_item in info.split(';'): | |
| 182 try: | |
| 183 if info_item.find('=') < 0: | |
| 184 continue | |
| 185 (key,val) = info_item.split('=',1) | |
| 186 if key == 'EFF': | |
| 187 effects = val.split(',') | |
| 188 for effect in effects: | |
| 189 (eff,effs) = effect.rstrip(')').split('(') | |
| 190 if not eff == 'NON_SYNONYMOUS_CODING': | |
| 191 continue | |
| 192 eff_fields = effs.split('|') | |
| 193 (impact,functional_class,codon_change,aa_change,aa_len,gene_name,biotype,coding,transcript,exon) = eff_fields[0:10] | |
| 194 if transcript: | |
| 195 aa_pos = None # 1-based position | |
| 196 alt_aa = '_' | |
| 197 # parse aa_change | |
| 198 # get AA change position and alternate Animo Acid | |
| 199 sap = aa_change | |
| 200 m = re.match(aa_change_regex,aa_change) | |
| 201 if m: | |
| 202 aa_pos = int(m.groups()[1]) | |
| 203 alt_aa = m.groups()[2] | |
| 204 else: | |
| 205 m = re.match(aa_hgvs_regex,aa_change) | |
| 206 if m: | |
| 207 aa_pos = int(m.groups()[1]) | |
| 208 ref_aa = aa_abbrev_dict[m.groups()[0]] | |
| 209 alt_aa = aa_abbrev_dict[m.groups()[2]] | |
| 210 sap = "%s%d%s" % (ref_aa,aa_pos,alt_aa) | |
| 211 if not aa_pos: | |
| 212 continue | |
| 213 # get AA sequence | |
| 214 aa_offset = aa_pos - 1 | |
| 215 (pep_id,pep_seq) = get_sequence(transcript,fastaFile) | |
| 216 if not pep_seq: | |
| 217 continue | |
| 218 start_pos = max(aa_offset - options.leading_aa_num, 0) if options.leading_aa_num else 0 | |
| 219 end_pos = min(aa_offset + options.trailing_aa_num + 1, len(pep_seq)) if options.trailing_aa_num else len(pep_seq) | |
| 220 # transform sequence | |
| 221 alt_seq = pep_seq[start_pos:aa_offset] + alt_aa + pep_seq[aa_offset+1:end_pos] | |
| 222 # >ENSP00000363782 pep:known chromosome:GRCh37:1:22778472:22853855:1 gene:ENSG00000184677 transcript:ENST00000374651 gene_biotype:protein_coding transcript_biotype:protein_coding snp_location:1:22778472 codon_change:Gtg/Atg sap:V885M | |
| 223 pep_id = re.sub('pep:[a-z]*','pep:sap',pep_id) | |
| 224 hdr = ">%s snp_location:%s:%s codon_change:%s sap:%s\n" % (pep_id, chrom, pos, codon_change, sap) | |
| 225 outputFile.write(hdr) | |
| 226 if options.debug: | |
| 227 trimmed_seq = pep_seq[start_pos:end_pos] | |
| 228 outputFile.write(trimmed_seq) | |
| 229 outputFile.write('\n') | |
| 230 outputFile.write(alt_seq) | |
| 231 outputFile.write('\n') | |
| 232 except Exception, e: | |
| 233 print >> sys.stderr, "failed: %s" % e | |
| 234 except Exception, e: | |
| 235 print >> sys.stderr, "failed: %s" % e | |
| 236 exit(1) | |
| 237 | |
| 238 if __name__ == "__main__" : __main__() | |
| 239 |
