# HG changeset patch # User iuc # Date 1632584281 0 # Node ID 9c298cab341dad98b1ad8d01a3f9829f500a240e # Parent bba49324f2fa9cdfecc5f37f551dfc33b9e6f9fd "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/gffread commit f40643d8b80299ebb84faebe92579321ac459746" diff -r bba49324f2fa -r 9c298cab341d gffread.xml --- a/gffread.xml Tue Aug 31 08:29:57 2021 +0000 +++ b/gffread.xml Sat Sep 25 15:38:01 2021 +0000 @@ -1,16 +1,21 @@ - + Filters and/or converts GFF3/GTF2 records gffread - 0.11.6 + + 2.2.1.3 + 0.12.7 + 0 - - - + + + + @@ -25,14 +30,14 @@ - + \w+ - + @@ -51,14 +56,19 @@ - gffread + gffread + gffread --version 'gtf' #if $input.datatype.file_ext == 'gft': $gffs.ensembl #end if - $gffs.output_cmd - #elif $gffs.gff_fmt == 'gtf': - $gffs.output_cmd + #end if + #if $gffs.gff_fmt == 'gtf' + -T + #elif $gffs.gff_fmt == 'bed' + --bed #end if + -o output.$gffs.gff_fmt #end if + +## Missing options +## +## --ids +## --nids +## -l +## --jmatch +## --nc +## --ignore-locus +## -A -s (see above) +## --sort-alpha : chromosomes (reference sequences) are sorted alphabetically +## --sort-by : sort the reference sequences by the order in which their +## names are given in the file +## Misc +## --keep-exon-attrs : for -F option, do not attempt to reduce redundant +## --attrs +## --keep-genes : in transcript-only mode (default), also preserve gene records +## --keep-comments: for GFF3 input/output, try to preserve comments +## -B (see above) +## -P +## --add-hasCDS : add a "hasCDS" attribute with value "true" for transcripts +## that have CDS features +## --adj-stop stop codon adjustment: enables -P and performs automatic +## adjustment of the CDS stop coordinate if premature or downstream + +## --in-tlf: input GFF-like one-line-per-transcript format without exon/CDS +## features (see --tlf option below); automatic if the input +## filename ends with .tlf) +## --stream: fast processing of input GFF/BED transcripts as they are received +## ((no 
sorting, exons must be grouped by transcript in the input data) + +## Clustering + +## -Y + +## Output + +## --gene2exon +## --t-adopt +## -j +## --w-add +## --w-nocds ]]> - + @@ -138,9 +194,9 @@ - + + [['strand']'chr':]'start'..'end'
examples:
1000..500000
chr1:1000..500000
@@ -150,14 +206,14 @@
(([+-])?(\w+:))?\d+\.\.\d+ - +
- - - + + + It is useful for switching between Ensembl and UCSC naming conventions
NOTE: GFF records on reference sequences that are not found among the "original_ref_ID" entries in this file will be filtered out @@ -177,10 +233,10 @@ - + - - + + @@ -195,7 +251,7 @@ - + @@ -203,14 +259,14 @@ - + - + @@ -222,35 +278,39 @@ + - - + - + + - - - + + +
- + gffs['gff_fmt'] == 'gff' gffs['gff_fmt'] == 'gtf' + + gffs['gff_fmt'] == 'bed' + 'fa_outputs' in reference_genome and str(reference_genome['fa_outputs']).find('exons.fa') > 0 @@ -265,28 +325,48 @@ - + - + - + + + + + + - + - - - - - + + + + + - + + - + + + + + + + + + + + + + + + @@ -298,7 +378,7 @@ - + @@ -311,7 +391,7 @@ - + @@ -324,7 +404,7 @@ - + @@ -357,7 +437,18 @@ - + + + + + + + + + + + + [-g | ][-s ] - [-o ] [-t ] [-r [[]:].. [-R]] + gffread [-g | ] [-s ] + [-o ] [-t ] [-r []:- [-R]] + [--jmatch :-] [--no-pseudo] [-CTVNJMKQAFPGUBHZWTOLE] [-w ] [-x ] [-y ] - [-i ] [--bed] [--table ] [--sort-by ] - + [-j ][--ids | --nids ] [--attrs ] [-i ] + [--stream] [--bed | --gtf | --tlf] [--table ] [--sort-by ] + [] + Filter, convert or cluster GFF/GTF/BED records, extract the sequence of transcripts (exon or CDS) and more. By default (i.e. without -O) only transcripts are processed, discarding any other non-transcript features. Default output is a simplified GFF3 with only the basic attributes. - is a GFF file, use '-' for stdin - Options: - + --ids discard records/transcripts if their IDs are not listed in + --nids discard records/transcripts if their IDs are listed in -i discard transcripts having an intron larger than -l discard transcripts shorter than bases -r only show transcripts overlapping coordinate range .. 
(on chromosome/contig , strand if provided) -R for -r option, discard all transcripts that are not fully contained within the given range + --jmatch only output transcripts matching the given junction -U discard single-exon transcripts -C coding only: discard mRNAs that have no CDS features --nc non-coding only: discard mRNAs that have CDS features @@ -401,18 +494,18 @@ for each of the mapped sequences: (useful for -A option with mRNA/EST/protein mappings) - - Sorting: (by default, chromosomes are kept in the order they were found) + Sorting: (by default, chromosomes are kept in the order they were found) --sort-alpha : chromosomes (reference sequences) are sorted alphabetically --sort-by : sort the reference sequences by the order in which their names are given in the file - Misc options: - -F preserve all GFF attributes (for non-exon features) + -F keep all GFF attributes (for non-exon features) --keep-exon-attrs : for -F option, do not attempt to reduce redundant exon/CDS attributes -G do not keep exon attributes, move them to the transcript feature (for GFF3 output) + --attrs only output the GTF/GFF attributes listed in + which is a comma delimited list of attribute names to output --keep-genes : in transcript-only mode (default), also preserve gene records --keep-comments: for GFF3 input/output, try to preserve comments -O process other non-transcript GFF records (by default non-transcript @@ -440,10 +533,11 @@ --in-tlf: input GFF-like one-line-per-transcript format without exon/CDS features (see --tlf option below); automatic if the input filename ends with .tlf) - + --stream: fast processing of input GFF/BED transcripts as they are received + (no sorting, exons must be grouped by transcript in the input data) Clustering: -M/--merge : cluster the input transcripts into loci, discarding - "duplicated" transcripts (those with the same exact introns + "redundant" transcripts (those with the same exact introns and fully contained or equal boundaries) -d : for -M 
option, write duplication info to file --cluster-only: same as -M/--merge but without discarding any of the @@ -455,7 +549,6 @@ multi-exon transcripts, and >=80% overlap for single-exon transcripts -Y for -M option, enforce -Q but also discard overlapping single-exon transcripts, even on the opposite strand (can be combined with -K) - Output options: --force-exons: make sure that the lowest level GFF features are considered "exon" features @@ -468,25 +561,26 @@ -g full path to a multi-fasta file with the genomic sequences for all input mappings, OR a directory with single-fasta files (one per genomic sequence, with file names matching sequence names) - -w write a fasta file with spliced exons for each GFF transcript + -j output the junctions and the corresponding transcripts + -w write a fasta file with spliced exons for each transcript + --w-add for the -w option, extract additional bases + both upstream and downstream of the transcript boundaries + --w-nocds for -w, disable the output of CDS info in the FASTA file -x write a fasta file with spliced CDS for each GFF transcript -y write a protein fasta file with the translation of CDS for each record - -W for -w and -x options, write in the FASTA defline the exon + -W for -w, -x and -y options, write in the FASTA defline all the exon coordinates projected onto the spliced sequence; - for -y option, write transcript attributes in the FASTA defline -S for -y option, use '*' instead of '.' as stop codon translation - -L Ensembl GTF to GFF3 conversion (implies -F; should be used with -m) + -L Ensembl GTF to GFF3 conversion, adds version to IDs -m is a name mapping table for converting reference sequence names, having this 2-column format: - WARNING: all GFF records on reference sequences whose original IDs - are not found in the 1st column of this table will be discarded! 
-t use in the 2nd column of each GFF/GTF output line - -o write the records into instead of stdout + -o write the output records into instead of stdout -T main output will be GTF instead of GFF3 --bed output records in BED format instead of default GFF3 --tlf output "transcript line format" which is like GFF - but exons, CDS features and related data are stored as GFF + but with exons and CDS related features stored as GFF attributes in the transcript feature line, like this: exoncount=N;exons=;CDSphase=;CDS= is a comma-delimited list of exon_start-exon_end coordinates; @@ -494,9 +588,14 @@ --table output a simple tab delimited format instead of GFF, with columns having the values of GFF attributes given in ; special pseudo-attributes (prefixed by @) are recognized: - @chr, @start, @end, @strand, @numexons, @exons, @cds, @covlen, @cdslen + @id, @geneid, @chr, @start, @end, @strand, @numexons, @exons, + @cds, @covlen, @cdslen + If any of -w/-y/-x FASTA output files are enabled, the same fields + (excluding @id) are appended to the definition line of corresponding + FASTA records -v,-E expose (warn about) duplicate transcript IDs and other potential problems with the given GFF/GTF records + ]]> diff -r bba49324f2fa -r 9c298cab341d test-data/Homo_sapiens.GRCh37_19.71.bed --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/Homo_sapiens.GRCh37_19.71.bed Sat Sep 25 15:38:01 2021 +0000 @@ -0,0 +1,42 @@ +19 223157 223261 ENST00000410397 100 - 223157 223261 0,0,0 1 104, 0, geneID=ENSG00000222329;gene_name=U6 +19 229639 230165 ENST00000587910 100 - 229639 230165 0,0,0 2 70,82, 0,444, geneID=ENSG00000267600;gene_name=AC098474.1 +19 239144 239247 ENST00000588755 100 - 239144 239247 0,0,0 1 103, 0, geneID=ENSG00000267305;gene_name=CTD-3113P16.7 +19 279494 280170 ENST00000589981 100 + 279494 280170 0,0,0 1 676, 0, geneID=ENSG00000267447;gene_name=VN2R11P +19 281042 291386 ENST00000269812 100 - 281387 291336 0,0,0 6 495,177,58,278,152,102, 0,1091,1709,6431,6977,10242, 
CDS=281387:291336;CDSphase=0;geneID=ENSG00000141934;gene_name=PPAP2C +19 281344 291393 ENST00000434325 100 - 281387 288055 0,0,0 6 193,177,58,278,152,68, 0,789,1407,6129,6675,9981, CDS=281387:288055;CDSphase=0;geneID=ENSG00000141934;gene_name=PPAP2C +19 281387 291200 ENST00000327790 100 - 281387 291066 0,0,0 6 150,177,58,278,152,249, 0,746,1364,6086,6632,9564, CDS=281387:291066;CDSphase=0;geneID=ENSG00000141934;gene_name=PPAP2C +19 281990 287636 ENST00000586998 100 - 282121 287636 2,0,0 3 320,58,163, 0,761,5483, CDS=282121:287636;CDSphase=2;geneID=ENSG00000141934;gene_name=PPAP2C +19 287160 288530 ENST00000589672 100 - 287160 288530 0,0,0 2 591,511, 0,859, geneID=ENSG00000141934;gene_name=PPAP2C +19 287473 291382 ENST00000591572 100 - 287473 291336 0,0,0 3 278,170,98, 0,546,3811, CDS=287473:291336;CDSphase=0;geneID=ENSG00000141934;gene_name=PPAP2C +19 305572 306467 ENST00000591533 100 + 305572 306467 0,0,0 2 131,411, 0,484, geneID=ENSG00000267124;gene_name=CTD-3113P16.5 +19 305574 344793 ENST00000264819 100 - 306689 344782 0,0,0 14 1137,418,89,125,95,82,152,70,92,124,126,143,91,20, 0,1544,3002,3226,6270,6616,7917,20060,20932,21558,22289,28825,30508,39199, CDS=306689:344782;CDSphase=0;geneID=ENSG00000105556;gene_name=MIER2 +19 305578 325706 ENST00000589092 100 + 305578 325706 0,0,0 2 356,83, 0,20045, geneID=ENSG00000267124;gene_name=CTD-3113P16.5 +19 326606 336178 ENST00000586994 100 - 326606 336178 0,0,0 4 650,126,143,96, 0,1257,7793,9476, geneID=ENSG00000105556;gene_name=MIER2 +19 327863 340599 ENST00000592722 100 - 327863 340599 0,0,0 5 126,117,143,91,86, 0,2400,6536,8219,12650, geneID=ENSG00000105556;gene_name=MIER2 +19 334114 344798 ENST00000587966 100 - 334114 344798 0,0,0 2 428,25, 0,10659, geneID=ENSG00000105556;gene_name=MIER2 +19 361749 376013 ENST00000342640 100 - 362199 375970 0,0,0 8 677,160,118,70,62,72,123,351, 0,5315,9455,10881,11720,12190,12549,13913, CDS=362199:375970;CDSphase=0;geneID=ENSG00000105549;gene_name=THEG +19 362057 374620 
ENST00000530711 100 - 362057 374620 0,0,0 3 369,160,322, 0,5007,12241, geneID=ENSG00000105549;gene_name=THEG +19 362199 375970 ENST00000346878 100 - 362199 375970 0,0,0 7 227,160,118,70,62,123,308, 0,4865,9005,10431,11270,12099,13463, CDS=362199:375970;CDSphase=0;geneID=ENSG00000105549;gene_name=THEG +19 367201 374249 ENST00000528213 100 - 367201 374249 0,0,0 5 23,118,70,62,310, 0,4003,5429,6268,6738, geneID=ENSG00000105549;gene_name=THEG +19 397588 398941 ENST00000591757 100 + 397588 398941 0,0,0 2 45,252, 0,1101, geneID=ENSG00000267443;gene_name=AC010641.1 +19 405444 409139 ENST00000332235 100 - 407095 408361 0,0,0 2 2957,134, 0,3561, CDS=407095:408361;CDSphase=0;geneID=ENSG00000183186;gene_name=C2CD4C +19 416582 419879 ENST00000587423 100 - 416582 419879 0,0,0 2 740,957, 0,2340, geneID=ENSG00000129946;gene_name=SHC2 +19 416582 422828 ENST00000588376 100 - 416582 422828 0,0,0 3 740,134,683, 0,2340,5563, geneID=ENSG00000129946;gene_name=SHC2 +19 416592 460996 ENST00000264554 100 - 418927 460996 0,0,0 13 730,134,311,135,64,157,127,52,54,120,61,71,468, 0,2330,5553,8504,14091,18116,19572,19787,20037,22125,22377,24269,43936, CDS=418927:460996;CDSphase=0;geneID=ENSG00000129946;gene_name=SHC2 +19 416608 441384 ENST00000589922 100 - 416608 441384 0,0,0 11 714,134,311,135,64,157,127,304,120,61,523, 0,2314,5537,8488,14075,18100,19556,19771,22109,22361,24253, geneID=ENSG00000129946;gene_name=SHC2 +19 417199 436258 ENST00000590170 100 - 434761 436258 0,0,0 6 123,134,234,64,157,94, 0,1723,4946,13484,17509,18965, CDS=434761:436258;CDSphase=0;geneID=ENSG00000129946;gene_name=SHC2 +19 417199 436258 ENST00000591948 100 - 417199 436258 0,0,0 6 123,134,234,64,157,94, 0,1723,4946,13484,17509,18965, geneID=ENSG00000129946;gene_name=SHC2 +19 434701 460571 ENST00000590222 100 - 439397 460571 1,0,0 9 164,127,52,54,120,61,259,71,43, 0,1463,1678,1928,4016,4268,4531,6160,25827, CDS=439397:460571;CDSphase=1;geneID=ENSG00000129946;gene_name=SHC2 +19 435770 436534 ENST00000591388 100 - 435770 
436534 0,0,0 3 191,127,155, 0,394,609, geneID=ENSG00000129946;gene_name=SHC2 +19 435778 439031 ENST00000590113 100 - 435778 439031 0,0,0 6 183,127,52,54,120,62, 0,386,601,851,2939,3191, geneID=ENSG00000129946;gene_name=SHC2 +19 453133 453245 ENST00000516730 100 + 453133 453245 0,0,0 1 112, 0, geneID=ENSG00000252539;gene_name=RNA5SP462 +19 463345 474983 ENST00000315489 100 - 463843 474747 0,0,0 4 1019,114,108,363, 0,4303,9048,11275, CDS=463843:474747;CDSphase=0;geneID=ENSG00000181781;gene_name=ODF3L2 +19 463466 474880 ENST00000382696 100 - 463843 474747 0,0,0 3 898,114,260, 0,4182,11154, CDS=463843:474747;CDSphase=0;geneID=ENSG00000181781;gene_name=ODF3L2 +19 464145 472631 ENST00000591681 100 - 464145 472631 0,0,0 3 219,114,238, 0,3503,8248, geneID=ENSG00000181781;gene_name=ODF3L2 +19 489175 505342 ENST00000587541 100 + 489175 505342 0,0,0 3 864,261,598, 0,12493,15569, geneID=ENSG00000099866;gene_name=MADCAM1 +19 490045 507813 ENST00000592413 100 - 490045 507813 0,0,0 3 308,84,438, 0,11495,17330, geneID=ENSG00000266933;gene_name=AC005775.2 +19 496453 505207 ENST00000346144 100 + 496499 504965 0,0,0 4 98,285,330,463, 0,1379,2042,8291, CDS=496499:504965;CDSphase=0;geneID=ENSG00000099866;gene_name=MADCAM1 +19 496453 505347 ENST00000215637 100 + 496499 504965 0,0,0 5 98,285,330,261,603, 0,1379,2042,5215,8291, CDS=496499:504965;CDSphase=0;geneID=ENSG00000099866;gene_name=MADCAM1 +19 496499 504965 ENST00000382683 100 + 496499 504965 0,0,0 3 52,330,221, 0,1996,8245, CDS=496499:504965;CDSphase=0;geneID=ENSG00000099866;gene_name=MADCAM1 +19 507298 519654 ENST00000359315 100 + 507506 519423 0,0,0 2 546,766, 0,11590, CDS=507506:519423;CDSphase=0;geneID=ENSG00000141933;gene_name=TPGS1 +19 507499 510372 ENST00000588278 100 + 507499 510372 0,0,0 1 2873, 0, geneID=ENSG00000141933;gene_name=TPGS1 diff -r bba49324f2fa -r 9c298cab341d test-data/Homo_sapiens.GRCh37_19.71.gff3 --- a/test-data/Homo_sapiens.GRCh37_19.71.gff3 Tue Aug 31 08:29:57 2021 +0000 +++ 
b/test-data/Homo_sapiens.GRCh37_19.71.gff3 Sat Sep 25 15:38:01 2021 +0000 @@ -1,6 +1,6 @@ -# gffread /tmp/tmpq6d_yfqc/files/9/2/2/dataset_922cd54b-d77c-48fb-abf7-6fc8d8fdb97c.dat -o output.gff3 -# gffread v0.11.6 ##gff-version 3 +# gffread v0.12.7 +# gffread /tmp/tmpk_iy6dhb/files/e/1/9/dataset_e191f2e3-7ad2-452e-b21c-edd22b6ba6e2.dat -o output.gff 19 snRNA transcript 223158 223261 . - . ID=ENST00000410397;geneID=ENSG00000222329;gene_name=U6 19 snRNA exon 223158 223261 . - . Parent=ENST00000410397 19 unprocessed_pseudogene transcript 229640 230165 . - . ID=ENST00000587910;geneID=ENSG00000267600;gene_name=AC098474.1 diff -r bba49324f2fa -r 9c298cab341d test-data/ecoli-k12.processed.gff3 --- a/test-data/ecoli-k12.processed.gff3 Tue Aug 31 08:29:57 2021 +0000 +++ b/test-data/ecoli-k12.processed.gff3 Sat Sep 25 15:38:01 2021 +0000 @@ -1,33 +1,33 @@ -# gffread /tmp/tmpq6d_yfqc/files/2/7/7/dataset_277f6e18-b25a-4b59-b712-49b5c202a183.dat -F -o output.gff3 -# gffread v0.11.6 ##gff-version 3 -NC_000913.3 RefSeq gene 190 255 . + . ID=gene-b0001;geneID=gene-b0001;gene_name=thrL;Dbxref=ASAP:ABE-0000006,ECOCYC:EG11277,EcoGene:EG11277,GeneID:944742;Name=thrL;gbkey=Gene;gene=thrL;gene_biotype=protein_coding;gene_synonym=ECK0001;locus_tag=b0001 +# gffread v0.12.7 +# gffread /tmp/tmpk_iy6dhb/files/7/c/b/dataset_7cbb521e-a7fc-4b92-8335-006b4f916f5c.dat -F -o output.gff +NC_000913.3 RefSeq gene 190 255 . + . ID=gene-b0001;gene_name=thrL;Dbxref=ASAP:ABE-0000006,ECOCYC:EG11277,EcoGene:EG11277,GeneID:944742;Name=thrL;gbkey=Gene;gene=thrL;gene_biotype=protein_coding;gene_synonym=ECK0001;locus_tag=b0001 NC_000913.3 RefSeq CDS 190 255 . + 0 Parent=gene-b0001;Dbxref=UniProtKB/Swiss-Prot:P0AD86,Genbank:NP_414542.1,ASAP:ABE-0000006,ECOCYC:EG11277,EcoGene:EG11277,GeneID:944742;Name=NP_414542.1;gbkey=CDS;gene=thrL;locus_tag=b0001;orig_transcript_id=gnl|b0001|mrna.b0001;product=thr operon leader peptide;protein_id=NP_414542.1;transl_table=11 -NC_000913.3 RefSeq gene 337 2799 . + . 
ID=gene-b0002;geneID=gene-b0002;gene_name=thrA;Dbxref=ASAP:ABE-0000008,ECOCYC:EG10998,EcoGene:EG10998,GeneID:945803;Name=thrA;gbkey=Gene;gene=thrA;gene_biotype=protein_coding;gene_synonym=ECK0002,Hs,thrA1,thrA2,thrD;locus_tag=b0002 +NC_000913.3 RefSeq gene 337 2799 . + . ID=gene-b0002;gene_name=thrA;Dbxref=ASAP:ABE-0000008,ECOCYC:EG10998,EcoGene:EG10998,GeneID:945803;Name=thrA;gbkey=Gene;gene=thrA;gene_biotype=protein_coding;gene_synonym=ECK0002,Hs,thrA1,thrA2,thrD;locus_tag=b0002 NC_000913.3 RefSeq CDS 337 2799 . + 0 Parent=gene-b0002;Dbxref=UniProtKB/Swiss-Prot:P00561,Genbank:NP_414543.1,ASAP:ABE-0000008,ECOCYC:EG10998,EcoGene:EG10998,GeneID:945803;Name=NP_414543.1;gbkey=CDS;gene=thrA;locus_tag=b0002;orig_transcript_id=gnl|b0002|mrna.b0002;product=fused aspartate kinase/homoserine dehydrogenase 1;protein_id=NP_414543.1;transl_table=11 -NC_000913.3 RefSeq gene 2801 3733 . + . ID=gene-b0003;geneID=gene-b0003;gene_name=thrB;Dbxref=ASAP:ABE-0000010,ECOCYC:EG10999,EcoGene:EG10999,GeneID:947498;Name=thrB;gbkey=Gene;gene=thrB;gene_biotype=protein_coding;gene_synonym=ECK0003;locus_tag=b0003 +NC_000913.3 RefSeq gene 2801 3733 . + . ID=gene-b0003;gene_name=thrB;Dbxref=ASAP:ABE-0000010,ECOCYC:EG10999,EcoGene:EG10999,GeneID:947498;Name=thrB;gbkey=Gene;gene=thrB;gene_biotype=protein_coding;gene_synonym=ECK0003;locus_tag=b0003 NC_000913.3 RefSeq CDS 2801 3733 . + 0 Parent=gene-b0003;Dbxref=UniProtKB/Swiss-Prot:P00547,Genbank:NP_414544.1,ASAP:ABE-0000010,ECOCYC:EG10999,EcoGene:EG10999,GeneID:947498;Name=NP_414544.1;gbkey=CDS;gene=thrB;locus_tag=b0003;orig_transcript_id=gnl|b0003|mrna.b0003;product=homoserine kinase;protein_id=NP_414544.1;transl_table=11 -NC_000913.3 RefSeq gene 3734 5020 . + . ID=gene-b0004;geneID=gene-b0004;gene_name=thrC;Dbxref=ASAP:ABE-0000012,ECOCYC:EG11000,EcoGene:EG11000,GeneID:945198;Name=thrC;gbkey=Gene;gene=thrC;gene_biotype=protein_coding;gene_synonym=ECK0004;locus_tag=b0004 +NC_000913.3 RefSeq gene 3734 5020 . + . 
ID=gene-b0004;gene_name=thrC;Dbxref=ASAP:ABE-0000012,ECOCYC:EG11000,EcoGene:EG11000,GeneID:945198;Name=thrC;gbkey=Gene;gene=thrC;gene_biotype=protein_coding;gene_synonym=ECK0004;locus_tag=b0004 NC_000913.3 RefSeq CDS 3734 5020 . + 0 Parent=gene-b0004;Dbxref=UniProtKB/Swiss-Prot:P00934,Genbank:NP_414545.1,ASAP:ABE-0000012,ECOCYC:EG11000,EcoGene:EG11000,GeneID:945198;Name=NP_414545.1;gbkey=CDS;gene=thrC;locus_tag=b0004;orig_transcript_id=gnl|b0004|mrna.b0004;product=threonine synthase;protein_id=NP_414545.1;transl_table=11 -NC_000913.3 RefSeq gene 5234 5530 . + . ID=gene-b0005;geneID=gene-b0005;gene_name=yaaX;Dbxref=ASAP:ABE-0000015,ECOCYC:G6081,EcoGene:EG14384,GeneID:944747;Name=yaaX;gbkey=Gene;gene=yaaX;gene_biotype=protein_coding;gene_synonym=ECK0005;locus_tag=b0005 +NC_000913.3 RefSeq gene 5234 5530 . + . ID=gene-b0005;gene_name=yaaX;Dbxref=ASAP:ABE-0000015,ECOCYC:G6081,EcoGene:EG14384,GeneID:944747;Name=yaaX;gbkey=Gene;gene=yaaX;gene_biotype=protein_coding;gene_synonym=ECK0005;locus_tag=b0005 NC_000913.3 RefSeq CDS 5234 5530 . + 0 Parent=gene-b0005;Dbxref=UniProtKB/Swiss-Prot:P75616,Genbank:NP_414546.1,ASAP:ABE-0000015,ECOCYC:G6081,EcoGene:EG14384,GeneID:944747;Name=NP_414546.1;gbkey=CDS;gene=yaaX;locus_tag=b0005;orig_transcript_id=gnl|b0005|mrna.b0005;product=DUF2502 domain-containing protein YaaX;protein_id=NP_414546.1;transl_table=11 -NC_000913.3 RefSeq gene 5683 6459 . - . ID=gene-b0006;geneID=gene-b0006;gene_name=yaaA;Dbxref=ASAP:ABE-0000018,ECOCYC:EG10011,EcoGene:EG10011,GeneID:944749;Name=yaaA;gbkey=Gene;gene=yaaA;gene_biotype=protein_coding;gene_synonym=ECK0006;locus_tag=b0006 +NC_000913.3 RefSeq gene 5683 6459 . - . ID=gene-b0006;gene_name=yaaA;Dbxref=ASAP:ABE-0000018,ECOCYC:EG10011,EcoGene:EG10011,GeneID:944749;Name=yaaA;gbkey=Gene;gene=yaaA;gene_biotype=protein_coding;gene_synonym=ECK0006;locus_tag=b0006 NC_000913.3 RefSeq CDS 5683 6459 . 
- 0 Parent=gene-b0006;Dbxref=UniProtKB/Swiss-Prot:P0A8I3,Genbank:NP_414547.1,ASAP:ABE-0000018,ECOCYC:EG10011,EcoGene:EG10011,GeneID:944749;Name=NP_414547.1;gbkey=CDS;gene=yaaA;locus_tag=b0006;orig_transcript_id=gnl|b0006|mrna.b0006;product=peroxide stress resistance protein YaaA;protein_id=NP_414547.1;transl_table=11 -NC_000913.3 RefSeq gene 6529 7959 . - . ID=gene-b0007;geneID=gene-b0007;gene_name=yaaJ;Dbxref=ASAP:ABE-0000020,ECOCYC:EG11555,EcoGene:EG11555,GeneID:944745;Name=yaaJ;gbkey=Gene;gene=yaaJ;gene_biotype=protein_coding;gene_synonym=ECK0007;locus_tag=b0007 +NC_000913.3 RefSeq gene 6529 7959 . - . ID=gene-b0007;gene_name=yaaJ;Dbxref=ASAP:ABE-0000020,ECOCYC:EG11555,EcoGene:EG11555,GeneID:944745;Name=yaaJ;gbkey=Gene;gene=yaaJ;gene_biotype=protein_coding;gene_synonym=ECK0007;locus_tag=b0007 NC_000913.3 RefSeq CDS 6529 7959 . - 0 Parent=gene-b0007;Dbxref=UniProtKB/Swiss-Prot:P30143,Genbank:NP_414548.1,ASAP:ABE-0000020,ECOCYC:EG11555,EcoGene:EG11555,GeneID:944745;Name=NP_414548.1;gbkey=CDS;gene=yaaJ;locus_tag=b0007;orig_transcript_id=gnl|b0007|mrna.b0007;product=putative transporter YaaJ;protein_id=NP_414548.1;transl_table=11 -NC_000913.3 RefSeq gene 8238 9191 . + . ID=gene-b0008;geneID=gene-b0008;gene_name=talB;Dbxref=ASAP:ABE-0000027,ECOCYC:EG11556,EcoGene:EG11556,GeneID:944748;Name=talB;gbkey=Gene;gene=talB;gene_biotype=protein_coding;gene_synonym=ECK0008,yaaK;locus_tag=b0008 +NC_000913.3 RefSeq gene 8238 9191 . + . ID=gene-b0008;gene_name=talB;Dbxref=ASAP:ABE-0000027,ECOCYC:EG11556,EcoGene:EG11556,GeneID:944748;Name=talB;gbkey=Gene;gene=talB;gene_biotype=protein_coding;gene_synonym=ECK0008,yaaK;locus_tag=b0008 NC_000913.3 RefSeq CDS 8238 9191 . 
+ 0 Parent=gene-b0008;Dbxref=UniProtKB/Swiss-Prot:P0A870,Genbank:NP_414549.1,ASAP:ABE-0000027,ECOCYC:EG11556,EcoGene:EG11556,GeneID:944748;Name=NP_414549.1;gbkey=CDS;gene=talB;locus_tag=b0008;orig_transcript_id=gnl|b0008|mrna.b0008;product=transaldolase B;protein_id=NP_414549.1;transl_table=11 -NC_000913.3 RefSeq gene 9306 9893 . + . ID=gene-b0009;geneID=gene-b0009;gene_name=mog;Dbxref=ASAP:ABE-0000030,ECOCYC:EG11511,EcoGene:EG11511,GeneID:944760;Name=mog;gbkey=Gene;gene=mog;gene_biotype=protein_coding;gene_synonym=bisD,chlG,ECK0009,mogA,yaaG;locus_tag=b0009 +NC_000913.3 RefSeq gene 9306 9893 . + . ID=gene-b0009;gene_name=mog;Dbxref=ASAP:ABE-0000030,ECOCYC:EG11511,EcoGene:EG11511,GeneID:944760;Name=mog;gbkey=Gene;gene=mog;gene_biotype=protein_coding;gene_synonym=bisD,chlG,ECK0009,mogA,yaaG;locus_tag=b0009 NC_000913.3 RefSeq CDS 9306 9893 . + 0 Parent=gene-b0009;Dbxref=UniProtKB/Swiss-Prot:P0AF03,Genbank:NP_414550.1,ASAP:ABE-0000030,ECOCYC:EG11511,EcoGene:EG11511,GeneID:944760;Name=NP_414550.1;gbkey=CDS;gene=mog;locus_tag=b0009;orig_transcript_id=gnl|b0009|mrna.b0009;product=molybdopterin adenylyltransferase;protein_id=NP_414550.1;transl_table=11 -NC_000913.3 RefSeq gene 9928 10494 . - . ID=gene-b0010;geneID=gene-b0010;gene_name=satP;Dbxref=ASAP:ABE-0000032,ECOCYC:EG11512,EcoGene:EG11512,GeneID:944792;Name=satP;gbkey=Gene;gene=satP;gene_biotype=protein_coding;gene_synonym=ECK0010,yaaH;locus_tag=b0010 +NC_000913.3 RefSeq gene 9928 10494 . - . ID=gene-b0010;gene_name=satP;Dbxref=ASAP:ABE-0000032,ECOCYC:EG11512,EcoGene:EG11512,GeneID:944792;Name=satP;gbkey=Gene;gene=satP;gene_biotype=protein_coding;gene_synonym=ECK0010,yaaH;locus_tag=b0010 NC_000913.3 RefSeq CDS 9928 10494 . 
- 0 Parent=gene-b0010;Dbxref=UniProtKB/Swiss-Prot:P0AC98,Genbank:NP_414551.1,ASAP:ABE-0000032,ECOCYC:EG11512,EcoGene:EG11512,GeneID:944792;Name=NP_414551.1;gbkey=CDS;gene=satP;locus_tag=b0010;orig_transcript_id=gnl|b0010|mrna.b0010;product=acetate/succinate:H(+) symporter;protein_id=NP_414551.1;transl_table=11 -NC_000913.3 RefSeq gene 10643 11356 . - . ID=gene-b0011;geneID=gene-b0011;gene_name=yaaW;Dbxref=ASAP:ABE-0000037,ECOCYC:G6082,EcoGene:EG14340,GeneID:944771;Name=yaaW;gbkey=Gene;gene=yaaW;gene_biotype=protein_coding;gene_synonym=ECK0011;locus_tag=b0011 +NC_000913.3 RefSeq gene 10643 11356 . - . ID=gene-b0011;gene_name=yaaW;Dbxref=ASAP:ABE-0000037,ECOCYC:G6082,EcoGene:EG14340,GeneID:944771;Name=yaaW;gbkey=Gene;gene=yaaW;gene_biotype=protein_coding;gene_synonym=ECK0011;locus_tag=b0011 NC_000913.3 RefSeq CDS 10643 11356 . - 0 Parent=gene-b0011;Dbxref=UniProtKB/Swiss-Prot:P75617,Genbank:NP_414552.1,ASAP:ABE-0000037,ECOCYC:G6082,EcoGene:EG14340,GeneID:944771;Name=NP_414552.1;gbkey=CDS;gene=yaaW;locus_tag=b0011;orig_transcript_id=gnl|b0011|mrna.b0011;product=putative enzyme-specific chaperone YaaW;protein_id=NP_414552.1;transl_table=11 -NC_000913.3 RefSeq gene 10830 11315 . + . ID=gene-b0012;geneID=gene-b0012;gene_name=mbiA;Dbxref=ASAP:ABE-0000040,ECOCYC:EG11509,EcoGene:EG11509,GeneID:948295;Name=mbiA;gbkey=Gene;gene=mbiA;gene_biotype=protein_coding;gene_synonym=ECK0012,htgA,htpY;locus_tag=b0012 +NC_000913.3 RefSeq gene 10830 11315 . + . ID=gene-b0012;gene_name=mbiA;Dbxref=ASAP:ABE-0000040,ECOCYC:EG11509,EcoGene:EG11509,GeneID:948295;Name=mbiA;gbkey=Gene;gene=mbiA;gene_biotype=protein_coding;gene_synonym=ECK0012,htgA,htpY;locus_tag=b0012 NC_000913.3 RefSeq CDS 10830 11315 . 
+ 0 Parent=gene-b0012;Dbxref=UniProtKB/Swiss-Prot:P28697,Genbank:YP_009518733.1,ASAP:ABE-0000040,ECOCYC:EG11509,EcoGene:EG11509,GeneID:948295;Name=YP_009518733.1;gbkey=CDS;gene=mbiA;locus_tag=b0012;orig_transcript_id=gnl|b0012|mrna.CDS13;product=uncharacterized protein MbiA;protein_id=YP_009518733.1;transl_table=11 -NC_000913.3 RefSeq gene 11382 11786 . - . ID=gene-b0013;geneID=gene-b0013;gene_name=yaaI;Dbxref=ASAP:ABE-0000043,ECOCYC:G8202,EcoGene:EG11513,GeneID:944751;Name=yaaI;gbkey=Gene;gene=yaaI;gene_biotype=protein_coding;gene_synonym=ECK0013;locus_tag=b0013 +NC_000913.3 RefSeq gene 11382 11786 . - . ID=gene-b0013;gene_name=yaaI;Dbxref=ASAP:ABE-0000043,ECOCYC:G8202,EcoGene:EG11513,GeneID:944751;Name=yaaI;gbkey=Gene;gene=yaaI;gene_biotype=protein_coding;gene_synonym=ECK0013;locus_tag=b0013 NC_000913.3 RefSeq CDS 11382 11786 . - 0 Parent=gene-b0013;Dbxref=UniProtKB/Swiss-Prot:P28696,Genbank:NP_414554.1,ASAP:ABE-0000043,ECOCYC:G8202,EcoGene:EG11513,GeneID:944751;Name=NP_414554.1;gbkey=CDS;gene=yaaI;locus_tag=b0013;orig_transcript_id=gnl|b0013|mrna.b0013;product=DUF2541 domain-containing protein YaaI;protein_id=NP_414554.1;transl_table=11 -NC_000913.3 RefSeq gene 12163 14079 . + . ID=gene-b0014;geneID=gene-b0014;gene_name=dnaK;Dbxref=ASAP:ABE-0000052,ECOCYC:EG10241,EcoGene:EG10241,GeneID:944750;Name=dnaK;gbkey=Gene;gene=dnaK;gene_biotype=protein_coding;gene_synonym=ECK0014,groPAB,groPC,groPF,grpC,grpF,seg;locus_tag=b0014 +NC_000913.3 RefSeq gene 12163 14079 . + . ID=gene-b0014;gene_name=dnaK;Dbxref=ASAP:ABE-0000052,ECOCYC:EG10241,EcoGene:EG10241,GeneID:944750;Name=dnaK;gbkey=Gene;gene=dnaK;gene_biotype=protein_coding;gene_synonym=ECK0014,groPAB,groPC,groPF,grpC,grpF,seg;locus_tag=b0014 NC_000913.3 RefSeq CDS 12163 14079 . 
+ 0 Parent=gene-b0014;Dbxref=UniProtKB/Swiss-Prot:P0A6Y8,Genbank:NP_414555.1,ASAP:ABE-0000052,ECOCYC:EG10241,EcoGene:EG10241,GeneID:944750;Name=NP_414555.1;gbkey=CDS;gene=dnaK;locus_tag=b0014;orig_transcript_id=gnl|b0014|mrna.b0014;product=chaperone protein DnaK;protein_id=NP_414555.1;transl_table=11 -NC_000913.3 RefSeq gene 14168 15298 . + . ID=gene-b0015;geneID=gene-b0015;gene_name=dnaJ;Dbxref=ASAP:ABE-0000054,ECOCYC:EG10240,EcoGene:EG10240,GeneID:944753;Name=dnaJ;gbkey=Gene;gene=dnaJ;gene_biotype=protein_coding;gene_synonym=ECK0015,groP,grpC;locus_tag=b0015 +NC_000913.3 RefSeq gene 14168 15298 . + . ID=gene-b0015;gene_name=dnaJ;Dbxref=ASAP:ABE-0000054,ECOCYC:EG10240,EcoGene:EG10240,GeneID:944753;Name=dnaJ;gbkey=Gene;gene=dnaJ;gene_biotype=protein_coding;gene_synonym=ECK0015,groP,grpC;locus_tag=b0015 NC_000913.3 RefSeq CDS 14168 15298 . + 0 Parent=gene-b0015;Dbxref=UniProtKB/Swiss-Prot:P08622,Genbank:NP_414556.1,ASAP:ABE-0000054,ECOCYC:EG10240,EcoGene:EG10240,GeneID:944753;Name=NP_414556.1;gbkey=CDS;gene=dnaJ;locus_tag=b0015;orig_transcript_id=gnl|b0015|mrna.b0015;product=chaperone protein DnaJ;protein_id=NP_414556.1;transl_table=11 diff -r bba49324f2fa -r 9c298cab341d test-data/stop_codons.gtf --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/stop_codons.gtf Sat Sep 25 15:38:01 2021 +0000 @@ -0,0 +1,14 @@ +19 protein_coding exon 291275 291386 . - . gene_id "ENSG00000141934"; transcript_id "ENST00000269812"; exon_number "1"; gene_name "PPAP2C"; gene_biotype "protein_coding"; transcript_name "PPAP2C-001"; exon_id "ENSE00001234447"; +19 protein_coding CDS 291275 291336 . - 0 gene_id "ENSG00000141934"; transcript_id "ENST00000269812"; exon_number "1"; gene_name "PPAP2C"; gene_biotype "protein_coding"; transcript_name "PPAP2C-001"; protein_id "ENSP00000269812"; +19 protein_coding start_codon 291334 291336 . 
- 0 gene_id "ENSG00000141934"; transcript_id "ENST00000269812"; exon_number "1"; gene_name "PPAP2C"; gene_biotype "protein_coding"; transcript_name "PPAP2C-001"; +19 protein_coding exon 288020 288171 . - . gene_id "ENSG00000141934"; transcript_id "ENST00000269812"; exon_number "2"; gene_name "PPAP2C"; gene_biotype "protein_coding"; transcript_name "PPAP2C-001"; exon_id "ENSE00003304149"; +19 protein_coding CDS 288020 288171 . - 2 gene_id "ENSG00000141934"; transcript_id "ENST00000269812"; exon_number "2"; gene_name "PPAP2C"; gene_biotype "protein_coding"; transcript_name "PPAP2C-001"; protein_id "ENSP00000269812"; +19 protein_coding exon 287474 287751 . - . gene_id "ENSG00000141934"; transcript_id "ENST00000269812"; exon_number "3"; gene_name "PPAP2C"; gene_biotype "protein_coding"; transcript_name "PPAP2C-001"; exon_id "ENSE00003352024"; +19 protein_coding CDS 287474 287751 . - 0 gene_id "ENSG00000141934"; transcript_id "ENST00000269812"; exon_number "3"; gene_name "PPAP2C"; gene_biotype "protein_coding"; transcript_name "PPAP2C-001"; protein_id "ENSP00000269812"; +19 protein_coding exon 282752 282809 . - . gene_id "ENSG00000141934"; transcript_id "ENST00000269812"; exon_number "4"; gene_name "PPAP2C"; gene_biotype "protein_coding"; transcript_name "PPAP2C-001"; exon_id "ENSE00000951309"; +19 protein_coding CDS 282752 282809 . - 1 gene_id "ENSG00000141934"; transcript_id "ENST00000269812"; exon_number "4"; gene_name "PPAP2C"; gene_biotype "protein_coding"; transcript_name "PPAP2C-001"; protein_id "ENSP00000269812"; +19 protein_coding exon 282134 282310 . - . gene_id "ENSG00000141934"; transcript_id "ENST00000269812"; exon_number "5"; gene_name "PPAP2C"; gene_biotype "protein_coding"; transcript_name "PPAP2C-001"; exon_id "ENSE00000951310"; +19 protein_coding CDS 282134 282310 . 
- 0 gene_id "ENSG00000141934"; transcript_id "ENST00000269812"; exon_number "5"; gene_name "PPAP2C"; gene_biotype "protein_coding"; transcript_name "PPAP2C-001"; protein_id "ENSP00000269812"; +19 protein_coding exon 281043 281537 . - . gene_id "ENSG00000141934"; transcript_id "ENST00000269812"; exon_number "6"; gene_name "PPAP2C"; gene_biotype "protein_coding"; transcript_name "PPAP2C-001"; exon_id "ENSE00000951311"; +19 protein_coding CDS 281391 281537 . - 0 gene_id "ENSG00000141934"; transcript_id "ENST00000269812"; exon_number "6"; gene_name "PPAP2C"; gene_biotype "protein_coding"; transcript_name "PPAP2C-001"; protein_id "ENSP00000269812"; +19 protein_coding stop_codon 281388 281390 . - 0 gene_id "ENSG00000141934"; transcript_id "ENST00000269812"; exon_number "6"; gene_name "PPAP2C"; gene_biotype "protein_coding"; transcript_name "PPAP2C-001";