Mercurial > repos > devteam > gffread
changeset 7:9c298cab341d draft default tip
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/gffread commit f40643d8b80299ebb84faebe92579321ac459746"
author | iuc |
---|---|
date | Sat, 25 Sep 2021 15:38:01 +0000 (2021-09-25) |
parents | bba49324f2fa |
children | |
files | gffread.xml test-data/Homo_sapiens.GRCh37_19.71.bed test-data/Homo_sapiens.GRCh37_19.71.gff3 test-data/ecoli-k12.processed.gff3 test-data/stop_codons.gtf |
diffstat | 5 files changed, 251 insertions(+), 96 deletions(-) [+] |
line wrap: on
line diff
--- a/gffread.xml Tue Aug 31 08:29:57 2021 +0000 +++ b/gffread.xml Sat Sep 25 15:38:01 2021 +0000 @@ -1,16 +1,21 @@ -<tool id="gffread" name="gffread" version="@VERSION@.0"> +<tool id="gffread" name="gffread" version="@GALAXY_TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="20.05"> <description>Filters and/or converts GFF3/GTF2 records</description> <xrefs> <xref type="bio.tools">gffread</xref> </xrefs> <macros> - <token name="@VERSION@">0.11.6</token> + <!-- the version of this tool must not be lowered since in the past 2.x was used + lets use small increments and hope that gffread catches up one day --> + <token name="@GALAXY_TOOL_VERSION@">2.2.1.3</token> + <token name="@TOOL_VERSION@">0.12.7</token> + <token name="@VERSION_SUFFIX@">0</token> <xml name="fasta_output_select"> <param name="fa_outputs" type="select" display="checkboxes" multiple="true" label="Select fasta outputs"> - <option value="-w exons.fa">fasta file with spliced exons for each GFF transcript (-w exons.fa)</option> - <option value="-x cds.fa">fasta file with spliced CDS for each GFF transcript (-x cds.fa)</option> - <option value="-y pep.fa">protein fasta file with the translation of CDS for each record (-y pep.fa)</option> + <option value="-w exons.fa">fasta file with spliced exons for each GFF transcript (-w)</option> + <option value="-x cds.fa">fasta file with spliced CDS for each GFF transcript (-x)</option> + <option value="-y pep.fa">protein fasta file with the translation of CDS for each record (-y)</option> <option value="-W">for each fasta: record the exon coordinates projected onto the spliced sequence (-W)</option> + <option value="-S">for protein fasta: use '*' instead of '.' as stop codon translation (-S)</option> </param> </xml> <xml name="ref_filtering_select"> @@ -25,14 +30,14 @@ </param> </xml> <xml name="trackname"> - <param name="tname" type="text" value="" optional="true" label="Trackname to use in the second column of each GFF output line" help="(-t track_name}"> + <param argument="-t" name="tname" type="text" value="" optional="true" label="Trackname to use in the second column of each GFF output line" help=""> <validator type="regex">\w+</validator> </param> </xml> <xml name="merge_opts"> <option value="-K">also collapse shorter, fully contained transcripts with fewer introns than the container (-K)</option> <option value="-Q">remove the containment restriction: multi-exon transcripts will be collapsed if just their introns match, while single-exon transcripts can partially overlap 80% (-Q)</option> - <option value="-d dupinfo">output collapsing info (-d dupinfo)</option> + <option value="-d dupinfo">output collapsing info (-d)</option> </xml> <xml name="cluster_opts"> <option value="--force-exons"> make sure that the lowest level GFF features are printed as 'exon' features (--force-exons)</option> @@ -51,14 +56,19 @@ </xml> </macros> <requirements> - <requirement type="package" version="@VERSION@">gffread</requirement> + <requirement type="package" version="@TOOL_VERSION@">gffread</requirement> </requirements> + <version_command>gffread --version</version_command> <command detect_errors="aggressive"> <![CDATA[ #if $reference_genome.source == 'history': ln -s '$reference_genome.genome_fasta' genomeref.fa && #end if + gffread '$input' + #if $input.ext.startswith("bed") + --in-bed + #end if #if $reference_genome.source == 'cached': -g '${reference_genome.fasta_indexes.fields.path}' #if $reference_genome.ref_filtering and str($reference_genome.ref_filtering) != '': @@ -107,22 +117,68 @@ #end if #end if #if $gffs.gff_fmt != 'none': - #if $gffs.tname: + #if $gffs.gff_fmt != 'bed' and $gffs.tname: -t '$gffs.tname' #end if #if $gffs.gff_fmt == 'gff': + ## TODO bug 'gft' -> 'gtf' #if $input.datatype.file_ext == 'gft': $gffs.ensembl #end if - $gffs.output_cmd - #elif $gffs.gff_fmt == 'gtf': - $gffs.output_cmd + #end if + #if $gffs.gff_fmt == 'gtf' + -T + #elif $gffs.gff_fmt == 'bed' + --bed #end if + -o output.$gffs.gff_fmt #end if + +## Missing options +## +## --ids +## --nids +## -l +## --jmatch +## --nc +## --ignore-locus +## -A -s (see above) +## --sort-alpha : chromosomes (reference sequences) are sorted alphabetically +## --sort-by : sort the reference sequences by the order in which their +## names are given in the <refseq.lst> file +## Misc +## --keep-exon-attrs : for -F option, do not attempt to reduce redundant +## --attrs +## --keep-genes : in transcript-only mode (default), also preserve gene records +## --keep-comments: for GFF3 input/output, try to preserve comments +## -B (see above) +## -P +## --add-hasCDS : add a "hasCDS" attribute with value "true" for transcripts +## that have CDS features +## --adj-stop stop codon adjustment: enables -P and performs automatic +## adjustment of the CDS stop coordinate if premature or downstream + +## --in-tlf: input GFF-like one-line-per-transcript format without exon/CDS +## features (see --tlf option below); automatic if the input +## filename ends with .tlf) +## --stream: fast processing of input GFF/BED transcripts as they are received +## ((no sorting, exons must be grouped by transcript in the input data) + +## Clustering + +## -Y + +## Output + +## --gene2exon +## --t-adopt +## -j +## --w-add +## --w-nocds ]]> </command> <inputs> - <param name="input" type="data" format="gff3,gtf" label="Input GFF3 or GTF feature file"/> + <param name="input" type="data" format="bed,gff3,gtf" label="Input BED, GFF3 or GTF feature file"/> <!-- filtering --> <param name="filtering" type="select" display="checkboxes" multiple="true" label="filters"> <option value="-U">discard single-exon transcripts (-U)</option> @@ -138,9 +194,9 @@ </param> <when value="none"/> <when value="filter"> - <param name="range" type="text" value="" label="Only show transcripts overlapping coordinate range"> + <param argument="-r" name="range" type="text" value="" label="Only show transcripts overlapping coordinate range"> <help><![CDATA[ - (-r [['strand']'chr':]'start'..'end') <br> + [['strand']'chr':]'start'..'end' <br> examples: <br> 1000..500000 <br> chr1:1000..500000 <br> @@ -150,14 +206,14 @@ </help> <validator type="regex">(([+-])?(\w+:))?\d+\.\.\d+</validator> </param> - <param name="discard_partial" type="boolean" truevalue="-R" falsevalue="" checked="false" - label="Discard all transcripts that are not fully contained within the given range" help="(-R)"/> + <param argument="-R" name="discard_partial" type="boolean" truevalue="-R" falsevalue="" checked="false" + label="Discard all transcripts that are not fully contained within the given range" help=""/> </when> </conditional> - <param name="maxintron" type="integer" value="" optional="true" min="0" label="Filter out transcipts with large introns" - help="If set, discard transcripts having an intron larger (-i max_intron)"/> - <param name="chr_replace" type="data" format="tabular" optional="true" label="Replace reference sequence names" > - <help><![CDATA[(-m chr_replace) <br> + <param argument="-i" name="maxintron" type="integer" value="" optional="true" min="0" label="Filter out transcipts with large introns" + help="If set, discard transcripts having an intron larger"/> + <param argument="-m" name="chr_replace" type="data" format="tabular" optional="true" label="Replace reference sequence names" > + <help><![CDATA[ chr_replace is a reference sequence replacement table consisting of 2 columns: "original_ref_ID" "new_ref_ID"<br> It is useful for switching between Ensembl and UCSC naming conventions <br> NOTE: GFF records on reference sequences that are not found among the "original_ref_ID" entries in this file will be filtered out @@ -177,10 +233,10 @@ <!-- merging --> <conditional name="merging"> - <param name="merge_sel" type="select" label="Transcript merging" help="(-M/--merge or --cluster-only)"> + <param name="merge_sel" type="select" label="Transcript merging" help=""> <option value="none">none</option> - <option value="merge">merge: cluster the input transcripts into loci, collapsing matching transcripts</option> - <option value="cluster">cluster-only: merge but without collapsing matching transcripts</option> + <option value="merge">merge: cluster the input transcripts into loci, collapsing matching transcripts (--merge)</option> + <option value="cluster">cluster-only: merge but without collapsing matching transcripts (--cluster-only)</option> </param> <when value="none"/> <when value="merge"> @@ -195,7 +251,7 @@ <!-- reference sequence file --> <!-- Error: -g option is required for options -w, -x, -y, -V, -N, -M --> <conditional name="reference_genome"> - <param name="source" type="select" label="Reference Genome" help="(-g genome.fasta) NOTE: Required for fasta outputs"> + <param name="source" type="select" label="Reference Genome" help="NOTE: Required for fasta outputs"> <option value="none">none</option> <option value="cached"></option> <option value="history">From your history</option> @@ -203,14 +259,14 @@ <when value="none"> </when> <when value="cached"> - <param name="fasta_indexes" type="select" label="Source FASTA Sequence"> + <param argument="-g" name="fasta_indexes" type="select" label="Source FASTA Sequence"> <options from_data_table="all_fasta"/> </param> <expand macro="ref_filtering_select" /> <expand macro="fasta_output_select" /> </when> <when value="history"> - <param name="genome_fasta" type="data" format="fasta" label="Genome Reference Fasta"/> + <param argument="-g" name="genome_fasta" type="data" format="fasta" label="Genome Reference Fasta"/> <expand macro="ref_filtering_select" /> <expand macro="fasta_output_select" /> </when> @@ -222,35 +278,39 @@ <option value="none">none</option> <option value="gff">GFF</option> <option value="gtf">GTF</option> + <option value="bed">BED</option> </param> <when value="none"> </when> <when value="gff"> - <param name="output_cmd" type="hidden" value="-o output.gff3"/> - <param name="ensembl" type="boolean" truevalue="-L" falsevalue="" checked="false" label="Ensembl GTF to GFF3 conversion" help="(-L)"/> + <param argument="-L" name="ensembl" type="boolean" truevalue="-L" falsevalue="" checked="false" label="Ensembl GTF to GFF3 conversion" help=""/> <expand macro="trackname" /> </when> <when value="gtf"> - <param name="output_cmd" type="hidden" value="-T -o output.gtf"/> <expand macro="trackname" /> </when> + <when value="bed"> + </when> </conditional> - <param name="full_gff_attribute_preservation" type="boolean" truevalue="-F" falsevalue="" checked="false" - label="full GFF attribute preservation (all attributes are shown)" help="(-F)"/> - <param name="decode_url" type="boolean" truevalue="-D" falsevalue="" checked="false" - label="decode url encoded characters within attributes" help="(-D)"/> - <param name="expose" type="boolean" truevalue="-E" falsevalue="" checked="false" - label="warn about duplicate transcript IDs and other potential problems with the given GFF/GTF records" help="(-E)"/> + <param argument="-F" name="full_gff_attribute_preservation" type="boolean" truevalue="-F" falsevalue="" checked="false" + label="full GFF attribute preservation (all attributes are shown)" help=""/> + <param argument="-D" name="decode_url" type="boolean" truevalue="-D" falsevalue="" checked="false" + label="decode url encoded characters within attributes" help=""/> + <param argument="-E" name="expose" type="boolean" truevalue="-E" falsevalue="" checked="false" + label="warn about duplicate transcript IDs and other potential problems with the given GFF/GTF records" help=""/> </inputs> <outputs> - <data name="output_gff" format="gff3" metadata_source="input" label="${tool.name} on ${on_string}: gff3" from_work_dir="output.gff3"> + <data name="output_gff" format="gff3" metadata_source="input" label="${tool.name} on ${on_string}: gff3" from_work_dir="output.gff"> <filter>gffs['gff_fmt'] == 'gff'</filter> </data> <data name="output_gtf" format="gtf" metadata_source="input" label="${tool.name} on ${on_string}: gtf" from_work_dir="output.gtf"> <filter>gffs['gff_fmt'] == 'gtf'</filter> </data> + <data name="output_bed" format="bed" metadata_source="input" label="${tool.name} on ${on_string}: bed" from_work_dir="output.bed"> + <filter>gffs['gff_fmt'] == 'bed'</filter> + </data> <data name="output_exons" format="fasta" label="${tool.name} on ${on_string}: exons.fa" from_work_dir="exons.fa"> <filter>'fa_outputs' in reference_genome and str(reference_genome['fa_outputs']).find('exons.fa') > 0 </filter> </data> @@ -265,28 +325,48 @@ </data> </outputs> <tests> - <test> + <test expect_num_outputs="1"> <param name="input" ftype="gtf" value="Homo_sapiens.GRCh37_19.71.gtf"/> <param name="gff_fmt" value="gff"/> - <output name="output_gff" file="Homo_sapiens.GRCh37_19.71.gff3" ftype="gff3" lines_diff="2" /> + <output name="output_gff" file="Homo_sapiens.GRCh37_19.71.gff3" ftype="gff3" lines_diff="4" /> </test> - <test> + <test expect_num_outputs="1"> + <param name="input" ftype="gtf" value="Homo_sapiens.GRCh37_19.71.gtf"/> + <param name="gff_fmt" value="gff"/> + <output name="output_gff" file="Homo_sapiens.GRCh37_19.71.gff3" ftype="gff3" lines_diff="4" /> + </test> + <test expect_num_outputs="1"> <param name="input" ftype="gtf" value="ecoli-k12.gff3"/> <param name="gff_fmt" value="gff"/> <param name="full_gff_attribute_preservation" value="-F"/> - <output name="output_gff" file="ecoli-k12.processed.gff3" ftype="gff3" lines_diff="2" /> + <output name="output_gff" file="ecoli-k12.processed.gff3" ftype="gff3" lines_diff="4" /> </test> - <test> - <param name="input" ftype="gtf" value="Homo_sapiens.GRCh37_19.71.gtf"/> - <param name="filtering" value="--no-pseudo"/> - <param name="gff_fmt" value="gtf"/> - <output name="output_gtf"> + <!-- bed output --> + <test expect_num_outputs="1"> + <param name="input" ftype="gff3" value="Homo_sapiens.GRCh37_19.71.gff3"/> + <param name="gff_fmt" value="bed"/> + <output name="output_bed" ftype="bed"> <assert_contents> - <not_has_text text="pseudo" /> + <has_n_lines n="42"/> + <has_n_columns n="13"/> </assert_contents> </output> </test> - <test> + <!-- bed input and test tname --> + <test expect_num_outputs="1"> + <param name="input" ftype="bed" value="Homo_sapiens.GRCh37_19.71.bed"/> + <param name="gff_fmt" value="gff"/> + <param name="tname" value="track name"/> + <output name="output_bed" ftype="gff3"> + <assert_contents> + <has_n_lines n="388"/> + <!-- this will work with https://github.com/galaxyproject/galaxy/pull/12528 --> + <!-- <has_n_columns n="9" comment="#"/> --> + <has_text text="track name"/> + </assert_contents> + </output> + </test> + <test expect_num_outputs="1"> <param name="input" ftype="gtf" value="Homo_sapiens.GRCh37_19.71.gtf"/> <param name="region_filter" value="filter"/> <param name="range" value="19:496500..504965"/> @@ -298,7 +378,7 @@ </assert_contents> </output> </test> - <test> + <test expect_num_outputs="1"> <param name="input" ftype="gtf" value="Homo_sapiens.GRCh37_19.71.gtf"/> <param name="region_filter" value="filter"/> <param name="range" value="19:496500..504965"/> @@ -311,7 +391,7 @@ </assert_contents> </output> </test> - <test> + <test expect_num_outputs="1"> <param name="input" ftype="gtf" value="Homo_sapiens.GRCh37_19.71.gtf"/> <param name="filtering" value="-C"/> <param name="region_filter" value="filter"/> @@ -324,7 +404,7 @@ </assert_contents> </output> </test> - <test> + <test expect_num_outputs="4"> <param name="input" ftype="gtf" value="Homo_sapiens.GRCh37_19.71.gtf"/> <param name="source" value="history"/> <param name="genome_fasta" ftype="fasta" value="Homo_sapiens.GRCh37.71.dna.chromosome.19.fa"/> @@ -357,7 +437,18 @@ </assert_contents> </output> </test> - + <test expect_num_outputs="1"> + <param name="input" ftype="gtf" value="stop_codons.gtf"/> + <param name="source" value="history"/> + <param name="genome_fasta" ftype="fasta" value="Homo_sapiens.GRCh37.71.dna.chromosome.19.fa"/> + <param name="fa_outputs" value="-y pep.fa,-S"/> + <output name="output_pep"> + <assert_contents> + <has_text text="ENST00000269812" /> + <has_text text="PLRGLHPRV*LQTPLERCPCWPPAGGTGGCPHCLLHLRLLQSPTPTALSEGGGAGTEAQPVTDVDPGRG*" /> + </assert_contents> + </output> + </test> </tests> <help> <![CDATA[ @@ -367,30 +458,32 @@ .. _stringtie: http://ccb.jhu.edu/software/stringtie/gff.shtml#gffread - -gffread v0.11.4. Usage: :: +gffread v0.12.7. Usage: :: - gffread <input_gff> [-g <genomic_seqs_fasta> | <dir>][-s <seq_info.fsize>] - [-o <outfile>] [-t <trackname>] [-r [[<strand>]<chr>:]<start>..<end> [-R]] + gffread [-g <genomic_seqs_fasta> | <dir>] [-s <seq_info.fsize>] + [-o <outfile>] [-t <trackname>] [-r [<strand>]<chr>:<start>-<end> [-R]] + [--jmatch <chr>:<start>-<end>] [--no-pseudo] [-CTVNJMKQAFPGUBHZWTOLE] [-w <exons.fa>] [-x <cds.fa>] [-y <tr_cds.fa>] - [-i <maxintron>] [--bed] [--table <attrlist>] [--sort-by <refseq_list.txt>] - + [-j ][--ids <IDs.lst> | --nids <IDs.lst>] [--attrs <attr-list>] [-i <maxintron>] + [--stream] [--bed | --gtf | --tlf] [--table <attrlist>] [--sort-by <ref.lst>] + [<input_gff>] + Filter, convert or cluster GFF/GTF/BED records, extract the sequence of transcripts (exon or CDS) and more. By default (i.e. without -O) only transcripts are processed, discarding any other non-transcript features. Default output is a simplified GFF3 with only the basic attributes. - <input_gff> is a GFF file, use '-' for stdin - Options: - + --ids discard records/transcripts if their IDs are not listed in <IDs.lst> + --nids discard records/transcripts if their IDs are listed in <IDs.lst> -i discard transcripts having an intron larger than <maxintron> -l discard transcripts shorter than <minlen> bases -r only show transcripts overlapping coordinate range <start>..<end> (on chromosome/contig <chr>, strand <strand> if provided) -R for -r option, discard all transcripts that are not fully contained within the given range + --jmatch only output transcripts matching the given junction -U discard single-exon transcripts -C coding only: discard mRNAs that have no CDS features --nc non-coding only: discard mRNAs that have CDS features @@ -401,18 +494,18 @@ for each of the mapped sequences: <seq-name> <seq-length> <seq-description> (useful for -A option with mRNA/EST/protein mappings) - - Sorting: (by default, chromosomes are kept in the order they were found) + Sorting: (by default, chromosomes are kept in the order they were found) --sort-alpha : chromosomes (reference sequences) are sorted alphabetically --sort-by : sort the reference sequences by the order in which their names are given in the <refseq.lst> file - Misc options: - -F preserve all GFF attributes (for non-exon features) + -F keep all GFF attributes (for non-exon features) --keep-exon-attrs : for -F option, do not attempt to reduce redundant exon/CDS attributes -G do not keep exon attributes, move them to the transcript feature (for GFF3 output) + --attrs <attr-list> only output the GTF/GFF attributes listed in <attr-list> + which is a comma delimited list of attribute names to --keep-genes : in transcript-only mode (default), also preserve gene records --keep-comments: for GFF3 input/output, try to preserve comments -O process other non-transcript GFF records (by default non-transcript @@ -440,10 +533,11 @@ --in-tlf: input GFF-like one-line-per-transcript format without exon/CDS features (see --tlf option below); automatic if the input filename ends with .tlf) - + --stream: fast processing of input GFF/BED transcripts as they are received + ((no sorting, exons must be grouped by transcript in the input data) Clustering: -M/--merge : cluster the input transcripts into loci, discarding - "duplicated" transcripts (those with the same exact introns + "redundant" transcripts (those with the same exact introns and fully contained or equal boundaries) -d <dupinfo> : for -M option, write duplication info to file <dupinfo> --cluster-only: same as -M/--merge but without discarding any of the @@ -455,7 +549,6 @@ multi-exon transcripts, and >=80% overlap for single-exon transcripts -Y for -M option, enforce -Q but also discard overlapping single-exon transcripts, even on the opposite strand (can be combined with -K) - Output options: --force-exons: make sure that the lowest level GFF features are considered "exon" features @@ -468,25 +561,26 @@ -g full path to a multi-fasta file with the genomic sequences for all input mappings, OR a directory with single-fasta files (one per genomic sequence, with file names matching sequence names) - -w write a fasta file with spliced exons for each GFF transcript + -j output the junctions and the corresponding transcripts + -w write a fasta file with spliced exons for each transcript + --w-add <N> for the -w option, extract additional <N> bases + both upstream and downstream of the transcript boundaries + --w-nocds for -w, disable the output of CDS info in the FASTA file -x write a fasta file with spliced CDS for each GFF transcript -y write a protein fasta file with the translation of CDS for each record - -W for -w and -x options, write in the FASTA defline the exon + -W for -w, -x and -y options, write in the FASTA defline all the exon coordinates projected onto the spliced sequence; - for -y option, write transcript attributes in the FASTA defline -S for -y option, use '*' instead of '.' as stop codon translation - -L Ensembl GTF to GFF3 conversion (implies -F; should be used with -m) + -L Ensembl GTF to GFF3 conversion, adds version to IDs -m <chr_replace> is a name mapping table for converting reference sequence names, having this 2-column format: <original_ref_ID> <new_ref_ID> - WARNING: all GFF records on reference sequences whose original IDs - are not found in the 1st column of this table will be discarded! -t use <trackname> in the 2nd column of each GFF/GTF output line - -o write the records into <outfile> instead of stdout + -o write the output records into <outfile> instead of stdout -T main output will be GTF instead of GFF3 --bed output records in BED format instead of default GFF3 --tlf output "transcript line format" which is like GFF - but exons, CDS features and related data are stored as GFF + but with exons and CDS related features stored as GFF attributes in the transcript feature line, like this: exoncount=N;exons=<exons>;CDSphase=<N>;CDS=<CDScoords> <exons> is a comma-delimited list of exon_start-exon_end coordinates; @@ -494,9 +588,14 @@ --table output a simple tab delimited format instead of GFF, with columns having the values of GFF attributes given in <attrlist>; special pseudo-attributes (prefixed by @) are recognized: - @chr, @start, @end, @strand, @numexons, @exons, @cds, @covlen, @cdslen + @id, @geneid, @chr, @start, @end, @strand, @numexons, @exons, + @cds, @covlen, @cdslen + If any of -w/-y/-x FASTA output files are enabled, the same fields + (excluding @id) are appended to the definition line of corresponding + FASTA records -v,-E expose (warn about) duplicate transcript IDs and other potential problems with the given GFF/GTF records + ]]> </help> <citations>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/Homo_sapiens.GRCh37_19.71.bed Sat Sep 25 15:38:01 2021 +0000 @@ -0,0 +1,42 @@ +19 223157 223261 ENST00000410397 100 - 223157 223261 0,0,0 1 104, 0, geneID=ENSG00000222329;gene_name=U6 +19 229639 230165 ENST00000587910 100 - 229639 230165 0,0,0 2 70,82, 0,444, geneID=ENSG00000267600;gene_name=AC098474.1 +19 239144 239247 ENST00000588755 100 - 239144 239247 0,0,0 1 103, 0, geneID=ENSG00000267305;gene_name=CTD-3113P16.7 +19 279494 280170 ENST00000589981 100 + 279494 280170 0,0,0 1 676, 0, geneID=ENSG00000267447;gene_name=VN2R11P +19 281042 291386 ENST00000269812 100 - 281387 291336 0,0,0 6 495,177,58,278,152,102, 0,1091,1709,6431,6977,10242, CDS=281387:291336;CDSphase=0;geneID=ENSG00000141934;gene_name=PPAP2C +19 281344 291393 ENST00000434325 100 - 281387 288055 0,0,0 6 193,177,58,278,152,68, 0,789,1407,6129,6675,9981, CDS=281387:288055;CDSphase=0;geneID=ENSG00000141934;gene_name=PPAP2C +19 281387 291200 ENST00000327790 100 - 281387 291066 0,0,0 6 150,177,58,278,152,249, 0,746,1364,6086,6632,9564, CDS=281387:291066;CDSphase=0;geneID=ENSG00000141934;gene_name=PPAP2C +19 281990 287636 ENST00000586998 100 - 282121 287636 2,0,0 3 320,58,163, 0,761,5483, CDS=282121:287636;CDSphase=2;geneID=ENSG00000141934;gene_name=PPAP2C +19 287160 288530 ENST00000589672 100 - 287160 288530 0,0,0 2 591,511, 0,859, geneID=ENSG00000141934;gene_name=PPAP2C +19 287473 291382 ENST00000591572 100 - 287473 291336 0,0,0 3 278,170,98, 0,546,3811, CDS=287473:291336;CDSphase=0;geneID=ENSG00000141934;gene_name=PPAP2C +19 305572 306467 ENST00000591533 100 + 305572 306467 0,0,0 2 131,411, 0,484, geneID=ENSG00000267124;gene_name=CTD-3113P16.5 +19 305574 344793 ENST00000264819 100 - 306689 344782 0,0,0 14 1137,418,89,125,95,82,152,70,92,124,126,143,91,20, 0,1544,3002,3226,6270,6616,7917,20060,20932,21558,22289,28825,30508,39199, CDS=306689:344782;CDSphase=0;geneID=ENSG00000105556;gene_name=MIER2 +19 305578 325706 ENST00000589092 100 + 305578 325706 0,0,0 2 356,83, 0,20045, geneID=ENSG00000267124;gene_name=CTD-3113P16.5 +19 326606 336178 ENST00000586994 100 - 326606 336178 0,0,0 4 650,126,143,96, 0,1257,7793,9476, geneID=ENSG00000105556;gene_name=MIER2 +19 327863 340599 ENST00000592722 100 - 327863 340599 0,0,0 5 126,117,143,91,86, 0,2400,6536,8219,12650, geneID=ENSG00000105556;gene_name=MIER2 +19 334114 344798 ENST00000587966 100 - 334114 344798 0,0,0 2 428,25, 0,10659, geneID=ENSG00000105556;gene_name=MIER2 +19 361749 376013 ENST00000342640 100 - 362199 375970 0,0,0 8 677,160,118,70,62,72,123,351, 0,5315,9455,10881,11720,12190,12549,13913, CDS=362199:375970;CDSphase=0;geneID=ENSG00000105549;gene_name=THEG +19 362057 374620 ENST00000530711 100 - 362057 374620 0,0,0 3 369,160,322, 0,5007,12241, geneID=ENSG00000105549;gene_name=THEG +19 362199 375970 ENST00000346878 100 - 362199 375970 0,0,0 7 227,160,118,70,62,123,308, 0,4865,9005,10431,11270,12099,13463, CDS=362199:375970;CDSphase=0;geneID=ENSG00000105549;gene_name=THEG +19 367201 374249 ENST00000528213 100 - 367201 374249 0,0,0 5 23,118,70,62,310, 0,4003,5429,6268,6738, geneID=ENSG00000105549;gene_name=THEG +19 397588 398941 ENST00000591757 100 + 397588 398941 0,0,0 2 45,252, 0,1101, geneID=ENSG00000267443;gene_name=AC010641.1 +19 405444 409139 ENST00000332235 100 - 407095 408361 0,0,0 2 2957,134, 0,3561, CDS=407095:408361;CDSphase=0;geneID=ENSG00000183186;gene_name=C2CD4C +19 416582 419879 ENST00000587423 100 - 416582 419879 0,0,0 2 740,957, 0,2340, geneID=ENSG00000129946;gene_name=SHC2 +19 416582 422828 ENST00000588376 100 - 416582 422828 0,0,0 3 740,134,683, 0,2340,5563, geneID=ENSG00000129946;gene_name=SHC2 +19 416592 460996 ENST00000264554 100 - 418927 460996 0,0,0 13 730,134,311,135,64,157,127,52,54,120,61,71,468, 0,2330,5553,8504,14091,18116,19572,19787,20037,22125,22377,24269,43936, CDS=418927:460996;CDSphase=0;geneID=ENSG00000129946;gene_name=SHC2 +19 416608 441384 ENST00000589922 100 - 416608 441384 0,0,0 11 714,134,311,135,64,157,127,304,120,61,523, 0,2314,5537,8488,14075,18100,19556,19771,22109,22361,24253, geneID=ENSG00000129946;gene_name=SHC2 +19 417199 436258 ENST00000590170 100 - 434761 436258 0,0,0 6 123,134,234,64,157,94, 0,1723,4946,13484,17509,18965, CDS=434761:436258;CDSphase=0;geneID=ENSG00000129946;gene_name=SHC2 +19 417199 436258 ENST00000591948 100 - 417199 436258 0,0,0 6 123,134,234,64,157,94, 0,1723,4946,13484,17509,18965, geneID=ENSG00000129946;gene_name=SHC2 +19 434701 460571 ENST00000590222 100 - 439397 460571 1,0,0 9 164,127,52,54,120,61,259,71,43, 0,1463,1678,1928,4016,4268,4531,6160,25827, CDS=439397:460571;CDSphase=1;geneID=ENSG00000129946;gene_name=SHC2 +19 435770 436534 ENST00000591388 100 - 435770 436534 0,0,0 3 191,127,155, 0,394,609, geneID=ENSG00000129946;gene_name=SHC2 +19 435778 439031 ENST00000590113 100 - 435778 439031 0,0,0 6 183,127,52,54,120,62, 0,386,601,851,2939,3191, geneID=ENSG00000129946;gene_name=SHC2 +19 453133 453245 ENST00000516730 100 + 453133 453245 0,0,0 1 112, 0, geneID=ENSG00000252539;gene_name=RNA5SP462 +19 463345 474983 ENST00000315489 100 - 463843 474747 0,0,0 4 1019,114,108,363, 0,4303,9048,11275, CDS=463843:474747;CDSphase=0;geneID=ENSG00000181781;gene_name=ODF3L2 +19 463466 474880 ENST00000382696 100 - 463843 474747 0,0,0 3 898,114,260, 0,4182,11154, CDS=463843:474747;CDSphase=0;geneID=ENSG00000181781;gene_name=ODF3L2 +19 464145 472631 ENST00000591681 100 - 464145 472631 0,0,0 3 219,114,238, 0,3503,8248, geneID=ENSG00000181781;gene_name=ODF3L2 +19 489175 505342 ENST00000587541 100 + 489175 505342 0,0,0 3 864,261,598, 0,12493,15569, geneID=ENSG00000099866;gene_name=MADCAM1 +19 490045 507813 ENST00000592413 100 - 490045 507813 0,0,0 3 308,84,438, 0,11495,17330, geneID=ENSG00000266933;gene_name=AC005775.2 +19 496453 505207 ENST00000346144 100 + 496499 504965 0,0,0 4 98,285,330,463, 0,1379,2042,8291, CDS=496499:504965;CDSphase=0;geneID=ENSG00000099866;gene_name=MADCAM1 +19 496453 505347 ENST00000215637 100 + 496499 504965 0,0,0 5 98,285,330,261,603, 0,1379,2042,5215,8291, CDS=496499:504965;CDSphase=0;geneID=ENSG00000099866;gene_name=MADCAM1 +19 496499 504965 ENST00000382683 100 + 496499 504965 0,0,0 3 52,330,221, 0,1996,8245, CDS=496499:504965;CDSphase=0;geneID=ENSG00000099866;gene_name=MADCAM1 +19 507298 519654 ENST00000359315 100 + 507506 519423 0,0,0 2 546,766, 0,11590, CDS=507506:519423;CDSphase=0;geneID=ENSG00000141933;gene_name=TPGS1 +19 507499 510372 ENST00000588278 100 + 507499 510372 0,0,0 1 2873, 0, geneID=ENSG00000141933;gene_name=TPGS1
--- a/test-data/Homo_sapiens.GRCh37_19.71.gff3 Tue Aug 31 08:29:57 2021 +0000 +++ b/test-data/Homo_sapiens.GRCh37_19.71.gff3 Sat Sep 25 15:38:01 2021 +0000 @@ -1,6 +1,6 @@ -# gffread /tmp/tmpq6d_yfqc/files/9/2/2/dataset_922cd54b-d77c-48fb-abf7-6fc8d8fdb97c.dat -o output.gff3 -# gffread v0.11.6 ##gff-version 3 +# gffread v0.12.7 +# gffread /tmp/tmpk_iy6dhb/files/e/1/9/dataset_e191f2e3-7ad2-452e-b21c-edd22b6ba6e2.dat -o output.gff 19 snRNA transcript 223158 223261 . - . ID=ENST00000410397;geneID=ENSG00000222329;gene_name=U6 19 snRNA exon 223158 223261 . - . Parent=ENST00000410397 19 unprocessed_pseudogene transcript 229640 230165 . - . ID=ENST00000587910;geneID=ENSG00000267600;gene_name=AC098474.1
--- a/test-data/ecoli-k12.processed.gff3 Tue Aug 31 08:29:57 2021 +0000 +++ b/test-data/ecoli-k12.processed.gff3 Sat Sep 25 15:38:01 2021 +0000 @@ -1,33 +1,33 @@ -# gffread /tmp/tmpq6d_yfqc/files/2/7/7/dataset_277f6e18-b25a-4b59-b712-49b5c202a183.dat -F -o output.gff3 -# gffread v0.11.6 ##gff-version 3 -NC_000913.3 RefSeq gene 190 255 . + . ID=gene-b0001;geneID=gene-b0001;gene_name=thrL;Dbxref=ASAP:ABE-0000006,ECOCYC:EG11277,EcoGene:EG11277,GeneID:944742;Name=thrL;gbkey=Gene;gene=thrL;gene_biotype=protein_coding;gene_synonym=ECK0001;locus_tag=b0001 +# gffread v0.12.7 +# gffread /tmp/tmpk_iy6dhb/files/7/c/b/dataset_7cbb521e-a7fc-4b92-8335-006b4f916f5c.dat -F -o output.gff +NC_000913.3 RefSeq gene 190 255 . + . ID=gene-b0001;gene_name=thrL;Dbxref=ASAP:ABE-0000006,ECOCYC:EG11277,EcoGene:EG11277,GeneID:944742;Name=thrL;gbkey=Gene;gene=thrL;gene_biotype=protein_coding;gene_synonym=ECK0001;locus_tag=b0001 NC_000913.3 RefSeq CDS 190 255 . + 0 Parent=gene-b0001;Dbxref=UniProtKB/Swiss-Prot:P0AD86,Genbank:NP_414542.1,ASAP:ABE-0000006,ECOCYC:EG11277,EcoGene:EG11277,GeneID:944742;Name=NP_414542.1;gbkey=CDS;gene=thrL;locus_tag=b0001;orig_transcript_id=gnl|b0001|mrna.b0001;product=thr operon leader peptide;protein_id=NP_414542.1;transl_table=11 -NC_000913.3 RefSeq gene 337 2799 . + . ID=gene-b0002;geneID=gene-b0002;gene_name=thrA;Dbxref=ASAP:ABE-0000008,ECOCYC:EG10998,EcoGene:EG10998,GeneID:945803;Name=thrA;gbkey=Gene;gene=thrA;gene_biotype=protein_coding;gene_synonym=ECK0002,Hs,thrA1,thrA2,thrD;locus_tag=b0002 +NC_000913.3 RefSeq gene 337 2799 . + . ID=gene-b0002;gene_name=thrA;Dbxref=ASAP:ABE-0000008,ECOCYC:EG10998,EcoGene:EG10998,GeneID:945803;Name=thrA;gbkey=Gene;gene=thrA;gene_biotype=protein_coding;gene_synonym=ECK0002,Hs,thrA1,thrA2,thrD;locus_tag=b0002 NC_000913.3 RefSeq CDS 337 2799 . + 0 Parent=gene-b0002;Dbxref=UniProtKB/Swiss-Prot:P00561,Genbank:NP_414543.1,ASAP:ABE-0000008,ECOCYC:EG10998,EcoGene:EG10998,GeneID:945803;Name=NP_414543.1;gbkey=CDS;gene=thrA;locus_tag=b0002;orig_transcript_id=gnl|b0002|mrna.b0002;product=fused aspartate kinase/homoserine dehydrogenase 1;protein_id=NP_414543.1;transl_table=11 -NC_000913.3 RefSeq gene 2801 3733 . + . ID=gene-b0003;geneID=gene-b0003;gene_name=thrB;Dbxref=ASAP:ABE-0000010,ECOCYC:EG10999,EcoGene:EG10999,GeneID:947498;Name=thrB;gbkey=Gene;gene=thrB;gene_biotype=protein_coding;gene_synonym=ECK0003;locus_tag=b0003 +NC_000913.3 RefSeq gene 2801 3733 . + . ID=gene-b0003;gene_name=thrB;Dbxref=ASAP:ABE-0000010,ECOCYC:EG10999,EcoGene:EG10999,GeneID:947498;Name=thrB;gbkey=Gene;gene=thrB;gene_biotype=protein_coding;gene_synonym=ECK0003;locus_tag=b0003 NC_000913.3 RefSeq CDS 2801 3733 . + 0 Parent=gene-b0003;Dbxref=UniProtKB/Swiss-Prot:P00547,Genbank:NP_414544.1,ASAP:ABE-0000010,ECOCYC:EG10999,EcoGene:EG10999,GeneID:947498;Name=NP_414544.1;gbkey=CDS;gene=thrB;locus_tag=b0003;orig_transcript_id=gnl|b0003|mrna.b0003;product=homoserine kinase;protein_id=NP_414544.1;transl_table=11 -NC_000913.3 RefSeq gene 3734 5020 . + . ID=gene-b0004;geneID=gene-b0004;gene_name=thrC;Dbxref=ASAP:ABE-0000012,ECOCYC:EG11000,EcoGene:EG11000,GeneID:945198;Name=thrC;gbkey=Gene;gene=thrC;gene_biotype=protein_coding;gene_synonym=ECK0004;locus_tag=b0004 +NC_000913.3 RefSeq gene 3734 5020 . + . ID=gene-b0004;gene_name=thrC;Dbxref=ASAP:ABE-0000012,ECOCYC:EG11000,EcoGene:EG11000,GeneID:945198;Name=thrC;gbkey=Gene;gene=thrC;gene_biotype=protein_coding;gene_synonym=ECK0004;locus_tag=b0004 NC_000913.3 RefSeq CDS 3734 5020 . + 0 Parent=gene-b0004;Dbxref=UniProtKB/Swiss-Prot:P00934,Genbank:NP_414545.1,ASAP:ABE-0000012,ECOCYC:EG11000,EcoGene:EG11000,GeneID:945198;Name=NP_414545.1;gbkey=CDS;gene=thrC;locus_tag=b0004;orig_transcript_id=gnl|b0004|mrna.b0004;product=threonine synthase;protein_id=NP_414545.1;transl_table=11 -NC_000913.3 RefSeq gene 5234 5530 . + . ID=gene-b0005;geneID=gene-b0005;gene_name=yaaX;Dbxref=ASAP:ABE-0000015,ECOCYC:G6081,EcoGene:EG14384,GeneID:944747;Name=yaaX;gbkey=Gene;gene=yaaX;gene_biotype=protein_coding;gene_synonym=ECK0005;locus_tag=b0005 +NC_000913.3 RefSeq gene 5234 5530 . + . ID=gene-b0005;gene_name=yaaX;Dbxref=ASAP:ABE-0000015,ECOCYC:G6081,EcoGene:EG14384,GeneID:944747;Name=yaaX;gbkey=Gene;gene=yaaX;gene_biotype=protein_coding;gene_synonym=ECK0005;locus_tag=b0005 NC_000913.3 RefSeq CDS 5234 5530 . + 0 Parent=gene-b0005;Dbxref=UniProtKB/Swiss-Prot:P75616,Genbank:NP_414546.1,ASAP:ABE-0000015,ECOCYC:G6081,EcoGene:EG14384,GeneID:944747;Name=NP_414546.1;gbkey=CDS;gene=yaaX;locus_tag=b0005;orig_transcript_id=gnl|b0005|mrna.b0005;product=DUF2502 domain-containing protein YaaX;protein_id=NP_414546.1;transl_table=11 -NC_000913.3 RefSeq gene 5683 6459 . - . ID=gene-b0006;geneID=gene-b0006;gene_name=yaaA;Dbxref=ASAP:ABE-0000018,ECOCYC:EG10011,EcoGene:EG10011,GeneID:944749;Name=yaaA;gbkey=Gene;gene=yaaA;gene_biotype=protein_coding;gene_synonym=ECK0006;locus_tag=b0006 +NC_000913.3 RefSeq gene 5683 6459 . - . ID=gene-b0006;gene_name=yaaA;Dbxref=ASAP:ABE-0000018,ECOCYC:EG10011,EcoGene:EG10011,GeneID:944749;Name=yaaA;gbkey=Gene;gene=yaaA;gene_biotype=protein_coding;gene_synonym=ECK0006;locus_tag=b0006 NC_000913.3 RefSeq CDS 5683 6459 . - 0 Parent=gene-b0006;Dbxref=UniProtKB/Swiss-Prot:P0A8I3,Genbank:NP_414547.1,ASAP:ABE-0000018,ECOCYC:EG10011,EcoGene:EG10011,GeneID:944749;Name=NP_414547.1;gbkey=CDS;gene=yaaA;locus_tag=b0006;orig_transcript_id=gnl|b0006|mrna.b0006;product=peroxide stress resistance protein YaaA;protein_id=NP_414547.1;transl_table=11 -NC_000913.3 RefSeq gene 6529 7959 . - . ID=gene-b0007;geneID=gene-b0007;gene_name=yaaJ;Dbxref=ASAP:ABE-0000020,ECOCYC:EG11555,EcoGene:EG11555,GeneID:944745;Name=yaaJ;gbkey=Gene;gene=yaaJ;gene_biotype=protein_coding;gene_synonym=ECK0007;locus_tag=b0007 +NC_000913.3 RefSeq gene 6529 7959 . - . ID=gene-b0007;gene_name=yaaJ;Dbxref=ASAP:ABE-0000020,ECOCYC:EG11555,EcoGene:EG11555,GeneID:944745;Name=yaaJ;gbkey=Gene;gene=yaaJ;gene_biotype=protein_coding;gene_synonym=ECK0007;locus_tag=b0007 NC_000913.3 RefSeq CDS 6529 7959 . - 0 Parent=gene-b0007;Dbxref=UniProtKB/Swiss-Prot:P30143,Genbank:NP_414548.1,ASAP:ABE-0000020,ECOCYC:EG11555,EcoGene:EG11555,GeneID:944745;Name=NP_414548.1;gbkey=CDS;gene=yaaJ;locus_tag=b0007;orig_transcript_id=gnl|b0007|mrna.b0007;product=putative transporter YaaJ;protein_id=NP_414548.1;transl_table=11 -NC_000913.3 RefSeq gene 8238 9191 . + . ID=gene-b0008;geneID=gene-b0008;gene_name=talB;Dbxref=ASAP:ABE-0000027,ECOCYC:EG11556,EcoGene:EG11556,GeneID:944748;Name=talB;gbkey=Gene;gene=talB;gene_biotype=protein_coding;gene_synonym=ECK0008,yaaK;locus_tag=b0008 +NC_000913.3 RefSeq gene 8238 9191 . + . ID=gene-b0008;gene_name=talB;Dbxref=ASAP:ABE-0000027,ECOCYC:EG11556,EcoGene:EG11556,GeneID:944748;Name=talB;gbkey=Gene;gene=talB;gene_biotype=protein_coding;gene_synonym=ECK0008,yaaK;locus_tag=b0008 NC_000913.3 RefSeq CDS 8238 9191 . + 0 Parent=gene-b0008;Dbxref=UniProtKB/Swiss-Prot:P0A870,Genbank:NP_414549.1,ASAP:ABE-0000027,ECOCYC:EG11556,EcoGene:EG11556,GeneID:944748;Name=NP_414549.1;gbkey=CDS;gene=talB;locus_tag=b0008;orig_transcript_id=gnl|b0008|mrna.b0008;product=transaldolase B;protein_id=NP_414549.1;transl_table=11 -NC_000913.3 RefSeq gene 9306 9893 . + . ID=gene-b0009;geneID=gene-b0009;gene_name=mog;Dbxref=ASAP:ABE-0000030,ECOCYC:EG11511,EcoGene:EG11511,GeneID:944760;Name=mog;gbkey=Gene;gene=mog;gene_biotype=protein_coding;gene_synonym=bisD,chlG,ECK0009,mogA,yaaG;locus_tag=b0009 +NC_000913.3 RefSeq gene 9306 9893 . + . ID=gene-b0009;gene_name=mog;Dbxref=ASAP:ABE-0000030,ECOCYC:EG11511,EcoGene:EG11511,GeneID:944760;Name=mog;gbkey=Gene;gene=mog;gene_biotype=protein_coding;gene_synonym=bisD,chlG,ECK0009,mogA,yaaG;locus_tag=b0009 NC_000913.3 RefSeq CDS 9306 9893 . + 0 Parent=gene-b0009;Dbxref=UniProtKB/Swiss-Prot:P0AF03,Genbank:NP_414550.1,ASAP:ABE-0000030,ECOCYC:EG11511,EcoGene:EG11511,GeneID:944760;Name=NP_414550.1;gbkey=CDS;gene=mog;locus_tag=b0009;orig_transcript_id=gnl|b0009|mrna.b0009;product=molybdopterin adenylyltransferase;protein_id=NP_414550.1;transl_table=11 -NC_000913.3 RefSeq gene 9928 10494 . - . ID=gene-b0010;geneID=gene-b0010;gene_name=satP;Dbxref=ASAP:ABE-0000032,ECOCYC:EG11512,EcoGene:EG11512,GeneID:944792;Name=satP;gbkey=Gene;gene=satP;gene_biotype=protein_coding;gene_synonym=ECK0010,yaaH;locus_tag=b0010 +NC_000913.3 RefSeq gene 9928 10494 . - . ID=gene-b0010;gene_name=satP;Dbxref=ASAP:ABE-0000032,ECOCYC:EG11512,EcoGene:EG11512,GeneID:944792;Name=satP;gbkey=Gene;gene=satP;gene_biotype=protein_coding;gene_synonym=ECK0010,yaaH;locus_tag=b0010 NC_000913.3 RefSeq CDS 9928 10494 . - 0 Parent=gene-b0010;Dbxref=UniProtKB/Swiss-Prot:P0AC98,Genbank:NP_414551.1,ASAP:ABE-0000032,ECOCYC:EG11512,EcoGene:EG11512,GeneID:944792;Name=NP_414551.1;gbkey=CDS;gene=satP;locus_tag=b0010;orig_transcript_id=gnl|b0010|mrna.b0010;product=acetate/succinate:H(+) symporter;protein_id=NP_414551.1;transl_table=11 -NC_000913.3 RefSeq gene 10643 11356 . - . ID=gene-b0011;geneID=gene-b0011;gene_name=yaaW;Dbxref=ASAP:ABE-0000037,ECOCYC:G6082,EcoGene:EG14340,GeneID:944771;Name=yaaW;gbkey=Gene;gene=yaaW;gene_biotype=protein_coding;gene_synonym=ECK0011;locus_tag=b0011 +NC_000913.3 RefSeq gene 10643 11356 . - . ID=gene-b0011;gene_name=yaaW;Dbxref=ASAP:ABE-0000037,ECOCYC:G6082,EcoGene:EG14340,GeneID:944771;Name=yaaW;gbkey=Gene;gene=yaaW;gene_biotype=protein_coding;gene_synonym=ECK0011;locus_tag=b0011 NC_000913.3 RefSeq CDS 10643 11356 . - 0 Parent=gene-b0011;Dbxref=UniProtKB/Swiss-Prot:P75617,Genbank:NP_414552.1,ASAP:ABE-0000037,ECOCYC:G6082,EcoGene:EG14340,GeneID:944771;Name=NP_414552.1;gbkey=CDS;gene=yaaW;locus_tag=b0011;orig_transcript_id=gnl|b0011|mrna.b0011;product=putative enzyme-specific chaperone YaaW;protein_id=NP_414552.1;transl_table=11 -NC_000913.3 RefSeq gene 10830 11315 . + . ID=gene-b0012;geneID=gene-b0012;gene_name=mbiA;Dbxref=ASAP:ABE-0000040,ECOCYC:EG11509,EcoGene:EG11509,GeneID:948295;Name=mbiA;gbkey=Gene;gene=mbiA;gene_biotype=protein_coding;gene_synonym=ECK0012,htgA,htpY;locus_tag=b0012 +NC_000913.3 RefSeq gene 10830 11315 . + . ID=gene-b0012;gene_name=mbiA;Dbxref=ASAP:ABE-0000040,ECOCYC:EG11509,EcoGene:EG11509,GeneID:948295;Name=mbiA;gbkey=Gene;gene=mbiA;gene_biotype=protein_coding;gene_synonym=ECK0012,htgA,htpY;locus_tag=b0012 NC_000913.3 RefSeq CDS 10830 11315 . + 0 Parent=gene-b0012;Dbxref=UniProtKB/Swiss-Prot:P28697,Genbank:YP_009518733.1,ASAP:ABE-0000040,ECOCYC:EG11509,EcoGene:EG11509,GeneID:948295;Name=YP_009518733.1;gbkey=CDS;gene=mbiA;locus_tag=b0012;orig_transcript_id=gnl|b0012|mrna.CDS13;product=uncharacterized protein MbiA;protein_id=YP_009518733.1;transl_table=11 -NC_000913.3 RefSeq gene 11382 11786 . - . ID=gene-b0013;geneID=gene-b0013;gene_name=yaaI;Dbxref=ASAP:ABE-0000043,ECOCYC:G8202,EcoGene:EG11513,GeneID:944751;Name=yaaI;gbkey=Gene;gene=yaaI;gene_biotype=protein_coding;gene_synonym=ECK0013;locus_tag=b0013 +NC_000913.3 RefSeq gene 11382 11786 . - . ID=gene-b0013;gene_name=yaaI;Dbxref=ASAP:ABE-0000043,ECOCYC:G8202,EcoGene:EG11513,GeneID:944751;Name=yaaI;gbkey=Gene;gene=yaaI;gene_biotype=protein_coding;gene_synonym=ECK0013;locus_tag=b0013 NC_000913.3 RefSeq CDS 11382 11786 . - 0 Parent=gene-b0013;Dbxref=UniProtKB/Swiss-Prot:P28696,Genbank:NP_414554.1,ASAP:ABE-0000043,ECOCYC:G8202,EcoGene:EG11513,GeneID:944751;Name=NP_414554.1;gbkey=CDS;gene=yaaI;locus_tag=b0013;orig_transcript_id=gnl|b0013|mrna.b0013;product=DUF2541 domain-containing protein YaaI;protein_id=NP_414554.1;transl_table=11 -NC_000913.3 RefSeq gene 12163 14079 . + . ID=gene-b0014;geneID=gene-b0014;gene_name=dnaK;Dbxref=ASAP:ABE-0000052,ECOCYC:EG10241,EcoGene:EG10241,GeneID:944750;Name=dnaK;gbkey=Gene;gene=dnaK;gene_biotype=protein_coding;gene_synonym=ECK0014,groPAB,groPC,groPF,grpC,grpF,seg;locus_tag=b0014 +NC_000913.3 RefSeq gene 12163 14079 . + . ID=gene-b0014;gene_name=dnaK;Dbxref=ASAP:ABE-0000052,ECOCYC:EG10241,EcoGene:EG10241,GeneID:944750;Name=dnaK;gbkey=Gene;gene=dnaK;gene_biotype=protein_coding;gene_synonym=ECK0014,groPAB,groPC,groPF,grpC,grpF,seg;locus_tag=b0014 NC_000913.3 RefSeq CDS 12163 14079 . + 0 Parent=gene-b0014;Dbxref=UniProtKB/Swiss-Prot:P0A6Y8,Genbank:NP_414555.1,ASAP:ABE-0000052,ECOCYC:EG10241,EcoGene:EG10241,GeneID:944750;Name=NP_414555.1;gbkey=CDS;gene=dnaK;locus_tag=b0014;orig_transcript_id=gnl|b0014|mrna.b0014;product=chaperone protein DnaK;protein_id=NP_414555.1;transl_table=11 -NC_000913.3 RefSeq gene 14168 15298 . + . ID=gene-b0015;geneID=gene-b0015;gene_name=dnaJ;Dbxref=ASAP:ABE-0000054,ECOCYC:EG10240,EcoGene:EG10240,GeneID:944753;Name=dnaJ;gbkey=Gene;gene=dnaJ;gene_biotype=protein_coding;gene_synonym=ECK0015,groP,grpC;locus_tag=b0015 +NC_000913.3 RefSeq gene 14168 15298 . + . ID=gene-b0015;gene_name=dnaJ;Dbxref=ASAP:ABE-0000054,ECOCYC:EG10240,EcoGene:EG10240,GeneID:944753;Name=dnaJ;gbkey=Gene;gene=dnaJ;gene_biotype=protein_coding;gene_synonym=ECK0015,groP,grpC;locus_tag=b0015 NC_000913.3 RefSeq CDS 14168 15298 . + 0 Parent=gene-b0015;Dbxref=UniProtKB/Swiss-Prot:P08622,Genbank:NP_414556.1,ASAP:ABE-0000054,ECOCYC:EG10240,EcoGene:EG10240,GeneID:944753;Name=NP_414556.1;gbkey=CDS;gene=dnaJ;locus_tag=b0015;orig_transcript_id=gnl|b0015|mrna.b0015;product=chaperone protein DnaJ;protein_id=NP_414556.1;transl_table=11
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/stop_codons.gtf Sat Sep 25 15:38:01 2021 +0000 @@ -0,0 +1,14 @@ +19 protein_coding exon 291275 291386 . - . gene_id "ENSG00000141934"; transcript_id "ENST00000269812"; exon_number "1"; gene_name "PPAP2C"; gene_biotype "protein_coding"; transcript_name "PPAP2C-001"; exon_id "ENSE00001234447"; +19 protein_coding CDS 291275 291336 . - 0 gene_id "ENSG00000141934"; transcript_id "ENST00000269812"; exon_number "1"; gene_name "PPAP2C"; gene_biotype "protein_coding"; transcript_name "PPAP2C-001"; protein_id "ENSP00000269812"; +19 protein_coding start_codon 291334 291336 . - 0 gene_id "ENSG00000141934"; transcript_id "ENST00000269812"; exon_number "1"; gene_name "PPAP2C"; gene_biotype "protein_coding"; transcript_name "PPAP2C-001"; +19 protein_coding exon 288020 288171 . - . gene_id "ENSG00000141934"; transcript_id "ENST00000269812"; exon_number "2"; gene_name "PPAP2C"; gene_biotype "protein_coding"; transcript_name "PPAP2C-001"; exon_id "ENSE00003304149"; +19 protein_coding CDS 288020 288171 . - 2 gene_id "ENSG00000141934"; transcript_id "ENST00000269812"; exon_number "2"; gene_name "PPAP2C"; gene_biotype "protein_coding"; transcript_name "PPAP2C-001"; protein_id "ENSP00000269812"; +19 protein_coding exon 287474 287751 . - . gene_id "ENSG00000141934"; transcript_id "ENST00000269812"; exon_number "3"; gene_name "PPAP2C"; gene_biotype "protein_coding"; transcript_name "PPAP2C-001"; exon_id "ENSE00003352024"; +19 protein_coding CDS 287474 287751 . - 0 gene_id "ENSG00000141934"; transcript_id "ENST00000269812"; exon_number "3"; gene_name "PPAP2C"; gene_biotype "protein_coding"; transcript_name "PPAP2C-001"; protein_id "ENSP00000269812"; +19 protein_coding exon 282752 282809 . - . gene_id "ENSG00000141934"; transcript_id "ENST00000269812"; exon_number "4"; gene_name "PPAP2C"; gene_biotype "protein_coding"; transcript_name "PPAP2C-001"; exon_id "ENSE00000951309"; +19 protein_coding CDS 282752 282809 . - 1 gene_id "ENSG00000141934"; transcript_id "ENST00000269812"; exon_number "4"; gene_name "PPAP2C"; gene_biotype "protein_coding"; transcript_name "PPAP2C-001"; protein_id "ENSP00000269812"; +19 protein_coding exon 282134 282310 . - . gene_id "ENSG00000141934"; transcript_id "ENST00000269812"; exon_number "5"; gene_name "PPAP2C"; gene_biotype "protein_coding"; transcript_name "PPAP2C-001"; exon_id "ENSE00000951310"; +19 protein_coding CDS 282134 282310 . - 0 gene_id "ENSG00000141934"; transcript_id "ENST00000269812"; exon_number "5"; gene_name "PPAP2C"; gene_biotype "protein_coding"; transcript_name "PPAP2C-001"; protein_id "ENSP00000269812"; +19 protein_coding exon 281043 281537 . - . gene_id "ENSG00000141934"; transcript_id "ENST00000269812"; exon_number "6"; gene_name "PPAP2C"; gene_biotype "protein_coding"; transcript_name "PPAP2C-001"; exon_id "ENSE00000951311"; +19 protein_coding CDS 281391 281537 . - 0 gene_id "ENSG00000141934"; transcript_id "ENST00000269812"; exon_number "6"; gene_name "PPAP2C"; gene_biotype "protein_coding"; transcript_name "PPAP2C-001"; protein_id "ENSP00000269812"; +19 protein_coding stop_codon 281388 281390 . - 0 gene_id "ENSG00000141934"; transcript_id "ENST00000269812"; exon_number "6"; gene_name "PPAP2C"; gene_biotype "protein_coding"; transcript_name "PPAP2C-001";