Mercurial > repos > dereeper > subseq_protein
changeset 0:60507a6de56c draft
Uploaded
author | dereeper |
---|---|
date | Sun, 16 Sep 2012 09:26:09 -0400 |
parents | |
children | 5547f1dfd12e |
files | extract_proteic_seq_using_coordinates.pl extract_proteic_seq_using_coordinates.sh extract_proteic_seq_using_coordinates.xml |
diffstat | 3 files changed, 139 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
#!/usr/bin/perl

# extract_proteic_seq_using_coordinates.pl
#
# Extract one sub-sequence per record from a protein FASTA file.
#
# The coordinates file is tab-delimited, one record per line:
#     <sequence id> <TAB> <start> <TAB> <end>
# <start> and <end> are handed unchanged to the sequence object's subseq()
# method. FASTA records whose id has no usable coordinates entry are not
# written to the output. If an id appears several times in the coordinates
# file, the last line wins.

use strict;
use warnings;
use Bio::Seq;
use Bio::SeqIO;
use Getopt::Long;

my $usage = qq~Usage:$0 <args> [<opts>]
where <args> are:
    -i, --input_fasta         <input protein FASTA file>
    -o, --output_fasta        <output FASTA file>
    -c, --coordinates         <coordinates file>
~;
$usage .= "\n";

my ($input_fasta, $output_fasta, $coordinate_file);

# Getopt::Long accepts unique abbreviations, which is what makes the
# documented -i / -o / -c short forms work.
GetOptions(
    "input_fasta=s"  => \$input_fasta,
    "output_fasta=s" => \$output_fasta,
    "coordinates=s"  => \$coordinate_file
) or die $usage;

die $usage
    if ( !$input_fasta || !$output_fasta || !$coordinate_file );

# Load the coordinates table: id => { start => ..., end => ... }.
my %coordinates;
open(my $COORD, '<', $coordinate_file)
    or die "Cannot open coordinates file '$coordinate_file': $!\n";
while (my $line = <$COORD>) {
    # Strip LF and CRLF endings alike, so a file edited on Windows does not
    # leave a stray "\r" attached to the end coordinate.
    $line =~ s/\r?\n\z//;
    next if $line =~ /\A\s*\z/;

    my ($id, $start, $end) = split(/\t/, $line);

    # Lines without all three columns could never be extracted; skip them
    # instead of storing undefined coordinates.
    next unless defined $start && defined $end;

    $coordinates{$id} = { start => $start, end => $end };
}
close($COORD);

my $in  = Bio::SeqIO->new(-file => "$input_fasta",   '-format' => 'Fasta');
my $out = Bio::SeqIO->new(-file => ">$output_fasta", '-format' => 'Fasta');

while ( my $seq = $in->next_seq() ) {
    my $id = $seq->id();

    # exists() keeps the lookup from autovivifying an entry for every
    # sequence that is absent from the coordinates file.
    next unless defined $id && exists $coordinates{$id};

    my $start = $coordinates{$id}{start};
    my $end   = $coordinates{$id}{end};
    next unless $start && $end;

    my $subseq  = $seq->subseq($start, $end);
    my $new_seq = Bio::Seq->new(
        -seq => $subseq,
        -id  => $id
    );
    $out->write_seq($new_seq);
}

# Close both streams explicitly so the output is flushed before exit.
$in->close();
$out->close();
#!/bin/bash

# extract_proteic_seq_using_coordinates.sh
#
# Thin wrapper around extract_proteic_seq_using_coordinates.pl.
#
# Usage: extract_proteic_seq_using_coordinates.sh <input_fasta> <output_fasta> <coordinates_file>
#
# Exits with the status of the Perl script.

if [ "$#" -lt 3 ]; then
    echo "Usage: $0 <input_fasta> <output_fasta> <coordinates_file>" >&2
    exit 1
fi

input=$1
output=$2
coordinates=$3

# Prefer the Perl script that sits next to this wrapper; fall back to the
# location that was previously hard-coded here so existing installs keep
# working.
script_dir=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
extractor="$script_dir/extract_proteic_seq_using_coordinates.pl"
if [ ! -x "$extractor" ]; then
    extractor="$HOME/galaxy_dist/tools/extract/extract_proteic_seq_using_coordinates.pl"
fi

# Quote every expansion so paths containing spaces or glob characters are
# passed through as single arguments.
"$extractor" -i "$input" -o "$output" -c "$coordinates"
<!--
  extract_proteic_seq_using_coordinates.xml

  Galaxy tool definition. Runs extract_proteic_seq_using_coordinates.sh with
  the input FASTA dataset, the output dataset and the coordinates dataset,
  in that order.
-->
<tool id="extract_proteic_seq_from_coordinates" name="Extract protein sequences">
  <description>using coordinates</description>
  <!-- Substituted paths are quoted so each one reaches the wrapper as a single argument. -->
  <command interpreter="bash">./extract_proteic_seq_using_coordinates.sh '$input' '$output' '$coordinates'</command>
  <inputs>
    <param format="fasta" name="input" type="data" label="Protein FASTA file"/>
    <param format="txt" name="coordinates" type="data" label="Coordinates for extraction"/>
  </inputs>
  <outputs>
    <data format="fasta" name="output" label="Extracted proteins"/>
  </outputs>
  <help>

.. class:: infomark

**Program encapsulated in Galaxy by Southgreen**

.. class:: infomark

**extract_proteic_seq_using_coordinates.pl version 1.0, 2012**

-----

==========
 Authors:
==========

**Dereeper A**

-----

===========
 Overview:
===========

Extract sequences from a protein FASTA file using coordinates.

Sequences that have no entry in the coordinates file are not written to the output.

-----

**Example**

If the input dataset is::

    >MCCS00001-0.9-1
    MRLQLGLRRLHFLRRRDHCNHHRRGFATKYSGRVVVETDNGRSFAVEVDNPILQTDVRGY
    PLPRRDLICKVVSILQSPPSTASSSSFDDLFMDLSDYLETLNVMITPSEASEILKSLKSP
    NLALKFFQFCSSEIPDFRHNSFTYNRILLILSKAYLPNRLDLVRNILNEMDQSATGGSIS
    TVNILIGIFSDGQEYGGIDELEKCLGLVKKWELSLNCYTYKCLMQGYLRLNDSKKALEVY
    REMTRRGYKLDIFAYNMLLDALAKDEK
    >MCCS00001-0.1-1
    MRLNSRFGTSSLIHVSLVLLLCFKASGGSAERSSAFFIFGDSTVDPGNNNYIKTTPENQA
    NYKPYGQNGFFKEPTGRFSDGRIIVDYIAEYAKLPIIPPYLQPSADYSHGVNFASGGAGI
    LSTTNPGVVIDLKTQLEYFHKVQRSLAEKLGTAEAEEIISNAVYFISMGSNDYMGGYLGN
    PEMQQLHPPEDYVRMVIGNLTQGIQELYDRGARKFGFLSLCPLGCLPALRVLNPKGHDAG
    CFEQASALALAHSNALQAVLPNLELLLPKGFKYCNSNFYDWLLDRINDPTKYGFKEGESA
    CCGAGPYRGIFTCGGTKKDPNYELCDNPSDYVWFDSFHPTERIHEQFAKALWDGLSPSVG
    PYNLEGLFFNKQTIADVVDNPETQQIF

The coordinates file must be tab-delimited, with three columns (sequence identifier, start, end). Positions are 1-based and both ends are included::

    MCCS00001-0.9-1	2	6
    MCCS00001-0.1-1	5	132

Extracting sequences returns::

    >MCCS00001-0.9-1
    RLQLG
    >MCCS00001-0.1-1
    SRFGTSSLIHVSLVLLLCFKASGGSAERSSAFFIFGDSTVDPGNNNYIKTTPENQANYKP
    YGQNGFFKEPTGRFSDGRIIVDYIAEYAKLPIIPPYLQPSADYSHGVNFASGGAGILSTT
    NPGVVIDL


  </help>
</tool>