# HG changeset patch
# User dereeper
# Date 1347805227 14400
# Node ID 542fcfc6e126de9d87c9423fca6f04b9215fefd6
# Parent 5547f1dfd12e0c9566f80bb10dba4417c2cb27d7
Uploaded

diff -r 5547f1dfd12e -r 542fcfc6e126 extract_proteic_seq_using_coordinates.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/extract_proteic_seq_using_coordinates.pl Sun Sep 16 10:20:27 2012 -0400
@@ -0,0 +1,81 @@
+#!/usr/bin/perl
+
+# Extract sub-sequences from a protein FASTA file.
+#
+# For every sequence of the input FASTA whose identifier is listed in the
+# coordinates file, the region start..end is written to the output FASTA
+# under the same identifier. Sequences without coordinates are skipped.
+#
+# The coordinates file is tab-separated with three columns: id, start, end.
+
+use strict;
+use warnings;
+use Bio::Seq;
+use Bio::SeqIO;
+use Getopt::Long;
+
+my $usage = qq~Usage: $0 <args>
+where <args> are:
+    -i, --input_fasta   <FASTA input file>
+    -o, --output_fasta  <FASTA output file>
+    -c, --coordinates   <tab-separated file: id, start, end>
+~;
+$usage .= "\n";
+
+my ($input_fasta,$output_fasta,$coordinate_file);
+
+# Short aliases are declared explicitly so that -i/-o/-c do not depend on
+# Getopt::Long's automatic abbreviation; a parsing error prints the usage.
+GetOptions(
+    "input_fasta|i=s"  => \$input_fasta,
+    "output_fasta|o=s" => \$output_fasta,
+    "coordinates|c=s"  => \$coordinate_file
+) or die $usage;
+
+die $usage
+  if ( !$input_fasta || !$output_fasta || !$coordinate_file );
+
+# Load the requested interval of each sequence: id => { start, end }.
+# When an id occurs several times, the last line wins.
+my %coordinates;
+open(my $COORD, '<', $coordinate_file)
+  or die "Cannot open coordinates file '$coordinate_file': $!\n";
+while (my $line = <$COORD>)
+{
+    $line =~ s/\r?\n\z//;        # chomp, tolerating Windows line endings
+    next if $line =~ /^\s*$/;    # ignore blank lines
+
+    my ($id,$start,$end) = split(/\t/,$line);
+    if (!defined $end)
+    {
+        warn "Skipping malformed line $. of '$coordinate_file' (expected: id<TAB>start<TAB>end)\n";
+        next;
+    }
+    $coordinates{$id}{"start"} = $start;
+    $coordinates{$id}{"end"}   = $end;
+}
+close($COORD);
+
+my $in  = Bio::SeqIO->new(-file => "$input_fasta" , '-format' => 'Fasta');
+my $out = Bio::SeqIO->new(-file => ">$output_fasta" , '-format' => 'Fasta');
+
+while ( my $seq = $in->next_seq() )
+{
+    my $id = $seq->id();
+
+    # exists() keeps the lookup from autovivifying an entry for every FASTA id
+    next unless exists $coordinates{$id};
+
+    my $start = $coordinates{$id}{"start"};
+    my $end   = $coordinates{$id}{"end"};
+
+    # A missing, empty or zero coordinate disables extraction for that id
+    if ($start && $end)
+    {
+        my $subseq = $seq->subseq($start,$end);
+        my $new_seq = Bio::Seq->new( -seq => $subseq,
+                                     -id  => $id
+                                   );
+        $out->write_seq($new_seq);
+    }
+}
diff -r 5547f1dfd12e -r 542fcfc6e126 extract_proteic_seq_using_coordinates.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/extract_proteic_seq_using_coordinates.xml Sun Sep 16 10:20:27 2012 -0400
@@ -0,0 +1,75 @@
+
+ using coordinates
+ extract_proteic_seq_using_coordinates.pl -i $input -o $output -c $coordinates
+
+
+
+
+
+
+
+
+
+.. class:: infomark
+
+**Program encapsulated in Galaxy by Southgreen**
+
+.. 
class:: infomark + +**extract_proteic_seq_using_coordinates.pl version 1.0, 2012** + +----- + +========== + Authors: +========== + +**Dereeper A** + +----- + +=========== + Overview: +=========== + +Extract sequences from a protein FASTA file using coordinates. + + +----- + +**Example** + +If the input dataset is:: + + >MCCS00001-0.9-1 + MRLQLGLRRLHFLRRRDHCNHHRRGFATKYSGRVVVETDNGRSFAVEVDNPILQTDVRGY + PLPRRDLICKVVSILQSPPSTASSSSFDDLFMDLSDYLETLNVMITPSEASEILKSLKSP + NLALKFFQFCSSEIPDFRHNSFTYNRILLILSKAYLPNRLDLVRNILNEMDQSATGGSIS + TVNILIGIFSDGQEYGGIDELEKCLGLVKKWELSLNCYTYKCLMQGYLRLNDSKKALEVY + REMTRRGYKLDIFAYNMLLDALAKDEK + >MCCS00001-0.1-1 + MRLNSRFGTSSLIHVSLVLLLCFKASGGSAERSSAFFIFGDSTVDPGNNNYIKTTPENQA + NYKPYGQNGFFKEPTGRFSDGRIIVDYIAEYAKLPIIPPYLQPSADYSHGVNFASGGAGI + LSTTNPGVVIDLKTQLEYFHKVQRSLAEKLGTAEAEEIISNAVYFISMGSNDYMGGYLGN + PEMQQLHPPEDYVRMVIGNLTQGIQELYDRGARKFGFLSLCPLGCLPALRVLNPKGHDAG + CFEQASALALAHSNALQAVLPNLELLLPKGFKYCNSNFYDWLLDRINDPTKYGFKEGESA + CCGAGPYRGIFTCGGTKKDPNYELCDNPSDYVWFDSFHPTERIHEQFAKALWDGLSPSVG + PYNLEGLFFNKQTIADVVDNPETQQIF + +Interval file must be tab-separated (sequence ID, start, end; positions are 1-based and inclusive), in the form:: + + MCCS00001-0.9-1 2 6 + MCCS00001-0.1-1 5 132 + +Extracting sequences returns:: + + >MCCS00001-0.9-1 + RLQLG + >MCCS00001-0.1-1 + SRFGTSSLIHVSLVLLLCFKASGGSAERSSAFFIFGDSTVDPGNNNYIKTTPENQANYKP + YGQNGFFKEPTGRFSDGRIIVDYIAEYAKLPIIPPYLQPSADYSHGVNFASGGAGILSTT + NPGVVIDL + + + +