# HG changeset patch
# User dereeper
# Date 1347815805 14400
# Node ID 621bec1d98eaadbf0b48ff2389b1244aeb4e6793
# Parent 840657df66236601013e642e2fe4ad8675954008
Uploaded

diff -r 840657df6623 -r 621bec1d98ea extract_proteic_seq_using_coordinates.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/extract_proteic_seq_using_coordinates.pl Sun Sep 16 13:16:45 2012 -0400
@@ -0,0 +1,72 @@
+#!/usr/bin/perl
+
+# Extract a sub-sequence from each record of a FASTA file.
+#
+# The coordinates file holds one tab-separated record per line:
+#     <sequence id> <TAB> <start> <TAB> <end>
+# Positions are handed unchanged to the BioPerl subseq() method. Sequences
+# whose id has no usable entry in the coordinates file are not written.
+
+use strict;
+use warnings;
+use Bio::Seq;
+use Bio::SeqIO;
+use Getopt::Long;
+
+my $usage = qq~Usage: $0 <args>
+where <args> are:
+    -i, --input_fasta   <FASTA input file>
+    -o, --output_fasta  <FASTA output file>
+    -c, --coordinates   <tab-separated file: id, start, end>
+~;
+$usage .= "\n";
+
+my ($input_fasta, $output_fasta, $coordinate_file);
+
+# Reject unknown or malformed options instead of silently ignoring them.
+GetOptions(
+    "input_fasta=s"  => \$input_fasta,
+    "output_fasta=s" => \$output_fasta,
+    "coordinates=s"  => \$coordinate_file
+) or die $usage;
+
+die $usage
+    if ( !$input_fasta || !$output_fasta || !$coordinate_file );
+
+# Load the requested interval for every sequence id.
+my %coordinates;
+open(my $COORD, '<', $coordinate_file)
+    or die "Cannot open coordinates file '$coordinate_file': $!\n";
+while (my $line = <$COORD>)
+{
+    chomp($line);
+    $line =~ s/\r$//;              # tolerate CRLF line endings
+    next if $line =~ /^\s*$/;      # skip blank lines
+    my ($id, $start, $end) = split(/\t/, $line);
+    $coordinates{$id}{"start"} = $start;
+    $coordinates{$id}{"end"}   = $end;
+}
+close($COORD);
+
+my $in  = Bio::SeqIO->new(-file => "$input_fasta",   '-format' => 'Fasta');
+my $out = Bio::SeqIO->new(-file => ">$output_fasta", '-format' => 'Fasta');
+
+while ( my $seq = $in->next_seq() )
+{
+    my $id = $seq->id();
+
+    # exists() avoids autovivifying an entry for every sequence read.
+    next unless exists $coordinates{$id};
+    my $start = $coordinates{$id}{"start"};
+    my $end   = $coordinates{$id}{"end"};
+
+    # Incomplete records (missing or zero start/end) are skipped.
+    next unless $start && $end;
+
+    my $subseq  = $seq->subseq($start, $end);
+    my $new_seq = Bio::Seq->new(
+        -seq => $subseq,
+        -id  => $id
+    );
+    $out->write_seq($new_seq);
+}
diff -r 840657df6623 -r 621bec1d98ea extract_proteic_seq_using_coordinates.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/extract_proteic_seq_using_coordinates.sh Sun Sep 16 13:16:45 2012 -0400
@@ -0,0 +1,7 @@
+#!/bin/bash
+# Galaxy wrapper: runs the Perl extractor that sits next to this script.
+input=$1
+output=$2
+coordinates=$3
+directory=$(dirname "$0")
+"$directory/extract_proteic_seq_using_coordinates.pl" -i "$input" -o "$output" -c "$coordinates"
diff -r 840657df6623 -r 621bec1d98ea
extract_proteic_seq_using_coordinates.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/extract_proteic_seq_using_coordinates.xml Sun Sep 16 13:16:45 2012 -0400
@@ -0,0 +1,75 @@
+
+ from FASTA using coordinates
+ extract_proteic_seq_using_coordinates.sh $input $output $coordinates
+
+
+
+
+
+
+
+
+
+.. class:: infomark
+
+**Program encapsulated in Galaxy by Southgreen**
+
+.. class:: infomark
+
+**extract_proteic_seq_using_coordinates.pl version 1.0, 2012**
+
+-----
+
+==========
+ Authors:
+==========
+
+**Dereeper A**
+
+-----
+
+===========
+ Overview:
+===========
+
+Extract protein subsequences from FASTA file using coordinates.
+
+
+-----
+
+**Example**
+
+If the input dataset is::
+
+  >MCCS00001-0.9-1
+  MRLQLGLRRLHFLRRRDHCNHHRRGFATKYSGRVVVETDNGRSFAVEVDNPILQTDVRGY
+  PLPRRDLICKVVSILQSPPSTASSSSFDDLFMDLSDYLETLNVMITPSEASEILKSLKSP
+  NLALKFFQFCSSEIPDFRHNSFTYNRILLILSKAYLPNRLDLVRNILNEMDQSATGGSIS
+  TVNILIGIFSDGQEYGGIDELEKCLGLVKKWELSLNCYTYKCLMQGYLRLNDSKKALEVY
+  REMTRRGYKLDIFAYNMLLDALAKDEK
+  >MCCS00001-0.1-1
+  MRLNSRFGTSSLIHVSLVLLLCFKASGGSAERSSAFFIFGDSTVDPGNNNYIKTTPENQA
+  NYKPYGQNGFFKEPTGRFSDGRIIVDYIAEYAKLPIIPPYLQPSADYSHGVNFASGGAGI
+  LSTTNPGVVIDLKTQLEYFHKVQRSLAEKLGTAEAEEIISNAVYFISMGSNDYMGGYLGN
+  PEMQQLHPPEDYVRMVIGNLTQGIQELYDRGARKFGFLSLCPLGCLPALRVLNPKGHDAG
+  CFEQASALALAHSNALQAVLPNLELLLPKGFKYCNSNFYDWLLDRINDPTKYGFKEGESA
+  CCGAGPYRGIFTCGGTKKDPNYELCDNPSDYVWFDSFHPTERIHEQFAKALWDGLSPSVG
+  PYNLEGLFFNKQTIADVVDNPETQQIF
+
+Interval file must be in the form::
+
+  MCCS00001-0.9-1 2 6
+  MCCS00001-0.1-1 5 132
+
+Extracting sequences returns::
+
+  >MCCS00001-0.9-1
+  RLQLG
+  >MCCS00001-0.1-1
+  SRFGTSSLIHVSLVLLLCFKASGGSAERSSAFFIFGDSTVDPGNNNYIKTTPENQANYKP
+  YGQNGFFKEPTGRFSDGRIIVDYIAEYAKLPIIPPYLQPSADYSHGVNFASGGAGILSTT
+  NPGVVIDL
+
+
+
+