changeset 0:60507a6de56c draft

Uploaded
author dereeper
date Sun, 16 Sep 2012 09:26:09 -0400
parents
children 5547f1dfd12e
files extract_proteic_seq_using_coordinates.pl extract_proteic_seq_using_coordinates.sh extract_proteic_seq_using_coordinates.xml
diffstat 3 files changed, 139 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/extract_proteic_seq_using_coordinates.pl	Sun Sep 16 09:26:09 2012 -0400
@@ -0,0 +1,59 @@
+#!/usr/bin/perl
+# Extract a sub-sequence from each record of a protein FASTA file, using
+# 1-based inclusive "id<TAB>start<TAB>end" ranges read from a coordinates file.
+use strict;
+use warnings;
+use Bio::Seq;
+use Bio::SeqIO;
+use Getopt::Long;
+
+my $usage = qq~Usage:$0 <args> [<opts>]
+where <args> are:
+    -i, --input_fasta       <input protein FASTA file>
+    -o, --output_fasta      <output FASTA file>
+    -c, --coordinates       <coordinates file>
+~;
+$usage .= "\n";
+
+my ($input_fasta,$output_fasta,$coordinate_file);
+
+# Short forms (-i, -o, -c) work through Getopt::Long's unique-abbreviation matching.
+GetOptions(
+	"input_fasta=s"  => \$input_fasta,
+	"output_fasta=s" => \$output_fasta,
+	"coordinates=s"  => \$coordinate_file
+) or die $usage;
+
+die $usage if ( !$input_fasta || !$output_fasta || !$coordinate_file );
+
+# Load the ranges, keyed by sequence id; a later line for the same id wins.
+my %coordinates;
+open(my $COORD, '<', $coordinate_file)
+  or die "Cannot open coordinates file '$coordinate_file': $!\n";
+while (my $line = <$COORD>)
+{
+	$line =~ s/[\r\n]+\z//;    # strip the line ending, including a DOS "\r\n"
+	my ($id,$start,$end) = split(/\t/,$line);
+	next if !defined $id || $id eq '';    # blank line: nothing to record
+	$coordinates{$id}{"start"} = $start;
+	$coordinates{$id}{"end"} = $end;
+}
+close($COORD);
+
+my $in  = Bio::SeqIO->new(-file => "$input_fasta" , '-format' => 'Fasta');
+my $out = Bio::SeqIO->new(-file => ">$output_fasta" , '-format' => 'Fasta');
+
+while ( my $seq = $in->next_seq() )
+{
+	my $id = $seq -> id();
+	next if !defined $id || !exists $coordinates{$id};    # no range for this record
+	my $start = $coordinates{$id}{"start"};
+	my $end = $coordinates{$id}{"end"};
+	# Ranges with a missing or zero bound are skipped, so the record is not written.
+	if ($start && $end)
+	{
+		my $subseq = $seq->subseq($start,$end);    # NOTE(review): presumably throws if the range exceeds the sequence - confirm
+		my $new_seq = Bio::Seq->new( -seq => $subseq, -id => $id );
+		$out->write_seq($new_seq);
+	}
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/extract_proteic_seq_using_coordinates.sh	Sun Sep 16 09:26:09 2012 -0400
@@ -0,0 +1,5 @@
+#!/bin/bash
+# Galaxy wrapper: forwards <input FASTA> <output FASTA> <coordinates file> to the Perl extractor.
+# Every expansion is quoted so paths containing spaces or glob characters reach the script intact.
+input=$1; output=$2; coordinates=$3
+"$HOME/galaxy_dist/tools/extract/extract_proteic_seq_using_coordinates.pl" -i "$input" -o "$output" -c "$coordinates"
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/extract_proteic_seq_using_coordinates.xml	Sun Sep 16 09:26:09 2012 -0400
@@ -0,0 +1,75 @@
+<tool id="extract_proteic_seq_from_coordinates" name="Extract protein sequences">
+	<description>using coordinates</description>
+	<command interpreter="bash">./extract_proteic_seq_using_coordinates.sh $input $output $coordinates</command> <!-- positional order matches $1/$2/$3 in the wrapper script -->
+	<inputs> <!-- datasets handed to the wrapper script -->
+		<param format="fasta" name="input" type="data" label="Protein FASTA file"/>	
+		<param format="txt" name="coordinates" type="data" label="Coordinates for extraction"/> <!-- tab-separated columns: sequence id, start, end -->
+	</inputs>
+	<outputs>
+		<data format="fasta" name="output" label="Extracted proteins"/> <!-- FASTA file written by the Perl script -->
+	</outputs>
+	<help>
+	
+.. class:: infomark
+
+**Program encapsulated in Galaxy by Southgreen**
+
+.. class:: infomark
+
+**extract_proteic_seq_using_coordinates.pl version 1.0, 2012**
+
+-----
+
+==========
+ Authors:
+==========
+
+**Dereeper A**
+
+-----
+
+===========
+ Overview:
+===========
+
+Extract a sub-sequence from each sequence of a protein FASTA file, using 1-based, inclusive start and end coordinates.
+
+
+-----
+
+**Example**
+
+If the input dataset is::
+
+	>MCCS00001-0.9-1
+	MRLQLGLRRLHFLRRRDHCNHHRRGFATKYSGRVVVETDNGRSFAVEVDNPILQTDVRGY
+	PLPRRDLICKVVSILQSPPSTASSSSFDDLFMDLSDYLETLNVMITPSEASEILKSLKSP
+	NLALKFFQFCSSEIPDFRHNSFTYNRILLILSKAYLPNRLDLVRNILNEMDQSATGGSIS
+	TVNILIGIFSDGQEYGGIDELEKCLGLVKKWELSLNCYTYKCLMQGYLRLNDSKKALEVY
+	REMTRRGYKLDIFAYNMLLDALAKDEK
+	>MCCS00001-0.1-1
+	MRLNSRFGTSSLIHVSLVLLLCFKASGGSAERSSAFFIFGDSTVDPGNNNYIKTTPENQA
+	NYKPYGQNGFFKEPTGRFSDGRIIVDYIAEYAKLPIIPPYLQPSADYSHGVNFASGGAGI
+	LSTTNPGVVIDLKTQLEYFHKVQRSLAEKLGTAEAEEIISNAVYFISMGSNDYMGGYLGN
+	PEMQQLHPPEDYVRMVIGNLTQGIQELYDRGARKFGFLSLCPLGCLPALRVLNPKGHDAG
+	CFEQASALALAHSNALQAVLPNLELLLPKGFKYCNSNFYDWLLDRINDPTKYGFKEGESA
+	CCGAGPYRGIFTCGGTKKDPNYELCDNPSDYVWFDSFHPTERIHEQFAKALWDGLSPSVG
+	PYNLEGLFFNKQTIADVVDNPETQQIF
+
+The coordinates file must be tab-separated (sequence id, start, end), in the form::
+
+	MCCS00001-0.9-1	2	6
+	MCCS00001-0.1-1	5	132
+
+Extracting sequences returns::
+
+	>MCCS00001-0.9-1
+	RLQLG
+	>MCCS00001-0.1-1
+	SRFGTSSLIHVSLVLLLCFKASGGSAERSSAFFIFGDSTVDPGNNNYIKTTPENQANYKP
+	YGQNGFFKEPTGRFSDGRIIVDYIAEYAKLPIIPPYLQPSADYSHGVNFASGGAGILSTT
+	NPGVVIDL
+
+
+	</help>
+</tool>