view cf_scripts/filter_contigs.0403.pl @ 19:f51dd12f2263 draft

Uploaded
author geert-vandeweyer
date Mon, 28 Jul 2014 13:34:56 -0400
parents 4a3afa90ff7f
children
line wrap: on
line source

#!/usr/bin/perl

use strict;
my $tol=300;
my $assembler=$ARGV[3];


#read in .psl file (blat output),
open(PSL, "$ARGV[0]") or die "Can't open .psl.\n";
my %blat_results;
#hash blat results by query name
while(my $line=<PSL>) {															
    chomp($line);
    my ($match, $mm, $rm, $nn, $qgpct, $qgpb, $tgpct, $tgpb, $strand, $qname, $qsize, $qstart, $qend, $tname, $tsize, $tstart, $tend, $stuff)=split(/\s+/, $line);
    if( ! exists $blat_results{$qname}) {
        $blat_results{$qname}=();     
    }
    push (@{$blat_results{$qname}}, $line);
}
close(PSL);

my %good_contigs;
my %good_blats;

#read in cf output

open(QP, "$ARGV[1]");
while(my $line=<QP>) {
    chomp($line);
    my $pf="pass";
    my ($x, $chr1, $pos1, $chr2, $pos2, $or, $ct, $splct)=split(/\s+/, $line);
    if($pf eq "pass") {
	
        foreach my $key(keys %blat_results) {
            my @score=(0,0);
            my @blat_out=();		

            for(my $j=0; $j<@{$blat_results{$key}}; $j++) {
                my ($match, $mm, $rm, $nn, $qgpct, $qgpb, $tgpct, $tgpb, $strand, $qname, $qsize, $qstart, $qend, $tname, $tsize, $tstart, $tend, $stuff)=split(/\s+/, $blat_results{$key}[$j]);
                if(($tname eq $chr1) && ($pos1>$tstart-$tol)&&($pos1<$tend+$tol)) {
                    $score[0]++;
                    push(@blat_out, $blat_results{$key}[$j]);
                }
                if(($tname eq $chr2) && ($pos2>$tstart-$tol)&&($pos2<$tend+$tol)) {
                    $score[1]++;
                    push(@blat_out, $blat_results{$key}[$j]);
                }
            }
           
            if($score[0]>0 && $score[1]>0) {			# if there is a contig with blat results corresponding to both sides of the cf result
				$good_contigs{$key}=$line;
				@{$good_blats{$key}}=@blat_out;
            }
        }
    }
}
close(QP);

my %all_ctgs;
print STDOUT "*********************************************\n$assembler results\n\n";

if(keys(%good_contigs)>0) {
	# read in fasta file containing contigs
    open(CT, "$ARGV[2]") or die "Can't open contigs.\n";
    my $curr_ctg="";
    while (my $line=<CT>) {
		chomp($line);
		if(substr($line, 0, 1) eq ">") {
			$curr_ctg=substr($line,1);
			$all_ctgs{$curr_ctg}=();								#read each contig into its own array
		}
		else {
			push(@{$all_ctgs{$curr_ctg}}, $line);			
		}
    }
    close(CT);

    foreach my $key(keys %all_ctgs) {
		if(exists $good_contigs{$key}) {
			my $ctg="";
			for(my $k=0; $k<@{$all_ctgs{$key}}; $k++) {
				$ctg.=$all_ctgs{$key}[$k];
			}
			foreach(@{$good_blats{$key}}) {
				my ($match, $mm, $rm, $nn, $qgpct, $qgpb, $tgpct, $tgpb, $strand, $qname, $qsize, $qstart, $qend, $tname, $size, $tstart, $tend, $stuff)=split(/\s+/, $_);
				print STDOUT "$qname\t$qstart\t$qend\t$tname\t$tstart\t$tend\t$strand\n";
	    		}
	    		print STDOUT "$key $good_contigs{$key}\n$ctg\n";

		}
    	}
}