Mercurial > repos > cwallon > split_transcript
comparison SplitTranscript/splitTranscriptGff.pl @ 0:0153bf95965c draft default tip
Uploaded
| author | cwallon |
|---|---|
| date | Fri, 01 Mar 2013 11:44:48 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:0153bf95965c |
|---|---|
| 1 #!/usr/bin/perl -w | |
| 2 ### | |
| 3 # But : croiser | |
| 4 # | |
| 5 # Entrees : 2 fichiers gff à croiser | |
| 6 # | |
| 7 # Sortie : gff affiche a l'ecran | |
| 8 # | |
| 9 ###------------------------------------------------------ | |
| 10 use vars qw($USAGE); | |
| 11 use strict; | |
| 12 | |
| 13 =head1 NAME | |
| 14 | |
| 15 splitTranscriptGff.pl - compare 2 input gff files and define intervalls by couple of overlapping elements | |
| 16 | |
| 17 =head1 SYNOPSIS | |
| 18 | |
| 19 % intervallsExtractorGff.pl -i file1.gff -j file2.gff -s strand [-h] | |
| 20 | |
| 21 =head1 DESCRIPTION | |
| 22 This script will sort 2 gff files and compute distance between 2 successives lines. | |
| 23 | |
| 24 -i|--input1 fileName gff input file name: included elements | |
| 25 -j|--input2 fileName gff input file name: extended elements | |
| 26 [-s|--strand] [s|d] s for single strand (colinear) or d for double strands (antisense) [default d] | |
| 27 [-h|--help] help mode then die | |
| 28 | |
| 29 =head1 USECASE | |
| 30 Define 3 fragments for each extended element (CDS): CDS before/after antisens mapping, and overlaping antisense. | |
| 31 intervallsExtractorGff.pl -i *_anti_Jacq.gff -j *_antiCDS.gff > *_antiFragments.gff; | |
| 32 | |
| 33 =head1 KWON BUG | |
| 34 No disjonction of overlapping elements of the included elements (-i file). | |
| 35 In usecase, many overlapping genes (-c -d0) are fused in one long gene. | |
| 36 | |
| 37 =head1 AUTHOR | |
| 38 Claire Toffano-Nioche - sep.11 | |
| 39 | |
| 40 =cut | |
| 41 #----------------------- | |
| 42 sub feedPositionTab { my ($val, $pF, $pB, @info) = @_ ; | |
| 43 #print "feedPositionTab::$#info, ", ($#info+1)/4," \n"; | |
| 44 for (my $i=0 ; $i <= $#info ; $i+=4) { # for each extended element | |
| 45 #print "....$info[$i+2]\n"; | |
| 46 if ($info[$i+3] =~ /\+/) { | |
| 47 for (my $c = $info[$i+1] ; $c <= $info[$i+2] ; $c++) { @$pF[$c]=$val } ; # sequence Forward | |
| 48 } else { | |
| 49 for (my $c = $info[$i+1] ; $c <= $info[$i+2] ; $c++) { @$pB[$c]=$val } ; # sequence Backward | |
| 50 } | |
| 51 } | |
| 52 #print "feedPos...:: ", join(".", @$pF[0..100]), "\n"; | |
| 53 #print "feedPos...:: ", join(".", @$pB[0..100]), "\n"; | |
| 54 } | |
| 55 #----------------------- | |
| 56 sub recupInfo { my ($pInfo, @lines) = @_ ; | |
| 57 for (my $i=0 ; $i <= ($#lines+1)*4-1 ; $i+=4) { | |
| 58 my @line = split("\t",$lines[$i/4]); | |
| 59 push(@$pInfo, $line[0], $line[3], $line[4], $line[6]) ; # 0=nom, 3=debut, 4=fin, 6=sens | |
| 60 } | |
| 61 #print "recupInfo::fin=", ($#lines+1)*4, "\n" ; | |
| 62 } | |
| 63 #----------------------- | |
| 64 sub tagName { my ($seqN, $posB, $posE, $strand) = @_ ; | |
| 65 my $tagN=$seqN.$strand.$posB."..".$posE; | |
| 66 #print "tagName:",join("_",@_)," et tagName:$tagN\n"; | |
| 67 return $tagN; | |
| 68 } | |
| 69 #----------------------- | |
| 70 sub transitionAnalysis { | |
| 71 my ($pos, $seq, $s, $pdebAmont, $pfinAmont, $pdebIn, $pfinIn, $pdebAval, $pfinAval, $ptag) = @_ ; | |
| 72 my $enCours = 0 ; my $precedant = 0 ; | |
| 73 $enCours = @$ptag[$pos] ; | |
| 74 $precedant = ($s =~ /\+/?@$ptag[$pos-1]:@$ptag[$pos+1]) ; | |
| 75 if ($enCours ne $precedant) { | |
| 76 #print "transi...:: $s, $pos, $precedant, $enCours\n"; | |
| 77 #print "transition::$$pdebAmont, $$pfinAmont, $$pdebIn, $$pfinIn, $$pdebAval, $$pfinAval\n"; | |
| 78 SWITCH: for ($precedant.$enCours) { | |
| 79 /01/ && do { $$pdebAmont = $pos ; last SWITCH ;}; | |
| 80 /02/ && do { $$pdebIn = $pos ; last SWITCH ;}; | |
| 81 /10/ && do { $$pfinAval = ($s =~/\+/?$pos-1:$pos+1) ; | |
| 82 if (($s =~ /\+/)and ($$pdebAval!=$$pfinAval)) { | |
| 83 printf "%s\tsplit\tutr3\t%s\t%s\t.\t%s\t.\tName=%s;\n", | |
| 84 $seq, $$pdebAval, $$pfinAval, $s, &tagName($seq, $$pdebAval, $$pfinAval, $s) ; | |
| 85 #if ($$pdebAval==$$pfinAval) { print "transition 10 +\n"}; | |
| 86 } elsif ($$pfinAval!=$$pdebAval) { | |
| 87 printf "%s\tsplit\tutr3\t%s\t%s\t.\t%s\t.\tName=%s;\n", | |
| 88 $seq, $$pfinAval, $$pdebAval, $s, &tagName($seq, $$pfinAval, $$pdebAval, $s) ; | |
| 89 #if ($$pfinAval==$$pdebAval){ print "transition 10 -\n"}; | |
| 90 } | |
| 91 $$pdebAval = 0 ; $$pfinAval = 0 ; | |
| 92 last SWITCH ; | |
| 93 }; | |
| 94 /12/ && do { $$pdebIn = $pos ; $$pfinAmont=($s =~/\+/?$pos-1:$pos+1) ; | |
| 95 my $type="utr5"; | |
| 96 if ($$pdebAmont == 0) { # in case of interOperon : utr5-CDS-interOperon-CDS-utr3 | |
| 97 $$pdebAmont=($s =~/\+/?$$pfinIn+1:$$pfinIn-1) ; | |
| 98 $type="operon" ; | |
| 99 } | |
| 100 if (($s =~ /\+/) and ($$pdebAmont!=$$pfinAmont)) { | |
| 101 printf "%s\tsplit\t%s\t%s\t%s\t.\t%s\t.\tName=%s;\n", | |
| 102 $seq, $type, $$pdebAmont, $$pfinAmont, $s, &tagName($seq, $$pdebAmont, $$pfinAmont, $s) ; | |
| 103 # if ($$pdebAmont==$$pfinAmont) { print "transition 12 +\n"}; | |
| 104 } elsif ($$pfinAmont!=$$pdebAmont) { | |
| 105 printf "%s\tsplit\t%s\t%s\t%s\t.\t%s\t.\tName=%s;\n", | |
| 106 $seq, $type, $$pfinAmont, $$pdebAmont, $s, &tagName($seq, $$pfinAmont, $$pdebAmont, $s) ; | |
| 107 #if ($$pfinAmont==$$pdebAmont) { print "transition 12 -\n"} ; | |
| 108 } | |
| 109 $$pdebAmont = 0 ; $$pfinAmont = 0 ; | |
| 110 last SWITCH ; | |
| 111 }; | |
| 112 /20/ && do { $$pfinIn=($s =~/\+/?$pos-1:$pos+1) ; | |
| 113 if (($s =~ /\+/) and ($$pdebIn!=$$pfinIn)) { | |
| 114 printf "%s\tsplit\tgene\t%s\t%s\t.\t%s\t.\tName=%s;\n", | |
| 115 $seq, $$pdebIn, $$pfinIn, $s, &tagName($seq, $$pdebIn, $$pfinIn, $s) ; | |
| 116 } elsif ($$pfinIn!=$$pdebIn) { | |
| 117 printf "%s\tsplit\tgene\t%s\t%s\t.\t%s\t.\tName=%s;\n", | |
| 118 $seq, $$pfinIn, $$pdebIn, $s, &tagName($seq, $$pfinIn, $$pdebIn, $s) ; | |
| 119 } | |
| 120 $$pdebIn = 0 ; $$pfinIn = 0 ; | |
| 121 last SWITCH ; | |
| 122 }; | |
| 123 /21/ && do { $$pdebAval=$pos ; $$pfinIn=($s =~/\+/?$pos-1:$pos+1) ; | |
| 124 if (($s =~ /\+/) and ($$pdebIn!=$$pfinIn)) { | |
| 125 printf "%s\tsplit\tgene\t%s\t%s\t.\t%s\t.\tName=%s;\n", | |
| 126 $seq, $$pdebIn, $$pfinIn, $s, &tagName($seq, $$pdebIn, $$pfinIn, $s) ; | |
| 127 } elsif ($$pfinIn!=$$pdebIn) { | |
| 128 printf "%s\tsplit\tgene\t%s\t%s\t.\t%s\t.\tName=%s;\n", | |
| 129 $seq, $$pfinIn, $$pdebIn, $s, &tagName($seq, $$pfinIn, $$pdebIn, $s) ; | |
| 130 } | |
| 131 #$$pdebIn = 0 ; $$pfinIn = 0 ; | |
| 132 last SWITCH ; | |
| 133 }; | |
| 134 } | |
| 135 } | |
| 136 } | |
| 137 #----------------------- | |
| 138 my ($fileNameI, $fileNameE, $strand) = ("", "", 0) ; | |
| 139 # command line check | |
| 140 foreach my $num (0 .. $#ARGV) { | |
| 141 SWITCH: for ($ARGV[$num]) { | |
| 142 /--input1|-i/ && do { | |
| 143 $fileNameI=$ARGV[$num+1]; | |
| 144 open ( fichierGffI, "< $fileNameI" ) or die "Can't open gff file: \"$fileNameI\"\n" ; | |
| 145 last }; | |
| 146 /--input2|-j/ && do { | |
| 147 $fileNameE=$ARGV[$num+1]; | |
| 148 open ( fichierGffE, "< $fileNameE" ) or die "Can't open gff file: \"$fileNameE\"\n" ; | |
| 149 last }; | |
| 150 /--strand|-s/ && do { | |
| 151 if ($ARGV[$num+1] eq "s") { $strand=1}; | |
| 152 last }; | |
| 153 /--help|-h/ && do { exec("pod2text $0\n") ; die }; | |
| 154 } | |
| 155 } | |
| 156 # memory declarations: | |
| 157 my @infoI ; my @infoE ; | |
| 158 my $seqName ; | |
| 159 my @tagF ; my @tagB ; # Forward and Backward sequence | |
| 160 # data retrieval: | |
| 161 my @linesI = <fichierGffI> ; my @linesE = <fichierGffE> ; | |
| 162 close fichierGffI ; close fichierGffE ; | |
| 163 #print "gff files read ; number of lines : $#lines1 + $#lines2\n"; | |
| 164 # positions management | |
| 165 &recupInfo(\@infoI, @linesI) ; | |
| 166 &recupInfo(\@infoE, @linesE) ; | |
| 167 # treatement: | |
| 168 # transform gff lines into chromosomal position tags : 0 for nothing, 1 resp. 2 for extended resp. included elements | |
| 169 if (($#infoI) and ($#infoE)) { | |
| 170 $seqName=$infoI[0] ; | |
| 171 #print "fin : $infoE[$#infoE-1]\n"; | |
| 172 for (my $i=0 ; $i <= $infoE[$#infoE-1] ; $i++) { $tagF[$i] = 0 ; $tagB[$i] = 0 ; } ; # "O" tag in all chr. positions | |
| 173 #print "seqName : $seqName\n" ; | |
| 174 &feedPositionTab(1, \@tagF, \@tagB, @infoE) ; # "1" tag for all extended elements | |
| 175 &feedPositionTab(2, \@tagF, \@tagB, @infoI) ; # "2" tag for all included elements | |
| 176 #print join("", @tagF), "\n"; | |
| 177 #print join("", @tagB), "\n"; | |
| 178 # transition management: | |
| 179 my ($beginUpstream, $endUpstream, $beginIncluded, $endIncluded, $beginDownstream, $endDownstream) | |
| 180 = (0, 0, 0, 0, 0, 0) ; | |
| 181 for (my $i=1 ; $i <= $#tagF-1 ; $i+=1) { | |
| 182 &transitionAnalysis($i, $seqName, "+", \$beginUpstream, \$endUpstream, \$beginIncluded, \$endIncluded, \$beginDownstream, \$endDownstream, \@tagF) ; | |
| 183 } | |
| 184 ($beginUpstream, $endUpstream, $beginIncluded, $endIncluded, $beginDownstream, $endDownstream) = ($infoE[$#infoE-1], $infoE[$#infoE-1], $infoE[$#infoE-1], $infoE[$#infoE-1], $infoE[$#infoE-1], $infoE[$#infoE-1]) ; | |
| 185 for (my $i=$#tagB-1 ; $i >= 1 ; $i-=1) { | |
| 186 &transitionAnalysis($i, $seqName, "-", \$beginUpstream, \$endUpstream, \$beginIncluded, \$endIncluded, \$beginDownstream, \$endDownstream, \@tagB) ; | |
| 187 } | |
| 188 } | |
| 189 exit(0) ; |
