Mercurial > repos > devteam > categorize_elements_satisfying_criteria
changeset 0:314830c0db00 draft default tip
Uploaded
author | devteam |
---|---|
date | Tue, 20 Aug 2013 09:22:17 -0400 |
parents | |
children | |
files | categorize_elements_satisfying_criteria.pl categorize_elements_satisfying_criteria.xml test-data/categories.tabular test-data/categorized_elements.tabular test-data/criteria_elements_data.tabular |
diffstat | 5 files changed, 275 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
#!/usr/bin/perl -w

# File: categorize_elements_satisfying_criteria.pl
#
# The program takes as input a set of categories, such that each category
# contains many elements. It also takes a table relating elements with
# criteria, such that each element is assigned a number representing the
# number of times the element satisfies a certain criterion.
#
# Input 1: a TABULAR file in which the left column holds the name of a
#          category and all other columns hold the names of its elements.
# Input 2: a TABULAR file relating elements with criteria; the first line
#          holds the names of the criteria and the left column holds the
#          names of the elements.
# Output:  a TABULAR file relating categories with criteria; each category
#          is assigned, for every criterion, the total number of times its
#          elements satisfy that criterion.
#
# Usage: categorize_elements_satisfying_criteria.pl CATEGORIES ELEMENTS OUTPUT

use strict;
use warnings;

# check to make sure having correct files
my $usage = "usage: categorize_elements_satisfying_criteria.pl [TABULAR.in] [TABULAR.in] [TABULAR.out] \n";
die $usage unless @ARGV == 3;

my ($categories_inputFile, $elements_data_inputFile, $categorized_data_outputFile) = @ARGV;

# open the input and output files, reporting the system error on failure
open(my $categories_fh, "<", $categories_inputFile)
    or die("Could not open file $categories_inputFile: $!\n");
open(my $elements_fh, "<", $elements_data_inputFile)
    or die("Could not open file $elements_data_inputFile: $!\n");
open(my $output_fh, ">", $categorized_data_outputFile)
    or die("Could not open file $categorized_data_outputFile: $!\n");

# ---------------------------------------------------------------------------
# Pass 1: read the categories and remember, for every element, the output
# row of the category it belongs to.
# ---------------------------------------------------------------------------
my @category_names  = ();   # category names in input order (one output row each)
my %row_of_category = ();   # category name => index of its FIRST row
my %row_of_element  = ();   # element name  => row receiving its counts

while (my $line = <$categories_fh>) {
    $line =~ s/[\r\n]+\z//;          # tolerate both LF and CRLF line endings

    my ($category, @members) = split(/\t/, $line);

    # split() returns an empty list for blank or tab-only lines; skip them
    # instead of creating a category with an undefined name
    next unless defined $category;

    push @category_names, $category;

    # if a category name is repeated, counts go to its first row
    $row_of_category{$category} = $#category_names
        unless exists $row_of_category{$category};

    # an element listed in several categories is attributed to the last one
    for my $member (@members) {
        $row_of_element{$member} = $row_of_category{$category};
    }
}

# ---------------------------------------------------------------------------
# Pass 2: read the criteria header, then add every positive value to the
# counter of the element's category.
# ---------------------------------------------------------------------------
my @criteria = ();

my $header = <$elements_fh>;
if (defined $header) {
    $header =~ s/[\r\n]+\z//;
    @criteria = split(/\t/, $header);
}

# one row of zeroed counters per category, one counter per criterion
my @counts = map { [ (0) x scalar(@criteria) ] } @category_names;

while (my $line = <$elements_fh>) {
    $line =~ s/[\r\n]+\z//;

    my ($element, @values) = split(/\t/, $line);
    next unless defined $element;

    # Elements that belong to no category are ignored. Without this guard the
    # undefined row index would be treated as 0 and the counts would be added
    # to the first category.
    next unless exists $row_of_element{$element};
    my $row = $row_of_element{$element};

    for my $criterion (0 .. $#criteria) {
        my $value = $values[$criterion];

        # short lines and empty cells contribute nothing
        next unless defined $value && $value ne '';

        $counts[$row][$criterion] += $value if $value > 0;
    }
}

# ---------------------------------------------------------------------------
# Output: a header line (empty corner cell followed by the criteria names),
# then one line per category with its name and its counters.
# ---------------------------------------------------------------------------
print {$output_fh} "\t" . join("\t", @criteria) . "\n";

for my $row (0 .. $#category_names) {
    print {$output_fh} join("\t", $category_names[$row], @{ $counts[$row] }) . "\n";
}

# close the input and output files; buffered write errors surface at close
close($output_fh)
    or die("Could not close file $categorized_data_outputFile: $!\n");
close($elements_fh);
close($categories_fh);
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/categorize_elements_satisfying_criteria.xml Tue Aug 20 09:22:17 2013 -0400 @@ -0,0 +1,78 @@ +<tool id="categorize_elements_satisfying_criteria" name="Categorize Elements" version="1.0.0"> + <description>satisfying criteria</description> + + <command interpreter="perl"> + categorize_elements_satisfying_criteria.pl $inputFile1 $inputFile2 $outputFile1 + </command> + + <inputs> + <param format="tabular" name="inputFile1" type="data" label="Select file containing categories and their elements"/> + <param format="tabular" name="inputFile2" type="data" label="Select file containing criteria and elements data"/> + </inputs> + + <outputs> + <data format="tabular" name="outputFile1"/> + </outputs> + + <tests> + <test> + <param name="inputFile1" value="categories.tabular" ftype="tabular" /> + <param name="inputFile2" value="criteria_elements_data.tabular" ftype="tabular" /> + <output name="outputFile1" file="categorized_elements.tabular" /> + </test> + </tests> + + + <help> + +.. class:: infomark + +**What it does** + +The program takes as input a set of categories, such that each category contains many elements. It also takes a table relating elements with criteria, such that each element is assigned a number representing the number of times the element satisfies a certain criterion. + +- The first input is a TABULAR format file, such that the left column represents the names of categories and all other columns represent the names of elements in each category. +- The second input is a TABULAR format file relating elements with criteria, such that the first line represents the names of criteria and the left column represents the names of elements. +- The output is a TABULAR format file relating categories with criteria, such that each category is assigned a number representing the total number of times its elements satisfy a certain criterion. Each category is assigned as many numbers as criteria. 
+ + +**Example** + +Let the first input file be a group of motif categories as follows:: + + Deletion_Hotspots deletionHoptspot1 deletionHoptspot2 deletionHoptspot3 + Dna_Pol_Pause_Frameshift dnaPolPauseFrameshift1 dnaPolPauseFrameshift2 dnaPolPauseFrameshift3 dnaPolPauseFrameshift4 + Indel_Hotspots indelHotspot1 + Insertion_Hotspots insertionHotspot1 insertionHotspot2 + Topoisomerase_Cleavage_Sites topoisomeraseCleavageSite1 topoisomeraseCleavageSite2 topoisomeraseCleavageSite3 + + +And let the second input file represent the number of times each motif occurs in a certain window size of indel flanking regions, as follows:: + + 10bp 20bp 40bp + deletionHoptspot1 1 1 2 + deletionHoptspot2 1 1 1 + deletionHoptspot3 0 0 0 + dnaPolPauseFrameshift1 1 1 1 + dnaPolPauseFrameshift2 0 2 1 + dnaPolPauseFrameshift3 0 0 0 + dnaPolPauseFrameshift4 0 1 2 + indelHotspot1 0 0 0 + insertionHotspot1 0 0 1 + insertionHotspot2 1 1 1 + topoisomeraseCleavageSite1 1 1 1 + topoisomeraseCleavageSite2 1 2 1 + topoisomeraseCleavageSite3 0 0 2 + +Running the program will give the total number of times the motifs of each category occur in every window size of indel flanking regions:: + + 10bp 20bp 40bp + Deletion_Hotspots 2 2 3 + Dna_Pol_Pause_Frameshift 1 4 4 + Indel_Hotspots 0 0 0 + Insertion_Hotspots 1 1 2 + Topoisomerase_Cleavage_Sites 2 3 4 + + </help> + +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/categories.tabular Tue Aug 20 09:22:17 2013 -0400 @@ -0,0 +1,5 @@ +Deletion_Hotspots deletionHoptspot1 deletionHoptspot2 deletionHoptspot3 +Dna_Pol_Pause_Frameshift dnaPolPauseFrameshift1 dnaPolPauseFrameshift2 dnaPolPauseFrameshift3 dnaPolPauseFrameshift4 +Indel_Hotspots indelHotspot1 +Insertion_Hotspots insertionHotspot1 insertionHotspot2 +Topoisomerase_Cleavage_Sites topoisomeraseCleavageSite1 topoisomeraseCleavageSite2 topoisomeraseCleavageSite3
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/categorized_elements.tabular Tue Aug 20 09:22:17 2013 -0400 @@ -0,0 +1,6 @@ + 10bp 20bp 40bp +Deletion_Hotspots 2 2 3 +Dna_Pol_Pause_Frameshift 1 4 4 +Indel_Hotspots 0 0 0 +Insertion_Hotspots 1 1 2 +Topoisomerase_Cleavage_Sites 2 3 4
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/criteria_elements_data.tabular Tue Aug 20 09:22:17 2013 -0400 @@ -0,0 +1,14 @@ +10bp 20bp 40bp +deletionHoptspot1 1 1 2 +deletionHoptspot2 1 1 1 +deletionHoptspot3 0 0 0 +dnaPolPauseFrameshift1 1 1 1 +dnaPolPauseFrameshift2 0 2 1 +dnaPolPauseFrameshift3 0 0 0 +dnaPolPauseFrameshift4 0 1 2 +indelHotspot1 0 0 0 +insertionHotspot1 0 0 1 +insertionHotspot2 1 1 1 +topoisomeraseCleavageSite1 1 1 1 +topoisomeraseCleavageSite2 1 2 1 +topoisomeraseCleavageSite3 0 0 2