changeset 0:314830c0db00 draft default tip

Uploaded
author devteam
date Tue, 20 Aug 2013 09:22:17 -0400
parents
children
files categorize_elements_satisfying_criteria.pl categorize_elements_satisfying_criteria.xml test-data/categories.tabular test-data/categorized_elements.tabular test-data/criteria_elements_data.tabular
diffstat 5 files changed, 275 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/categorize_elements_satisfying_criteria.pl	Tue Aug 20 09:22:17 2013 -0400
@@ -0,0 +1,172 @@
+#!/usr/bin/perl -w
+
+# The program takes as input a set of categories, such that each category contains many elements.
+# It also takes a table relating elements with criteria, such that each element is assigned a number
+# representing the number of times the element satisfies a certain criterion. 
+# The first input is a TABULAR format file, such that the left column represents the names of categories and
+# all other columns represent the names of elements.
+# The second input is a TABULAR format file relating elements with criteria, such that the first line
+# represents the names of criteria and the left column represents the names of elements.
+# The output is a TABULAR format file relating categories with criteria, such that each category is
+# assigned a number representing the total number of times its elements satisfy a certain criterion.
+# Each category is assigned as many numbers as criteria.
+
+use strict;
+use warnings;
+
+#state used while reading the categories input file
+my @categoryElementsArray;  #tab-separated fields of the current line
+my @categoriesArray;  #category names, in the order they were read
+my $categoryMemberNames;  #current line of the file
+my $categoryName;  #category the current element belongs to
+my %categoryMembersHash;  #element name => category name
+my $memberNumber = 0;  #index of the current field within the line
+my $totalMembersNumber = 0;  #number of fields in the current line
+my $totalCategoriesNumber = 0;  #number of categories read
+my @categoryCountersTwoDimArray;  #one row per category: its name, then one total per criterion
+my $lineCounter1 = 0;  #number of category lines stored so far
+
+#state used while reading the criteria and elements data input file
+my $elementLine;  #current line of the file
+my @elementDataArray;  #tab-separated fields of the current line
+my $elementName;  #element named in the left column of the current line
+my @criteriaArray;  #criteria names taken from the first line
+my $criteriaNumber = 0;  #index of the current criterion, starting at 0
+my $totalCriteriaNumber = 0;  #number of criteria
+my $lineCounter2 = 0;  #number of lines read so far
+
+#indices into the two-dimensional array of results
+my $row = 0;  #category (row) index
+my $column = 0;  #column index; column 0 holds the category name
+
+# Check that exactly three file paths were given; otherwise print the usage
+# text (which must name this script) and stop.
+my $usage = "usage: categorize_elements_satisfying_criteria.pl [TABULAR.in] [TABULAR.in] [TABULAR.out] \n";
+die $usage unless @ARGV == 3;
+
+# Positional arguments, in order:
+#   1. the categories input file
+#   2. the criteria and elements data input file
+#   3. the output file
+my ($categories_inputFile,
+    $elements_data_inputFile,
+    $categorized_data_outputFile) = @ARGV;
+
+#open the input and output files; on failure stop and report the system error ($!)
+open (INPUT1, "<", $categories_inputFile) || die("Could not open file $categories_inputFile: $!\n");
+open (INPUT2, "<", $elements_data_inputFile) || die("Could not open file $elements_data_inputFile: $!\n");
+open (OUTPUT, ">", $categorized_data_outputFile) || die("Could not open file $categorized_data_outputFile: $!\n");
+
+#read the first input file line by line; each line holds a category name
+#followed by the names of the elements that belong to it
+
+#reset the value of $lineCounter1 to 0; it counts the categories stored so far
+$lineCounter1 = 0;
+
+while ($categoryMemberNames = <INPUT1>){
+	chomp ($categoryMemberNames);
+
+	@categoryElementsArray = split(/\t/, $categoryMemberNames);
+
+	#skip blank lines: they carry no category name and would otherwise add
+	#a nameless category row to the output
+	next unless @categoryElementsArray;
+
+	#store the name of the current category into an array
+	$categoriesArray [$lineCounter1] = $categoryElementsArray[0];
+
+	#store the name of the current category into column 0 of the two-dimensional array
+	$categoryCountersTwoDimArray [$lineCounter1] [0] = $categoryElementsArray[0];
+
+	#map every element of the current category (fields 1 and up) to the category name
+	$totalMembersNumber = @categoryElementsArray;
+	for ($memberNumber = 1; $memberNumber < $totalMembersNumber; $memberNumber++) {
+		$categoryMembersHash{$categoryElementsArray[$memberNumber]} = $categoriesArray[$lineCounter1];
+	}
+
+	$lineCounter1++;
+}
+
+#store the second input file into an array
+my @elementsData = <INPUT2>;
+
+#reset the value of $lineCounter2 to 0
+$lineCounter2 = 0;
+
+#row index of each category name in the two-dimensional array, filled at the first line
+my %categoryRowOf = ();
+
+#iterate through the second input file in order to add up, per category and
+#criterion, the numbers assigned to the elements of that category
+foreach $elementLine (@elementsData){
+	chomp ($elementLine);
+	$lineCounter2++;
+
+	@elementDataArray = split(/\t/, $elementLine);
+
+	#if at the first line, get the total number of criteria and the total
+	#number of categories and initialize the two-dimensional array
+	if ($lineCounter2 == 1){
+		@criteriaArray = @elementDataArray;
+		$totalCriteriaNumber = @elementDataArray;
+		$totalCategoriesNumber = @categoriesArray;
+
+		for ($row = 0; $row < $totalCategoriesNumber; $row++) {
+			for ($column = 1; $column <= $totalCriteriaNumber; $column++) {
+				$categoryCountersTwoDimArray [$row][$column] = 0;
+			}
+			#the first row wins when a category name occurs on several lines
+			my $name = $categoriesArray[$row];
+			$categoryRowOf{$name} = $row if defined $name && !exists $categoryRowOf{$name};
+		}
+		next;
+	}
+
+	#look up the category of the element; blank lines and elements that belong
+	#to no category are skipped, so their counts are not credited to any row
+	$elementName = $elementDataArray[0];
+	next unless defined $elementName;
+	$categoryName = $categoryMembersHash{$elementName};
+	next unless defined $categoryName && exists $categoryRowOf{$categoryName};
+	my $categoryIndex = $categoryRowOf{$categoryName};
+
+	#add every positive number of the element to the counter of its category
+	for ($criteriaNumber = 0; $criteriaNumber < $totalCriteriaNumber; $criteriaNumber++) {
+		my $count = $elementDataArray[$criteriaNumber + 1];
+		next unless defined $count && $count ne '' && $count > 0;
+		$categoryCountersTwoDimArray [$categoryIndex] [$criteriaNumber + 1] += $count;
+	}
+}
+
+#write the results to the output file as a table: one header line holding
+#the criteria names, followed by one line per category
+
+#the header line starts with an empty cell above the category names
+print OUTPUT "\t";
+
+#every criterion name is followed by a tab, except the last one, which is
+#followed by the newline that ends the header line
+foreach my $criterionPosition (1 .. $totalCriteriaNumber) {
+
+	my $headerSeparator = ($criterionPosition < $totalCriteriaNumber) ? "\t" : "\n";
+
+	print OUTPUT $criteriaArray[$criterionPosition - 1] . $headerSeparator;
+}
+
+#each category line holds the category name (column 0) followed by its
+#totals (columns 1 and up); cells are separated by tabs and the last cell
+#is followed by a newline
+foreach my $categoryRow (0 .. $totalCategoriesNumber - 1) {
+
+	foreach my $cellPosition (0 .. $totalCriteriaNumber) {
+
+		my $cellSeparator = ($cellPosition < $totalCriteriaNumber) ? "\t" : "\n";
+		print OUTPUT $categoryCountersTwoDimArray[$categoryRow][$cellPosition] . $cellSeparator;
+	}
+}
+
+#close the files; a failed close of the output file means that not all data could be written
+close(OUTPUT) || die("Could not close file $categorized_data_outputFile: $!\n");
+close(INPUT2);
+close(INPUT1);
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/categorize_elements_satisfying_criteria.xml	Tue Aug 20 09:22:17 2013 -0400
@@ -0,0 +1,78 @@
+<tool id="categorize_elements_satisfying_criteria" name="Categorize Elements" version="1.0.0">
+  <description>satisfying criteria</description>
+  
+  <command interpreter="perl">
+  	categorize_elements_satisfying_criteria.pl $inputFile1 $inputFile2 $outputFile1
+  </command>
+
+  <inputs>
+  	<param format="tabular" name="inputFile1" type="data" label="Select file containing categories and their elements"/>
+  	<param format="tabular" name="inputFile2" type="data" label="Select file containing criteria and elements data"/>
+  </inputs>
+  
+  <outputs>
+    <data format="tabular" name="outputFile1"/>
+  </outputs>
+
+  <tests>
+  	<test>
+  		<param name="inputFile1" value="categories.tabular" ftype="tabular" />
+  		<param name="inputFile2" value="criteria_elements_data.tabular" ftype="tabular" />
+    	<output name="outputFile1" file="categorized_elements.tabular" />
+  	</test>
+  </tests>
+  
+  	
+  <help> 
+
+.. class:: infomark
+
+**What it does**
+
+The program takes as input a set of categories, such that each category contains many elements. It also takes a table relating elements with criteria, such that each element is assigned a number representing the number of times the element satisfies a certain criterion. 
+
+- The first input is a TABULAR format file, such that the left column represents the names of categories and all other columns represent the names of elements in each category.
+- The second input is a TABULAR format file relating elements with criteria, such that the first line represents the names of criteria and the left column represents the names of elements.
+- The output is a TABULAR format file relating categories with criteria, such that each category is assigned a number representing the total number of times its elements satisfy a certain criterion. Each category is assigned as many numbers as criteria.
+
+
+**Example**
+
+Let the first input file be a group of motif categories as follows::
+
+	Deletion_Hotspots		deletionHoptspot1		deletionHoptspot2		deletionHoptspot3	
+	Dna_Pol_Pause_Frameshift	dnaPolPauseFrameshift1		dnaPolPauseFrameshift2		dnaPolPauseFrameshift3		dnaPolPauseFrameshift4
+	Indel_Hotspots			indelHotspot1			
+	Insertion_Hotspots		insertionHotspot1		insertionHotspot2		
+	Topoisomerase_Cleavage_Sites	topoisomeraseCleavageSite1	topoisomeraseCleavageSite2	topoisomeraseCleavageSite3	
+
+
+And let the second input file represent the number of times each motif occurs in a certain window size of indel flanking regions, as follows::
+
+					10bp	20bp	40bp	
+	deletionHoptspot1		1	1	2
+	deletionHoptspot2		1	1	1
+	deletionHoptspot3		0	0	0
+	dnaPolPauseFrameshift1		1	1	1
+	dnaPolPauseFrameshift2		0	2	1
+	dnaPolPauseFrameshift3		0	0	0
+	dnaPolPauseFrameshift4		0	1	2
+	indelHotspot1			0	0	0
+	insertionHotspot1		0	0	1
+	insertionHotspot2		1	1	1
+	topoisomeraseCleavageSite1	1	1	1
+	topoisomeraseCleavageSite2	1	2	1
+	topoisomeraseCleavageSite3	0	0	2
+
+Running the program will give the total number of times the motifs of each category occur in every window size of indel flanking regions::
+
+					10bp	20bp	40bp
+	Deletion_Hotspots		2	2	3
+	Dna_Pol_Pause_Frameshift	1	4	4
+	Indel_Hotspots			0	0	0
+	Insertion_Hotspots		1	1	2
+	Topoisomerase_Cleavage_Sites	2	3	4
+
+    </help> 
+    
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/categories.tabular	Tue Aug 20 09:22:17 2013 -0400
@@ -0,0 +1,5 @@
+Deletion_Hotspots	deletionHoptspot1	deletionHoptspot2	deletionHoptspot3
+Dna_Pol_Pause_Frameshift	dnaPolPauseFrameshift1	dnaPolPauseFrameshift2	dnaPolPauseFrameshift3	dnaPolPauseFrameshift4
+Indel_Hotspots	indelHotspot1
+Insertion_Hotspots	insertionHotspot1	insertionHotspot2
+Topoisomerase_Cleavage_Sites	topoisomeraseCleavageSite1	topoisomeraseCleavageSite2	topoisomeraseCleavageSite3
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/categorized_elements.tabular	Tue Aug 20 09:22:17 2013 -0400
@@ -0,0 +1,6 @@
+	10bp	20bp	40bp
+Deletion_Hotspots	2	2	3
+Dna_Pol_Pause_Frameshift	1	4	4
+Indel_Hotspots	0	0	0
+Insertion_Hotspots	1	1	2
+Topoisomerase_Cleavage_Sites	2	3	4
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/criteria_elements_data.tabular	Tue Aug 20 09:22:17 2013 -0400
@@ -0,0 +1,14 @@
+10bp	20bp	40bp
+deletionHoptspot1	1	1	2
+deletionHoptspot2	1	1	1
+deletionHoptspot3	0	0	0
+dnaPolPauseFrameshift1	1	1	1
+dnaPolPauseFrameshift2	0	2	1
+dnaPolPauseFrameshift3	0	0	0
+dnaPolPauseFrameshift4	0	1	2
+indelHotspot1	0	0	0
+insertionHotspot1	0	0	1
+insertionHotspot2	1	1	1
+topoisomeraseCleavageSite1	1	1	1
+topoisomeraseCleavageSite2	1	2	1
+topoisomeraseCleavageSite3	0	0	2