diff seg2matrix/CGData/CsegToMatrix.cc @ 31:ab20c0d04f4a

add seg2matrix tool
author jingchunzhu
date Fri, 24 Jul 2015 13:10:11 -0700
parents
children 59dbe857f5d4
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/seg2matrix/CGData/CsegToMatrix.cc	Fri Jul 24 13:10:11 2015 -0700
@@ -0,0 +1,355 @@
+
+#include <map>
+#include <iostream>
+#include <list>
+#include <set>
+#include <vector>
+#include <sstream>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+using namespace std;
+
+#define MISSING_VAL -99
+
+class segment {
+	public:
+	string target;
+	string chrome;
+	int start, end;
+	float value;
+};
+
+typedef map<string,list<segment> > chromemap;
+typedef map< string, chromemap > segmap;
+typedef map<string,set<int> > breakmap;
+typedef vector<vector<float> > genemap;
+
+
+segmap * new_segment() {
+	return new segmap();
+}
+
+set<string> * new_target_set() {
+	return new set<string>();
+}
+
+void add_segment(segmap * seg, set<string> *targetSet, char *sample, char *chrom, int chrom_start, int chrom_end, float value) {
+	string tmp;
+	segment a;
+	
+	a.target = strdup(sample);
+	a.chrome = strdup(chrom);
+	a.start = chrom_start;
+	a.end = chrom_end;
+	a.value = value;
+		
+	if ( a.target.size() > 0 ) {
+		if ( a.chrome.compare("X") == 0 )
+			a.chrome = "23";
+		if ( a.chrome.compare("Y") == 0 )
+			a.chrome = "24";
+		
+		if ( a.chrome.compare("23") == 0 )
+			a.chrome = "chrX";
+		else if ( a.chrome.compare("24") == 0 )
+			a.chrome = "chrY";
+		else if ( a.chrome.find("chr") != 0 )
+			a.chrome = string("chr") + a.chrome;
+		
+		if(seg->find(a.target) == seg->end()) {
+			map<string, list<segment> > l;
+			(*seg)[ a.target ] = l;
+		} 
+		if ( (*seg)[ a.target ].find( a.chrome ) == (*seg)[ a.target ].end() ) {
+			list<segment> l;
+			(*seg)[ a.target ][ a.chrome ] = l;
+		}
+		(*seg)[ a.target ][a.chrome].push_back( a );
+		targetSet->insert( a.target );
+	}	
+}
+
+
+
+void print_matrix(segmap *data, set<string> *targetSet, void (*print)(const char *)) {
+	
+	//create a break map
+	breakmap breaks;
+	for( segmap::iterator t = data->begin(); t!=data->end(); ++t) {
+		for ( chromemap::iterator c = t->second.begin(); c != t->second.end(); ++c) {
+			for ( list<segment>::iterator s = c->second.begin(); s != c->second.end(); s++ ) {
+				if ( breaks.find( s->chrome ) == breaks.end() ) {
+					set<int> ns;
+					ns.insert( s->start );
+					ns.insert( s->end );
+					breaks[ s->chrome ] = ns;
+				} else {
+					breaks[ s->chrome ].insert( s->start );
+					breaks[ s->chrome ].insert( s->end );
+				}
+			}
+		}	
+	}
+	
+	//print out the matrix column names
+	(*print)( "probe" );
+	for ( set<string>::iterator ts = targetSet->begin(); ts != targetSet->end(); ++ts ) {
+		(*print)( "\t" );
+		(*print)( (*ts).c_str() );
+	}
+	(*print)( "\n" );
+	int targetCount = targetSet->size();
+	//check breakmap for each chrome
+	for ( breakmap::iterator b = breaks.begin(); b != breaks.end(); ++b ) {
+		vector<int> starts;
+		vector<int> ends;
+		vector<string> probeName;
+		
+		breakmap::iterator cset = breaks.find( b->first );
+		int blockCount = cset->second.size() - 1;
+		starts.resize( blockCount );
+		ends.resize( blockCount );
+		probeName.resize( blockCount );
+		int i = 0;
+		for ( set<int>::iterator v = cset->second.begin(); v != cset->second.end(); ++v ) {
+			if ( i > 0 ) {
+				ends[i-1] = (*v);
+			}
+			if ( i < blockCount ) {
+				starts[ i ] = (*v);
+			}
+			i++;
+		}
+		
+		//create names for the breakpoints
+		genemap gm;
+		gm.resize( blockCount );
+		for ( int i = 0; i < blockCount; i++ ) {
+			stringstream name;
+			name << b->first;
+			name << "_";
+			name << starts[i];
+			name << "_";
+			name << ends[i];
+			probeName[i] = name.str();
+			gm[ i ].resize( targetCount ); 
+		}
+		//scan the targets		
+		int curTarget = 0;
+		for ( set<string>::iterator t = targetSet->begin(); t != targetSet->end(); ++t ) {
+			//find the target values for the current chrome
+			segmap::iterator s = data->find( (*t) );
+			chromemap::iterator ss = s->second.find( b->first );
+			if ( ss != s->second.end() ) {
+				vector<float> vals;
+				vals.resize( blockCount );
+				for ( int i = 0; i < blockCount; i++ ) {
+					vals[ i ] = MISSING_VAL;
+				}
+				//if the segment overlaps the break, assign the value
+				for ( list<segment>::iterator cs = ss->second.begin(); cs != ss->second.end(); cs++ ) {
+					for ( int i = 0; i < blockCount; i++ ) {
+						if ( cs->end > starts[i] && cs->start < ends[i] ) {
+							vals[i] = cs->value;
+						}
+					}
+				}
+				//assign the values to the named map
+				for ( int i = 0; i < blockCount; i++ ) {
+					gm[ i ][ curTarget ] = vals[ i ];
+				}				
+				//cout << s->first << "\t" << ss->first << "\n";
+			}
+			curTarget++;
+		}
+		//print out this chrome's segments
+		//and the values for each target
+		for ( int i =0; i < blockCount; i++ ) {
+			print(probeName[i].c_str());
+			for ( int j = 0; j < targetCount; j++ ) {
+				float val = gm[i][j];
+				print("\t");
+				if ( val == MISSING_VAL )
+					print("NA");
+				else {							
+					char str[20]  = "";
+					sprintf(str, "%f", val);
+					print(str);
+				}
+			}
+			print("\n");
+		}
+	}
+	
+}
+
+
+/*
+* This program read's the stdin looking for a segment format of 
+* <targetname> <chrome_number> <chrome_start> <chrome_end>
+* breaks the segments into discreate probes (so segments on different targets
+* have the same probe name) and creates a matrix of probe values for each target
+*/
+
+
+int main(int argc, char** argv) {
+	
+	segmap data;
+	set<string> targetSet;
+	
+	//load in segment map
+	do { 
+		string tmp;
+		segment a;
+		cin >> a.target;
+		cin >> a.chrome;
+		cin >> tmp;
+		a.start = atoi( tmp.c_str() );
+		cin >> tmp;
+		a.end = atoi( tmp.c_str() );
+		cin >> tmp;
+		a.value = atof( tmp.c_str() );
+		if ( a.target.size() > 0 ) {
+			if ( a.chrome.compare("X") == 0 )
+				a.chrome = "23";
+			if ( a.chrome.compare("Y") == 0 )
+				a.chrome = "24";
+			
+			if ( a.chrome.compare("23") == 0 )
+				a.chrome = "chrX";
+			else if ( a.chrome.compare("24") == 0 )
+				a.chrome = "chrY";
+	        else
+				a.chrome = string("chr") + a.chrome;
+			
+			if(data.find(a.target) == data.end()) {
+				map<string, list<segment> > l;
+				data[ a.target ] = l;
+			} 
+			if ( data[ a.target ].find( a.chrome ) == data[ a.target ].end() ) {
+				list<segment> l;
+				data[ a.target ][ a.chrome ] = l;
+			}
+			data[ a.target ][a.chrome].push_back( a );
+			targetSet.insert( a.target );
+		}
+	} while ( cin );	
+	
+	//create a break map
+	breakmap breaks;
+	for( segmap::iterator t = data.begin(); t!=data.end(); ++t) {
+		for ( chromemap::iterator c = t->second.begin(); c != t->second.end(); ++c) {
+			for ( list<segment>::iterator s = c->second.begin(); s != c->second.end(); s++ ) {
+				if ( breaks.find( s->chrome ) == breaks.end() ) {
+					set<int> ns;
+					ns.insert( s->start );
+					ns.insert( s->end );
+					breaks[ s->chrome ] = ns;
+				} else {
+					breaks[ s->chrome ].insert( s->start );
+					breaks[ s->chrome ].insert( s->end );
+				}
+			}
+		}	
+	}
+	
+	//print out the matrix column names
+	cout << "probe";
+	for ( set<string>::iterator ts = targetSet.begin(); ts != targetSet.end(); ++ts ) {
+		cout << "\t";
+		cout << (*ts);
+	}
+	cout << "\n";
+	int targetCount = targetSet.size();
+	//check breakmap for each chrome
+	for ( breakmap::iterator b = breaks.begin(); b != breaks.end(); ++b ) {
+		cerr << b->first << "\n";
+		vector<int> starts;
+		vector<int> ends;
+		vector<string> probeName;
+		
+		breakmap::iterator cset = breaks.find( b->first );
+		int blockCount = cset->second.size() - 1;
+		starts.resize( blockCount );
+		ends.resize( blockCount );
+		probeName.resize( blockCount );
+		int i = 0;
+		for ( set<int>::iterator v = cset->second.begin(); v != cset->second.end(); ++v ) {
+			if ( i > 0 ) {
+				ends[i-1] = (*v);
+			}
+			if ( i < blockCount ) {
+				starts[ i ] = (*v);
+			}
+			i++;
+		}
+		
+		//create names for the breakpoints
+		genemap gm;
+		gm.resize( blockCount );
+		for ( int i = 0; i < blockCount; i++ ) {
+			stringstream name;
+			name << b->first;
+			name << "_";
+			name << starts[i];
+			name << "_";
+			name << ends[i];
+			probeName[i] = name.str();
+			gm[ i ].resize( targetCount ); 
+		}
+		//scan the targets		
+		int curTarget = 0;
+		for ( set<string>::iterator t = targetSet.begin(); t != targetSet.end(); ++t ) {
+			//find the target values for the current chrome
+			segmap::iterator s = data.find( (*t) );
+			chromemap::iterator ss = s->second.find( b->first );
+			if ( ss != s->second.end() ) {
+				vector<float> vals;
+				vals.resize( blockCount );
+				for ( int i = 0; i < blockCount; i++ ) {
+					vals[ i ] = MISSING_VAL;
+				}
+				//if the segment overlaps the break, assign the value
+				for ( list<segment>::iterator cs = ss->second.begin(); cs != ss->second.end(); cs++ ) {
+					for ( int i = 0; i < blockCount; i++ ) {
+						if ( cs->end > starts[i] && cs->start < ends[i] ) {
+							vals[i] = cs->value;
+						}
+					}
+				}
+				//assign the values to the named map
+				for ( int i = 0; i < blockCount; i++ ) {
+					gm[ i ][ curTarget ] = vals[ i ];
+				}				
+				//cout << s->first << "\t" << ss->first << "\n";
+			}
+			curTarget++;
+		}
+		//print out this chrome's segments
+		//and the values for each target
+		for ( int i =0; i < blockCount; i++ ) {
+			cout << probeName[i];
+			for ( int j = 0; j < targetCount; j++ ) {
+				float val = gm[i][j];
+				cout << "\t";
+				if ( val == MISSING_VAL )
+					cout << "null";
+				else
+					cout << val;
+			}
+			cout << "\n";
+		}
+	}
+	
+	return 0;
+}
+
+#ifdef __cplusplus
+}
+#endif