Mercurial > repos > bgruening > rnacommender
diff data.py @ 0:768beb05387d draft default tip
planemo upload for repository https://github.com/bgruening/galaxytools/tree/rna_commander/tools/rna_tools/rna_commender commit cc090387231a51b44f84298cd3e149fc6643abb0
author | bgruening |
---|---|
date | Tue, 31 May 2016 04:29:57 -0400 |
parents | |
children |
line wrap: on
line diff
# Reconstructed from the unified diff of the newly added file b/data.py
# (--- /dev/null, +++ b/data.py, hunk @@ -0,0 +1,90 @@).
"""Dataset handler."""

import numpy as np

import pandas as pd

from theano import config

__author__ = "Gianluca Corrado"
__copyright__ = "Copyright 2016, Gianluca Corrado"
__license__ = "MIT"
__maintainer__ = "Gianluca Corrado"
__email__ = "gianluca.corrado@unitn.it"
__status__ = "Production"


class Dataset(object):
    """General dataset.

    Stores the protein features (``self.Fp``) and the RNA features
    (``self.Fr``), both cast to ``theano.config.floatX``.
    """

    def __init__(self, fp, fr, standardize_proteins=False,
                 standardize_rnas=False):
        """
        Constructor.

        Parameters
        ----------
        fp : table-like object
            Protein features, already loaded in memory. It must provide
            ``astype``; ``PredictDataset.load`` additionally reads its
            ``shape`` and ``columns`` (presumably a pandas DataFrame with
            one column per protein -- confirm against the caller).

        fr : str
            The name of the HDF5 file containing features for the RNAs.
            The features are read from the ``features`` entry of the store.

        standardize_proteins : bool
            Accepted for interface compatibility; not used by this class.

        standardize_rnas : bool
            Accepted for interface compatibility; not used by this class.
        """
        self.Fp = fp.astype(config.floatX)

        store = pd.io.pytables.HDFStore(fr)
        try:
            self.Fr = store.features.astype(config.floatX)
        finally:
            # Release the file handle even when the read or the cast fails.
            store.close()

    def load(self):
        """Load dataset in memory (must be implemented by subclasses)."""
        raise NotImplementedError()


class PredictDataset(Dataset):
    """Test dataset: every protein paired with every RNA."""

    def __init__(self, fp, fr):
        """
        Constructor.

        Parameters
        ----------
        fp : table-like object
            Protein features, already loaded in memory (forwarded unchanged
            to ``Dataset.__init__``, which calls ``fp.astype``).

        fr : str
            The name of the HDF5 file containing features for the RNAs.
        """
        super(PredictDataset, self).__init__(fp, fr)

    def load(self):
        """
        Load dataset in memory.

        One example is produced for each (protein, RNA) pair of columns.
        Pairs are ordered protein-major: all RNAs for the first protein,
        then all RNAs for the second protein, and so on.

        Return
        ------
        Examples to predict, as a tuple ``(p, p_names, r, r_names)``.
        For each example:
            - p contains the protein features,
            - r contains the RNA features,
            - p_names contains the name of the protein,
            - r_names contains the name of the RNA.

        """
        num_proteins = self.Fp.shape[1]
        num_rnas = self.Fr.shape[1]

        # Row index of the protein / RNA used by each example, in the same
        # protein-major order a nested "for protein: for rna:" loop yields.
        protein_idx = np.repeat(np.arange(num_proteins), num_rnas)
        rna_idx = np.tile(np.arange(num_rnas), num_proteins)

        # Features are stored one column per entity; transpose so that each
        # row is the feature vector of one protein / RNA, then gather rows.
        protein_rows = np.asarray(self.Fp).T
        rna_rows = np.asarray(self.Fr).T
        p = protein_rows[protein_idx].astype(config.floatX, copy=False)
        r = rna_rows[rna_idx].astype(config.floatX, copy=False)

        p_names = np.array(list(self.Fp.columns))[protein_idx]
        r_names = np.array(list(self.Fr.columns))[rna_idx]

        return (p, p_names, r, r_names)