comparison data.py @ 0:d04fa5201f51 draft

planemo upload for repository https://github.com/bgruening/galaxytools/tree/rna_commander/tools/rna_tools/rna_commender commit 7ad344d108076116e702e1c1e91cea73d8fcadc4
author rnateam
date Thu, 28 Jul 2016 05:56:54 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:d04fa5201f51
1 """Dataset handler."""
2
3 import numpy as np
4
5 import pandas as pd
6
7 __author__ = "Gianluca Corrado"
8 __copyright__ = "Copyright 2016, Gianluca Corrado"
9 __license__ = "MIT"
10 __maintainer__ = "Gianluca Corrado"
11 __email__ = "gianluca.corrado@unitn.it"
12 __status__ = "Production"
13
14
class Dataset(object):
    """General dataset holding protein and RNA feature tables."""

    def __init__(self, fp, fr, standardize_proteins=False,
                 standardize_rnas=False):
        """
        Constructor.

        Parameters
        ----------
        fp : object exposing ``astype``
            Protein features, already in memory (presumably a pandas
            DataFrame -- TODO confirm against callers). A float32 copy is
            kept in ``self.Fp``.

        fr : str
            The name of the HDF5 file containing features for the RNAs.
            Its ``features`` node is read and kept, as float32, in
            ``self.Fr``.

        standardize_proteins : bool (default : False)
            Accepted for interface compatibility; not used by this
            constructor.

        standardize_rnas : bool (default : False)
            Accepted for interface compatibility; not used by this
            constructor.
        """
        self.Fp = fp.astype('float32')

        store = pd.io.pytables.HDFStore(fr)
        try:
            self.Fr = store.features.astype('float32')
        finally:
            # Release the file handle even when the 'features' node is
            # missing or cannot be converted.
            store.close()

    def load(self):
        """Load dataset in memory (must be implemented by subclasses)."""
        raise NotImplementedError()
40
41
class PredictDataset(Dataset):
    """Test dataset: every protein paired with every RNA."""

    def __init__(self, fp, fr):
        """
        Constructor.

        Parameters
        ----------
        fp : object exposing ``astype``
            Protein features, already in memory (presumably a pandas
            DataFrame with one column per protein -- TODO confirm against
            callers). Forwarded unchanged to the base constructor.

        fr : str
            The name of the HDF5 file containing features for the RNAs.
        """
        super(PredictDataset, self).__init__(fp, fr)

    def load(self):
        """
        Load dataset in memory.

        Builds one example for each (protein, RNA) pair of columns of
        ``self.Fp`` and ``self.Fr``. Examples are ordered protein-major:
        all RNAs for the first protein, then all RNAs for the second, etc.

        Return
        ------
        Examples to predict, as a tuple (p, p_names, r, r_names). For each
        example:
            - p contains the protein features (float32 row),
            - r contains the RNA features (float32 row),
            - p_names contains the name of the protein,
            - r_names contains the name of the RNA.

        """
        num_proteins = self.Fp.shape[1]
        num_rnas = self.Fr.shape[1]

        # Features are stored one column per protein/RNA; transpose so that
        # each row is the feature vector of one protein/RNA.
        protein_rows = np.asarray(self.Fp.values, dtype='float32').T
        rna_rows = np.asarray(self.Fr.values, dtype='float32').T

        # Cross product without a Python-level loop: each protein row is
        # repeated once per RNA, and the block of RNA rows is tiled once
        # per protein. Selection is positional, so duplicated column labels
        # cannot turn a single column lookup into a multi-column frame.
        p = np.ascontiguousarray(np.repeat(protein_rows, num_rnas, axis=0))
        r = np.ascontiguousarray(np.tile(rna_rows, (num_proteins, 1)))

        p_names = [protein for protein in self.Fp.columns
                   for _ in range(num_rnas)]
        r_names = list(self.Fr.columns) * num_proteins

        return (p, np.array(p_names), r, np.array(r_names))