stacking_ensemble_models: model_prediction.py comparison

comparison model_prediction.py @ 2:38c4f8a98038 draft

"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit 5b2ac730ec6d3b762faa9034eddd19ad1b347476"

author	bgruening
date	Mon, 16 Dec 2019 10:07:37 +0000
parents	c1b0c8232816
children	0a1812986bc3

comparison

equal deleted inserted replaced

-:c1b0c8232816
+:38c4f8a98038
 options['blacklist_regions'] = None
 pred_data_generator = klass(
 ref_genome_path=ref_seq, vcf_path=vcf_path, **options)
-pred_data_generator.fit()
+pred_data_generator.set_processing_attrs()
-preds = estimator.model_.predict_generator(
+variants = pred_data_generator.variants
-pred_data_generator.flow(batch_size=32),
-workers=N_JOBS,
+# predict 1600 samples at once, then write to file
-use_multiprocessing=True)
+gen_flow = pred_data_generator.flow(batch_size=1600)
-if preds.min() < 0. or preds.max() > 1.:
+file_writer = open(outfile_predict, 'w')
-warnings.warn('Network returning invalid probability values. '
+header_row = '\t'.join(['chrom', 'pos', 'name', 'ref',
-'The last layer might not normalize predictions '
+'alt', 'strand'])
-'into probabilities '
+file_writer.write(header_row)
-'(like softmax or sigmoid would).')
+header_done = False
-if params['method'] == 'predict_proba' and preds.shape[1] == 1:
+steps_done = 0
-# first column is probability of class 0 and second is of class 1
-preds = np.hstack([1 - preds, preds])
+# TODO: multithreading
+try:
-elif params['method'] == 'predict':
+while steps_done < len(gen_flow):
-if preds.shape[-1] > 1:
+index_array = next(gen_flow.index_generator)
-# if the last activation is `softmax`, the sum of all
+batch_X = gen_flow._get_batches_of_transformed_samples(
-# probabilities will be 1, the classification is considered as
+index_array)
-# multi-class problem, otherwise, we take it as multi-label.
-act = getattr(estimator.model_.layers[-1], 'activation', None)
+if params['method'] == 'predict':
-if act and act.__name__ == 'softmax':
+batch_preds = estimator.predict(
-classes = preds.argmax(axis=-1)
+batch_X,
+# The presence of `pred_data_generator` below is to
+# override model carrying data_generator if there
+# is any.
+data_generator=pred_data_generator)
 else:
-preds = (preds > 0.5).astype('int32')
+batch_preds = estimator.predict_proba(
-else:
+batch_X,
-classes = (preds > 0.5).astype('int32')
+# The presence of `pred_data_generator` below is to
+# override model carrying data_generator if there
-preds = estimator.classes_[classes]
+# is any.
+data_generator=pred_data_generator)
+if batch_preds.ndim == 1:
+batch_preds = batch_preds[:, np.newaxis]
+batch_meta = variants[index_array]
+batch_out = np.column_stack([batch_meta, batch_preds])
+if not header_done:
+heads = np.arange(batch_preds.shape[-1]).astype(str)
+heads_str = '\t'.join(heads)
+file_writer.write("\t%s\n" % heads_str)
+header_done = True
+for row in batch_out:
+row_str = '\t'.join(row)
+file_writer.write("%s\n" % row_str)
+steps_done += 1
+finally:
+file_writer.close()
+# TODO: make api `pred_data_generator.close()`
+pred_data_generator.close()
+return 0
 # end input
 # output
-if input_type == 'variant_effect':   # TODO: save in batches
+if len(preds.shape) == 1:
-rval = pd.DataFrame(preds)
-meta = pd.DataFrame(
-pred_data_generator.variants,
-columns=['chrom', 'pos', 'name', 'ref', 'alt', 'strand'])
-rval = pd.concat([meta, rval], axis=1)
-elif len(preds.shape) == 1:
 rval = pd.DataFrame(preds, columns=['Predicted'])
 else:
 rval = pd.DataFrame(preds)
-rval.to_csv(outfile_predict, sep='\t',
+rval.to_csv(outfile_predict, sep='\t', header=True, index=False)
-header=True, index=False)
 if __name__ == '__main__':
 aparser = argparse.ArgumentParser()
 aparser.add_argument("-i", "--inputs", dest="inputs", required=True)

Mercurial > repos > bgruening > stacking_ensemble_models

comparison model_prediction.py @ 2:38c4f8a98038 draft