Source code for s3l.base

"""Base classes for all estimators and experiments."""

import re

import numpy as np

from abc import abstractmethod, ABC
from .utils.log_utils import get_logger
from .metrics import performance

LOGGER = get_logger("s3l.base")

########################
# BaseExperiments
########################


class BaseExperiments(object):
    """The base class for all experiments.

    You can inherit this class to design your own experiment process.
    """

    def __init__(self, transductive=True, n_jobs=1, all_class=True):
        self.datasets = []
        self.splits = []
        self.configs = []
        self.evaluate_metric_name = []
        self.evaluate_metric = []
        self.evaluate_metri_param = []
        # evaluate_results[data_name][config_name][evaluate_name] -> [0.]
        self.evaluate_results = dict()
        self.performance_metric = getattr(performance, "accuracy_score")
        self.metri_param = {}
        self.metri_large_better = True
        self.transductive = transductive
        self.n_jobs = n_jobs
        self.all_class = all_class
    def append_configs(self, configs):
        """Append estimator configs to self.configs.

        Parameters
        ----------
        configs: list of (name, estimator, param_dict)
            In which name: string, estimator: object of estimator,
            param_dict: dict of parameters for the corresponding estimator.
        """
        for name, estimator, param_dict in configs:
            self._append_config(name, estimator, param_dict)
    def _append_config(self, name, estimator, param_dict):
        """Append an estimator config to self.configs.

        self.configs is a list of tuples (name, estimator, param_dict).

        Parameters
        ----------
        name: string
        estimator: object of estimator
        param_dict: dict of parameters for the estimator
        """
        self.configs.append((name, estimator, param_dict))
    def append_datasets(self, datasets):
        """Append dataset file names to self.datasets.

        Parameters
        ----------
        datasets: list of (name, feature_file, label_file, split_path, graph_file)
            Details::

                name: string
                    Name of the dataset. Arbitrary.
                feature_file: string or None
                    Absolute file name of the feature file. Can be any thing if
                label_file: string or None
                    Absolute file name of the label file.
                split_path: string or None
                    Absolute path in which the split files are stored.
                    Should be None if no split files are provided.
                graph_file: string or None
                    Absolute file name of the graph file.
                    Should be None if no graph is provided.
        """
        for name, feature_file, label_file, split_path, graph_file in datasets:
            self._append_dataset(name, feature_file, label_file, split_path,
                                 graph_file)
    def _append_dataset(self, name, feature_file, label_file, split_path,
                        graph_file):
        """Append dataset file names to self.datasets.

        self.datasets is a list of tuples
        (name, feature_file, label_file, split_path, graph_file).

        Parameters
        ----------
        name: string
            Name of the dataset.
        feature_file: string
            Absolute file name of the feature file.
        label_file: string
            Absolute file name of the label file.
        split_path: string
            Absolute path in which the split files are stored.
            Should be None if no split files are provided.
        graph_file: string
            Absolute file name of the graph file.
            Should be None if no graph is provided.
        """
        self.datasets.append(
            (name, feature_file, label_file, split_path, graph_file)
        )
    def set_metric(self, performance_metric='accuracy_score',
                   metric_large_better=True, param_dict=None):
        """Set the performance metric for the experiment.

        Parameters
        ----------
        performance_metric: str
            The performance-metric function. Give a str to use a
            pre-defined performance metric.
        metric_large_better: bool, optional (default=True)
            Whether a larger value of the metric indicates better performance.
        param_dict: dict, optional
            The args used in the performance metric. If None, the
            pre-defined metric is initialized in the default way.
            Note that each parameter should be static.
        """
        if performance_metric not in [
                'accuracy_score', 'zero_one_loss', 'roc_auc_score',
                'get_fps_tps_thresholds', 'hamming_loss', 'one_error',
                'f1_score', 'coverage_error', 'label_ranking_loss',
                'label_ranking_average_precision_score', 'micro_auc_score',
                'average_precision_score', 'minus_mean_square_error']:
            raise NotImplementedError('Performance {} is not'
                                      ' implemented.'.format(str(performance_metric)))
        # Need to modify
        if param_dict is None:
            param_dict = {}
        self.performance_metric_name = performance_metric
        self.performance_metric = getattr(performance, performance_metric)
        self.metri_param = param_dict
        self.metri_large_better = metric_large_better
    def append_evaluate_metric(self, performance_metric='accuracy_score',
                               kwargs=None):
        """Append a metric for evaluation.

        Parameters
        ----------
        performance_metric: str
            The performance-metric function. Give a str to use a
            pre-defined performance metric.
        kwargs: dict, optional
            The args used in the performance metric. If kwargs is None, the
            pre-defined metric is initialized in the default way.
            Note that each parameter should be static.
        """
        if performance_metric not in [
                'accuracy_score', 'zero_one_loss', 'roc_auc_score',
                'get_fps_tps_thresholds', 'hamming_loss', 'one_error',
                'f1_score', 'coverage_error', 'label_ranking_loss',
                'label_ranking_average_precision_score', 'micro_auc_score',
                'average_precision_score', 'minus_mean_square_error']:
            raise NotImplementedError('Performance {} is not'
                                      ' implemented.'.format(str(performance_metric)))
        # Need to modify
        if kwargs is None:
            kwargs = {}
        self.evaluate_metric_name.append(performance_metric)
        self.evaluate_metric.append(getattr(performance, performance_metric))
        self.evaluate_metri_param.append(kwargs)
    def _evaluate_selected_model(self, data_name, preds, y_truth,
                                 unlabeled_idxs):
        """Evaluate the predictions with the metrics configured for this
        experiment in `evaluate_metric`.
        """
        if len(self.evaluate_metric_name) == 0:
            return
        assert len(self.configs) > 0
        assert len(preds[self.configs[0][0]]) == len(unlabeled_idxs)

        self.evaluate_results[data_name] = dict()
        for config_name, _, _ in self.configs:
            self.evaluate_results[data_name][config_name] = dict()
            for name, metric, param in zip(self.evaluate_metric_name,
                                           self.evaluate_metric,
                                           self.evaluate_metri_param):
                self.evaluate_results[data_name][config_name][name] = []
                for i in range(len(preds[config_name])):
                    # `param` holds the kwargs configured for this metric in
                    # append_evaluate_metric.
                    self.evaluate_results[data_name][config_name][name].append(
                        metric(y_truth[unlabeled_idxs[i]],
                               preds[config_name][i], **param)
                    )
    def get_evaluation_results(self):
        """Return the evaluation results recorded by
        `_evaluate_selected_model`.
        """
        return self.evaluate_results
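
# Illustrative usage sketch for BaseExperiments; it is not part of the
# original module. The estimator placeholder, file paths, and metric names
# below are hypothetical, and the sketch assumes the whitelisted metric names
# exist in s3l.metrics.performance.
def _example_configure_experiment():
    exp = BaseExperiments(transductive=True, n_jobs=1, all_class=True)
    # (name, estimator, param_dict) triples; `None` stands in for a real
    # estimator object only to keep the sketch self-contained.
    exp.append_configs([('dummy_estimator', None, {})])
    # (name, feature_file, label_file, split_path, graph_file) tuples.
    exp.append_datasets([('toy', '/abs/path/x.npy', '/abs/path/y.npy',
                          None, None)])
    exp.set_metric('accuracy_score', metric_large_better=True)
    exp.append_evaluate_metric('f1_score')
    return exp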
#########################
# BaseEstimator
#########################


def _pprint(params, offset=0, printer=repr):
    """Pretty print the dictionary 'params'.

    Parameters
    ----------
    params : dict
        The dictionary to pretty print.
    offset : int
        The offset in characters to add at the beginning of each line.
    printer : callable
        The function to convert entries to strings, typically the builtin
        str or repr.
    """
    # Do a multi-line justified repr:
    options = np.get_printoptions()
    np.set_printoptions(precision=5, threshold=64, edgeitems=2)
    params_list = list()
    this_line_length = offset
    line_sep = ',\n' + (1 + offset // 2) * ' '
    # for i, (k, v) in enumerate(sorted(six.iteritems(params))):
    for i, (k, v) in enumerate(params.items()):
        if type(v) is float:
            # use str for representing floating point numbers
            # this way we get consistent representation across
            # architectures and versions.
            this_repr = '%s=%s' % (k, str(v))
        else:
            # use repr of the rest
            this_repr = '%s=%s' % (k, printer(v))
        if len(this_repr) > 500:
            this_repr = this_repr[:300] + '...' + this_repr[-100:]
        if i > 0:
            if (this_line_length + len(this_repr) >= 75 or '\n' in this_repr):
                params_list.append(line_sep)
                this_line_length = len(line_sep)
            else:
                params_list.append(', ')
                this_line_length += 2
        params_list.append(this_repr)
        this_line_length += len(this_repr)
    np.set_printoptions(**options)

    lines = ''.join(params_list)
    # Strip trailing space to avoid nightmare in doctests
    lines = '\n'.join(l.rstrip(' ') for l in lines.split('\n'))
    return lines
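
# A minimal sketch of what _pprint produces, added for illustration; the
# parameter dict and its keys are hypothetical.
def _example_pprint_output():
    # Returns the single-line string "C=1.0, kernel='rbf'"; dicts whose
    # representation exceeds roughly 75 characters are wrapped across lines.
    return _pprint({'C': 1.0, 'kernel': 'rbf'}, offset=len('SVC'))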
class BaseEstimator(ABC):
    """Base class for all estimators in s3l.

    Notes
    -----
    All estimators should specify all the parameters that can be set
    at the class level in their ``__init__`` as explicit keyword
    arguments (no ``*args`` or ``**kwargs``).
    """

    def __init__(self):
        pass
    @abstractmethod
    def set_params(self, param):
        """Update the parameters of the estimator and release old results
        to prepare for new training.
        """
        raise NotImplementedError()
    def _get_attr_names(self) -> list:
        """Get the attribute names of the class.

        Returns
        -------
        Sorted list of attribute names.
        """
        return sorted([a for a, v in self.__dict__.items()
                       if not re.match('<function.*?>', str(v))
                       and not (a.startswith('__') and a.endswith('__'))])
    def get_params(self, deep=True):
        """Get parameters for this estimator.

        Parameters
        ----------
        deep : boolean, optional
            If True, will return the parameters for this estimator and
            contained subobjects that are estimators.

        Returns
        -------
        params : mapping of string to any
            Parameter names mapped to their values.
        """
        out = dict()
        for key in self._get_attr_names():
            value = getattr(self, key, None)
            if deep and hasattr(value, 'get_params'):
                # get parameters of contained subobjects
                deep_items = value.get_params().items()
                out.update((key + '__' + k, val) for k, val in deep_items)
            out[key] = value
        return out
    def __repr__(self):
        # TODO: needs a test.
        class_name = self.__class__.__name__
        return '%s(%s)' % (class_name,
                           _pprint(self.get_params(deep=False),
                                   offset=len(class_name)))
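
# A minimal concrete subclass, added only to illustrate how get_params() and
# __repr__ pick up whatever attributes __init__ stores; the class name and
# its parameters are hypothetical.
class _ExampleEstimator(BaseEstimator):
    def __init__(self, alpha=0.5, max_iter=100):
        super(_ExampleEstimator, self).__init__()
        self.alpha = alpha
        self.max_iter = max_iter

    def set_params(self, param):
        # Update attributes from a plain dict, mirroring the abstract contract.
        for key, value in param.items():
            setattr(self, key, value)

# _ExampleEstimator().get_params() -> {'alpha': 0.5, 'max_iter': 100}
# repr(_ExampleEstimator())        -> '_ExampleEstimator(alpha=0.5, max_iter=100)'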
class TransductiveEstimatorwithGraph(BaseEstimator):

    @abstractmethod
    def __init__(self):
        super(TransductiveEstimatorwithGraph, self).__init__()

    @abstractmethod
    def fit(self, X, y, l_ind, W, **kwargs):
        """Takes X, y, label_index, affinity matrix."""
        raise NotImplementedError()

    @abstractmethod
    def predict(self, u_ind, **kwargs):
        """Takes unlabel_index."""
        raise NotImplementedError()
class TransductiveEstimatorWOGraph(BaseEstimator):

    @abstractmethod
    def __init__(self):
        super(TransductiveEstimatorWOGraph, self).__init__()

    @abstractmethod
    def fit(self, X, y, l_ind, **kwargs):
        """Takes X, y, label_index."""
        raise NotImplementedError()

    @abstractmethod
    def predict(self, u_ind, **kwargs):
        """Takes unlabel_index."""
        raise NotImplementedError()
class InductiveEstimatorWOGraph(BaseEstimator):

    @abstractmethod
    def __init__(self):
        super(InductiveEstimatorWOGraph, self).__init__()

    @abstractmethod
    def fit(self, X, y, l_ind, **kwargs):
        """Takes X, y, label_index."""
        raise NotImplementedError()

    @abstractmethod
    def predict(self, X, **kwargs):
        """Takes X."""
        raise NotImplementedError()
class InductiveEstimatorwithGraph(BaseEstimator):

    @abstractmethod
    def __init__(self):
        super(InductiveEstimatorwithGraph, self).__init__()

    @abstractmethod
    def fit(self, X, y, l_ind, W, **kwargs):
        """Takes X, y, label_index, affinity matrix."""
        raise NotImplementedError()

    @abstractmethod
    def predict(self, X, **kwargs):
        """Takes X."""
        raise NotImplementedError()
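
# The four interfaces above differ only in their prediction contract:
# transductive estimators are fit on the full X (labeled plus unlabeled rows)
# and predict by unlabeled index, while inductive estimators predict on
# arbitrary new feature matrices; the *withGraph variants additionally take an
# affinity matrix W in fit(). An illustrative calling sketch with hypothetical
# data and model names:
#
#     model.fit(X, y, l_ind, W)       # TransductiveEstimatorwithGraph
#     preds = model.predict(u_ind)    # predictions for the unlabeled rows
#
#     model.fit(X, y, l_ind)          # InductiveEstimatorWOGraph
#     preds = model.predict(X_test)   # predictions for unseen data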
class SupervisedEstimator(BaseEstimator):
    """Supervised estimator for single-label tasks."""

    @abstractmethod
    def __init__(self):
        super(SupervisedEstimator, self).__init__()
        self.model = None

    def fit(self, X, y, l_ind=None, **kwargs):
        """Takes X, y, label_index."""
        if l_ind is not None:
            X = X[l_ind, :]
            if y.ndim == 2:
                y = y[l_ind, :].reshape(-1)
            else:
                y = y[l_ind]
        self.model.fit(X, y)

    def predict(self, X, **kwargs):
        """Takes X."""
        return self.model.predict(X)

    def set_params(self, param):
        self.model.set_params(**param)

    def predict_proba(self, X):
        return self.model.predict_proba(X)

    def predict_log_proba(self, X):
        return self.model.predict_log_proba(X)
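
# A minimal concrete SupervisedEstimator, sketched under the assumption that
# scikit-learn is available; the wrapper class name and its default parameter
# are hypothetical and not part of the original module.
class _ExampleLogisticRegression(SupervisedEstimator):
    def __init__(self, C=1.0):
        super(_ExampleLogisticRegression, self).__init__()
        # Imported lazily so the sketch does not force a hard dependency.
        from sklearn.linear_model import LogisticRegression
        # self.model is the only attribute fit()/predict() above rely on.
        self.model = LogisticRegression(C=C)

# Usage sketch (hypothetical data):
#     clf = _ExampleLogisticRegression(C=0.5)
#     clf.fit(X, y, l_ind=labeled_idx)   # trains on the labeled rows only
#     y_pred = clf.predict(X)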
#########################
# BaseEnsemble
#########################
class SaferEnsemble(BaseEstimator):
    """Base class of SaferEnsemble for semi-supervised learning.

    Notes
    -----
    All estimators should specify all the parameters that can be set
    at the class level in their ``__init__`` as explicit keyword
    arguments (no ``*args`` or ``**kwargs``).
    """

    @abstractmethod
    def __init__(self):
        pass

    @abstractmethod
    def fit(self, X, y, l_ind, **kwargs):
        """Fit the model with base semi-supervised predictions."""
        raise NotImplementedError()

    @abstractmethod
    def predict(self, u_ind, baseline_pred=None):
        """Predict with a provided baseline prediction.

        A safer prediction can only be made for the given predictions,
        so the estimator is transductive.
        """
        raise NotImplementedError()
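
# SaferEnsemble subclasses refine predictions that are handed to them rather
# than predicting from scratch, hence the transductive
# predict(u_ind, baseline_pred) signature. An illustrative calling sketch with
# hypothetical names:
#
#     ens.fit(X, y, l_ind)                           # fit the base learners
#     safe_pred = ens.predict(u_ind, baseline_pred)  # refine a given baseline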
#########################
# Basemetric
#########################