Source code for protopipe.mva.train_model

import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from .utils import split_train_test

# Public API: TrainModel is the only name exported on a wildcard import.
__all__ = ["TrainModel"]


class TrainModel(object):
    """
    Train classification or regressor model.

    Parameters
    ----------
    case: str
        Possibilities are regressor or classifier. Any value other than
        ``"regressor"`` is handled as a classifier by `split_data`.
    feature_name_list: list
        List of features
    target_name: str, optional
        Regression target

    Attributes
    ----------
    data_train, data_test:
        Train/test samples stored by `split_data` (``None`` until it is
        called; ``data_test`` stays ``None`` when no test sample exists).
    data_scikit: dict
        Arrays handed to scikit-learn, with keys ``X_train``, ``X_test``,
        ``y_train``, ``y_test`` and ``w_train`` (``None`` until
        `split_data` is called).
    """

    def __init__(self, case, feature_name_list, target_name=None):
        self.case = case
        self.target_name = target_name
        self.feature_name_list = feature_name_list
        self.data_test = None
        self.data_train = None
        self.data_scikit = None

    @staticmethod
    def _uniform_weights(n_events):
        """Return `n_events` equal weights normalised to sum to one."""
        weights = np.ones(n_events)
        return weights / weights.sum()

    @staticmethod
    def _stack_samples(sig, bkg, max_events=None):
        """Concatenate the first `max_events` rows of signal and background.

        ``max_events=None`` keeps every row. ``pd.concat`` replaces the
        ``.append`` method that was removed in pandas 2.0.
        """
        return pd.concat([sig[0:max_events], bkg[0:max_events]])

    def _split(self, data, train_fraction):
        """Run `split_train_test` on `data` with this model's features/target."""
        return split_train_test(
            survived_images=data,
            train_fraction=train_fraction,
            feature_name_list=self.feature_name_list,
            target_name=self.target_name,
        )

    def split_data(
        self, data_sig, train_fraction, data_bkg=None, force_same_nsig_nbkg=False
    ):
        """
        Load and split data to build train/test samples.

        The result is stored on the instance: ``data_train``, ``data_test``
        and the ``data_scikit`` dictionary are (re)assigned; nothing is
        returned.

        Parameters
        ----------
        data_sig: `~pandas.DataFrame`
            Data frame
        train_fraction: float
            Fraction of events to build the training sample
        data_bkg: `~pandas.DataFrame`
            Data frame. Only used when ``case`` is not ``"regressor"``.
        force_same_nsig_nbkg: bool
            If true, the same number of signal and bkg events will be used
            to build a classifier
        """
        if self.case == "regressor":
            (
                X_train,
                X_test,
                y_train,
                y_test,
                self.data_train,
                self.data_test,
            ) = self._split(data_sig, train_fraction)
            weight_train = self._uniform_weights(len(self.data_train))
        else:
            (
                X_train_sig,
                X_test_sig,
                y_train_sig,
                y_test_sig,
                data_train_sig,
                data_test_sig,
            ) = self._split(data_sig, train_fraction)
            (
                X_train_bkg,
                X_test_bkg,
                y_train_bkg,
                y_test_bkg,
                data_train_bkg,
                data_test_bkg,
            ) = self._split(data_bkg, train_fraction)

            # None means "no truncation": slicing with [0:None] keeps all rows.
            max_events = None
            if force_same_nsig_nbkg:
                max_events = min(len(X_train_sig), len(X_train_bkg))

            X_train = self._stack_samples(X_train_sig, X_train_bkg, max_events)
            y_train = self._stack_samples(y_train_sig, y_train_bkg, max_events)
            self.data_train = self._stack_samples(
                data_train_sig, data_train_bkg, max_events
            )

            # NOTE(review): split_train_test presumably returns None for the
            # test pieces when no test sample is built (e.g. the whole sample
            # is used for training) -- confirm against utils.split_train_test.
            # The previous code detected this by matching a Python 2 TypeError
            # message, which never matches on Python 3.
            test_pieces = (
                X_test_sig,
                X_test_bkg,
                y_test_sig,
                y_test_bkg,
                data_test_sig,
                data_test_bkg,
            )
            if any(piece is None for piece in test_pieces):
                X_test = None
                y_test = None
                self.data_test = None
            else:
                if force_same_nsig_nbkg:
                    max_events = min(len(X_test_sig), len(X_test_bkg))
                X_test = self._stack_samples(X_test_sig, X_test_bkg, max_events)
                y_test = self._stack_samples(y_test_sig, y_test_bkg, max_events)
                self.data_test = self._stack_samples(
                    data_test_sig, data_test_bkg, max_events
                )

            weight_train = self._uniform_weights(len(X_train))

        has_test = X_test is not None
        self.data_scikit = {
            "X_train": X_train.values,
            "X_test": X_test.values if has_test else None,
            "y_train": y_train.values,
            "y_test": y_test.values if has_test else None,
            "w_train": weight_train,
        }

    def get_optimal_model(
        self, init_model, tuned_parameters, scoring, cv, refit=True, verbose=2, njobs=1
    ):
        """
        Get optimal hyperparameters for an estimator and return the best model.

        The best parameters are obtained by performing an exhaustive search
        over specified parameter values. `split_data` must have been called
        first, since the training arrays are read from ``data_scikit``.

        Parameters
        ----------
        init_model: `~sklearn.base.BaseEstimator`
            Model to optimise
        tuned_parameters: dict
            Contains parameter names and ranges to optimise on
        scoring: str
            Scoring metric passed to the grid search
        cv: int
            number of split for x-validation
        refit: bool, str, or callable, default=True
            Refit the estimator using the best found parameters on the
            whole dataset. The returned ``best_estimator_`` is only
            provided by scikit-learn when the search refits.
        verbose: int
            Controls the verbosity: the higher, the more messages.
            >1 : the computation time for each fold and parameter candidate
            is displayed
            >2 : the score is also displayed (and the per-candidate grid
            scores are printed by this method)
            >3 : the fold and candidate parameter indexes are also displayed
            together with the starting time of the computation (and the
            full cross-validation table is printed by this method)
        njobs: int
            Number of jobs to run in parallel. -1 means using all processors.

        Returns
        -------
        best_estimator: `~sklearn.base.BaseEstimator`
            Best model
        """
        model = GridSearchCV(
            init_model,
            tuned_parameters,
            scoring=scoring,
            cv=cv,
            refit=refit,
            verbose=verbose,
            n_jobs=njobs,  # previously accepted but never forwarded
        )
        model.fit(
            self.data_scikit["X_train"],
            self.data_scikit["y_train"],
            sample_weight=self.data_scikit["w_train"],
        )

        print("Best parameters set found on development set:")
        for key, value in model.best_params_.items():
            print(" - {}: {}".format(key, value))

        print("Grid scores on development set:")
        if verbose > 2:
            means = model.cv_results_["mean_test_score"]
            stds = model.cv_results_["std_test_score"]
            for mean, std, params in zip(means, stds, model.cv_results_["params"]):
                print(" - {:.3f}+/-{:.3f} for {}".format(mean, std * 2, params))

        if verbose > 3:
            print(pd.DataFrame(model.cv_results_))

        return model.best_estimator_