Source code for aydin.regression.lgbm

import gc
import math
import multiprocessing
import tempfile
from importlib.util import find_spec
from os.path import join
from typing import Optional

import lightgbm
import numpy
from lightgbm import Booster, record_evaluation

from aydin.regression.base import RegressorBase
from aydin.regression.gbm_utils.callbacks import early_stopping
from aydin.util.log.log import lsection, lprint

class LGBMRegressor(RegressorBase):
    """
    The LightGBM Regressor uses the gradient boosting library
    <a href="https://github.com/microsoft/LightGBM">LightGBM</a> to perform
    regression from a set of feature vectors and target values. LightGBM is a
    solid library but we do not yet support GPU training and inference.
    Because of the lack of GPU support LightGBM is slower than CatBoost;
    sometimes LightGBM gives better results than CatBoost, but not often
    enough to justify the loss of speed.
    """

    # Maps the user-facing loss name to the LightGBM objective name. The loss
    # name itself doubles as the LightGBM metric name.
    _OBJECTIVE_BY_LOSS = {
        'l1': 'regression_l1',
        'l2': 'regression_l2',
        'huber': 'huber',
        'poisson': 'poisson',
        'quantile': 'quantile',
    }

    def __init__(
        self,
        num_leaves: Optional[int] = None,
        max_num_estimators: Optional[int] = None,
        max_bin: int = 512,
        learning_rate: Optional[float] = None,
        loss: str = 'l1',
        patience: int = 5,
        verbosity: int = -1,
        compute_load: float = 0.95,
        inference_mode: Optional[str] = None,
        compute_training_loss: bool = False,
    ):
        """Constructs a LightGBM regressor.

        Parameters
        ----------
        num_leaves
            Number of leaves in the decision trees.
            We recommend values between 128 and 512.
            (advanced)

        max_num_estimators
            Maximum number of estimators (trees). Typical values range from
            1024 to 4096. Use larger values for more difficult datasets. If
            training stops exactly at these values that is a sign you need to
            increase this number. Quality of the results typically increases
            with the number of estimators, but so does computation time too.
            We do not recommend using a value of more than 10000.

        max_bin
            Maximum number of allowed bins. The features are quantised into
            that many bins. Higher values achieve better quantisation of
            features but also leads to longer training and more memory
            consumption. We do not recommend changing this parameter.
            (advanced)

        learning_rate
            Learning rate for the LightGBM model. The learning rate is
            determined automatically if the value None is given. We
            recommend values around 0.01.
            (advanced)

        loss
            Type of loss to be used. Can be 'l1' for L1 loss (MAE), and 'l2'
            for L2 loss (RMSE), 'huber' for Huber loss, 'poisson' for Poisson
            loss, and 'quantile' for Quantile loss. We recommend using: 'l1'.
            (advanced)

        patience
            Number of rounds after which training stops if no improvement
            occurs.
            (advanced)

        verbosity
            Verbosity setting of LightGBM.
            (advanced)

        compute_load
            Allowed load on computational resources in percentage, typically
            used for CPU training when deciding on how many available cores
            to use.
            (advanced)

        inference_mode : str
            Chooses inference mode: can be 'lleaves' for the very fast
            lleaves library (only OSX and Linux), 'lgbm' for the standard
            lightGBM inference engine, and 'auto' (or None) tries the
            best/fastest options first and fallback to lightGBM default
            inference.
            (advanced)

        compute_training_loss : bool
            Flag to tell LightGBM whether to compute training loss or not
            (advanced)

        """
        super().__init__()

        self.force_verbose_eval = False

        self.num_leaves = 512 if num_leaves is None else num_leaves
        self.max_num_estimators = (
            int(1e4) if max_num_estimators is None else max_num_estimators
        )
        self.max_bin = max_bin
        self.learning_rate = 0.01 if learning_rate is None else learning_rate
        self.metric = loss
        self.early_stopping_rounds = patience
        self.verbosity = verbosity
        self.compute_load = compute_load
        self.inference_mode = 'auto' if inference_mode is None else inference_mode

        # Evaluating the training set at every round can be expensive.
        self.compute_training_loss = compute_training_loss

        with lsection("LGBM Regressor"):
            lprint(f"learning rate: {self.learning_rate}")
            lprint(f"number of leaves: {self.num_leaves}")
            lprint(f"max bin: {self.max_bin}")
            lprint(f"n_estimators: {self.max_num_estimators}")
            lprint(f"patience: {self.early_stopping_rounds}")
            lprint(f"inference_mode: {self.inference_mode}")

    def __repr__(self):
        return f"<{self.__class__.__name__}, max_num_estimators={self.max_num_estimators}, lr={self.learning_rate}>"

    def _resolve_loss(self):
        """Returns the (objective, metric) pair for the configured loss.

        Unknown loss names fall back to L1 for BOTH the objective and the
        metric, so that the two always agree.
        """
        loss = self.metric.lower()
        if loss not in self._OBJECTIVE_BY_LOSS:
            loss = 'l1'
        return self._OBJECTIVE_BY_LOSS[loss], loss

    def _get_params(self, num_samples, dtype=numpy.float32):
        """Builds the LightGBM parameter dictionary.

        Parameters
        ----------
        num_samples
            Number of training samples (currently unused, kept for
            interface compatibility).
        dtype
            Data type of the feature array; 8 bit features never need more
            than 256 bins.

        Returns
        -------
        Dictionary of parameters to pass to ``lightgbm.train``.
        """

        # Preparing objective and matching metric:
        objective, metric = self._resolve_loss()
        lprint(f'objective: {objective}')

        # Setting max depth:
        max_depth = max(3, int(math.log2(self.num_leaves)) - 1)
        lprint(f'max_depth: {max_depth}')

        # Setting max bin:
        max_bin = 256 if dtype == numpy.uint8 else self.max_bin
        lprint(f'max_bin: {max_bin}')

        lprint(f'learning_rate: {self.learning_rate}')
        lprint(f'num_leaves: {self.num_leaves}')

        params = {
            "device": "cpu",
            "boosting_type": "gbdt",
            'objective': objective,
            "learning_rate": self.learning_rate,
            "num_leaves": self.num_leaves,
            "max_depth": max_depth,
            "max_bin": max_bin,
            "subsample_for_bin": 200000,
            "num_threads": max(1, int(self.compute_load * multiprocessing.cpu_count())),
            "metric": metric,
            # Honour the constructor argument (default is -1, i.e. silent):
            'verbosity': self.verbosity,
            "bagging_freq": 1,
            "bagging_fraction": 0.8,
            "lambda_l1": 0.01,
            "lambda_l2": 0.01,
        }

        return params

    def _fit(
        self, x_train, y_train, x_valid=None, y_valid=None, regressor_callback=None
    ):
        """Trains a LightGBM model and returns it wrapped in a _LGBMModel.

        Parameters
        ----------
        x_train
            Training feature vectors, features along the last axis.
        y_train
            Training target values.
        x_valid
            Optional validation feature vectors.
        y_valid
            Optional validation target values.
        regressor_callback
            Optional callable invoked after each boosting round as
            ``regressor_callback(iteration, val_loss, model)``.

        Returns
        -------
        A _LGBMModel holding the trained booster, the inference mode and the
        recorded loss history (possibly empty if nothing was evaluated).
        """
        with lsection("GBM regressor fitting:"):

            nb_data_points = y_train.shape[0]
            self.num_features = x_train.shape[-1]
            has_valid_dataset = x_valid is not None and y_valid is not None

            lprint(f"Number of data points: {nb_data_points}")
            if has_valid_dataset:
                lprint(f"Number of validation data points: {y_valid.shape[0]}")
            lprint(f"Number of features per data point: {self.num_features}")

            train_dataset = lightgbm.Dataset(x_train, y_train)
            valid_dataset = (
                lightgbm.Dataset(x_valid, y_valid) if has_valid_dataset else None
            )

            self.__epoch_counter = 0

            # We translate the regressor callback into a lightGBM callback:
            # This avoids propagating annoying 'evaluation_result_list[0][2]'
            # throughout the codebase...
            def lgbm_callback(env):
                val_loss = 0
                try:
                    results = env.evaluation_result_list
                    # Each entry starts with the dataset name. When the
                    # training set is also evaluated its entry may come
                    # first, so we pick the first non-training entry and only
                    # fall back to the very first entry otherwise.
                    held_out = [r for r in results if r[0] != 'training']
                    val_loss = (held_out or results)[0][2]
                except Exception as e:
                    lprint("Problem with getting loss from LightGBM 'env' in callback")
                    lprint(str(e))

                if regressor_callback:
                    regressor_callback(env.iteration, val_loss, env.model)
                else:
                    lprint(f"Epoch {self.__epoch_counter}: Validation loss: {val_loss}")
                    self.__epoch_counter += 1

            evals_result = {}

            self.early_stopping_callback = early_stopping(
                self, self.early_stopping_rounds
            )

            # Only real datasets are handed to LightGBM for evaluation;
            # the validation set (if any) always comes first -> 'valid_0'.
            valid_sets = []
            if has_valid_dataset:
                valid_sets.append(valid_dataset)
            if self.compute_training_loss:
                valid_sets.append(train_dataset)

            if has_valid_dataset:
                callbacks = [
                    lgbm_callback,
                    self.early_stopping_callback,
                    record_evaluation(evals_result),
                ]
            elif valid_sets:
                # Training loss only: nothing to early-stop on, but we still
                # want the loss history recorded.
                callbacks = [lgbm_callback, record_evaluation(evals_result)]
            else:
                callbacks = [lgbm_callback]

            with lsection("GBM regressor fitting now:"):
                # Note: early stopping is handled by our own callback, hence
                # no 'early_stopping_rounds' argument (which recent LightGBM
                # versions no longer accept).
                model = lightgbm.train(
                    params=self._get_params(nb_data_points, dtype=x_train.dtype),
                    init_model=None,
                    train_set=train_dataset,
                    valid_sets=valid_sets or None,
                    num_boost_round=self.max_num_estimators,
                    callbacks=callbacks,
                )
                lprint("GBM fitting done.")

            del train_dataset
            del valid_dataset

            # Same (lower-cased, validated) metric name as passed to LightGBM:
            _, metric_key = self._resolve_loss()

            loss_history = {}
            if self.compute_training_loss:
                loss_history['training'] = evals_result['training'][metric_key]
            if has_valid_dataset:
                validation_losses = evals_result['valid_0'][metric_key]
                self.last_valid_loss = validation_losses[-1]
                loss_history['validation'] = validation_losses

            gc.collect()
            return _LGBMModel(model, self.inference_mode, loss_history)
class _LGBMModel:
    """Wraps a trained LightGBM booster together with its inference settings.

    The booster itself is excluded from pickling (see ``__getstate__``) and
    is instead saved and restored as a LightGBM text model file through
    ``_save_internals`` and ``_load_internals``.
    """

    def __init__(self, model, inference_mode, loss_history):
        """Stores the booster, the requested inference mode and loss history.

        Parameters
        ----------
        model
            Trained LightGBM booster.
        inference_mode
            One of 'auto', 'lleaves' or 'lgbm'.
        loss_history
            Dictionary of recorded losses per dataset.
        """
        self.model: Booster = model
        self.inference_mode = inference_mode
        self.loss_history = loss_history

    def _save_internals(self, path: str):
        """Saves the booster (if any) as 'lgbm_model.txt' inside `path`."""
        if self.model is not None:
            lgbm_model_file = join(path, 'lgbm_model.txt')
            self.model.save_model(lgbm_model_file)

    def _load_internals(self, path: str):
        """Loads the booster from 'lgbm_model.txt' inside `path`."""
        lgbm_model_file = join(path, 'lgbm_model.txt')
        self.model = Booster(model_file=lgbm_model_file)

    # We exclude certain fields from saving:
    def __getstate__(self):
        state = self.__dict__.copy()
        # The booster is persisted separately by _save_internals; 'pop' does
        # not fail if the attribute is already absent.
        state.pop('model', None)
        return state

    def predict(self, x):
        """Predicts target values for the feature vectors `x`.

        Parameters
        ----------
        x
            Feature vectors, data points along the first axis and features
            along the last axis.

        Returns
        -------
        Array of predictions.
        """
        with lsection("GBM regressor prediction:"):

            lprint(f"Number of data points             : {x.shape[0]}")
            lprint(f"Number of features per data points: {x.shape[-1]}")

            # We decide here what 'auto' means, per call, without overwriting
            # the configured mode so that later calls can decide again:
            inference_mode = self.inference_mode
            if inference_mode == 'auto':
                # Lleaves takes a long time to compile models, so only
                # interesting for very large inferences!
                inference_mode = 'lleaves' if x.shape[0] > 5e6 else 'lgbm'

            lprint("GBM regressor predicting now...")

            if inference_mode == 'lleaves' and find_spec('lleaves'):
                try:
                    return self._predict_lleaves(x)
                except Exception as e:
                    # Best-effort: report the cause and fall back to LightGBM.
                    lprint("Failed lleaves-based regression!")
                    lprint(str(e))

            # This must work!
            return self._predict_lgbm(x)

    def _predict_lleaves(self, x):
        """Predicts with a lleaves-compiled version of the booster.

        The booster is written to a temporary model file from which lleaves
        builds and compiles its own model.
        """
        with lsection("Attempting lleaves-based regression."):

            # Creating lleaves model and compiling it:
            with lsection("Model saving and compilation"):
                # Creating temporary file:
                with tempfile.NamedTemporaryFile() as temp_file:

                    # Saving LGBM model:
                    self.model.save_model(
                        temp_file.name, num_iteration=self.model.best_iteration
                    )

                    # Imported lazily: lleaves is an optional dependency.
                    import lleaves

                    llvm_model = lleaves.Model(model_file=temp_file.name)
                    llvm_model.compile()

            prediction = llvm_model.predict(x)

            return prediction

    def _predict_lgbm(self, x):
        """Predicts with the standard LightGBM inference engine."""
        prediction = self.model.predict(x, num_iteration=self.model.best_iteration)

        # LGBM is annoying, it spits out float64s
        prediction = prediction.astype(numpy.float32, copy=False)

        lprint("GBM regressor predicting done!")
        return prediction