import gc
import math
import multiprocessing
import tempfile
from importlib.util import find_spec
from os.path import join
from typing import Optional
import lightgbm
import numpy
from lightgbm import Booster, record_evaluation
from aydin.regression.base import RegressorBase
from aydin.regression.gbm_utils.callbacks import early_stopping
from aydin.util.log.log import lsection, lprint
class LGBMRegressor(RegressorBase):
    """
    The LightGBM Regressor uses the gradient boosting library <a
    href="https://github.com/microsoft/LightGBM">LightGBM</a> to perform
    regression from a set of feature vectors and target values. LightGBM is a
    solid library but we do not yet support GPU training and inference. Because
    of the lack of GPU support LightGBM is slower than CatBoost; sometimes
    LightGBM gives better results than CatBoost, but not often enough to
    justify the loss of speed.
    """

    def __init__(
        self,
        num_leaves: Optional[int] = None,
        max_num_estimators: Optional[int] = None,
        max_bin: int = 512,
        learning_rate: Optional[float] = None,
        loss: str = 'l1',
        patience: int = 5,
        verbosity: int = -1,
        compute_load: float = 0.95,
        inference_mode: Optional[str] = None,
        compute_training_loss: bool = False,
    ):
        """Constructs a LightGBM regressor.

        Parameters
        ----------
        num_leaves
            Number of leaves in the decision trees.
            We recommend values between 128 and 512.
            (advanced)

        max_num_estimators
            Maximum number of estimators (trees). Typical values range from 1024
            to 4096. Use larger values for more difficult datasets. If training
            stops exactly at these values that is a sign you need to increase this
            number. Quality of the results typically increases with the number of
            estimators, but so does computation time too.
            We do not recommend using a value of more than 10000.

        max_bin
            Maximum number of allowed bins. The features are quantised into that
            many bins. Higher values achieve better quantisation of features but
            also lead to longer training and more memory consumption. We do not
            recommend changing this parameter.
            (advanced)

        learning_rate
            Learning rate for the LightGBM model. The learning rate is determined
            automatically if the value None is given. We recommend values around 0.01.
            (advanced)

        loss
            Type of loss to be used. Can be 'l1' for L1 loss (MAE), 'l2' for
            L2 loss (RMSE), 'huber' for Huber loss, 'poisson' for Poisson loss,
            and 'quantile' for Quantile loss. We recommend using: 'l1'.
            (advanced)

        patience
            Number of rounds after which training stops if no improvement occurs.
            (advanced)

        verbosity
            Verbosity setting of LightGBM.
            (advanced)

        compute_load
            Allowed load on computational resources in percentage, typically used
            for CPU training when deciding on how many available cores to use.
            (advanced)

        inference_mode : str
            Chooses inference mode: can be 'lleaves' for the very fast lleaves
            library (only OSX and Linux), 'lgbm' for the standard LightGBM
            inference engine, and 'auto' (or None) tries the best/fastest
            options first and falls back to LightGBM default inference.
            (advanced)

        compute_training_loss : bool
            Flag to tell LightGBM whether to compute training loss or not
            (advanced)
        """
        super().__init__()

        self.force_verbose_eval = False

        # None means 'use the built-in default' for these three settings:
        self.num_leaves = 512 if num_leaves is None else num_leaves
        self.max_num_estimators = (
            int(1e4) if max_num_estimators is None else max_num_estimators
        )
        self.max_bin = max_bin
        self.learning_rate = 0.01 if learning_rate is None else learning_rate
        self.metric = loss
        self.early_stopping_rounds = patience
        self.verbosity = verbosity
        self.compute_load = compute_load
        self.inference_mode = 'auto' if inference_mode is None else inference_mode
        self.compute_training_loss = compute_training_loss  # This can be expensive

        with lsection("LGBM Regressor"):
            lprint(f"learning rate: {self.learning_rate}")
            lprint(f"number of leaves: {self.num_leaves}")
            lprint(f"max bin: {self.max_bin}")
            lprint(f"n_estimators: {self.max_num_estimators}")
            lprint(f"patience: {self.early_stopping_rounds}")
            lprint(f"inference_mode: {self.inference_mode}")

    def __repr__(self):
        return f"<{self.__class__.__name__}, max_num_estimators={self.max_num_estimators}, lr={self.learning_rate}>"

    def _get_params(self, num_samples, dtype=numpy.float32):
        """Builds the parameter dictionary handed to ``lightgbm.train``.

        Parameters
        ----------
        num_samples
            Number of training samples. Currently unused; kept for interface
            compatibility.
        dtype
            Data type of the training features. For ``numpy.uint8`` features
            the number of bins is set to 256 instead of ``self.max_bin``.

        Returns
        -------
        dict
            LightGBM training parameters.
        """
        # min_data_in_leaf = 20 + int(0.01 * (num_samples / self.num_leaves))

        # Preparing objective (unknown losses fall back to L1 regression):
        loss_name = self.metric.lower()
        objective = {
            'l1': 'regression_l1',
            'l2': 'regression_l2',
            'huber': 'huber',
            'poisson': 'poisson',
            'quantile': 'quantile',
        }.get(loss_name, 'regression_l1')
        lprint(f'objective: {objective}')

        # Setting max depth:
        max_depth = max(3, int(math.log2(self.num_leaves)) - 1)
        lprint(f'max_depth: {max_depth}')

        # Setting max bin:
        max_bin = 256 if dtype == numpy.uint8 else self.max_bin
        lprint(f'max_bin: {max_bin}')

        lprint(f'learning_rate: {self.learning_rate}')
        lprint(f'num_leaves: {self.num_leaves}')

        params = {
            "device": "cpu",
            "boosting_type": "gbdt",
            'objective': objective,
            "learning_rate": self.learning_rate,
            "num_leaves": self.num_leaves,
            "max_depth": max_depth,
            "max_bin": max_bin,
            "subsample_for_bin": 200000,
            # Use a fraction of the available cores, but always at least one:
            "num_threads": max(1, int(self.compute_load * multiprocessing.cpu_count())),
            "metric": loss_name,
            # Honour the verbosity requested at construction time:
            'verbosity': self.verbosity,
            "bagging_freq": 1,
            "bagging_fraction": 0.8,
            "lambda_l1": 0.01,
            "lambda_l2": 0.01,
        }

        return params

    def _recorded_losses(self, evals_result, dataset_name):
        """Returns the loss values recorded for one evaluation dataset.

        Parameters
        ----------
        evals_result
            Dictionary filled by LightGBM's ``record_evaluation`` callback.
        dataset_name
            Name of the evaluation set, e.g. 'valid_0' or 'training'.

        Returns
        -------
        list
            Recorded loss values, or an empty list if nothing was recorded
            for that dataset.
        """
        per_metric = evals_result.get(dataset_name, {})

        # The metric is passed to LightGBM lowercased, so look it up that way:
        metric_key = self.metric.lower()
        if metric_key in per_metric:
            return per_metric[metric_key]

        # Only one metric is configured, so if LightGBM reported it under
        # another name (e.g. a canonical alias) we take whatever was recorded:
        for values in per_metric.values():
            return values

        return []

    def _fit(
        self, x_train, y_train, x_valid=None, y_valid=None, regressor_callback=None
    ):
        """Trains a LightGBM model and returns it wrapped in a ``_LGBMModel``.

        Parameters
        ----------
        x_train
            Training feature vectors; the last axis holds the features.
        y_train
            Training target values.
        x_valid
            Optional validation feature vectors.
        y_valid
            Optional validation target values.
        regressor_callback
            Optional callable invoked after each iteration as
            ``regressor_callback(iteration, val_loss, model)``.

        Returns
        -------
        _LGBMModel
            The trained model together with the inference mode and the
            recorded loss history.
        """
        with lsection("GBM regressor fitting:"):

            nb_data_points = y_train.shape[0]
            self.num_features = x_train.shape[-1]
            has_valid_dataset = x_valid is not None and y_valid is not None

            lprint(f"Number of data points: {nb_data_points}")
            if has_valid_dataset:
                lprint(f"Number of validation data points: {y_valid.shape[0]}")
            lprint(f"Number of features per data point: {self.num_features}")

            train_dataset = lightgbm.Dataset(x_train, y_train)
            valid_dataset = (
                lightgbm.Dataset(x_valid, y_valid) if has_valid_dataset else None
            )

            # Only pass datasets that actually exist; the validation set goes
            # first so that LightGBM names it 'valid_0':
            valid_sets = []
            if has_valid_dataset:
                valid_sets.append(valid_dataset)
            if self.compute_training_loss:
                valid_sets.append(train_dataset)

            self.__epoch_counter = 0

            # We translate the regressor callback into a LightGBM callback:
            # This avoids propagating annoying 'evaluation_result_list[0][2]'
            # throughout the codebase...
            def lgbm_callback(env):
                try:
                    results = env.evaluation_result_list
                    # When the training loss is evaluated too, the first entry
                    # is not necessarily the validation one, so we look for
                    # 'valid_0' explicitly before falling back to the first:
                    val_loss = next(
                        (entry[2] for entry in results if entry[0] == 'valid_0'),
                        None,
                    )
                    if val_loss is None:
                        val_loss = results[0][2]
                except (AttributeError, IndexError, TypeError) as e:
                    val_loss = 0
                    lprint("Problem with getting loss from LightGBM 'env' in callback")
                    lprint(str(e))

                if regressor_callback:
                    regressor_callback(env.iteration, val_loss, env.model)
                else:
                    lprint(f"Epoch {self.__epoch_counter}: Validation loss: {val_loss}")
                    self.__epoch_counter += 1

            evals_result = {}

            self.early_stopping_callback = early_stopping(
                self, self.early_stopping_rounds
            )

            callbacks = [lgbm_callback]
            if has_valid_dataset:
                # Early stopping needs a validation loss to monitor:
                callbacks.append(self.early_stopping_callback)
            if valid_sets:
                callbacks.append(record_evaluation(evals_result))

            with lsection("GBM regressor fitting now:"):
                # Early stopping is handled by our own callback; the
                # 'early_stopping_rounds' keyword was always None here and is
                # not accepted by recent LightGBM releases, so it is omitted.
                model = lightgbm.train(
                    params=self._get_params(nb_data_points, dtype=x_train.dtype),
                    init_model=None,
                    train_set=train_dataset,
                    valid_sets=valid_sets if valid_sets else None,
                    num_boost_round=self.max_num_estimators,
                    callbacks=callbacks,
                )
                lprint("GBM fitting done.")

            # Release the (potentially large) datasets before returning:
            del valid_sets
            del train_dataset
            del valid_dataset

            validation_losses = self._recorded_losses(evals_result, 'valid_0')
            if has_valid_dataset and validation_losses:
                self.last_valid_loss = validation_losses[-1]

            # The 'validation' entry is always present; it is empty when no
            # validation data was provided:
            if self.compute_training_loss:
                loss_history = {
                    'training': self._recorded_losses(evals_result, 'training'),
                    'validation': validation_losses,
                }
            else:
                loss_history = {'validation': validation_losses}

            gc.collect()
            return _LGBMModel(model, self.inference_mode, loss_history)
class _LGBMModel:
    """Wraps a trained LightGBM booster together with its inference settings.

    The booster itself is excluded from pickling; it is written to and read
    from a separate text file by ``_save_internals`` and ``_load_internals``.
    """

    # In 'auto' mode lleaves is only attempted above this number of data
    # points, because compiling the model takes a long time:
    _LLEAVES_MIN_SAMPLES = 5e6

    def __init__(self, model, inference_mode, loss_history):
        """Stores the booster, the inference mode and the loss history.

        Parameters
        ----------
        model
            Trained LightGBM booster.
        inference_mode
            One of 'auto', 'lleaves' or 'lgbm'.
        loss_history
            Dictionary of recorded loss values.
        """
        self.model: Booster = model
        self.inference_mode = inference_mode
        self.loss_history = loss_history

    def _save_internals(self, path: str):
        """Saves the booster as 'lgbm_model.txt' in the folder ``path``."""
        if self.model is not None:
            lgbm_model_file = join(path, 'lgbm_model.txt')
            self.model.save_model(lgbm_model_file)

    def _load_internals(self, path: str):
        """Loads the booster from 'lgbm_model.txt' in the folder ``path``."""
        lgbm_model_file = join(path, 'lgbm_model.txt')
        self.model = Booster(model_file=lgbm_model_file)

    # We exclude certain fields from saving:
    def __getstate__(self):
        state = self.__dict__.copy()
        # pop() rather than del: the attribute may be absent:
        state.pop('model', None)
        return state

    def __setstate__(self, state):
        self.__dict__.update(state)
        # The booster is not part of the state; make sure the attribute
        # exists until _load_internals() restores it:
        self.__dict__.setdefault('model', None)

    def _resolve_inference_mode(self, num_samples):
        """Returns the inference mode to use for ``num_samples`` data points.

        'auto' is resolved per call and never written back to
        ``self.inference_mode``, so the size of one input does not decide
        the engine used for later calls.
        """
        if self.inference_mode != 'auto':
            return self.inference_mode

        # Lleaves takes a long time to compile models, so only
        # interesting for very large inferences!
        if num_samples > self._LLEAVES_MIN_SAMPLES:
            return 'lleaves'
        return 'lgbm'

    def predict(self, x):
        """Predicts target values for the feature vectors ``x``.

        Parameters
        ----------
        x
            Array of feature vectors; first axis indexes data points, last
            axis indexes features.

        Returns
        -------
        Array of predictions converted to float32.
        """
        with lsection("GBM regressor prediction:"):

            lprint(f"Number of data points             : {x.shape[0]}")
            lprint(f"Number of features per data points: {x.shape[-1]}")

            # we decide here what 'auto' means:
            mode = self._resolve_inference_mode(x.shape[0])

            lprint("GBM regressor predicting now...")

            if mode == 'lleaves' and find_spec('lleaves') is not None:
                try:
                    return self._predict_lleaves(x)
                except Exception as e:
                    # Deliberately broad: whatever goes wrong with lleaves,
                    # we fall back to the standard LightGBM inference below.
                    lprint("Failed lleaves-based regression!")
                    lprint(f"Reason: {e}")

            # This must work!
            return self._predict_lgbm(x)

    def _predict_lleaves(self, x):
        """Predicts with a model compiled by the lleaves library."""
        with lsection("Attempting lleaves-based regression."):

            # Creating lleaves model and compiling it:
            with lsection("Model saving and compilation"):
                # Creating temporary file:
                with tempfile.NamedTemporaryFile() as temp_file:

                    # Saving LGBM model:
                    self.model.save_model(
                        temp_file.name, num_iteration=self.model.best_iteration
                    )

                    # Imported here because lleaves is an optional dependency:
                    import lleaves

                    llvm_model = lleaves.Model(model_file=temp_file.name)
                    llvm_model.compile()

            prediction = llvm_model.predict(x)

            # Same output type as the LightGBM inference path:
            prediction = numpy.asarray(prediction).astype(numpy.float32, copy=False)

            return prediction

    def _predict_lgbm(self, x):
        """Predicts with the standard LightGBM inference engine."""
        prediction = self.model.predict(x, num_iteration=self.model.best_iteration)

        # LGBM is annoying, it spits out float64s
        prediction = prediction.astype(numpy.float32, copy=False)

        lprint("GBM regressor predicting done!")
        return prediction