CatBoost throws an exception when dealing with a large dataset #2655

anton-yershov · 2024-05-03T18:58:30Z

Problem: On a large training dataset with over 3 billion samples CatBoost throws an exception when incorrectly typecasting a large object count value. (On a side note, any future plans to use 64-bit variables for dataset object counts?)
catboost version: 1.2.3
Operating System: Linux RHEL 8.6
CPU: Intel Xeon Platinum 8480+
GPU: A100-SXM4-80GB * 8

Stack trace:

---------------------------------------------------------------------------
CatBoostError                             Traceback (most recent call last)
Cell In[10], line 1
----> 1 model.fit(samples_df, targets_df, sample_weight=weights_df, verbose=True)

File /pythondir/lib/python3.10/site-packages/catboost/core.py:5807, in CatBoostRegressor.fit(self, X, y, cat_features, text_features, embedding_features, sample_weight, baseline, use_best_model, eval_set, verbose, logging_level, plot, plot_file, column_description, verbose_eval, metric_period, silent, early_stopping_rounds, save_snapshot, snapshot_file, snapshot_interval, init_model, callbacks, log_cout, log_cerr)
   5804 if 'loss_function' in params:
   5805     CatBoostRegressor._check_is_compatible_loss(params['loss_function'])
-> 5807 return self._fit(X, y, cat_features, text_features, embedding_features, None, sample_weight, None, None, None, None, baseline,
   5808                  use_best_model, eval_set, verbose, logging_level, plot, plot_file, column_description,
   5809                  verbose_eval, metric_period, silent, early_stopping_rounds,
   5810                  save_snapshot, snapshot_file, snapshot_interval, init_model, callbacks, log_cout, log_cerr)

File /pythondir/lib/python3.10/site-packages/catboost/core.py:2381, in CatBoost._fit(self, X, y, cat_features, text_features, embedding_features, pairs, sample_weight, group_id, group_weight, subgroup_id, pairs_weight, baseline, use_best_model, eval_set, verbose, logging_level, plot, plot_file, column_description, verbose_eval, metric_period, silent, early_stopping_rounds, save_snapshot, snapshot_file, snapshot_interval, init_model, callbacks, log_cout, log_cerr)
   2378 if y is None and not isinstance(X, PATH_TYPES + (Pool,)):
   2379     raise CatBoostError("y may be None only when X is an instance of catboost.Pool or string")
-> 2381 train_params = self._prepare_train_params(
   2382     X=X, y=y, cat_features=cat_features, text_features=text_features, embedding_features=embedding_features,
   2383     pairs=pairs, sample_weight=sample_weight, group_id=group_id, group_weight=group_weight,
   2384     subgroup_id=subgroup_id, pairs_weight=pairs_weight, baseline=baseline, use_best_model=use_best_model,
   2385     eval_set=eval_set, verbose=verbose, logging_level=logging_level, plot=plot, plot_file=plot_file,
   2386     column_description=column_description, verbose_eval=verbose_eval, metric_period=metric_period,
   2387     silent=silent, early_stopping_rounds=early_stopping_rounds, save_snapshot=save_snapshot,
   2388     snapshot_file=snapshot_file, snapshot_interval=snapshot_interval, init_model=init_model,
   2389     callbacks=callbacks
   2390 )
   2391 params = train_params["params"]
   2392 train_pool = train_params["train_pool"]

File /pythondir/lib/python3.10/site-packages/catboost/core.py:2261, in CatBoost._prepare_train_params(self, X, y, cat_features, text_features, embedding_features, pairs, sample_weight, group_id, group_weight, subgroup_id, pairs_weight, baseline, use_best_model, eval_set, verbose, logging_level, plot, plot_file, column_description, verbose_eval, metric_period, silent, early_stopping_rounds, save_snapshot, snapshot_file, snapshot_interval, init_model, callbacks)
   2258 text_features = _process_feature_indices(text_features, X, params, 'text_features')
   2259 embedding_features = _process_feature_indices(embedding_features, X, params, 'embedding_features')
-> 2261 train_pool = _build_train_pool(X, y, cat_features, text_features, embedding_features, pairs,
   2262                                sample_weight, group_id, group_weight, subgroup_id, pairs_weight,
   2263                                baseline, column_description)
   2264 if train_pool.is_empty_:
   2265     raise CatBoostError("X is empty.")

File /pythondir/lib/python3.10/site-packages/catboost/core.py:1499, in _build_train_pool(X, y, cat_features, text_features, embedding_features, pairs, sample_weight, group_id, group_weight, subgroup_id, pairs_weight, baseline, column_description)
   1497     if y is None:
   1498         raise CatBoostError("y has not initialized in fit(): X is not catboost.Pool object, y must be not None in fit().")
-> 1499     train_pool = Pool(X, y, cat_features=cat_features, text_features=text_features, embedding_features=embedding_features, pairs=pairs, weight=sample_weight, group_id=group_id,
   1500                       group_weight=group_weight, subgroup_id=subgroup_id, pairs_weight=pairs_weight, baseline=baseline)
   1501 return train_pool

File /pythondir/lib/python3.10/site-packages/catboost/core.py:844, in Pool.__init__(self, data, label, cat_features, text_features, embedding_features, embedding_features_data, column_description, pairs, delimiter, has_header, ignore_csv_quoting, weight, group_id, group_weight, subgroup_id, pairs_weight, baseline, timestamp, feature_names, feature_tags, thread_count, log_cout, log_cerr, data_can_be_none)
    838         if isinstance(feature_names, PATH_TYPES):
    839             raise CatBoostError(
    840                 "feature_names must be None or have non-string type when the pool is created from "
    841                 "python objects."
    842             )
--> 844         self._init(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
    845                    group_id, group_weight, subgroup_id, pairs_weight, baseline, timestamp, feature_names, feature_tags, thread_count)
    846 elif not data_can_be_none:
    847     raise CatBoostError("'data' parameter can't be None")

File /pythondir/lib/python3.10/site-packages/catboost/core.py:1477, in Pool._init(self, data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight, group_id, group_weight, subgroup_id, pairs_weight, baseline, timestamp, feature_names, feature_tags, thread_count)
   1475 if feature_tags is not None:
   1476     feature_tags = self._check_transform_tags(feature_tags, feature_names)
-> 1477 self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
   1478                 group_id, group_weight, subgroup_id, pairs_weight, baseline, timestamp, feature_names, feature_tags, thread_count)

File _catboost.pyx:4159, in _catboost._PoolBase._init_pool()

File _catboost.pyx:4209, in _catboost._PoolBase._init_pool()

File _catboost.pyx:4003, in _catboost._PoolBase._init_features_order_layout_pool()

CatBoostError: /src/catboost/util/generic/cast.h:131: Conversion 'unsigned int{3284861059}' to 'int', positive value converted to negative

Thank you in advance

The text was updated successfully, but these errors were encountered:

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

CatBoost throws an exception when dealing with a large dataset #2655

CatBoost throws an exception when dealing with a large dataset #2655

anton-yershov commented May 3, 2024

CatBoost throws an exception when dealing with a large dataset #2655

CatBoost throws an exception when dealing with a large dataset #2655

Comments

anton-yershov commented May 3, 2024