Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CatBoost throws an exception when dealing with a large dataset #2655

Open
anton-yershov opened this issue May 3, 2024 · 0 comments
Open

Comments

@anton-yershov
Copy link

Problem: On a large training dataset with over 3 billion samples, CatBoost throws an exception because a large object count value is typecast incorrectly. (On a side note, are there any future plans to use 64-bit variables for dataset object counts?)
catboost version: 1.2.3
Operating System: Linux RHEL 8.6
CPU: Intel Xeon Platinum 8480+
GPU: A100-SXM4-80GB * 8

Stack trace:

---------------------------------------------------------------------------
CatBoostError                             Traceback (most recent call last)
Cell In[10], line 1
----> 1 model.fit(samples_df, targets_df, sample_weight=weights_df, verbose=True)

File /pythondir/lib/python3.10/site-packages/catboost/core.py:5807, in CatBoostRegressor.fit(self, X, y, cat_features, text_features, embedding_features, sample_weight, baseline, use_best_model, eval_set, verbose, logging_level, plot, plot_file, column_description, verbose_eval, metric_period, silent, early_stopping_rounds, save_snapshot, snapshot_file, snapshot_interval, init_model, callbacks, log_cout, log_cerr)
   5804 if 'loss_function' in params:
   5805     CatBoostRegressor._check_is_compatible_loss(params['loss_function'])
-> 5807 return self._fit(X, y, cat_features, text_features, embedding_features, None, sample_weight, None, None, None, None, baseline,
   5808                  use_best_model, eval_set, verbose, logging_level, plot, plot_file, column_description,
   5809                  verbose_eval, metric_period, silent, early_stopping_rounds,
   5810                  save_snapshot, snapshot_file, snapshot_interval, init_model, callbacks, log_cout, log_cerr)

File /pythondir/lib/python3.10/site-packages/catboost/core.py:2381, in CatBoost._fit(self, X, y, cat_features, text_features, embedding_features, pairs, sample_weight, group_id, group_weight, subgroup_id, pairs_weight, baseline, use_best_model, eval_set, verbose, logging_level, plot, plot_file, column_description, verbose_eval, metric_period, silent, early_stopping_rounds, save_snapshot, snapshot_file, snapshot_interval, init_model, callbacks, log_cout, log_cerr)
   2378 if y is None and not isinstance(X, PATH_TYPES + (Pool,)):
   2379     raise CatBoostError("y may be None only when X is an instance of catboost.Pool or string")
-> 2381 train_params = self._prepare_train_params(
   2382     X=X, y=y, cat_features=cat_features, text_features=text_features, embedding_features=embedding_features,
   2383     pairs=pairs, sample_weight=sample_weight, group_id=group_id, group_weight=group_weight,
   2384     subgroup_id=subgroup_id, pairs_weight=pairs_weight, baseline=baseline, use_best_model=use_best_model,
   2385     eval_set=eval_set, verbose=verbose, logging_level=logging_level, plot=plot, plot_file=plot_file,
   2386     column_description=column_description, verbose_eval=verbose_eval, metric_period=metric_period,
   2387     silent=silent, early_stopping_rounds=early_stopping_rounds, save_snapshot=save_snapshot,
   2388     snapshot_file=snapshot_file, snapshot_interval=snapshot_interval, init_model=init_model,
   2389     callbacks=callbacks
   2390 )
   2391 params = train_params["params"]
   2392 train_pool = train_params["train_pool"]

File /pythondir/lib/python3.10/site-packages/catboost/core.py:2261, in CatBoost._prepare_train_params(self, X, y, cat_features, text_features, embedding_features, pairs, sample_weight, group_id, group_weight, subgroup_id, pairs_weight, baseline, use_best_model, eval_set, verbose, logging_level, plot, plot_file, column_description, verbose_eval, metric_period, silent, early_stopping_rounds, save_snapshot, snapshot_file, snapshot_interval, init_model, callbacks)
   2258 text_features = _process_feature_indices(text_features, X, params, 'text_features')
   2259 embedding_features = _process_feature_indices(embedding_features, X, params, 'embedding_features')
-> 2261 train_pool = _build_train_pool(X, y, cat_features, text_features, embedding_features, pairs,
   2262                                sample_weight, group_id, group_weight, subgroup_id, pairs_weight,
   2263                                baseline, column_description)
   2264 if train_pool.is_empty_:
   2265     raise CatBoostError("X is empty.")

File /pythondir/lib/python3.10/site-packages/catboost/core.py:1499, in _build_train_pool(X, y, cat_features, text_features, embedding_features, pairs, sample_weight, group_id, group_weight, subgroup_id, pairs_weight, baseline, column_description)
   1497     if y is None:
   1498         raise CatBoostError("y has not initialized in fit(): X is not catboost.Pool object, y must be not None in fit().")
-> 1499     train_pool = Pool(X, y, cat_features=cat_features, text_features=text_features, embedding_features=embedding_features, pairs=pairs, weight=sample_weight, group_id=group_id,
   1500                       group_weight=group_weight, subgroup_id=subgroup_id, pairs_weight=pairs_weight, baseline=baseline)
   1501 return train_pool

File /pythondir/lib/python3.10/site-packages/catboost/core.py:844, in Pool.__init__(self, data, label, cat_features, text_features, embedding_features, embedding_features_data, column_description, pairs, delimiter, has_header, ignore_csv_quoting, weight, group_id, group_weight, subgroup_id, pairs_weight, baseline, timestamp, feature_names, feature_tags, thread_count, log_cout, log_cerr, data_can_be_none)
    838         if isinstance(feature_names, PATH_TYPES):
    839             raise CatBoostError(
    840                 "feature_names must be None or have not-string type when the pool is created from "
    841                 "python objects."
    842             )
--> 844         self._init(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
    845                    group_id, group_weight, subgroup_id, pairs_weight, baseline, timestamp, feature_names, feature_tags, thread_count)
    846 elif not data_can_be_none:
    847     raise CatBoostError("'data' parameter can't be None")

File /pythondir/lib/python3.10/site-packages/catboost/core.py:1477, in Pool._init(self, data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight, group_id, group_weight, subgroup_id, pairs_weight, baseline, timestamp, feature_names, feature_tags, thread_count)
   1475 if feature_tags is not None:
   1476     feature_tags = self._check_transform_tags(feature_tags, feature_names)
-> 1477 self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
   1478                 group_id, group_weight, subgroup_id, pairs_weight, baseline, timestamp, feature_names, feature_tags, thread_count)

File _catboost.pyx:4159, in _catboost._PoolBase._init_pool()

File _catboost.pyx:4209, in _catboost._PoolBase._init_pool()

File _catboost.pyx:4003, in _catboost._PoolBase._init_features_order_layout_pool()

CatBoostError: /src/catboost/util/generic/cast.h:131: Conversion 'unsigned int{3284861059}' to 'int', positive value converted to negative

Thank you in advance

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

1 participant