Merge pull request #8 from FalseNegativeLab/feature/multiclass_new
Mainly documentation changes
gykovacs committed Feb 18, 2024
2 parents cff2911 + 7125700 commit dcb1546
Showing 40 changed files with 330 additions and 95 deletions.
2 changes: 2 additions & 0 deletions .pylintrc
@@ -6,3 +6,5 @@ ignored-modules = numpy

# Minimum lines number of a similarity.
min-similarity-lines=30

disable = too-many-arguments
17 changes: 15 additions & 2 deletions README.rst
@@ -76,7 +76,7 @@ If you use the package, please consider citing the following paper:
.. code-block:: BibTex
@misc{fazekas2023testing,
title={Testing the Consistency of Performance Scores Reported for Binary Classification Problems},
title={Testing the Consistency of Performance Scores Reported for Binary Classification Problems},
author={Attila Fazekas and György Kovács},
year={2023},
eprint={2310.12527},
@@ -159,6 +159,8 @@ A simple binary classification testset consisting of ``p`` positive samples (usu
testset = {"p": 10, "n": 20}
We note that alternative notations, like using ``n_positive``, ``n_minority`` or ``n_1`` instead of ``p`` and similarly, ``n_negative``, ``n_majority`` and ``n_0`` instead of ``n`` are supported.
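As a small illustration of the synonym handling, the following specification is intended to be interchangeable with the one above (a sketch; it only assumes that the synonym keys listed in the previous sentence are accepted wherever a testset is specified):

.. code-block:: Python

    # equivalent specification using the documented synonym keys
    testset = {"n_positive": 10, "n_negative": 20}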

One can also specify a commonly used dataset by its name and the package will look up the ``p`` and ``n`` counts of the datasets from its internal registry (based on the representations in the ``common-datasets`` package):

.. code-block:: Python
@@ -261,7 +263,18 @@ Depending on the experimental setup, the consistency tests developed for binary
* prevalence threshold (``pt``),
* diagnostic odds ratio (``dor``),
* Jaccard index (``ji``),
* Cohen's kappa (``kappa``)
* Cohen's kappa (``kappa``).

We note that synonyms and full names are also supported, for example:

* alternatives to ``sens`` are ``sensitivity``, ``true_positive_rate``, ``tpr`` and ``recall``,
* alternatives to ``spec`` are ``specificity``, ``true_negative_rate``, ``tnr`` and ``selectivity``,
* alternatives to ``ppv`` are ``positive_predictive_value`` and ``precision``.

Similarly, complements are supported as:

* one can specify ``false_positive_rate`` or ``fpr`` as a complement of ``spec``,
* and similarly, ``false_negative_rate`` or ``fnr`` as a complement of ``sens``.

The tests are designed to detect inconsistencies. If the resulting ``inconsistency`` flag is ``False``, the scores can still be calculated in non-standard ways. However, **if the resulting ``inconsistency`` flag is ``True``, it conclusively indicates that inconsistencies are detected, and the reported scores could not be the outcome of the presumed experiment**.
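To make the synonym and complement handling concrete, the following sketch exercises one of the binary tests documented in this changeset with ``tpr`` standing in for ``sens`` and ``false_positive_rate`` standing in for the complement of ``spec``. The import path, the testset counts and the score values are illustrative assumptions rather than an excerpt from the repository:

.. code-block:: Python

    from mlscorecheck.check.binary import check_1_testset_no_kfold  # assumed import path

    result = check_1_testset_no_kfold(
        testset={"p": 10, "n": 20},            # hypothetical testset
        scores={"acc": 0.8,                    # accuracy
                "tpr": 0.9,                    # synonym of 'sens'
                "false_positive_rate": 0.25},  # complement of 'spec'
        eps=1e-2,                              # numerical uncertainty of the scores
    )
    print(result["inconsistency"])

Since ``tp = 9`` and ``tn = 15`` reproduce all three scores, the flag is expected to be ``False`` here; perturbing any one score beyond ``eps`` should flip it to ``True``.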

2 changes: 2 additions & 0 deletions docs/01a_requirements.rst
@@ -26,6 +26,8 @@ A simple binary classification testset consisting of ``p`` positive samples (usu
testset = {"p": 10, "n": 20}
We note that alternative notations, like using ``n_positive``, ``n_minority`` or ``n_1`` instead of ``p`` and similarly, ``n_negative``, ``n_majority`` and ``n_0`` instead of ``n`` are supported.

One can also specify a commonly used dataset by its name and the package will look up the ``p`` and ``n`` counts of the datasets from its internal registry (based on the representations in the ``common-datasets`` package):

.. code-block:: Python
13 changes: 12 additions & 1 deletion docs/01c_consistency_checking.rst
@@ -24,7 +24,18 @@ Depending on the experimental setup, the consistency tests developed for binary
* prevalence threshold (``pt``),
* diagnostic odds ratio (``dor``),
* Jaccard index (``ji``),
* Cohen's kappa (``kappa``)
* Cohen's kappa (``kappa``).

We note that synonyms and full names are also supported, for example:

* alternatives to ``sens`` are ``sensitivity``, ``true_positive_rate``, ``tpr`` and ``recall``,
* alternatives to ``spec`` are ``specificity``, ``true_negative_rate``, ``tnr`` and ``selectivity``,
* alternatives to ``ppv`` are ``positive_predictive_value`` and ``precision``.

Similarly, complements are supported as:

* one can specify ``false_positive_rate`` or ``fpr`` as a complement of ``spec``,
* and similarly, ``false_negative_rate`` or ``fnr`` as a complement of ``sens``.

The tests are designed to detect inconsistencies. If the resulting ``inconsistency`` flag is ``False``, the scores can still be calculated in non-standard ways. However, **if the resulting ``inconsistency`` flag is ``True``, it conclusively indicates that inconsistencies are detected, and the reported scores could not be the outcome of the presumed experiment**.

12 changes: 8 additions & 4 deletions mlscorecheck/aggregated/_fold_enumeration.py
@@ -370,7 +370,7 @@ def experiment_kfolds_generator(experiment: dict, available_scores: list):
"aggregation": experiment["aggregation"],
}

def multiclass_fold_partitioning_generator_22(n0: int, n1: int, c0: int) -> dict:
def multiclass_fold_partitioning_generator_22(n0: int, n1: int, c0: int):
"""
Generates the configurations for two folds of cardinalities n0 and n1 and two
classes of cardinalities c0 and n0 + n1 - c0
@@ -392,7 +392,7 @@ def multiclass_fold_partitioning_generator_22(n0: int, n1: int, c0: int) -> dict
1: (c0 - c_00, n1 - c0 + c_00)
}

def multiclass_fold_partitioning_generator_2n(n0: int, n1: int, cs: list) -> dict:
def multiclass_fold_partitioning_generator_2n(n0: int, n1: int, cs: list):
"""
Generates the configurations for two folds of cardinalities n0 and n1 and a list
of classes with sizes in cs
@@ -409,13 +409,17 @@ def multiclass_fold_partitioning_generator_2n(n0: int, n1: int, cs: list) -> dic
if len(cs) == 2:
yield part
else:
for part_deep in multiclass_fold_partitioning_generator_2n(part[0][1], part[1][1], cs[1:]):
for part_deep in multiclass_fold_partitioning_generator_2n(
part[0][1],
part[1][1],
cs[1:]
):
yield {
0: (part[0][0], *(part_deep[0])),
1: (part[1][0], *(part_deep[1]))
}

def multiclass_fold_partitioning_generator_kn(ns: list, cs: list) -> dict:
def multiclass_fold_partitioning_generator_kn(ns: list, cs: list):
"""
Generates the configurations for a list of folds of sizes ns and a list
of classes with sizes in cs
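The enumeration performed by ``multiclass_fold_partitioning_generator_22`` can be illustrated with a small standalone sketch (not the package's code): for two folds of sizes ``n0`` and ``n1`` and a first class of size ``c0``, every configuration is determined by the number ``c_00`` of class-0 samples placed in fold 0, matching the ``yield`` visible in the hunk above.

.. code-block:: Python

    def partition_two_folds_two_classes(n0: int, n1: int, c0: int):
        """Standalone sketch: enumerate the ways c0 class-0 samples can be
        distributed over two folds of sizes n0 and n1; class 1 fills the rest."""
        for c_00 in range(max(0, c0 - n1), min(n0, c0) + 1):
            yield {
                0: (c_00, n0 - c_00),            # (class 0, class 1) counts in fold 0
                1: (c0 - c_00, n1 - c0 + c_00),  # (class 0, class 1) counts in fold 1
            }

    # two folds of sizes 3 and 2, class 0 having 2 samples overall
    print(list(partition_two_folds_two_classes(3, 2, 2)))
    # [{0: (0, 3), 1: (2, 0)}, {0: (1, 2), 1: (1, 1)}, {0: (2, 1), 1: (0, 2)}]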
9 changes: 7 additions & 2 deletions mlscorecheck/check/binary/_check_1_dataset_kfold_som.py
@@ -5,7 +5,7 @@
"""

from ...core import NUMERICAL_TOLERANCE
from ...individual import check_scores_tptn_pairs
from ...individual import check_scores_tptn_pairs, translate_metadata
from ...aggregated import Experiment

__all__ = ["check_1_dataset_kfold_som"]
@@ -32,7 +32,10 @@ def check_1_dataset_kfold_som(
'f1', 'fm', 'f1n', 'fbp', 'fbn', 'upm', 'gm', 'mk', 'lrp', 'lrn',
'mcc', 'bm', 'pt', 'dor', 'ji', 'kappa'). When using f-beta
positive or f-beta negative, also set 'beta_positive' and
'beta_negative'.
'beta_negative'. Full names in snake case, like
'positive_predictive_value', synonyms, like 'true_positive_rate'
or 'tpr' instead of 'sens', and complements, like
'false_positive_rate' for (1 - 'spec'), can also be used.
eps (float|dict(str,float)): The numerical uncertainty(ies) of the scores.
numerical_tolerance (float, optional): In practice, beyond the numerical uncertainty of
the scores, some further tolerance is applied. This
@@ -90,6 +93,8 @@ def check_1_dataset_kfold_som(
# True
"""
folding = translate_metadata(folding)

if folding.get("folds") is None and folding.get("strategy") is None:
# any folding strategy results the same
folding = {**folding} | {"strategy": "stratified_sklearn"}
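With the metadata translation added above, the documented score synonyms apply to this test as well. A hypothetical invocation (the import path, the folding keys and all numeric values are illustrative assumptions, not taken from the repository):

.. code-block:: Python

    from mlscorecheck.check.binary import check_1_dataset_kfold_som  # assumed import path

    result = check_1_dataset_kfold_som(
        dataset={"p": 10, "n": 20},                     # hypothetical dataset
        folding={"n_folds": 5, "n_repeats": 1},         # assumed folding keys
        scores={"acc": 0.8, "tpr": 0.9, "spec": 0.75},  # 'tpr' used instead of 'sens'
        eps=1e-2,
    )
    print(result["inconsistency"])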
mlscorecheck/check/binary/_check_1_dataset_known_folds_mos.py
@@ -6,6 +6,7 @@

from ...core import NUMERICAL_TOLERANCE
from ...aggregated import check_aggregated_scores, Experiment, Evaluation
from ...individual import translate_metadata

__all__ = ["check_1_dataset_known_folds_mos"]

@@ -31,7 +32,10 @@ def check_1_dataset_known_folds_mos(
The test can only check the consistency of the 'acc', 'sens', 'spec' and 'bacc'
scores. For a stronger test, one can add ``fold_score_bounds`` when, for example, the minimum
and the maximum scores over the folds are also provided.
and the maximum scores over the folds are also provided. Full names in snake case, like
'positive_predictive_value', synonyms, like 'true_positive_rate'
or 'tpr' instead of 'sens', and complements, like
'false_positive_rate' for (1 - 'spec'), can also be used.
Args:
dataset (dict): The dataset specification.
@@ -105,6 +109,9 @@ def check_1_dataset_known_folds_mos(
# True
"""

dataset = translate_metadata(dataset)
folding = translate_metadata(folding)

evaluation = Evaluation(
dataset=dataset,
folding=folding,
mlscorecheck/check/binary/_check_1_dataset_unknown_folds_mos.py
@@ -5,6 +5,7 @@

from ...core import NUMERICAL_TOLERANCE
from ...aggregated import Dataset, repeated_kfolds_generator, kfolds_generator
from ...individual import translate_metadata
from ._check_1_dataset_known_folds_mos import check_1_dataset_known_folds_mos

__all__ = ["check_1_dataset_unknown_folds_mos", "estimate_n_evaluations"]
@@ -63,7 +64,10 @@ def check_1_dataset_unknown_folds_mos(
The test can only check the consistency of the 'acc', 'sens', 'spec' and 'bacc'
scores. For a stronger test, one can add fold_score_bounds when, for example, the minimum and
the maximum scores over the folds are also provided.
the maximum scores over the folds are also provided. Full names in snake case, like
'positive_predictive_value', synonyms, like 'true_positive_rate'
or 'tpr' instead of 'sens', and complements, like
'false_positive_rate' for (1 - 'spec'), can also be used.
Note that depending on the size of the dataset (especially the number of minority instances)
and the folding configuration, this test might lead to an intractable number of problems to
@@ -126,6 +130,9 @@ def check_1_dataset_unknown_folds_mos(
>>> result['inconsistency']
# True
"""
dataset = translate_metadata(dataset)
folding = translate_metadata(folding)

evaluation = {
"dataset": dataset,
"folding": folding,
10 changes: 8 additions & 2 deletions mlscorecheck/check/binary/_check_1_testset_no_kfold.py
@@ -6,7 +6,7 @@
import warnings

from ...core import logger, NUMERICAL_TOLERANCE
from ...individual import check_scores_tptn_pairs
from ...individual import check_scores_tptn_pairs, translate_metadata
from ...experiments import dataset_statistics

__all__ = ["check_1_testset_no_kfold"]
@@ -32,7 +32,11 @@ def check_1_testset_no_kfold(
'fbp', 'fbn', 'upm', 'gm', 'mk', 'lrp', 'lrn', 'mcc',
'bm', 'pt', 'dor', 'ji', 'kappa'), when using
f-beta positive or f-beta negative, also set
'beta_positive' and 'beta_negative'.
'beta_positive' and 'beta_negative'. Full names in snake case,
like 'positive_predictive_value', synonyms, like
'true_positive_rate' or 'tpr' instead of 'sens', and
complements, like 'false_positive_rate' for (1 - 'spec'), can
also be used.
eps (float|dict(str,float)): the numerical uncertainty (potentially for each score)
numerical_tolerance (float): in practice, beyond the numerical uncertainty of
the scores, some further tolerance is applied. This is
@@ -90,6 +94,8 @@ def check_1_testset_no_kfold(
"no aggregation of any kind."
)

testset = translate_metadata(testset)

if ("p" not in testset or "n" not in testset) and ("name" not in testset):
raise ValueError('either "p" and "n" or "name" should be specified')

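The ``translate_metadata`` helper imported above is what lets the testset (and, in the other tests, the dataset and folding) specifications use the documented synonyms. A simplified standalone sketch of this kind of key normalization (not the package's implementation; the synonym table is only an assumed subset of what the README documents):

.. code-block:: Python

    # assumed subset of the synonyms documented in the README
    SYNONYMS = {
        "n_positive": "p", "n_minority": "p", "n_1": "p",
        "n_negative": "n", "n_majority": "n", "n_0": "n",
    }

    def normalize_metadata(spec):
        """Recursively rename synonym keys in dict/list specifications."""
        if isinstance(spec, dict):
            return {SYNONYMS.get(key, key): normalize_metadata(value)
                    for key, value in spec.items()}
        if isinstance(spec, list):
            return [normalize_metadata(item) for item in spec]
        return spec

    print(normalize_metadata({"n_positive": 10, "n_negative": 20}))
    # {'p': 10, 'n': 20}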
8 changes: 7 additions & 1 deletion mlscorecheck/check/binary/_check_n_datasets_mos_kfold_som.py
@@ -7,6 +7,7 @@
import copy

from ...aggregated import check_aggregated_scores, Experiment
from ...individual import translate_metadata
from ...core import NUMERICAL_TOLERANCE

__all__ = ["check_n_datasets_mos_kfold_som"]
@@ -33,7 +34,10 @@ def check_n_datasets_mos_kfold_som(
The test can only check the consistency of the 'acc', 'sens', 'spec' and 'bacc'
scores. For a stronger test, one can add ``dataset_score_bounds`` when, for example, the minimum
and the maximum scores over the datasets are also provided.
and the maximum scores over the datasets are also provided. Full names in snake case, like
'positive_predictive_value', synonyms, like 'true_positive_rate'
or 'tpr' instead of 'sens', and complements, like
'false_positive_rate' for (1 - 'spec'), can also be used.
Args:
evaluations (list(dict)): the list of evaluation specifications
@@ -105,6 +109,8 @@ def check_n_datasets_mos_kfold_som(
# True
"""

evaluations = translate_metadata(evaluations)

if any(evaluation.get("aggregation", "som") != "som" for evaluation in evaluations):
raise ValueError(
'the aggregation specified in each dataset must be "som" or nothing.'
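For the multi-dataset tests such as ``check_n_datasets_mos_kfold_som``, each entry of ``evaluations`` bundles a dataset and a folding specification, mirroring the evaluation dictionary built in ``check_1_dataset_unknown_folds_mos`` above. A hypothetical structure (the folding keys and all numeric values are illustrative assumptions):

.. code-block:: Python

    # hypothetical list of evaluation specifications for a two-dataset experiment
    evaluations = [
        {"dataset": {"p": 10, "n": 20},
         "folding": {"n_folds": 5, "n_repeats": 1}},   # assumed folding keys
        {"dataset": {"p": 30, "n": 35},
         "folding": {"n_folds": 3, "n_repeats": 2}},
    ]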
mlscorecheck/check/binary/_check_n_datasets_mos_known_folds_mos.py
@@ -7,6 +7,7 @@
import copy

from ...aggregated import check_aggregated_scores, Experiment
from ...individual import translate_metadata
from ...core import NUMERICAL_TOLERANCE

__all__ = ["check_n_datasets_mos_known_folds_mos"]
@@ -33,7 +34,9 @@ def check_n_datasets_mos_known_folds_mos(
The test can only check the consistency of the 'acc', 'sens', 'spec' and 'bacc'
scores. For a stronger test, one can add ``dataset_score_bounds`` when, for example, the
minimum and the maximum scores over the datasets are also provided.
minimum and the maximum scores over the datasets are also provided. Full names in snake case,
like 'positive_predictive_value', synonyms, like 'true_positive_rate' or 'tpr' instead of
'sens', and complements, like 'false_positive_rate' for (1 - 'spec'), can also be used.
Args:
evaluations (list): The list of evaluation specifications.
@@ -107,6 +110,8 @@ def check_n_datasets_mos_known_folds_mos(
):
raise ValueError("do not specify fold_score_bounds through this interface")

evaluations = translate_metadata(evaluations)

evaluations = copy.deepcopy(evaluations)

for evaluation in evaluations:
mlscorecheck/check/binary/_check_n_datasets_mos_unknown_folds_mos.py
@@ -13,6 +13,7 @@
from ._check_1_dataset_unknown_folds_mos import estimate_n_evaluations
from ...core import NUMERICAL_TOLERANCE
from ...aggregated import experiment_kfolds_generator
from ...individual import translate_metadata

__all__ = ["check_n_datasets_mos_unknown_folds_mos", "estimate_n_experiments"]

@@ -27,6 +28,9 @@ def estimate_n_experiments(evaluations: list, available_scores: list = None) ->
Returns:
int: the estimated number of different fold configurations.
"""

evaluations = translate_metadata(evaluations)

available_scores = [] if available_scores is None else available_scores

counts = [
@@ -63,7 +67,10 @@ def check_n_datasets_mos_unknown_folds_mos(
The test can only check the consistency of the 'acc', 'sens', 'spec' and 'bacc'
scores. For a stronger test, one can add dataset_score_bounds when, for example, the minimum and
the maximum scores over the datasets are also provided.
the maximum scores over the datasets are also provided. Full names in snake case, like
'positive_predictive_value', synonyms, like 'true_positive_rate'
or 'tpr' instead of 'sens', and complements, like
'false_positive_rate' for (1 - 'spec'), can also be used.
Note that depending on the size of the dataset (especially the number of minority instances)
and the folding configuration, this test might lead to an intractable number of problems to
@@ -130,6 +137,8 @@ def check_n_datasets_mos_unknown_folds_mos(
>>> result['inconsistency']
# True
"""
evaluations = translate_metadata(evaluations)

if any(evaluation.get("aggregation", "mos") != "mos" for evaluation in evaluations):
raise ValueError(
'the aggregation specified in each dataset must be "mos" or nothing.'
10 changes: 8 additions & 2 deletions mlscorecheck/check/binary/_check_n_datasets_som_kfold_som.py
@@ -7,7 +7,7 @@
import copy

from ...core import NUMERICAL_TOLERANCE
from ...individual import check_scores_tptn_pairs
from ...individual import check_scores_tptn_pairs, translate_metadata
from ...aggregated import Experiment

__all__ = ["check_n_datasets_som_kfold_som"]
@@ -34,7 +34,11 @@ def check_n_datasets_som_kfold_som(
'fbp', 'fbn', 'upm', 'gm', 'mk', 'lrp', 'lrn', 'mcc',
'bm', 'pt', 'dor', 'ji', 'kappa'), when using
f-beta positive or f-beta negative, also set
'beta_positive' and 'beta_negative'.
'beta_positive' and 'beta_negative'. Full names in snake case,
like 'positive_predictive_value', synonyms, like
'true_positive_rate' or 'tpr' instead of 'sens', and
complements, like 'false_positive_rate' for (1 - 'spec'), can
also be used.
eps (float|dict(str,float)): the numerical uncertainty(ies) of the scores
numerical_tolerance (float): in practice, beyond the numerical uncertainty of
the scores, some further tolerance is applied. This is
@@ -97,6 +101,8 @@ def check_n_datasets_som_kfold_som(
>>> result['inconsistency']
# True
"""
evaluations = translate_metadata(evaluations)

if any(evaluation.get("aggregation", "som") != "som" for evaluation in evaluations):
raise ValueError(
"the aggregation specifications cannot be anything else but 'rom'"
8 changes: 7 additions & 1 deletion mlscorecheck/check/binary/_check_n_testsets_mos_no_kfold.py
@@ -5,6 +5,7 @@
"""

from ...aggregated import check_aggregated_scores, Experiment, Dataset
from ...individual import translate_metadata
from ...core import NUMERICAL_TOLERANCE

__all__ = ["check_n_testsets_mos_no_kfold"]
@@ -30,7 +31,10 @@ def check_n_testsets_mos_no_kfold(
The test can only check the consistency of the 'acc', 'sens', 'spec' and 'bacc'
scores. For a stronger test, one can add ``testset_score_bounds`` when, for example, the minimum
and the maximum scores over the testsets are also provided.
and the maximum scores over the testsets are also provided. Full names in snake case, like
'positive_predictive_value', synonyms, like 'true_positive_rate'
or 'tpr' instead of 'sens', and complements, like
'false_positive_rate' for (1 - 'spec'), can also be used.
Args:
testsets (list(dict)): the list of testset specifications
@@ -90,6 +94,8 @@ def check_n_testsets_mos_no_kfold(
# True
"""

testsets = translate_metadata(testsets)

datasets = [Dataset(**dataset) for dataset in testsets]

evaluations = [
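Analogously to the single-testset case, ``check_n_testsets_mos_no_kfold`` takes a list of testset specifications and scores aggregated in the mean-of-scores sense. A hypothetical invocation (the import path and all values are illustrative assumptions):

.. code-block:: Python

    from mlscorecheck.check.binary import check_n_testsets_mos_no_kfold  # assumed import path

    result = check_n_testsets_mos_no_kfold(
        testsets=[{"p": 10, "n": 20}, {"p": 30, "n": 35}],  # hypothetical testsets
        scores={"acc": 0.8, "sens": 0.9, "spec": 0.75},     # assumed means over the testsets
        eps=1e-2,
    )
    print(result["inconsistency"])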