From 53b4ce3ca90d1f5f1e09bd6737eae7ac712a5c56 Mon Sep 17 00:00:00 2001
From: MFreidank
Date: Mon, 22 Jun 2020 09:56:48 +0200
Subject: [PATCH 1/7] patch {sparse_,}softmax_cross_entropy_with_logits

Tied to open tensorflow issue #38185. Non-determinism has been reported
in the backprop of the fused implementation of
`{sparse_,}softmax_cross_entropy_with_logits`. This work provides a
patch by routing calls to a deterministic workaround first described in
the tensorflow issue above.
---
 tfdeterminism/patch.py | 109 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 109 insertions(+)

diff --git a/tfdeterminism/patch.py b/tfdeterminism/patch.py
index eaa7442..ab485b2 100644
--- a/tfdeterminism/patch.py
+++ b/tfdeterminism/patch.py
@@ -70,6 +70,7 @@ def _patch():
   if re.match("(1\.(14|15)|2\.0)", tf_version):
     os.environ['TF_CUDNN_DETERMINISTIC'] = '1'
     _patch_bias_add()
+    _patch_fused_softmax_cross_entropy()
     print("TensorFlow version %s has been patched "
           "using tfdeterminism version %s" %
           (tf_version, __version__), file=sys.stderr)
@@ -77,6 +78,7 @@ def _patch():
     raise TypeError("tfdeterminism: No patch available "
                     "for version %s of TensorFlow" % tf_version)
 
+
 def _patch_bias_add():
   tf.nn.bias_add = _new_bias_add_1_14 # access via public API
   nn.bias_add = _new_bias_add_1_14 # called from tf.keras.layers.convolutional.Conv
@@ -135,3 +137,110 @@ def _new_bias_add_1_14(value, bias, data_format=None, name=None):
         value, array_ops.reshape(bias, broadcast_shape), name=name)
   else: # data_format == 'NHWC' or data_format == None
     return math_ops.add(value, bias, name=name)
+
+
+def _patch_fused_softmax_cross_entropy():
+  # Non-sparse
+  tf.nn.softmax_cross_entropy_with_logits = _new_softmax_cross_entropy_with_logits_1_14 # access via public API
+  nn.softmax_cross_entropy_with_logits = _new_softmax_cross_entropy_with_logits_1_14 # called from tf.keras backend
+  nn_ops.softmax_cross_entropy_with_logits = _new_softmax_cross_entropy_with_logits_1_14 # called from tests
+
+  # Sparse
+  tf.nn.sparse_softmax_cross_entropy_with_logits = _new_sparse_softmax_cross_entropy_with_logits_1_14 # access via public API
+  nn.sparse_softmax_cross_entropy_with_logits = _new_sparse_softmax_cross_entropy_with_logits_1_14 # called from tf.keras backend
+  nn_ops.sparse_softmax_cross_entropy_with_logits = _new_sparse_softmax_cross_entropy_with_logits_1_14 # called from tests
+
+
+# The original, pre-patched method can be viewed at
+# https://github.com/tensorflow/tensorflow/blob/v1.14.0/tensorflow/python/ops/nn_ops.py#L2628
+def _new_softmax_cross_entropy_with_logits_1_14(labels, logits, axis=-1, name=None):
+  """Computes softmax cross entropy between `logits` and `labels`.
+  Measures the probability error in discrete classification tasks in which the
+  classes are mutually exclusive (each entry is in exactly one class). For
+  example, each CIFAR-10 image is labeled with one and only one label: an image
+  can be a dog or a truck, but not both.
+  **NOTE:** While the classes are mutually exclusive, their probabilities
+  need not be. All that is required is that each row of `labels` is
+  a valid probability distribution. If they are not, the computation of the
+  gradient will be incorrect.
+  If using exclusive `labels` (wherein one and only
+  one class is true at a time), see `sparse_softmax_cross_entropy_with_logits`.
+  **WARNING:** This op expects unscaled logits, since it performs a `softmax`
+  on `logits` internally for efficiency. Do not call this op with the
+  output of `softmax`, as it will produce incorrect results.
+  A common use case is to have logits and labels of shape
+  `[batch_size, num_classes]`, but higher dimensions are supported, with
+  the `axis` argument specifying the class dimension.
+  Backpropagation will happen only into `logits`. To calculate a cross entropy
+  loss that allows backpropagation into both `logits` and `labels`, see
+  `tf.nn.softmax_cross_entropy_with_logits_v2`.
+  Args:
+    labels: Each vector along the class dimension should hold a valid
+      probability distribution e.g. for the case in which labels are of shape
+      `[batch_size, num_classes]`, each row of `labels[i]` must be a valid
+      probability distribution.
+    logits: Per-label activations, typically a linear output. These activation
+      energies are interpreted as unnormalized log probabilities.
+    axis: The class dimension. Defaulted to -1 which is the last dimension.
+    name: A name for the operation (optional).
+  Returns:
+    A `Tensor` that contains the softmax cross entropy loss. Its type is the
+    same as `logits` and its shape is the same as `labels` except that it does
+    not have the last dimension of `labels`.
+  """
+  raise NotImplementedError()
+
+
+# The original, pre-patched method can be viewed at
+# https://github.com/tensorflow/tensorflow/blob/v1.14.0/tensorflow/python/ops/nn_ops.py#L2628
+def _new_sparse_softmax_cross_entropy_with_logits_1_14(
+    _sentinel=None, # pylint: disable=invalid-name
+    labels=None,
+    logits=None,
+    name=None):
+  """Computes sparse softmax cross entropy between `logits` and `labels`.
+  Measures the probability error in discrete classification tasks in which the
+  classes are mutually exclusive (each entry is in exactly one class). For
+  example, each CIFAR-10 image is labeled with one and only one label: an image
+  can be a dog or a truck, but not both.
+  **NOTE:** For this operation, the probability of a given label is considered
+  exclusive. That is, soft classes are not allowed, and the `labels` vector
+  must provide a single specific index for the true class for each row of
+  `logits` (each minibatch entry). For soft softmax classification with
+  a probability distribution for each entry, see
+  `softmax_cross_entropy_with_logits_v2`.
+  **WARNING:** This op expects unscaled logits, since it performs a `softmax`
+  on `logits` internally for efficiency. Do not call this op with the
+  output of `softmax`, as it will produce incorrect results.
+  A common use case is to have logits of shape
+  `[batch_size, num_classes]` and have labels of shape
+  `[batch_size]`, but higher dimensions are supported, in which
+  case the `dim`-th dimension is assumed to be of size `num_classes`.
+  `logits` must have the dtype of `float16`, `float32`, or `float64`, and
+  `labels` must have the dtype of `int32` or `int64`.
+  **Note that to avoid confusion, it is required to pass only named arguments to
+  this function.**
+  Args:
+    _sentinel: Used to prevent positional parameters. Internal, do not use.
+    labels: `Tensor` of shape `[d_0, d_1, ..., d_{r-1}]` (where `r` is rank of
+      `labels` and result) and dtype `int32` or `int64`. Each entry in `labels`
+      must be an index in `[0, num_classes)`. Other values will raise an
+      exception when this op is run on CPU, and return `NaN` for corresponding
+      loss and gradient rows on GPU.
+    logits: Per-label activations (typically a linear output) of shape
+      `[d_0, d_1, ..., d_{r-1}, num_classes]` and dtype `float16`, `float32`,
+      or `float64`. These activation energies are interpreted as unnormalized
+      log probabilities.
+    name: A name for the operation (optional).
+  Returns:
+    A `Tensor` of the same shape as `labels` and of the same type as `logits`
+    with the softmax cross entropy loss.
+  Raises:
+    ValueError: If logits are scalars (need to have rank >= 1) or if the rank
+      of the labels is not equal to the rank of the logits minus one.
+  """
+  raise NotImplementedError()
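A note on the stubs above: both replacement functions are introduced as
`raise NotImplementedError()` placeholders at this stage. For orientation, a
minimal sketch of the deterministic workaround described in tensorflow issue
#38185, which composes the loss from `log_softmax` and a reduction instead of
the fused kernel, could look like the following (the helper name is
illustrative and not part of this patch):

    import tensorflow as tf

    # Sketch only: build softmax cross entropy from deterministic ops,
    # avoiding the fused SoftmaxCrossEntropyWithLogits kernel whose GPU
    # backward pass is non-deterministic.
    def _deterministic_softmax_cross_entropy_with_logits(labels, logits,
                                                         axis=-1):
      # The fused op does not backpropagate into `labels`; mirror that here.
      labels = tf.stop_gradient(labels)
      log_probs = tf.nn.log_softmax(logits, axis=axis)
      # Cross entropy: negated sum of labels * log(softmax) over the class axis.
      return -tf.reduce_sum(labels * log_probs, axis=axis)

Routing the three aliases patched above through a wrapper of this form trades
the fused kernel's speed for reproducible gradients.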
From aeb70ac21d12ece42b9cfffeb3fdf2c09e01f5f4 Mon Sep 17 00:00:00 2001
From: MFreidank
Date: Mon, 22 Jun 2020 10:02:51 +0200
Subject: [PATCH 2/7] fix: line no for link to previous implementation

---
 tfdeterminism/patch.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tfdeterminism/patch.py b/tfdeterminism/patch.py
index ab485b2..05f82d2 100644
--- a/tfdeterminism/patch.py
+++ b/tfdeterminism/patch.py
@@ -152,7 +152,7 @@ def _patch_fused_softmax_cross_entropy():
 
 
 # The original, pre-patched method can be viewed at
-# https://github.com/tensorflow/tensorflow/blob/v1.14.0/tensorflow/python/ops/nn_ops.py#L2628
+# https://github.com/tensorflow/tensorflow/blob/v1.14.0/tensorflow/python/ops/nn_ops.py#L3182
 def _new_softmax_cross_entropy_with_logits_1_14(labels, logits, axis=-1, name=None):
   """Computes softmax cross entropy between `logits` and `labels`.
   Measures the probability error in discrete classification tasks in which the
@@ -196,7 +196,7 @@ def _new_softmax_cross_entropy_with_logits_1_14(labels, logits, axis=-1, name=No
 
 
 # The original, pre-patched method can be viewed at
-# https://github.com/tensorflow/tensorflow/blob/v1.14.0/tensorflow/python/ops/nn_ops.py#L2628
+# https://github.com/tensorflow/tensorflow/blob/v1.14.0/tensorflow/python/ops/nn_ops.py#L3249
 def _new_sparse_softmax_cross_entropy_with_logits_1_14(
     _sentinel=None, # pylint: disable=invalid-name
     labels=None,

From bcc8475ffbe5a4837fe9aa596a515c9a0c4f6557 Mon Sep 17 00:00:00 2001
From: MFreidank
Date: Mon, 22 Jun 2020 10:09:55 +0200
Subject: [PATCH 3/7] make this patch apply to tf 2.1 and 2.2 as well

---
 tfdeterminism/patch.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/tfdeterminism/patch.py b/tfdeterminism/patch.py
index 05f82d2..4348bca 100644
--- a/tfdeterminism/patch.py
+++ b/tfdeterminism/patch.py
@@ -74,6 +74,11 @@ def _patch():
     print("TensorFlow version %s has been patched "
           "using tfdeterminism version %s" %
           (tf_version, __version__), file=sys.stderr)
+  elif re.match("2\.1|2\.2"):
+    _patch_fused_softmax_cross_entropy()
+    print("TensorFlow version %s has been patched "
+          "using tfdeterminism version %s" %
+          (tf_version, __version__), file=sys.stderr)
   else:
     raise TypeError("tfdeterminism: No patch available "
                     "for version %s of TensorFlow" % tf_version)

From f7d6b86feb11ad8f4317c8d807d097d56782ec61 Mon Sep 17 00:00:00 2001
From: MFreidank
Date: Mon, 22 Jun 2020 10:12:09 +0200
Subject: [PATCH 4/7] fix: amend mistake in re.match(...) statement

---
 tfdeterminism/patch.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tfdeterminism/patch.py b/tfdeterminism/patch.py
index 4348bca..d6e9964 100644
--- a/tfdeterminism/patch.py
+++ b/tfdeterminism/patch.py
@@ -74,7 +74,7 @@ def _patch():
     print("TensorFlow version %s has been patched "
           "using tfdeterminism version %s" %
           (tf_version, __version__), file=sys.stderr)
-  elif re.match("2\.1|2\.2"):
+  elif re.match("2\.1|2\.2", tf_version):
     _patch_fused_softmax_cross_entropy()
     print("TensorFlow version %s has been patched "
           "using tfdeterminism version %s" %
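The sparse stub can follow the same deterministic path once the integer class
indices are expanded to one-hot vectors. A sketch under the same caveats as
above (illustrative name; the `_sentinel` handling and the input validation
performed by the fused op are omitted):

    import tensorflow as tf

    # Sketch only: deterministic sparse variant via one-hot expansion.
    def _deterministic_sparse_softmax_cross_entropy_with_logits(labels,
                                                                logits):
      num_classes = tf.shape(logits)[-1]
      onehot_labels = tf.one_hot(labels, depth=num_classes,
                                 dtype=logits.dtype)
      log_probs = tf.nn.log_softmax(logits, axis=-1)
      # NOTE: unlike the fused op, an out-of-range label yields an all-zero
      # one-hot row here, i.e. a loss of 0 rather than an error or NaN.
      return -tf.reduce_sum(onehot_labels * log_probs, axis=-1)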
From 75ce3db9fc1b5c6ff3cf49368b085de6181b666e Mon Sep 17 00:00:00 2001
From: MFreidank
Date: Tue, 23 Jun 2020 14:58:30 +0200
Subject: [PATCH 5/7] twd: softmax entropy patch in enable_determinism

---
 tfdeterminism/enable_determinism.py |  10 +--
 tfdeterminism/patch.py              | 110 +++++++++++++++++++++++++++-
 2 files changed, 114 insertions(+), 6 deletions(-)

diff --git a/tfdeterminism/enable_determinism.py b/tfdeterminism/enable_determinism.py
index 354bbd4..96ecec5 100644
--- a/tfdeterminism/enable_determinism.py
+++ b/tfdeterminism/enable_determinism.py
@@ -21,7 +21,7 @@
 
 import tensorflow as tf
 
-from .patch import _patch_bias_add
+from .patch import _patch_bias_add, _patch_fused_softmax_cross_entropy
 from .utils import _Version as Version
 
 def _enable_determinism(seed=None):
@@ -31,7 +31,7 @@ def _enable_determinism(seed=None):
   Call this method either before or after explicitly importing TensorFlow,
   but always before constructing any graphs.
 
-  This function cannot address all possible sources of non-determinism. Please
+  This function cannot address all possible sources of non-determinism. Please
   see further instructions at https://github.com/NVIDIA/tensorflow-determinism
   to understand how to use it in a larger deterministic context.
 
@@ -52,7 +52,7 @@ def _enable_determinism(seed=None):
     _patch_bias_add()
   if in_ngc_cont and ngc_vers.at_least('19.06') or tf_vers.at_least('2.1'):
     os.environ['TF_DETERMINISTIC_OPS'] = '1'
+    # TODO: Add patch crossentropy here as well? Issue seems to still be present on tf 2.1, 2.2
   if in_ngc_cont and ngc_vers.at_least('19.06') or tf_vers.at_least('1.14'):
-    # Apply the fused softmax/cross-entropy patch here
-    pass
-  # TODO: Add other recipe items
+    _patch_fused_softmax_cross_entropy()
+  # TODO: Add other recipe items

diff --git a/tfdeterminism/patch.py b/tfdeterminism/patch.py
index d4d3118..30fb12c 100644
--- a/tfdeterminism/patch.py
+++ b/tfdeterminism/patch.py
@@ -70,7 +70,7 @@ def _patch():
   if re.match("(1\.(14|15)|2\.0)", tf_version):
     os.environ['TF_CUDNN_DETERMINISTIC'] = '1'
     _patch_bias_add()
-    # Apply the fused softmax/cross-entropy patch here
+    _patch_fused_softmax_cross_entropy()
     print("TensorFlow version %s has been patched "
           "using tfdeterminism version %s" %
           (tf_version, __version__), file=sys.stderr)
@@ -78,6 +78,7 @@ def _patch():
     raise TypeError("tfdeterminism: No patch available "
                     "for version %s of TensorFlow" % tf_version)
 
+
 def _patch_bias_add():
   tf.nn.bias_add = _new_bias_add_1_14 # access via public API
   nn.bias_add = _new_bias_add_1_14 # called from tf.keras.layers.convolutional.Conv
@@ -136,3 +137,110 @@ def _new_bias_add_1_14(value, bias, data_format=None, name=None):
         value, array_ops.reshape(bias, broadcast_shape), name=name)
   else: # data_format == 'NHWC' or data_format == None
     return math_ops.add(value, bias, name=name)
+
+
+def _patch_fused_softmax_cross_entropy():
+  # Non-sparse
+  tf.nn.softmax_cross_entropy_with_logits = _new_softmax_cross_entropy_with_logits # access via public API
+  nn.softmax_cross_entropy_with_logits = _new_softmax_cross_entropy_with_logits # called from tf.keras backend
+  nn_ops.softmax_cross_entropy_with_logits = _new_softmax_cross_entropy_with_logits # called from tests
+
+  # Sparse
+  tf.nn.sparse_softmax_cross_entropy_with_logits = _new_sparse_softmax_cross_entropy_with_logits # access via public API
+  nn.sparse_softmax_cross_entropy_with_logits = _new_sparse_softmax_cross_entropy_with_logits # called from tf.keras backend
+  nn_ops.sparse_softmax_cross_entropy_with_logits = _new_sparse_softmax_cross_entropy_with_logits # called from tests
+
+
+# The original, pre-patched method can be viewed at
+# https://github.com/tensorflow/tensorflow/blob/v1.14.0/tensorflow/python/ops/nn_ops.py#L3182
+def _new_softmax_cross_entropy_with_logits(labels, logits, axis=-1, name=None):
+  """Computes softmax cross entropy between `logits` and `labels`.
+  Measures the probability error in discrete classification tasks in which the
+  classes are mutually exclusive (each entry is in exactly one class). For
+  example, each CIFAR-10 image is labeled with one and only one label: an image
+  can be a dog or a truck, but not both.
+  **NOTE:** While the classes are mutually exclusive, their probabilities
+  need not be. All that is required is that each row of `labels` is
+  a valid probability distribution. If they are not, the computation of the
+  gradient will be incorrect.
+  If using exclusive `labels` (wherein one and only
+  one class is true at a time), see `sparse_softmax_cross_entropy_with_logits`.
+  **WARNING:** This op expects unscaled logits, since it performs a `softmax`
+  on `logits` internally for efficiency. Do not call this op with the
+  output of `softmax`, as it will produce incorrect results.
+  A common use case is to have logits and labels of shape
+  `[batch_size, num_classes]`, but higher dimensions are supported, with
+  the `axis` argument specifying the class dimension.
+  Backpropagation will happen only into `logits`. To calculate a cross entropy
+  loss that allows backpropagation into both `logits` and `labels`, see
+  `tf.nn.softmax_cross_entropy_with_logits_v2`.
+  Args:
+    labels: Each vector along the class dimension should hold a valid
+      probability distribution e.g. for the case in which labels are of shape
+      `[batch_size, num_classes]`, each row of `labels[i]` must be a valid
+      probability distribution.
+    logits: Per-label activations, typically a linear output. These activation
+      energies are interpreted as unnormalized log probabilities.
+    axis: The class dimension. Defaulted to -1 which is the last dimension.
+    name: A name for the operation (optional).
+  Returns:
+    A `Tensor` that contains the softmax cross entropy loss. Its type is the
+    same as `logits` and its shape is the same as `labels` except that it does
+    not have the last dimension of `labels`.
+  """
+  raise NotImplementedError()
+
+
+# The original, pre-patched method can be viewed at
+# https://github.com/tensorflow/tensorflow/blob/v1.14.0/tensorflow/python/ops/nn_ops.py#L3249
+def _new_sparse_softmax_cross_entropy_with_logits(
+    _sentinel=None, # pylint: disable=invalid-name
+    labels=None,
+    logits=None,
+    name=None):
+  """Computes sparse softmax cross entropy between `logits` and `labels`.
+  Measures the probability error in discrete classification tasks in which the
+  classes are mutually exclusive (each entry is in exactly one class). For
+  example, each CIFAR-10 image is labeled with one and only one label: an image
+  can be a dog or a truck, but not both.
+  **NOTE:** For this operation, the probability of a given label is considered
+  exclusive. That is, soft classes are not allowed, and the `labels` vector
+  must provide a single specific index for the true class for each row of
+  `logits` (each minibatch entry). For soft softmax classification with
+  a probability distribution for each entry, see
+  `softmax_cross_entropy_with_logits_v2`.
+  **WARNING:** This op expects unscaled logits, since it performs a `softmax`
+  on `logits` internally for efficiency. Do not call this op with the
+  output of `softmax`, as it will produce incorrect results.
+  A common use case is to have logits of shape
+  `[batch_size, num_classes]` and have labels of shape
+  `[batch_size]`, but higher dimensions are supported, in which
+  case the `dim`-th dimension is assumed to be of size `num_classes`.
+  `logits` must have the dtype of `float16`, `float32`, or `float64`, and
+  `labels` must have the dtype of `int32` or `int64`.
+  **Note that to avoid confusion, it is required to pass only named arguments to
+  this function.**
+  Args:
+    _sentinel: Used to prevent positional parameters. Internal, do not use.
+    labels: `Tensor` of shape `[d_0, d_1, ..., d_{r-1}]` (where `r` is rank of
+      `labels` and result) and dtype `int32` or `int64`. Each entry in `labels`
+      must be an index in `[0, num_classes)`. Other values will raise an
+      exception when this op is run on CPU, and return `NaN` for corresponding
+      loss and gradient rows on GPU.
+    logits: Per-label activations (typically a linear output) of shape
+      `[d_0, d_1, ..., d_{r-1}, num_classes]` and dtype `float16`, `float32`,
+      or `float64`. These activation energies are interpreted as unnormalized
+      log probabilities.
+    name: A name for the operation (optional).
+  Returns:
+    A `Tensor` of the same shape as `labels` and of the same type as `logits`
+    with the softmax cross entropy loss.
+  Raises:
+    ValueError: If logits are scalars (need to have rank >= 1) or if the rank
+      of the labels is not equal to the rank of the logits minus one.
+  """
+  raise NotImplementedError()
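Since the point of this series is reproducible backprop, a useful sanity check
is to run the backward pass repeatedly on identical inputs and compare the
gradients bitwise; with the fused kernel this check typically fails on GPU,
while a deterministic replacement should pass it. A minimal eager-mode
(TF 2.x) sketch, assuming `loss_fn` accepts `labels` and `logits` keyword
arguments:

    import numpy as np
    import tensorflow as tf

    # Sketch only: repeat the backward pass on identical inputs and check
    # that all resulting gradients are bitwise identical.
    def gradients_are_deterministic(loss_fn, labels, logits_value, trials=5):
      grads = []
      for _ in range(trials):
        logits = tf.constant(logits_value)
        with tf.GradientTape() as tape:
          tape.watch(logits)
          loss = tf.reduce_sum(loss_fn(labels=labels, logits=logits))
        grads.append(tape.gradient(loss, logits).numpy())
      return all(np.array_equal(grads[0], g) for g in grads[1:])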
From 231c7c53ee2d15197779ede88eee416499ab304e Mon Sep 17 00:00:00 2001
From: MFreidank
Date: Mon, 22 Jun 2020 10:09:55 +0200
Subject: [PATCH 6/7] make this patch apply to tf 2.1 and 2.2 as well

---
 tfdeterminism/patch.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/tfdeterminism/patch.py b/tfdeterminism/patch.py
index 30fb12c..7a211ef 100644
--- a/tfdeterminism/patch.py
+++ b/tfdeterminism/patch.py
@@ -74,6 +74,11 @@ def _patch():
     print("TensorFlow version %s has been patched "
           "using tfdeterminism version %s" %
           (tf_version, __version__), file=sys.stderr)
+  elif re.match("2\.1|2\.2"):
+    _patch_fused_softmax_cross_entropy()
+    print("TensorFlow version %s has been patched "
+          "using tfdeterminism version %s" %
+          (tf_version, __version__), file=sys.stderr)
   else:
     raise TypeError("tfdeterminism: No patch available "
                     "for version %s of TensorFlow" % tf_version)

From f22bcc9010603cd6b03448567befa7e04979b20c Mon Sep 17 00:00:00 2001
From: MFreidank
Date: Mon, 22 Jun 2020 10:12:09 +0200
Subject: [PATCH 7/7] fix: amend mistake in re.match(...) statement

---
 tfdeterminism/patch.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tfdeterminism/patch.py b/tfdeterminism/patch.py
index 7a211ef..c85e1f6 100644
--- a/tfdeterminism/patch.py
+++ b/tfdeterminism/patch.py
@@ -74,7 +74,7 @@ def _patch():
     print("TensorFlow version %s has been patched "
           "using tfdeterminism version %s" %
           (tf_version, __version__), file=sys.stderr)
-  elif re.match("2\.1|2\.2"):
+  elif re.match("2\.1|2\.2", tf_version):
     _patch_fused_softmax_cross_entropy()
     print("TensorFlow version %s has been patched "
           "using tfdeterminism version %s" %
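With the version gates above in place, the workaround is exercised through the
library's existing entry point; per the project README, the patch is applied
once, before any graphs are constructed:

    import tensorflow as tf
    from tfdeterminism import patch

    # Routes bias_add and {sparse_,}softmax_cross_entropy_with_logits to
    # deterministic implementations, according to the version gates above.
    patch()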