From 53b4ce3ca90d1f5f1e09bd6737eae7ac712a5c56 Mon Sep 17 00:00:00 2001
From: MFreidank
Date: Mon, 22 Jun 2020 09:56:48 +0200
Subject: [PATCH 1/7] patch {sparse_,}softmax_cross_entropy_with_logits

Tied to open tensorflow issue #38185. Non-determinism has been reported
in the backprop of the fused implementation of
`{sparse_,}softmax_cross_entropy_with_logits`. This work provides a
patch by routing calls to a deterministic workaround first described in
the tensorflow issue above.
---
 tfdeterminism/patch.py | 109 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 109 insertions(+)

diff --git a/tfdeterminism/patch.py b/tfdeterminism/patch.py
index eaa7442..ab485b2 100644
--- a/tfdeterminism/patch.py
+++ b/tfdeterminism/patch.py
@@ -70,6 +70,7 @@ def _patch():
   if re.match("(1\.(14|15)|2\.0)", tf_version):
     os.environ['TF_CUDNN_DETERMINISTIC'] = '1'
     _patch_bias_add()
+    _patch_fused_softmax_cross_entropy()
     print("TensorFlow version %s has been patched "
           "using tfdeterminism version %s" %
           (tf_version, __version__), file=sys.stderr)
@@ -77,6 +78,7 @@ def _patch():
     raise TypeError("tfdeterminism: No patch available "
                     "for version %s of TensorFlow" % tf_version)
 
+
 def _patch_bias_add():
   tf.nn.bias_add = _new_bias_add_1_14 # access via public API
   nn.bias_add = _new_bias_add_1_14 # called from tf.keras.layers.convolutional.Conv
@@ -135,3 +137,110 @@ def _new_bias_add_1_14(value, bias, data_format=None, name=None):
         value, array_ops.reshape(bias, broadcast_shape), name=name)
   else: # data_format == 'NHWC' or data_format == None
     return math_ops.add(value, bias, name=name)
+
+
+def _patch_fused_softmax_cross_entropy():
+  # Non-sparse
+  tf.nn.softmax_cross_entropy_with_logits = _new_softmax_cross_entropy_with_logits_1_14 # access via public API
+  nn.softmax_cross_entropy_with_logits = _new_softmax_cross_entropy_with_logits_1_14 # called from tf.keras backend
+  nn_ops.softmax_cross_entropy_with_logits = _new_softmax_cross_entropy_with_logits_1_14 # called from tests
+
+  # Sparse
+  tf.nn.sparse_softmax_cross_entropy_with_logits = _new_sparse_softmax_cross_entropy_with_logits_1_14 # access via public API
+  nn.sparse_softmax_cross_entropy_with_logits = _new_sparse_softmax_cross_entropy_with_logits_1_14 # called from tf.keras backend
+  nn_ops.sparse_softmax_cross_entropy_with_logits = _new_sparse_softmax_cross_entropy_with_logits_1_14 # called from tests
+
+
+# The original, pre-patched method can be viewed at
+# https://github.com/tensorflow/tensorflow/blob/v1.14.0/tensorflow/python/ops/nn_ops.py#L2628
+def _new_softmax_cross_entropy_with_logits_1_14(labels, logits, axis=-1, name=None):
+  """Computes softmax cross entropy between `logits` and `labels`.
+  Measures the probability error in discrete classification tasks in which the
+  classes are mutually exclusive (each entry is in exactly one class). For
+  example, each CIFAR-10 image is labeled with one and only one label: an image
+  can be a dog or a truck, but not both.
+  **NOTE:** While the classes are mutually exclusive, their probabilities
+  need not be. All that is required is that each row of `labels` is
+  a valid probability distribution. If they are not, the computation of the
+  gradient will be incorrect.
+  If using exclusive `labels` (wherein one and only
+  one class is true at a time), see `sparse_softmax_cross_entropy_with_logits`.
+  **WARNING:** This op expects unscaled logits, since it performs a `softmax`
+  on `logits` internally for efficiency. Do not call this op with the
+  output of `softmax`, as it will produce incorrect results.
+  A common use case is to have logits and labels of shape
+  `[batch_size, num_classes]`, but higher dimensions are supported, with
+  the `axis` argument specifying the class dimension.
+  Backpropagation will happen only into `logits`. To calculate a cross entropy
+  loss that allows backpropagation into both `logits` and `labels`, see
+  `tf.nn.softmax_cross_entropy_with_logits_v2`.
+  Args:
+    labels: Each vector along the class dimension should hold a valid
+      probability distribution e.g. for the case in which labels are of shape
+      `[batch_size, num_classes]`, each row of `labels[i]` must be a valid
+      probability distribution.
+    logits: Per-label activations, typically a linear output. These activation
+      energies are interpreted as unnormalized log probabilities.
+    axis: The class dimension. Defaulted to -1 which is the last dimension.
+    name: A name for the operation (optional).
+  Returns:
+    A `Tensor` that contains the softmax cross entropy loss. Its type is the
+    same as `logits` and its shape is the same as `labels` except that it does
+    not have the last dimension of `labels`.
+  """
+  raise NotImplementedError()
+
+
+# The original, pre-patched method can be viewed at
+# https://github.com/tensorflow/tensorflow/blob/v1.14.0/tensorflow/python/ops/nn_ops.py#L2628
+def _new_sparse_softmax_cross_entropy_with_logits_1_14(
+    _sentinel=None, # pylint: disable=invalid-name
+    labels=None,
+    logits=None,
+    name=None):
+  """Computes sparse softmax cross entropy between `logits` and `labels`.
+  Measures the probability error in discrete classification tasks in which the
+  classes are mutually exclusive (each entry is in exactly one class). For
+  example, each CIFAR-10 image is labeled with one and only one label: an image
+  can be a dog or a truck, but not both.
+  **NOTE:** For this operation, the probability of a given label is considered
+  exclusive. That is, soft classes are not allowed, and the `labels` vector
+  must provide a single specific index for the true class for each row of
+  `logits` (each minibatch entry). For soft softmax classification with
+  a probability distribution for each entry, see
+  `softmax_cross_entropy_with_logits_v2`.
+  **WARNING:** This op expects unscaled logits, since it performs a `softmax`
+  on `logits` internally for efficiency. Do not call this op with the
+  output of `softmax`, as it will produce incorrect results.
+  A common use case is to have logits of shape
+  `[batch_size, num_classes]` and have labels of shape
+  `[batch_size]`, but higher dimensions are supported, in which
+  case the `dim`-th dimension is assumed to be of size `num_classes`.
+  `logits` must have the dtype of `float16`, `float32`, or `float64`, and
+  `labels` must have the dtype of `int32` or `int64`.
+  **Note that to avoid confusion, it is required to pass only named arguments to
+  this function.**
+  Args:
+    _sentinel: Used to prevent positional parameters. Internal, do not use.
+    labels: `Tensor` of shape `[d_0, d_1, ..., d_{r-1}]` (where `r` is rank of
+      `labels` and result) and dtype `int32` or `int64`. Each entry in `labels`
+      must be an index in `[0, num_classes)`. Other values will raise an
+      exception when this op is run on CPU, and return `NaN` for corresponding
+      loss and gradient rows on GPU.
+    logits: Per-label activations (typically a linear output) of shape
+      `[d_0, d_1, ..., d_{r-1}, num_classes]` and dtype `float16`, `float32`,
+      or `float64`. These activation energies are interpreted as unnormalized
+      log probabilities.
+    name: A name for the operation (optional).
+  Returns:
+    A `Tensor` of the same shape as `labels` and of the same type as `logits`
+    with the softmax cross entropy loss.
+  Raises:
+    ValueError: If logits are scalars (need to have rank >= 1) or if the rank
+      of the labels is not equal to the rank of the logits minus one.
+  """
+  raise NotImplementedError()
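A note on the stubs above: both replacement functions are introduced as
`raise NotImplementedError()` placeholders at this stage. For orientation, a
minimal sketch of the deterministic workaround described in tensorflow issue
#38185, which composes the loss from `log_softmax` and a reduction instead of
the fused kernel, could look like the following (the helper name is
illustrative and not part of this patch):

    import tensorflow as tf

    # Sketch only: build softmax cross entropy from deterministic ops,
    # avoiding the fused SoftmaxCrossEntropyWithLogits kernel whose GPU
    # backward pass is non-deterministic.
    def _deterministic_softmax_cross_entropy_with_logits(labels, logits,
                                                         axis=-1):
      # The fused op does not backpropagate into `labels`; mirror that here.
      labels = tf.stop_gradient(labels)
      log_probs = tf.nn.log_softmax(logits, axis=axis)
      # Cross entropy: negated sum of labels * log(softmax) over the class axis.
      return -tf.reduce_sum(labels * log_probs, axis=axis)

Routing the three aliases patched above through a wrapper of this form trades
the fused kernel's speed for reproducible gradients.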
From aeb70ac21d12ece42b9cfffeb3fdf2c09e01f5f4 Mon Sep 17 00:00:00 2001
From: MFreidank
Date: Mon, 22 Jun 2020 10:02:51 +0200
Subject: [PATCH 2/7] fix: line no for link to previous implementation

---
 tfdeterminism/patch.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tfdeterminism/patch.py b/tfdeterminism/patch.py
index ab485b2..05f82d2 100644
--- a/tfdeterminism/patch.py
+++ b/tfdeterminism/patch.py
@@ -152,7 +152,7 @@ def _patch_fused_softmax_cross_entropy():
 
 
 # The original, pre-patched method can be viewed at
-# https://github.com/tensorflow/tensorflow/blob/v1.14.0/tensorflow/python/ops/nn_ops.py#L2628
+# https://github.com/tensorflow/tensorflow/blob/v1.14.0/tensorflow/python/ops/nn_ops.py#L3182
 def _new_softmax_cross_entropy_with_logits_1_14(labels, logits, axis=-1, name=None):
   """Computes softmax cross entropy between `logits` and `labels`.
   Measures the probability error in discrete classification tasks in which the
@@ -196,7 +196,7 @@ def _new_softmax_cross_entropy_with_logits_1_14(labels, logits, axis=-1, name=No
 
 
 # The original, pre-patched method can be viewed at
-# https://github.com/tensorflow/tensorflow/blob/v1.14.0/tensorflow/python/ops/nn_ops.py#L2628
+# https://github.com/tensorflow/tensorflow/blob/v1.14.0/tensorflow/python/ops/nn_ops.py#L3249
 def _new_sparse_softmax_cross_entropy_with_logits_1_14(
     _sentinel=None, # pylint: disable=invalid-name
     labels=None,

From bcc8475ffbe5a4837fe9aa596a515c9a0c4f6557 Mon Sep 17 00:00:00 2001
From: MFreidank
Date: Mon, 22 Jun 2020 10:09:55 +0200
Subject: [PATCH 3/7] make this patch apply to tf 2.1 and 2.2 as well

---
 tfdeterminism/patch.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/tfdeterminism/patch.py b/tfdeterminism/patch.py
index 05f82d2..4348bca 100644
--- a/tfdeterminism/patch.py
+++ b/tfdeterminism/patch.py
@@ -74,6 +74,11 @@ def _patch():
     print("TensorFlow version %s has been patched "
           "using tfdeterminism version %s" %
           (tf_version, __version__), file=sys.stderr)
+  elif re.match("2\.1|2\.2"):
+    _patch_fused_softmax_cross_entropy()
+    print("TensorFlow version %s has been patched "
+          "using tfdeterminism version %s" %
+          (tf_version, __version__), file=sys.stderr)
   else:
     raise TypeError("tfdeterminism: No patch available "
                     "for version %s of TensorFlow" % tf_version)

From f7d6b86feb11ad8f4317c8d807d097d56782ec61 Mon Sep 17 00:00:00 2001
From: MFreidank
Date: Mon, 22 Jun 2020 10:12:09 +0200
Subject: [PATCH 4/7] fix: amend mistake in re.match(...) statement

---
 tfdeterminism/patch.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tfdeterminism/patch.py b/tfdeterminism/patch.py
index 4348bca..d6e9964 100644
--- a/tfdeterminism/patch.py
+++ b/tfdeterminism/patch.py
@@ -74,7 +74,7 @@ def _patch():
     print("TensorFlow version %s has been patched "
           "using tfdeterminism version %s" %
           (tf_version, __version__), file=sys.stderr)
-  elif re.match("2\.1|2\.2"):
+  elif re.match("2\.1|2\.2", tf_version):
     _patch_fused_softmax_cross_entropy()
     print("TensorFlow version %s has been patched "
           "using tfdeterminism version %s" %
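The sparse stub can follow the same deterministic path once the integer class
indices are expanded to one-hot vectors. A sketch under the same caveats as
above (illustrative name; the `_sentinel` handling and the input validation
performed by the fused op are omitted):

    import tensorflow as tf

    # Sketch only: deterministic sparse variant via one-hot expansion.
    def _deterministic_sparse_softmax_cross_entropy_with_logits(labels,
                                                                logits):
      num_classes = tf.shape(logits)[-1]
      onehot_labels = tf.one_hot(labels, depth=num_classes,
                                 dtype=logits.dtype)
      log_probs = tf.nn.log_softmax(logits, axis=-1)
      # NOTE: unlike the fused op, an out-of-range label yields an all-zero
      # one-hot row here, i.e. a loss of 0 rather than an error or NaN.
      return -tf.reduce_sum(onehot_labels * log_probs, axis=-1)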
From 75ce3db9fc1b5c6ff3cf49368b085de6181b666e Mon Sep 17 00:00:00 2001
From: MFreidank
Date: Tue, 23 Jun 2020 14:58:30 +0200
Subject: [PATCH 5/7] twd: softmax entropy patch in enable_determinism

---
 tfdeterminism/enable_determinism.py |  10 +--
 tfdeterminism/patch.py              | 110 +++++++++++++++++++++++++++-
 2 files changed, 114 insertions(+), 6 deletions(-)

diff --git a/tfdeterminism/enable_determinism.py b/tfdeterminism/enable_determinism.py
index 354bbd4..96ecec5 100644
--- a/tfdeterminism/enable_determinism.py
+++ b/tfdeterminism/enable_determinism.py
@@ -21,7 +21,7 @@
 
 import tensorflow as tf
 
-from .patch import _patch_bias_add
+from .patch import _patch_bias_add, _patch_fused_softmax_cross_entropy
 from .utils import _Version as Version
 
 def _enable_determinism(seed=None):
@@ -31,7 +31,7 @@ def _enable_determinism(seed=None):
   Call this method either before or after explicitly importing TensorFlow,
   but always before constructing any graphs.
 
-  This function cannot address all possible sources of non-determinism. Please
+  This function cannot address all possible sources of non-determinism. Please
   see further instructions at https://github.com/NVIDIA/tensorflow-determinism
   to understand how to use it in a larger deterministic context.
 
@@ -52,7 +52,7 @@ def _enable_determinism(seed=None):
     _patch_bias_add()
   if in_ngc_cont and ngc_vers.at_least('19.06') or tf_vers.at_least('2.1'):
     os.environ['TF_DETERMINISTIC_OPS'] = '1'
+    # TODO: Add patch crossentropy here as well? Issue seems to still be present on tf 2.1, 2.2
   if in_ngc_cont and ngc_vers.at_least('19.06') or tf_vers.at_least('1.14'):
-    # Apply the fused softmax/cross-entropy patch here
-    pass
-  # TODO: Add other recipe items
+    _patch_fused_softmax_cross_entropy()
+  # TODO: Add other recipe items

diff --git a/tfdeterminism/patch.py b/tfdeterminism/patch.py
index d4d3118..30fb12c 100644
--- a/tfdeterminism/patch.py
+++ b/tfdeterminism/patch.py
@@ -70,7 +70,7 @@ def _patch():
   if re.match("(1\.(14|15)|2\.0)", tf_version):
     os.environ['TF_CUDNN_DETERMINISTIC'] = '1'
     _patch_bias_add()
-    # Apply the fused softmax/cross-entropy patch here
+    _patch_fused_softmax_cross_entropy()
     print("TensorFlow version %s has been patched "
           "using tfdeterminism version %s" %
           (tf_version, __version__), file=sys.stderr)
@@ -78,6 +78,7 @@ def _patch():
     raise TypeError("tfdeterminism: No patch available "
                     "for version %s of TensorFlow" % tf_version)
 
+
 def _patch_bias_add():
   tf.nn.bias_add = _new_bias_add_1_14 # access via public API
   nn.bias_add = _new_bias_add_1_14 # called from tf.keras.layers.convolutional.Conv
@@ -136,3 +137,110 @@ def _new_bias_add_1_14(value, bias, data_format=None, name=None):
         value, array_ops.reshape(bias, broadcast_shape), name=name)
   else: # data_format == 'NHWC' or data_format == None
     return math_ops.add(value, bias, name=name)
+
+
+def _patch_fused_softmax_cross_entropy():
+  # Non-sparse
+  tf.nn.softmax_cross_entropy_with_logits = _new_softmax_cross_entropy_with_logits # access via public API
+  nn.softmax_cross_entropy_with_logits = _new_softmax_cross_entropy_with_logits # called from tf.keras backend
+  nn_ops.softmax_cross_entropy_with_logits = _new_softmax_cross_entropy_with_logits # called from tests
+
+  # Sparse
+  tf.nn.sparse_softmax_cross_entropy_with_logits = _new_sparse_softmax_cross_entropy_with_logits # access via public API
+  nn.sparse_softmax_cross_entropy_with_logits = _new_sparse_softmax_cross_entropy_with_logits # called from tf.keras backend
+  nn_ops.sparse_softmax_cross_entropy_with_logits = _new_sparse_softmax_cross_entropy_with_logits # called from tests
+
+
+# The original, pre-patched method can be viewed at
+# https://github.com/tensorflow/tensorflow/blob/v1.14.0/tensorflow/python/ops/nn_ops.py#L3182
+def _new_softmax_cross_entropy_with_logits(labels, logits, axis=-1, name=None):
+  """Computes softmax cross entropy between `logits` and `labels`.
+  Measures the probability error in discrete classification tasks in which the
+  classes are mutually exclusive (each entry is in exactly one class). For
+  example, each CIFAR-10 image is labeled with one and only one label: an image
+  can be a dog or a truck, but not both.
+  **NOTE:** While the classes are mutually exclusive, their probabilities
+  need not be. All that is required is that each row of `labels` is
+  a valid probability distribution. If they are not, the computation of the
+  gradient will be incorrect.
+  If using exclusive `labels` (wherein one and only
+  one class is true at a time), see `sparse_softmax_cross_entropy_with_logits`.
+  **WARNING:** This op expects unscaled logits, since it performs a `softmax`
+  on `logits` internally for efficiency. Do not call this op with the
+  output of `softmax`, as it will produce incorrect results.
+  A common use case is to have logits and labels of shape
+  `[batch_size, num_classes]`, but higher dimensions are supported, with
+  the `axis` argument specifying the class dimension.
+  Backpropagation will happen only into `logits`. To calculate a cross entropy
+  loss that allows backpropagation into both `logits` and `labels`, see
+  `tf.nn.softmax_cross_entropy_with_logits_v2`.
+  Args:
+    labels: Each vector along the class dimension should hold a valid
+      probability distribution e.g. for the case in which labels are of shape
+      `[batch_size, num_classes]`, each row of `labels[i]` must be a valid
+      probability distribution.
+    logits: Per-label activations, typically a linear output. These activation
+      energies are interpreted as unnormalized log probabilities.
+    axis: The class dimension. Defaulted to -1 which is the last dimension.
+    name: A name for the operation (optional).
+  Returns:
+    A `Tensor` that contains the softmax cross entropy loss. Its type is the
+    same as `logits` and its shape is the same as `labels` except that it does
+    not have the last dimension of `labels`.
+  """
+  raise NotImplementedError()
+
+
+# The original, pre-patched method can be viewed at
+# https://github.com/tensorflow/tensorflow/blob/v1.14.0/tensorflow/python/ops/nn_ops.py#L3249
+def _new_sparse_softmax_cross_entropy_with_logits(
+    _sentinel=None, # pylint: disable=invalid-name
+    labels=None,
+    logits=None,
+    name=None):
+  """Computes sparse softmax cross entropy between `logits` and `labels`.
+  Measures the probability error in discrete classification tasks in which the
+  classes are mutually exclusive (each entry is in exactly one class). For
+  example, each CIFAR-10 image is labeled with one and only one label: an image
+  can be a dog or a truck, but not both.
+  **NOTE:** For this operation, the probability of a given label is considered
+  exclusive. That is, soft classes are not allowed, and the `labels` vector
+  must provide a single specific index for the true class for each row of
+  `logits` (each minibatch entry). For soft softmax classification with
+  a probability distribution for each entry, see
+  `softmax_cross_entropy_with_logits_v2`.
+  **WARNING:** This op expects unscaled logits, since it performs a `softmax`
+  on `logits` internally for efficiency. Do not call this op with the
+  output of `softmax`, as it will produce incorrect results.
+  A common use case is to have logits of shape
+  `[batch_size, num_classes]` and have labels of shape
+  `[batch_size]`, but higher dimensions are supported, in which
+  case the `dim`-th dimension is assumed to be of size `num_classes`.
+  `logits` must have the dtype of `float16`, `float32`, or `float64`, and
+  `labels` must have the dtype of `int32` or `int64`.
+  **Note that to avoid confusion, it is required to pass only named arguments to
+  this function.**
+  Args:
+    _sentinel: Used to prevent positional parameters. Internal, do not use.
+    labels: `Tensor` of shape `[d_0, d_1, ..., d_{r-1}]` (where `r` is rank of
+      `labels` and result) and dtype `int32` or `int64`. Each entry in `labels`
+      must be an index in `[0, num_classes)`. Other values will raise an
+      exception when this op is run on CPU, and return `NaN` for corresponding
+      loss and gradient rows on GPU.
+    logits: Per-label activations (typically a linear output) of shape
+      `[d_0, d_1, ..., d_{r-1}, num_classes]` and dtype `float16`, `float32`,
+      or `float64`. These activation energies are interpreted as unnormalized
+      log probabilities.
+    name: A name for the operation (optional).
+  Returns:
+    A `Tensor` of the same shape as `labels` and of the same type as `logits`
+    with the softmax cross entropy loss.
+  Raises:
+    ValueError: If logits are scalars (need to have rank >= 1) or if the rank
+      of the labels is not equal to the rank of the logits minus one.
+  """
+  raise NotImplementedError()
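Since the point of this series is reproducible backprop, a useful sanity check
is to run the backward pass repeatedly on identical inputs and compare the
gradients bitwise; with the fused kernel this check typically fails on GPU,
while a deterministic replacement should pass it. A minimal eager-mode
(TF 2.x) sketch, assuming `loss_fn` accepts `labels` and `logits` keyword
arguments:

    import numpy as np
    import tensorflow as tf

    # Sketch only: repeat the backward pass on identical inputs and check
    # that all resulting gradients are bitwise identical.
    def gradients_are_deterministic(loss_fn, labels, logits_value, trials=5):
      grads = []
      for _ in range(trials):
        logits = tf.constant(logits_value)
        with tf.GradientTape() as tape:
          tape.watch(logits)
          loss = tf.reduce_sum(loss_fn(labels=labels, logits=logits))
        grads.append(tape.gradient(loss, logits).numpy())
      return all(np.array_equal(grads[0], g) for g in grads[1:])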
From 231c7c53ee2d15197779ede88eee416499ab304e Mon Sep 17 00:00:00 2001
From: MFreidank
Date: Mon, 22 Jun 2020 10:09:55 +0200
Subject: [PATCH 6/7] make this patch apply to tf 2.1 and 2.2 as well

---
 tfdeterminism/patch.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/tfdeterminism/patch.py b/tfdeterminism/patch.py
index 30fb12c..7a211ef 100644
--- a/tfdeterminism/patch.py
+++ b/tfdeterminism/patch.py
@@ -74,6 +74,11 @@ def _patch():
     print("TensorFlow version %s has been patched "
           "using tfdeterminism version %s" %
           (tf_version, __version__), file=sys.stderr)
+  elif re.match("2\.1|2\.2"):
+    _patch_fused_softmax_cross_entropy()
+    print("TensorFlow version %s has been patched "
+          "using tfdeterminism version %s" %
+          (tf_version, __version__), file=sys.stderr)
   else:
     raise TypeError("tfdeterminism: No patch available "
                     "for version %s of TensorFlow" % tf_version)

From f22bcc9010603cd6b03448567befa7e04979b20c Mon Sep 17 00:00:00 2001
From: MFreidank
Date: Mon, 22 Jun 2020 10:12:09 +0200
Subject: [PATCH 7/7] fix: amend mistake in re.match(...) statement

---
 tfdeterminism/patch.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tfdeterminism/patch.py b/tfdeterminism/patch.py
index 7a211ef..c85e1f6 100644
--- a/tfdeterminism/patch.py
+++ b/tfdeterminism/patch.py
@@ -74,7 +74,7 @@ def _patch():
     print("TensorFlow version %s has been patched "
           "using tfdeterminism version %s" %
           (tf_version, __version__), file=sys.stderr)
-  elif re.match("2\.1|2\.2"):
+  elif re.match("2\.1|2\.2", tf_version):
     _patch_fused_softmax_cross_entropy()
     print("TensorFlow version %s has been patched "
           "using tfdeterminism version %s" %
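With the version gates above in place, the workaround is exercised through the
library's existing entry point; per the project README, the patch is applied
once, before any graphs are constructed:

    import tensorflow as tf
    from tfdeterminism import patch

    # Routes bias_add and {sparse_,}softmax_cross_entropy_with_logits to
    # deterministic implementations, according to the version gates above.
    patch()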