From d0f8d10c4526a7bed7b295d5741822cd023ba63e Mon Sep 17 00:00:00 2001 From: Sandipan Panda Date: Thu, 14 Mar 2024 00:46:15 +0530 Subject: [PATCH] Migrate TFJob examples to TensorFlow 2 Run the automated script to convert some of TF1.x API usage to tf.compat.v1. Signed-off-by: Sandipan Panda --- examples/tensorflow_v2/dist-mnist/Dockerfile | 18 + .../dist-mnist/Dockerfile.ppc64le | 18 + examples/tensorflow_v2/dist-mnist/README.md | 23 + .../tensorflow_v2/dist-mnist/dist_mnist.py | 303 +++++++++++++ .../dist-mnist/tf_job_mnist.yaml | 22 + .../estimator-API/Dockerfile | 4 + .../estimator-API/Makefile | 38 ++ .../estimator-API/README.md | 22 + .../estimator-API/distributed_tfjob.yaml | 19 + .../estimator-API/keras_model_to_estimator.py | 78 ++++ .../keras-API/Dockerfile | 6 + .../distribution_strategy/keras-API/README.md | 29 ++ .../multi_worker_strategy-with-keras.py | 160 +++++++ .../keras-API/multi_worker_tfjob.yaml | 26 ++ .../distribution_strategy/keras-API/pvc.yaml | 13 + .../image-classification/create-tfjob.ipynb | 405 ++++++++++++++++++ .../mnist_with_summaries/Dockerfile | 18 + .../mnist_with_summaries/Dockerfile.ppc64le | 18 + .../mnist_with_summaries/README.md | 21 + .../mnist_with_summaries.py | 212 +++++++++ .../mnist_with_summaries/tf_job_mnist.yaml | 30 ++ .../tfevent-volume/tfevent-pv.yaml | 15 + .../tfevent-volume/tfevent-pvc.yaml | 14 + examples/tensorflow_v2/simple.yaml | 18 + examples/tensorflow_v2/tf_sample/Dockerfile | 5 + examples/tensorflow_v2/tf_sample/Makefile | 38 ++ examples/tensorflow_v2/tf_sample/setup.py | 26 ++ examples/tensorflow_v2/tf_sample/tf_smoke.py | 147 +++++++ examples/tf_upgrade_v2_report.txt | 186 ++++++++ 29 files changed, 1932 insertions(+) create mode 100644 examples/tensorflow_v2/dist-mnist/Dockerfile create mode 100644 examples/tensorflow_v2/dist-mnist/Dockerfile.ppc64le create mode 100644 examples/tensorflow_v2/dist-mnist/README.md create mode 100644 examples/tensorflow_v2/dist-mnist/dist_mnist.py create mode 100644 examples/tensorflow_v2/dist-mnist/tf_job_mnist.yaml create mode 100644 examples/tensorflow_v2/distribution_strategy/estimator-API/Dockerfile create mode 100644 examples/tensorflow_v2/distribution_strategy/estimator-API/Makefile create mode 100644 examples/tensorflow_v2/distribution_strategy/estimator-API/README.md create mode 100644 examples/tensorflow_v2/distribution_strategy/estimator-API/distributed_tfjob.yaml create mode 100644 examples/tensorflow_v2/distribution_strategy/estimator-API/keras_model_to_estimator.py create mode 100644 examples/tensorflow_v2/distribution_strategy/keras-API/Dockerfile create mode 100644 examples/tensorflow_v2/distribution_strategy/keras-API/README.md create mode 100644 examples/tensorflow_v2/distribution_strategy/keras-API/multi_worker_strategy-with-keras.py create mode 100644 examples/tensorflow_v2/distribution_strategy/keras-API/multi_worker_tfjob.yaml create mode 100644 examples/tensorflow_v2/distribution_strategy/keras-API/pvc.yaml create mode 100644 examples/tensorflow_v2/image-classification/create-tfjob.ipynb create mode 100644 examples/tensorflow_v2/mnist_with_summaries/Dockerfile create mode 100644 examples/tensorflow_v2/mnist_with_summaries/Dockerfile.ppc64le create mode 100644 examples/tensorflow_v2/mnist_with_summaries/README.md create mode 100644 examples/tensorflow_v2/mnist_with_summaries/mnist_with_summaries.py create mode 100644 examples/tensorflow_v2/mnist_with_summaries/tf_job_mnist.yaml create mode 100644 examples/tensorflow_v2/mnist_with_summaries/tfevent-volume/tfevent-pv.yaml 
create mode 100644 examples/tensorflow_v2/mnist_with_summaries/tfevent-volume/tfevent-pvc.yaml
create mode 100644 examples/tensorflow_v2/simple.yaml
create mode 100644 examples/tensorflow_v2/tf_sample/Dockerfile
create mode 100644 examples/tensorflow_v2/tf_sample/Makefile
create mode 100644 examples/tensorflow_v2/tf_sample/setup.py
create mode 100644 examples/tensorflow_v2/tf_sample/tf_smoke.py
create mode 100644 examples/tf_upgrade_v2_report.txt
diff --git a/examples/tensorflow_v2/dist-mnist/Dockerfile b/examples/tensorflow_v2/dist-mnist/Dockerfile
new file mode 100644
index 0000000000..002a742d59
--- /dev/null
+++ b/examples/tensorflow_v2/dist-mnist/Dockerfile
@@ -0,0 +1,18 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+FROM tensorflow/tensorflow:2.16.1
+
+ADD examples/tensorflow_v2/dist-mnist/ /var/tf_dist_mnist
+ENTRYPOINT ["python", "/var/tf_dist_mnist/dist_mnist.py"]
diff --git a/examples/tensorflow_v2/dist-mnist/Dockerfile.ppc64le b/examples/tensorflow_v2/dist-mnist/Dockerfile.ppc64le
new file mode 100644
index 0000000000..8b9bd79de1
--- /dev/null
+++ b/examples/tensorflow_v2/dist-mnist/Dockerfile.ppc64le
@@ -0,0 +1,18 @@
+# Copyright 2019 The Kubeflow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+FROM ibmcom/tensorflow-ppc64le:1.13.1
+
+ADD . /var/tf_dist_mnist
+ENTRYPOINT ["python", "/var/tf_dist_mnist/dist_mnist.py"]
diff --git a/examples/tensorflow_v2/dist-mnist/README.md b/examples/tensorflow_v2/dist-mnist/README.md
new file mode 100644
index 0000000000..80e37de649
--- /dev/null
+++ b/examples/tensorflow_v2/dist-mnist/README.md
@@ -0,0 +1,23 @@
+### Distributed mnist model for e2e test
+
+This folder contains the Dockerfile and the distributed mnist model for the e2e test.
+
+**Build Image**
+
+The default image name and tag is `kubeflow/tf-dist-mnist-test:1.0`.
+
+To build this image on x86_64:
+```shell
+docker build -f Dockerfile -t kubeflow/tf-dist-mnist-test:1.0 ./
+```
+To build this image on ppc64le:
+```shell
+docker build -f Dockerfile.ppc64le -t kubeflow/tf-dist-mnist-test:1.0 ./
+```
+
+**Create TFJob YAML**
+
+```
+kubectl create -f ./tf_job_mnist.yaml
+```
+ * If on ppc64le, first update tf_job_mnist.yaml to use the ppc64le image.
\ No newline at end of file diff --git a/examples/tensorflow_v2/dist-mnist/dist_mnist.py b/examples/tensorflow_v2/dist-mnist/dist_mnist.py new file mode 100644 index 0000000000..81a0553683 --- /dev/null +++ b/examples/tensorflow_v2/dist-mnist/dist_mnist.py @@ -0,0 +1,303 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Distributed MNIST training and validation, with model replicas. + +A simple softmax model with one hidden layer is defined. The parameters +(weights and biases) are located on one parameter server (ps), while the ops +are executed on two worker nodes by default. The TF sessions also run on the +worker node. +Multiple invocations of this script can be done in parallel, with different +values for --task_index. There should be exactly one invocation with +--task_index, which will create a master session that carries out variable +initialization. The other, non-master, sessions will wait for the master +session to finish the initialization before proceeding to the training stage. + +The coordination between the multiple worker invocations occurs due to +the definition of the parameters on the same ps devices. The parameter updates +from one worker is visible to all other workers. As such, the workers can +perform forward computation and gradient calculation in parallel, which +should lead to increased training speed for the simple model. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import json +import math +import os +import sys +import tempfile +import time + +import tensorflow as tf +from tensorflow.examples.tutorials.mnist import input_data + +flags = tf.compat.v1.app.flags +flags.DEFINE_string("data_dir", "/tmp/mnist-data", + "Directory for storing mnist data") +flags.DEFINE_boolean("download_only", False, + "Only perform downloading of data; Do not proceed to " + "session preparation, model definition or training") +flags.DEFINE_integer("task_index", None, + "Worker task index, should be >= 0. task_index=0 is " + "the master worker task the performs the variable " + "initialization ") +flags.DEFINE_integer("num_gpus", 1, "Total number of gpus for each machine." 
+ "If you don't use GPU, please set it to '0'") +flags.DEFINE_integer("replicas_to_aggregate", None, + "Number of replicas to aggregate before parameter update" + "is applied (For sync_replicas mode only; default: " + "num_workers)") +flags.DEFINE_integer("hidden_units", 100, + "Number of units in the hidden layer of the NN") +flags.DEFINE_integer("train_steps", 20000, + "Number of (global) training steps to perform") +flags.DEFINE_integer("batch_size", 100, "Training batch size") +flags.DEFINE_float("learning_rate", 0.01, "Learning rate") +flags.DEFINE_boolean( + "sync_replicas", False, + "Use the sync_replicas (synchronized replicas) mode, " + "wherein the parameter updates from workers are aggregated " + "before applied to avoid stale gradients") +flags.DEFINE_boolean( + "existing_servers", False, "Whether servers already exists. If True, " + "will use the worker hosts via their GRPC URLs (one client process " + "per worker host). Otherwise, will create an in-process TensorFlow " + "server.") +flags.DEFINE_string("ps_hosts", "localhost:2222", + "Comma-separated list of hostname:port pairs") +flags.DEFINE_string("worker_hosts", "localhost:2223,localhost:2224", + "Comma-separated list of hostname:port pairs") +flags.DEFINE_string("job_name", None, "job name: worker or ps") + +FLAGS = flags.FLAGS + +IMAGE_PIXELS = 28 + +# Example: +# cluster = {'ps': ['host1:2222', 'host2:2222'], +# 'worker': ['host3:2222', 'host4:2222', 'host5:2222']} +# os.environ['TF_CONFIG'] = json.dumps( +# {'cluster': cluster, +# 'task': {'type': 'worker', 'index': 1}}) + +def main(unused_argv): + # Parse environment variable TF_CONFIG to get job_name and task_index + + # If not explicitly specified in the constructor and the TF_CONFIG + # environment variable is present, load cluster_spec from TF_CONFIG. + tf_config = json.loads(os.environ.get('TF_CONFIG') or '{}') + task_config = tf_config.get('task', {}) + task_type = task_config.get('type') + task_index = task_config.get('index') + + FLAGS.job_name = task_type + FLAGS.task_index = task_index + + mnist = input_data.read_data_sets(FLAGS.data_dir, one_hot=True) + if FLAGS.download_only: + sys.exit(0) + + if FLAGS.job_name is None or FLAGS.job_name == "": + raise ValueError("Must specify an explicit `job_name`") + if FLAGS.task_index is None or FLAGS.task_index == "": + raise ValueError("Must specify an explicit `task_index`") + + print("job name = %s" % FLAGS.job_name) + print("task index = %d" % FLAGS.task_index) + + cluster_config = tf_config.get('cluster', {}) + ps_hosts = cluster_config.get('ps') + worker_hosts = cluster_config.get('worker') + + ps_hosts_str = ','.join(ps_hosts) + worker_hosts_str = ','.join(worker_hosts) + + FLAGS.ps_hosts = ps_hosts_str + FLAGS.worker_hosts = worker_hosts_str + + # Construct the cluster and start the server + ps_spec = FLAGS.ps_hosts.split(",") + worker_spec = FLAGS.worker_hosts.split(",") + + # Get the number of workers. + num_workers = len(worker_spec) + + cluster = tf.train.ClusterSpec({"ps": ps_spec, "worker": worker_spec}) + + if not FLAGS.existing_servers: + # Not using existing servers. Create an in-process server. 
+ server = tf.distribute.Server( + cluster, job_name=FLAGS.job_name, task_index=FLAGS.task_index) + if FLAGS.job_name == "ps": + server.join() + + is_chief = (FLAGS.task_index == 0) + if FLAGS.num_gpus > 0: + # Avoid gpu allocation conflict: now allocate task_num -> #gpu + # for each worker in the corresponding machine + gpu = (FLAGS.task_index % FLAGS.num_gpus) + worker_device = "/job:worker/task:%d/gpu:%d" % (FLAGS.task_index, gpu) + elif FLAGS.num_gpus == 0: + # Just allocate the CPU to worker server + cpu = 0 + worker_device = "/job:worker/task:%d/cpu:%d" % (FLAGS.task_index, cpu) + # The device setter will automatically place Variables ops on separate + # parameter servers (ps). The non-Variable ops will be placed on the workers. + # The ps use CPU and workers use corresponding GPU + with tf.device( + tf.compat.v1.train.replica_device_setter( + worker_device=worker_device, + ps_device="/job:ps/cpu:0", + cluster=cluster)): + global_step = tf.Variable(0, name="global_step", trainable=False) + + # Variables of the hidden layer + hid_w = tf.Variable( + tf.random.truncated_normal( + [IMAGE_PIXELS * IMAGE_PIXELS, FLAGS.hidden_units], + stddev=1.0 / IMAGE_PIXELS), + name="hid_w") + hid_b = tf.Variable(tf.zeros([FLAGS.hidden_units]), name="hid_b") + + # Variables of the softmax layer + sm_w = tf.Variable( + tf.random.truncated_normal( + [FLAGS.hidden_units, 10], + stddev=1.0 / math.sqrt(FLAGS.hidden_units)), + name="sm_w") + sm_b = tf.Variable(tf.zeros([10]), name="sm_b") + + # Ops: located on the worker specified with FLAGS.task_index + x = tf.compat.v1.placeholder(tf.float32, [None, IMAGE_PIXELS * IMAGE_PIXELS]) + y_ = tf.compat.v1.placeholder(tf.float32, [None, 10]) + + hid_lin = tf.compat.v1.nn.xw_plus_b(x, hid_w, hid_b) + hid = tf.nn.relu(hid_lin) + + y = tf.nn.softmax(tf.compat.v1.nn.xw_plus_b(hid, sm_w, sm_b)) + cross_entropy = -tf.reduce_sum(y_ * tf.math.log(tf.clip_by_value(y, 1e-10, 1.0))) + + opt = tf.compat.v1.train.AdamOptimizer(FLAGS.learning_rate) + + if FLAGS.sync_replicas: + if FLAGS.replicas_to_aggregate is None: + replicas_to_aggregate = num_workers + else: + replicas_to_aggregate = FLAGS.replicas_to_aggregate + + opt = tf.compat.v1.train.SyncReplicasOptimizer( + opt, + replicas_to_aggregate=replicas_to_aggregate, + total_num_replicas=num_workers, + name="mnist_sync_replicas") + + train_step = opt.minimize(cross_entropy, global_step=global_step) + + if FLAGS.sync_replicas: + local_init_op = opt.local_step_init_op + if is_chief: + local_init_op = opt.chief_init_op + + ready_for_local_init_op = opt.ready_for_local_init_op + + # Initial token and chief queue runners required by the sync_replicas mode + chief_queue_runner = opt.get_chief_queue_runner() + sync_init_op = opt.get_init_tokens_op() + + init_op = tf.compat.v1.global_variables_initializer() + train_dir = tempfile.mkdtemp() + + if FLAGS.sync_replicas: + sv = tf.compat.v1.train.Supervisor( + is_chief=is_chief, + logdir=train_dir, + init_op=init_op, + local_init_op=local_init_op, + ready_for_local_init_op=ready_for_local_init_op, + recovery_wait_secs=1, + global_step=global_step) + else: + sv = tf.compat.v1.train.Supervisor( + is_chief=is_chief, + logdir=train_dir, + init_op=init_op, + recovery_wait_secs=1, + global_step=global_step) + + sess_config = tf.compat.v1.ConfigProto( + allow_soft_placement=True, + log_device_placement=False, + device_filters=["/job:ps", + "/job:worker/task:%d" % FLAGS.task_index]) + + # The chief worker (task_index==0) session will prepare the session, + # while the remaining workers will wait 
for the preparation to complete. + if is_chief: + print("Worker %d: Initializing session..." % FLAGS.task_index) + else: + print("Worker %d: Waiting for session to be initialized..." % + FLAGS.task_index) + + if FLAGS.existing_servers: + server_grpc_url = "grpc://" + worker_spec[FLAGS.task_index] + print("Using existing server at: %s" % server_grpc_url) + + sess = sv.prepare_or_wait_for_session(server_grpc_url, config=sess_config) + else: + sess = sv.prepare_or_wait_for_session(server.target, config=sess_config) + + print("Worker %d: Session initialization complete." % FLAGS.task_index) + + if FLAGS.sync_replicas and is_chief: + # Chief worker will start the chief queue runner and call the init op. + sess.run(sync_init_op) + sv.start_queue_runners(sess, [chief_queue_runner]) + + # Perform training + time_begin = time.time() + print("Training begins @ %f" % time_begin) + + local_step = 0 + while True: + # Training feed + batch_xs, batch_ys = mnist.train.next_batch(FLAGS.batch_size) + train_feed = {x: batch_xs, y_: batch_ys} + + _, step = sess.run([train_step, global_step], feed_dict=train_feed) + local_step += 1 + + now = time.time() + print("%f: Worker %d: training step %d done (global step: %d)" % + (now, FLAGS.task_index, local_step, step)) + + if step >= FLAGS.train_steps: + break + + time_end = time.time() + print("Training ends @ %f" % time_end) + training_time = time_end - time_begin + print("Training elapsed time: %f s" % training_time) + + # Validation feed + val_feed = {x: mnist.validation.images, y_: mnist.validation.labels} + val_xent = sess.run(cross_entropy, feed_dict=val_feed) + print("After %d training step(s), validation cross entropy = %g" % + (FLAGS.train_steps, val_xent)) + + +if __name__ == "__main__": + tf.compat.v1.app.run() diff --git a/examples/tensorflow_v2/dist-mnist/tf_job_mnist.yaml b/examples/tensorflow_v2/dist-mnist/tf_job_mnist.yaml new file mode 100644 index 0000000000..cb6fad1495 --- /dev/null +++ b/examples/tensorflow_v2/dist-mnist/tf_job_mnist.yaml @@ -0,0 +1,22 @@ +apiVersion: "kubeflow.org/v1" +kind: "TFJob" +metadata: + name: "dist-mnist-for-e2e-test" +spec: + tfReplicaSpecs: + PS: + replicas: 2 + restartPolicy: Never + template: + spec: + containers: + - name: tensorflow + image: kubeflow/tf-dist-mnist-test:latest + Worker: + replicas: 4 + restartPolicy: Never + template: + spec: + containers: + - name: tensorflow + image: kubeflow/tf-dist-mnist-test:latest diff --git a/examples/tensorflow_v2/distribution_strategy/estimator-API/Dockerfile b/examples/tensorflow_v2/distribution_strategy/estimator-API/Dockerfile new file mode 100644 index 0000000000..f9f276fa0f --- /dev/null +++ b/examples/tensorflow_v2/distribution_strategy/estimator-API/Dockerfile @@ -0,0 +1,4 @@ +FROM tensorflow/tensorflow:2.16.1 + +COPY examples/tensorflow/distribution_strategy/estimator-API/keras_model_to_estimator.py / +ENTRYPOINT ["python", "/keras_model_to_estimator.py", "/tmp/tfkeras_example/"] diff --git a/examples/tensorflow_v2/distribution_strategy/estimator-API/Makefile b/examples/tensorflow_v2/distribution_strategy/estimator-API/Makefile new file mode 100644 index 0000000000..f0ce957ec1 --- /dev/null +++ b/examples/tensorflow_v2/distribution_strategy/estimator-API/Makefile @@ -0,0 +1,38 @@ +IMG = gcr.io/kubeflow-examples/distributed_worker + +# List any changed files. We only include files in the notebooks directory. +# because that is the code in the docker image. +# In particular we exclude changes to the ksonnet configs. 
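The `tf_job_mnist.yaml` spec above (2 PS, 4 Worker replicas) can also be produced programmatically; a sketch using the Kubeflow Training SDK, with the same calls the notebook later in this patch uses (assumes `kubeflow-training` is installed and kubeconfig points at the cluster):

```python
from kubernetes.client import (V1Container, V1ObjectMeta, V1PodSpec,
                               V1PodTemplateSpec)
from kubeflow.training import (KubeflowOrgV1ReplicaSpec, KubeflowOrgV1RunPolicy,
                               KubeflowOrgV1TFJob, KubeflowOrgV1TFJobSpec,
                               TrainingClient, constants)


def replica(count):
    # One PS/Worker replica spec running the dist-mnist image.
    container = V1Container(name="tensorflow",
                            image="kubeflow/tf-dist-mnist-test:latest")
    return KubeflowOrgV1ReplicaSpec(
        replicas=count, restart_policy="Never",
        template=V1PodTemplateSpec(spec=V1PodSpec(containers=[container])))


tfjob = KubeflowOrgV1TFJob(
    api_version=constants.API_VERSION, kind=constants.TFJOB_KIND,
    metadata=V1ObjectMeta(name="dist-mnist-for-e2e-test"),
    spec=KubeflowOrgV1TFJobSpec(
        run_policy=KubeflowOrgV1RunPolicy(clean_pod_policy="None"),
        tf_replica_specs={"PS": replica(2), "Worker": replica(4)}))

TrainingClient(job_kind=constants.TFJOB_KIND).create_job(tfjob)
```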
+CHANGED_FILES := $(shell git diff-files --relative=tensorflow/tf_sample) + +ifeq ($(strip $(CHANGED_FILES)),) +# Changed files is empty; not dirty +# Don't include --dirty because it could be dirty if files outside the ones we care +# about changed. +GIT_VERSION := $(shell git describe --always) +else +GIT_VERSION := $(shell git describe --always)-dirty-$(shell git diff | shasum -a256 | cut -c -6) +endif + +TAG := $(shell date +v%Y%m%d)-$(GIT_VERSION) +all: build + +# To build without the cache set the environment variable +# export DOCKER_BUILD_OPTS=--no-cache +build: + docker build ${DOCKER_BUILD_OPTS} -t $(IMG):$(TAG) . \ + --label=git-verions=$(GIT_VERSION) + docker tag $(IMG):$(TAG) $(IMG):latest + @echo Built $(IMG):latest + @echo Built $(IMG):$(TAG) + + +# Build but don't attach the latest tag. This allows manual testing/inspection of the image +# first. +push: build + gcloud docker -- push $(IMG):$(TAG) + @echo Pushed $(IMG) with :$(TAG) tags + +push-latest: push + gcloud container images add-tag --quiet $(IMG):$(TAG) $(IMG):latest --verbosity=info + echo created $(IMG):latest diff --git a/examples/tensorflow_v2/distribution_strategy/estimator-API/README.md b/examples/tensorflow_v2/distribution_strategy/estimator-API/README.md new file mode 100644 index 0000000000..a4a009b2da --- /dev/null +++ b/examples/tensorflow_v2/distribution_strategy/estimator-API/README.md @@ -0,0 +1,22 @@ +# Distributed Training on Kubeflow + +This is an example of running distributed training on Kubeflow. The source code is taken from +TensorFlow team's example [here](https://github.com/tensorflow/ecosystem/tree/master/distribution_strategy). + +The directory contains the following files: +* Dockerfile: Builds the independent worker image. +* Makefile: For building the above image. +* keras_model_to_estimator.py: This is the model code to run multi-worker training. Identical to the TensorFlow example. +* distributed_tfjob.yaml: The TFJob spec. + +To run the example, edit `distributed_tfjob.yaml` for your cluster's namespace. Then run +``` +kubectl apply -f distributed_tfjob.yaml +``` +to create the job. + +Then use +``` +kubectl -n ${NAMESPACE} describe tfjob distributed-training +``` +to see the status. diff --git a/examples/tensorflow_v2/distribution_strategy/estimator-API/distributed_tfjob.yaml b/examples/tensorflow_v2/distribution_strategy/estimator-API/distributed_tfjob.yaml new file mode 100644 index 0000000000..b7a0bc36b5 --- /dev/null +++ b/examples/tensorflow_v2/distribution_strategy/estimator-API/distributed_tfjob.yaml @@ -0,0 +1,19 @@ +apiVersion: "kubeflow.org/v1" +kind: "TFJob" +metadata: + name: "distributed-training" +spec: + runPolicy: + cleanPodPolicy: None + tfReplicaSpecs: + Worker: + replicas: 3 + restartPolicy: Never + template: + metadata: + annotations: + scheduling.k8s.io/group-name: "distributed-training" + spec: + containers: + - name: tensorflow + image: kubeflow/tf-distributed-worker:latest diff --git a/examples/tensorflow_v2/distribution_strategy/estimator-API/keras_model_to_estimator.py b/examples/tensorflow_v2/distribution_strategy/estimator-API/keras_model_to_estimator.py new file mode 100644 index 0000000000..8dfc5fbff4 --- /dev/null +++ b/examples/tensorflow_v2/distribution_strategy/estimator-API/keras_model_to_estimator.py @@ -0,0 +1,78 @@ +# Copyright 2018 The Kubeflow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""An example of training Keras model with multi-worker strategies.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import sys + +import numpy as np +import tensorflow as tf + + +def input_fn(): + x = np.random.random((1024, 10)) + y = np.random.randint(2, size=(1024, 1)) + x = tf.cast(x, tf.float32) + dataset = tf.data.Dataset.from_tensor_slices((x, y)) + dataset = dataset.repeat(100) + dataset = dataset.batch(32) + return dataset + + +def main(args): + if len(args) < 2: + print('You must specify model_dir for checkpoints such as' + ' /tmp/tfkeras_example/.') + return + + model_dir = args[1] + print('Using %s to store checkpoints.' % model_dir) + + # Define a Keras Model. + model = tf.keras.Sequential() + model.add(tf.keras.layers.Dense(16, activation='relu', input_shape=(10,))) + model.add(tf.keras.layers.Dense(1, activation='sigmoid')) + + # Compile the model. + optimizer = tf.compat.v1.train.GradientDescentOptimizer(0.2) + model.compile(loss='binary_crossentropy', optimizer=optimizer) + model.summary() + tf.keras.backend.set_learning_phase(True) + + # Define DistributionStrategies and convert the Keras Model to an + # Estimator that utilizes these DistributionStrateges. + # Evaluator is a single worker, so using MirroredStrategy. + config = tf.estimator.RunConfig( + experimental_distribute=tf.contrib.distribute.DistributeConfig( + train_distribute=tf.contrib.distribute.CollectiveAllReduceStrategy( + num_gpus_per_worker=0), + eval_distribute=tf.contrib.distribute.MirroredStrategy( + num_gpus_per_worker=0))) + keras_estimator = tf.keras.estimator.model_to_estimator( + keras_model=model, config=config, model_dir=model_dir) + + # Train and evaluate the model. Evaluation will be skipped if there is not an + # "evaluator" job in the cluster. 
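One caveat on the `RunConfig` above: `tf.contrib` no longer exists in TensorFlow 2, so the `DistributeConfig`/`CollectiveAllReduceStrategy` wiring only runs on a TF 1.x image. On the TF 2.x images used elsewhere in this patch, the common replacement is to drop the Estimator conversion and train the Keras model directly under `tf.distribute.MultiWorkerMirroredStrategy`, which reads the same `TF_CONFIG` variable. A rough sketch, not the patch's code:

```python
import numpy as np
import tensorflow as tf

# Reads TF_CONFIG for the cluster/task layout, like DistributeConfig did.
strategy = tf.distribute.MultiWorkerMirroredStrategy()

with strategy.scope():
    model = tf.keras.Sequential([
        tf.keras.Input(shape=(10,)),
        tf.keras.layers.Dense(16, activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid'),
    ])
    model.compile(loss='binary_crossentropy',
                  optimizer=tf.keras.optimizers.SGD(learning_rate=0.2))

# Same synthetic data shapes as input_fn() above.
x = np.random.random((1024, 10)).astype('float32')
y = np.random.randint(2, size=(1024, 1))
dataset = tf.data.Dataset.from_tensor_slices((x, y)).repeat().batch(32)

model.fit(dataset, epochs=1, steps_per_epoch=100)
```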
+  tf.estimator.train_and_evaluate(
+      keras_estimator,
+      train_spec=tf.estimator.TrainSpec(input_fn=input_fn),
+      eval_spec=tf.estimator.EvalSpec(input_fn=input_fn))
+
+
+if __name__ == '__main__':
+  tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)
+  tf.compat.v1.app.run(argv=sys.argv)
diff --git a/examples/tensorflow_v2/distribution_strategy/keras-API/Dockerfile b/examples/tensorflow_v2/distribution_strategy/keras-API/Dockerfile
new file mode 100644
index 0000000000..9e2e5c55b7
--- /dev/null
+++ b/examples/tensorflow_v2/distribution_strategy/keras-API/Dockerfile
@@ -0,0 +1,6 @@
+FROM python:3.9
+
+RUN pip install tensorflow==2.11.0 tensorflow_datasets==4.7.0
+
+COPY examples/tensorflow_v2/distribution_strategy/keras-API/multi_worker_strategy-with-keras.py /
+ENTRYPOINT ["python", "/multi_worker_strategy-with-keras.py", "--saved_model_dir", "/train/saved_model/", "--checkpoint_dir", "/train/checkpoint"]
diff --git a/examples/tensorflow_v2/distribution_strategy/keras-API/README.md b/examples/tensorflow_v2/distribution_strategy/keras-API/README.md
new file mode 100644
index 0000000000..70b58421e0
--- /dev/null
+++ b/examples/tensorflow_v2/distribution_strategy/keras-API/README.md
@@ -0,0 +1,29 @@
+# Multi-worker training with Keras
+
+This directory contains an example of running multi-worker distributed training
+using the TensorFlow 2 Keras API on Kubeflow. For more information about the
+source code, please see the TensorFlow tutorials [here](https://www.tensorflow.org/tutorials/distribute/keras) and [here](https://www.tensorflow.org/tutorials/distribute/multi_worker_with_keras).
+
+## Prerequisite
+
+Your cluster must be configured to use multiple GPUs;
+please follow the [instructions](https://www.kubeflow.org/docs/components/training/tftraining/#using-gpus).
+
+## Steps
+
+1. Build an image
+   ```
+   docker build -f Dockerfile -t kubeflow/multi_worker_strategy:v1.0 .
+   ```
+
+2. Specify your storageClassName and create a persistent volume claim to save
+   models and checkpoints
+   ```
+   kubectl -n ${NAMESPACE} create -f pvc.yaml
+   ```
+
+3. Create a TFJob. If you use GPUs other than NVIDIA's, replace
+   `nvidia.com/gpu` with your GPU vendor's resource name in the `limits` section.
+   ```
+   kubectl -n ${NAMESPACE} create -f multi_worker_tfjob.yaml
+   ```
diff --git a/examples/tensorflow_v2/distribution_strategy/keras-API/multi_worker_strategy-with-keras.py b/examples/tensorflow_v2/distribution_strategy/keras-API/multi_worker_strategy-with-keras.py
new file mode 100644
index 0000000000..9a1b9a71cb
--- /dev/null
+++ b/examples/tensorflow_v2/distribution_strategy/keras-API/multi_worker_strategy-with-keras.py
@@ -0,0 +1,160 @@
+# Copyright 2020 The Kubeflow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================== +"""An example of multi-worker training with Keras model using Strategy API.""" + +from __future__ import absolute_import, division, print_function + +import argparse +import json +import os + +import tensorflow_datasets as tfds +import tensorflow as tf +from tensorflow.keras import layers, models + + +def make_datasets_unbatched(): + BUFFER_SIZE = 10000 + + # Scaling MNIST data from (0, 255] to (0., 1.] + def scale(image, label): + image = tf.cast(image, tf.float32) + image /= 255 + return image, label + + datasets, _ = tfds.load(name='mnist', with_info=True, as_supervised=True) + + return datasets['train'].map(scale).cache().shuffle(BUFFER_SIZE) + + +def build_and_compile_cnn_model(): + model = models.Sequential() + model.add( + layers.Conv2D(32, (3, 3), activation='relu', input_shape=(28, 28, 1))) + model.add(layers.MaxPooling2D((2, 2))) + model.add(layers.Conv2D(64, (3, 3), activation='relu')) + model.add(layers.MaxPooling2D((2, 2))) + model.add(layers.Conv2D(64, (3, 3), activation='relu')) + model.add(layers.Flatten()) + model.add(layers.Dense(64, activation='relu')) + model.add(layers.Dense(10, activation='softmax')) + + model.summary() + + model.compile(optimizer='adam', + loss='sparse_categorical_crossentropy', + metrics=['accuracy']) + + return model + + +def decay(epoch): + if epoch < 3: #pylint: disable=no-else-return + return 1e-3 + if 3 <= epoch < 7: + return 1e-4 + return 1e-5 + + +def main(args): + + # MultiWorkerMirroredStrategy creates copies of all variables in the model's + # layers on each device across all workers + # if your GPUs don't support NCCL, replace "communication" with another + strategy = tf.distribute.MultiWorkerMirroredStrategy( + communication_options=tf.distribute.experimental.CommunicationOptions(implementation=tf.distribute.experimental.CollectiveCommunication.AUTO)) + + BATCH_SIZE_PER_REPLICA = 64 + BATCH_SIZE = BATCH_SIZE_PER_REPLICA * strategy.num_replicas_in_sync + + with strategy.scope(): + ds_train = make_datasets_unbatched().batch(BATCH_SIZE).repeat() + options = tf.data.Options() + options.experimental_distribute.auto_shard_policy = \ + tf.data.experimental.AutoShardPolicy.DATA + ds_train = ds_train.with_options(options) + # Model building/compiling need to be within `strategy.scope()`. + multi_worker_model = build_and_compile_cnn_model() + + # Define the checkpoint directory to store the checkpoints + checkpoint_dir = args.checkpoint_dir + + # Name of the checkpoint files + checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}") + + # Function for decaying the learning rate. + # You can define any decay function you need. + # Callback for printing the LR at the end of each epoch. + class PrintLR(tf.keras.callbacks.Callback): + + def on_epoch_end(self, epoch, logs=None): #pylint: disable=no-self-use + print('\nLearning rate for epoch {} is {}'.format( + epoch + 1, multi_worker_model.optimizer.lr.numpy())) + + callbacks = [ + tf.keras.callbacks.TensorBoard(log_dir='./logs'), + tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_prefix, + save_weights_only=True), + tf.keras.callbacks.LearningRateScheduler(decay), + PrintLR() + ] + + # Keras' `model.fit()` trains the model with specified number of epochs and + # number of steps per epoch. Note that the numbers here are for demonstration + # purposes only and may not sufficiently produce a model with good quality. 
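A side note on the strategy constructed at the top of `main()`: as the comment there says, clusters whose GPUs lack NCCL can pin the collective implementation to `RING` instead of `AUTO`. A sketch of that substitution:

```python
import tensorflow as tf

# Force ring-based collectives when NCCL is unavailable.
communication_options = tf.distribute.experimental.CommunicationOptions(
    implementation=tf.distribute.experimental.CollectiveCommunication.RING)
strategy = tf.distribute.MultiWorkerMirroredStrategy(
    communication_options=communication_options)
```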
+ multi_worker_model.fit(ds_train, + epochs=10, + steps_per_epoch=70, + callbacks=callbacks) + + # Saving a model + # Let `is_chief` be a utility function that inspects the cluster spec and + # current task type and returns True if the worker is the chief and False + # otherwise. + def is_chief(): + return TASK_INDEX == 0 + + if is_chief(): + model_path = args.saved_model_dir + + else: + # Save to a path that is unique across workers. + model_path = args.saved_model_dir + '/worker_tmp_' + str(TASK_INDEX) + + multi_worker_model.save(model_path) + + +if __name__ == '__main__': + os.environ['NCCL_DEBUG'] = 'INFO' + + tfds.disable_progress_bar() + + # to decide if a worker is chief, get TASK_INDEX in Cluster info + tf_config = json.loads(os.environ.get('TF_CONFIG') or '{}') + TASK_INDEX = tf_config['task']['index'] + + parser = argparse.ArgumentParser() + parser.add_argument('--saved_model_dir', + type=str, + required=True, + help='Tensorflow export directory.') + + parser.add_argument('--checkpoint_dir', + type=str, + required=True, + help='Tensorflow checkpoint directory.') + + parsed_args = parser.parse_args() + main(parsed_args) diff --git a/examples/tensorflow_v2/distribution_strategy/keras-API/multi_worker_tfjob.yaml b/examples/tensorflow_v2/distribution_strategy/keras-API/multi_worker_tfjob.yaml new file mode 100644 index 0000000000..b9500f2d5e --- /dev/null +++ b/examples/tensorflow_v2/distribution_strategy/keras-API/multi_worker_tfjob.yaml @@ -0,0 +1,26 @@ +apiVersion: kubeflow.org/v1 +kind: TFJob +metadata: + name: multi-worker +spec: + runPolicy: + cleanPodPolicy: None + tfReplicaSpecs: + Worker: + replicas: 2 + restartPolicy: Never + template: + spec: + containers: + - name: tensorflow + image: kubeflow/tf-multi-worker-strategy:latest + volumeMounts: + - mountPath: /train + name: training + resources: + limits: + nvidia.com/gpu: 1 + volumes: + - name: training + persistentVolumeClaim: + claimName: strategy-volume diff --git a/examples/tensorflow_v2/distribution_strategy/keras-API/pvc.yaml b/examples/tensorflow_v2/distribution_strategy/keras-API/pvc.yaml new file mode 100644 index 0000000000..0036c1fb45 --- /dev/null +++ b/examples/tensorflow_v2/distribution_strategy/keras-API/pvc.yaml @@ -0,0 +1,13 @@ +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: strategy-volume + labels: + app: strategy-volume +spec: + storageClassName: "Your storageClassName" + accessModes: + - ReadWriteMany + resources: + requests: + storage: 10Gi diff --git a/examples/tensorflow_v2/image-classification/create-tfjob.ipynb b/examples/tensorflow_v2/image-classification/create-tfjob.ipynb new file mode 100644 index 0000000000..182e977ea4 --- /dev/null +++ b/examples/tensorflow_v2/image-classification/create-tfjob.ipynb @@ -0,0 +1,405 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "# Create TFJob using Kubeflow Training SDK" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "This is a sample for Kubeflow Training SDK `kubeflow-training`.\n", + "\n", + "The notebook shows how to use Kubeflow TFJob SDK to create, get, wait, check and delete TFJob." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Install Kubeflow Training Python SDKs\n", + "\n", + "You need to install Kubeflow Training SDK to run this Notebook." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# TODO (andreyvelich): Change to release version when SDK with the new APIs is published.\n", + "!pip install git+https://github.com/kubeflow/training-operator.git#subdirectory=sdk/python" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "from kubernetes.client import V1PodTemplateSpec\n", + "from kubernetes.client import V1ObjectMeta\n", + "from kubernetes.client import V1PodSpec\n", + "from kubernetes.client import V1Container\n", + "\n", + "\n", + "from kubeflow.training import KubeflowOrgV1ReplicaSpec\n", + "from kubeflow.training import KubeflowOrgV1TFJob\n", + "from kubeflow.training import KubeflowOrgV1TFJobSpec\n", + "from kubeflow.training import KubeflowOrgV1RunPolicy\n", + "from kubeflow.training import TrainingClient\n", + "\n", + "from kubeflow.training import constants" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "## Define TFJob" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "The demo runs Tensorflow MNIST example with 2 workers, chief, and parameter server for TFJob." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "name = \"mnist\"\n", + "namespace = \"kubeflow-user-example-com\"\n", + "container_name = \"tensorflow\"\n", + "\n", + "container = V1Container(\n", + " name=container_name,\n", + " image=\"gcr.io/kubeflow-ci/tf-mnist-with-summaries:1.0\",\n", + " command=[\n", + " \"python\",\n", + " \"/var/tf_mnist/mnist_with_summaries.py\",\n", + " \"--log_dir=/train/logs\", \"--learning_rate=0.01\",\n", + " \"--batch_size=150\"\n", + " ]\n", + ")\n", + "\n", + "worker = KubeflowOrgV1ReplicaSpec(\n", + " replicas=2,\n", + " restart_policy=\"Never\",\n", + " template=V1PodTemplateSpec(\n", + " spec=V1PodSpec(\n", + " containers=[container]\n", + " )\n", + " )\n", + ")\n", + "\n", + "chief = KubeflowOrgV1ReplicaSpec(\n", + " replicas=1,\n", + " restart_policy=\"Never\",\n", + " template=V1PodTemplateSpec(\n", + " spec=V1PodSpec(\n", + " containers=[container]\n", + " )\n", + " )\n", + ")\n", + "\n", + "ps = KubeflowOrgV1ReplicaSpec(\n", + " replicas=1,\n", + " restart_policy=\"Never\",\n", + " template=V1PodTemplateSpec(\n", + " spec=V1PodSpec(\n", + " containers=[container]\n", + " )\n", + " )\n", + ")\n", + "\n", + "tfjob = KubeflowOrgV1TFJob(\n", + " api_version=constants.API_VERSION,\n", + " kind=constants.TFJOB_KIND,\n", + " metadata=V1ObjectMeta(name=\"mnist\",namespace=namespace),\n", + " spec=KubeflowOrgV1TFJobSpec(\n", + " run_policy=KubeflowOrgV1RunPolicy(clean_pod_policy=\"None\"),\n", + " tf_replica_specs={\"Worker\": worker,\n", + " \"Chief\": chief,\n", + " \"PS\": ps}\n", + " )\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create TFJob\n", + "\n", + "You have to create Training Client to deploy your TFJob in you cluster." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TFJob kubeflow-user-example-com/mnist has been created\n" + ] + } + ], + "source": [ + "# Namespace and Job kind will be reused in every APIs.\n", + "training_client = TrainingClient(namespace=namespace, job_kind=constants.TFJOB_KIND)\n", + "training_client.create_job(tfjob)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Get the Created TFJob\n", + "\n", + "You can verify the created TFJob status." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{'completion_time': None,\n", + " 'conditions': [{'last_transition_time': datetime.datetime(2023, 9, 8, 21, 42, 34, tzinfo=tzutc()),\n", + " 'last_update_time': datetime.datetime(2023, 9, 8, 21, 42, 34, tzinfo=tzutc()),\n", + " 'message': 'TFJob mnist is created.',\n", + " 'reason': 'TFJobCreated',\n", + " 'status': 'True',\n", + " 'type': 'Created'},\n", + " {'last_transition_time': datetime.datetime(2023, 9, 8, 21, 42, 35, tzinfo=tzutc()),\n", + " 'last_update_time': datetime.datetime(2023, 9, 8, 21, 42, 35, tzinfo=tzutc()),\n", + " 'message': 'TFJob kubeflow-user-example-com/mnist is running.',\n", + " 'reason': 'TFJobRunning',\n", + " 'status': 'True',\n", + " 'type': 'Running'}],\n", + " 'last_reconcile_time': None,\n", + " 'replica_statuses': {'Chief': {'active': 1,\n", + " 'failed': None,\n", + " 'label_selector': None,\n", + " 'selector': None,\n", + " 'succeeded': None},\n", + " 'PS': {'active': 1,\n", + " 'failed': None,\n", + " 'label_selector': None,\n", + " 'selector': None,\n", + " 'succeeded': None},\n", + " 'Worker': {'active': 2,\n", + " 'failed': None,\n", + " 'label_selector': None,\n", + " 'selector': None,\n", + " 'succeeded': None}},\n", + " 'start_time': datetime.datetime(2023, 9, 8, 21, 42, 34, tzinfo=tzutc())}" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "training_client.get_job(name).status" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Get the TFJob Conditions" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'last_transition_time': datetime.datetime(2023, 9, 8, 21, 42, 34, tzinfo=tzutc()),\n", + " 'last_update_time': datetime.datetime(2023, 9, 8, 21, 42, 34, tzinfo=tzutc()),\n", + " 'message': 'TFJob mnist is created.',\n", + " 'reason': 'TFJobCreated',\n", + " 'status': 'True',\n", + " 'type': 'Created'},\n", + " {'last_transition_time': datetime.datetime(2023, 9, 8, 21, 42, 35, tzinfo=tzutc()),\n", + " 'last_update_time': datetime.datetime(2023, 9, 8, 21, 42, 35, tzinfo=tzutc()),\n", + " 'message': 'TFJob kubeflow-user-example-com/mnist is running.',\n", + " 'reason': 'TFJobRunning',\n", + " 'status': 'True',\n", + " 'type': 'Running'}]" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "training_client.get_job_conditions(name)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Wait Until TFJob Finishes" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + 
"training_client.wait_for_job_conditions(name)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Verify if TFJob is Succeeded" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "pycharm": { + "name": "#%%\n" + }, + "scrolled": true + }, + "outputs": [], + "source": [ + "training_client.is_job_succeeded(name)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Get the TFJob Training Logs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "training_client.get_job_logs(name)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "## Delete the TFJob" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "training_client.delete_job(name)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.12" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/examples/tensorflow_v2/mnist_with_summaries/Dockerfile b/examples/tensorflow_v2/mnist_with_summaries/Dockerfile new file mode 100644 index 0000000000..585fad5583 --- /dev/null +++ b/examples/tensorflow_v2/mnist_with_summaries/Dockerfile @@ -0,0 +1,18 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +FROM tensorflow/tensorflow:2.16.1 + +ADD examples/tensorflow/mnist_with_summaries/ /var/tf_mnist +ENTRYPOINT ["python", "/var/tf_mnist/mnist_with_summaries.py"] diff --git a/examples/tensorflow_v2/mnist_with_summaries/Dockerfile.ppc64le b/examples/tensorflow_v2/mnist_with_summaries/Dockerfile.ppc64le new file mode 100644 index 0000000000..68587dd875 --- /dev/null +++ b/examples/tensorflow_v2/mnist_with_summaries/Dockerfile.ppc64le @@ -0,0 +1,18 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+FROM ibmcom/tensorflow-ppc64le:1.13.1
+
+ADD examples/tensorflow_v2/mnist_with_summaries/ /var/tf_mnist
+ENTRYPOINT ["python", "/var/tf_mnist/mnist_with_summaries.py"]
diff --git a/examples/tensorflow_v2/mnist_with_summaries/README.md b/examples/tensorflow_v2/mnist_with_summaries/README.md
new file mode 100644
index 0000000000..075e87ce60
--- /dev/null
+++ b/examples/tensorflow_v2/mnist_with_summaries/README.md
@@ -0,0 +1,21 @@
+### Simple mnist example with persistent volume
+
+This is a simple example using an MNIST model that outputs a TF summary.
+The example also mounts a persistent volume for output, making it suitable
+for integrating with other components like Katib.
+
+The source code is borrowed from the TensorFlow tutorials [here](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/examples/tutorials/mnist/mnist_with_summaries.py).
+
+To build this image on x86_64:
+```shell
+docker build -f Dockerfile -t kubeflow/tf-mnist-with-summaries:1.0 ./
+```
+On ppc64le, run as:
+```shell
+docker build -f Dockerfile.ppc64le -t kubeflow/tf-mnist-with-summaries:1.0 ./
+```
+
+Usage:
+1. Add the persistent volume and claim: `kubectl apply -f tfevent-volume/.`
+1. Deploy the TFJob: `kubectl apply -f tf_job_mnist.yaml`
+ * If on ppc64le, first update tf_job_mnist.yaml to use the ppc64le image.
\ No newline at end of file
diff --git a/examples/tensorflow_v2/mnist_with_summaries/mnist_with_summaries.py b/examples/tensorflow_v2/mnist_with_summaries/mnist_with_summaries.py
new file mode 100644
index 0000000000..559712fe9b
--- /dev/null
+++ b/examples/tensorflow_v2/mnist_with_summaries/mnist_with_summaries.py
@@ -0,0 +1,212 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the 'License');
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an 'AS IS' BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""A simple MNIST classifier which displays summaries in TensorBoard.
+This is an unimpressive MNIST model, but it is a good example of using
+tf.name_scope to make a graph legible in the TensorBoard graph explorer, and of
+naming summary tags so that they are grouped meaningfully in TensorBoard.
+It demonstrates the functionality of every TensorBoard dashboard.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import os
+import sys
+
+import tensorflow as tf
+
+from tensorflow.examples.tutorials.mnist import input_data
+
+FLAGS = None
+
+
+def train():
+  # Import data
+  mnist = input_data.read_data_sets(FLAGS.data_dir,
+                                    fake_data=FLAGS.fake_data)
+
+  sess = tf.compat.v1.InteractiveSession()
+  # Create a multilayer model.
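A side note on running this automatically upgraded script (and `dist_mnist.py` above) on the TF 2.16.1 base image: `tensorflow.examples.tutorials.mnist` is no longer shipped in the TensorFlow 2.x wheels, and `tf.compat.v1.placeholder` raises a RuntimeError while eager execution is enabled, so adjustments along these lines would likely also be needed (a sketch, not part of the patch):

```python
import tensorflow as tf

# v1-style graphs, placeholders, and sessions require eager mode to be off.
tf.compat.v1.disable_eager_execution()

# tensorflow.examples.tutorials is gone in TF 2.x; tf.keras.datasets is one
# replacement for fetching MNIST (returns plain NumPy arrays).
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()
x_train = x_train.reshape(-1, 784).astype("float32") / 255.0
```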
+ + # Input placeholders + with tf.compat.v1.name_scope('input'): + x = tf.compat.v1.placeholder(tf.float32, [None, 784], name='x-input') + y_ = tf.compat.v1.placeholder(tf.int64, [None], name='y-input') + + with tf.compat.v1.name_scope('input_reshape'): + image_shaped_input = tf.reshape(x, [-1, 28, 28, 1]) + tf.summary.image('input', image_shaped_input, 10) + + # We can't initialize these variables to 0 - the network will get stuck. + def weight_variable(shape): + """Create a weight variable with appropriate initialization.""" + initial = tf.random.truncated_normal(shape, stddev=0.1) + return tf.Variable(initial) + + def bias_variable(shape): + """Create a bias variable with appropriate initialization.""" + initial = tf.constant(0.1, shape=shape) + return tf.Variable(initial) + + def variable_summaries(var): + """Attach a lot of summaries to a Tensor (for TensorBoard visualization).""" + with tf.compat.v1.name_scope('summaries'): + mean = tf.reduce_mean(var) + tf.summary.scalar('mean', mean) + with tf.compat.v1.name_scope('stddev'): + stddev = tf.sqrt(tf.reduce_mean(tf.square(var - mean))) + tf.summary.scalar('stddev', stddev) + tf.summary.scalar('max', tf.reduce_max(var)) + tf.summary.scalar('min', tf.reduce_min(var)) + tf.summary.histogram('histogram', var) + + def nn_layer(input_tensor, input_dim, output_dim, layer_name, act=tf.nn.relu): + """Reusable code for making a simple neural net layer. + It does a matrix multiply, bias add, and then uses ReLU to nonlinearize. + It also sets up name scoping so that the resultant graph is easy to read, + and adds a number of summary ops. + """ + # Adding a name scope ensures logical grouping of the layers in the graph. + with tf.compat.v1.name_scope(layer_name): + # This Variable will hold the state of the weights for the layer + with tf.compat.v1.name_scope('weights'): + weights = weight_variable([input_dim, output_dim]) + variable_summaries(weights) + with tf.compat.v1.name_scope('biases'): + biases = bias_variable([output_dim]) + variable_summaries(biases) + with tf.compat.v1.name_scope('Wx_plus_b'): + preactivate = tf.matmul(input_tensor, weights) + biases + tf.summary.histogram('pre_activations', preactivate) + activations = act(preactivate, name='activation') + tf.summary.histogram('activations', activations) + return activations + + hidden1 = nn_layer(x, 784, 500, 'layer1') + + with tf.compat.v1.name_scope('dropout'): + keep_prob = tf.compat.v1.placeholder(tf.float32) + tf.summary.scalar('dropout_keep_probability', keep_prob) + dropped = tf.nn.dropout(hidden1, rate=1 - (keep_prob)) + + # Do not apply softmax activation yet, see below. + y = nn_layer(dropped, 500, 10, 'layer2', act=tf.identity) + + with tf.compat.v1.name_scope('cross_entropy'): + # The raw formulation of cross-entropy, + # + # tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(tf.softmax(y)), + # reduction_indices=[1])) + # + # can be numerically unstable. + # + # So here we use tf.losses.sparse_softmax_cross_entropy on the + # raw logit outputs of the nn_layer above, and then average across + # the batch. 
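For comparison, the TF2/Keras-era spelling of the same numerically stable loss, computed directly from raw logits (a self-contained sketch):

```python
import tensorflow as tf

# Softmax cross-entropy from raw logits keeps log-sum-exp numerically stable,
# exactly as tf.compat.v1.losses.sparse_softmax_cross_entropy does below.
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
labels = tf.constant([3, 1])        # int class ids, shape [batch]
logits = tf.random.normal([2, 10])  # raw nn_layer outputs, shape [batch, 10]
loss = loss_fn(labels, logits)      # scalar mean over the batch
```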
+ with tf.compat.v1.name_scope('total'): + cross_entropy = tf.compat.v1.losses.sparse_softmax_cross_entropy( + labels=y_, logits=y) + tf.summary.scalar('cross_entropy', cross_entropy) + + with tf.compat.v1.name_scope('train'): + train_step = tf.compat.v1.train.AdamOptimizer(FLAGS.learning_rate).minimize( + cross_entropy) + + with tf.compat.v1.name_scope('accuracy'): + with tf.compat.v1.name_scope('correct_prediction'): + correct_prediction = tf.equal(tf.argmax(y, 1), y_) + with tf.compat.v1.name_scope('accuracy'): + accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32)) + tf.summary.scalar('accuracy', accuracy) + + # Merge all the summaries and write them out to + # /tmp/tensorflow/mnist/logs/mnist_with_summaries (by default) + merged = tf.compat.v1.summary.merge_all() + train_writer = tf.compat.v1.summary.FileWriter(FLAGS.log_dir + '/train', sess.graph) + test_writer = tf.compat.v1.summary.FileWriter(FLAGS.log_dir + '/test') + tf.compat.v1.global_variables_initializer().run() + + # Train the model, and also write summaries. + # Every 10th step, measure test-set accuracy, and write test summaries + # All other steps, run train_step on training data, & add training summaries + + def feed_dict(train): # pylint: disable=redefined-outer-name + """Make a TensorFlow feed_dict: maps data onto Tensor placeholders.""" + if train or FLAGS.fake_data: + xs, ys = mnist.train.next_batch(FLAGS.batch_size, fake_data=FLAGS.fake_data) + k = FLAGS.dropout + else: + xs, ys = mnist.test.images, mnist.test.labels + k = 1.0 + return {x: xs, y_: ys, keep_prob: k} + + for i in range(FLAGS.max_steps): + if i % 10 == 0: # Record summaries and test-set accuracy + summary, acc = sess.run([merged, accuracy], feed_dict=feed_dict(False)) + test_writer.add_summary(summary, i) + print('Accuracy at step %s: %s' % (i, acc)) + else: # Record train set summaries, and train + if i % 100 == 99: # Record execution stats + run_options = tf.compat.v1.RunOptions(trace_level=tf.compat.v1.RunOptions.FULL_TRACE) + run_metadata = tf.compat.v1.RunMetadata() + summary, _ = sess.run([merged, train_step], + feed_dict=feed_dict(True), + options=run_options, + run_metadata=run_metadata) + train_writer.add_run_metadata(run_metadata, 'step%03d' % i) + train_writer.add_summary(summary, i) + print('Adding run metadata for', i) + else: # Record a summary + summary, _ = sess.run([merged, train_step], feed_dict=feed_dict(True)) + train_writer.add_summary(summary, i) + train_writer.close() + test_writer.close() + + +def main(_): + if tf.io.gfile.exists(FLAGS.log_dir): + tf.io.gfile.rmtree(FLAGS.log_dir) + tf.io.gfile.makedirs(FLAGS.log_dir) + train() + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--fake_data', nargs='?', const=True, type=bool, + default=False, + help='If true, uses fake data for unit testing.') + parser.add_argument('--max_steps', type=int, default=1000, + help='Number of steps to run trainer.') + parser.add_argument('--learning_rate', type=float, default=0.001, + help='Initial learning rate') + parser.add_argument('--batch_size', type=int, default=100, + help='Training batch size') + parser.add_argument('--dropout', type=float, default=0.9, + help='Keep probability for training dropout.') + parser.add_argument( + '--data_dir', + type=str, + default=os.path.join(os.getenv('TEST_TMPDIR', '/tmp'), + 'tensorflow/mnist/input_data'), + help='Directory for storing input data') + parser.add_argument( + '--log_dir', + type=str, + default=os.path.join(os.getenv('TEST_TMPDIR', '/tmp'), + 
'tensorflow/mnist/logs/mnist_with_summaries'), + help='Summaries log directory') + FLAGS, unparsed = parser.parse_known_args() + tf.compat.v1.app.run(main=main, argv=[sys.argv[0]] + unparsed) diff --git a/examples/tensorflow_v2/mnist_with_summaries/tf_job_mnist.yaml b/examples/tensorflow_v2/mnist_with_summaries/tf_job_mnist.yaml new file mode 100644 index 0000000000..88e0e94848 --- /dev/null +++ b/examples/tensorflow_v2/mnist_with_summaries/tf_job_mnist.yaml @@ -0,0 +1,30 @@ +apiVersion: "kubeflow.org/v1" +kind: "TFJob" +metadata: + name: "mnist" + namespace: kubeflow +spec: + runPolicy: + cleanPodPolicy: None + tfReplicaSpecs: + Worker: + replicas: 1 + restartPolicy: Never + template: + spec: + containers: + - name: tensorflow + image: kubeflow/tf-mnist-with-summaries:latest + command: + - "python" + - "/var/tf_mnist/mnist_with_summaries.py" + - "--log_dir=/train/logs" + - "--learning_rate=0.01" + - "--batch_size=150" + volumeMounts: + - mountPath: "/train" + name: "training" + volumes: + - name: "training" + persistentVolumeClaim: + claimName: "tfevent-volume" diff --git a/examples/tensorflow_v2/mnist_with_summaries/tfevent-volume/tfevent-pv.yaml b/examples/tensorflow_v2/mnist_with_summaries/tfevent-volume/tfevent-pv.yaml new file mode 100644 index 0000000000..a450c6a492 --- /dev/null +++ b/examples/tensorflow_v2/mnist_with_summaries/tfevent-volume/tfevent-pv.yaml @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: PersistentVolume +metadata: + name: tfevent-volume + labels: + type: local + app: tfjob +spec: + capacity: + storage: 10Gi + storageClassName: standard + accessModes: + - ReadWriteMany + hostPath: + path: /tmp/data diff --git a/examples/tensorflow_v2/mnist_with_summaries/tfevent-volume/tfevent-pvc.yaml b/examples/tensorflow_v2/mnist_with_summaries/tfevent-volume/tfevent-pvc.yaml new file mode 100644 index 0000000000..7d7f8487a1 --- /dev/null +++ b/examples/tensorflow_v2/mnist_with_summaries/tfevent-volume/tfevent-pvc.yaml @@ -0,0 +1,14 @@ +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: tfevent-volume + namespace: kubeflow + labels: + type: local + app: tfjob +spec: + accessModes: + - ReadWriteMany + resources: + requests: + storage: 10Gi diff --git a/examples/tensorflow_v2/simple.yaml b/examples/tensorflow_v2/simple.yaml new file mode 100644 index 0000000000..4c2a0a76e6 --- /dev/null +++ b/examples/tensorflow_v2/simple.yaml @@ -0,0 +1,18 @@ +apiVersion: "kubeflow.org/v1" +kind: TFJob +metadata: + name: tfjob-simple + namespace: kubeflow +spec: + tfReplicaSpecs: + Worker: + replicas: 2 + restartPolicy: OnFailure + template: + spec: + containers: + - name: tensorflow + image: kubeflow/tf-mnist-with-summaries:latest + command: + - "python" + - "/var/tf_mnist/mnist_with_summaries.py" diff --git a/examples/tensorflow_v2/tf_sample/Dockerfile b/examples/tensorflow_v2/tf_sample/Dockerfile new file mode 100644 index 0000000000..9cbaf32cb6 --- /dev/null +++ b/examples/tensorflow_v2/tf_sample/Dockerfile @@ -0,0 +1,5 @@ +FROM tensorflow/tensorflow:2.16.1 +RUN pip install retrying +RUN mkdir -p /opt/kubeflow +COPY examples/tensorflow/tf_sample/tf_smoke.py /opt/kubeflow/ +ENTRYPOINT ["python", "/opt/kubeflow/tf_smoke.py"] diff --git a/examples/tensorflow_v2/tf_sample/Makefile b/examples/tensorflow_v2/tf_sample/Makefile new file mode 100644 index 0000000000..5055001182 --- /dev/null +++ b/examples/tensorflow_v2/tf_sample/Makefile @@ -0,0 +1,38 @@ +IMG = gcr.io/kubeflow-examples/tf_smoke + +# List any changed files. We only include files in the notebooks directory. 
+# because that is the code in the docker image.
+# In particular we exclude changes to the ksonnet configs.
+CHANGED_FILES := $(shell git diff-files --relative=examples/tensorflow_v2/tf_sample)
+
+ifeq ($(strip $(CHANGED_FILES)),)
+# Changed files is empty; not dirty.
+# Don't include --dirty because it could be dirty if files outside the ones we care
+# about changed.
+GIT_VERSION := $(shell git describe --always)
+else
+GIT_VERSION := $(shell git describe --always)-dirty-$(shell git diff | shasum -a256 | cut -c -6)
+endif
+
+TAG := $(shell date +v%Y%m%d)-$(GIT_VERSION)
+all: build
+
+# To build without the cache set the environment variable
+# export DOCKER_BUILD_OPTS=--no-cache
+build:
+	docker build ${DOCKER_BUILD_OPTS} -t $(IMG):$(TAG) . \
+		--label=git-version=$(GIT_VERSION)
+	docker tag $(IMG):$(TAG) $(IMG):latest
+	@echo Built $(IMG):latest
+	@echo Built $(IMG):$(TAG)
+
+
+# Build but don't attach the latest tag. This allows manual testing/inspection of the image
+# first.
+push: build
+	gcloud docker -- push $(IMG):$(TAG)
+	@echo Pushed $(IMG) with :$(TAG) tags
+
+push-latest: push
+	gcloud container images add-tag --quiet $(IMG):$(TAG) $(IMG):latest --verbosity=info
+	@echo created $(IMG):latest
diff --git a/examples/tensorflow_v2/tf_sample/setup.py b/examples/tensorflow_v2/tf_sample/setup.py
new file mode 100644
index 0000000000..bde533c8bd
--- /dev/null
+++ b/examples/tensorflow_v2/tf_sample/setup.py
@@ -0,0 +1,26 @@
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""A setup.py file for the tf_sample package."""
+from setuptools import find_packages, setup
+
+REQUIRED_PACKAGES = []
+
+setup(
+    name='tf_sample',
+    version='0.1.1',
+    author='Jeremy Lewi',
+    author_email='jlewi@google.com',
+    install_requires=REQUIRED_PACKAGES,
+    packages=find_packages(),
+    description='Sample TF program',
+    requires=[],
+)
diff --git a/examples/tensorflow_v2/tf_sample/tf_smoke.py b/examples/tensorflow_v2/tf_sample/tf_smoke.py
new file mode 100644
index 0000000000..6c76944247
--- /dev/null
+++ b/examples/tensorflow_v2/tf_sample/tf_smoke.py
@@ -0,0 +1,147 @@
+"""Run a simple TF program to verify we can execute ops.
+
+The program does a simple element-wise matrix multiplication.
+
+Only the master assigns ops to devices/workers.
+
+The master will assign ops to every task in the cluster. This way we can verify
+that distributed training is working by executing ops on all devices.
+"""
+import argparse
+import json
+import logging
+import os
+
+import retrying
+import tensorflow as tf
+
+
+def parse_args():
+  """Parse the command line arguments."""
+  parser = argparse.ArgumentParser()
+
+  parser.add_argument(
+      "--sleep_secs",
+      default=0,
+      type=int,
+      help=("Amount of time to sleep at the end"))
+
+  # TODO(jlewi): We ignore unknown arguments because the backend is currently
+  # setting some flags to empty values like metadata path.
+  args, _ = parser.parse_known_args()
+  return args
+
+# Add retries to deal with things like gRPC errors that result in
+# UnavailableError.
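+#
+# For reference, a TFJob controller injects a TF_CONFIG environment variable
+# of roughly this shape into every replica (hostnames here are hypothetical):
+#
+#   {"cluster": {"worker": ["smoke-worker-0:2222", "smoke-worker-1:2222"],
+#                "ps": ["smoke-ps-0:2222"]},
+#    "task": {"type": "worker", "index": 0}}
+#
+# main() parses this into a ClusterSpec/ServerDef, and run() walks the
+# "cluster" section to place one op on every listed task.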
+@retrying.retry(wait_exponential_multiplier=1000, wait_exponential_max=10000,
+                stop_max_delay=60*3*1000)
+def run(server, cluster_spec):  # pylint: disable=too-many-statements, too-many-locals
+  """Build the graph and run the example.
+
+  Args:
+    server: The TensorFlow server to use, or None to run in-process.
+    cluster_spec: Dict describing the jobs and tasks in the cluster.
+  """
+
+  # Construct the graph.
+  with tf.Graph().as_default():  # pylint: disable=not-context-manager
+    # Dimensions of the small integer matrices multiplied on each task.
+    width = 10
+    height = 10
+    results = []
+
+    # The master assigns ops to every task in the cluster.
+    for job_name in cluster_spec.keys():
+      for i in range(len(cluster_spec[job_name])):
+        d = "/job:{0}/task:{1}".format(job_name, i)
+        with tf.device(d):
+          a = tf.constant(range(width * height), shape=[height, width])
+          b = tf.constant(range(width * height), shape=[height, width])
+          c = tf.multiply(a, b)
+          results.append(c)
+
+    init_op = tf.compat.v1.global_variables_initializer()
+
+    if server:
+      target = server.target
+    else:
+      # Create a direct session.
+      target = ""
+
+    logging.info("Server target: %s", target)
+    with tf.compat.v1.Session(
+        target, config=tf.compat.v1.ConfigProto(log_device_placement=True)) as sess:
+      sess.run(init_op)
+      for r in results:
+        result = sess.run(r)
+        logging.info("Result: %s", result)
+
+
+def main():
+  """Run training.
+
+  Raises:
+    ValueError: If the arguments are invalid.
+  """
+  logging.info("Tensorflow version: %s", tf.__version__)
+  logging.info("Tensorflow git version: %s", tf.__git_version__)
+
+  tf_config_json = os.environ.get("TF_CONFIG", "{}")
+  tf_config = json.loads(tf_config_json)
+  logging.info("tf_config: %s", tf_config)
+
+  task = tf_config.get("task", {})
+  logging.info("task: %s", task)
+
+  cluster_spec = tf_config.get("cluster", {})
+  logging.info("cluster_spec: %s", cluster_spec)
+
+  server = None
+  device_func = None
+  if cluster_spec:
+    cluster_spec_object = tf.train.ClusterSpec(cluster_spec)
+    server_def = tf.train.ServerDef(
+        cluster=cluster_spec_object.as_cluster_def(),
+        protocol="grpc",
+        job_name=task["type"],
+        task_index=task["index"])
+
+    logging.info("server_def: %s", server_def)
+
+    logging.info("Building server.")
+    # Create and start a server for the local task.
+    server = tf.distribute.Server(server_def)
+    logging.info("Finished building server.")
+
+    # Assigns ops to the local worker by default.
+    device_func = tf.compat.v1.train.replica_device_setter(
+        worker_device="/job:worker/task:%d" % server_def.task_index,
+        cluster=server_def.cluster)
+  else:
+    # This should return a null op device setter since we are using
+    # all the defaults.
+    logging.info("Using default device function.")
+    device_func = tf.compat.v1.train.replica_device_setter()
+
+  job_type = task.get("type", "").lower()
+  if job_type == "ps":
+    logging.info("Running PS code.")
+    server.join()
+  elif job_type == "worker":
+    logging.info("Running Worker code.")
+    # The worker just blocks because we let the master assign all ops.
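+    # join() only returns when the in-process gRPC server shuts down, so the
+    # pod stays alive to execute whatever ops the master places on it.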
+ server.join() + elif job_type in ["master", "chief"] or not job_type: + logging.info("Running master/chief.") + with tf.device(device_func): + run(server=server, cluster_spec=cluster_spec) + else: + raise ValueError("invalid job_type %s" % (job_type,)) + + +if __name__ == "__main__": + logging.getLogger().setLevel(logging.INFO) + main() diff --git a/examples/tf_upgrade_v2_report.txt b/examples/tf_upgrade_v2_report.txt new file mode 100644 index 0000000000..52e276ce93 --- /dev/null +++ b/examples/tf_upgrade_v2_report.txt @@ -0,0 +1,186 @@ +TensorFlow 2.0 Upgrade Script +----------------------------- +Converted 6 files +Detected 9 issues that require attention +-------------------------------------------------------------------------------- +-------------------------------------------------------------------------------- +File: tensorflow/distribution_strategy/keras-API/multi_worker_strategy-with-keras.py +-------------------------------------------------------------------------------- +tensorflow/distribution_strategy/keras-API/multi_worker_strategy-with-keras.py:136:2: WARNING: *.save requires manual check. (This warning is only applicable if the code saves a tf.Keras model) Keras model.save now saves to the Tensorflow SavedModel format by default, instead of HDF5. To continue saving to HDF5, add the argument save_format='h5' to the save() function. +-------------------------------------------------------------------------------- +File: tensorflow/distribution_strategy/estimator-API/keras_model_to_estimator.py +-------------------------------------------------------------------------------- +tensorflow/distribution_strategy/estimator-API/keras_model_to_estimator.py:60:30: WARNING: Using member tf.contrib.distribute.DistributeConfig in deprecated module tf.contrib.distribute. (Manual edit required) tf.contrib.distribute.* have been migrated to tf.distribute.*. Please check out the new module for updated APIs. +tensorflow/distribution_strategy/estimator-API/keras_model_to_estimator.py:60:30: ERROR: Using member tf.contrib.distribute.DistributeConfig in deprecated module tf.contrib. tf.contrib.distribute.DistributeConfig cannot be converted automatically. tf.contrib will not be distributed with TensorFlow 2.0, please consider an alternative in non-contrib TensorFlow, a community-maintained repository such as tensorflow/addons, or fork the required code. +tensorflow/distribution_strategy/estimator-API/keras_model_to_estimator.py:61:27: ERROR: tf.contrib.distribute.CollectiveAllReduceStrategy requires manual check. (Manual edit required) tf.contrib.distribute.CollectiveAllReduceStrategy has been migrated to tf.distribute.experimental.MultiWorkerMirroredStrategy. Note the changes in constructor. If you're using the strategy with a custom training loop, note the following changes in methods: make_dataset_iterator->experimental_distribute_dataset, experimental_make_numpy_iterator->experimental_make_numpy_dataset, extended.call_for_each_replica->run, reduce requires an axis argument, unwrap->experimental_local_results experimental_initialize and experimental_finalize no longer needed +tensorflow/distribution_strategy/estimator-API/keras_model_to_estimator.py:61:27: WARNING: Using member tf.contrib.distribute.CollectiveAllReduceStrategy in deprecated module tf.contrib.distribute. (Manual edit required) tf.contrib.distribute.* have been migrated to tf.distribute.*. Please check out the new module for updated APIs. 
+tensorflow/distribution_strategy/estimator-API/keras_model_to_estimator.py:61:27: ERROR: Using member tf.contrib.distribute.CollectiveAllReduceStrategy in deprecated module tf.contrib. tf.contrib.distribute.CollectiveAllReduceStrategy cannot be converted automatically. tf.contrib will not be distributed with TensorFlow 2.0, please consider an alternative in non-contrib TensorFlow, a community-maintained repository such as tensorflow/addons, or fork the required code. +tensorflow/distribution_strategy/estimator-API/keras_model_to_estimator.py:63:26: ERROR: tf.contrib.distribute.MirroredStrategy requires manual check. (Manual edit required) tf.contrib.distribute.MirroredStrategy has been migrated to tf.distribute.MirroredStrategy. Things to note: Constructor arguments have changed. If you are using MirroredStrategy with Keras training framework, the input provided to `model.fit` will be assumed to have global batch size and split across the replicas. If you're using the strategy with a custom training loop, note the following changes in methods: make_dataset_iterator->experimental_distribute_dataset, experimental_make_numpy_iterator->experimental_make_numpy_dataset, extended.call_for_each_replica->run, reduce requires an axis argument, unwrap->experimental_local_results experimental_initialize and experimental_finalize no longer needed +tensorflow/distribution_strategy/estimator-API/keras_model_to_estimator.py:63:26: WARNING: Using member tf.contrib.distribute.MirroredStrategy in deprecated module tf.contrib.distribute. (Manual edit required) tf.contrib.distribute.* have been migrated to tf.distribute.*. Please check out the new module for updated APIs. +tensorflow/distribution_strategy/estimator-API/keras_model_to_estimator.py:63:26: ERROR: Using member tf.contrib.distribute.MirroredStrategy in deprecated module tf.contrib. tf.contrib.distribute.MirroredStrategy cannot be converted automatically. tf.contrib will not be distributed with TensorFlow 2.0, please consider an alternative in non-contrib TensorFlow, a community-maintained repository such as tensorflow/addons, or fork the required code. 
+================================================================================ +Detailed log follows: + +================================================================================ +================================================================================ +Input tree: 'tensorflow/' +================================================================================ +-------------------------------------------------------------------------------- +Processing file 'tensorflow/tf_sample/setup.py' + outputting to 'tensorflow_v2/tf_sample/setup.py' +-------------------------------------------------------------------------------- + + +-------------------------------------------------------------------------------- + +-------------------------------------------------------------------------------- +Processing file 'tensorflow/tf_sample/tf_smoke.py' + outputting to 'tensorflow_v2/tf_sample/tf_smoke.py' +-------------------------------------------------------------------------------- + +66:14: INFO: Renamed 'tf.global_variables_initializer' to 'tf.compat.v1.global_variables_initializer' +75:9: INFO: Renamed 'tf.Session' to 'tf.compat.v1.Session' +76:27: INFO: Renamed 'tf.ConfigProto' to 'tf.compat.v1.ConfigProto' +116:13: INFO: Renamed 'tf.train.Server' to 'tf.distribute.Server' +120:18: INFO: Renamed 'tf.train.replica_device_setter' to 'tf.compat.v1.train.replica_device_setter' +127:18: INFO: Renamed 'tf.train.replica_device_setter' to 'tf.compat.v1.train.replica_device_setter' +-------------------------------------------------------------------------------- + +-------------------------------------------------------------------------------- +Processing file 'tensorflow/mnist_with_summaries/mnist_with_summaries.py' + outputting to 'tensorflow_v2/mnist_with_summaries/mnist_with_summaries.py' +-------------------------------------------------------------------------------- + +41:9: INFO: Renamed 'tf.InteractiveSession' to 'tf.compat.v1.InteractiveSession' +45:7: INFO: `name` passed to `name_scope`. Because you may be re-entering an existing scope, it is not safe to convert automatically, the v2 name_scope does not support re-entering scopes by name. + +45:7: INFO: Renamed 'tf.name_scope' to 'tf.compat.v1.name_scope' +46:8: INFO: Renamed 'tf.placeholder' to 'tf.compat.v1.placeholder' +47:9: INFO: Renamed 'tf.placeholder' to 'tf.compat.v1.placeholder' +49:7: INFO: `name` passed to `name_scope`. Because you may be re-entering an existing scope, it is not safe to convert automatically, the v2 name_scope does not support re-entering scopes by name. + +49:7: INFO: Renamed 'tf.name_scope' to 'tf.compat.v1.name_scope' +51:4: INFO: tf.summary.image requires manual check. The TF 1.x summary API cannot be automatically migrated to TF 2.0, so symbols have been converted to tf.compat.v1.summary.* and must be migrated manually. Typical usage will only require changes to the summary writing logic, not to individual calls like scalar(). For examples of the new summary API, see the Effective TF 2.0 migration document or check the TF 2.0 TensorBoard tutorials. +56:14: INFO: Renamed 'tf.truncated_normal' to 'tf.random.truncated_normal' +66:9: INFO: `name` passed to `name_scope`. Because you may be re-entering an existing scope, it is not safe to convert automatically, the v2 name_scope does not support re-entering scopes by name. + +66:9: INFO: Renamed 'tf.name_scope' to 'tf.compat.v1.name_scope' +68:6: INFO: tf.summary.scalar requires manual check. 
The TF 1.x summary API cannot be automatically migrated to TF 2.0, so symbols have been converted to tf.compat.v1.summary.* and must be migrated manually. Typical usage will only require changes to the summary writing logic, not to individual calls like scalar(). For examples of the new summary API, see the Effective TF 2.0 migration document or check the TF 2.0 TensorBoard tutorials. +69:11: INFO: `name` passed to `name_scope`. Because you may be re-entering an existing scope, it is not safe to convert automatically, the v2 name_scope does not support re-entering scopes by name. + +69:11: INFO: Renamed 'tf.name_scope' to 'tf.compat.v1.name_scope' +71:6: INFO: tf.summary.scalar requires manual check. The TF 1.x summary API cannot be automatically migrated to TF 2.0, so symbols have been converted to tf.compat.v1.summary.* and must be migrated manually. Typical usage will only require changes to the summary writing logic, not to individual calls like scalar(). For examples of the new summary API, see the Effective TF 2.0 migration document or check the TF 2.0 TensorBoard tutorials. +72:6: INFO: tf.summary.scalar requires manual check. The TF 1.x summary API cannot be automatically migrated to TF 2.0, so symbols have been converted to tf.compat.v1.summary.* and must be migrated manually. Typical usage will only require changes to the summary writing logic, not to individual calls like scalar(). For examples of the new summary API, see the Effective TF 2.0 migration document or check the TF 2.0 TensorBoard tutorials. +73:6: INFO: tf.summary.scalar requires manual check. The TF 1.x summary API cannot be automatically migrated to TF 2.0, so symbols have been converted to tf.compat.v1.summary.* and must be migrated manually. Typical usage will only require changes to the summary writing logic, not to individual calls like scalar(). For examples of the new summary API, see the Effective TF 2.0 migration document or check the TF 2.0 TensorBoard tutorials. +74:6: INFO: tf.summary.histogram requires manual check. The TF 1.x summary API cannot be automatically migrated to TF 2.0, so symbols have been converted to tf.compat.v1.summary.* and must be migrated manually. Typical usage will only require changes to the summary writing logic, not to individual calls like scalar(). For examples of the new summary API, see the Effective TF 2.0 migration document or check the TF 2.0 TensorBoard tutorials. +83:9: INFO: `name` passed to `name_scope`. Because you may be re-entering an existing scope, it is not safe to convert automatically, the v2 name_scope does not support re-entering scopes by name. + +83:9: INFO: Renamed 'tf.name_scope' to 'tf.compat.v1.name_scope' +85:11: INFO: `name` passed to `name_scope`. Because you may be re-entering an existing scope, it is not safe to convert automatically, the v2 name_scope does not support re-entering scopes by name. + +85:11: INFO: Renamed 'tf.name_scope' to 'tf.compat.v1.name_scope' +88:11: INFO: `name` passed to `name_scope`. Because you may be re-entering an existing scope, it is not safe to convert automatically, the v2 name_scope does not support re-entering scopes by name. + +88:11: INFO: Renamed 'tf.name_scope' to 'tf.compat.v1.name_scope' +91:11: INFO: `name` passed to `name_scope`. Because you may be re-entering an existing scope, it is not safe to convert automatically, the v2 name_scope does not support re-entering scopes by name. + +91:11: INFO: Renamed 'tf.name_scope' to 'tf.compat.v1.name_scope' +93:8: INFO: tf.summary.histogram requires manual check. 
The TF 1.x summary API cannot be automatically migrated to TF 2.0, so symbols have been converted to tf.compat.v1.summary.* and must be migrated manually. Typical usage will only require changes to the summary writing logic, not to individual calls like scalar(). For examples of the new summary API, see the Effective TF 2.0 migration document or check the TF 2.0 TensorBoard tutorials. +95:6: INFO: tf.summary.histogram requires manual check. The TF 1.x summary API cannot be automatically migrated to TF 2.0, so symbols have been converted to tf.compat.v1.summary.* and must be migrated manually. Typical usage will only require changes to the summary writing logic, not to individual calls like scalar(). For examples of the new summary API, see the Effective TF 2.0 migration document or check the TF 2.0 TensorBoard tutorials. +100:7: INFO: `name` passed to `name_scope`. Because you may be re-entering an existing scope, it is not safe to convert automatically, the v2 name_scope does not support re-entering scopes by name. + +100:7: INFO: Renamed 'tf.name_scope' to 'tf.compat.v1.name_scope' +101:16: INFO: Renamed 'tf.placeholder' to 'tf.compat.v1.placeholder' +102:4: INFO: tf.summary.scalar requires manual check. The TF 1.x summary API cannot be automatically migrated to TF 2.0, so symbols have been converted to tf.compat.v1.summary.* and must be migrated manually. Typical usage will only require changes to the summary writing logic, not to individual calls like scalar(). For examples of the new summary API, see the Effective TF 2.0 migration document or check the TF 2.0 TensorBoard tutorials. +103:14: INFO: Changing keep_prob arg of tf.nn.dropout to rate, and recomputing value. + +108:7: INFO: `name` passed to `name_scope`. Because you may be re-entering an existing scope, it is not safe to convert automatically, the v2 name_scope does not support re-entering scopes by name. + +108:7: INFO: Renamed 'tf.name_scope' to 'tf.compat.v1.name_scope' +119:9: INFO: `name` passed to `name_scope`. Because you may be re-entering an existing scope, it is not safe to convert automatically, the v2 name_scope does not support re-entering scopes by name. + +119:9: INFO: Renamed 'tf.name_scope' to 'tf.compat.v1.name_scope' +120:22: INFO: tf.losses.sparse_softmax_cross_entropy requires manual check. tf.losses have been replaced with object oriented versions in TF 2.0 and after. The loss function calls have been converted to compat.v1 for backward compatibility. Please update these calls to the TF 2.0 versions. +120:22: INFO: Renamed 'tf.losses.sparse_softmax_cross_entropy' to 'tf.compat.v1.losses.sparse_softmax_cross_entropy' +122:2: INFO: tf.summary.scalar requires manual check. The TF 1.x summary API cannot be automatically migrated to TF 2.0, so symbols have been converted to tf.compat.v1.summary.* and must be migrated manually. Typical usage will only require changes to the summary writing logic, not to individual calls like scalar(). For examples of the new summary API, see the Effective TF 2.0 migration document or check the TF 2.0 TensorBoard tutorials. +124:7: INFO: `name` passed to `name_scope`. Because you may be re-entering an existing scope, it is not safe to convert automatically, the v2 name_scope does not support re-entering scopes by name. + +124:7: INFO: Renamed 'tf.name_scope' to 'tf.compat.v1.name_scope' +125:17: INFO: Renamed 'tf.train.AdamOptimizer' to 'tf.compat.v1.train.AdamOptimizer' +128:7: INFO: `name` passed to `name_scope`. 
Because you may be re-entering an existing scope, it is not safe to convert automatically, the v2 name_scope does not support re-entering scopes by name. + +128:7: INFO: Renamed 'tf.name_scope' to 'tf.compat.v1.name_scope' +129:9: INFO: `name` passed to `name_scope`. Because you may be re-entering an existing scope, it is not safe to convert automatically, the v2 name_scope does not support re-entering scopes by name. + +129:9: INFO: Renamed 'tf.name_scope' to 'tf.compat.v1.name_scope' +131:9: INFO: `name` passed to `name_scope`. Because you may be re-entering an existing scope, it is not safe to convert automatically, the v2 name_scope does not support re-entering scopes by name. + +131:9: INFO: Renamed 'tf.name_scope' to 'tf.compat.v1.name_scope' +133:2: INFO: tf.summary.scalar requires manual check. The TF 1.x summary API cannot be automatically migrated to TF 2.0, so symbols have been converted to tf.compat.v1.summary.* and must be migrated manually. Typical usage will only require changes to the summary writing logic, not to individual calls like scalar(). For examples of the new summary API, see the Effective TF 2.0 migration document or check the TF 2.0 TensorBoard tutorials. +137:11: INFO: tf.summary.merge_all requires manual check. The TF 1.x summary API cannot be automatically migrated to TF 2.0, so symbols have been converted to tf.compat.v1.summary.* and must be migrated manually. Typical usage will only require changes to the summary writing logic, not to individual calls like scalar(). For examples of the new summary API, see the Effective TF 2.0 migration document or check the TF 2.0 TensorBoard tutorials. +137:11: INFO: Renamed 'tf.summary.merge_all' to 'tf.compat.v1.summary.merge_all' +138:17: INFO: tf.summary.FileWriter requires manual check. The TF 1.x summary API cannot be automatically migrated to TF 2.0, so symbols have been converted to tf.compat.v1.summary.* and must be migrated manually. Typical usage will only require changes to the summary writing logic, not to individual calls like scalar(). For examples of the new summary API, see the Effective TF 2.0 migration document or check the TF 2.0 TensorBoard tutorials. +138:17: INFO: Renamed 'tf.summary.FileWriter' to 'tf.compat.v1.summary.FileWriter' +139:16: INFO: tf.summary.FileWriter requires manual check. The TF 1.x summary API cannot be automatically migrated to TF 2.0, so symbols have been converted to tf.compat.v1.summary.* and must be migrated manually. Typical usage will only require changes to the summary writing logic, not to individual calls like scalar(). For examples of the new summary API, see the Effective TF 2.0 migration document or check the TF 2.0 TensorBoard tutorials. 
+139:16: INFO: Renamed 'tf.summary.FileWriter' to 'tf.compat.v1.summary.FileWriter' +140:2: INFO: Renamed 'tf.global_variables_initializer' to 'tf.compat.v1.global_variables_initializer' +163:22: INFO: Renamed 'tf.RunOptions' to 'tf.compat.v1.RunOptions' +163:48: INFO: Renamed 'tf.RunOptions' to 'tf.compat.v1.RunOptions' +164:23: INFO: Renamed 'tf.RunMetadata' to 'tf.compat.v1.RunMetadata' +180:5: INFO: Renamed 'tf.gfile.Exists' to 'tf.io.gfile.exists' +181:4: INFO: Renamed 'tf.gfile.DeleteRecursively' to 'tf.io.gfile.rmtree' +182:2: INFO: Renamed 'tf.gfile.MakeDirs' to 'tf.io.gfile.makedirs' +212:2: INFO: Renamed 'tf.app.run' to 'tf.compat.v1.app.run' +-------------------------------------------------------------------------------- + +-------------------------------------------------------------------------------- +Processing file 'tensorflow/distribution_strategy/keras-API/multi_worker_strategy-with-keras.py' + outputting to 'tensorflow_v2/distribution_strategy/keras-API/multi_worker_strategy-with-keras.py' +-------------------------------------------------------------------------------- + +136:2: WARNING: *.save requires manual check. (This warning is only applicable if the code saves a tf.Keras model) Keras model.save now saves to the Tensorflow SavedModel format by default, instead of HDF5. To continue saving to HDF5, add the argument save_format='h5' to the save() function. +-------------------------------------------------------------------------------- + +-------------------------------------------------------------------------------- +Processing file 'tensorflow/distribution_strategy/estimator-API/keras_model_to_estimator.py' + outputting to 'tensorflow_v2/distribution_strategy/estimator-API/keras_model_to_estimator.py' +-------------------------------------------------------------------------------- + +51:14: INFO: Renamed 'tf.train.GradientDescentOptimizer' to 'tf.compat.v1.train.GradientDescentOptimizer' +60:30: WARNING: Using member tf.contrib.distribute.DistributeConfig in deprecated module tf.contrib.distribute. (Manual edit required) tf.contrib.distribute.* have been migrated to tf.distribute.*. Please check out the new module for updated APIs. +60:30: ERROR: Using member tf.contrib.distribute.DistributeConfig in deprecated module tf.contrib. tf.contrib.distribute.DistributeConfig cannot be converted automatically. tf.contrib will not be distributed with TensorFlow 2.0, please consider an alternative in non-contrib TensorFlow, a community-maintained repository such as tensorflow/addons, or fork the required code. +61:27: ERROR: tf.contrib.distribute.CollectiveAllReduceStrategy requires manual check. (Manual edit required) tf.contrib.distribute.CollectiveAllReduceStrategy has been migrated to tf.distribute.experimental.MultiWorkerMirroredStrategy. Note the changes in constructor. If you're using the strategy with a custom training loop, note the following changes in methods: make_dataset_iterator->experimental_distribute_dataset, experimental_make_numpy_iterator->experimental_make_numpy_dataset, extended.call_for_each_replica->run, reduce requires an axis argument, unwrap->experimental_local_results experimental_initialize and experimental_finalize no longer needed +61:27: WARNING: Using member tf.contrib.distribute.CollectiveAllReduceStrategy in deprecated module tf.contrib.distribute. (Manual edit required) tf.contrib.distribute.* have been migrated to tf.distribute.*. Please check out the new module for updated APIs. 
+61:27: ERROR: Using member tf.contrib.distribute.CollectiveAllReduceStrategy in deprecated module tf.contrib. tf.contrib.distribute.CollectiveAllReduceStrategy cannot be converted automatically. tf.contrib will not be distributed with TensorFlow 2.0, please consider an alternative in non-contrib TensorFlow, a community-maintained repository such as tensorflow/addons, or fork the required code. +63:26: ERROR: tf.contrib.distribute.MirroredStrategy requires manual check. (Manual edit required) tf.contrib.distribute.MirroredStrategy has been migrated to tf.distribute.MirroredStrategy. Things to note: Constructor arguments have changed. If you are using MirroredStrategy with Keras training framework, the input provided to `model.fit` will be assumed to have global batch size and split across the replicas. If you're using the strategy with a custom training loop, note the following changes in methods: make_dataset_iterator->experimental_distribute_dataset, experimental_make_numpy_iterator->experimental_make_numpy_dataset, extended.call_for_each_replica->run, reduce requires an axis argument, unwrap->experimental_local_results experimental_initialize and experimental_finalize no longer needed +63:26: WARNING: Using member tf.contrib.distribute.MirroredStrategy in deprecated module tf.contrib.distribute. (Manual edit required) tf.contrib.distribute.* have been migrated to tf.distribute.*. Please check out the new module for updated APIs. +63:26: ERROR: Using member tf.contrib.distribute.MirroredStrategy in deprecated module tf.contrib. tf.contrib.distribute.MirroredStrategy cannot be converted automatically. tf.contrib will not be distributed with TensorFlow 2.0, please consider an alternative in non-contrib TensorFlow, a community-maintained repository such as tensorflow/addons, or fork the required code. 
+77:2: INFO: Renamed 'tf.logging.set_verbosity' to 'tf.compat.v1.logging.set_verbosity' +77:27: INFO: Renamed 'tf.logging.INFO' to 'tf.compat.v1.logging.INFO' +78:2: INFO: Renamed 'tf.app.run' to 'tf.compat.v1.app.run' +-------------------------------------------------------------------------------- + +-------------------------------------------------------------------------------- +Processing file 'tensorflow/dist-mnist/dist_mnist.py' + outputting to 'tensorflow_v2/dist-mnist/dist_mnist.py' +-------------------------------------------------------------------------------- + +48:8: INFO: Renamed 'tf.app.flags' to 'tf.compat.v1.app.flags' +143:13: INFO: Renamed 'tf.train.Server' to 'tf.distribute.Server' +162:6: INFO: Renamed 'tf.train.replica_device_setter' to 'tf.compat.v1.train.replica_device_setter' +170:8: INFO: Renamed 'tf.truncated_normal' to 'tf.random.truncated_normal' +178:8: INFO: Renamed 'tf.truncated_normal' to 'tf.random.truncated_normal' +185:8: INFO: Renamed 'tf.placeholder' to 'tf.compat.v1.placeholder' +186:9: INFO: Renamed 'tf.placeholder' to 'tf.compat.v1.placeholder' +188:14: INFO: Renamed 'tf.nn.xw_plus_b' to 'tf.compat.v1.nn.xw_plus_b' +191:22: INFO: Renamed 'tf.nn.xw_plus_b' to 'tf.compat.v1.nn.xw_plus_b' +192:40: INFO: Renamed 'tf.log' to 'tf.math.log' +194:10: INFO: Renamed 'tf.train.AdamOptimizer' to 'tf.compat.v1.train.AdamOptimizer' +202:12: INFO: Renamed 'tf.train.SyncReplicasOptimizer' to 'tf.compat.v1.train.SyncReplicasOptimizer' +221:14: INFO: Renamed 'tf.global_variables_initializer' to 'tf.compat.v1.global_variables_initializer' +225:11: INFO: Renamed 'tf.train.Supervisor' to 'tf.compat.v1.train.Supervisor' +234:11: INFO: Renamed 'tf.train.Supervisor' to 'tf.compat.v1.train.Supervisor' +241:18: INFO: Renamed 'tf.ConfigProto' to 'tf.compat.v1.ConfigProto' +303:2: INFO: Renamed 'tf.app.run' to 'tf.compat.v1.app.run' +-------------------------------------------------------------------------------- +
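
For reference, tf_upgrade_v2_report.txt above is the output of the
tf_upgrade_v2 utility that ships with TensorFlow. Judging from the input and
output trees recorded in the log, the report corresponds to an invocation
along these lines (reconstructed from the report itself, not part of this
patch):

    tf_upgrade_v2 --intree tensorflow/ --outtree tensorflow_v2/ \
        --reportfile tf_upgrade_v2_report.txt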