
TensorFlow MNIST for GitHub Actions


WARNING: these docs are outdated

This is an introductory example of how to create a CI/CD pipeline with DVC-CML and GitHub Actions.

Setup

1. Install DVC on your computer if you haven't done so already.
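
One way to install it locally (just one option; any installation method from the DVC docs works) is via pip, including the S3 extra used by the remote configured in step 9:

pip install "dvc[s3]"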

2. Create a repo in your GitHub account

3. Clone the repo to your computer:

git clone your-repo-url

4. Set up your project structure:

mkdir models metrics code
touch models/.gitkeep
touch metrics/.gitkeep

echo -e "tensorflow\nwget" >> requirements.txt

mkdir -p .github/workflows
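
After these commands, your project should look roughly like this (data/ and the workflow file are added in later steps):

.
├── .github/
│   └── workflows/
├── code/
├── metrics/
│   └── .gitkeep
├── models/
│   └── .gitkeep
└── requirements.txt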

5. Install requirements:

pip install tensorflow wget

6. Create a code/mnist.py file with the following content:

code/mnist.py
import os
import sys
import gzip
import shutil

import numpy as np

import wget

def download(uri, path):
    wget.download(uri, path)

# Decompress a .gz file next to the original (e.g. foo.gz -> foo)
def unzip(path):
    input = gzip.GzipFile(path, 'rb')
    s = input.read()
    input.close()

    output = open(path.replace('.gz', ''), 'wb')
    output.write(s)
    output.close()

# Read n images from a raw MNIST IDX file (skip the 16-byte header; 28*28 bytes per image)
def get_images(imgf, n):
    f = open(imgf, "rb")
    f.read(16)
    images = []

    for i in range(n):
        image = []
        for j in range(28*28):
            image.append(ord(f.read(1)))
        images.append(image)

    return images

# Read n labels from a raw MNIST IDX file (skip the 8-byte header; one byte per label)
def get_labels(labelf, n):
    l = open(labelf, "rb")
    l.read(8)
    labels = []
    for i in range(n):
        labels.append(ord(l.read(1)))
        
    return labels

# Write one CSV row per image: the label followed by the 784 pixel values
def output_csv(folder, images, labels, prefix):
    if not os.path.exists(folder):
        os.mkdir(folder)

    o = open(os.path.join(folder, "mnist_%s.csv"%prefix), "w")
    for i in range(len(images)):
        o.write(",".join(str(x) for x in [labels[i]] + images[i]) + "\n")
    o.close()

def process(folder, imgf, labelf, prefix, n):
    images = get_images(os.path.join(folder, imgf), n)
    labels = get_labels(os.path.join(folder, labelf), n)
    output_csv(folder, images, labels, prefix)
    
# Load a CSV produced by output_csv into one-hot labels and images scaled to [0, 1]
def read_csv(path):
    labels = []
    imgs = []

    with open(path) as f:
        for i, line in enumerate(f): 
            data = line.split(',')  

            label = data[0]
            label_one_hot = np.zeros(10)
            label_one_hot[int(label)] = 1
            labels.append(label_one_hot)

            img = np.array(data[1:])
            img = img.astype(np.float32)
            img = np.multiply(img, 1.0 / 255.0)
            imgs.append(img)
    
    return (np.asarray(labels), np.asarray(imgs))


# Minimal in-memory dataset that yields shuffled mini-batches
class DataSet(object):
    def __init__(self, images, labels):
        self.num_examples = images.shape[0]
        self.images = images
        self.labels = labels
        self.epochs_completed = 0
        self.index_in_epoch = 0

    def next_batch(self, batch_size):
        start = self.index_in_epoch
        self.index_in_epoch += batch_size

        if self.index_in_epoch > self.num_examples:
            self.epochs_completed += 1

            # Shuffle the data
            perm = np.arange(self.num_examples)
            np.random.shuffle(perm)
            self.images = self.images[perm]
            self.labels = self.labels[perm]

            # Start next epoch
            start = 0
            self.index_in_epoch = batch_size
            assert batch_size <= self.num_examples

        end = self.index_in_epoch
        return self.images[start:end], self.labels[start:end]


if __name__ == "__main__":
    if len(sys.argv) < 2:
        print('Output folder is missing. Run the script with a folder path.')
        sys.exit(1)

    out_folder = sys.argv[1]

    if not os.path.exists(out_folder):
        os.mkdir(out_folder)
    else:
        print('folder ' + out_folder + ' already exists! Delete it and all its contents before preparing the data')
        sys.exit(1)

    SOURCE_URL = 'http://yann.lecun.com/exdb/mnist/'
    files = ['train-images-idx3-ubyte.gz',
        'train-labels-idx1-ubyte.gz', 
        't10k-images-idx3-ubyte.gz', 
        't10k-labels-idx1-ubyte.gz' ]

    for fil in files:
        path = os.path.join(out_folder, fil)
        download(SOURCE_URL + fil, out_folder)
        unzip(path)

    process(out_folder, "train-images-idx3-ubyte", "train-labels-idx1-ubyte", 'train', 60000)
    process(out_folder, "t10k-images-idx3-ubyte",  "t10k-labels-idx1-ubyte", 'test', 10000)

    for filename in files:
        path = os.path.join(out_folder, filename)
        os.remove(path)
        os.remove(path.replace('.gz', ''))

7. Create a code/train.py file with the following content:

code/train.py
import os
import json
import time
import tensorflow.compat.v1 as tf
from tensorflow.python.util import deprecation
deprecation._PRINT_DEPRECATION_WARNINGS = False
tf.disable_v2_behavior()

import mnist

dirname = os.path.dirname(__file__)

train_labels, train_images = mnist.read_csv(os.path.join(dirname, '../data/mnist_train.csv'))
DATASET = mnist.DataSet(train_images, train_labels)
OUT = os.path.join(dirname, "../models/mnist")

batch_size = 128
num_steps = 1800
learning_rate = 0.01
start = time.time()

# input
x = tf.placeholder(tf.float32, [None, 784], "x")
y_ = tf.placeholder(tf.float32, [None, 10], "y")

# weight
W = tf.Variable(tf.zeros([784, 10]))
# bias
b = tf.Variable(tf.zeros([10]))
# test_data * W + b
y = tf.matmul(x, W) + b
sm = tf.nn.softmax(y, name="softmax")

# cross entropy (loss function)
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=y, labels=y_), name="loss")

# train step
train_step = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss)

# evaluating the model
correct_prediction = tf.equal(tf.argmax(sm, 1), tf.argmax(y_, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32), name="accuracy")

saver = tf.train.Saver()
init = tf.global_variables_initializer()

with tf.Session() as session:
    session.run(init)

    # training
    for step in range(num_steps):
        batch_data, batch_labels = DATASET.next_batch(batch_size)
        feed_dict = {x: batch_data, y_: batch_labels}
        
        loss_out, ts_out, acc_out = session.run([loss, train_step, accuracy], feed_dict=feed_dict)  

    save_path = saver.save(session, OUT)

    with open(os.path.join(dirname, '../metrics/train.json'), 'w') as outfile:
        json.dump({ 
            "batch_size": batch_size, 
            "num_steps": num_steps, 
            "learning_rate": learning_rate,  
            "took" : (time.time() - start) / 1000 }, outfile)

8. Create a code/eval.py file with the following content:

code/eval.py
import os
import json
import tensorflow.compat.v1 as tf
from tensorflow.python.util import deprecation
deprecation._PRINT_DEPRECATION_WARNINGS = False
tf.disable_v2_behavior()

import mnist

dirname = os.path.dirname(__file__)

LABELS, IMAGES = mnist.read_csv(os.path.join(dirname, '../data/mnist_test.csv'))

META = os.path.join(dirname, '../models/mnist.meta')
MODELS = os.path.join(dirname, '../models/')

init = tf.global_variables_initializer()
with tf.Session() as sess:
    saver = tf.train.import_meta_graph(META)
    saver.restore(sess, tf.train.latest_checkpoint(MODELS))

    graph = tf.get_default_graph()

    x = graph.get_tensor_by_name("x:0")
    y = graph.get_tensor_by_name("y:0")
    softmax = graph.get_tensor_by_name("softmax:0")
    accuracy = graph.get_tensor_by_name("accuracy:0")
    feed_dict = { x: IMAGES, y: LABELS }

    pred = sess.run([softmax, accuracy], feed_dict=feed_dict)
    with open(os.path.join(dirname, '../metrics/eval.json'), 'w') as outfile:
        json.dump({ "accuracy" : pred[1].item() }, outfile)

9. Set up DVC in your project. You will need to add a DVC remote storage:

dvc init
dvc remote add -d myremote s3://your-s3-bucket/dvc-mnist-example
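
When pushing to the S3 remote from your computer, DVC relies on your local AWS credentials. If they are not already configured (for example via ~/.aws/credentials), you can export them in your shell first (placeholder values shown):

export AWS_ACCESS_KEY_ID=<your-access-key-id>
export AWS_SECRET_ACCESS_KEY=<your-secret-access-key>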

10. Set up the project data:

python code/mnist.py data

If everything has gone well, you should have two CSV files inside data:

  • mnist_train.csv
  • mnist_test.csv

containing 60000 and 10000 images (one per row) respectively.
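
As an optional sanity check (assuming a Unix-like shell), you can count the rows:

wc -l data/mnist_train.csv data/mnist_test.csv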

Now track the data with DVC (this creates a data.dvc file and adds data/ to .gitignore) by running:

dvc add data

11. Let's create your DVC pipeline by running the following commands:

dvc run --no-exec \
    -f train.dvc \
    -d code/train.py \
    -d data/mnist_train.csv \
    -o models \
    -M metrics/train.json \
    python code/train.py
dvc run --no-exec \
    -f eval.dvc \
    -d code/eval.py \
    -d data/mnist_test.csv \
    -d models \
    -M metrics/eval.json \
    python code/eval.py
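
The --no-exec flag only records the stage files without running the commands; the CI job will execute them later. If you want to see locally which stages DVC considers changed (optional; the exact output depends on your DVC version), run:

dvc status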

12. Create a .github/workflows/dvc.yaml file with the following content:

.github/workflows/dvc.yaml
name: default

on: [push, pull_request]

jobs:
  dvc:
    runs-on: ubuntu-latest
    container: docker://dvcorg/dvc-cml:latest

    steps:
      - uses: actions/checkout@v2
      
      - name: dvc_action_run
        env:
          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          repro_targets: eval.dvc
        run: |
          apt-get update && apt-get install -y python-pip && pip install --upgrade pip
          pip install -r requirements.txt
          dvc_cml_run

13. Set up your AWS credentials (AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY, as referenced in the workflow above) as secrets in your repo.
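
You can add them from the repo's Settings → Secrets page, or, if you happen to have the GitHub CLI installed (an optional alternative, not required by this guide), from the command line:

gh secret set AWS_ACCESS_KEY_ID
gh secret set AWS_SECRET_ACCESS_KEY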

14. You can now commit and push your first code to your repo:

git add --all
git commit -m "mnist ready"

dvc push
git push

Overview

Congratulations! 🎉 You have created your first CD ML pipeline with DVC-CML. Let's check what is going to happen.

1. GitHub Actions will run your workflow file every time you push or open a pull request:

on: [push, pull_request]

The runner, an Ubuntu 18.04 server, will be allocated by GitHub as specified by:

runs-on: ubuntu-latest

However, it could run on your own servers with GPUs if needed! 😃 Check how to run your own runners.

2. If your DVC pipeline changes, DVC-CML will take care of managing your pipeline outputs.

On every push or PR, dvc repro will be executed if your DVC pipeline has changed, and only if it has changed. This allows you to treat every branch or commit as a new experiment when you change your pipeline, or as a normal git push when the pipeline does not change.

Go to the commits section of your GitHub repo. If everything went fine, you will see that the action ran properly and that there is an additional commit made by the GitHub Actions agent.

DVC-CML ran dvc repro for you (executing the train and eval stages) and then pushed your changes to Git and DVC! 🚀

3. DVC-CML creates a DVC Report as a GitHub check.

This is where you can see how your experiment went: which files changed and the resulting metrics.