diff --git a/program/cohort.ipynb b/program/cohort.ipynb index 14860f6..55a6aa9 100644 --- a/program/cohort.ipynb +++ b/program/cohort.ipynb @@ -34,7 +34,7 @@ "## Initial setup\n", "\n", ":::{.callout-note}\n", - "Before running this notebook, follow the [setup instructions](https://program.ml.school/setup.html) for the program.\n", + "Before running this notebook, follow the [Setup Instructions](https://program.ml.school/setup.html) for the program.\n", ":::\n", "\n", "Let's start by setting up the environment and preparing to run the notebook.\n" @@ -42,7 +42,7 @@ }, { "cell_type": "code", - "execution_count": 640, + "execution_count": 150, "id": "4b2265b0", "metadata": {}, "outputs": [ @@ -77,8 +77,7 @@ "INFERENCE_CODE_FOLDER = CODE_FOLDER / \"inference\"\n", "INFERENCE_CODE_FOLDER.mkdir(parents=True, exist_ok=True)\n", "\n", - "sys.path.append(f\"./{CODE_FOLDER}\")\n", - "sys.path.append(f\"./{INFERENCE_CODE_FOLDER}\")\n", + "sys.path.extend([f\"./{CODE_FOLDER}\", f\"./{INFERENCE_CODE_FOLDER}\"])\n", "\n", "DATA_FILEPATH = \"penguins.csv\"\n", "\n", @@ -101,12 +100,12 @@ }, { "cell_type": "code", - "execution_count": 641, + "execution_count": 151, "id": "32c4d764", "metadata": {}, "outputs": [], "source": [ - "LOCAL_MODE = False" + "LOCAL_MODE = True" ] }, { @@ -119,7 +118,7 @@ }, { "cell_type": "code", - "execution_count": 642, + "execution_count": 152, "id": "3164a3af", "metadata": {}, "outputs": [], @@ -142,7 +141,7 @@ }, { "cell_type": "code", - "execution_count": 643, + "execution_count": 153, "id": "7bc40d28", "metadata": {}, "outputs": [], @@ -161,7 +160,7 @@ }, { "cell_type": "code", - "execution_count": 644, + "execution_count": 154, "id": "3b3f17e5", "metadata": {}, "outputs": [], @@ -201,7 +200,7 @@ }, { "cell_type": "code", - "execution_count": 645, + "execution_count": 155, "id": "942a01b5", "metadata": {}, "outputs": [], @@ -242,7 +241,7 @@ }, { "cell_type": "code", - "execution_count": 646, + "execution_count": 156, "id": "f1cd2f0e-446d-48a9-a008-b4f1cc593bfc", "metadata": { "tags": [] @@ -349,7 +348,7 @@ "4 3450.0 FEMALE " ] }, - "execution_count": 646, + "execution_count": 156, "metadata": {}, "output_type": "execute_result" } @@ -386,7 +385,7 @@ }, { "cell_type": "code", - "execution_count": 647, + "execution_count": 157, "id": "f2107c25-e730-4e22-a1b8-5bda53e61124", "metadata": { "tags": [] @@ -565,7 +564,7 @@ "max 6300.000000 NaN " ] }, - "execution_count": 647, + "execution_count": 157, "metadata": {}, "output_type": "execute_result" } @@ -584,7 +583,7 @@ }, { "cell_type": "code", - "execution_count": 648, + "execution_count": 158, "id": "1242122a-726e-4c37-a718-dd8e873d1612", "metadata": { "tags": [] @@ -635,14 +634,14 @@ "\n", "- `species`: There are 3 species of penguins in the dataset: Adelie (152), Gentoo (124), and Chinstrap (68).\n", "- `island`: Penguins are from 3 islands: Biscoe (168), Dream (124), and Torgersen (52).\n", - "- `sex`: We have 168 male penguins, 165 female penguins, and 1 penguin with an ambiguous gender ('.').\n", + "- `sex`: We have 168 male penguins, 165 female penguins, and 1 penguin with an ambiguous gender (`.`).\n", "\n", "Let's replace the ambiguous value in the `sex` column with a null value:\n" ] }, { "cell_type": "code", - "execution_count": 649, + "execution_count": 159, "id": "cf1cf582-8831-4f83-bb17-2175afb193e8", "metadata": { "tags": [] @@ -657,7 +656,7 @@ "Name: count, dtype: int64" ] }, - "execution_count": 649, + "execution_count": 159, "metadata": {}, "output_type": "execute_result" } @@ -677,7 +676,7 @@ }, { 
"cell_type": "code", - "execution_count": 650, + "execution_count": 160, "id": "cc42cb08-275c-4b05-9d2b-77052da2f336", "metadata": { "tags": [] @@ -696,7 +695,7 @@ "dtype: int64" ] }, - "execution_count": 650, + "execution_count": 160, "metadata": {}, "output_type": "execute_result" } @@ -715,7 +714,7 @@ }, { "cell_type": "code", - "execution_count": 651, + "execution_count": 161, "id": "3c57d55d-afd6-467a-a7a8-ff04132770ed", "metadata": { "tags": [] @@ -734,7 +733,7 @@ "dtype: int64" ] }, - "execution_count": 651, + "execution_count": 161, "metadata": {}, "output_type": "execute_result" } @@ -757,7 +756,7 @@ }, { "cell_type": "code", - "execution_count": 652, + "execution_count": 162, "id": "2852c740", "metadata": {}, "outputs": [ @@ -803,7 +802,7 @@ }, { "cell_type": "code", - "execution_count": 653, + "execution_count": 163, "id": "707cc972", "metadata": {}, "outputs": [ @@ -851,7 +850,7 @@ }, { "cell_type": "code", - "execution_count": 654, + "execution_count": 164, "id": "3daf3ba1-d218-4ad4-b862-af679b91273f", "metadata": { "tags": [] @@ -931,7 +930,7 @@ "body_mass_g 640316.716388 " ] }, - "execution_count": 654, + "execution_count": 164, "metadata": {}, "output_type": "execute_result" } @@ -956,7 +955,7 @@ }, { "cell_type": "code", - "execution_count": 655, + "execution_count": 165, "id": "1d793e09-2cb9-47ff-a0e6-199a0f4fc1b3", "metadata": { "tags": [] @@ -1036,7 +1035,7 @@ "body_mass_g 1.000000 " ] }, - "execution_count": 655, + "execution_count": 165, "metadata": {}, "output_type": "execute_result" } @@ -1061,7 +1060,7 @@ }, { "cell_type": "code", - "execution_count": 656, + "execution_count": 166, "id": "1258c99d", "metadata": {}, "outputs": [ @@ -1101,7 +1100,7 @@ }, { "cell_type": "code", - "execution_count": 657, + "execution_count": 167, "id": "45b0a87f-028d-477f-9b65-199728c0b7ee", "metadata": { "tags": [] @@ -1155,7 +1154,7 @@ }, { "cell_type": "code", - "execution_count": 658, + "execution_count": 168, "id": "fb6ba7c0-1bd6-4fe5-8b7f-f6cbdfd3846c", "metadata": { "tags": [] @@ -1351,7 +1350,7 @@ }, { "cell_type": "code", - "execution_count": 659, + "execution_count": 169, "id": "d1f122a4-acff-4687-91b9-bfef13567d88", "metadata": { "tags": [] @@ -1368,7 +1367,6 @@ ], "source": [ "%%ipytest -s\n", - "\n", "#| code-fold: true\n", "#| output: false\n", "\n", @@ -1489,7 +1487,7 @@ }, { "cell_type": "code", - "execution_count": 660, + "execution_count": 170, "id": "d88e9ccf", "metadata": {}, "outputs": [], @@ -1504,12 +1502,12 @@ "id": "f3b1d96a", "metadata": {}, "source": [ - "We can parameterize a SageMaker Pipeline to make it more flexible. In this case, we'll use a paramater to pass the location of the dataset we want to process. We can execute the pipeline with different datasets by changing the value of this parameter. To read more about these parameters, check [Pipeline Parameters](https://docs.aws.amazon.com/sagemaker/latest/dg/build-and-manage-parameters.html).\n" + "We can parameterize a SageMaker Pipeline to make it more flexible. In this case, we'll use a parameter to pass the location of the dataset we want to process. We can execute the pipeline with different datasets by changing the value of this parameter. 
To read more about these parameters, check [Pipeline Parameters](https://docs.aws.amazon.com/sagemaker/latest/dg/build-and-manage-parameters.html).\n" ] }, { "cell_type": "code", - "execution_count": 661, + "execution_count": 171, "id": "331fe373", "metadata": {}, "outputs": [], @@ -1527,12 +1525,16 @@ "id": "cfb9a589", "metadata": {}, "source": [ - "A processor gives the Processing Step information about the hardware and software that SageMaker should use to launch the Processing Job. To run the script we created, we need access to Scikit-Learn, so we can use the [SKLearnProcessor](https://sagemaker.readthedocs.io/en/stable/frameworks/sklearn/sagemaker.sklearn.html#scikit-learn-processor) processor that comes out-of-the-box with the SageMaker's Python SDK. The [Data Processing with Framework Processors](https://docs.aws.amazon.com/sagemaker/latest/dg/processing-job-frameworks.html) page discusses other built-in processors you can use. The [Docker Registry Paths and Example Code](https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-algo-docker-registry-paths.html) page contains information about the available framework versions for each region.\n" + "A processor gives the Processing Step information about the hardware and software that SageMaker should use to launch the Processing Job. To run the script we created, we need access to Scikit-Learn, so we can use the [SKLearnProcessor](https://sagemaker.readthedocs.io/en/stable/frameworks/sklearn/sagemaker.sklearn.html#scikit-learn-processor) processor that comes out-of-the-box with the SageMaker's Python SDK. \n", + "\n", + "SageMaker manages the infrastructure of a Processing Job. It provisions resources for the duration of the job, and cleans up when it completes. The Processing Container image that SageMaker uses to run a Processing Job can either be a SageMaker built-in image or a custom image.\n", + "\n", + "The [Data Processing with Framework Processors](https://docs.aws.amazon.com/sagemaker/latest/dg/processing-job-frameworks.html) page discusses other built-in processors you can use. The [Docker Registry Paths and Example Code](https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-algo-docker-registry-paths.html) page contains information about the available framework versions for each region." ] }, { "cell_type": "code", - "execution_count": 662, + "execution_count": 172, "id": "3aa4471a", "metadata": {}, "outputs": [ @@ -1545,10 +1547,12 @@ } ], "source": [ + "#| code: true\n", + "#| output: false\n", "from sagemaker.sklearn.processing import SKLearnProcessor\n", "\n", "processor = SKLearnProcessor(\n", - " base_job_name=\"split-and-transform-data\",\n", + " base_job_name=\"preprocess-data\",\n", " framework_version=\"1.2-1\",\n", " # By default, a new account doesn't have access to `ml.m5.xlarge` instances.\n", " # If you haven't requested a quota increase yet, you can use an\n", @@ -1568,12 +1572,16 @@ "id": "6cf2cc58", "metadata": {}, "source": [ - "Let's now define the Processing Step that we'll use in the pipeline. This step requires a list of inputs that we need on the preprocessing script. In this case, the input is the dataset we stored in S3. We also have a few outputs that we want SageMaker to capture when the Processing Job finishes.\n" + "Let's now define the Processing Step that we'll use in the pipeline. This step requires a list of inputs that we need on the preprocessing script. In this case, the input is the dataset we stored in S3. 
We also have a few outputs that we want SageMaker to capture when the Processing Job finishes.\n", + "\n", + "Here's a high-level overview of this step and the Processing Job that SageMaker creates behind the scenes:\n", + "\n", + " \"High-level" ] }, { "cell_type": "code", - "execution_count": 663, + "execution_count": 173, "id": "cdbd9303", "metadata": { "tags": [] @@ -1596,8 +1604,8 @@ "from sagemaker.processing import ProcessingInput, ProcessingOutput\n", "\n", "\n", - "split_and_transform_data_step = ProcessingStep(\n", - " name=\"split-and-transform-data\",\n", + "preprocessing_step = ProcessingStep(\n", + " name=\"preprocess-data\",\n", " step_args=processor.run(\n", " code=f\"{CODE_FOLDER}/preprocessor.py\",\n", " inputs=[\n", @@ -1654,7 +1662,7 @@ }, { "cell_type": "code", - "execution_count": 664, + "execution_count": 174, "id": "e140642a", "metadata": { "tags": [] @@ -1663,17 +1671,10 @@ { "data": { "text/plain": [ - "{'PipelineArn': 'arn:aws:sagemaker:us-east-1:325223348818:pipeline/session1-pipeline',\n", - " 'ResponseMetadata': {'RequestId': '02b62dd1-6de0-4723-9019-f4f72862ba5c',\n", - " 'HTTPStatusCode': 200,\n", - " 'HTTPHeaders': {'x-amzn-requestid': '02b62dd1-6de0-4723-9019-f4f72862ba5c',\n", - " 'content-type': 'application/x-amz-json-1.1',\n", - " 'content-length': '85',\n", - " 'date': 'Fri, 27 Oct 2023 14:38:36 GMT'},\n", - " 'RetryAttempts': 0}}" + "{'PipelineArn': 'session1-pipeline'}" ] }, - "execution_count": 664, + "execution_count": 174, "metadata": {}, "output_type": "execute_result" } @@ -1691,7 +1692,7 @@ " name=\"session1-pipeline\",\n", " parameters=[dataset_location],\n", " steps=[\n", - " split_and_transform_data_step,\n", + " preprocessing_step,\n", " ],\n", " pipeline_definition_config=pipeline_definition_config,\n", " sagemaker_session=config[\"session\"],\n", @@ -1722,13 +1723,12 @@ }, { "cell_type": "code", - "execution_count": 665, + "execution_count": 175, "id": "59d1e634", "metadata": {}, "outputs": [], "source": [ "%%script false --no-raise-error\n", - "\n", "#| eval: false\n", "#| code: true\n", "#| output: false\n", @@ -1743,13 +1743,13 @@ "source": [ "### Assignments\n", "\n", - "- Assignment 1.1 The SageMaker Pipeline we built supports running a few steps in Local Mode. The goal of this assignment is to run the pipeline on your local environment using Local Mode.\n", + "- Assignment 1.1 For this assignment, you should run the pipeline on your environment using Local Mode and then switch it to run in SageMaker. After completing this assignment, you should have your environment fully configured and your pipeline running without any issues. This assignment is fundamental to the rest of the program, so make sure you complete it before moving on to any other assignments.\n", "\n", - "- Assignment 1.2 For this assignment, we want to run the end-to-end pipeline in SageMaker Studio. Ensure you turn off Local Mode before doing so.\n", + "- Assignment 1.2 The pipeline uses Random Sampling to split the dataset. Modify the code to use Stratified Sampling instead. The goal of this assignment is to help you familiarize with how to modify the preprocessing script and re-run the pipeline to see your changes in action.\n", "\n", - "- Assignment 1.3 The pipeline uses Random Sampling to split the dataset. Modify the code to use Stratified Sampling instead.\n", + "- Assignment 1.3 We can specify different parameter values in a pipeline at the time we start it. 
In this session, we defined a `dataset_location` parameter that specifies the location of the data that we want the pipeline to process. For this assignment, use ChatGPT to generate dataset with 500 random penguins and store the file in S3. Then, run the pipeline pointing the `dataset_location` to the new dataset. Here is an explanation of how to [override default parameters during a pipeline execution](https://docs.aws.amazon.com/sagemaker/latest/dg/run-pipeline.html#run-pipeline-parametrized). You can use the Advanced Data Analysis tool from ChatGPT to generate the fake data. If you don't have access to it, you can simply duplicate your dataset and store it at a different S3 location.\n", "\n", - "- Assignment 1.4 For this assignment, we want to run a distributed Processing Job across multiple instances to capitalize the `island` column of the dataset. Your dataset will consist of 10 different files stored in S3. Set up a Processing Job using two instances. When specifying the input to the Processing Job, you must set the `ProcessingInput.s3_data_distribution_type` attribute to `ShardedByS3Key`. By doing this, SageMaker will run a cluster with two instances simultaneously, each with access to half the files.\n", + "- Assignment 1.4 For this assignment, we want to run a distributed Processing Job across multiple instances to capitalize the `island` column of the dataset. Your dataset will consist of 10 different files stored in S3. Set up a Processing Step using two instances. When specifying the input to the Processing Step, you must set the `ProcessingInput.s3_data_distribution_type` attribute to `ShardedByS3Key`. By doing this, SageMaker will run a cluster with two instances simultaneously, each with access to half the files. Check the [`S3DataDistributionType`](https://docs.aws.amazon.com/sagemaker/latest/APIReference/API_S3DataSource.html) documentation for more information.\n", "\n", "- Assignment 1.5 You can use [Amazon SageMaker Data Wrangler](https://aws.amazon.com/sagemaker/data-wrangler/) to complete each step of the data preparation workflow (including data selection, cleansing, exploration, visualization, and processing at scale) from a single visual interface. For this assignment, load the Data Wrangler interface and use it to build the same transformations we implemented using the Scikit-Learn Pipeline. 
If you have questions, open the [Penguins Data Flow](penguins.flow) included in this repository.\n" ] @@ -1780,7 +1780,7 @@ }, { "cell_type": "code", - "execution_count": 666, + "execution_count": 176, "id": "d92b121d-dcb9-43e8-9ee3-3ececb583e7e", "metadata": { "tags": [] @@ -1889,7 +1889,7 @@ }, { "cell_type": "code", - "execution_count": 667, + "execution_count": 177, "id": "14ea27ce-c453-4cb0-b309-dbecd732957e", "metadata": { "tags": [] @@ -1906,24 +1906,24 @@ "name": "stdout", "output_type": "stream", "text": [ - "8/8 - 0s - loss: 1.0173 - accuracy: 0.4728 - val_loss: 0.9260 - val_accuracy: 0.6078 - 230ms/epoch - 29ms/step\n", - "2/2 [==============================] - 0s 1ms/step\n", - "Validation accuracy: 0.6078431372549019\n" + "8/8 - 0s - loss: 0.9615 - accuracy: 0.5816 - val_loss: 0.9899 - val_accuracy: 0.5490 - 218ms/epoch - 27ms/step\n", + "2/2 [==============================] - 0s 1ms/step\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "INFO:tensorflow:Assets written to: /var/folders/4c/v1q3hy1x4mb5w0wpc72zl3_w0000gp/T/tmpv4apdp15/model/001/assets\n" + "INFO:tensorflow:Assets written to: /var/folders/4c/v1q3hy1x4mb5w0wpc72zl3_w0000gp/T/tmpz_4e3xsq/model/001/assets\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ + "Validation accuracy: 0.5490196078431373\n", "\u001b[32m.\u001b[0m\n", - "\u001b[32m\u001b[32m\u001b[1m1 passed\u001b[0m\u001b[32m in 0.53s\u001b[0m\u001b[0m\n" + "\u001b[32m\u001b[32m\u001b[1m1 passed\u001b[0m\u001b[32m in 0.49s\u001b[0m\u001b[0m\n" ] } ], @@ -1983,7 +1983,12 @@ "\n", "We can now create a [Training Step](https://docs.aws.amazon.com/sagemaker/latest/dg/build-and-manage-steps.html#step-type-training) that we can add to the pipeline. This Training Step will create a SageMaker Training Job in the background, run the training script, and upload the output to S3. Check the [TrainingStep](https://sagemaker.readthedocs.io/en/stable/workflows/pipelines/sagemaker.workflow.pipelines.html#sagemaker.workflow.steps.TrainingStep) SageMaker's SDK documentation for more information.\n", "\n", - "SageMaker uses the concept of an [Estimator](https://sagemaker.readthedocs.io/en/stable/api/training/estimators.html) to handle end-to-end training and deployment tasks. For this example, we will use the built-in [TensorFlow Estimator](https://sagemaker.readthedocs.io/en/stable/frameworks/tensorflow/sagemaker.tensorflow.html#tensorflow-estimator) to run the training script we wrote before. The [Docker Registry Paths and Example Code](https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-algo-docker-registry-paths.html) page contains information about the available framework versions for each region. Here, you can also check the available SageMaker [Deep Learning Container images](https://github.com/aws/deep-learning-containers/blob/master/available_images.md).\n", + "SageMaker uses the concept of an [Estimator](https://sagemaker.readthedocs.io/en/stable/api/training/estimators.html) to handle end-to-end training and deployment tasks. For this example, we will use the built-in [TensorFlow Estimator](https://sagemaker.readthedocs.io/en/stable/frameworks/tensorflow/sagemaker.tensorflow.html#tensorflow-estimator) to run the training script we wrote before. \n", + "\n", + "\n", + "SageMaker manages the infrastructure of a Training Job. It provisions resources for the duration of the job, and cleans up when it completes. 
The Training Container image that SageMaker uses to run a Training Job can either be a SageMaker built-in image or a custom image.\n", + "\n", + "The [Docker Registry Paths and Example Code](https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-algo-docker-registry-paths.html) page contains information about the available framework versions for each region. Here, you can also check the available SageMaker [Deep Learning Container images](https://github.com/aws/deep-learning-containers/blob/master/available_images.md).\n", "\n", "Notice the list of hyperparameters defined below. SageMaker will pass these hyperparameters as arguments to the entry point of the training script.\n", "\n", @@ -1992,7 +1997,7 @@ }, { "cell_type": "code", - "execution_count": 668, + "execution_count": 178, "id": "90fe82ae-6a2c-4461-bc83-bb52d8871e3b", "metadata": { "tags": [] @@ -2042,17 +2047,30 @@ "Here, we are using two input channels, `train` and `validation`. SageMaker will automatically create an environment variable corresponding to each of these channels following the format `SM_CHANNEL_[channel_name]`:\n", "\n", "- `SM_CHANNEL_TRAIN`: This environment variable will contain the path to the data in the `train` channel\n", - "- `SM_CHANNEL_VALIDATION`: This environment variable will contain the path to the data in the `validation` channel\n" + "- `SM_CHANNEL_VALIDATION`: This environment variable will contain the path to the data in the `validation` channel\n", + "\n", + "Here's a high-level overview of this step and the Training Job that SageMaker creates behind the scenes:\n", + "\n", + " \"High-level" ] }, { "cell_type": "code", - "execution_count": 738, + "execution_count": 179, "id": "99e4850c-83d6-4f4e-a813-d5a3f4bb7486", "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/svpino/dev/ml.school/.venv/lib/python3.9/site-packages/sagemaker/workflow/pipeline_context.py:297: UserWarning: Running within a PipelineSession, there will be No Wait, No Logs, and No Job being started.\n", + " warnings.warn(\n" + ] + } + ], "source": [ "# | code: true\n", "# | output: false\n", @@ -2065,13 +2083,13 @@ " step_args=estimator.fit(\n", " inputs={\n", " \"train\": TrainingInput(\n", - " s3_data=split_and_transform_data_step.properties.ProcessingOutputConfig.Outputs[\n", + " s3_data=preprocessing_step.properties.ProcessingOutputConfig.Outputs[\n", " \"train\"\n", " ].S3Output.S3Uri,\n", " content_type=\"text/csv\",\n", " ),\n", " \"validation\": TrainingInput(\n", - " s3_data=split_and_transform_data_step.properties.ProcessingOutputConfig.Outputs[\n", + " s3_data=preprocessing_step.properties.ProcessingOutputConfig.Outputs[\n", " \"validation\"\n", " ].S3Output.S3Uri,\n", " content_type=\"text/csv\",\n", @@ -2097,17 +2115,17 @@ "id": "90eb5075", "metadata": {}, "source": [ - "Since we could use the Training of the Tuning Step to create the model, we'll define this constant to indicate which approach we want to run.\n" + "Since we could use the Training of the Tuning Step to create the model, we'll define this constant to indicate which approach we want to run. 
Notice that the Tuning Step is not supported in Local Mode.\n" ] }, { "cell_type": "code", - "execution_count": 670, + "execution_count": 180, "id": "f367d0e3", "metadata": {}, "outputs": [], "source": [ - "USE_TUNING_STEP = False" + "USE_TUNING_STEP = True and not LOCAL_MODE" ] }, { @@ -2120,7 +2138,7 @@ "Here is the configuration that we'll use to find the best model:\n", "\n", "1. `objective_metric_name`: This is the name of the metric the tuner will use to determine the best model.\n", - "2. `objective_type`: This is the objective of the tuner. Should it \"Minimize\" the metric or \"Maximize\" it? In this example, since we are using the validation accuracy of the model, we want the objective to be \"Maximize.\" If we were using the loss of the model, we would set the objective to \"Minimize.\"\n", + "2. `objective_type`: This is the objective of the tuner. It specifies whether it should minimize the metric or maximize it. In this example, since we are using the validation accuracy of the model, we want the objective to be \"Maximize.\" If we were using the loss of the model, we would set the objective to \"Minimize.\"\n", "3. `metric_definitions`: Defines how the tuner will determine the metric's value by looking at the output logs of the training process.\n", "\n", "The tuner expects the list of the hyperparameters you want to explore. You can use subclasses of the [Parameter](https://sagemaker.readthedocs.io/en/stable/api/training/parameter.html#sagemaker.parameter.ParameterRange) class to specify different types of hyperparameters. This example explores different values for the `epochs` hyperparameter.\n", @@ -2133,7 +2151,7 @@ }, { "cell_type": "code", - "execution_count": 671, + "execution_count": 181, "id": "c8c82750", "metadata": {}, "outputs": [], @@ -2159,12 +2177,16 @@ "id": "28c2abc2", "metadata": {}, "source": [ - "We can now create the Tuning Step using the tuner we configured before:\n" + "We can now create the Tuning Step using the tuner we configured before.\n", + "\n", + "Here's a high-level overview of this step and the Hyperparameter Tuning Job that SageMaker creates behind the scenes:\n", + "\n", + " \"High-level" ] }, { "cell_type": "code", - "execution_count": 672, + "execution_count": 182, "id": "038ff2e5-ed28-445b-bc03-4e996ec2286f", "metadata": { "tags": [] @@ -2178,13 +2200,13 @@ " step_args=tuner.fit(\n", " inputs={\n", " \"train\": TrainingInput(\n", - " s3_data=split_and_transform_data_step.properties.ProcessingOutputConfig.Outputs[\n", + " s3_data=preprocessing_step.properties.ProcessingOutputConfig.Outputs[\n", " \"train\"\n", " ].S3Output.S3Uri,\n", " content_type=\"text/csv\",\n", " ),\n", " \"validation\": TrainingInput(\n", - " s3_data=split_and_transform_data_step.properties.ProcessingOutputConfig.Outputs[\n", + " s3_data=preprocessing_step.properties.ProcessingOutputConfig.Outputs[\n", " \"validation\"\n", " ].S3Output.S3Uri,\n", " content_type=\"text/csv\",\n", @@ -2207,54 +2229,19 @@ }, { "cell_type": "code", - "execution_count": 673, + "execution_count": 183, "id": "9799ab39-fcae-41f4-a68b-85ab71b3ba9a", "metadata": { "tags": [] }, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Using provided s3_resource\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:sagemaker.image_uris:image_uri is not 
presented, retrieving image_uri based on instance_type, framework etc.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Using provided s3_resource\n" - ] - }, { "data": { "text/plain": [ - "{'PipelineArn': 'arn:aws:sagemaker:us-east-1:325223348818:pipeline/session2-pipeline',\n", - " 'ResponseMetadata': {'RequestId': 'e99208aa-4074-41aa-a12b-90af6da62e3f',\n", - " 'HTTPStatusCode': 200,\n", - " 'HTTPHeaders': {'x-amzn-requestid': 'e99208aa-4074-41aa-a12b-90af6da62e3f',\n", - " 'content-type': 'application/x-amz-json-1.1',\n", - " 'content-length': '85',\n", - " 'date': 'Fri, 27 Oct 2023 14:38:38 GMT'},\n", - " 'RetryAttempts': 0}}" + "{'PipelineArn': 'session2-pipeline'}" ] }, - "execution_count": 673, + "execution_count": 183, "metadata": {}, "output_type": "execute_result" } @@ -2267,7 +2254,7 @@ " name=\"session2-pipeline\",\n", " parameters=[dataset_location],\n", " steps=[\n", - " split_and_transform_data_step,\n", + " preprocessing_step,\n", " tune_model_step if USE_TUNING_STEP else train_model_step,\n", " ],\n", " pipeline_definition_config=pipeline_definition_config,\n", @@ -2299,13 +2286,12 @@ }, { "cell_type": "code", - "execution_count": 674, + "execution_count": null, "id": "274a9b1e", "metadata": {}, "outputs": [], "source": [ "%%script false --no-raise-error\n", - "\n", "#| eval: false\n", "#| code: true\n", "#| output: false\n", @@ -2320,15 +2306,15 @@ "source": [ "### Assignments\n", "\n", - "- Assignment 2.1 The training script trains the model using a hard-coded learning rate value. Modify the code to accept the learning rate as a parameter we can control from outside the script.\n", + "- Assignment 2.1 The training script trains the model using a hard-coded learning rate value. Modify the script to accept the learning rate as a parameter we can pass from the pipeline.\n", "\n", - "- Assignment 2.2 We currently define the number of epochs to train the model as a constant that we pass to the Estimator using the list of hyperparameters. Replace this constant with a new Pipeline Parameter named `training_epochs`. You'll need to specify this new parameter when creating the Pipeline.\n", + "- Assignment 2.2 We currently define the number of epochs to train the model as a constant that we pass to the Estimator using the list of hyperparameters. Replace this constant with a new Pipeline Parameter named `training_epochs`.\n", "\n", - "- Assignment 2.3 The current tuning process aims to find the model with the highest validation accuracy. Modify the code to focus on the model with the lowest training loss.\n", + "- Assignment 2.3 The current tuning process aims to find the model with the highest validation accuracy. Modify the code so the best model is the one with the lowest training loss.\n", "\n", - "- Assignment 2.4 We used an instance of [`SKLearnProcessor`](https://sagemaker.readthedocs.io/en/stable/frameworks/sklearn/sagemaker.sklearn.html#scikit-learn-processor) to run the script that transforms and splits the data, but there's no way to add additional dependencies to the processing container. Modify the code to use an instance of [`FrameworkProcessor`](https://sagemaker.readthedocs.io/en/stable/api/training/processing.html#sagemaker.processing.FrameworkProcessor) instead. This class will allow you to specify a directory containing a `requirements.txt` file containing a list of dependencies. 
SageMaker will install these libraries in the processing container before triggering the processing job.\n", + "- Assignment 2.4 We used an instance of [`SKLearnProcessor`](https://sagemaker.readthedocs.io/en/stable/frameworks/sklearn/sagemaker.sklearn.html#scikit-learn-processor) to run the script that transforms and splits the data. While this processor is convenient, it doesn't allow us to install additional libraries in the container. Modify the code to use an instance of [`FrameworkProcessor`](https://sagemaker.readthedocs.io/en/stable/api/training/processing.html#sagemaker.processing.FrameworkProcessor) instead `SKLearnProcessor`. This class will allow us to specify a directory containing a `requirements.txt` file listing any additional dependencies. SageMaker will install these libraries in the processing container before triggering the processing job.\n", "\n", - "- Assignment 2.5 We want to execute the pipeline whenever the dataset changes. We can accomplish this by using [Amazon EventBridge](https://docs.aws.amazon.com/eventbridge/latest/userguide/eb-what-is.html). Configure an event to automatically start the pipeline when a new file is added to the S3 bucket where we store our dataset. Check [Amazon EventBridge Integration](https://docs.aws.amazon.com/sagemaker/latest/dg/pipeline-eventbridge.html) for an implementation tutorial.\n" + "- Assignment 2.5 We configured the Training Step to log information from the Training Job as part of the SageMaker Experiment associated to the pipeline. As part of this assignment, check [Manage Machine Learning with Amazon SageMaker Experiments](https://docs.aws.amazon.com/sagemaker/latest/dg/experiments.html) and explore the generated experiments in the SageMaker Studio Console so you can familiarize with the information SageMaker logs during training." 
] }, { @@ -2712,7 +2698,7 @@ " # the first step of the pipeline when we split and\n", " # transformed the data.\n", " ProcessingInput(\n", - " source=split_and_transform_data_step.properties.ProcessingOutputConfig.Outputs[\n", + " source=preprocessing_step.properties.ProcessingOutputConfig.Outputs[\n", " \"test\"\n", " ].S3Output.S3Uri,\n", " destination=\"/opt/ml/processing/test\",\n", @@ -3070,7 +3056,7 @@ " name=\"session3-pipeline\",\n", " parameters=[dataset_location, accuracy_threshold],\n", " steps=[\n", - " split_and_transform_data_step,\n", + " preprocessing_step,\n", " tune_model_step if USE_TUNING_STEP else train_model_step,\n", " evaluate_model_step,\n", " condition_step,\n", @@ -3918,7 +3904,7 @@ "transformation_pipeline_model = Join(\n", " on=\"/\",\n", " values=[\n", - " split_and_transform_data_step.properties.ProcessingOutputConfig.Outputs[\n", + " preprocessing_step.properties.ProcessingOutputConfig.Outputs[\n", " \"model\"\n", " ].S3Output.S3Uri,\n", " \"model.tar.gz\",\n", @@ -4196,7 +4182,7 @@ " name=\"session4-pipeline\",\n", " parameters=[dataset_location, accuracy_threshold],\n", " steps=[\n", - " split_and_transform_data_step,\n", + " preprocessing_step,\n", " tune_model_step if USE_TUNING_STEP else train_model_step,\n", " evaluate_model_step,\n", " condition_step,\n", @@ -4845,7 +4831,7 @@ " role=role,\n", " ),\n", " quality_check_config=DataQualityCheckConfig(\n", - " baseline_dataset=split_and_transform_data_step.properties.ProcessingOutputConfig.Outputs[\n", + " baseline_dataset=preprocessing_step.properties.ProcessingOutputConfig.Outputs[\n", " \"train-baseline\"\n", " ].S3Output.S3Uri,\n", " dataset_format=DatasetFormat.csv(header=True, output_columns_position=\"START\"),\n", @@ -4960,7 +4946,7 @@ " step_args=transformer.transform(\n", " # We will use the baseline set we generated when we split the data.\n", " # This set corresponds to the test split before the transformation step.\n", - " data=split_and_transform_data_step.properties.ProcessingOutputConfig.Outputs[\n", + " data=preprocessing_step.properties.ProcessingOutputConfig.Outputs[\n", " \"test-baseline\"\n", " ].S3Output.S3Uri,\n", "\n", @@ -5268,7 +5254,7 @@ " name=\"session5-pipeline\",\n", " parameters=[dataset_location, accuracy_threshold],\n", " steps=[\n", - " split_and_transform_data_step,\n", + " preprocessing_step,\n", " tune_model_step if USE_TUNING_STEP else train_model_step,\n", " evaluate_model_step,\n", " data_quality_baseline_step,\n", diff --git a/program/images/preprocess-data.png b/program/images/preprocess-data.png new file mode 100644 index 0000000..a72b649 Binary files /dev/null and b/program/images/preprocess-data.png differ diff --git a/program/images/train-model.png b/program/images/train-model.png new file mode 100644 index 0000000..b888309 Binary files /dev/null and b/program/images/train-model.png differ diff --git a/program/images/tune-model.png b/program/images/tune-model.png new file mode 100644 index 0000000..488a40e Binary files /dev/null and b/program/images/tune-model.png differ
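For reference, here is a minimal sketch of how the `dataset_location` parameter discussed in the diff can be overridden for a single run (relevant to Assignment 1.3). It assumes the `pipeline` object for `session1-pipeline` has already been upserted, that the parameter was registered under the name `dataset_location`, and that the S3 URI below is a placeholder for wherever the alternate dataset is stored:

```python
# Sketch: override a pipeline parameter for one execution only.
# Assumes `pipeline` is the upserted session1-pipeline object and the
# ParameterString was created with name="dataset_location". The S3 URI
# is a placeholder, not a real location.
execution = pipeline.start(
    parameters={"dataset_location": "s3://example-bucket/penguins/generated/data.csv"}
)

# The override applies to this execution only; later runs fall back to
# the parameter's default value.
execution.describe()
```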
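A sketch of the distributed Processing Job described in Assignment 1.4, under the assumption that the ten input files live under a placeholder S3 prefix and that a hypothetical `capitalize.py` script performs the transformation; `role` and `config["session"]` are assumed to be the ones created during the notebook's setup:

```python
# Sketch: run a Processing Job across two instances, sharding the input
# files between them with ShardedByS3Key.
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.sklearn.processing import SKLearnProcessor

sharded_processor = SKLearnProcessor(
    base_job_name="capitalize-island",
    framework_version="1.2-1",
    instance_type="ml.m5.xlarge",
    instance_count=2,  # two instances working in parallel
    role=role,
    sagemaker_session=config["session"],
)

sharded_processor.run(
    code=f"{CODE_FOLDER}/capitalize.py",  # hypothetical script that capitalizes the `island` column
    inputs=[
        ProcessingInput(
            source="s3://example-bucket/penguins/sharded/",  # placeholder prefix holding the 10 files
            destination="/opt/ml/processing/input",
            # Each instance receives a disjoint subset of the S3 objects
            # instead of a full copy of the dataset.
            s3_data_distribution_type="ShardedByS3Key",
        )
    ],
    outputs=[
        ProcessingOutput(
            output_name="capitalized",
            source="/opt/ml/processing/output",
        )
    ],
)
```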
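A sketch of the `FrameworkProcessor` swap suggested in Assignment 2.4. It assumes a `requirements.txt` listing the extra dependencies sits in the same folder as `preprocessor.py`, and it mirrors the `SKLearnProcessor` configuration used earlier in the notebook:

```python
# Sketch: replace SKLearnProcessor with FrameworkProcessor so the
# processing container installs extra dependencies from requirements.txt.
from sagemaker.processing import FrameworkProcessor
from sagemaker.sklearn.estimator import SKLearn

framework_processor = FrameworkProcessor(
    estimator_cls=SKLearn,
    framework_version="1.2-1",
    base_job_name="preprocess-data",
    instance_type="ml.m5.xlarge",
    instance_count=1,
    role=role,
    sagemaker_session=config["session"],
)

# SageMaker uploads the whole source_dir and installs requirements.txt
# (if present) before running the entry point.
framework_processor.run(
    code="preprocessor.py",
    source_dir=str(CODE_FOLDER),  # assumes requirements.txt lives next to preprocessor.py
)
```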