From d04300a6af671788e478c5d4dfdb93763a729615 Mon Sep 17 00:00:00 2001 From: Raymond Lau Date: Thu, 14 Mar 2024 17:53:19 +0000 Subject: [PATCH 01/12] Add an example of object-segmentation (ClipSeg) using graph mode. --- .../object-segementation/ClipSeg/README.md | 32 ++++++ .../ClipSeg/run_example.py | 108 ++++++++++++++++++ 2 files changed, 140 insertions(+) create mode 100644 examples/object-segementation/ClipSeg/README.md create mode 100644 examples/object-segementation/ClipSeg/run_example.py diff --git a/examples/object-segementation/ClipSeg/README.md b/examples/object-segementation/ClipSeg/README.md new file mode 100644 index 000000000..f476a5a4e --- /dev/null +++ b/examples/object-segementation/ClipSeg/README.md @@ -0,0 +1,32 @@ + + +# Owl-ViT Examples + +This directory contains an example script that demonstrates using ClipSeg with graph mode. + +## Single-HPU inference + +```bash +python3 run_example.py \ + --model_name_or_path "CIDAS/clipseg-rd64-refined" \ + --image_path "http://images.cocodataset.org/val2017/000000039769.jpg" \ + --prompt "a cat, a remote, a blanket" \ + --warmup 3 \ + --n_iterations 20 \ + --use_hpu_graphs \ + --bf16 \ + --print_result +``` +Models that have been validated: + - [clipseg-rd64-refined ](https://huggingface.co/CIDAS/clipseg-rd64-refined) \ No newline at end of file diff --git a/examples/object-segementation/ClipSeg/run_example.py b/examples/object-segementation/ClipSeg/run_example.py new file mode 100644 index 000000000..d4aa3efae --- /dev/null +++ b/examples/object-segementation/ClipSeg/run_example.py @@ -0,0 +1,108 @@ +#!/usr/bin/env python +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and + +# Copied from https://huggingface.co/docs/transformers/main/en/model_doc/clipseg + +from transformers import AutoProcessor, CLIPSegForImageSegmentation +from PIL import Image +import requests +import torch +import habana_frameworks.torch as ht +import habana_frameworks.torch.core as htcore +import time +import argparse + +from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument( + "--model_name_or_path", + default="CIDAS/clipseg-rd64-refined", + type=str, + help="Path of the pre-trained model", + ) + parser.add_argument( + "--image_path", + default="http://images.cocodataset.org/val2017/000000039769.jpg", + type=str, + help='Path of the input image. Should be a single string (eg: --image_path "URL")', + ) + parser.add_argument( + "--prompt", + default="a cat, a remote, a blanket", + type=str, + help='Prompt for classification. It should be a string seperated by comma. (eg: --prompt "a photo of a cat, a photo of a dog")', + ) + parser.add_argument( + "--use_hpu_graphs", + action="store_true", + help="Whether to use HPU graphs or not. 
Using HPU graphs should give better latencies.", + ) + parser.add_argument( + "--bf16", + action="store_true", + help="Whether to use bf16 precision for classification.", + ) + parser.add_argument( + "--print_result", + action="store_true", + help="Whether to print the classification results.", + ) + parser.add_argument("--warmup", type=int, default=3, help="Number of warmup iterations for benchmarking.") + parser.add_argument("--n_iterations", type=int, default=5, help="Number of inference iterations for benchmarking.") + + args = parser.parse_args() + + adapt_transformers_to_gaudi() + + processor = AutoProcessor.from_pretrained(args.model_name_or_path) + model = CLIPSegForImageSegmentation.from_pretrained(args.model_name_or_path) + + image = Image.open(requests.get(args.image_path, stream=True).raw) + texts = [] + for text in args.prompt.split(','): + texts.append(text) + + if args.use_hpu_graphs: + model = ht.hpu.wrap_in_hpu_graph(model) + + autocast = torch.autocast(device_type="hpu", dtype=torch.bfloat16, enabled=args.bf16) + model.to("hpu") + + with torch.no_grad(), autocast: + for i in range(args.warmup): + inputs = processor(text=texts, images=[image]* len(texts), padding=True, return_tensors="pt").to("hpu") + outputs = model(**inputs) + torch.hpu.synchronize() + + total_model_time = 0 + for i in range(args.n_iterations): + inputs = processor(text=texts, images=[image]* len(texts), padding=True, return_tensors="pt").to("hpu") + model_start_time = time.time() + outputs = model(**inputs) + torch.hpu.synchronize() + model_end_time = time.time() + total_model_time = total_model_time + (model_end_time - model_start_time) + + if args.print_result: + logits = outputs.logits + print(logits.shape) + print("Logits: " + str(logits)) + + print("n_iterations: " + str(args.n_iterations)) + print("Total latency (ms): " + str(total_model_time*1000)) + print("Average latency (ms): " + str(total_model_time*1000/args.n_iterations)) \ No newline at end of file From f1ccb13dd2a12c5ae793151209fe1e370f6da147 Mon Sep 17 00:00:00 2001 From: Raymond Lau Date: Thu, 14 Mar 2024 18:44:43 +0000 Subject: [PATCH 02/12] Updated readme and added codes for generating segmented images. 
--- examples/object-segementation/ClipSeg/README.md | 2 +- examples/object-segementation/ClipSeg/run_example.py | 12 +++++++++--- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/examples/object-segementation/ClipSeg/README.md b/examples/object-segementation/ClipSeg/README.md index f476a5a4e..d8c76272a 100644 --- a/examples/object-segementation/ClipSeg/README.md +++ b/examples/object-segementation/ClipSeg/README.md @@ -21,7 +21,7 @@ This directory contains an example script that demonstrates using ClipSeg with g python3 run_example.py \ --model_name_or_path "CIDAS/clipseg-rd64-refined" \ --image_path "http://images.cocodataset.org/val2017/000000039769.jpg" \ - --prompt "a cat, a remote, a blanket" \ + --prompt "cat, remote, blanket" \ --warmup 3 \ --n_iterations 20 \ --use_hpu_graphs \ diff --git a/examples/object-segementation/ClipSeg/run_example.py b/examples/object-segementation/ClipSeg/run_example.py index d4aa3efae..a10b8fa9d 100644 --- a/examples/object-segementation/ClipSeg/run_example.py +++ b/examples/object-segementation/ClipSeg/run_example.py @@ -23,6 +23,7 @@ import habana_frameworks.torch.core as htcore import time import argparse +from torchvision.utils import save_image from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi @@ -99,9 +100,14 @@ total_model_time = total_model_time + (model_end_time - model_start_time) if args.print_result: - logits = outputs.logits - print(logits.shape) - print("Logits: " + str(logits)) + if (i == 0): # generate/output once only + logits = outputs.logits + for j in range(logits.shape[0]): + threshold = 0.5 + segmented_image = ((torch.sigmoid(logits[j]) > threshold)*255).unsqueeze(0) + segmented_image = segmented_image.to(torch.float32) + save_image(segmented_image, 'segmented' + texts[j] + '.png') + print('Segmented images are generated.') print("n_iterations: " + str(args.n_iterations)) print("Total latency (ms): " + str(total_model_time*1000)) From bc093acede956ed4b1476a637fbe5ebec1720743 Mon Sep 17 00:00:00 2001 From: Raymond Lau Date: Mon, 18 Mar 2024 18:08:06 +0000 Subject: [PATCH 03/12] Added an example code of SAM model. --- .../SegmentAnythingModel/README.md | 33 ++++++ .../SegmentAnythingModel/run_example.py | 109 ++++++++++++++++++ 2 files changed, 142 insertions(+) create mode 100644 examples/object-segementation/SegmentAnythingModel/README.md create mode 100644 examples/object-segementation/SegmentAnythingModel/run_example.py diff --git a/examples/object-segementation/SegmentAnythingModel/README.md b/examples/object-segementation/SegmentAnythingModel/README.md new file mode 100644 index 000000000..8a0f24300 --- /dev/null +++ b/examples/object-segementation/SegmentAnythingModel/README.md @@ -0,0 +1,33 @@ + + +# Owl-ViT Examples + +This directory contains an example script that demonstrates using SAM with graph mode. 
+ +## Single-HPU inference + +```bash +python3 run_example.py \ + --model_name_or_path "facebook/sam-vit-huge" \ + --image_path "https://huggingface.co/ybelkada/segment-anything/resolve/main/assets/car.png" \ + --point_prompt "450,600" \ + --warmup 3 \ + --n_iterations 20 \ + --use_hpu_graphs \ + --bf16 \ + --print_result +``` +Models that have been validated: + - [facebook/sam-vit-base](https://huggingface.co/facebook/sam-vit-base) + - [facebook/sam-vit-huge](https://huggingface.co/facebook/sam-vit-huge) \ No newline at end of file diff --git a/examples/object-segementation/SegmentAnythingModel/run_example.py b/examples/object-segementation/SegmentAnythingModel/run_example.py new file mode 100644 index 000000000..592f7f429 --- /dev/null +++ b/examples/object-segementation/SegmentAnythingModel/run_example.py @@ -0,0 +1,109 @@ +#!/usr/bin/env python +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and + +# Copied from https://huggingface.co/facebook/sam-vit-base + +from transformers import SamModel, SamProcessor +from PIL import Image +import requests +import torch +import habana_frameworks.torch as ht +import habana_frameworks.torch.core as htcore +import time +import argparse + +from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument( + "--model_name_or_path", + default="facebook/sam-vit-huge", + type=str, + help="Path of the pre-trained model", + ) + parser.add_argument( + "--image_path", + default="https://huggingface.co/ybelkada/segment-anything/resolve/main/assets/car.png", + type=str, + help='Path of the input image. Should be a single string (eg: --image_path "URL")', + ) + parser.add_argument( + "--point_prompt", + default="450, 600", + type=str, + help='Prompt for segmentation. It should be a string seperated by comma. (eg: --point_prompt "450, 600")', + ) + parser.add_argument( + "--use_hpu_graphs", + action="store_true", + help="Whether to use HPU graphs or not. 
Using HPU graphs should give better latencies.", + ) + parser.add_argument( + "--bf16", + action="store_true", + help="Whether to use bf16 precision for classification.", + ) + parser.add_argument( + "--print_result", + action="store_true", + help="Whether to save the segmentation result.", + ) + parser.add_argument("--warmup", type=int, default=3, help="Number of warmup iterations for benchmarking.") + parser.add_argument("--n_iterations", type=int, default=5, help="Number of inference iterations for benchmarking.") + + args = parser.parse_args() + + adapt_transformers_to_gaudi() + + processor = SamProcessor.from_pretrained(args.model_name_or_path) + model = SamModel.from_pretrained(args.model_name_or_path) + + image = Image.open(requests.get(args.image_path, stream=True).raw).convert("RGB") + points = [] + for text in args.point_prompt.split(','): + points.append(int(text)) + points = [[points]] + + if args.use_hpu_graphs: + model = ht.hpu.wrap_in_hpu_graph(model) + + autocast = torch.autocast(device_type="hpu", dtype=torch.bfloat16, enabled=args.bf16) + model.to("hpu") + + with torch.no_grad(), autocast: + for i in range(args.warmup): + inputs = processor(image, input_points=points, return_tensors="pt").to("hpu") + outputs = model(**inputs) + torch.hpu.synchronize() + + total_model_time = 0 + for i in range(args.n_iterations): + inputs = processor(image, input_points=points, return_tensors="pt").to("hpu") + model_start_time = time.time() + outputs = model(**inputs) + torch.hpu.synchronize() + model_end_time = time.time() + total_model_time = total_model_time + (model_end_time - model_start_time) + + if args.print_result: + if (i == 0): # generate/output once only + iou = outputs.iou_scores + print("iou score: " + str(iou)) + + print("n_iterations: " + str(args.n_iterations)) + print("Total latency (ms): " + str(total_model_time*1000)) + print("Average latency (ms): " + str(total_model_time*1000/args.n_iterations)) \ No newline at end of file From 23489864642aad77e20408a94b05c6321249b455 Mon Sep 17 00:00:00 2001 From: Raymond Lau Date: Mon, 25 Mar 2024 20:41:46 +0000 Subject: [PATCH 04/12] Add this example to README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index fabff9e26..5260e0625 100644 --- a/README.md +++ b/README.md @@ -214,7 +214,7 @@ The following model architectures, tasks and device distributions have been vali | OWLViT | |
<li>Single card</li> | <li>[zero shot object detection](https://github.com/huggingface/optimum-habana/tree/main/examples/zero-shot-object-detection)</li> |
 | ClipSeg | | <li>Single card</li> | <li>[object segmentation](https://github.com/huggingface/optimum-habana/tree/main/examples/object-segementation)</li> |
 | Llava / Llava-next | | <li>Single card</li> | <li>[image to text](https://github.com/huggingface/optimum-habana/tree/main/examples/image-to-text)</li> |
-
+| Segment Anything Model | | <li>Single card</li> | <li>[object segmentation](https://github.com/huggingface/optimum-habana/tree/main/examples/object-segementation/SegmentAnythingModel)</li> |
  • | - Diffusers: From d7816c22220a5559a9f0dae07448797b0b807aed Mon Sep 17 00:00:00 2001 From: Raymond Lau Date: Mon, 25 Mar 2024 20:49:38 +0000 Subject: [PATCH 05/12] Expose the example in index.mdx. --- docs/source/index.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/index.mdx b/docs/source/index.mdx index b33cfd062..5afc2b9bf 100644 --- a/docs/source/index.mdx +++ b/docs/source/index.mdx @@ -72,7 +72,7 @@ In the tables below, ✅ means single-card, multi-card and DeepSpeed have all be | OWLViT | |
<li>Single card</li> | <li>[zero shot object detection](https://github.com/huggingface/optimum-habana/tree/main/examples/zero-shot-object-detection)</li> |
 | ClipSeg | | <li>Single card</li> | <li>[object segmentation](https://github.com/huggingface/optimum-habana/tree/main/examples/object-segementation)</li> |
 | Llava / Llava-next | | <li>Single card</li> | <li>[image to text](https://github.com/huggingface/optimum-habana/tree/main/examples/image-to-text)</li> |
-
+| SAM | | <li>Single card</li> | <li>[object segmentation](https://github.com/huggingface/optimum-habana/tree/main/examples/object-segementation/SegmentAnythingModel)</li> |
  • | - Diffusers From 8a27e86586486d106d8638c251c7065cc7b6aa0b Mon Sep 17 00:00:00 2001 From: Raymond Lau Date: Mon, 25 Mar 2024 20:53:07 +0000 Subject: [PATCH 06/12] Fixed the typo of the model name. --- examples/object-segementation/SegmentAnythingModel/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/object-segementation/SegmentAnythingModel/README.md b/examples/object-segementation/SegmentAnythingModel/README.md index 8a0f24300..73a7129b2 100644 --- a/examples/object-segementation/SegmentAnythingModel/README.md +++ b/examples/object-segementation/SegmentAnythingModel/README.md @@ -11,7 +11,7 @@ See the License for the specific language governing permissions and limitations under the License. --> -# Owl-ViT Examples +# Segment Anything Model Examples This directory contains an example script that demonstrates using SAM with graph mode. From a16985b0e23211b7d2aef31241b0d934db3d97b2 Mon Sep 17 00:00:00 2001 From: Raymond Lau Date: Mon, 25 Mar 2024 21:03:29 +0000 Subject: [PATCH 07/12] Remove unrelated files. --- .../object-segementation/ClipSeg/README.md | 32 ----- .../ClipSeg/run_example.py | 114 ------------------ 2 files changed, 146 deletions(-) delete mode 100644 examples/object-segementation/ClipSeg/README.md delete mode 100644 examples/object-segementation/ClipSeg/run_example.py diff --git a/examples/object-segementation/ClipSeg/README.md b/examples/object-segementation/ClipSeg/README.md deleted file mode 100644 index d8c76272a..000000000 --- a/examples/object-segementation/ClipSeg/README.md +++ /dev/null @@ -1,32 +0,0 @@ - - -# Owl-ViT Examples - -This directory contains an example script that demonstrates using ClipSeg with graph mode. - -## Single-HPU inference - -```bash -python3 run_example.py \ - --model_name_or_path "CIDAS/clipseg-rd64-refined" \ - --image_path "http://images.cocodataset.org/val2017/000000039769.jpg" \ - --prompt "cat, remote, blanket" \ - --warmup 3 \ - --n_iterations 20 \ - --use_hpu_graphs \ - --bf16 \ - --print_result -``` -Models that have been validated: - - [clipseg-rd64-refined ](https://huggingface.co/CIDAS/clipseg-rd64-refined) \ No newline at end of file diff --git a/examples/object-segementation/ClipSeg/run_example.py b/examples/object-segementation/ClipSeg/run_example.py deleted file mode 100644 index a10b8fa9d..000000000 --- a/examples/object-segementation/ClipSeg/run_example.py +++ /dev/null @@ -1,114 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -# Copyright 2024 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and - -# Copied from https://huggingface.co/docs/transformers/main/en/model_doc/clipseg - -from transformers import AutoProcessor, CLIPSegForImageSegmentation -from PIL import Image -import requests -import torch -import habana_frameworks.torch as ht -import habana_frameworks.torch.core as htcore -import time -import argparse -from torchvision.utils import save_image - -from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - - parser.add_argument( - "--model_name_or_path", - default="CIDAS/clipseg-rd64-refined", - type=str, - help="Path of the pre-trained model", - ) - parser.add_argument( - "--image_path", - default="http://images.cocodataset.org/val2017/000000039769.jpg", - type=str, - help='Path of the input image. Should be a single string (eg: --image_path "URL")', - ) - parser.add_argument( - "--prompt", - default="a cat, a remote, a blanket", - type=str, - help='Prompt for classification. It should be a string seperated by comma. (eg: --prompt "a photo of a cat, a photo of a dog")', - ) - parser.add_argument( - "--use_hpu_graphs", - action="store_true", - help="Whether to use HPU graphs or not. Using HPU graphs should give better latencies.", - ) - parser.add_argument( - "--bf16", - action="store_true", - help="Whether to use bf16 precision for classification.", - ) - parser.add_argument( - "--print_result", - action="store_true", - help="Whether to print the classification results.", - ) - parser.add_argument("--warmup", type=int, default=3, help="Number of warmup iterations for benchmarking.") - parser.add_argument("--n_iterations", type=int, default=5, help="Number of inference iterations for benchmarking.") - - args = parser.parse_args() - - adapt_transformers_to_gaudi() - - processor = AutoProcessor.from_pretrained(args.model_name_or_path) - model = CLIPSegForImageSegmentation.from_pretrained(args.model_name_or_path) - - image = Image.open(requests.get(args.image_path, stream=True).raw) - texts = [] - for text in args.prompt.split(','): - texts.append(text) - - if args.use_hpu_graphs: - model = ht.hpu.wrap_in_hpu_graph(model) - - autocast = torch.autocast(device_type="hpu", dtype=torch.bfloat16, enabled=args.bf16) - model.to("hpu") - - with torch.no_grad(), autocast: - for i in range(args.warmup): - inputs = processor(text=texts, images=[image]* len(texts), padding=True, return_tensors="pt").to("hpu") - outputs = model(**inputs) - torch.hpu.synchronize() - - total_model_time = 0 - for i in range(args.n_iterations): - inputs = processor(text=texts, images=[image]* len(texts), padding=True, return_tensors="pt").to("hpu") - model_start_time = time.time() - outputs = model(**inputs) - torch.hpu.synchronize() - model_end_time = time.time() - total_model_time = total_model_time + (model_end_time - model_start_time) - - if args.print_result: - if (i == 0): # generate/output once only - logits = outputs.logits - for j in range(logits.shape[0]): - threshold = 0.5 - segmented_image = ((torch.sigmoid(logits[j]) > threshold)*255).unsqueeze(0) - segmented_image = segmented_image.to(torch.float32) - save_image(segmented_image, 'segmented' + texts[j] + '.png') - print('Segmented images are generated.') - - print("n_iterations: " + str(args.n_iterations)) - print("Total latency (ms): " + str(total_model_time*1000)) - print("Average latency (ms): " + str(total_model_time*1000/args.n_iterations)) \ No newline at end of file From 
093158f8b4cf1a0eba163a0b23db02cc99365d55 Mon Sep 17 00:00:00 2001 From: Raymond Lau Date: Mon, 15 Apr 2024 20:43:46 +0000 Subject: [PATCH 08/12] Added test case for SAM model. --- tests/test_modelenabling.py | 121 ++++++++++++++++++++++++++++++++++++ 1 file changed, 121 insertions(+) create mode 100644 tests/test_modelenabling.py diff --git a/tests/test_modelenabling.py b/tests/test_modelenabling.py new file mode 100644 index 000000000..ab279bee6 --- /dev/null +++ b/tests/test_modelenabling.py @@ -0,0 +1,121 @@ +# coding=utf-8 +# Copyright 2024 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import requests +from PIL import Image +import torch +import habana_frameworks.torch as ht +import habana_frameworks.torch.core as htcore +import time +import argparse +from transformers import OwlViTProcessor, OwlViTForObjectDetection, SamProcessor, SamModel +import unittest +from unittest import TestCase +import numpy as np +import os + +from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi + +adapt_transformers_to_gaudi() + +# For Gaudi 2 +LATENCY_OWLVIT_BF16_GRAPH_BASELINE = 3.7109851837158203 +LATENCY_SAM_BF16_GRAPH_BASELINE = 98.92215728759766 + +class GaudiSAMTester(TestCase): + """ + Tests for Segment Anything Model - SAM + """ + def prepare_model_and_processor(self): + model = SamModel.from_pretrained("facebook/sam-vit-huge").to("hpu") + processor = SamProcessor.from_pretrained("facebook/sam-vit-huge") + model = model.eval() + return model, processor + + def prepare_data(self): + image = Image.open(requests.get("https://huggingface.co/ybelkada/segment-anything/resolve/main/assets/car.png", stream=True).raw).convert("RGB") + input_points = [[[450, 600]]] + return input_points, image + + def test_inference_default(self): + model, processor = self.prepare_model_and_processor() + input_points, image = self.prepare_data() + inputs = processor(image, input_points=input_points, return_tensors="pt").to("hpu") + outputs = model(**inputs) + masks = processor.image_processor.post_process_masks(outputs.pred_masks.cpu(), inputs["original_sizes"].cpu(), inputs["reshaped_input_sizes"].cpu()) + scores = outputs.iou_scores + scores = scores[0][0] + expected_scores = np.array([0.9912, 0.9818, 0.9666]) + self.assertEqual(len(scores), 3) + self.assertLess(np.abs(scores.cpu().detach().numpy() - expected_scores).max(), 0.02) + + def test_inference_bf16(self): + model, processor = self.prepare_model_and_processor() + input_points, image = self.prepare_data() + inputs = processor(image, input_points=input_points, return_tensors="pt").to("hpu") + + with torch.autocast(device_type="hpu", dtype=torch.bfloat16): # Autocast BF16 + outputs = model(**inputs) + masks = processor.image_processor.post_process_masks(outputs.pred_masks.cpu(), inputs["original_sizes"].cpu(), inputs["reshaped_input_sizes"].cpu()) + scores = outputs.iou_scores + scores = scores[0][0] + expected_scores = np.array([0.9912, 0.9818, 0.9666]) + self.assertEqual(len(scores), 3) + 
self.assertLess(np.abs(scores.to(torch.float32).cpu().detach().numpy() - expected_scores).max(), 0.02) + + def test_inference_hpu_graphs(self): + model, processor = self.prepare_model_and_processor() + input_points, image = self.prepare_data() + inputs = processor(image, input_points=input_points, return_tensors="pt").to("hpu") + + model = ht.hpu.wrap_in_hpu_graph(model) #Apply graph + + outputs = model(**inputs) + masks = processor.image_processor.post_process_masks(outputs.pred_masks.cpu(), inputs["original_sizes"].cpu(), inputs["reshaped_input_sizes"].cpu()) + scores = outputs.iou_scores + scores = scores[0][0] + expected_scores = np.array([0.9912, 0.9818, 0.9666]) + self.assertEqual(len(scores), 3) + self.assertLess(np.abs(scores.to(torch.float32).cpu().detach().numpy() - expected_scores).max(), 0.02) + + def test_no_latency_regression_bf16(self): + warmup = 3 + iterations = 10 + + model, processor = self.prepare_model_and_processor() + input_points, image = self.prepare_data() + + model = ht.hpu.wrap_in_hpu_graph(model) + + with torch.no_grad(), torch.autocast(device_type="hpu", dtype=torch.bfloat16, enabled=True): + for i in range(warmup): + inputs = processor(image, input_points=input_points, return_tensors="pt").to("hpu") + outputs = model(**inputs) + torch.hpu.synchronize() + + total_model_time = 0 + for i in range(iterations): + inputs = processor(image, input_points=input_points, return_tensors="pt").to("hpu") + model_start_time = time.time() + outputs = model(**inputs) + torch.hpu.synchronize() + model_end_time = time.time() + total_model_time = total_model_time + (model_end_time - model_start_time) + + latency = total_model_time*1000/iterations # in terms of ms + self.assertGreaterEqual(latency, 0.95 * LATENCY_SAM_BF16_GRAPH_BASELINE) + +# if __name__ == '__main__': +# unittest.main() \ No newline at end of file From 58042ebd472093b14944e56df629c1abf45fa596 Mon Sep 17 00:00:00 2001 From: Raymond Lau Date: Mon, 22 Apr 2024 17:59:33 +0000 Subject: [PATCH 09/12] Renamed the test python file and add the related test into CI test. --- Makefile | 5 ++ tests/test_image_segmentation.py | 121 +++++++++++++++++++++++++++++++ 2 files changed, 126 insertions(+) create mode 100644 tests/test_image_segmentation.py diff --git a/Makefile b/Makefile index 6e87a399a..bec72a57b 100644 --- a/Makefile +++ b/Makefile @@ -41,6 +41,11 @@ fast_tests_diffusers: python -m pip install .[tests] python -m pytest tests/test_diffusers.py +# Run unit and integration tests related to Image segmentation +fast_tests_image_segmentation: + python -m pip install .[tests] + python -m pytest tests/test_image_segmentation.py + # Run single-card non-regression tests slow_tests_1x: test_installs python -m pytest tests/test_examples.py -v -s -k "single_card" diff --git a/tests/test_image_segmentation.py b/tests/test_image_segmentation.py new file mode 100644 index 000000000..ab279bee6 --- /dev/null +++ b/tests/test_image_segmentation.py @@ -0,0 +1,121 @@ +# coding=utf-8 +# Copyright 2024 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import requests +from PIL import Image +import torch +import habana_frameworks.torch as ht +import habana_frameworks.torch.core as htcore +import time +import argparse +from transformers import OwlViTProcessor, OwlViTForObjectDetection, SamProcessor, SamModel +import unittest +from unittest import TestCase +import numpy as np +import os + +from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi + +adapt_transformers_to_gaudi() + +# For Gaudi 2 +LATENCY_OWLVIT_BF16_GRAPH_BASELINE = 3.7109851837158203 +LATENCY_SAM_BF16_GRAPH_BASELINE = 98.92215728759766 + +class GaudiSAMTester(TestCase): + """ + Tests for Segment Anything Model - SAM + """ + def prepare_model_and_processor(self): + model = SamModel.from_pretrained("facebook/sam-vit-huge").to("hpu") + processor = SamProcessor.from_pretrained("facebook/sam-vit-huge") + model = model.eval() + return model, processor + + def prepare_data(self): + image = Image.open(requests.get("https://huggingface.co/ybelkada/segment-anything/resolve/main/assets/car.png", stream=True).raw).convert("RGB") + input_points = [[[450, 600]]] + return input_points, image + + def test_inference_default(self): + model, processor = self.prepare_model_and_processor() + input_points, image = self.prepare_data() + inputs = processor(image, input_points=input_points, return_tensors="pt").to("hpu") + outputs = model(**inputs) + masks = processor.image_processor.post_process_masks(outputs.pred_masks.cpu(), inputs["original_sizes"].cpu(), inputs["reshaped_input_sizes"].cpu()) + scores = outputs.iou_scores + scores = scores[0][0] + expected_scores = np.array([0.9912, 0.9818, 0.9666]) + self.assertEqual(len(scores), 3) + self.assertLess(np.abs(scores.cpu().detach().numpy() - expected_scores).max(), 0.02) + + def test_inference_bf16(self): + model, processor = self.prepare_model_and_processor() + input_points, image = self.prepare_data() + inputs = processor(image, input_points=input_points, return_tensors="pt").to("hpu") + + with torch.autocast(device_type="hpu", dtype=torch.bfloat16): # Autocast BF16 + outputs = model(**inputs) + masks = processor.image_processor.post_process_masks(outputs.pred_masks.cpu(), inputs["original_sizes"].cpu(), inputs["reshaped_input_sizes"].cpu()) + scores = outputs.iou_scores + scores = scores[0][0] + expected_scores = np.array([0.9912, 0.9818, 0.9666]) + self.assertEqual(len(scores), 3) + self.assertLess(np.abs(scores.to(torch.float32).cpu().detach().numpy() - expected_scores).max(), 0.02) + + def test_inference_hpu_graphs(self): + model, processor = self.prepare_model_and_processor() + input_points, image = self.prepare_data() + inputs = processor(image, input_points=input_points, return_tensors="pt").to("hpu") + + model = ht.hpu.wrap_in_hpu_graph(model) #Apply graph + + outputs = model(**inputs) + masks = processor.image_processor.post_process_masks(outputs.pred_masks.cpu(), inputs["original_sizes"].cpu(), inputs["reshaped_input_sizes"].cpu()) + scores = outputs.iou_scores + scores = scores[0][0] + expected_scores = np.array([0.9912, 0.9818, 0.9666]) + self.assertEqual(len(scores), 3) + self.assertLess(np.abs(scores.to(torch.float32).cpu().detach().numpy() - expected_scores).max(), 0.02) + + def test_no_latency_regression_bf16(self): + warmup = 3 + iterations = 10 + + model, processor = self.prepare_model_and_processor() + input_points, image = self.prepare_data() + + model = ht.hpu.wrap_in_hpu_graph(model) + + 
with torch.no_grad(), torch.autocast(device_type="hpu", dtype=torch.bfloat16, enabled=True): + for i in range(warmup): + inputs = processor(image, input_points=input_points, return_tensors="pt").to("hpu") + outputs = model(**inputs) + torch.hpu.synchronize() + + total_model_time = 0 + for i in range(iterations): + inputs = processor(image, input_points=input_points, return_tensors="pt").to("hpu") + model_start_time = time.time() + outputs = model(**inputs) + torch.hpu.synchronize() + model_end_time = time.time() + total_model_time = total_model_time + (model_end_time - model_start_time) + + latency = total_model_time*1000/iterations # in terms of ms + self.assertGreaterEqual(latency, 0.95 * LATENCY_SAM_BF16_GRAPH_BASELINE) + +# if __name__ == '__main__': +# unittest.main() \ No newline at end of file From 5cd8f2b0e18f68cdfaf9d3fe46dcdbb0125f2c94 Mon Sep 17 00:00:00 2001 From: Raymond Lau Date: Tue, 30 Apr 2024 23:30:45 +0000 Subject: [PATCH 10/12] Aligned the file architecture by moving the files under object-segmentation. Used Automodel and related processor to replace model-specific API. Improved the testing logic. --- examples/object-segementation/README.md | 23 +++- .../SegmentAnythingModel/README.md | 33 ----- .../run_example.py => run_example_sam.py} | 17 +-- tests/test_image_segmentation.py | 35 ++--- tests/test_modelenabling.py | 121 ------------------ 5 files changed, 44 insertions(+), 185 deletions(-) delete mode 100644 examples/object-segementation/SegmentAnythingModel/README.md rename examples/object-segementation/{SegmentAnythingModel/run_example.py => run_example_sam.py} (93%) delete mode 100644 tests/test_modelenabling.py diff --git a/examples/object-segementation/README.md b/examples/object-segementation/README.md index 4afb59849..3204acab3 100644 --- a/examples/object-segementation/README.md +++ b/examples/object-segementation/README.md @@ -13,10 +13,12 @@ limitations under the License. # Object Segmentation Examples -This directory contains an example script that demonstrates how to perform object segmentation on Gaudi with graph mode. +This directory contains two example script that demonstrates how to perform object segmentation on Gaudi with graph mode. 
## Single-HPU inference +### ClipSeg Model + ```bash python3 run_example.py \ --model_name_or_path "CIDAS/clipseg-rd64-refined" \ @@ -29,4 +31,21 @@ python3 run_example.py \ --print_result ``` Models that have been validated: - - [clipseg-rd64-refined ](https://huggingface.co/CIDAS/clipseg-rd64-refined) \ No newline at end of file + - [clipseg-rd64-refined ](https://huggingface.co/CIDAS/clipseg-rd64-refined) + +### Segment Anything Model + +```bash +python3 run_example_sam.py \ + --model_name_or_path "facebook/sam-vit-huge" \ + --image_path "https://huggingface.co/ybelkada/segment-anything/resolve/main/assets/car.png" \ + --point_prompt "450,600" \ + --warmup 3 \ + --n_iterations 20 \ + --use_hpu_graphs \ + --bf16 \ + --print_result +``` +Models that have been validated: + - [facebook/sam-vit-base](https://huggingface.co/facebook/sam-vit-base) + - [facebook/sam-vit-huge](https://huggingface.co/facebook/sam-vit-huge) \ No newline at end of file diff --git a/examples/object-segementation/SegmentAnythingModel/README.md b/examples/object-segementation/SegmentAnythingModel/README.md deleted file mode 100644 index 73a7129b2..000000000 --- a/examples/object-segementation/SegmentAnythingModel/README.md +++ /dev/null @@ -1,33 +0,0 @@ - - -# Segment Anything Model Examples - -This directory contains an example script that demonstrates using SAM with graph mode. - -## Single-HPU inference - -```bash -python3 run_example.py \ - --model_name_or_path "facebook/sam-vit-huge" \ - --image_path "https://huggingface.co/ybelkada/segment-anything/resolve/main/assets/car.png" \ - --point_prompt "450,600" \ - --warmup 3 \ - --n_iterations 20 \ - --use_hpu_graphs \ - --bf16 \ - --print_result -``` -Models that have been validated: - - [facebook/sam-vit-base](https://huggingface.co/facebook/sam-vit-base) - - [facebook/sam-vit-huge](https://huggingface.co/facebook/sam-vit-huge) \ No newline at end of file diff --git a/examples/object-segementation/SegmentAnythingModel/run_example.py b/examples/object-segementation/run_example_sam.py similarity index 93% rename from examples/object-segementation/SegmentAnythingModel/run_example.py rename to examples/object-segementation/run_example_sam.py index 592f7f429..016b318be 100644 --- a/examples/object-segementation/SegmentAnythingModel/run_example.py +++ b/examples/object-segementation/run_example_sam.py @@ -15,17 +15,18 @@ # Copied from https://huggingface.co/facebook/sam-vit-base -from transformers import SamModel, SamProcessor -from PIL import Image +import argparse +import time + +import habana_frameworks.torch as ht import requests import torch -import habana_frameworks.torch as ht -import habana_frameworks.torch.core as htcore -import time -import argparse +from PIL import Image +from transformers import AutoModel, AutoProcessor from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi + if __name__ == "__main__": parser = argparse.ArgumentParser() @@ -69,8 +70,8 @@ adapt_transformers_to_gaudi() - processor = SamProcessor.from_pretrained(args.model_name_or_path) - model = SamModel.from_pretrained(args.model_name_or_path) + processor = AutoProcessor.from_pretrained(args.model_name_or_path) + model = AutoModel.from_pretrained(args.model_name_or_path) image = Image.open(requests.get(args.image_path, stream=True).raw).convert("RGB") points = [] diff --git a/tests/test_image_segmentation.py b/tests/test_image_segmentation.py index ab279bee6..cae5042af 100644 --- a/tests/test_image_segmentation.py +++ b/tests/test_image_segmentation.py @@ -13,21 
+13,19 @@ # See the License for the specific language governing permissions and # limitations under the License. -import requests -from PIL import Image -import torch -import habana_frameworks.torch as ht -import habana_frameworks.torch.core as htcore import time -import argparse -from transformers import OwlViTProcessor, OwlViTForObjectDetection, SamProcessor, SamModel -import unittest from unittest import TestCase + +import habana_frameworks.torch as ht import numpy as np -import os +import requests +import torch +from PIL import Image +from transformers import AutoModel, AutoProcessor from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi + adapt_transformers_to_gaudi() # For Gaudi 2 @@ -39,8 +37,8 @@ class GaudiSAMTester(TestCase): Tests for Segment Anything Model - SAM """ def prepare_model_and_processor(self): - model = SamModel.from_pretrained("facebook/sam-vit-huge").to("hpu") - processor = SamProcessor.from_pretrained("facebook/sam-vit-huge") + model = AutoModel.from_pretrained("facebook/sam-vit-huge").to("hpu") + processor = AutoProcessor.from_pretrained("facebook/sam-vit-huge") model = model.eval() return model, processor @@ -54,7 +52,6 @@ def test_inference_default(self): input_points, image = self.prepare_data() inputs = processor(image, input_points=input_points, return_tensors="pt").to("hpu") outputs = model(**inputs) - masks = processor.image_processor.post_process_masks(outputs.pred_masks.cpu(), inputs["original_sizes"].cpu(), inputs["reshaped_input_sizes"].cpu()) scores = outputs.iou_scores scores = scores[0][0] expected_scores = np.array([0.9912, 0.9818, 0.9666]) @@ -68,7 +65,6 @@ def test_inference_bf16(self): with torch.autocast(device_type="hpu", dtype=torch.bfloat16): # Autocast BF16 outputs = model(**inputs) - masks = processor.image_processor.post_process_masks(outputs.pred_masks.cpu(), inputs["original_sizes"].cpu(), inputs["reshaped_input_sizes"].cpu()) scores = outputs.iou_scores scores = scores[0][0] expected_scores = np.array([0.9912, 0.9818, 0.9666]) @@ -83,7 +79,6 @@ def test_inference_hpu_graphs(self): model = ht.hpu.wrap_in_hpu_graph(model) #Apply graph outputs = model(**inputs) - masks = processor.image_processor.post_process_masks(outputs.pred_masks.cpu(), inputs["original_sizes"].cpu(), inputs["reshaped_input_sizes"].cpu()) scores = outputs.iou_scores scores = scores[0][0] expected_scores = np.array([0.9912, 0.9818, 0.9666]) @@ -102,20 +97,18 @@ def test_no_latency_regression_bf16(self): with torch.no_grad(), torch.autocast(device_type="hpu", dtype=torch.bfloat16, enabled=True): for i in range(warmup): inputs = processor(image, input_points=input_points, return_tensors="pt").to("hpu") - outputs = model(**inputs) + _ = model(**inputs) torch.hpu.synchronize() - + total_model_time = 0 for i in range(iterations): inputs = processor(image, input_points=input_points, return_tensors="pt").to("hpu") model_start_time = time.time() - outputs = model(**inputs) + _ = model(**inputs) torch.hpu.synchronize() model_end_time = time.time() total_model_time = total_model_time + (model_end_time - model_start_time) - + latency = total_model_time*1000/iterations # in terms of ms - self.assertGreaterEqual(latency, 0.95 * LATENCY_SAM_BF16_GRAPH_BASELINE) + self.assertLessEqual(latency, 1.05 * LATENCY_SAM_BF16_GRAPH_BASELINE) -# if __name__ == '__main__': -# unittest.main() \ No newline at end of file diff --git a/tests/test_modelenabling.py b/tests/test_modelenabling.py deleted file mode 100644 index ab279bee6..000000000 --- 
a/tests/test_modelenabling.py +++ /dev/null @@ -1,121 +0,0 @@ -# coding=utf-8 -# Copyright 2024 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import requests -from PIL import Image -import torch -import habana_frameworks.torch as ht -import habana_frameworks.torch.core as htcore -import time -import argparse -from transformers import OwlViTProcessor, OwlViTForObjectDetection, SamProcessor, SamModel -import unittest -from unittest import TestCase -import numpy as np -import os - -from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi - -adapt_transformers_to_gaudi() - -# For Gaudi 2 -LATENCY_OWLVIT_BF16_GRAPH_BASELINE = 3.7109851837158203 -LATENCY_SAM_BF16_GRAPH_BASELINE = 98.92215728759766 - -class GaudiSAMTester(TestCase): - """ - Tests for Segment Anything Model - SAM - """ - def prepare_model_and_processor(self): - model = SamModel.from_pretrained("facebook/sam-vit-huge").to("hpu") - processor = SamProcessor.from_pretrained("facebook/sam-vit-huge") - model = model.eval() - return model, processor - - def prepare_data(self): - image = Image.open(requests.get("https://huggingface.co/ybelkada/segment-anything/resolve/main/assets/car.png", stream=True).raw).convert("RGB") - input_points = [[[450, 600]]] - return input_points, image - - def test_inference_default(self): - model, processor = self.prepare_model_and_processor() - input_points, image = self.prepare_data() - inputs = processor(image, input_points=input_points, return_tensors="pt").to("hpu") - outputs = model(**inputs) - masks = processor.image_processor.post_process_masks(outputs.pred_masks.cpu(), inputs["original_sizes"].cpu(), inputs["reshaped_input_sizes"].cpu()) - scores = outputs.iou_scores - scores = scores[0][0] - expected_scores = np.array([0.9912, 0.9818, 0.9666]) - self.assertEqual(len(scores), 3) - self.assertLess(np.abs(scores.cpu().detach().numpy() - expected_scores).max(), 0.02) - - def test_inference_bf16(self): - model, processor = self.prepare_model_and_processor() - input_points, image = self.prepare_data() - inputs = processor(image, input_points=input_points, return_tensors="pt").to("hpu") - - with torch.autocast(device_type="hpu", dtype=torch.bfloat16): # Autocast BF16 - outputs = model(**inputs) - masks = processor.image_processor.post_process_masks(outputs.pred_masks.cpu(), inputs["original_sizes"].cpu(), inputs["reshaped_input_sizes"].cpu()) - scores = outputs.iou_scores - scores = scores[0][0] - expected_scores = np.array([0.9912, 0.9818, 0.9666]) - self.assertEqual(len(scores), 3) - self.assertLess(np.abs(scores.to(torch.float32).cpu().detach().numpy() - expected_scores).max(), 0.02) - - def test_inference_hpu_graphs(self): - model, processor = self.prepare_model_and_processor() - input_points, image = self.prepare_data() - inputs = processor(image, input_points=input_points, return_tensors="pt").to("hpu") - - model = ht.hpu.wrap_in_hpu_graph(model) #Apply graph - - outputs = model(**inputs) - masks = 
processor.image_processor.post_process_masks(outputs.pred_masks.cpu(), inputs["original_sizes"].cpu(), inputs["reshaped_input_sizes"].cpu()) - scores = outputs.iou_scores - scores = scores[0][0] - expected_scores = np.array([0.9912, 0.9818, 0.9666]) - self.assertEqual(len(scores), 3) - self.assertLess(np.abs(scores.to(torch.float32).cpu().detach().numpy() - expected_scores).max(), 0.02) - - def test_no_latency_regression_bf16(self): - warmup = 3 - iterations = 10 - - model, processor = self.prepare_model_and_processor() - input_points, image = self.prepare_data() - - model = ht.hpu.wrap_in_hpu_graph(model) - - with torch.no_grad(), torch.autocast(device_type="hpu", dtype=torch.bfloat16, enabled=True): - for i in range(warmup): - inputs = processor(image, input_points=input_points, return_tensors="pt").to("hpu") - outputs = model(**inputs) - torch.hpu.synchronize() - - total_model_time = 0 - for i in range(iterations): - inputs = processor(image, input_points=input_points, return_tensors="pt").to("hpu") - model_start_time = time.time() - outputs = model(**inputs) - torch.hpu.synchronize() - model_end_time = time.time() - total_model_time = total_model_time + (model_end_time - model_start_time) - - latency = total_model_time*1000/iterations # in terms of ms - self.assertGreaterEqual(latency, 0.95 * LATENCY_SAM_BF16_GRAPH_BASELINE) - -# if __name__ == '__main__': -# unittest.main() \ No newline at end of file From 09a1a335b191973952eb863ed8dc7dee5c6725bd Mon Sep 17 00:00:00 2001 From: Raymond Lau Date: Tue, 30 Apr 2024 23:41:55 +0000 Subject: [PATCH 11/12] Update README.md and index.mdx. --- README.md | 2 +- docs/source/index.mdx | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 5260e0625..9c88e1481 100644 --- a/README.md +++ b/README.md @@ -214,7 +214,7 @@ The following model architectures, tasks and device distributions have been vali | OWLViT | |
<li>Single card</li> | <li>[zero shot object detection](https://github.com/huggingface/optimum-habana/tree/main/examples/zero-shot-object-detection)</li> |
 | ClipSeg | | <li>Single card</li> | <li>[object segmentation](https://github.com/huggingface/optimum-habana/tree/main/examples/object-segementation)</li> |
 | Llava / Llava-next | | <li>Single card</li> | <li>[image to text](https://github.com/huggingface/optimum-habana/tree/main/examples/image-to-text)</li> |
-| Segment Anything Model | | <li>Single card</li> | <li>[object segmentation](https://github.com/huggingface/optimum-habana/tree/main/examples/object-segementation/SegmentAnythingModel)</li> |
+| Segment Anything Model | | <li>Single card</li> | <li>[object segmentation](https://github.com/huggingface/optimum-habana/tree/main/examples/object-segementation)</li> |
  • | - Diffusers: diff --git a/docs/source/index.mdx b/docs/source/index.mdx index 5afc2b9bf..132630c63 100644 --- a/docs/source/index.mdx +++ b/docs/source/index.mdx @@ -47,7 +47,11 @@ In the tables below, ✅ means single-card, multi-card and DeepSpeed have all be | GPT-J |
<li>DeepSpeed</li> | <li>Single card</li><li>DeepSpeed</li> | <li>[language modeling](https://github.com/huggingface/optimum-habana/tree/main/examples/language-modeling)</li><li>[text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)</li> |
 | GPT-NeoX | <li>DeepSpeed</li> | <li>DeepSpeed</li> | <li>[language modeling](https://github.com/huggingface/optimum-habana/tree/main/examples/language-modeling)</li><li>[text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)</li> |
 | OPT | | <li>DeepSpeed</li> | <li>[text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)</li> |
+<<<<<<< HEAD
 | Llama 2 / CodeLlama / Llama 3 / Llama Guard | ✅ | ✅ | <li>[language modeling](https://github.com/huggingface/optimum-habana/tree/main/examples/language-modeling)</li><li>[text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)</li><li>[question answering](https://github.com/huggingface/optimum-habana/tree/main/examples/question-answering)</li><li>[text classification](https://github.com/huggingface/optimum-habana/tree/main/examples/text-classification) (Llama Guard)</li> |
+=======
+| Llama 2 / CodeLlama / Llama 3 | ✅ | ✅ | <li>[language modeling](https://github.com/huggingface/optimum-habana/tree/main/examples/language-modeling)</li><li>[text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)</li><li>[question answering](https://github.com/huggingface/optimum-habana/tree/main/examples/question-answering)</li> |
+>>>>>>> 455d728 (Update README.md and index.mdx.)
 | StableLM | | <li>Single card</li> | <li>[text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)</li> |
 | Falcon | <li>LoRA</li> | ✅ | <li>[text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)</li> |
 | CodeGen | | <li>Single card</li> | <li>[text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)</li> |
@@ -72,7 +76,7 @@ In the tables below, ✅ means single-card, multi-card and DeepSpeed have all be
 | OWLViT | | <li>Single card</li> | <li>[zero shot object detection](https://github.com/huggingface/optimum-habana/tree/main/examples/zero-shot-object-detection)</li> |
 | ClipSeg | | <li>Single card</li> | <li>[object segmentation](https://github.com/huggingface/optimum-habana/tree/main/examples/object-segementation)</li> |
 | Llava / Llava-next | | <li>Single card</li> | <li>[image to text](https://github.com/huggingface/optimum-habana/tree/main/examples/image-to-text)</li> |
-| SAM | | <li>Single card</li> | <li>[object segmentation](https://github.com/huggingface/optimum-habana/tree/main/examples/object-segementation/SegmentAnythingModel)</li> |
+| SAM | | <li>Single card</li> | <li>[object segmentation](https://github.com/huggingface/optimum-habana/tree/main/examples/object-segementation)</li> |
  • | - Diffusers From b398475f0c4171cbd27f4cb32adb8c15c63ec996 Mon Sep 17 00:00:00 2001 From: Raymond Lau Date: Wed, 29 May 2024 21:07:30 +0000 Subject: [PATCH 12/12] Update typos. --- docs/source/index.mdx | 4 ---- examples/object-segementation/README.md | 2 +- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/docs/source/index.mdx b/docs/source/index.mdx index 132630c63..22b8dcba5 100644 --- a/docs/source/index.mdx +++ b/docs/source/index.mdx @@ -47,11 +47,7 @@ In the tables below, ✅ means single-card, multi-card and DeepSpeed have all be | GPT-J |
<li>DeepSpeed</li> | <li>Single card</li><li>DeepSpeed</li> | <li>[language modeling](https://github.com/huggingface/optimum-habana/tree/main/examples/language-modeling)</li><li>[text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)</li> |
 | GPT-NeoX | <li>DeepSpeed</li> | <li>DeepSpeed</li> | <li>[language modeling](https://github.com/huggingface/optimum-habana/tree/main/examples/language-modeling)</li><li>[text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)</li> |
 | OPT | | <li>DeepSpeed</li> | <li>[text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)</li> |
-<<<<<<< HEAD
 | Llama 2 / CodeLlama / Llama 3 / Llama Guard | ✅ | ✅ | <li>[language modeling](https://github.com/huggingface/optimum-habana/tree/main/examples/language-modeling)</li><li>[text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)</li><li>[question answering](https://github.com/huggingface/optimum-habana/tree/main/examples/question-answering)</li><li>[text classification](https://github.com/huggingface/optimum-habana/tree/main/examples/text-classification) (Llama Guard)</li> |
-=======
-| Llama 2 / CodeLlama / Llama 3 | ✅ | ✅ | <li>[language modeling](https://github.com/huggingface/optimum-habana/tree/main/examples/language-modeling)</li><li>[text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)</li><li>[question answering](https://github.com/huggingface/optimum-habana/tree/main/examples/question-answering)</li> |
->>>>>>> 455d728 (Update README.md and index.mdx.)
 | StableLM | | <li>Single card</li> | <li>[text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)</li> |
 | Falcon | <li>LoRA</li> | ✅ | <li>[text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)</li> |
 | CodeGen | | <li>Single card</li> | <li>[text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)</li> |
  • | diff --git a/examples/object-segementation/README.md b/examples/object-segementation/README.md index 3204acab3..fa1496a54 100644 --- a/examples/object-segementation/README.md +++ b/examples/object-segementation/README.md @@ -13,7 +13,7 @@ limitations under the License. # Object Segmentation Examples -This directory contains two example script that demonstrates how to perform object segmentation on Gaudi with graph mode. +This directory contains two examples script that demonstrates how to perform object segmentation on Gaudi with graph mode. ## Single-HPU inference
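
For readers skimming the series, the snippet below is a condensed, unofficial sketch of the single-image SAM inference flow that the scripts added in these patches implement (Gaudi adaptation, HPU graph wrapping, bf16 autocast, and an explicit device synchronize). It is not an additional file in the PR; the model name, image URL, and point prompt simply mirror the defaults shown in the example READMEs above.

```python
# Condensed sketch of the HPU inference flow used by run_example_sam.py in this series.
import requests
import torch
import habana_frameworks.torch as ht
from PIL import Image
from transformers import AutoModel, AutoProcessor

from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi

adapt_transformers_to_gaudi()  # patch transformers for Gaudi before loading the model

processor = AutoProcessor.from_pretrained("facebook/sam-vit-huge")
model = AutoModel.from_pretrained("facebook/sam-vit-huge")
model = ht.hpu.wrap_in_hpu_graph(model)  # equivalent to passing --use_hpu_graphs
model.to("hpu")

image = Image.open(
    requests.get(
        "https://huggingface.co/ybelkada/segment-anything/resolve/main/assets/car.png", stream=True
    ).raw
).convert("RGB")
points = [[[450, 600]]]  # same point prompt as the README example

with torch.no_grad(), torch.autocast(device_type="hpu", dtype=torch.bfloat16, enabled=True):
    inputs = processor(image, input_points=points, return_tensors="pt").to("hpu")
    outputs = model(**inputs)
    torch.hpu.synchronize()  # wait for the HPU before reading results
    print("iou score:", outputs.iou_scores)
```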