Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Polygon tracking feature with XMem tracker #7829

Open
wants to merge 13 commits into
base: develop
Choose a base branch
from
Open
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,7 @@ interface State {
activeLabelID: number | null;
activeTracker: MLModel | null;
convertMasksToPolygons: boolean;
selectedObjectType: ObjectType;
trackedShapes: TrackedShape[];
fetching: boolean;
pointsReceived: boolean;
Expand Down Expand Up @@ -235,6 +236,7 @@ export class ToolsControlComponent extends React.PureComponent<Props, State> {
super(props);
this.state = {
convertMasksToPolygons: false,
selectedObjectType: ObjectType.SHAPE,
activeInteractor: props.interactors.length ? props.interactors[0] : null,
activeTracker: props.trackers.length ? props.trackers[0] : null,
activeLabelID: props.labels.length ? props.labels[0].id as number : null,
Expand Down Expand Up @@ -593,7 +595,7 @@ export class ToolsControlComponent extends React.PureComponent<Props, State> {
const portals = !activeTracker ?
[] :
states
.filter((objectState) => objectState.objectType === 'track' && objectState.shapeType === 'rectangle')
.filter((objectState) => objectState.objectType === 'track' && (objectState.shapeType === 'rectangle' || objectState.shapeType === 'polygon'))
.map((objectState: any): React.ReactPortal | null => {
const { clientID } = objectState;
const selectorID = `#cvat-objects-sidebar-state-item-${clientID}`;
Expand Down Expand Up @@ -822,7 +824,10 @@ export class ToolsControlComponent extends React.PureComponent<Props, State> {
job: jobInstance.id,
}) as TrackerResults;

response.shapes = response.shapes.map(trackedRectangleMapper);
// If shape type is rectangle, keep same approach
if (response.shapes[0].length === 4) {
response.shapes = response.shapes.map(trackedRectangleMapper);
}
for (let i = 0; i < trackableObjects.clientIDs.length; i++) {
const clientID = trackableObjects.clientIDs[i];
const shape = response.shapes[i];
Expand Down Expand Up @@ -859,15 +864,15 @@ export class ToolsControlComponent extends React.PureComponent<Props, State> {
}

private async constructFromPoints(): Promise<void> {
const { convertMasksToPolygons } = this.state;
const { convertMasksToPolygons, selectedObjectType } = this.state;
const {
frame, labels, curZOrder, activeLabelID, createAnnotations,
} = this.props;

if (convertMasksToPolygons) {
const object = new core.classes.ObjectState({
frame,
objectType: ObjectType.SHAPE,
objectType: selectedObjectType,
source: core.enums.Source.SEMI_AUTO,
label: labels.find((label) => label.id === activeLabelID as number) as Label,
shapeType: ShapeType.POLYGON,
Expand Down Expand Up @@ -958,6 +963,29 @@ export class ToolsControlComponent extends React.PureComponent<Props, State> {
);
}

private renderObjectTypeBlock(): JSX.Element {
const { selectedObjectType } = this.state;
const objectTypes = Object.values(ObjectType);
objectTypes.splice(objectTypes.indexOf(ObjectType.TAG), 1);
Comment on lines +968 to +969
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If you mean that this code supports only shape and track, you should write exactly that.
Otherwise, when we add one more object type, this part will silently break and nobody will know about it.

return (
<Row className='cvat-interactors-setups-container'>
<Select
value={selectedObjectType}
onChange={(value: ObjectType) => {
this.setState({ selectedObjectType: value });
}}
>
{objectTypes.map((type) => (
<Select.Option key={type} value={type}>
{type}
</Select.Option>
))}
</Select>
<Text>Object Type</Text>
</Row>
);
}

private renderLabelBlock(): JSX.Element {
const { labels } = this.props;
const { activeLabelID } = this.state;
Expand Down Expand Up @@ -1346,6 +1374,7 @@ export class ToolsControlComponent extends React.PureComponent<Props, State> {
}

private renderPopoverContent(): JSX.Element {
const { convertMasksToPolygons } = this.state;
return (
<div className='cvat-tools-control-popover-content'>
<Row justify='start'>
Expand All @@ -1358,6 +1387,7 @@ export class ToolsControlComponent extends React.PureComponent<Props, State> {
<Tabs type='card' tabBarGutter={8}>
<Tabs.TabPane key='interactors' tab='Interactors'>
{this.renderMasksConvertingBlock()}
{convertMasksToPolygons ? this.renderObjectTypeBlock() : null}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

XMem is a tracker, not an interactor. The difference between them is exactly that an interactor produces shapes on a single frame, whereas a tracker produces shapes on multiple frames (tracks).
Why are you modifying the block responsible for interactors?

{this.renderLabelBlock()}
{this.renderInteractorBlock()}
</Tabs.TabPane>
Expand Down
67 changes: 67 additions & 0 deletions serverless/pytorch/omerferhatt/xmem/nuclio/function-gpu.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
metadata:
name: pth-omerferhatt-xmem
namespace: cvat
annotations:
name: XMem
type: tracker
spec:
framework: pytorch

spec:
description: Long-Term Object Segmentation with an Atkinson-Shiffrin Memory Model
runtime: 'python:3.9'
handler: main:handler
eventTimeout: 30s

env:
- name: PYTHONPATH
value: /opt/nuclio/xmem

build:
image: cvat.pth.omerferhatt.xmem:latest-gpu
baseImage: nvidia/cuda:12.1.0-runtime-ubuntu22.04

directives:
preCopy:
- kind: RUN
value: |-
apt update \
&& apt install -y --no-install-recommends \
wget \
git \
ca-certificates \
python-is-python3 \
python3 \
python3-pip \
&& rm -rf /var/lib/apt/lists/*
- kind: WORKDIR
value: /opt/nuclio
- kind: RUN
value: git clone --branch main https://github.com/omerferhatt/XMem xmem
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why not use the original repository?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also, it is better to use a specific tag. The main branch may change, and a new version may not be compatible with the old dependencies, for example.

- kind: RUN
value: pip install opencv-python-headless jsonpickle
- kind: RUN
value: |-
pip install torch torchvision
Comment on lines +42 to +45
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is usually a good idea to pin dependency versions. Otherwise, it may eventually turn out that the image can no longer be built.

- kind: RUN
value: wget 'https://www.dropbox.com/scl/fi/5m1l747p15qzgq023e0q9/xmem.pth?rlkey=ss2kjaq4qlvvk5juucyvtmrh8&dl=0' -O '/xmem.pth'
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I do not think we should put the file in the root directory (/).
It is better to put it in the current working directory.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I do not think Dropbox is reliable enough.

Also, it is not clear who owns the weights or what their license is.
Is there an original source for them?


triggers:
myHttpTrigger:
maxWorkers: 1
kind: 'http'
workerAvailabilityTimeoutMilliseconds: 10000
attributes:
# Set value from the calculation of tracking of 100 objects at the same time on a 4k image
maxRequestBodySize: 1073741824 # 1GB
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there a reason to send a 1 GB body within an HTTP request?


resources:
limits:
nvidia.com/gpu: 1

platform:
attributes:
restartPolicy:
name: always
maximumRetryCount: 3
mountMode: volume
63 changes: 63 additions & 0 deletions serverless/pytorch/omerferhatt/xmem/nuclio/function.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
# Nuclio function definition for the XMem tracker, CPU build
# (plain Ubuntu base image; torch is installed with the CPU wheel index below).
metadata:
name: pth-omerferhatt-xmem
namespace: cvat
annotations:
name: XMem
type: tracker
spec:
framework: pytorch

spec:
description: Long-Term Object Segmentation with an Atkinson-Shiffrin Memory Model
runtime: 'python:3.9'
# Entry point: the handler() function in main.py.
handler: main:handler
eventTimeout: 30s

# Puts the XMem checkout (cloned into /opt/nuclio/xmem by the build steps below) on the import path.
env:
- name: PYTHONPATH
value: /opt/nuclio/xmem

build:
image: cvat.pth.omerferhatt.xmem
baseImage: ubuntu:22.04

directives:
preCopy:
- kind: RUN
value: |-
apt update \
&& apt install -y --no-install-recommends \
wget \
git \
ca-certificates \
python-is-python3 \
python3 \
python3-pip \
&& rm -rf /var/lib/apt/lists/*
- kind: WORKDIR
value: /opt/nuclio
# NOTE(review): this clones the moving `main` branch; pinning a tag or commit would make the build reproducible.
- kind: RUN
value: git clone --branch main https://github.com/omerferhatt/XMem xmem
# NOTE(review): package versions are not pinned here or in the torch install below.
- kind: RUN
value: pip install opencv-python-headless jsonpickle
# Adds the PyTorch CPU wheel index as an extra package source for this install.
- kind: RUN
value: |-
pip install torch torchvision --extra-index-url https://download.pytorch.org/whl/cpu
# Downloads xmem.pth (presumably the model weights - confirm) to the filesystem root, not to the WORKDIR.
- kind: RUN
value: wget 'https://www.dropbox.com/scl/fi/5m1l747p15qzgq023e0q9/xmem.pth?rlkey=ss2kjaq4qlvvk5juucyvtmrh8&dl=0' -O '/xmem.pth'

triggers:
myHttpTrigger:
maxWorkers: 1
kind: 'http'
workerAvailabilityTimeoutMilliseconds: 10000
attributes:
# Set value from the calculation of tracking of 100 objects at the same time on a 4k image
maxRequestBodySize: 1073741824 # 1GB

platform:
attributes:
restartPolicy:
name: always
maximumRetryCount: 3
mountMode: volume
42 changes: 42 additions & 0 deletions serverless/pytorch/omerferhatt/xmem/nuclio/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
import base64
import io
import json

import numpy as np
from model_handler import ModelHandler
from PIL import Image


def init_context(context):
    """Initialisation hook: build the model once and attach it to the context.

    The ModelHandler instance is stored as ``context.user_data.model``,
    which is where ``handler`` reads it from on each request.
    """
    log = context.logger
    log.info("Init context... 0%")
    context.user_data.model = ModelHandler()
    log.info("Init context...100%")


def handler(context, event):
    """Request entry point: run the XMem tracker on a single image.

    Fields read from ``event.body``:
      * ``image``  - base64-encoded image data (required).
      * ``shapes`` - one shape per tracked object (optional).
      * ``states`` - per-object tracker states, matched to ``shapes`` by
        index (optional). An object without a matching state is passed
        ``None`` as its state.

    Returns a 200 JSON response of the form
    ``{"shapes": [...], "states": [...]}`` holding, for every input shape,
    the pair returned by ``model.infer``.
    """
    context.logger.info("Run XMem model")
    data = event.body
    buf = io.BytesIO(base64.b64decode(data["image"]))
    # dict.get() yields None for a missing (or null) key; fall back to empty
    # lists so that enumerate()/len() below cannot raise TypeError.
    shapes = data.get("shapes") or []
    states = data.get("states") or []

    image = Image.open(buf).convert("RGB")
    # Reverse the channel axis (RGB -> BGR); copy() materialises the reversed
    # view as a new contiguous array.
    image = np.array(image)[:, :, ::-1].copy()

    results = {"shapes": [], "states": []}
    for i, shape in enumerate(shapes):
        context.logger.info(f"Inference [{i}] started")

        # Objects beyond the end of ``states`` have no previous state yet.
        previous_state = states[i] if i < len(states) else None
        shape, state = context.user_data.model.infer(image, shape, previous_state)

        results["shapes"].append(shape)
        results["states"].append(state)
        context.logger.info(f"Inference [{i}] finished")

    return context.Response(
        body=json.dumps(results),
        headers={},
        content_type="application/json",
        status_code=200,
    )