Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

intermittent test_to_pytorch_dataloader() failures #69

Open
mkornacker opened this issue Jan 23, 2024 · 2 comments
Open

intermittent test_to_pytorch_dataloader() failures #69

mkornacker opened this issue Jan 23, 2024 · 2 comments
Assignees

Comments

@mkornacker
Copy link
Collaborator

This is what the logs say:
=================================================================================================== FAILURES ====================================================================================================
___________________________________________________________________________________ TestDataFrame.test_to_pytorch_dataloader ____________________________________________________________________________________

self = <torch.utils.data.dataloader._MultiProcessingDataLoaderIter object at 0x7fe7cc9a6710>, timeout = 5.0

def _try_get_data(self, timeout=_utils.MP_STATUS_CHECK_INTERVAL):
    # Tries to fetch data from `self._data_queue` once for a given timeout.
    # This can also be used as inner loop of fetching without timeout, with
    # the sender status as the loop condition.
    #
    # This raises a `RuntimeError` if any worker died unexpectedly. This error
    # can come from either the SIGCHLD handler in `_utils/signal_handling.py`
    # (only for non-Windows platforms), or the manual check below on errors
    # and timeouts.
    #
    # Returns a 2-tuple:
    #   (bool: whether successfully get data, any: data if successful else None)
    try:
      data = self._data_queue.get(timeout=timeout)

../.cache/pypoetry/virtualenvs/pixeltable-9-yc5FAE-py3.10/lib/python3.10/site-packages/torch/utils/data/dataloader.py:1132:


/usr/lib/python3.10/multiprocessing/queues.py:113: in get
if not self._poll(timeout):
/usr/lib/python3.10/multiprocessing/connection.py:257: in poll
return self._poll(timeout)
/usr/lib/python3.10/multiprocessing/connection.py:424: in _poll
r = wait([self], timeout)
/usr/lib/python3.10/multiprocessing/connection.py:931: in wait
ready = selector.select(timeout)
/usr/lib/python3.10/selectors.py:416: in select
fd_event_list = self._selector.poll(timeout)


signum = 17, frame = <frame at 0x7fe7f801d840, file '/usr/lib/python3.10/selectors.py', line 417, code select>

def handler(signum, frame):
    # This following call uses `waitid` with WNOHANG from C side. Therefore,
    # Python can still get and update the process status successfully.
  _error_if_any_worker_fails()

E RuntimeError: DataLoader worker (pid 3972440) is killed by signal: Segmentation fault.

../.cache/pypoetry/virtualenvs/pixeltable-9-yc5FAE-py3.10/lib/python3.10/site-packages/torch/utils/data/_utils/signal_handling.py:66: RuntimeError

The above exception was the direct cause of the following exception:

self = <test_dataframe.TestDataFrame object at 0x7fe91f129210>
all_datatypes_tbl = Column Name Type Computed With
row_id int
c_array ar...g
c_timestamp timestamp
c_video video

def test_to_pytorch_dataloader(self, all_datatypes_tbl: catalog.MutableTable) -> None:
    """ Tests the dataset works well with pytorch dataloader:
        1. compatibility with multiprocessing
        2. compatibility of all types with default collate_fn
    """
    import torch.utils.data
    @pt.udf(param_types=[pt.JsonType()], return_type=pt.JsonType())
    def restrict_json_for_default_collate(obj):
        keys = ['id', 'label', 'iscrowd', 'bounding_box']
        return {k: obj[k] for k in keys}

    t = all_datatypes_tbl
    df = t.select(
        t.row_id,
        t.c_int,
        t.c_float,
        t.c_bool,
        t.c_timestamp,
        t.c_array,
        t.c_video,
        # default collate_fn doesnt support null values, nor lists of different lengths
        # but does allow some dictionaries if they are uniform
        c_json = restrict_json_for_default_collate(t.c_json.detections[0]),
        # images must be uniform shape for pytorch collate_fn to not fail
        c_image=t.c_image.resize([220, 224]).convert('RGB')
    )
    df_size = df.count()
    ds = df.to_pytorch_dataset(image_format='pt')
    # test serialization:
    #  - pickle.dumps() and pickle.loads() must work so that
    #   we can use num_workers > 0
    x = pickle.dumps(ds)
    _ = pickle.loads(x)

    # test we get all rows
    def check_recover_all_rows(ds, size : int, **kwargs):
        dl = torch.utils.data.DataLoader(ds, **kwargs)
        loaded_ids = set()
        for batch in dl:
            for row_id in batch['row_id']:
                val = int(row_id) # np.int -> int or will fail set equality test below.
                assert val not in loaded_ids, val
                loaded_ids.add(val)

        assert loaded_ids == set(range(size))

    # check different number of workers
    check_recover_all_rows(ds, size=df_size, batch_size=3, num_workers=0) # within this process
    check_recover_all_rows(ds, size=df_size, batch_size=3, num_workers=2) # two separate processes

    # check edge case where some workers get no rows
    short_size = 1
    df_short = df.where(t.row_id < short_size)
    ds_short = df_short.to_pytorch_dataset(image_format='pt')
  check_recover_all_rows(ds_short, size=short_size, batch_size=13, num_workers=short_size+1)

pixeltable/tests/test_dataframe.py:344:


pixeltable/tests/test_dataframe.py:328: in check_recover_all_rows
for batch in dl:
../.cache/pypoetry/virtualenvs/pixeltable-9-yc5FAE-py3.10/lib/python3.10/site-packages/torch/utils/data/dataloader.py:630: in next
data = self._next_data()
../.cache/pypoetry/virtualenvs/pixeltable-9-yc5FAE-py3.10/lib/python3.10/site-packages/torch/utils/data/dataloader.py:1328: in _next_data
idx, data = self._get_data()
../.cache/pypoetry/virtualenvs/pixeltable-9-yc5FAE-py3.10/lib/python3.10/site-packages/torch/utils/data/dataloader.py:1294: in _get_data
success, data = self._try_get_data()


self = <torch.utils.data.dataloader._MultiProcessingDataLoaderIter object at 0x7fe7cc9a6710>, timeout = 5.0

def _try_get_data(self, timeout=_utils.MP_STATUS_CHECK_INTERVAL):
    # Tries to fetch data from `self._data_queue` once for a given timeout.
    # This can also be used as inner loop of fetching without timeout, with
    # the sender status as the loop condition.
    #
    # This raises a `RuntimeError` if any worker died unexpectedly. This error
    # can come from either the SIGCHLD handler in `_utils/signal_handling.py`
    # (only for non-Windows platforms), or the manual check below on errors
    # and timeouts.
    #
    # Returns a 2-tuple:
    #   (bool: whether successfully get data, any: data if successful else None)
    try:
        data = self._data_queue.get(timeout=timeout)
        return (True, data)
    except Exception as e:
        # At timeout and error, we manually check whether any worker has
        # failed. Note that this is the only mechanism for Windows to detect
        # worker failures.
        failed_workers = []
        for worker_id, w in enumerate(self._workers):
            if self._workers_status[worker_id] and not w.is_alive():
                failed_workers.append(w)
                self._mark_worker_as_unavailable(worker_id)
        if len(failed_workers) > 0:
            pids_str = ', '.join(str(w.pid) for w in failed_workers)
          raise RuntimeError(f'DataLoader worker (pid(s) {pids_str}) exited unexpectedly') from e

E RuntimeError: DataLoader worker (pid(s) 3972440) exited unexpectedly

../.cache/pypoetry/virtualenvs/pixeltable-9-yc5FAE-py3.10/lib/python3.10/site-packages/torch/utils/data/dataloader.py:1145: RuntimeError

@orm011
Copy link
Collaborator

orm011 commented Jan 23, 2024

Which commit is this on?

@orm011
Copy link
Collaborator

orm011 commented Jan 23, 2024

Is it the same test you see warnings on?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

2 participants