Hi,
I'm running Prefect on a RunPod instance. When I'm running a task that takes a bit of time (say, 20+ minutes), sometimes I get a worker pool crash:
+ Exception Group Traceback (most recent call last):
| File "/usr/local/lib/python3.11/dist-packages/prefect/cli/_utilities.py", line 44, in wrapper
| return fn(*args, **kwargs)
| ^^^^^^^^^^^^^^^^^^^
| File "/usr/local/lib/python3.11/dist-packages/prefect/cli/_types.py", line 156, in sync_fn
| return asyncio.run(async_fn(*args, **kwargs))
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
| File "/usr/lib/python3.11/asyncio/runners.py", line 190, in run
| return runner.run(main)
| ^^^^^^^^^^^^^^^^
| File "/usr/lib/python3.11/asyncio/runners.py", line 118, in run
| return self._loop.run_until_complete(task)
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
| File "/usr/lib/python3.11/asyncio/base_events.py", line 654, in run_until_complete
| return future.result()
| ^^^^^^^^^^^^^^^
| File "/usr/local/lib/python3.11/dist-packages/prefect/cli/worker.py", line 168, in start
| await worker.start(
| File "/usr/local/lib/python3.11/dist-packages/prefect/workers/process.py", line 158, in start
| async with self as worker:
| File "/usr/local/lib/python3.11/dist-packages/prefect/workers/process.py", line 320, in __aexit__
| await super().__aexit__(*exc_info)
| File "/usr/local/lib/python3.11/dist-packages/prefect/workers/base.py", line 1600, in __aexit__
| raise exceptions[0] from None
| File "/usr/lib/python3.11/contextlib.py", line 728, in __aexit__
| cb_suppress = await cb(*exc_details)
| ^^^^^^^^^^^^^^^^^^^^^^
| File "/usr/local/lib/python3.11/dist-packages/prefect/runner/runner.py", line 1610, in __aexit__
| await self._exit_stack.__aexit__(*exc_info)
| File "/usr/lib/python3.11/contextlib.py", line 745, in __aexit__
| raise exc_details[1]
| File "/usr/lib/python3.11/contextlib.py", line 728, in __aexit__
| cb_suppress = await cb(*exc_details)
| ^^^^^^^^^^^^^^^^^^^^^^
| File "/usr/local/lib/python3.11/dist-packages/anyio/_backends/_asyncio.py", line 736, in __aexit__
| raise BaseExceptionGroup(
| ExceptionGroup: unhandled errors in a TaskGroup (1 sub-exception)
+-+---------------- 1 ----------------
| Exception Group Traceback (most recent call last):
| File "/usr/local/lib/python3.11/dist-packages/prefect/workers/process.py", line 162, in start
| async with anyio.create_task_group() as loops_task_group:
| File "/usr/local/lib/python3.11/dist-packages/anyio/_backends/_asyncio.py", line 736, in __aexit__
| raise BaseExceptionGroup(
| ExceptionGroup: unhandled errors in a TaskGroup (1 sub-exception)
+-+---------------- 1 ----------------
| Traceback (most recent call last):
| File "/usr/local/lib/python3.11/dist-packages/prefect/utilities/services.py", line 64, in critical_service_loop
| await workload()
| File "/usr/local/lib/python3.11/dist-packages/prefect/workers/base.py", line 1107, in sync_with_backend
| await self._update_local_work_pool_info()
| File "/usr/local/lib/python3.11/dist-packages/prefect/workers/base.py", line 1004, in _update_local_work_pool_info
| work_pool = await self._client.create_work_pool(work_pool=wp)
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
| File "/usr/local/lib/python3.11/dist-packages/prefect/client/orchestration/_work_pools/client.py", line 478, in create_work_pool
| response = await self.request(
| ^^^^^^^^^^^^^^^^^^^
| File "/usr/local/lib/python3.11/dist-packages/prefect/client/orchestration/base.py", line 53, in request
| return await self._client.send(request)
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
| File "/usr/local/lib/python3.11/dist-packages/prefect/client/base.py", line 379, in send
| response.raise_for_status()
| File "/usr/local/lib/python3.11/dist-packages/prefect/client/base.py", line 163, in raise_for_status
| raise PrefectHTTPStatusError.from_httpx_error(exc) from exc.__cause__
| prefect.exceptions.PrefectHTTPStatusError: Client error '404 Not Found' for url 'https://rdzs18c4k9ub52-4200.proxy.runpod.net/api/work_pools/'
| For more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/404
What can be causing this?