Bhaskar SR
09/03/2025, 3:56 AMCREATE TEMP TABLE tr_del_ids ON COMMIT DROP AS
SELECT id
FROM task_run
WHERE created < now() - interval '3 days'
AND state_type IN ('COMPLETED','FAILED','CANCELLED','CRASHED');
DELETE FROM task_run_state s
USING tr_del_ids d
WHERE s.task_run_id = d.id;
DELETE FROM task_run_tag t
USING tr_del_ids d
WHERE t.task_run_id = d.id;
DELETE FROM task_run_input ti
USING tr_del_ids d
WHERE ti.task_run_id = d.id OR ti.parent_task_run_id = d.id;
DELETE FROM artifact a
USING tr_del_ids d
WHERE a.task_run_id = d.id;
DELETE FROM task_run tr
USING tr_del_ids d
WHERE tr.id = d.id;
then commit the data.
This I am going to schedule
is this will create any problem?
please suggest
Marvin
09/03/2025, 3:56 AMMarvin
09/03/2025, 4:00 AMinterval '3 days'
, which would keep roughly 3 days of data.
Key suggestions:
1) Include logs in the purge
- Prefect stores run logs in a logs table with a task_run_id
column. If you don’t delete them, you’ll leave a lot of orphaned log rows.
- Table name is typically log
(singular) in the DB, though some environments use logs
. Verify your schema and include it in the delete.
2) Order of deletes and FKs
- There is a foreign key from task_run.state_id
to a row in task_run_state
. If you delete states first, that can fail unless the FK is defined as ON DELETE SET NULL or the constraint is deferrable.
- Safer patterns:
- Option A: Null out task_run.state_id
for the target runs before deleting task_run_state
.
- Option B: Delete parent task_run
rows and rely on ON DELETE CASCADE from children (if your schema has CASCADE set on all child tables). Since that’s not guaranteed for every table in every install, your explicit child deletes are reasonable—just be careful about the state_id
FK.
- Option C: Wrap the transaction and use deferrable constraints: SET CONSTRAINTS ALL DEFERRED;
then perform your deletes.
3) Batch deletes to reduce lock contention and bloat
- Deleting “5 Lacs” (500k) rows daily in one transaction can lock tables and bloat indexes. Prefer batched deletes (e.g., LIMIT 10k per loop) with commits between batches and run during off-hours. Run VACUUM/ANALYZE afterward.
4) Indexes
- Ensure you have indexes on:
- task_run(created)
, task_run(state_type)
, and ideally a composite on (state_type, created)
- child tables on task_run_id
(and parent_task_run_id
where applicable)
- This will keep the selection and deletes efficient.
5) Double-check your retention window
- If you truly want to keep only the last 2 days, use created < now() - interval '2 days'
, not '3 days'
.
A safer SQL template you can adapt
- This version:
- Uses a 2-day cutoff
- Drops state_id
first to avoid FK issues
- Deletes logs as well
- Keeps your other child deletes
- Please verify table names in your schema for the logs table (log
vs logs
) before running.
BEGIN;
-- choose the cutoff you want to delete (older than 2 days)
CREATE TEMP TABLE tr_del_ids ON COMMIT DROP AS
SELECT id
FROM task_run
WHERE created < now() - interval '2 days'
AND state_type IN ('COMPLETED','FAILED','CANCELLED','CRASHED');
-- break the FK from task_run.state_id -> task_run_state.id to avoid FK violations
UPDATE task_run tr
SET state_id = NULL
FROM tr_del_ids d
WHERE tr.id = d.id;
-- delete logs first to reduce volume (verify table name: log vs logs)
DELETE FROM log l
USING tr_del_ids d
WHERE l.task_run_id = d.id;
-- child tables
DELETE FROM task_run_input ti
USING tr_del_ids d
WHERE ti.task_run_id = d.id OR ti.parent_task_run_id = d.id;
DELETE FROM task_run_tag t
USING tr_del_ids d
WHERE t.task_run_id = d.id;
DELETE FROM artifact a
USING tr_del_ids d
WHERE a.task_run_id = d.id;
DELETE FROM task_run_state s
USING tr_del_ids d
WHERE s.task_run_id = d.id;
-- finally delete the task_run rows
DELETE FROM task_run tr
USING tr_del_ids d
WHERE tr.id = d.id;
COMMIT;
If you prefer to rely on cascades:
- If your FKs are defined with ON DELETE CASCADE from children (e.g., task_run_state.task_run_id
-> task_run.id
), you can just delete from task_run
and let the DB cascade to the child tables. However, you’d still need to handle the log
table separately if it does not have an FK or cascade rule.
Scheduling notes:
- Run in off-hours, in batches if possible.Marvin
09/03/2025, 4:00 AMdelete_task_run
per ID:
- prefect.server.models.task_runs
- Logs and artifacts APIs: logs, artifacts
- There is no built-in CLI retention/prune command in Prefect 3.x today; most users handle retention with DB jobs or custom scripts.
If you can share:
- Prefect version
- Whether your logs table is log
or logs
- Whether you want to keep exactly 2 days
…I can tweak the SQL and a batched version for you.Bhaskar SR
09/03/2025, 4:32 AMMarvin
09/03/2025, 4:34 AMMarvin
09/03/2025, 4:35 AMBEGIN;
-- IDs to delete: flow runs older than 2 days in terminal states
CREATE TEMP TABLE fr_del_ids ON COMMIT DROP AS
SELECT id
FROM flow_run
WHERE created < now() - interval '2 days'
AND state_type IN ('COMPLETED','FAILED','CANCELLED','CRASHED');
-- IDs to delete: task runs older than 2 days in terminal states
-- plus any task runs that belong to an old flow run we’re deleting
CREATE TEMP TABLE tr_del_ids ON COMMIT DROP AS
SELECT id
FROM task_run
WHERE (created < now() - interval '2 days'
AND state_type IN ('COMPLETED','FAILED','CANCELLED','CRASHED'))
OR flow_run_id IN (SELECT id FROM fr_del_ids);
-- Break FKs from latest-state pointers before deleting state history
UPDATE task_run tr
SET state_id = NULL
FROM tr_del_ids d
WHERE tr.id = d.id;
UPDATE flow_run fr
SET state_id = NULL
FROM fr_del_ids d
WHERE fr.id = d.id;
-- Logs: verify your table name is `log` (singular) or `logs` (plural)
-- Delete task-level logs
DELETE FROM log l
USING tr_del_ids d
WHERE l.task_run_id = d.id;
-- Delete flow-level logs (covers logs that aren’t tied to a task run)
DELETE FROM log l
USING fr_del_ids d
WHERE l.flow_run_id = d.id;
-- Task-run children
DELETE FROM task_run_input ti
USING tr_del_ids d
WHERE ti.task_run_id = d.id OR ti.parent_task_run_id = d.id;
DELETE FROM task_run_tag t
USING tr_del_ids d
WHERE t.task_run_id = d.id;
DELETE FROM artifact a
USING tr_del_ids d
WHERE a.task_run_id = d.id;
DELETE FROM task_run_state s
USING tr_del_ids d
WHERE s.task_run_id = d.id;
-- Delete task runs
DELETE FROM task_run tr
USING tr_del_ids d
WHERE tr.id = d.id;
-- Flow-run children
-- flow_run_input exists in Prefect’s schema; delete if present in your DB
DELETE FROM flow_run_input fi
USING fr_del_ids d
WHERE fi.flow_run_id = d.id;
-- Some artifacts may be attached at the flow level; delete if column exists
DELETE FROM artifact a
USING fr_del_ids d
WHERE a.flow_run_id = d.id;
DELETE FROM flow_run_state s
USING fr_del_ids d
WHERE s.flow_run_id = d.id;
-- Delete flow runs
DELETE FROM flow_run fr
USING fr_del_ids d
WHERE fr.id = d.id;
COMMIT;
Operational tips
- Batch deletes to reduce locks/bloat: loop in chunks (e.g., 10k–50k rows at a time) with commits between batches.
- Run during off-hours, monitor for long locks, and VACUUM/ANALYZE the affected tables afterward.
- Ensure indexes exist on:
- task_run(created), task_run(state_type), and task_run(flow_run_id)
- flow_run(created), flow_run(state_type)
- child tables on task_run_id/flow_run_id
- Preview with SELECT COUNT(*) before running deletes, and test in a staging environment if possible.Marvin
09/03/2025, 4:35 AMlog
or logs
)
- Whether your artifacts table has both task_run_id
and flow_run_id
…I can tailor the script exactly and provide a batched version you can schedule.Bhaskar SR
09/03/2025, 4:42 AMMarvin
09/03/2025, 4:42 AMMarvin
09/03/2025, 4:43 AMlog
table that has both flow_run_id
and task_run_id
, here’s a final 2‑day retention script that:
- Deletes old task runs (and their children)
- Deletes old flow runs (and their children)
- Handles logs at both task and flow levels
- Avoids FK issues by nulling the state_id
pointers before deleting state history
One-shot (full set) version
- Test with SELECT counts first, then run the delete in a maintenance window.
- If you expect large volumes, use the batched version below instead.
-- Preview counts (optional)
SELECT COUNT(*) AS old_task_runs
FROM task_run
WHERE created < now() - interval '2 days'
AND state_type IN ('COMPLETED','FAILED','CANCELLED','CRASHED');
SELECT COUNT(*) AS old_flow_runs
FROM flow_run
WHERE created < now() - interval '2 days'
AND state_type IN ('COMPLETED','FAILED','CANCELLED','CRASHED');
BEGIN;
-- Flow runs to delete (older than 2 days, terminal)
CREATE TEMP TABLE fr_del_ids ON COMMIT DROP AS
SELECT id
FROM flow_run
WHERE created < now() - interval '2 days'
AND state_type IN ('COMPLETED','FAILED','CANCELLED','CRASHED');
-- Task runs to delete:
-- (a) older than 2 days in terminal state
-- (b) or belong to flow runs being deleted
CREATE TEMP TABLE tr_del_ids ON COMMIT DROP AS
SELECT id
FROM task_run
WHERE (created < now() - interval '2 days'
AND state_type IN ('COMPLETED','FAILED','CANCELLED','CRASHED'))
OR flow_run_id IN (SELECT id FROM fr_del_ids);
-- Break latest-state FKs to avoid FK violations when removing state history
UPDATE task_run tr
SET state_id = NULL
FROM tr_del_ids d
WHERE tr.id = d.id;
UPDATE flow_run fr
SET state_id = NULL
FROM fr_del_ids d
WHERE fr.id = d.id;
-- Logs (delete by task_run_id and flow_run_id)
DELETE FROM log l
USING tr_del_ids d
WHERE l.task_run_id = d.id;
DELETE FROM log l
USING fr_del_ids d
WHERE l.flow_run_id = d.id;
-- Task-run children
DELETE FROM task_run_input ti
USING tr_del_ids d
WHERE ti.task_run_id = d.id OR ti.parent_task_run_id = d.id;
DELETE FROM task_run_tag t
USING tr_del_ids d
WHERE t.task_run_id = d.id;
DELETE FROM artifact a
USING tr_del_ids d
WHERE a.task_run_id = d.id;
DELETE FROM task_run_state s
USING tr_del_ids d
WHERE s.task_run_id = d.id;
-- Delete task runs
DELETE FROM task_run tr
USING tr_del_ids d
WHERE tr.id = d.id;
-- Flow-run children
DELETE FROM flow_run_input fi
USING fr_del_ids d
WHERE fi.flow_run_id = d.id;
DELETE FROM artifact a
USING fr_del_ids d
WHERE a.flow_run_id = d.id;
DELETE FROM flow_run_state s
USING fr_del_ids d
WHERE s.flow_run_id = d.id;
-- Delete flow runs
DELETE FROM flow_run fr
USING fr_del_ids d
WHERE fr.id = d.id;
COMMIT;
Batched “unit” version (safer for big tables)
- Run this block repeatedly (e.g., every few minutes) until both counts reach 0.
- Adjust LIMITs to your environment (10k–50k is common).
```
-- Preview counts
SELECT COUNT(*) AS old_task_runs
FROM task_run
WHERE created < now() - interval '2 days'
AND state_type IN ('COMPLETED','FAILED','CANCELLED','CRASHED');
SELECT COUNT(*) AS old_flow_runs
FROM flow_run
WHERE created < now() - interval '2 days'
AND state_type IN ('COMPLETED','FAILED','CANCELLED','CRASHED');
BEGIN;
-- Batch of flow runs to delete (LIMIT controls batch size)
CREATE TEMP TABLE fr_del_ids ON COMMIT DROP AS
SELECT id
FROM flow_run
WHERE created < now() - interval '2 days'
AND state_type IN ('COMPLETED','FAILED','CANCELLED','CRASHED')
LIMIT 50000;
-- Batch of task runs to delete
CREATE TEMP TABLE tr_del_ids ON COMMIT DROP AS
SELECT id
FROM task_run
WHERE (created < now() - interval '2 days'
AND state_type IN ('COMPLETED','FAILED','CANCELLED','CRASHED'))
OR flow_run_id IN (SELECT id FROM fr_del_ids)
LIMIT 50000;
-- If no rows selected, nothing to do in this batch
-- You can check:
-- SELECT count() FROM tr_del_ids; SELECT count() FROM fr_del_ids;
UPDATE task_run tr
SET state_id = NULL
FROM tr_del_ids dMarvin
09/03/2025, 4:43 AM