@@ -73,6 +73,7 @@ class _TableConfig:
         in the table. to ignore certain records even if they are the latest in the table, you can
         supply additional filters here (e.g. externally triggered dag runs)
     :param keep_last_group_by: if keeping the last record, can keep the last record for each group
+    :param dependent_tables: list of tables which have an FK relationship with this table
     """

     table_name: str
@@ -81,6 +82,10 @@ class _TableConfig:
     keep_last: bool = False
     keep_last_filters: Any | None = None
     keep_last_group_by: Any | None = None
+    # We explicitly list these tables instead of detecting foreign keys automatically,
+    # because the relationships are unlikely to change and the number of tables is small.
+    # Relying on automation here would increase complexity and reduce maintainability.
+    dependent_tables: list[str] | None = None

     def __post_init__(self):
         self.recency_column = column(self.recency_column_name)
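For context on the comment above: the automatic alternative it rules out would look roughly like the sketch below, scanning every table's foreign keys with SQLAlchemy's runtime inspection API and inverting the edges. This is illustrative only; the `engine` argument and the `detect_dependent_tables` helper are assumptions, not part of this change.

```python
from collections import defaultdict

from sqlalchemy import inspect


def detect_dependent_tables(engine) -> dict[str, list[str]]:
    """Illustrative sketch: map each table to the tables that reference it via FKs."""
    inspector = inspect(engine)
    dependents: dict[str, list[str]] = defaultdict(list)
    for table in inspector.get_table_names():
        for fk in inspector.get_foreign_keys(table):
            # `table` holds an FK pointing at fk["referred_table"],
            # so it would be a dependent of that table.
            dependents[fk["referred_table"]].append(table)
    return dict(dependents)
```

Maintaining the list by hand, as the dataclass does, trades this reflection pass for a handful of explicit declarations, which is the tradeoff the comment describes.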
@@ -104,29 +109,46 @@ def readable_config(self):

 config_list: list[_TableConfig] = [
     _TableConfig(table_name="job", recency_column_name="latest_heartbeat"),
-    _TableConfig(table_name="dag", recency_column_name="last_parsed_time"),
+    _TableConfig(
+        table_name="dag",
+        recency_column_name="last_parsed_time",
+        dependent_tables=["dag_version", "deadline"],
+    ),
     _TableConfig(
         table_name="dag_run",
         recency_column_name="start_date",
         extra_columns=["dag_id", "run_type"],
         keep_last=True,
         keep_last_filters=[column("run_type") != DagRunType.MANUAL],
         keep_last_group_by=["dag_id"],
+        dependent_tables=["task_instance", "deadline"],
     ),
     _TableConfig(table_name="asset_event", recency_column_name="timestamp"),
     _TableConfig(table_name="import_error", recency_column_name="timestamp"),
     _TableConfig(table_name="log", recency_column_name="dttm"),
     _TableConfig(table_name="sla_miss", recency_column_name="timestamp"),
-    _TableConfig(table_name="task_instance", recency_column_name="start_date"),
+    _TableConfig(
+        table_name="task_instance",
+        recency_column_name="start_date",
+        dependent_tables=["task_instance_history", "xcom"],
+    ),
     _TableConfig(table_name="task_instance_history", recency_column_name="start_date"),
     _TableConfig(table_name="task_reschedule", recency_column_name="start_date"),
     _TableConfig(table_name="xcom", recency_column_name="timestamp"),
     _TableConfig(table_name="_xcom_archive", recency_column_name="timestamp"),
     _TableConfig(table_name="callback_request", recency_column_name="created_at"),
     _TableConfig(table_name="celery_taskmeta", recency_column_name="date_done"),
     _TableConfig(table_name="celery_tasksetmeta", recency_column_name="date_done"),
-    _TableConfig(table_name="trigger", recency_column_name="created_date"),
-    _TableConfig(table_name="dag_version", recency_column_name="created_at"),
+    _TableConfig(
+        table_name="trigger",
+        recency_column_name="created_date",
+        dependent_tables=["task_instance"],
+    ),
+    _TableConfig(
+        table_name="dag_version",
+        recency_column_name="created_at",
+        dependent_tables=["task_instance", "dag_run"],
+    ),
     _TableConfig(table_name="deadline", recency_column_name="deadline_time"),
 ]

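Read together, these entries define a small dependency graph: dag fans out to dag_version and deadline; dag_run to task_instance and deadline; task_instance to task_instance_history and xcom; trigger to task_instance; and dag_version to task_instance and dag_run. Requesting any one of these tables for cleanup therefore also pulls in, transitively, the tables that reference it; the traversal that does this is in the `_effective_table_names` hunk below, with a standalone sketch after it.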
@@ -234,6 +256,7 @@ def _do_delete(
             logger.debug("delete statement:\n%s", delete.compile())
             session.execute(delete)
             session.commit()
+
     except BaseException as e:
         raise e
     finally:
@@ -414,17 +437,37 @@ def _suppress_with_logging(table: str, session: Session):
             session.rollback()


-def _effective_table_names(*, table_names: list[str] | None) -> tuple[set[str], dict[str, _TableConfig]]:
+def _effective_table_names(*, table_names: list[str] | None) -> tuple[list[str], dict[str, _TableConfig]]:
     desired_table_names = set(table_names or config_dict)
-    effective_config_dict = {k: v for k, v in config_dict.items() if k in desired_table_names}
-    effective_table_names = set(effective_config_dict)
-    if desired_table_names != effective_table_names:
-        outliers = desired_table_names - effective_table_names
+
+    outliers = desired_table_names - set(config_dict.keys())
+    if outliers:
         logger.warning(
-            "The following table(s) are not valid choices and will be skipped: %s", sorted(outliers)
+            "The following table(s) are not valid choices and will be skipped: %s",
+            sorted(outliers),
         )
-    if not effective_table_names:
+        desired_table_names = desired_table_names - outliers
+
+    visited: set[str] = set()
+    effective_table_names: list[str] = []
+
+    def collect_deps(table: str):
+        if table in visited:
+            return
+        visited.add(table)
+        config = config_dict[table]
+        for dep in config.dependent_tables or []:
+            collect_deps(dep)
+        effective_table_names.append(table)
+
+    for table_name in desired_table_names:
+        collect_deps(table_name)
+
+    effective_config_dict = {n: config_dict[n] for n in effective_table_names}
+
+    if not effective_config_dict:
         raise SystemExit("No tables selected for db cleanup. Please choose valid table names.")
+
     return effective_table_names, effective_config_dict


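To make the new behavior concrete, here is a minimal, self-contained sketch (not the module's code) that mirrors `collect_deps` over just the dependency edges declared in `config_list`, and shows the order produced when only "dag" is requested. The `deps` dict and `expand` helper are illustrative assumptions.

```python
# Standalone sketch of the expansion, assuming only the dependency edges
# declared in config_list above.
deps = {
    "dag": ["dag_version", "deadline"],
    "dag_run": ["task_instance", "deadline"],
    "task_instance": ["task_instance_history", "xcom"],
    "trigger": ["task_instance"],
    "dag_version": ["task_instance", "dag_run"],
}


def expand(selected: list[str]) -> list[str]:
    visited: set[str] = set()
    ordered: list[str] = []

    def collect(table: str) -> None:
        if table in visited:
            return
        visited.add(table)
        for dep in deps.get(table, []):
            collect(dep)
        # A table is appended only after every table that references it.
        ordered.append(table)

    for name in selected:
        collect(name)
    return ordered


print(expand(["dag"]))
# ['task_instance_history', 'xcom', 'task_instance', 'deadline',
#  'dag_run', 'dag_version', 'dag']
```

Since `effective_config_dict` is built from this ordered list and `run_cleanup` iterates it in insertion order, tables holding foreign keys to a selected table appear to be processed before it.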
@@ -480,6 +523,8 @@ def run_cleanup(
     :param session: Session representing connection to the metadata database.
     """
     clean_before_timestamp = timezone.coerce_datetime(clean_before_timestamp)
+
+    # Get all tables to clean (root + dependents)
     effective_table_names, effective_config_dict = _effective_table_names(table_names=table_names)
     if dry_run:
         print("Performing dry run for db cleanup.")
@@ -491,6 +536,7 @@ def run_cleanup(
     if not dry_run and confirm:
         _confirm_delete(date=clean_before_timestamp, tables=sorted(effective_table_names))
     existing_tables = reflect_tables(tables=None, session=session).tables
+
     for table_name, table_config in effective_config_dict.items():
         if table_name in existing_tables:
             with _suppress_with_logging(table_name, session):
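As a hedged usage note: the parameter names below are taken from the signature visible in this diff, while the import path is an assumption about where the module lives. Requesting a single table now also cleans its dependents.

```python
from datetime import datetime, timezone

# Assumed import path for the module shown in this diff.
from airflow.utils.db_cleanup import run_cleanup

# Asking for "dag_run" alone now also covers task_instance, deadline,
# task_instance_history and xcom, via the dependent_tables expansion.
run_cleanup(
    clean_before_timestamp=datetime(2024, 1, 1, tzinfo=timezone.utc),
    table_names=["dag_run"],
    dry_run=True,   # report what would be purged without deleting anything
    confirm=False,  # skip the interactive confirmation prompt
)
```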