Skip to content

Commit 51fea77

Browse files
awaelchli and krshrimali
authored and committed
Fix TensorBoardLogger creating redundant experiment when finalizing (#14762)
Co-authored-by: Kushashwa Ravi Shrimali <[email protected]>
1 parent 764aa1d commit 51fea77

File tree

4 files changed

+30
-9
lines changed

4 files changed

+30
-9
lines changed

src/pytorch_lightning/CHANGELOG.md

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,6 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
77
## [unReleased] - 2022-MM-DD
88

99

10-
- Added an option to configure the signal SLURM sends when a job is preempted or requeued ([#14610](https://github.com/Lightning-AI/lightning/issues/14610))
11-
12-
1310
### Added
1411

1512

@@ -40,6 +37,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
4037
- Added `WandbLogger.download_artifact` and `WandbLogger.use_artifact` for managing artifacts with Weights and Biases ([#14551](https://github.com/Lightning-AI/lightning/issues/14551))
4138

4239

40+
- Added an option to configure the signal SLURM sends when a job is preempted or requeued ([#14610](https://github.com/Lightning-AI/lightning/issues/14610))
41+
42+
4343
### Changed
4444

4545
- The `Trainer.{fit,validate,test,predict,tune}` methods now raise a useful error message if the input is not a `LightningModule` ([#13892](https://github.com/Lightning-AI/lightning/pull/13892))
@@ -186,6 +186,10 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
186186
- Fixed torchscript error with ensembles of LightningModules ([#14657](https://github.com/Lightning-AI/lightning/pull/14657), [#14724](https://github.com/Lightning-AI/lightning/pull/14724))
187187

188188

189+
- Fixed an issue with `TensorBoardLogger.finalize` creating a new experiment when none was created during the Trainer's execution ([#14762](https://github.com/Lightning-AI/lightning/pull/14762))
190+
191+
192+
189193
## [1.7.6] - 2022-09-13
190194

191195
### Changed

src/pytorch_lightning/loggers/tensorboard.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -268,8 +268,9 @@ def save(self) -> None:
268268

269269
@rank_zero_only
270270
def finalize(self, status: str) -> None:
271-
self.experiment.flush()
272-
self.experiment.close()
271+
if self._experiment is not None:
272+
self.experiment.flush()
273+
self.experiment.close()
273274
self.save()
274275

275276
@property

tests/tests_pytorch/checkpointing/test_model_checkpoint.py

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -869,18 +869,23 @@ def validation_step(self, batch, batch_idx):
869869
"limit_test_batches": 2,
870870
"enable_progress_bar": False,
871871
"enable_model_summary": False,
872+
"log_every_n_steps": 1,
873+
"default_root_dir": tmpdir,
872874
}
873875
trainer = Trainer(**trainer_kwargs, callbacks=[checkpoint_callback])
874876
trainer.fit(model)
875-
assert os.listdir(tmpdir) == ["epoch=00.ckpt"]
877+
assert set(os.listdir(tmpdir)) == {"epoch=00.ckpt", "lightning_logs"}
876878

877879
for idx in range(4):
878880
# load from checkpoint
879-
trainer = pl.Trainer(**trainer_kwargs, default_root_dir=tmpdir)
881+
trainer = Trainer(**trainer_kwargs)
880882
trainer.fit(model, ckpt_path=checkpoint_callback.best_model_path)
881883
trainer.test(ckpt_path=checkpoint_callback.best_model_path, verbose=False)
884+
882885
assert set(os.listdir(tmpdir)) == {"epoch=00.ckpt", "lightning_logs"}
883-
assert set(os.listdir(tmpdir / "lightning_logs")) == {f"version_{i}" for i in range(4)}
886+
887+
# no new versions created after the initial fit, because the ones that resume from ckpt do not log anything
888+
assert set(os.listdir(tmpdir / "lightning_logs")) == {"version_0"}
884889

885890

886891
def test_checkpoint_repeated_strategy_extended(tmpdir):
@@ -891,6 +896,7 @@ class ExtendedBoringModel(BoringModel):
891896
def validation_step(self, batch, batch_idx):
892897
output = self.layer(batch)
893898
loss = self.loss(batch, output)
899+
self.log("val_loss", loss)
894900
return {"val_loss": loss}
895901

896902
def validation_epoch_end(self, *_):
@@ -930,7 +936,7 @@ def assert_checkpoint_log_dir(idx):
930936
limit_test_batches=4,
931937
callbacks=[checkpoint_cb],
932938
)
933-
trainer = pl.Trainer(**trainer_config)
939+
trainer = Trainer(**trainer_config)
934940
assert_trainer_init(trainer)
935941

936942
model = ExtendedBoringModel()

tests/tests_pytorch/loggers/test_tensorboard.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -275,7 +275,17 @@ def training_step(self, *args):
275275
def test_tensorboard_finalize(summary_writer, tmpdir):
276276
"""Test that the SummaryWriter closes in finalize."""
277277
logger = TensorBoardLogger(save_dir=tmpdir)
278+
assert logger._experiment is None
278279
logger.finalize("any")
280+
281+
# no log calls, no experiment created -> nothing to flush
282+
summary_writer.assert_not_called()
283+
284+
logger = TensorBoardLogger(save_dir=tmpdir)
285+
logger.log_metrics({"flush_me": 11.1}) # trigger creation of an experiment
286+
logger.finalize("any")
287+
288+
# finalize flushes to experiment directory
279289
summary_writer().flush.assert_called()
280290
summary_writer().close.assert_called()
281291

0 commit comments

Comments (0)