【问题标题】:How to dump confusion matrix using TensorBoard logger in pytorch-lightning?如何在 pytorch-lightning 中使用 TensorBoard 记录器转储混淆矩阵?
【发布时间】:2021-04-06 11:23:16
【问题描述】:

官方文档(The official doc)仅给出了如下示例:

>>> from pytorch_lightning.metrics import ConfusionMatrix
>>> target = torch.tensor([1, 1, 0, 0])
>>> preds = torch.tensor([0, 1, 0, 0])
>>> confmat = ConfusionMatrix(num_classes=2)
>>> confmat(preds, target)

这没有说明如何在框架中使用指标。

我的尝试(方法不完整,只显示相关部分):

def __init__(...):
    # Signature elided by the author. The confusion-matrix metric is sized by
    # the n_clusters value read from self._config.
    self.val_confusion = pl.metrics.classification.ConfusionMatrix(num_classes=self._config.n_clusters)

def validation_step(self, batch, batch_index):
    # NOTE(review): the code deriving orig_batch / label_batch from `batch`
    # is elided here ("...").
    ...
    log_probs = self.forward(orig_batch)
    loss = self._criterion(log_probs, label_batch)
   
    # Feed this batch's predictions and labels to the metric via update().
    self.val_confusion.update(log_probs, label_batch)
    # Logs the metric object itself on every step. Presumably this is the
    # entry whose _forward_cache is None in the traceback shown below
    # (step_result.get_batch_log_metrics) - confirm.
    self.log('validation_confusion_step', self.val_confusion, on_step=True, on_epoch=False)

def validation_step_end(self, outputs):
    # Pass the step outputs through unchanged.
    return outputs

def validation_epoch_end(self, outs):
    # Logs the result of the metric's compute() under an epoch-level key;
    # `outs` is not used.
    self.log('validation_confusion_epoch', self.val_confusion.compute())

在第 0 个 epoch 之后,这给出了

    Traceback (most recent call last):
      File "C:\code\EPMD\Kodex\Templates\Testing\venv\lib\site-packages\pytorch_lightning\trainer\trainer.py", line 521, in train
        self.train_loop.run_training_epoch()
      File "C:\code\EPMD\Kodex\Templates\Testing\venv\lib\site-packages\pytorch_lightning\trainer\training_loop.py", line 588, in run_training_epoch
        self.trainer.run_evaluation(test_mode=False)
      File "C:\code\EPMD\Kodex\Templates\Testing\venv\lib\site-packages\pytorch_lightning\trainer\trainer.py", line 613, in run_evaluation
        self.evaluation_loop.log_evaluation_step_metrics(output, batch_idx)
      File "C:\code\EPMD\Kodex\Templates\Testing\venv\lib\site-packages\pytorch_lightning\trainer\evaluation_loop.py", line 346, in log_evaluation_step_metrics
        self.__log_result_step_metrics(step_log_metrics, step_pbar_metrics, batch_idx)
      File "C:\code\EPMD\Kodex\Templates\Testing\venv\lib\site-packages\pytorch_lightning\trainer\evaluation_loop.py", line 350, in __log_result_step_metrics
        cached_batch_pbar_metrics, cached_batch_log_metrics = cached_results.update_logger_connector()
      File "C:\code\EPMD\Kodex\Templates\Testing\venv\lib\site-packages\pytorch_lightning\trainer\connectors\logger_connector\epoch_result_store.py", line 378, in update_logger_connector
        batch_log_metrics = self.get_latest_batch_log_metrics()
      File "C:\code\EPMD\Kodex\Templates\Testing\venv\lib\site-packages\pytorch_lightning\trainer\connectors\logger_connector\epoch_result_store.py", line 418, in get_latest_batch_log_metrics
        batch_log_metrics = self.run_batch_from_func_name("get_batch_log_metrics")
      File "C:\code\EPMD\Kodex\Templates\Testing\venv\lib\site-packages\pytorch_lightning\trainer\connectors\logger_connector\epoch_result_store.py", line 414, in run_batch_from_func_name
        results = [func(include_forked_originals=False) for func in results]
      File "C:\code\EPMD\Kodex\Templates\Testing\venv\lib\site-packages\pytorch_lightning\trainer\connectors\logger_connector\epoch_result_store.py", line 414, in <listcomp>
        results = [func(include_forked_originals=False) for func in results]
      File "C:\code\EPMD\Kodex\Templates\Testing\venv\lib\site-packages\pytorch_lightning\trainer\connectors\logger_connector\epoch_result_store.py", line 122, in get_batch_log_metrics
        return self.run_latest_batch_metrics_with_func_name("get_batch_log_metrics",
*args, **kwargs)
      File "C:\code\EPMD\Kodex\Templates\Testing\venv\lib\site-packages\pytorch_lightning\trainer\connectors\logger_connector\epoch_result_store.py", line 115, in run_latest_batch_metrics_with_func_name
        for dl_idx in range(self.num_dataloaders)
      File "C:\code\EPMD\Kodex\Templates\Testing\venv\lib\site-packages\pytorch_lightning\trainer\connectors\logger_connector\epoch_result_store.py", line 115, in <listcomp>
        for dl_idx in range(self.num_dataloaders)
      File "C:\code\EPMD\Kodex\Templates\Testing\venv\lib\site-packages\pytorch_lightning\trainer\connectors\logger_connector\epoch_result_store.py", line 100, in get_latest_from_func_name
        results.update(func(*args, add_dataloader_idx=add_dataloader_idx, **kwargs))
      File "C:\code\EPMD\Kodex\Templates\Testing\venv\lib\site-packages\pytorch_lightning\core\step_result.py", line 298, in get_batch_log_metrics
        result[dl_key] = self[k]._forward_cache.detach()
    AttributeError: 'NoneType' object has no attribute 'detach'

                                                      

它确实在训练前通过了健全性验证检查。

失败发生在validation_step_end 的返回上。对我来说毫无意义。

对 Accuracy(准确率)指标使用完全相同的写法却可以正常工作。

如何得到正确的混淆矩阵?

【问题讨论】:

  • 请提供预期的MRE。显示中间结果与预期结果的偏差。我们应该能够将您的代码块粘贴到文件中,运行它并重现您的问题。这也让我们可以在您的上下文中测试任何建议。
  • 您提供的文档链接提供的信息比您在问题中提供的更多信息,以及更完整的示例。正如我所看到的,validation_step 中的update 假定实现与ConfusionMatrix 对象的结构不一致。由于您省略了这么多代码,我们无法判断;您让我们目视检查您未追踪的代码片段,而不是测试。
  • @Prune MRE 不可行,运行机器学习的代码至少需要一个数据集和配置。这只是一个缺少文档的问题,无论如何我的可重现性实际上是无用的,我只是想看看正确的用法。请告诉我我缺少文档的哪一部分?显然我的实现并不像预期的那样,但我也不明白预期是什么,因为我使用的与更完整的准确性示例完全相同。
  • 准确性示例在文档本身中不是 MRE,因为那样它的可读性较差...pytorch-lightning.readthedocs.io/en/stable/metrics.html

标签: python deep-learning pytorch tensorboard pytorch-lightning


【解决方案1】:

您可以使用 self.logger.experiment.add_figure(*tag*, *figure*) 将该图(figure)写入 TensorBoard。

变量self.logger.experiment 实际上是一个SummaryWriter(来自 PyTorch,而不是 Lightning)。这个类有方法add_figure (documentation)。

您可以按如下方式使用它:(MNIST 示例)

    def validation_step(self, batch, batch_idx):
        """Run the model on one validation batch.

        Returns a dict holding the NLL loss together with the raw model
        outputs ('preds') and the targets ('target').
        """
        inputs, labels = batch
        outputs = self(inputs)
        return {
            'loss': F.nll_loss(outputs, labels),
            'preds': outputs,
            'target': labels,
        }

    def validation_epoch_end(self, outputs):
        """Log the epoch's confusion matrix to TensorBoard as a heatmap figure.

        Args:
            outputs: list of the dicts returned by ``validation_step``; each
                one must provide ``'preds'`` and ``'target'`` tensors.
        """
        preds = torch.cat([tmp['preds'] for tmp in outputs])
        targets = torch.cat([tmp['target'] for tmp in outputs])
        # 10 classes: this is the MNIST example.
        confusion_matrix = pl.metrics.functional.confusion_matrix(preds, targets, num_classes=10)

        # Tensor.numpy() raises for CUDA tensors and for tensors that require
        # grad, so detach and copy to host memory first.
        cm_array = confusion_matrix.detach().cpu().numpy()
        df_cm = pd.DataFrame(cm_array, index=range(10), columns=range(10))

        # Draw on an explicit figure/axes instead of relying on pyplot's
        # implicit "current figure".
        fig_, ax = plt.subplots(figsize=(10, 7))
        sns.heatmap(df_cm, annot=True, cmap='Spectral', ax=ax)

        self.logger.experiment.add_figure("Confusion matrix", fig_, self.current_epoch)
        # Make sure pyplot does not keep one figure alive per epoch.
        plt.close(fig_)

【讨论】:

    【解决方案2】:

    这花了很多时间才找到。

    这是我可以粘贴的最小代码,但仍然可读且可重现。

    我不想把整个模型、数据集和参数都放在这里,因为这个问题的读者对它们不感兴趣,它们只是噪音。


    也就是说,这是创建每个时期的混淆矩阵并在 Tensorboard 中显示所需的代码

    例如,这是其中的一帧:


    import io
    from typing import Optional

    import matplotlib.pyplot as plt
    import numpy as np
    import pandas as pd
    import pytorch_lightning as pl
    import seaborn as sn
    import torchvision
    from PIL import Image
    from pytorch_lightning import Trainer
    from pytorch_lightning.loggers import TensorBoardLogger
    
    def __init__(self, config, trained_vae, latent_dim):
        # NOTE(review): this abridged snippet never assigns self._config and
        # shows no super().__init__() call; presumably elided - confirm.
        # Confusion-matrix metric sized by the configured number of clusters.
        self.val_confusion = pl.metrics.classification.ConfusionMatrix(num_classes=self._config.n_clusters)
        # Annotate the logger attribute as an optional TensorBoardLogger,
        # initialised to None.
        self.logger: Optional[TensorBoardLogger] = None
    
    def forward(self, x):
        # Model body elided in this snippet. validation_step passes the
        # returned value to both the criterion and the confusion-matrix metric.
        ...
        return log_probs
    
    def validation_step(self, batch, batch_index):
        """Score one validation batch and feed it to the confusion-matrix metric.

        For the "mnist" dataset the inputs are flattened to rows of 28 * 28
        values before the forward pass. Returns a dict with the loss and the
        labels of the batch.
        """
        if self._config.dataset == "mnist":
            images, label_batch = batch
            orig_batch = images.reshape(-1, 28 * 28)

        log_probs = self.forward(orig_batch)
        step_loss = self._criterion(log_probs, label_batch)
        self.val_confusion.update(log_probs, label_batch)

        return {"loss": step_loss, "labels": label_batch}
    
    def validation_step_end(self, outputs):
        # Pass the step outputs through unchanged.
        return outputs
    
    def validation_epoch_end(self, outs):
        """Render the epoch's confusion matrix as a heatmap image in TensorBoard.

        Args:
            outs: collected validation-step outputs; unused, because the
                counts are read from ``self.val_confusion``.
        """
        tb = self.logger.experiment

        # confusion matrix
        # np.int was deprecated in NumPy 1.20 and removed in 1.24; the builtin
        # int selects the same dtype.
        conf_mat = self.val_confusion.compute().detach().cpu().numpy().astype(int)
        # The metric is updated by hand rather than logged via self.log, so
        # clear its state here; otherwise counts from earlier epochs (and the
        # sanity-check batches) keep accumulating into later matrices.
        self.val_confusion.reset()

        df_cm = pd.DataFrame(
            conf_mat,
            index=np.arange(self._config.n_clusters),
            columns=np.arange(self._config.n_clusters))
        fig = plt.figure()
        try:
            sn.set(font_scale=1.2)
            sn.heatmap(df_cm, annot=True, annot_kws={"size": 16}, fmt='d')
            buf = io.BytesIO()
            fig.savefig(buf, format='jpeg')
        finally:
            # Release the figure; pyplot otherwise keeps one alive per epoch.
            plt.close(fig)

        # Decode the in-memory JPEG into an image tensor for add_image.
        buf.seek(0)
        im = Image.open(buf)
        im = torchvision.transforms.ToTensor()(im)
        tb.add_image("val_confusion_matrix", im, global_step=self.current_epoch)
    

    以及对 Trainer 的调用:

    # Logger instance handed to the Trainer below.
    logger = TensorBoardLogger(save_dir=tb_logs_folder, name='Classifier')
    trainer = Trainer(
        deterministic=True,
        max_epochs=10,
        default_root_dir=classifier_checkpoints_path,
        logger=logger,
        gpus=1,
    )
    

    【讨论】:

      猜你喜欢
      • 2021-12-08
      • 2017-05-27
      • 2020-02-23
      • 1970-01-01
      • 2021-10-12
      • 2022-08-05
      • 2022-01-23
      • 2019-10-10
      • 1970-01-01
      相关资源
      最近更新 更多