From e1b537273616b01b011e3eddc408bb5703593b5f Mon Sep 17 00:00:00 2001 From: Dan Meyers Date: Fri, 5 Sep 2025 16:25:46 +0100 Subject: [PATCH 1/2] feat: Fix Ceph version lookup for newer releases Ceph made various breaking changes to the status output in the `octopus` release in 2020, including (but not limited to) the entire `mon_status` command being removed. The DataDog check as-was only checked if the release was exactly 'octopus', not any later release as well. Given that `octopus` is now 5 years old, and there doesn't seem to be anything in the responses that gives a semantic version or similar to compare numerically, it's easiest to just _assume_ that we will get stats the New Way :tm:, and also try any old way if the `mon_status` content exists in the `raw` map and the new way failed. --- ceph/datadog_checks/ceph/ceph.py | 58 +++++++++++++++----------------- 1 file changed, 28 insertions(+), 30 deletions(-) diff --git a/ceph/datadog_checks/ceph/ceph.py b/ceph/datadog_checks/ceph/ceph.py index d4531b79d4aef..2961e4cf64095 100644 --- a/ceph/datadog_checks/ceph/ceph.py +++ b/ceph/datadog_checks/ceph/ceph.py @@ -43,7 +43,6 @@ class Ceph(AgentCheck): def __init__(self, name, init_config, instances): super(Ceph, self).__init__(name, init_config, instances) - self._octopus = False def _collect_raw(self, ceph_cmd, ceph_cluster, instance): use_sudo = _is_affirmative(instance.get('use_sudo', False)) @@ -58,6 +57,7 @@ def _collect_raw(self, ceph_cmd, ceph_cluster, instance): ceph_args = '{} --cluster {}'.format(ceph_args, ceph_cluster) raw = {} + # `mon_status` is only a valid command in versions of Ceph prior to `octopus` (released 2020-03-23) for cmd in ('mon_status', 'status', 'df detail', 'osd pool stats', 'osd perf', 'health detail', 'osd metadata'): try: args = '{} {} -fjson'.format(ceph_args, cmd) @@ -73,24 +73,22 @@ def _collect_raw(self, ceph_cmd, ceph_cluster, instance): mon_map = raw.get('status', {}).get('monmap') if mon_map is None: raise 
RuntimeError("Could not detect Ceph release series") - if 'min_mon_release_name' in mon_map and mon_map['min_mon_release_name'] == 'octopus': - self.log.debug("Detected octopus version of ceph...") - self._octopus = True - else: - self._octopus = False return raw def _extract_tags(self, raw, instance): tags = instance.get('tags', []) fsid = None - if self._octopus: + try: fsid = raw['status']['fsid'] - elif 'mon_status' in raw: - fsid = raw['mon_status']['monmap']['fsid'] + except KeyError: + if 'mon_status' in raw: + fsid = raw['mon_status']['monmap']['fsid'] + else: + self.log.debug("Could not find fsid") + + if 'mon_status' in raw: tags.append(self.NAMESPACE + '_mon_state:%s' % raw['mon_status']['state']) - else: - self.log.debug("Could not find fsid") if fsid is not None: tags.append(self.NAMESPACE + '_fsid:%s' % fsid) @@ -276,29 +274,29 @@ def _extract_metrics(self, raw, tags): except KeyError: self.log.debug('Error retrieving pgstatus metrics') - if self._octopus: - try: - num_mons = int(raw['status']['monmap']['num_mons']) - self.gauge(self.NAMESPACE + '.num_mons', num_mons, tags) - except KeyError: - self.log.debug('Error retrieving num_mons metric') - else: - try: - num_mons = len(raw['mon_status']['monmap']['mons']) - self.gauge(self.NAMESPACE + '.num_mons', num_mons, tags) - except KeyError: - self.log.debug('Error retrieving mon_status metrics') + try: + num_mons = int(raw['status']['monmap']['num_mons']) + self.gauge(self.NAMESPACE + '.num_mons', num_mons, tags) + except KeyError: + if 'mon_status' in raw: + try: + num_mons = len(raw['mon_status']['monmap']['mons']) + self.gauge(self.NAMESPACE + '.num_mons', num_mons, tags) + except KeyError: + self.log.debug('Error retrieving mon_status metrics') - try: - num_mons_active = len(raw['mon_status']['quorum']) - self.gauge(self.NAMESPACE + '.num_mons.active', num_mons_active, tags) - except KeyError: - self.log.debug('Error retrieving mon_status quorum metrics') + try: + num_mons_active = 
len(raw['mon_status']['quorum']) + self.gauge(self.NAMESPACE + '.num_mons.active', num_mons_active, tags) + except KeyError: + self.log.debug('Error retrieving mon_status quorum metrics') + else: + self.log.debug('Error retrieving num_mons metric') try: stats = raw['df_detail']['stats'] - if not self._octopus: - self._publish(stats, self.gauge, ['total_objects'], tags) + # This will only work on Ceph versions prior to `octopus`, but will catch+return on later versions + self._publish(stats, self.gauge, ['total_objects'], tags) used = float(stats['total_used_bytes']) total = float(stats['total_bytes']) if total > 0: From 0a8adbeeeb0596083c4ea3cc957b831ca1b7db23 Mon Sep 17 00:00:00 2001 From: Dan Meyers Date: Fri, 5 Sep 2025 17:44:37 +0100 Subject: [PATCH 2/2] chore: Update changelog --- ceph/changelog.d/21279.fixed | 1 + 1 file changed, 1 insertion(+) create mode 100644 ceph/changelog.d/21279.fixed diff --git a/ceph/changelog.d/21279.fixed b/ceph/changelog.d/21279.fixed new file mode 100644 index 0000000000000..9b28d069a939d --- /dev/null +++ b/ceph/changelog.d/21279.fixed @@ -0,0 +1 @@ +feat: Fix Ceph version lookup for newer releases of Ceph