Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
55 commits
Select commit Hold shift + click to select a range
e38bae4
add-ha-rabbit
YuryHrytsuk Aug 14, 2025
901ee0c
Merge remote-tracking branch 'upstream/main' into add-ha-rabbit
YuryHrytsuk Aug 14, 2025
1be4f1c
Add ha rabbit
YuryHrytsuk Aug 20, 2025
9ad628f
Document erlang cookie rotation
YuryHrytsuk Aug 23, 2025
8f007fc
Merge remote-tracking branch 'upstream/main' into add-ha-rabbit
YuryHrytsuk Aug 26, 2025
cf8bbfa
Add ha proxy
YuryHrytsuk Aug 26, 2025
8ca30d7
Further configuration
YuryHrytsuk Aug 27, 2025
439d56a
Document autoscaling (not supported)
YuryHrytsuk Aug 27, 2025
2f86ff9
Merge remote-tracking branch 'upstream/main' into add-ha-rabbit
YuryHrytsuk Aug 27, 2025
ebd87c9
More configurable parameters
YuryHrytsuk Aug 27, 2025
18e172b
minor improvements
YuryHrytsuk Aug 27, 2025
1f52e7c
Add resource limits/reservations
YuryHrytsuk Aug 28, 2025
c36db8a
Add haproxy resources
YuryHrytsuk Aug 28, 2025
2ba480e
Document side effect of haproxy round robin
YuryHrytsuk Aug 28, 2025
4714fa4
Add healthcheck for haproxy
YuryHrytsuk Aug 28, 2025
4706404
Update readme
YuryHrytsuk Aug 28, 2025
0934599
Removing volumes
YuryHrytsuk Sep 1, 2025
51cd721
Merge remote-tracking branch 'upstream/main' into add-ha-rabbit
YuryHrytsuk Sep 2, 2025
e42a00e
Robust volume clean up
YuryHrytsuk Sep 3, 2025
4d7d3e3
Simplification
YuryHrytsuk Sep 4, 2025
44a9ebe
Add confirmation dialogue
YuryHrytsuk Sep 4, 2025
f63e5b6
Unification
YuryHrytsuk Sep 4, 2025
8d28184
Minor clean up
YuryHrytsuk Sep 4, 2025
1a95eae
update gitignore
YuryHrytsuk Sep 4, 2025
3409428
fixes after clean up
YuryHrytsuk Sep 4, 2025
afb04ca
clean up
YuryHrytsuk Sep 4, 2025
f649f0c
clean up
YuryHrytsuk Sep 4, 2025
7115051
Deploy rabbit only if necessary
YuryHrytsuk Sep 4, 2025
2c7debc
Merge remote-tracking branch 'upstream/main' into add-ha-rabbit
YuryHrytsuk Sep 4, 2025
0e3f235
clean up
YuryHrytsuk Sep 4, 2025
2b15092
Document cluster update behaviour. Architecture must be changed
YuryHrytsuk Sep 5, 2025
634a195
Merge remote-tracking branch 'upstream/main' into add-ha-rabbit
YuryHrytsuk Sep 5, 2025
f5c4f1b
Switch from services to stacks
YuryHrytsuk Sep 8, 2025
69159f2
fixes
YuryHrytsuk Sep 8, 2025
3078248
Merge remote-tracking branch 'upstream/main' into add-ha-rabbit
YuryHrytsuk Sep 8, 2025
6158d0a
improvements
YuryHrytsuk Sep 8, 2025
0ceb26b
minor fixes
YuryHrytsuk Sep 9, 2025
e2488cb
Merge remote-tracking branch 'upstream/main' into add-ha-rabbit
YuryHrytsuk Sep 11, 2025
8a701c0
update
YuryHrytsuk Sep 11, 2025
d94d6ab
update
YuryHrytsuk Sep 11, 2025
d6a4de1
Improvements
YuryHrytsuk Sep 11, 2025
cf6a1c8
improvements
YuryHrytsuk Sep 11, 2025
c83b4f1
improvements
YuryHrytsuk Sep 11, 2025
8f420cc
fixes and improvements
YuryHrytsuk Sep 11, 2025
9455ae5
remove leftovers
YuryHrytsuk Sep 11, 2025
c2bef0c
Improve doc
YuryHrytsuk Sep 11, 2025
2a72bfa
fixes
YuryHrytsuk Sep 11, 2025
239ef37
Improve README
YuryHrytsuk Sep 16, 2025
1735293
Merge remote-tracking branch 'upstream/main' into add-ha-rabbit
YuryHrytsuk Sep 16, 2025
fb83f59
remove lines
YuryHrytsuk Sep 16, 2025
a8cd85f
Clean up
YuryHrytsuk Sep 16, 2025
98f2943
Fix readme header
YuryHrytsuk Sep 16, 2025
f6e5695
Improve node index validation
YuryHrytsuk Sep 16, 2025
79b3071
remove TODOs from compose file
YuryHrytsuk Sep 16, 2025
787ae19
remove unecessary headers
YuryHrytsuk Sep 16, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions services/rabbit/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Generated compose / stack files (the Makefile renders docker-compose.*.yml and .stack.*.yml)
*.yml
# Environment files (including the generated node0N.env); only templates are kept
*.env
!template*.env
# NOTE(review): no pattern in this file ignores the cookie template, so this negation
# presumably counters a rule from a parent .gitignore — confirm
!erlang.cookie.secret.template
# Config files rendered from configs/*.j2 by the Makefile
rabbitmq.conf
haproxy.cfg
105 changes: 105 additions & 0 deletions services/rabbit/.operations.Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
#
# Variables
#

# Name of the docker stack used by the start-/stop-loadbalancer targets
LOAD_BALANCER_STACK_NAME := rabbit-loadbalancer

# Do not print "Entering/Leaving directory" lines for the recursive $(MAKE) calls in this file
MAKEFLAGS += --no-print-directory

#
# Helpers
#

# Docker stack name of the rabbit node with the given index.
# Usage: $(call create_node_stack_name,1) expands to rabbit-node01
create_node_stack_name = rabbit-node0$(1)

# Fails unless NODE_COUNT is exactly one digit in the range 1..9.
# guard-NODE_COUNT is defined outside this file (presumably it checks that NODE_COUNT is set).
validate-NODE_COUNT: guard-NODE_COUNT
	@echo "$(NODE_COUNT)" | grep --quiet --extended-regexp '^[1-9]$$' || { \
		echo NODE_COUNT must be a positive single digit integer; \
		exit 1; \
	}

# Validates the node index (the pattern stem) used by the *-node0% targets:
#   1. the index must be a single digit in 1..9
#   2. the index must not exceed RABBIT_CLUSTER_NODE_COUNT read from .env
# The check fails closed: if RABBIT_CLUSTER_NODE_COUNT is missing or not a number,
# validation fails instead of silently skipping the range check.
validate-node-ix0%: .env
	@if ! echo "$*" | grep --quiet --extended-regexp '^[1-9]$$'; then \
		echo "Node index $* must be a positive single digit integer"; \
		exit 1; \
	fi
	@set -o allexport; . $<; set +o allexport; \
	case "$$RABBIT_CLUSTER_NODE_COUNT" in \
		''|*[!0-9]*) \
			echo "RABBIT_CLUSTER_NODE_COUNT ('$$RABBIT_CLUSTER_NODE_COUNT') from $< is not a number"; \
			exit 1;; \
	esac; \
	if [ "$*" -gt "$$RABBIT_CLUSTER_NODE_COUNT" ]; then \
		echo "Node index $* is out of range 1..$$RABBIT_CLUSTER_NODE_COUNT"; \
		exit 1; \
	fi

#
# Cluster level
#

# These targets are commands, not files: declare them phony so that a file
# named e.g. `up` or `down` can never make them look up to date.
.PHONY: up down start-cluster update-cluster stop-cluster

### Note: up operation is called by CI automatically
### it must NOT deploy stacks if they are already running
### to avoid breaking existing cluster (stopping all nodes at once)
up: start-cluster

down: stop-cluster

# Starts all rabbit nodes and the load balancer
start-cluster: start-all-nodes start-loadbalancer

# Deliberately disabled: both targets only raise an error.
# `down` depends on stop-cluster, so `make down` fails with the same error.
update-cluster stop-cluster:
	@$(error This operation may break cluster. Check README for details.)

#
# Load Balancer
#

# Command targets, not files
.PHONY: start-loadbalancer update-loadbalancer stop-loadbalancer

# Deploys the load balancer stack from the generated stack file
start-loadbalancer: .stack.loadbalancer.yml
	@docker stack deploy --with-registry-auth --prune --compose-file $< $(LOAD_BALANCER_STACK_NAME)

# Updating is the same operation as starting: it re-runs the stack deploy
update-loadbalancer: start-loadbalancer

# Removes the load balancer stack
stop-loadbalancer:
	@docker stack rm $(LOAD_BALANCER_STACK_NAME)

#
# Rabbit all Nodes together
#

# Internal target: starts nodes 1..NODE_COUNT one after another via start-node0<i>.
# The loop aborts at the first node that fails to start, so the failure is
# reported to the caller (e.g. `make up`) instead of being swallowed by the loop.
.PHONY: .start-all-nodes
.start-all-nodes: validate-NODE_COUNT
	@i=1; \
	while [ $$i -le $(NODE_COUNT) ]; do \
		$(MAKE) start-node0$$i || exit $$?; \
		i=$$((i + 1)); \
	done

# Reads RABBIT_CLUSTER_NODE_COUNT from .env and starts that many nodes.
# `.` is used instead of the bash-only `source`; the `./` prefix keeps POSIX
# shells from searching PATH for the file. `&&` stops if .env cannot be read.
.PHONY: start-all-nodes
start-all-nodes: .env
	@. ./$< && \
	$(MAKE) .start-all-nodes NODE_COUNT="$$RABBIT_CLUSTER_NODE_COUNT"

# Both targets are deliberately disabled and only raise an error.
.PHONY: update-all-nodes stop-all-nodes

update-all-nodes:
	@$(error Updating all nodes at the same time may break the cluster \
	as it may restart (i.e. stop) all nodes at the same time. \
	Update one node at a time)

stop-all-nodes:
	@$(error Stopping all nodes at the same time breaks the cluster. \
	Stop one node at a time. \
	Read more at https://groups.google.com/g/rabbitmq-users/c/owvanX2iSqA/m/ZAyRDhRfCQAJ)

#
# Rabbit Node level
#

# Deploys the stack of node <stem> unless a stack with exactly that name exists.
# Existing stacks are left untouched (see the note on `up`: it must not redeploy
# running stacks).
# The name is matched as a whole line and as a fixed string, so a different
# stack whose name merely contains "rabbit-node0<stem>" does not cause a skip.
start-node0%: validate-node-ix0% .stack.node0%.yml
	@STACK_NAME=$(call create_node_stack_name,$*); \
	if docker stack ls --format '{{.Name}}' \
		| grep --quiet --line-regexp --fixed-strings "$$STACK_NAME"; then \
		echo "Rabbit Node $* is already running, skipping"; \
	else \
		echo "Starting Rabbit Node $* ..."; \
		docker stack deploy --with-registry-auth --prune --compose-file $(word 2,$^) "$$STACK_NAME"; \
	fi

# (Re)deploys the stack of node <stem> from its generated stack file.
# --detach=false keeps the command in the foreground until the deploy is done.
update-node0%: validate-node-ix0% .stack.node0%.yml
	@docker stack deploy \
		--detach=false \
		--with-registry-auth \
		--prune \
		--compose-file $(lastword $^) \
		$(call create_node_stack_name,$*)

# Removes the stack of node <stem> (foreground, --detach=false)
stop-node0%: validate-node-ix0%
	@node_stack=$(call create_node_stack_name,$*); \
	docker stack rm --detach=false "$$node_stack"
66 changes: 66 additions & 0 deletions services/rabbit/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
REPO_BASE_DIR := $(shell git rev-parse --show-toplevel)

include ${REPO_BASE_DIR}/scripts/common-services.Makefile
# common-services.Makefile should be included first as common.Makefile
# relies on STACK_NAME var which is defined in common-services.Makefile
include ${REPO_BASE_DIR}/scripts/common.Makefile

#
# Operations
#

include ${REPO_BASE_DIR}/services/rabbit/.operations.Makefile

#
# Docker compose files
#

### Load Balancer
# Renders the load balancer compose file from its Jinja template using the
# `jinja` macro (defined outside this file) and the values from .env.
docker-compose.loadbalancer.yml: docker-compose.loadbalancer.yml.j2 \
    .env \
    configs/rabbitmq.conf \
    configs/erlang.cookie.secret \
    configs/haproxy.cfg \
    venv \
    $(VENV_BIN)/j2
	@$(call jinja, $<, .env, $@)

# Runs scripts/docker-stack-config.bash on the rendered compose file and stores
# its output as the file given to `docker stack deploy`.
# If the script fails, the partially written output is removed; otherwise the
# truncated file would be newer than its prerequisites and look up to date.
.stack.loadbalancer.yml: docker-compose.loadbalancer.yml .env
	@${REPO_BASE_DIR}/scripts/docker-stack-config.bash -e .env $< > $@ \
		|| { status=$$?; rm -f $@; exit $$status; }

### Node

# Per-node env file: .env passed through envsubst, plus NODE_INDEX=<stem>.
# Both steps write through a single redirect; if either fails the output is
# removed. This matters because the target is .PRECIOUS, so make itself would
# keep a partial file and treat it as up to date on the next run.
.PRECIOUS: node0%.env
node0%.env: .env
	{ envsubst < $< && echo NODE_INDEX=$*; } > $@ \
		|| { status=$$?; rm -f $@; exit $$status; }

# Per-node compose file. Every node is rendered from the same template
# (docker-compose.node0x.yml.j2) using that node's own env file.
.PRECIOUS: docker-compose.node0%.yml
docker-compose.node0%.yml: docker-compose.node0x.yml.j2 node0%.env \
    configs/rabbitmq.conf configs/erlang.cookie.secret configs/haproxy.cfg \
    venv $(VENV_BIN)/j2
	@$(call jinja, $<, $(word 2,$^), $@)

# Per-node stack file: output of scripts/docker-stack-config.bash for the
# node's compose file and env file.
# The target is .PRECIOUS, so make never deletes it. The recipe therefore
# removes a partially written file itself when the script fails; otherwise the
# truncated file would be used by the next start-/update-node0% run.
.PRECIOUS: .stack.node0%.yml
.stack.node0%.yml: docker-compose.node0%.yml node0%.env
	@${REPO_BASE_DIR}/scripts/docker-stack-config.bash -e node0$*.env $< > $@ \
		|| { status=$$?; rm -f $@; exit $$status; }

#
# Config / Secret files
#

# Renders the Erlang cookie secret from its template with the values of .env.
# - stops if .env cannot be read
# - refuses to render an empty cookie (RABBIT_ERLANG_COOKIE unset or empty)
# - removes the output again if rendering fails
configs/erlang.cookie.secret: configs/erlang.cookie.secret.template .env
	@set -a && . ./.env && set +a && \
	: "$${RABBIT_ERLANG_COOKIE:?RABBIT_ERLANG_COOKIE is empty or unset}" && \
	envsubst < $< > $@ \
		|| { status=$$?; rm -f $@; exit $$status; }

# Renders configs/rabbitmq.conf from its Jinja template with the values of .env
# (the `jinja` macro is defined outside this file)
configs/rabbitmq.conf: configs/rabbitmq.conf.j2 .env venv
# generate $@
@$(call jinja, $<, .env, $@)

# Renders configs/haproxy.cfg from its Jinja template with the values of .env
configs/haproxy.cfg: configs/haproxy.cfg.j2 .env venv
# generate $@
@$(call jinja, $<, .env, $@)
49 changes: 49 additions & 0 deletions services/rabbit/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
## Starting a cluster

Make sure all nodes have joined the cluster before using it. Otherwise, the number of replicas in quorum queues might be affected. For example, say you have a cluster of 3 nodes and you connect to the cluster before the 3rd node joins it. Your quorum queue would end up with only 2 replicas and would break as soon as 1 node (of the 2 nodes holding the replicas of the queue) goes down.

## Updating a cluster

Perform update one node at a time. Never update all nodes at the same time (this may break cluster)! Follow instructions from official documentation https://www.rabbitmq.com/docs/upgrade#rolling-upgrade.

## Graceful shutdown

Shut down nodes one by one gracefully. Wait until the node has stopped and left the cluster, then remove the next node. When starting the cluster, start nodes **in the reverse order**! For example, if you shut down node01, then node02 and lastly node03, first start node03, then node02 and finally node01.

If all Nodes were shutdown simultaneously, then you will see mnesia tables errors in node's logs. Restarting node solves the issue. Documentation also mentions force_boot CLI command in this case (see https://www.rabbitmq.com/docs/man/rabbitmqctl.8#force_boot)

## How to add / remove nodes

The only supported way is to completely shut down the cluster (docker stack and most likely rabbit node volumes) and start a brand new one.

With manual effort this can be done on the running cluster, by adding 1 more rabbit node manually (as a separate docker stack or new service) and manually executing rabbitmqctl commands (some hints can be found here https://www.rabbitmq.com/docs/clustering#creating)

## Updating rabbitmq.conf / advanced.config (zero-downtime)

We do not support this in an automated way (except when starting from scratch with empty volumes), but it can be done manually if needed. `rabbitmq.conf` and `advanced.config` changes take effect after a node restart. This can be performed with zero downtime when RabbitMQ is clustered (has multiple nodes), by stopping and starting the rabbitmq nodes one by one:
* `docker exec -it <container-id> bash`
* (inside container) `rabbitmqctl stop_app` and wait some time until node is stopped (can be seen in management ui)
* (inside container) `rabbitmqctl start_app`

Source: https://www.rabbitmq.com/docs/next/configure#config-changes-effects

## Enable node Maintenance mode

1. Get inside container's shell (`docker exec -it <container-id> bash`)
2. (Inside container) execute `rabbitmq-upgrade drain`

Source: https://www.rabbitmq.com/docs/upgrade#maintenance-mode

## Troubleshooting
mnesia errors after all rabbit nodes (docker services) restart:
* https://stackoverflow.com/questions/60407082/rabbit-mq-error-while-waiting-for-mnesia-tables

official documentation mentioning restart scenarios
* https://www.rabbitmq.com/docs/clustering#restarting-schema-sync

all (3) cluster nodes go down simultaneously, cluster is broken:
* https://groups.google.com/g/rabbitmq-users/c/owvanX2iSqA

## Autoscaling

Not supported at the moment.
1 change: 1 addition & 0 deletions services/rabbit/configs/erlang.cookie.secret.template
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
${RABBIT_ERLANG_COOKIE}
64 changes: 64 additions & 0 deletions services/rabbit/configs/haproxy.cfg.j2
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
# NODE_IXS (set below) holds the node indices 1..RABBIT_CLUSTER_NODE_COUNT;
# each backend gets one `server` line per index
{% set NODE_IXS = range(1, (RABBIT_CLUSTER_NODE_COUNT | int) + 1) -%}

global
log stdout format raw local0

# haproxy by default resolves server hostname only once
# this breaks if container restarts. By using resolvers
# we tell haproxy to re-resolve the hostname (so container
# restarts are handled properly)
resolvers dockerdns
nameserver dns1 127.0.0.11:53
resolve_retries 3
timeout resolve 1s
timeout retry 1s
hold other 10s
hold refused 10s
hold nx 10s
hold timeout 10s
hold valid 10s
hold obsolete 10s

defaults
log global
mode tcp
option tcplog

timeout connect 5s
timeout client 30s
timeout server 30s

frontend rabbit
bind *:{{ RABBIT_PORT }}
default_backend rabbit_backends

frontend rabbit_dashboard
bind *:{{ RABBIT_MANAGEMENT_PORT }}
default_backend rabbit_dashboard_backends

# local-only endpoint (bound to 127.0.0.1) that answers HTTP 200 to loopback clients;
# the healthcheck in docker-compose.loadbalancer.yml.j2 opens a TCP connection to this port
frontend health
mode http
bind 127.0.0.1:32087
http-request return status 200 if { src 127.0.0.0/8 }

backend rabbit_backends
# side effect of roundrobin is connection should be evenly distributed
# thus rabbit queue leader replica shall be also evenly distributed
# (https://www.rabbitmq.com/docs/4.0/clustering#replica-placement)
# if algorithm below is changed, consider adjusting rabbit configuration
# as stated in documentation link above
balance roundrobin

# init-addr libc,none - start even if there aren't any backend servers running
# send-proxy - forward the connection with the PROXY protocol header;
# rabbitmq.conf.j2 sets the matching `proxy_protocol = true`
{% for ix in NODE_IXS %}
server rabbit0{{ ix }} rabbit-node0{{ ix }}_rabbit0{{ ix }}:{{ RABBIT_PORT }} check resolvers dockerdns init-addr libc,none inter 5s rise 2 fall 3 send-proxy
{%- endfor %}

backend rabbit_dashboard_backends
mode http
balance roundrobin

# same servers as rabbit_backends but on the management port and without send-proxy
{% for ix in NODE_IXS %}
server rabbit0{{ ix }} rabbit-node0{{ ix }}_rabbit0{{ ix }}:{{ RABBIT_MANAGEMENT_PORT }} check resolvers dockerdns init-addr libc,none inter 5s rise 2 fall 3
{%- endfor %}
# keep new line in the end to avoid "Missing LF on last line" error
19 changes: 19 additions & 0 deletions services/rabbit/configs/rabbitmq.conf.j2
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# NODE_IXS (set below) holds the node indices 1..RABBIT_CLUSTER_NODE_COUNT
{% set NODE_IXS = range(1, (RABBIT_CLUSTER_NODE_COUNT | int) + 1) -%}

# https://www.rabbitmq.com/docs/cluster-formation#peer-discovery-configuring-mechanism
cluster_formation.peer_discovery_backend = classic_config

# one entry per node index; the host part (rabbit-node0N_rabbit0N) is the same
# name haproxy.cfg.j2 uses for its backend servers
{% for ix in NODE_IXS %}
cluster_formation.classic_config.nodes.{{ ix }} = rabbit@rabbit-node0{{ ix }}_rabbit0{{ ix }}
{%- endfor %}

## Sets the initial quorum queue replica count for newly declared quorum queues.
## This value can be overridden using the 'x-quorum-initial-group-size' queue argument
## at declaration time.
# https://www.rabbitmq.com/docs/quorum-queues#quorum-requirements
quorum_queue.initial_cluster_size = {{ RABBIT_QUORUM_QUEUE_DEFAULT_REPLICA_COUNT }}

# Extract proper client ip when behind a proxy (e.g. haproxy)
# https://www.rabbitmq.com/docs/networking#proxy-protocol
# WARNING: this forces clients to use a proxy (direct access to nodes does not work)
# (haproxy.cfg.j2 sends the PROXY header via `send-proxy` on the rabbit_backends servers)
proxy_protocol = true
46 changes: 46 additions & 0 deletions services/rabbit/docker-compose.loadbalancer.yml.j2
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# Stack definition of the haproxy load balancer placed in front of the rabbit nodes
services:
loadbalancer:
image: haproxy:3.2
deploy:
update_config:
order: start-first
parallelism: 1
delay: 30s
failure_action: rollback
# https://discourse.haproxy.org/t/haproxy-high-availability-configuration/11983
replicas: ${RABBIT_LB_REPLICAS}
# necessary to preserve client ip
# otherwise we see overlay rabbit network lb ip
# (rabbitmq management dashboard connection section)
endpoint_mode: dnsrr
resources:
limits:
# https://help.hcl-software.com/digital-experience/dx-95-doc-archive/CF203/platform/kubernetes/haproxy-migration/haproxy-configuration.html
cpus: "1"
memory: "2G"
# according to local observations and link below
# https://github.com/haproxytech/helm-charts/blob/haproxy-1.24.0/haproxy/values.yaml#L403
reservations:
cpus: "0.1"
memory: "128M"
healthcheck: # https://stackoverflow.com/a/76513320/12124525
# passes when a TCP connection to 127.0.0.1:32087 can be opened;
# that port is bound by the `health` frontend in configs/haproxy.cfg.j2
test: bash -c 'echo "" > /dev/tcp/127.0.0.1/32087 || exit 1'
start_period: 5s
timeout: 2s
retries: 2
interval: 10s
networks:
- rabbit
configs:
- source: haproxy.cfg
target: /usr/local/etc/haproxy/haproxy.cfg

# pre-existing network (external: true), name taken from RABBIT_NETWORK
networks:
rabbit:
name: ${RABBIT_NETWORK}
external: true

configs:
haproxy.cfg:
file: ./configs/haproxy.cfg
# the config name embeds a prefix of the file's sha256 (substring(0,10));
# presumably so that a changed haproxy.cfg yields a new docker config — TODO confirm
name: rabbit_haproxy_conf_{{ "./configs/haproxy.cfg" | sha256file | substring(0,10) }}
Loading
Loading