diff --git a/playbooks/node-down.yaml b/playbooks/node-down.yaml
new file mode 100644
index 0000000..d142c17
--- /dev/null
+++ b/playbooks/node-down.yaml
@@ -0,0 +1,151 @@
+---
+# Take one cluster node out of service: drain its swarm workloads and hand its
+# cluster address over to the next node in the "cluster" inventory group.
+- name: Check cluster state
+  hosts: cluster
+  any_errors_fatal: true
+  vars_prompt:
+    - name: skylab_node_down
+      prompt: "Select node to offline (one of: {{ groups.cluster | join(', ') }})"
+      private: false
+  tasks:
+    - name: Validate user input
+      ansible.builtin.assert:
+        that:
+          - skylab_node_down in groups.cluster
+        fail_msg: >-
+          ERROR: Host '{{ skylab_node_down }}' is not a valid cluster node (one
+          of: {{ groups.cluster | join(', ') }})
+
+    # !unsafe keeps Ansible from templating the Go template passed to docker
+    - name: Fetch node swarm ID
+      ansible.builtin.command:
+        cmd: !unsafe docker info --format '{{ .Swarm.NodeID}}'
+      changed_when: false
+      register: _docker_node_id_raw
+
+    # The Go template is emitted from a Jinja string literal so that the node ID
+    # can still be templated into the same command
+    - name: Fetch swarm node availability
+      ansible.builtin.command:
+        cmd: >-
+          docker node inspect {{ _docker_node_id_raw.stdout.strip() }}
+          --format '{{ '{{ .Spec.Availability}}' }}'
+      changed_when: false
+      register: _docker_node_availability_raw
+
+    - name: Set common facts
+      ansible.builtin.set_fact:
+        # Kept as a host fact: the vars_prompt answer is only visible inside this play
+        _target_node: "{{ skylab_node_down }}"
+        _docker_node_id: "{{ _docker_node_id_raw.stdout.strip() }}"
+        _docker_node_availability: "{{ _docker_node_availability_raw.stdout.strip() }}"
+        # Use the next host in the group, wrapping around to the first host when the
+        # target is the last one
+        _target_alt: >-
+          {{ groups.cluster[
+            (groups.cluster.index(skylab_node_down) + 1) % (groups.cluster | length)
+          ] }}
+
+    # Primary address followed by every secondary address of the cluster interface.
+    # ipv4_secondaries is defaulted so an interface without secondaries still
+    # produces a defined (single-element) list.
+    - name: Set common fact for node addresses
+      ansible.builtin.set_fact:
+        _node_addresses: >-
+          {{ [lookup('vars', 'ansible_' + skylab_cluster.interface).ipv4.address]
+            + (lookup('vars', 'ansible_' + skylab_cluster.interface).ipv4_secondaries
+              | default([]) | map(attribute='address') | list) }}
+
+    # Only the target needs this flag; a node that is already drained is not migrated again
+    - name: Set facts for target node
+      when: inventory_hostname == _target_node
+      ansible.builtin.set_fact:
+        _needs_docker_migration: "{{ (_docker_node_availability | lower != 'drain') | bool }}"
+
+    # Every other node must hold its cluster address and be an active swarm node
+    - name: Check cluster settings
+      when: inventory_hostname != _target_node
+      ansible.builtin.assert:
+        that:
+          - skylab_cluster.address | ansible.netcommon.ipaddr('address') in _node_addresses
+          - _docker_node_availability | lower == 'active'
+        fail_msg: >-
+          ERROR: Node '{{ inventory_hostname }}' is already marked as unavailable. All cluster
+          nodes must be available before a new node can be moved to unavailable status.
+
+# The prompt answer from the first play is not in scope here, so the target is
+# read back from the fact the first play stored on the cluster hosts.
+- name: Offline node
+  hosts: "{{ hostvars[groups.cluster | first]._target_node }}"
+  tasks:
+    - name: Migrate services off target node
+      when: _needs_docker_migration
+      block:
+        - name: Fetch current cluster service state
+          ansible.builtin.command:
+            cmd: !unsafe docker service ls --format '{{json .}}'
+          changed_when: false
+          register: _cluster_service_prestate
+
+        # No "save" here, unlike the re-enable task at the end of this block
+        - name: Disable NAT rule {{ _skylab_adguard_nat_rule }}
+          delegate_to: core
+          connection: ansible.netcommon.network_cli
+          community.network.edgeos_config:
+            lines:
+              - set service nat rule {{ _skylab_adguard_nat_rule }} disable
+
+        - name: Update node availability
+          vars:
+            ansible_python_interpreter: "{{ skylab_state_dir }}/ansible-runtime/bin/python"
+          community.docker.docker_node:
+            availability: drain
+            hostname: "{{ _docker_node_id }}"
+          register: _node_availability_status
+
+        - name: Wait for services to shutdown
+          ansible.builtin.pause:
+            seconds: 10
+
+        # Poll (120 tries, 5s apart) until the service listing matches the pre-drain listing
+        - name: Wait for services to migrate
+          ansible.builtin.command:
+            cmd: !unsafe docker service ls --format '{{json .}}'
+          changed_when: false
+          register: _cluster_service_poststate
+          until: _cluster_service_poststate.stdout == _cluster_service_prestate.stdout
+          retries: 120
+          delay: 5
+
+        # NOTE(review): not reached if an earlier task in this block fails, which
+        # would leave the NAT rule disabled - consider moving into an "always" section
+        - name: Enable NAT rule {{ _skylab_adguard_nat_rule }}
+          delegate_to: core
+          connection: ansible.netcommon.network_cli
+          community.network.edgeos_config:
+            lines:
+              - delete service nat rule {{ _skylab_adguard_nat_rule }} disable
+            save: true
+
+    - name: Delete address from node
+      become: true
+      when: skylab_cluster.address | ansible.netcommon.ipaddr('address') in _node_addresses
+      ansible.builtin.command:
+        cmd: >-
+          ip address delete
+          {{ skylab_cluster.address | ansible.netcommon.ipaddr('host/prefix') }}
+          dev {{ skylab_cluster.interface }}
+      changed_when: true
+
+    # Runs on the alternate node, so that node's own interface name and address list are used
+    - name: Assign address to alt node
+      delegate_to: "{{ _target_alt }}"
+      become: true
+      when: skylab_cluster.address | ansible.netcommon.ipaddr('address') not in hostvars[_target_alt]._node_addresses
+      ansible.builtin.command:
+        cmd: >-
+          ip address add
+          {{ skylab_cluster.address | ansible.netcommon.ipaddr('host/prefix') }}
+          dev {{ hostvars[_target_alt].skylab_cluster.interface }}
+      changed_when: true