Add playbook for offlining a cluster host safely
This commit is contained in:
		
							
								
								
									
										131
									
								
								playbooks/node-down.yaml
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										131
									
								
								playbooks/node-down.yaml
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,131 @@
 | 
			
		||||
---
 | 
			
		||||
# Pre-flight play: asks which node to take offline, then collects and checks
# swarm and address state on every cluster host before anything is changed.
- name: Check cluster state
  hosts: cluster
  # A failed check on any single host aborts the run for all hosts
  any_errors_fatal: true
  vars_prompt:
    # The answer is echoed back (not a secret) and validated by the first task
    - name: skylab_node_down
      private: false
      prompt: "Select node to offline (one of: {{ groups.cluster | join(', ') }})"
  tasks:
 | 
			
		||||
    # Reject any answer that is not a member of the `cluster` inventory group
    - name: Validate user input
      ansible.builtin.assert:
        fail_msg: >-
          ERROR: Host '{{ skylab_node_down }}' is not a valid cluster node (one
          of: {{ groups.cluster | join(', ') }})
        that:
          - skylab_node_down in groups.cluster
 | 
			
		||||
 | 
			
		||||
    # Read-only query, so it never reports a change. `!unsafe` keeps Ansible
    # from templating the Go-template braces intended for the docker CLI.
    - name: Fetch node swarm ID
      register: _docker_node_id_raw
      changed_when: false
      ansible.builtin.command:
        cmd: !unsafe docker info --format '{{ .Swarm.NodeID}}'
 | 
			
		||||
 | 
			
		||||
    # Read-only query. The Go-template braces are wrapped in a Jinja string
    # literal so they reach the docker CLI verbatim while the node ID is still
    # templated by Ansible.
    - name: Fetch swarm node availability
      register: _docker_node_availability_raw
      changed_when: false
      ansible.builtin.command:
        cmd: docker node inspect {{ _docker_node_id_raw.stdout.strip() }} --format '{{ '{{ .Spec.Availability}}' }}'
 | 
			
		||||
 | 
			
		||||
    # Store the prompt answer and the trimmed docker query results as host
    # facts so they remain available to later plays.
    - name: Set common facts
      ansible.builtin.set_fact:
        _target_node: "{{ skylab_node_down }}"
        _docker_node_id: "{{ _docker_node_id_raw.stdout.strip() }}"
        _docker_node_availability: "{{ _docker_node_availability_raw.stdout.strip() }}"
        # Alternate host: the group member listed after the target. The modulo
        # wraps the index back to the first host when the target is the last
        # one in the group (and to the target itself in a one-host group).
        _target_alt: >-
          {{ groups.cluster[
            (groups.cluster.index(skylab_node_down) + 1) % (groups.cluster | length)
          ] }}
 | 
			
		||||
 | 
			
		||||
    # Build the list of every IPv4 address on the cluster interface: the
    # primary address followed by any secondaries. `ipv4_secondaries` is absent
    # from the interface facts when the interface has no secondary address, so
    # it defaults to an empty list; this guarantees `_node_addresses` is always
    # defined for the checks that follow.
    - name: Set common fact for node addresses
      vars:
        _cluster_iface: "{{ lookup('vars', 'ansible_' + skylab_cluster.interface) }}"
      ansible.builtin.set_fact:
        _node_addresses: >-
          {{
            [_cluster_iface.ipv4.address]
            + (_cluster_iface.ipv4_secondaries | default([]) | map(attribute='address') | list)
          }}
 | 
			
		||||
 | 
			
		||||
    # Only evaluated on the node being offlined: migration is needed unless the
    # node's swarm availability is already 'drain'.
    - name: Set facts for target node
      ansible.builtin.set_fact:
        _needs_docker_migration: "{{ (_docker_node_availability | lower != 'drain') | bool }}"
      when: inventory_hostname == _target_node
 | 
			
		||||
 | 
			
		||||
    # Every node other than the target must pass both assertions below.
    # NOTE(review): fail_msg only describes the availability check; a failure
    # of the address check prints the same text. Confirm this is intended.
    - name: Check cluster settings
      ansible.builtin.assert:
        fail_msg: >-
          ERROR: Node '{{ inventory_hostname }}' is already marked as unavailable. All cluster
          nodes must be available before a new node can be moved to unavailable status.
        that:
          - skylab_cluster.address | ansible.netcommon.ipaddr('address') in _node_addresses
          - _docker_node_availability | lower == 'active'
      when: inventory_hostname != _target_node
 | 
			
		||||
 | 
			
		||||
# Second play: acts only on the node selected in the first play.
- name: Offline node
  # vars_prompt answers are scoped to the play that asked for them, so
  # `skylab_node_down` is undefined here unless it was supplied with
  # --extra-vars. The first play stores the selection as the `_target_node`
  # fact on every cluster host, so read it back from the first group member.
  hosts: "{{ hostvars[groups.cluster | first]._target_node }}"
  tasks:
 | 
			
		||||
    # Drain the node and wait for the swarm to settle. The whole block is
    # skipped when `_needs_docker_migration` is false.
    - name: Migrate services off target node
      when: _needs_docker_migration
      block:
        # Snapshot of the service listing, compared against after the drain
        - name: Fetch current cluster service state
          register: _cluster_service_prestate
          changed_when: false
          ansible.builtin.command:
            cmd: !unsafe docker service ls --format '{{json .}}'

        # Runs against the `core` host over network_cli. No `save` is requested
        # here, unlike the matching re-enable task at the end of the block.
        - name: Disable NAT rule {{ _skylab_adguard_nat_rule }}
          connection: ansible.netcommon.network_cli
          delegate_to: core
          community.network.edgeos_config:
            lines:
              - set service nat rule {{ _skylab_adguard_nat_rule }} disable

        # Uses the dedicated runtime interpreter under skylab_state_dir
        # (presumably where the docker SDK is installed; verify on the hosts)
        - name: Update node availability
          register: _node_availability_status
          community.docker.docker_node:
            hostname: "{{ _docker_node_id }}"
            availability: drain
          vars:
            ansible_python_interpreter: "{{ skylab_state_dir }}/ansible-runtime/bin/python"

        # Fixed grace period before polling starts
        - name: Wait for services to shutdown
          ansible.builtin.pause:
            seconds: 10

        # Poll up to 120 times, 5 seconds apart, until the service listing is
        # identical to the snapshot taken before the drain
        - name: Wait for services to migrate
          register: _cluster_service_poststate
          changed_when: false
          ansible.builtin.command:
            cmd: !unsafe docker service ls --format '{{json .}}'
          retries: 120
          delay: 5
          until: _cluster_service_poststate.stdout == _cluster_service_prestate.stdout

        # Remove the `disable` flag again and save the router configuration
        - name: Enable NAT rule {{ _skylab_adguard_nat_rule }}
          connection: ansible.netcommon.network_cli
          delegate_to: core
          community.network.edgeos_config:
            save: true
            lines:
              - delete service nat rule {{ _skylab_adguard_nat_rule }} disable
 | 
			
		||||
 | 
			
		||||
    # Runs only when the cluster address was among this node's addresses at the
    # time `_node_addresses` was collected; always reported as a change.
    - name: Delete address from node
      when: skylab_cluster.address | ansible.netcommon.ipaddr('address') in _node_addresses
      become: true
      changed_when: true
      ansible.builtin.command:
        cmd: ip address delete {{ skylab_cluster.address | ansible.netcommon.ipaddr('host/prefix') }} dev {{ skylab_cluster.interface }}
 | 
			
		||||
 | 
			
		||||
    # Executed on the alternate host via delegation. The condition and the
    # interface name are read from that host's own hostvars, and the task is
    # skipped when the alternate already listed the cluster address.
    - name: Assign address to alt node
      when: skylab_cluster.address | ansible.netcommon.ipaddr('address') not in hostvars[_target_alt]._node_addresses
      delegate_to: "{{ _target_alt }}"
      become: true
      changed_when: true
      ansible.builtin.command:
        cmd: ip address add {{ skylab_cluster.address | ansible.netcommon.ipaddr('host/prefix') }} dev {{ hostvars[_target_alt].skylab_cluster.interface }}
 | 
			
		||||
		Reference in New Issue
	
	Block a user