Add playbook for offlining a cluster host safely
This commit is contained in:
		
							
								
								
									
										131
									
								
								playbooks/node-down.yaml
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										131
									
								
								playbooks/node-down.yaml
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,131 @@
 | 
				
			|||||||
 | 
					---
 | 
				
			||||||
 | 
					# First play: gather and validate state on every cluster member before anything is changed.
					- name: Check cluster state
					  hosts: cluster
					  # A failure on any one host stops the play for all hosts
					  any_errors_fatal: true
					  vars_prompt:
					    # The operator's answer is echoed back while typing (private: false)
					    - name: skylab_node_down
					      private: false
					      prompt: "Select node to offline (one of: {{ groups.cluster | join(', ') }})"
					  tasks:
 | 
				
			||||||
 | 
					    # Reject the prompt answer unless it names a member of the cluster group
					    - name: Validate user input
					      ansible.builtin.assert:
					        fail_msg: >-
					          ERROR: Host '{{ skylab_node_down }}' is not a valid cluster node
					          (one of: {{ groups.cluster | join(', ') }})
					        that:
					          - skylab_node_down in groups.cluster
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    # Read-only query. The Go template is tagged !unsafe so Ansible leaves the braces untouched.
					    - name: Fetch node swarm ID
					      register: _docker_node_id_raw
					      changed_when: false
					      ansible.builtin.command:
					        cmd: !unsafe docker info --format '{{ .Swarm.NodeID}}'
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    # Read-only query. The inner braces are a quoted Jinja string literal, so docker
					    # receives its own Go template after Ansible has rendered the node ID.
					    - name: Fetch swarm node availability
					      register: _docker_node_availability_raw
					      changed_when: false
					      ansible.builtin.command:
					        cmd: >-
					          docker node inspect {{ _docker_node_id_raw.stdout.strip() }}
					          --format '{{ '{{ .Spec.Availability}}' }}'
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    # Store the prompt answer and the trimmed docker query results as host facts
					    - name: Set common facts
					      ansible.builtin.set_fact:
					        _target_node: "{{ skylab_node_down }}"
					        _docker_node_id: "{{ _docker_node_id_raw.stdout.strip() }}"
					        _docker_node_availability: "{{ _docker_node_availability_raw.stdout.strip() }}"
					        # Fallback host is the group member that follows the target; the modulo
					        # wraps the last member of the group back around to the first
					        _target_alt: >-
					          {{ groups.cluster[
					            (lookup('ansible.utils.index_of', groups.cluster, 'eq', skylab_node_down) + 1)
					            % (groups.cluster | length)]
					          }}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    # Collect the primary IPv4 address of the cluster interface plus any secondary
					    # addresses into one list. This is built in a single expression, rather than by
					    # appending inside a loop, so the fact is always set: a loop over an empty or
					    # missing ipv4_secondaries list would never run set_fact and would leave
					    # _node_addresses undefined for the tasks that test membership in it.
					    - name: Set common fact for node addresses
					      ansible.builtin.set_fact:
					        _node_addresses: >-
					          {{ [lookup('vars', 'ansible_' + skylab_cluster.interface).ipv4.address]
					             + (lookup('vars', 'ansible_' + skylab_cluster.interface).ipv4_secondaries
					                | default([])
					                | map(attribute='address')
					                | list) }}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    # Only evaluated on the node being offlined; the flag is false when that node
					    # already reports 'drain' availability
					    - name: Set facts for target node
					      ansible.builtin.set_fact:
					        _needs_docker_migration: "{{ (_docker_node_availability | lower != 'drain') | bool }}"
					      when: inventory_hostname == _target_node
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    # Every node other than the target must still carry its cluster address and
					    # report 'active' swarm availability
					    - name: Check cluster settings
					      ansible.builtin.assert:
					        fail_msg: >-
					          ERROR: Node '{{ inventory_hostname }}' is already marked as unavailable.
					          All cluster nodes must be available before a new node can be moved to
					          unavailable status.
					        that:
					          - skylab_cluster.address | ansible.netcommon.ipaddr('address') in _node_addresses
					          - _docker_node_availability | lower == 'active'
					      when: inventory_hostname != _target_node
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# Second play: runs only against the node selected in the first play.
					- name: Offline node
					  # A vars_prompt answer is scoped to the play that asked for it, so the selection
					  # is read back from the _target_node fact that the first play stored on every
					  # cluster host. This also works when the value was supplied as an extra var.
					  hosts: "{{ hostvars[groups.cluster | first]._target_node }}"
					  tasks:
 | 
				
			||||||
 | 
					    # Skipped as a whole when _needs_docker_migration is false
					    - name: Migrate services off target node
					      when: _needs_docker_migration
					      block:
					        # Baseline service listing that the polling task below compares against
					        - name: Fetch current cluster service state
					          register: _cluster_service_prestate
					          changed_when: false
					          ansible.builtin.command:
					            cmd: !unsafe docker service ls --format '{{json .}}'
					        # Runs against the 'core' host over network_cli; no save is requested here
					        - name: Disable NAT rule {{ _skylab_adguard_nat_rule }}
					          connection: ansible.netcommon.network_cli
					          delegate_to: core
					          community.network.edgeos_config:
					            lines:
					              - set service nat rule {{ _skylab_adguard_nat_rule }} disable
					        # Uses the python interpreter from the ansible-runtime directory under skylab_state_dir
					        - name: Update node availability
					          register: _node_availability_status
					          vars:
					            ansible_python_interpreter: "{{ skylab_state_dir }}/ansible-runtime/bin/python"
					          community.docker.docker_node:
					            hostname: "{{ _docker_node_id }}"
					            availability: drain
					        # Fixed delay before polling begins
					        - name: Wait for services to shutdown
					          ansible.builtin.pause:
					            seconds: 10
					        # Polls every 5 seconds, for up to 120 retries, until the listing matches the baseline
					        - name: Wait for services to migrate
					          register: _cluster_service_poststate
					          changed_when: false
					          ansible.builtin.command:
					            cmd: !unsafe docker service ls --format '{{json .}}'
					          until: _cluster_service_poststate.stdout == _cluster_service_prestate.stdout
					          delay: 5
					          retries: 120
					        # Removes the disable flag set earlier and saves the configuration
					        - name: Enable NAT rule {{ _skylab_adguard_nat_rule }}
					          connection: ansible.netcommon.network_cli
					          delegate_to: core
					          community.network.edgeos_config:
					            save: true
					            lines:
					              - delete service nat rule {{ _skylab_adguard_nat_rule }} disable
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    # Runs only when the cluster address was found among the addresses gathered for this node
					    - name: Delete address from node
					      become: true
					      ansible.builtin.command:
					        cmd: >-
					          ip address delete
					          {{ skylab_cluster.address | ansible.netcommon.ipaddr('host/prefix') }}
					          dev {{ skylab_cluster.interface }}
					      changed_when: true
					      when: skylab_cluster.address | ansible.netcommon.ipaddr('address') in _node_addresses
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    # Executed on the alternate host; adds the target node's cluster address to the
					    # alternate host's own cluster interface unless that host already lists it
					    - name: Assign address to alt node
					      become: true
					      delegate_to: "{{ _target_alt }}"
					      ansible.builtin.command:
					        cmd: >-
					          ip address add
					          {{ skylab_cluster.address | ansible.netcommon.ipaddr('host/prefix') }}
					          dev {{ hostvars[_target_alt].skylab_cluster.interface }}
					      changed_when: true
					      when: skylab_cluster.address | ansible.netcommon.ipaddr('address') not in hostvars[_target_alt]._node_addresses
 | 
				
			||||||
		Reference in New Issue
	
	Block a user