生产实践:
调用Ansible来摘流/上线 nginx upstream机器
学习技巧:
Shell、Sed、Ansible、Pssh使用
脚本内容:
最近在搞成本优化,需要大量下线、缩容降配机器,由于代理没有平台来管理,多机房,多环境摘流,上下线操作比较麻烦,想搞个脚本小工具去nginx代理层批量摘流,上线后端upstream机器,代替大量重复性的人肉手工操作。工具需要具有灰度,预览,备份回滚,前后check,md5值显示等功能,尽量满足幂等性、健壮性,工具不能同时执行,vhosts文件也不能同时操作,想想还是用ansible来实现比较实用,试了下replace模块,但是正则那块一直报错,后面直接使用sed简单粗暴,也能满足需求,不算上重复代码一共300多行,完事。线上四五十台代理梭哈没有出现问题,保住了狗命。
使用说明:
脚本使用:
使用:输入包含代理主机名或IP的文件,待上下线的主机ip、执行操作[off:摘流,on:上线]、操作人标识
Usage: op_upstream_host.sh <inventory file> <ip> [off/on] <operator>
摘流:sh op_upstream_host.sh proxy-test 10.x.x.x off azh
上线:sh op_upstream_host.sh proxy-test 10.x.x.x on azh
op_upstream_host.sh:主脚本文件
#!/bin/bash
#
# op_upstream_host - this script use for offline and online the nginx upstream host
#
# create by anzhihe@foxmail.com 20220505
#
# Positional arguments, captured once so that every function can read them:
#   $1  inventory file listing the nginx proxy hosts
#   $2  IPv4 address of the upstream host to operate on
#   $3  operation keyword, "off" or "on"
#   $4  operator tag passed on to the playbooks
PROXY_FILE="${1}"
IPV4="${2}"
OPERATE="${3}"
OPERATOR="${4}"
# Lock file consulted by main() to refuse concurrent runs.
LOCK_FILE='/var/run/op_upstream_host.lock'
#ansible_inventory="/home/anzhihe/app/python27/bin/ansible-inventory"
#ansible_playbook="/home/anzhihe/app/python27/bin/ansible-playbook"
#
# Set Colors and Error Codes
#
#######################################
# Define the terminal style codes and the exit codes used by the script.
# Globals (written): bold underline reset red green white tan blue yellow
#                    purple E_NOT_EXIST E_NOT_IPV4 E_NOT_OFF_ON E_IS_EMPTY
#                    E_BAD_ARGUS
# Arguments: none
#######################################
set_const() {
  # Style codes. tput complains on stderr when TERM is unset or unknown
  # (cron, CI, dumb terminals); silence that and fall back to an empty
  # string so the log helpers simply print plain text.
  bold=$(tput bold 2>/dev/null) || bold=''
  underline=$(tput sgr 0 1 2>/dev/null) || underline=''
  reset=$(tput sgr0 2>/dev/null) || reset=''
  red=$(tput setaf 1 2>/dev/null) || red=''
  green=$(tput setaf 76 2>/dev/null) || green=''
  white=$(tput setaf 7 2>/dev/null) || white=''
  tan=$(tput setaf 202 2>/dev/null) || tan=''
  blue=$(tput setaf 25 2>/dev/null) || blue=''
  yellow=$(tput setaf 11 2>/dev/null) || yellow=''
  purple=$(tput setaf 127 2>/dev/null) || purple=''

  # Exit codes, one per argument-validation failure.
  E_NOT_EXIST=11
  E_NOT_IPV4=12
  E_NOT_OFF_ON=13
  E_IS_EMPTY=14
  E_BAD_ARGUS=15
}
#
# Headers and Logging
#
# Print every argument on its own line, underlined and bold.
underline() {
  local fmt="${underline}${bold}%s${reset}\n"
  printf "${fmt}" "$@"
}
# Print every argument as a level-1 heading: a blank line, then the text
# styled with the underline, bold and blue codes.
h1() {
  local fmt="\n${underline}${bold}${blue}%s${reset}\n"
  printf "${fmt}" "$@"
}
# Print every argument as a level-2 heading: a blank line before, the text
# styled with the underline, bold and white codes, and a blank line after.
h2() {
  local fmt="\n${underline}${bold}${white}%s${reset}\n\n"
  printf "${fmt}" "$@"
}
# Print every argument on its own line using the white style code.
debug() {
  local fmt="${white}%s${reset}\n"
  printf "${fmt}" "$@"
}
# Print every argument as an informational line: white style, "➜ " prefix.
info() {
  local fmt="${white}➜ %s${reset}\n"
  printf "${fmt}" "$@"
}
# Print every argument as a success line: green style, "✔ " prefix.
success() {
  local fmt="${green}✔ %s${reset}\n"
  printf "${fmt}" "$@"
}
# Print every argument as an error line: "tan" style code, "✖ " prefix.
# Output goes to stdout, like the other log helpers.
error() {
  local fmt="${tan}✖ %s${reset}\n"
  printf "${fmt}" "$@"
}
# Print every argument as a warning line: yellow style, "➜ " prefix.
warn() {
  local fmt="${yellow}➜ %s${reset}\n"
  printf "${fmt}" "$@"
}
# Print every argument on its own line using the bold style code.
bold() {
  local fmt="${bold}%s${reset}\n"
  printf "${fmt}" "$@"
}
# Print every argument as a note: a blank line, a styled "Note:" label,
# then the text in the blue style code.
note() {
  local fmt="\n${underline}${bold}${blue}Note:${reset} ${blue}%s${reset}\n"
  printf "${fmt}" "$@"
}
# Print the command-line help text to stdout. The first line embeds the
# name the script was invoked with ($0).
usage() {
  printf '%s\n' \
    "Usage: $0 <inventory file> <host ipv4> [off/on] <operator>" \
    'example:' \
    '摘流:sh op_upstream_host.sh proxy-test 10.110.112.119 off azh' \
    '上线:sh op_upstream_host.sh proxy-test 10.110.112.119 on azh' \
    '-h, --help display this help and exit'
}
# Print the help text, then terminate the script with the given exit
# status. If printing the help fails, that failure status is returned to
# the caller instead and the script keeps running.
usage_exit() {
  usage || return
  exit "$@"
}
#######################################
# Validate the command line; print help and exit on the first problem.
# Globals (read): PROXY_FILE IPV4 OPERATE OPERATOR and the E_* exit codes
# Arguments: the script's positional parameters ("$@")
# Returns:   0 when every check passes; otherwise the script exits through
#            usage_exit with the matching E_* code (0 for -h/--help/empty)
#######################################
check_arguments() {
  if [[ "$1" == "-h" || "$1" == "--help" || -z "$1" ]]; then
    usage_exit 0
  fi

  if [[ ! -f "${PROXY_FILE}" ]]; then
    warn "请输入包含代理主机名的文件!"
    usage_exit "${E_NOT_EXIST}"
  fi

  # check_ipv4 uses an inverted convention: status 0 means "NOT valid".
  if check_ipv4 "${IPV4}"; then
    warn "请输入合法的IPV4地址!"
    usage_exit "${E_NOT_IPV4}"
  fi

  if [[ "${OPERATE}" != "off" && "${OPERATE}" != "on" ]]; then
    # "on" brings a host online; the message used to say "offline" here.
    warn "摘流请输入off,上线请输入on!"
    usage_exit "${E_NOT_OFF_ON}"
  fi

  if [[ -z "${OPERATOR}" ]]; then
    warn "操作人不能设置为空!"
    usage_exit "${E_IS_EMPTY}"
  fi

  if (( $# != 4 )); then
    h2 "您输入的参数不符合规范,请查看使用说明。"
    usage_exit "${E_BAD_ARGUS}"
  fi

  # Explicit success: the last test above is false for valid input and
  # would otherwise leak a status of 1 to the caller.
  return 0
}
#######################################
# Test whether the argument is a dotted-quad IPv4 address.
# NOTE: the return convention is inverted and is relied on by
# check_arguments: 1 means "valid IPv4", 0 means "NOT valid".
# Arguments: $1 - candidate address (extra arguments are joined by spaces
#                 and therefore make the candidate invalid)
# Returns:   1 if the whole candidate is a valid IPv4 address, else 0
#######################################
check_ipv4() {
  local candidate="$*"
  local octet='(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)'
  # Anchored on both ends so that trailing or leading junk such as
  # "1.2.3.4.5" or "x 1.2.3.4" is rejected; a word-boundary search would
  # accept any string that merely contains an address.
  local ipv4_re="^${octet}(\.${octet}){3}\$"

  if [[ "${candidate}" =~ ${ipv4_re} ]]; then
    return 1
  fi
  return 0
}
#######################################
# Announce the operation and list the hosts it will run against.
# Globals (read): IPV4, inventory, the colour variables, and optionally
#                 ansible_inventory (path of the ansible-inventory binary;
#                 defaults to the install under /home/anzhihe)
# Arguments: $@ - label of the operation, embedded in the banner
# Returns:   non-zero when the inventory listing fails, so callers can
#            chain the playbook run with &&
#######################################
prompt() {
  local inventory_bin="${ansible_inventory:-/home/anzhihe/app/python27/bin/ansible-inventory}"

  # "$*" joins the label into one word; "$@" inside a string would split
  # the banner into several echo arguments.
  echo -e "\n${yellow}$(date +'%F %T') ➡ 正在执行 [${IPV4}] 机器$*的操作${reset}"
  echo -e "${underline}${red}!!! 灰度执行后一定要检查再全量,一定要遵守机器上下线流程${reset}\n"
  echo -e "${green}执行任务的主机列表: ${reset}"

  # Quoted so an inventory path containing spaces stays one argument.
  # The pause gives the operator time to read the host list.
  "${inventory_bin}" --graph -i "${inventory}" all && sleep 3
}
#######################################
# Dispatch to the offline or online run.
# Arguments: $1 - "off" or "on"
# Returns:   the status of the selected run; 1 for any other keyword.
#            (Two chained `[[ ]] &&` tests used to make a successful "off"
#            return 1, which silently skipped the md5 report in main.)
#######################################
op_upstream_host() {
  case "$1" in
    off) offline_upstream_host ;;
    on) online_upstream_host ;;
    *) return 1 ;;
  esac
}

#######################################
# Show the banner, then run one playbook against the inventory.
# Globals (read): inventory IPV4 OPERATOR, optionally ansible_playbook
#                 (path of the ansible-playbook binary; defaults to the
#                 install under /home/anzhihe)
# Arguments: $1 - banner label, $2 - playbook file
# Returns:   status of prompt, or of ansible-playbook when prompt succeeds
#######################################
_run_upstream_playbook() {
  local label=$1
  local playbook=$2
  local playbook_bin="${ansible_playbook:-/home/anzhihe/app/python27/bin/ansible-playbook}"

  prompt "${label}" \
    && "${playbook_bin}" -i "${inventory}" \
      --extra-vars="ip=${IPV4} sre=${OPERATOR}" "${playbook}"
}

# Take the host out of rotation.
offline_upstream_host() {
  _run_upstream_playbook "摘流" offline_upstream_host.yml
}

# Put the host back into rotation.
online_upstream_host() {
  _run_upstream_playbook "上线" online_upstream_host.yml
}

#######################################
# Print the vhost.md5 file of every proxy host through pssh.
# Arguments: $1 - host file handed to `pssh -h`
#######################################
get_vhosts_md5() {
  echo -e "TASK [vhosts.md5结果] *************************************************************"
  pssh -h "$@" -o /home/anzhihe/test/md5/ --inline-stdout "cat /home/anzhihe/app/nginx/conf/vhosts_bak/vhost.md5"
}

#######################################
# Entry point: validate arguments, take the run lock, execute the
# requested operation and print the md5 report.
# Globals (read): LOCK_FILE PROXY_FILE OPERATE; (written): inventory
# Arguments: the script's positional parameters ("$@")
# Returns:   0 on success; the failing step's status otherwise; 1 when
#            the lock cannot be taken
#######################################
main() {
  local rc=0

  set_const
  check_arguments "$@"

  # Take the lock atomically: with noclobber the redirection fails if the
  # file already exists, so two runs cannot both pass a separate
  # "exists?" test and then both create it.
  if ! ( set -o noclobber; printf '%s\n' "$$" > "${LOCK_FILE}" ) 2>/dev/null; then
    if [[ -e "${LOCK_FILE}" ]]; then
      echo "$0 is already running"
      printf 'lock file: %s (remove it if no other run is active)\n' \
        "${LOCK_FILE}" >&2
    else
      printf 'cannot create lock file: %s\n' "${LOCK_FILE}" >&2
    fi
    return 1
  fi

  # Installed only once the lock is ours, so no exit path can delete a
  # lock that belongs to another process.
  trap 'rm -f -- "${LOCK_FILE}"' EXIT
  trap 'exit 130' INT
  trap 'exit 143' TERM

  inventory=$(readlink -f -- "${PROXY_FILE}")
  if op_upstream_host "${OPERATE}"; then
    get_vhosts_md5 "${inventory}" || rc=$?
  else
    rc=$?
  fi

  rm -f -- "${LOCK_FILE}"
  trap - INT TERM EXIT
  return "${rc}"
}
main "$@"
offline_upstream_host.yml:摘流依赖的yml文件
# Play that takes one upstream host out of rotation on the nginx proxies of
# the inventory. The tasks of this play reference two extra-vars supplied on
# the command line: `ip` (the upstream host) and `sre` (the operator tag).
- hosts: all
remote_user: root
#serial: 1 # run against one host at a time
# Batch sizes for successive rounds: 1 host, then 3, then 5, then 15% of the hosts.
serial: [1,3,5,15%] # staged (canary) rollout
# A failure on any host stops the play for every host.
any_errors_fatal: true
gather_facts: no
vars:
# Directory containing the live nginx vhost *.conf files.
- vhosts_conf_path: "/home/anzhihe/app/nginx/conf/vhosts"
# Directory that receives the pre-change copies used for rollback.
- vhosts_bak_path: "/home/anzhihe/app/nginx/conf/vhosts_bak"
# File that collects the md5sum of every changed vhost file.
- vhosts_md5_file: "/home/anzhihe/app/nginx/conf/vhosts_bak/vhost.md5"
# "yes"/"no" switches tested by the `when:` clauses of the tasks.
# NOTE(review): dongxreload is not referenced by any task or handler of this play.
- getvhosts: "yes"
- doprev: "yes"
- dongxcheck: "yes"
- dongxreload: "yes"
- dorollback: "yes"
tasks:
# List the *.conf files under vhosts_conf_path that mention {{ip}} as a whole
# word; the file names end up, one per line, in vhosts.stdout_lines.
# NOTE(review): grep treats the dots of the IP as regex wildcards.
- name: "获取待摘流主机vhosts文件"
shell: cd {{vhosts_conf_path}} && grep -r -w {{ip}} *.conf | awk -F":" '{print $1}' | uniq
when: getvhosts == "yes"
register: vhosts
args:
warn: false
# Show the matched file names.
- name: "获取到的vhosts conf文件"
debug:
msg: "{{vhosts.stdout_lines}}"
when: vhosts is success
# Fail when a matched file already carries an "offline_host" marker, i.e. an
# earlier offline operation on that file has not been brought back online.
# Runs only on the first host of the 'ungrouped' group.
- name: "检测vhosts文件是否被占用"
shell: grep -q -w "offline_host" {{vhosts_conf_path}}/{{item}} && echo "in-use"
loop: "{{vhosts.stdout_lines}}"
# NOTE(review): item is a file name (string) compared with the integer 0;
# confirm this comparison is valid on the Python/Jinja2 version in use.
when: item > 0 and inventory_hostname == groups['ungrouped'][0]
register: checkused
# failed_when replaces the default rc-based failure test, so a grep that
# finds nothing (non-zero rc) does not fail the task.
failed_when: "'in-use' in checkused.stdout "
# Dry run: sed without -i prints the marker line that would be inserted and
# the `server {{ip}}:` lines as they would look once commented out; the file
# itself is not modified.
- name : "执行摘流预览变更"
shell: "sed -n -e \"/^.*server[[:blank:]]\\+\\<{{ip}}\\>:.*/i ### ⬇ offline_host `date +'%Y%m%d'` {{sre}}\" -e '/^.*server[[:blank:]]\\+\\<{{ip}}\\>:.*/s/^/#/gp' {{vhosts_conf_path}}/{{item}}"
#shell: "sed -n -e \"/^[[:blank:]]*server[[:blank:]]\\+\\<{{ip}}\\>:.*/i ### ⬇ offline_host `date +'%Y/%m/%d %H:%M:%S'`\" -e '/^[[:blank:]]*server[[:blank:]]\\+\\<{{ip}}\\>:.*/s/^/#/gp' {{vhosts_conf_path}}/{{item}}"
loop: "{{vhosts.stdout_lines}}"
when: doprev == "yes"
register: prevres
args:
warn: false
# Print the dry-run output, labelled per file.
- name: "预览摘流变更结果"
debug:
msg: "{{item.stdout_lines}}"
loop: "{{prevres.results}}"
loop_control:
label: "{{item.item}}"
when: prevres is success and item > 0
# Wait for the operator to confirm the preview; evaluated on the first host
# of the 'ungrouped' group only.
- name: " ⬆预览摘流结果"
pause:
prompt: "请检察预览结果是否符合预期,没有问题请按回车 [Enter] 继续执行, [CTRL + C] 终止执行"
when: inventory_hostname == groups['ungrouped'][0]
# Change block: back up, edit, validate, reload, record checksums. A failure
# in any of these tasks hands control to the rescue section of this block.
- block:
# Make sure the backup directory exists.
- name: "创建vhost_bak备份文件夹"
file:
path: "{{vhosts_bak_path}}"
state: directory
owner: root
group: root
mode: 0775
# Make sure the md5 record file exists.
- name: "创建vhost.md5备份文件"
file:
path: "{{vhosts_md5_file}}"
state: touch
owner: root
group: root
mode: 0775
# Copy each affected vhost file into the backup directory (remote to remote).
# changed_when forces "changed" so the later `copyres.changed` tests pass
# even when an identical backup is already there.
- name: "备份变更vhosts文件"
copy: src={{vhosts_conf_path}}/{{item}} dest={{vhosts_bak_path}}/ owner=anzhihe group=anzhihe mode=0755 backup=yes remote_src=yes
loop: "{{vhosts.stdout_lines}}"
when: item > 0
changed_when: true
register: copyres
# Report the md5sum returned by each copy.
- name: "备份返回结果"
debug:
msg: "{{item.item}}备份成功 - md5sum:{{item.md5sum}}"
when: copyres is success
loop: "{{copyres.results}}"
loop_control:
label: "{{item.item}}"
# Edit in place: insert a `### ⬇ offline_host <YYYYMMDD> <sre>` marker line
# above every `server {{ip}}:` line and prefix that line with `#`. sed keeps
# the pre-edit file next to the original with the suffix
# `.bakoff_<sre>_<timestamp>`.
# NOTE(review): the pattern starts with `^.*`, so a `server` line that is
# already commented out matches as well — confirm that is intended.
- name: "注释下线主机"
#shell: "sed -i.bak -e \"/^[[:blank:]]*server[[:blank:]]\\+\\<{{ip}}\\>:.*/i ### ⬇ offline_host `date +'%Y/%m/%d %H:%M:%S'`\" -e '/^[[:blank:]]*server[[:blank:]]\\+\\<{{ip}}\\>:.*/s/^/#/g' {{vhosts_conf_path}}/{{item}}"
shell: "sed -i.bakoff_{{sre}}_`date +'%Y%m%d%H%M%S'` -e \"/^.*server[[:blank:]]\\+\\<{{ip}}\\>:.*/i ### ⬇ offline_host `date +'%Y%m%d'` {{sre}}\" -e '/^.*server[[:blank:]]\\+\\<{{ip}}\\>:.*/s/^/#/g' {{vhosts_conf_path}}/{{item}}"
loop: "{{vhosts.stdout_lines}}"
when: copyres.changed and item > 0
register: offres
args:
warn: false
# Report the end time of each edit.
- name: "注释返回结果"
debug:
msg: "{{item.end}} - {{item.item}} 机器注释成功"
when: offres is success
loop: "{{offres.results}}"
loop_control:
label: "{{item.item}}"
# Validate the resulting configuration; a non-zero exit fails the task and
# triggers the rescue section. On success the reload handler is notified.
- name: "nginx配置检察"
shell: /home/anzhihe/app/nginx/sbin/nginx -t
when: copyres.changed and dongxcheck == "yes"
register: checkres
notify: reload nginx
# Show the output of `nginx -t` (read from stderr_lines).
- name: "nginx配置检察结果"
debug:
msg: "nginx检察返回:{{checkres.end}} - {{checkres.stderr_lines}}"
when: checkres.rc == 0
#ignore_errors: yes
# - debug:
# msg: "nginx配置检察失败"
# when: checkres is failure
# - debug:
# msg: "nginx配置没有变更"
# when: checkres is skip
# Run the notified reload handler now instead of at the end of the play.
# NOTE(review): confirm the Ansible version in use honours `when` on
# `meta: flush_handlers`.
- meta: flush_handlers
when: checkres.changed and checkres.rc == 0
# Truncate the md5 record, then append one md5sum line per changed file.
- name: "清空vhosts.md5"
shell: ">{{vhosts_md5_file}}"
- name: "获取vhost文件md5值"
shell: "cd {{vhosts_conf_path}} && md5sum {{item}} >> {{vhosts_md5_file}}"
loop: "{{vhosts.stdout_lines}}"
when: item > 0
# Rescue section: runs when a task of the block above fails.
rescue:
# Copy every backed-up vhost file over the live one.
# NOTE(review): no nginx reload is issued in this section after the restore;
# confirm that is intended.
- name: "测试失败,执行回滚操作"
copy: src={{vhosts_bak_path}}/{{item}} dest={{vhosts_conf_path}}/{{item}} owner=anzhihe group=anzhihe mode=0755 remote_src=yes
loop: "{{vhosts.stdout_lines}}"
when: dorollback == "yes"
register: rollbackres
# Report the checksum of each restored file.
- name: "nginx回滚结果"
debug:
msg: "{{item.item}}回滚成功!- checksum:{{item.checksum}}"
when: rollbackres is success
loop: "{{rollbackres.results}}"
loop_control:
label: "{{item.item}}"
# Report a successful reload. reloadres is registered by the reload handler;
# errors from this task are ignored (ignore_errors).
- name: "nginx reload结果"
debug:
msg: "nginx reload成功"
when: reloadres.rc == 0
ignore_errors: yes
# Gate between batches: wait for the operator to confirm before the play
# moves on. Evaluated on the first host of the 'ungrouped' group only.
- name: "!!!摘流灰度检查"
pause:
prompt: " ⬇请检察摘流灰度执行是否有问题,没有问题请按回车 [Enter] 全量执行,慎重!"
# run_once: true
when: inventory_hostname == groups['ungrouped'][0]
handlers:
# Reload nginx; notified by the `nginx -t` task and executed only when that
# check returned 0.
- name: "reload nginx"
shell: /home/anzhihe/app/nginx/sbin/nginx -s reload
when: checkres.rc == 0
register: reloadres
online_upstream_host.yml:上线依赖的yml文件
# Play that puts one upstream host back into rotation on the nginx proxies of
# the inventory. The tasks of this play reference two extra-vars supplied on
# the command line: `ip` (the upstream host) and `sre` (the operator tag).
- hosts: all
remote_user: root
#serial: 1 # run against one host at a time
# Batch sizes for successive rounds: 1 host, then 3, then 5, then 15% of the hosts.
serial: [1,3,5,15%]
# A failure on any host stops the play for every host.
any_errors_fatal: true
gather_facts: no
vars:
# Directory containing the live nginx vhost *.conf files.
- vhosts_conf_path: "/home/anzhihe/app/nginx/conf/vhosts"
# Directory that receives the pre-change copies used for rollback.
- vhosts_bak_path: "/home/anzhihe/app/nginx/conf/vhosts_bak"
# File that collects the md5sum of every changed vhost file.
- vhosts_md5_file: "/home/anzhihe/app/nginx/conf/vhosts_bak/vhost.md5"
# "yes"/"no" switches tested by the `when:` clauses of the tasks.
# NOTE(review): dongxreload is not referenced by any task or handler of this play.
- getvhosts: "yes"
- doprev: "yes"
- dongxcheck: "yes"
- dongxreload: "yes"
- dorollback: "yes"
tasks:
# List the *.conf files under vhosts_conf_path that mention {{ip}} as a whole
# word; the file names end up, one per line, in vhosts.stdout_lines.
# NOTE(review): grep treats the dots of the IP as regex wildcards.
- name: "获取待上线主机vhosts文件"
shell: cd {{vhosts_conf_path}} && grep -r -w {{ip}} *.conf | awk -F":" '{print $1}' | uniq
when: getvhosts == "yes"
register: vhosts
args:
warn: false
# Show the matched file names.
- name: "获取到的vhosts conf文件"
debug:
msg: "{{vhosts.stdout_lines}}"
when: vhosts is success
# Dry run, nothing is written. The inline `delmark` shell function
#   1. prints every "offline_host" marker line together with the line after it,
#   2. prints a separator text,
#   3. takes the first distinct marker line (after sort) and prints the lines
#      matching it plus the lines containing {{ip}} with a leading `#` removed.
- name : "执行上线预览变更"
#shell: "sed -n -e \"/^.*server[[:blank:]]\\+\\<{{ip}}\\>:.*/i ### ⬇ offline_host `date +'%Y%m%d %H:%M:%S'`\" -e '/^.*server[[:blank:]]\\+\\<{{ip}}\\>:.*/s/^#//gp' {{vhosts_conf_path}}/{{item}}"
shell: "delmark() { sed -n '/offline_host/,+1p' $@;echo '<<<<<<< 下线注释预览,如有他人下线标识请沟通后再操作!!! <<<<<<<\n---\n>>>>>>> 上线后结果预览';sed -n '/offline_host/p' $@|sort|uniq|head -1|xargs -r -i bash -c \"sed -n -e '/{}/p' -e '/\\<{{ip}}\\>/s/^#//gp' $@\"; };delmark {{vhosts_conf_path}}/{{item}}"
loop: "{{vhosts.stdout_lines}}"
when: doprev == "yes"
register: prevres
args:
warn: false
# Print the dry-run output, labelled per file.
# NOTE(review): item is compared with the integer 0 here; confirm this
# comparison is valid on the Python/Jinja2 version in use.
- name: "预览上线变更结果"
debug:
msg: "{{item.stdout_lines}}"
loop: "{{prevres.results}}"
loop_control:
label: "{{item.item}}"
when: prevres is success and item > 0
# Wait for the operator to confirm the preview; evaluated on the first host
# of the 'ungrouped' group only.
- name: " ⬆预览上线结果"
pause:
prompt: "请检察上线预览结果是否符合预期,没有问题请按回车 [Enter] 继续执行, [CTRL + C] 终止执行"
when: inventory_hostname == groups['ungrouped'][0]
# Change block: back up, edit, validate, reload, record checksums. A failure
# in any of these tasks hands control to the rescue section of this block.
- block:
# Make sure the backup directory exists.
- name: "创建vhost_bak备份文件夹"
file:
path: "{{vhosts_bak_path}}"
state: directory
owner: root
group: root
mode: 0775
# Make sure the md5 record file exists.
- name: "创建vhost.md5备份文件"
file:
path: "{{vhosts_md5_file}}"
state: touch
owner: root
group: root
mode: 0775
# Copy each affected vhost file into the backup directory (remote to remote).
# changed_when forces "changed" so the later `copyres.changed` tests pass
# even when an identical backup is already there.
- name: "备份变更vhosts文件"
copy: src={{vhosts_conf_path}}/{{item}} dest={{vhosts_bak_path}}/{{item}} owner=anzhihe group=anzhihe mode=0755 backup=yes remote_src=yes
loop: "{{vhosts.stdout_lines}}"
when: item > 0
changed_when: true
register: copyres
# Report the md5sum returned by each copy.
- name: "备份返回结果"
debug:
msg: "{{item.item}}备份成功 - md5sum:{{item.md5sum}}"
when: copyres is success
loop: "{{copyres.results}}"
loop_control:
label: "{{item.item}}"
# Edit in place: take the first distinct "offline_host" marker line (after
# sort), delete the lines matching it and strip the leading `#` from lines
# containing {{ip}}. sed keeps the pre-edit file next to the original with
# the suffix `.bakon_<sre>_<timestamp>`.
# NOTE(review): only one marker text is handled per file (head -1), and the
# marker text is used as a sed regex — verify behaviour when a file carries
# several different markers.
- name: "删除待上线主机注释"
shell: "delmark() { sed -n '/offline_host/p' $@|sort|uniq|head -1|xargs -r -i bash -c \"sed -i.bakon_{{sre}}_`date +'%Y%m%d%H%M%S'` -e '/{}/d' -e '/\\<{{ip}}\\>/s/^#//g' $@\"; };delmark {{vhosts_conf_path}}/{{item}}"
loop: "{{vhosts.stdout_lines}}"
when: copyres.changed and item > 0
register: offres
args:
warn: false
# Report the end time of each edit.
- name: "取消注释返回结果"
debug:
msg: "{{item.end}} - {{item.item}} 取消注释成功"
when: offres is success
loop: "{{offres.results}}"
loop_control:
label: "{{item.item}}"
# Validate the resulting configuration; a non-zero exit fails the task and
# triggers the rescue section. On success the reload handler is notified.
- name: "nginx配置检察"
shell: /home/anzhihe/app/nginx/sbin/nginx -t
when: copyres.changed and dongxcheck == "yes"
register: checkres
notify: reload nginx
# Show the output of `nginx -t` (read from stderr_lines).
- name: "nginx配置检察结果"
debug:
msg: "nginx检察返回:{{checkres.end}} - {{checkres.stderr_lines}}"
when: checkres.rc == 0
#ignore_errors: yes
# - debug:
# msg: "nginx配置检察失败"
# when: checkres is failure
# - debug:
# msg: "nginxs配置没有变更"
# when: checkres is skip
# Run the notified reload handler now instead of at the end of the play.
# NOTE(review): confirm the Ansible version in use honours `when` on
# `meta: flush_handlers`.
- meta: flush_handlers
when: checkres.changed and checkres.rc == 0
# Truncate the md5 record, then append one md5sum line per changed file.
- name: "清空vhosts.md5"
shell: ">{{vhosts_md5_file}}"
- name: "获取vhost文件md5值"
shell: "cd {{vhosts_conf_path}} && md5sum {{item}} >> {{vhosts_md5_file}}"
loop: "{{vhosts.stdout_lines}}"
when: item > 0
# Rescue section: runs when a task of the block above fails.
rescue:
# Copy every backed-up vhost file over the live one.
# NOTE(review): no nginx reload is issued in this section after the restore;
# confirm that is intended.
- name: "测试失败,执行回滚操作"
copy: src={{vhosts_bak_path}}/{{item}} dest={{vhosts_conf_path}}/{{item}} owner=anzhihe group=anzhihe mode=0755 remote_src=yes
loop: "{{vhosts.stdout_lines}}"
when: dorollback == "yes"
register: rollbackres
# Report the checksum of each restored file.
- name: "nginx回滚结果"
debug:
msg: "{{item.item}}回滚成功!- checksum:{{item.checksum}}"
when: rollbackres is success
loop: "{{rollbackres.results}}"
loop_control:
label: "{{item.item}}"
# Report a successful reload. reloadres is registered by the reload handler;
# errors from this task are ignored (ignore_errors).
- name: "nginx reload结果"
debug:
msg: "nginx reload成功"
when: reloadres.rc == 0
ignore_errors: yes
# Gate between batches: wait for the operator to confirm before the play
# moves on. Evaluated on the first host of the 'ungrouped' group only.
- name: "!!!上线灰度检查"
pause:
prompt: " ⬇请检察上线灰度执行是否有问题,没有问题请按回车 [Enter] 全量执行,慎重!"
# run_once: true
when: inventory_hostname == groups['ungrouped'][0]
handlers:
# Reload nginx; notified by the `nginx -t` task and executed only when that
# check returned 0.
- name: "reload nginx"
shell: /home/anzhihe/app/nginx/sbin/nginx -s reload
when: checkres.rc == 0
register: reloadres
备份文件:
代理机器变更vhosts备份回滚目录: /home/anzhihe/app/nginx/conf/vhosts_bak
代理机器变更vhosts备份文件:/home/anzhihe/app/nginx/conf/vhosts/目录下以 .bak[on/off]_操作标识_时间戳 为后缀文件
代理机器变更vhosts md5备份文件:/home/anzhihe/app/nginx/conf/vhosts_bak/vhost.md5
变更vhosts本地md5变更记录文件:/home/anzhihe/test/md5/代理机器名或ip
演示效果:
1、第一台代理运行时会列出要变更的代理IP的变更效果预览
2、第一台代理机器运行完会暂停提示灰度检查,可以上机器上检查变更是否符合预期,然后全量执行
3、nginx -t 检查失败后,会自动进行回滚
4、全量后显示执行结果,输出变更vhosts的md5值
5、操作前后可以手动校验摘流下线机器状态
pssh -h proxy-test -P 'grep -r -w "摘流上线主机ip" /home/anzhihe/app/nginx/conf/vhosts/*' | grep -v FAILURE | grep -v SUCCESS | grep -v bak
注意事项:
1、执行是灰度分步执行,代理机器少也可以改为串行,中间如果出错会回滚abort,此时为了保持所有代理机器配置一致性,最好把之前执行完和这台出错的机器注释掉后再继续执行
2、一次摘流操作对应一次上线操作,同人操作尽快完成,中间不要停滞,不要给自己留坑,把风险降到最低
3、由于服务存在混部的情况,不能多人同时执行,且在上下线过程中vhosts文件不能变更,需错开执行
4、注意接口存在单点机器(单点注释了会回滚其实还好!),流量有没有完全摘除的情况(机器可能还有其它流量)
5、预览、灰度检查一定要认真对待,操作的同时关注报警和异常反馈,有问题直接终止脚本止损
6、会在下线主机上方添加一行「### ⬇ offline_host 日期(YYYYMMDD) 操作人」来标识下线操作,操作完成后注意vhosts变更文件的md5值是否有不一致的情况
改进优化:
☑️ 脚本不允许同时执行,同时会对传入参数进行校验,加强工具的健壮性
☑️ 输出变更文件md5值,摘流注释加当前执行人标识,提供操作前后结果预览
☑️ 解决多人同时操作同一个vhosts文件的情况,不采用人为约束,人肉操作难免犯错
参考:



