Shell/Python 运维自动化:配置管理与批量部署的工程化实践

发布时间:2026/6/10 18:33:24

# Shell/Python 运维自动化:配置管理与批量部署的工程化实践

## 一、运维的重复劳动：100 台服务器手动配置要多久？

运维团队最常见的场景：新上线一个服务，需要在 100 台服务器上安装依赖、修改配置、重启进程。手动 SSH 到每台服务器执行命令，即使每台只需 2 分钟，100 台也要 3 个多小时。更糟糕的是，手动操作的一致性无法保证——第 47 台服务器的配置文件少了一个逗号，排查这个 typo 又花了 2 小时。

配置管理和批量部署是运维自动化的基石。Ansible、SaltStack 等工具提供了声明式的配置管理能力，但在特定场景下（如无法安装 Agent 的受限环境、需要精细控制执行逻辑的复杂部署），Shell 脚本和 Python 脚本仍然是最灵活的选择。关键在于如何将“灵活”与“可靠”结合——脚本既要有足够的控制力，又要有完善的错误处理和幂等性保证。

## 二、配置管理的架构与幂等性设计

配置管理的核心原则是幂等性——同一份配置执行多次，结果始终一致。非幂等操作（如 `echo "config" >> file`）在重复执行时会产生重复内容，导致配置错误。幂等操作（如 `lineinfile` 或“先检查再写入”）则确保最终状态与期望一致。

```mermaid
flowchart TD
    A[配置清单 YAML] --> B[配置渲染引擎<br/>Jinja2 模板]
    B --> C[目标配置文件]
    C --> D[差异比较<br/>当前 vs 目标]
    D --> E{是否有差异?}
    E -->|否| F[跳过，无需变更]
    E -->|是| G[应用变更]
    G --> H[验证变更结果]
    H --> I{验证通过?}
    I -->|是| J[记录变更日志]
    I -->|否| K[自动回滚]
    K --> L[告警通知]
    subgraph 幂等性保证
        M[检查当前状态]
        N[仅在状态不一致时变更]
        O[变更后验证]
    end
    D --> M
    G --> N
    H --> O
```

批量部署的四个阶段：

1. 预检：检查目标主机的连通性、磁盘空间、依赖版本
2. 分发：将配置文件和二进制包推送到目标主机
3. 执行：按序执行部署步骤（停止服务 → 更新文件 → 启动服务）
4. 验证：检查服务健康状态，确认部署成功

## 三、工程化运维脚本的完整实现

### 配置管理框架

```python
# config_manager.py — 声明式配置管理框架
# 设计意图：以 YAML 声明期望状态，自动计算差异并应用变更，
# 确保幂等性和可回滚性

import yaml
import hashlib
import shutil
import logging
from pathlib import Path
from dataclasses import dataclass
from typing import List, Dict, Optional
from datetime import datetime

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


@dataclass
class ConfigChange:
    """配置变更记录"""
    path: str
    action: str  # create / modify / delete
    old_hash: Optional[str]
    new_hash: str
    timestamp: datetime


class ConfigManager:
    """声明式配置管理器"""

    def __init__(self, state_dir: str = "/var/lib/config-manager"):
        self.state_dir = Path(state_dir)
        self.state_dir.mkdir(parents=True, exist_ok=True)
        self.changes: List[ConfigChange] = []

    def apply(self, manifest_path: str) -> dict:
        """应用配置清单"""
        with open(manifest_path) as f:
            manifest = yaml.safe_load(f)

        results = {
            "total": 0,
            "changed": 0,
            "unchanged": 0,
            "failed": 0,
            "details": [],
        }

        for item in manifest.get("files", []):
            results["total"] += 1
            try:
                changed = self._apply_file(item)
                if changed:
                    results["changed"] += 1
                else:
                    results["unchanged"] += 1
                results["details"].append({
                    "path": item["path"],
                    "status": "changed" if changed else "unchanged",
                })
            except Exception as e:
                results["failed"] += 1
                results["details"].append({
                    "path": item["path"],
                    "status": "failed",
                    "error": str(e),
                })

        for item in manifest.get("packages", []):
            results["total"] += 1
            try:
                changed = self._apply_package(item)
                if changed:
                    results["changed"] += 1
                else:
                    results["unchanged"] += 1
                results["details"].append({
                    "package": item["name"],
                    "status": "changed" if changed else "unchanged",
                })
            except Exception as e:
                results["failed"] += 1
                results["details"].append({
                    "package": item["name"],
                    "status": "failed",
                    "error": str(e),
                })

        return results

    def _apply_file(self, item: dict) -> bool:
        """应用文件配置（幂等）"""
        path = Path(item["path"])
        content = item["content"]
        owner = item.get("owner", "root")
        group = item.get("group", "root")
        mode = item.get("mode", "0644")

        # 计算目标内容的哈希
        target_hash = hashlib.sha256(content.encode()).hexdigest()

        # 检查当前状态
        if path.exists():
            current_content = path.read_text()
            current_hash = hashlib.sha256(current_content.encode()).hexdigest()
            if current_hash == target_hash:
                # 内容一致，无需变更
                return False
            # 内容不一致，备份后更新
            self._backup(path)
            path.write_text(content)
            old_hash = current_hash
            action = "modify"
        else:
            # 文件不存在，创建
            path.parent.mkdir(parents=True, exist_ok=True)
            path.write_text(content)
            old_hash = None
            action = "create"

        # 设置权限和属主
        import os
        os.chmod(path, int(mode, 8))
        os.chown(path, self._get_uid(owner), self._get_gid(group))

        # 记录变更
        self.changes.append(ConfigChange(
            path=str(path),
            action=action,
            old_hash=old_hash,
            new_hash=target_hash,
            timestamp=datetime.now(),
        ))
        logger.info(f"{action}: {path}")
        return True

    def _apply_package(self, item: dict) -> bool:
        """应用包配置（幂等）"""
        import subprocess
        name = item["name"]
        state = item.get("state", "present")

        # 检查包是否已安装
        check_cmd = f"dpkg -l {name} 2>/dev/null | grep -q '^ii'"
        is_installed = subprocess.call(
            check_cmd, shell=True, stdout=subprocess.DEVNULL
        ) == 0

        if state == "present" and not is_installed:
            subprocess.check_call(
                ["apt-get", "install", "-y", name],
                stdout=subprocess.DEVNULL
            )
            logger.info(f"installed: {name}")
            return True
        elif state == "absent" and is_installed:
            subprocess.check_call(
                ["apt-get", "remove", "-y", name],
                stdout=subprocess.DEVNULL
            )
            logger.info(f"removed: {name}")
            return True
        return False

    def rollback(self) -> int:
        """回滚所有变更"""
        rolled_back = 0
        for change in reversed(self.changes):
            backup_path = self._get_backup_path(change.path)
            if backup_path.exists():
                shutil.copy2(str(backup_path), change.path)
                rolled_back += 1
                logger.info(f"rolled back: {change.path}")
            elif change.action == "create":
                # 创建的文件：删除
                Path(change.path).unlink(missing_ok=True)
                rolled_back += 1
                logger.info(f"removed created file: {change.path}")
        self.changes.clear()
        return rolled_back

    def _backup(self, path: Path):
        """备份当前文件"""
        backup_path = self._get_backup_path(str(path))
        backup_path.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy2(str(path), str(backup_path))

    def _get_backup_path(self, original_path: str) -> Path:
        """获取备份文件路径"""
        safe_name = original_path.replace("/", "_")
        return self.state_dir / f"{safe_name}.bak"

    def _get_uid(self, username: str) -> int:
        """获取用户 UID"""
        import pwd
        try:
            return pwd.getpwnam(username).pw_uid
        except KeyError:
            return 0

    def _get_gid(self, groupname: str) -> int:
        """获取组 GID"""
        import grp
        try:
            return grp.getgrnam(groupname).gr_gid
        except KeyError:
            return 0
```

### 批量部署脚本

```bash
#!/bin/bash
# batch_deploy.sh — 批量部署脚本
# 设计意图：在多台目标主机上并行执行部署，包含预检、
# 分发、执行和验证四个阶段，支持滚动部署和失败回滚

set -euo pipefail

# ---- 配置 ----
HOSTS_FILE="${HOSTS_FILE:-hosts.txt}"
DEPLOY_DIR="/opt/app"
MAX_PARALLEL=10
ROLLING_BATCH=5        # 滚动部署：每批 5 台
ROLLBACK_ON_FAIL=1     # 失败时自动回滚

# ---- 日志 ----
LOG_DIR="/var/log/deploy"
mkdir -p "$LOG_DIR"
TIMESTAMP=$(date +%Y%m%d_%H%M%S)
LOG_FILE="$LOG_DIR/deploy_${TIMESTAMP}.log"

log() {
    echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*" | tee -a "$LOG_FILE"
}

# ---- 预检 ----
preflight_check() {
    local host="$1"
    log "[PREFLIGHT] Checking $host..."

    # 检查 SSH 连通性
    if ! ssh -o ConnectTimeout=5 -o BatchMode=yes "$host" "echo ok" &>/dev/null; then
        log "[PREFLIGHT] FAILED: Cannot connect to $host"
        return 1
    fi

    # 检查磁盘空间（至少 1GB）
    local disk_avail
    disk_avail=$(ssh "$host" "df -BG $DEPLOY_DIR | awk 'NR==2{print \$4}' | tr -d 'G'")
    if [[ "$disk_avail" -lt 1 ]]; then
        log "[PREFLIGHT] FAILED: $host has less than 1GB disk space"
        return 1
    fi

    # 检查当前服务状态
    local service_status
    service_status=$(ssh "$host" "systemctl is-active app-service 2>/dev/null || echo unknown")
    log "[PREFLIGHT] $host: disk=${disk_avail}G, service=$service_status"
    return 0
}

# ---- 部署单台主机 ----
deploy_host() {
    local host="$1"
    local log_prefix="[DEPLOY] $host"
    log "$log_prefix: Starting deployment..."

    # Step 1: 停止服务（优雅停止）
    if ! ssh "$host" "systemctl stop app-service" 2>/dev/null; then
        log "$log_prefix: WARN - Service stop failed, may not be running"
    fi

    # Step 2: 备份当前版本
    ssh "$host" "cp -r $DEPLOY_DIR/current $DEPLOY_DIR/previous" 2>/dev/null || true

    # Step 3: 分发新版本
    if ! scp -r ./dist/* "$host:$DEPLOY_DIR/current/"; then
        log "$log_prefix: FAILED - File distribution error"
        return 1
    fi

    # Step 4: 更新配置
    if ! ssh "$host" "cd $DEPLOY_DIR/current && python3 config_manager.py apply manifest.yml"; then
        log "$log_prefix: FAILED - Configuration error"
        # 回滚：恢复上一版本
        if [[ "$ROLLBACK_ON_FAIL" -eq 1 ]]; then
            ssh "$host" "rm -rf $DEPLOY_DIR/current && mv $DEPLOY_DIR/previous $DEPLOY_DIR/current"
        fi
        return 1
    fi

    # Step 5: 启动服务
    if ! ssh "$host" "systemctl start app-service"; then
        log "$log_prefix: FAILED - Service start error"
        return 1
    fi

    # Step 6: 健康检查（等待最多 30 秒）
    local retries=6
    local wait_sec=5
    local healthy=false
    for i in $(seq 1 $retries); do
        if ssh "$host" "curl -sf http://localhost:3000/health" &>/dev/null; then
            healthy=true
            break
        fi
        sleep $wait_sec
    done

    if [[ "$healthy" != "true" ]]; then
        log "$log_prefix: FAILED - Health check timeout"
        # 回滚
        if [[ "$ROLLBACK_ON_FAIL" -eq 1 ]]; then
            ssh "$host" "systemctl stop app-service"
            ssh "$host" "rm -rf $DEPLOY_DIR/current && mv $DEPLOY_DIR/previous $DEPLOY_DIR/current"
            ssh "$host" "systemctl start app-service"
        fi
        return 1
    fi

    log "$log_prefix: Deployment successful"
    return 0
}

# ---- 滚动部署 ----
rolling_deploy() {
    local hosts=("$@")
    local total=${#hosts[@]}
    local failed=0
    local succeeded=0

    log "Starting rolling deployment: $total hosts, batch size $ROLLING_BATCH"

    for ((i=0; i<total; i+=ROLLING_BATCH)); do
        local batch=("${hosts[@]:i:ROLLING_BATCH}")
        local batch_num=$((i / ROLLING_BATCH + 1))
        local batch_total=$(( (total + ROLLING_BATCH - 1) / ROLLING_BATCH ))
        log "Batch $batch_num/$batch_total"

        # 并行部署当前批次
        local pids=()
        for host in "${batch[@]}"; do
            deploy_host "$host" &
            pids+=($!)
        done

        # 等待当前批次完成
        local batch_failed=0
        for pid in "${pids[@]}"; do
            if ! wait $pid; then
                batch_failed=$((batch_failed + 1))
            fi
        done

        if [[ $batch_failed -gt 0 ]]; then
            log "Batch $batch_num: $batch_failed hosts failed"
            failed=$((failed + batch_failed))
            # 如果批次失败率超过 50%，停止部署
            if [[ $batch_failed -gt $((ROLLING_BATCH / 2)) ]]; then
                log "Too many failures in batch $batch_num, stopping deployment"
                break
            fi
        else
            succeeded=$((succeeded + ${#batch[@]}))
            log "Batch $batch_num: All hosts deployed successfully"
        fi
    done

    log "Deployment Summary"
    log "Total: $total, Succeeded: $succeeded, Failed: $failed"
    return $((failed > 0 ? 1 : 0))
}

# ---- 主流程 ----
main() {
    # 读取主机列表
    if [[ ! -f "$HOSTS_FILE" ]]; then
        log "ERROR: Hosts file not found: $HOSTS_FILE"
        exit 1
    fi

    mapfile -t hosts < "$HOSTS_FILE"
    log "Loaded ${#hosts[@]} hosts from $HOSTS_FILE"

    # 预检
    local valid_hosts=()
    for host in "${hosts[@]}"; do
        if preflight_check "$host"; then
            valid_hosts+=("$host")
        fi
    done

    if [[ ${#valid_hosts[@]} -eq 0 ]]; then
        log "ERROR: No valid hosts after preflight check"
        exit 1
    fi

    log "Preflight passed: ${#valid_hosts[@]}/${#hosts[@]} hosts"

    # 执行滚动部署
    rolling_deploy "${valid_hosts[@]}"
}

main "$@"
```

## 四、运维自动化的 Trade-offs

**脚本 vs 配置管理工具**：Shell/Python 脚本灵活但缺乏标准化，Ansible 等工具标准化但灵活性受限。脚本适合一次性任务和特殊场景，工具适合标准化运维。建议：核心基础设施（服务器初始化、安全加固）使用 Ansible，业务部署和特殊操作使用脚本。

**并行度与风险控制**：并行部署可以大幅缩短时间，但增加了同时出错的风险。滚动部署分批执行，在速度和风险之间取得平衡——每批部署后验证，失败则停止。建议：生产环境使用 5-10 台/批的滚动部署，测试环境可以全量并行。

**幂等性的实现成本**：确保每个操作幂等需要额外的检查逻辑（如先判断文件是否存在再写入），增加了脚本复杂度。但非幂等操作在重复执行时的调试成本远高于幂等性的实现成本。建议：对所有配置文件操作实现幂等性，对不可逆操作（如数据库迁移）实现显式的确认机制。

**回滚的完整性**：文件回滚相对简单（恢复备份），但数据库变更和服务状态回滚更复杂。一个已经执行的 ALTER TABLE 无法简单回滚，一个已经发送的通知无法撤回。需要在部署计划中为每个步骤定义回滚操作，并在自动化脚本中实现。

## 五、总结

Shell/Python 运维自动化通过声明式配置管理和滚动批量部署，将运维操作从“手动逐台执行”推向“自动化批量处理”。核心原则是幂等性（同一操作多次执行结果一致）和滚动部署（分批执行降低风险）。但脚本与工具的选择、并行度与风险控制的平衡、幂等性的实现成本和回滚的完整性，是需要权衡的因素。

在实际落地中，建议：核心基础设施使用 Ansible 等工具标准化管理，业务部署使用脚本实现精细控制，所有操作实现幂等性保证，生产环境使用滚动部署策略。运维自动化的目标不是消除人工，而是让机器做重复的事，让人做决策的事。

相关新闻