Kingbase运维日常:用ksql命令行搞定数据库监控、备份与日志分析的5个脚本

发布时间:2026/5/28 9:58:00

Kingbase运维日常:用ksql命令行搞定数据库监控、备份与日志分析的5个脚本 Kingbase运维实战5个高效脚本解决监控、备份与日志分析难题每天早上打开终端面对几十个Kingbase实例的运维工作你是否也感到力不从心数据库连接数突然飙升、备份是否成功无从验证、慢查询日志堆积如山...这些问题如果全靠手动操作不仅效率低下还容易遗漏关键指标。本文将分享5个经过生产环境验证的ksql脚本帮你把日常运维工作自动化节省至少70%的操作时间。1. 实时连接数监控与告警脚本数据库连接数暴增是导致性能问题的常见原因。这个脚本不仅能实时监控连接数还能在超过阈值时自动触发告警。#!/bin/bash # 配置数据库连接参数 DB_HOSTlocalhost DB_PORT54321 DB_USERsystem DB_NAMEtest MAX_CONNECTIONS100 # 获取当前连接数 CONN_COUNT$(ksql -h $DB_HOST -p $DB_PORT -U $DB_USER -d $DB_NAME -t -c SELECT count(*) FROM sys_stat_activity;) # 判断并发送告警 if [ $CONN_COUNT -gt $MAX_CONNECTIONS ]; then echo 警告数据库连接数已达 ${CONN_COUNT}超过阈值 ${MAX_CONNECTIONS} | mail -s Kingbase连接数告警 adminexample.com fi # 输出监控结果 echo $(date %Y-%m-%d %H:%M:%S), 当前连接数: $CONN_COUNT /var/log/kingbase_connections.log关键参数说明-t参数只输出数据不显示列名sys_stat_activityKingbase系统视图记录所有活动连接建议通过crontab设置每5分钟执行一次实际使用中我发现这个脚本有几点可以优化添加连接来源分析SELECT client_addr, count(*) FROM sys_stat_activity GROUP BY client_addr;区分活跃/空闲连接SELECT state, count(*) FROM sys_stat_activity GROUP BY state;2. 自动化备份验证方案备份文件存在≠备份有效。这个方案不仅执行备份还会自动验证备份的完整性和可恢复性。#!/bin/bash # 备份配置 BACKUP_DIR/kingbase/backups DB_NAMEproduction_db TIMESTAMP$(date %Y%m%d_%H%M%S) # 执行逻辑备份 ksql_dump -h localhost -p 54321 -U backup_user -F c -b -v -f ${BACKUP_DIR}/${DB_NAME}_${TIMESTAMP}.backup $DB_NAME # 验证备份文件 if [ ! -f ${BACKUP_DIR}/${DB_NAME}_${TIMESTAMP}.backup ]; then echo 备份文件创建失败 | mail -s Kingbase备份失败告警 dbaexample.com exit 1 fi # 创建测试恢复环境 TEST_DB${DB_NAME}_backup_test_${TIMESTAMP} ksql -h localhost -p 54321 -U postgres -c CREATE DATABASE ${TEST_DB}; # 执行恢复测试 kb_restore -h localhost -p 54321 -U postgres -d ${TEST_DB} -v ${BACKUP_DIR}/${DB_NAME}_${TIMESTAMP}.backup # 检查关键表是否存在 TABLE_CHECK$(ksql -h localhost -p 54321 -U postgres -d ${TEST_DB} -t -c SELECT count(*) FROM information_schema.tables WHERE table_schemapublic;) if [ $TABLE_CHECK -eq 0 ]; then echo 备份恢复验证失败 | mail -s Kingbase备份验证告警 dbaexample.com else echo 备份验证成功关键表数量${TABLE_CHECK} fi # 清理测试环境 ksql -h localhost -p 54321 -U postgres -c DROP DATABASE ${TEST_DB};最佳实践建议为备份单独创建用户并限制权限备份文件建议按日期归档并保留多个版本关键表可以自定义检查列表而不仅是统计数量3. 慢查询分析与优化助手慢查询是性能杀手这个脚本能自动识别并分析TOP 10慢查询生成优化建议报告。-- 启用慢查询记录如果尚未启用 ALTER SYSTEM SET log_min_duration_statement 1000; -- 记录执行超过1秒的查询 SELECT sys_reload_conf(); -- 重载配置使生效 -- 分析慢查询日志的脚本 #!/bin/bash SLOW_LOG/kingbase/data/kingbase.log REPORT_FILE/var/log/kingbase_slow_query_$(date %Y%m%d).html # 提取慢查询并生成HTML报告 ksql -h localhost -p 54321 -U analyst -H -c SELECT query_start as \执行时间\, duration as \耗时(ms)\, query as \SQL语句\, user_name as \执行用户\, client_addr as \客户端IP\ FROM sys_stat_statements WHERE duration 1000 ORDER BY duration DESC LIMIT 10; -o $REPORT_FILE # 添加执行计划分析到报告 echo h2执行计划分析/h2 $REPORT_FILE for query in $(ksql -h localhost -p 54321 -U analyst -t -c SELECT query FROM sys_stat_statements WHERE duration 1000 ORDER BY duration DESC LIMIT 3;) do echo h3SQL: ${query}/h3 $REPORT_FILE ksql -h localhost -p 54321 -U analyst -H -c EXPLAIN ANALYZE ${query} $REPORT_FILE done报告解读技巧重点关注执行时间最长的TOP 3查询检查执行计划中的Seq Scan全表扫描操作注意嵌套循环连接的代价估算是否准确在我的实践中这个脚本帮助发现了几个关键问题缺少索引导致的全表扫描N1查询问题不合理的连接顺序4. 会话管理与死锁处理工具异常会话可能导致资源耗尽这个脚本能自动识别并处理问题会话。#!/bin/bash # 配置参数 IDLE_TIMEOUT3600 # 闲置超时(秒) MAX_RUNNING_TIME7200 # 最长运行时间(秒) # 获取需要终止的会话ID TERMINATE_SESSIONS$(ksql -h localhost -p 54321 -U monitor -t -c SELECT pid FROM sys_stat_activity WHERE (state idle AND current_timestamp - state_change interval ${IDLE_TIMEOUT} seconds) OR (backend_start current_timestamp - interval ${MAX_RUNNING_TIME} seconds) AND pid pg_backend_pid(); ) # 终止会话 for pid in $TERMINATE_SESSIONS; do echo 终止会话: $pid ksql -h localhost -p 54321 -U monitor -c SELECT sys_terminate_backend($pid); done # 死锁检测与处理 DEADLOCKS$(ksql -h localhost -p 54321 -U monitor -t -c SELECT DISTINCT blocked_locks.pid AS blocked_pid FROM sys_locks blocked_locks JOIN sys_locks blocking_locks ON blocking_locks.locktype blocked_locks.locktype AND blocking_locks.DATABASE IS NOT DISTINCT FROM blocked_locks.DATABASE AND blocking_locks.relation IS NOT DISTINCT FROM blocked_locks.relation AND blocking_locks.page IS NOT DISTINCT FROM blocked_locks.page AND blocking_locks.tuple IS NOT DISTINCT FROM blocked_locks.tuple AND blocking_locks.virtualxid IS NOT DISTINCT FROM blocked_locks.virtualxid AND blocking_locks.transactionid IS NOT DISTINCT FROM blocked_locks.transactionid AND blocking_locks.classid IS NOT DISTINCT FROM blocked_locks.classid AND blocking_locks.objid IS NOT DISTINCT FROM blocked_locks.objid AND blocking_locks.objsubid IS NOT DISTINCT FROM blocked_locks.objsubid AND blocking_locks.pid ! blocked_locks.pid JOIN sys_stat_activity blocking_activity ON blocking_activity.pid blocking_locks.pid WHERE NOT blocked_locks.GRANTED; ) if [ -n $DEADLOCKS ]; then echo 检测到死锁涉及会话: $DEADLOCKS | mail -s Kingbase死锁告警 dbaexample.com fi使用注意事项谨慎设置超时阈值避免影响正常业务终止会话前可考虑先通知用户定期分析被终止的会话模式找出系统设计问题5. 性能快照与趋势分析系统性能问题往往需要历史数据对比分析这个脚本能定期采集关键指标并建立性能基线。#!/bin/bash # 创建性能快照表如果不存在 ksql -h localhost -p 54321 -U collector -c CREATE TABLE IF NOT EXISTS public.performance_snapshots ( snapshot_time TIMESTAMP PRIMARY KEY, connection_count INTEGER, cache_hit_rate NUMERIC(5,2), transactions_per_sec NUMERIC(10,2), disk_usage_gb NUMERIC(10,2), index_usage_ratio NUMERIC(5,2) ); # 采集当前性能指标 SNAPSHOT_DATA$(ksql -h localhost -p 54321 -U collector -t -c SELECT now(), (SELECT count(*) FROM sys_stat_activity), (SELECT round(blks_hit*100/(blks_hitblks_read),2) FROM sys_stat_database WHERE datnamecurrent_database()), (SELECT xact_commit/(extract(epoch from now()-stats_reset)/60) FROM sys_stat_database WHERE datnamecurrent_database()), (SELECT round(pg_database_size(current_database())/1024/1024/1024,2)), (SELECT round(100.0*idx_scan/(seq_scanidx_scan),2) FROM sys_stat_all_tables WHERE schemanamepublic AND relnameyour_main_table); ) # 存储快照数据 ksql -h localhost -p 54321 -U collector -c INSERT INTO public.performance_snapshots VALUES (${SNAPSHOT_DATA//,/,}); # 生成趋势报告每周 if [ $(date %u) -eq 1 ]; then # 每周一执行 ksql -h localhost -p 54321 -U reporter -H -c SELECT date_trunc(day, snapshot_time) as day, round(avg(connection_count)) as avg_connections, round(avg(cache_hit_rate),1) as avg_cache_hit_rate, round(avg(transactions_per_sec)) as avg_tps, round(max(disk_usage_gb),1) as max_disk_usage FROM public.performance_snapshots WHERE snapshot_time now() - interval 7 days GROUP BY day ORDER BY day; -o /var/reports/kingbase_performance_weekly_$(date %Y%m%d).html fi指标解读指南指标名称健康范围异常处理建议缓存命中率95%考虑增加shared_buffers索引使用率80%检查缺少索引的表连接数最大连接数的70%优化连接池配置磁盘使用每周增长5%检查数据归档策略这套脚本在我的生产环境中运行了6个月成功预警了3次潜在性能问题。最有用的是能够建立性能基线当指标偏离基线10%以上时触发告警而不是使用固定阈值。

相关新闻