通过Python实现对SQL Server 数据文件大小的监控告警
阅读原文时间:2021年04月24日阅读:1

1.需求背景

系统程序突然报错,报错信息如下:

The transaction log for database '@dbname' is full. To find out why space in the log cannot be reused, see the log_reuse_wait_desc column in sys.databases

此时查看log文件,已达2T。

当时的紧急处理方案是,移除掉镜像,修改数据库恢复模式(由full修改为simple),收缩日志。

为了防止类似问题再次发生,需对log 文件的大小进行监控,当到达阈值后,触发告警。

2.主要基础组件(类)

配置文件 qqmssqltest_db_server_conf.ini

同过此配置文件获取DB Server信息、DB信息、UID信息、邮件服务器信息等。

[sqlserver]
db_user = XXXXXX
db_pwd = XXXXXXX

[sqlserver_qq]
db_host = 110.119.120.114
db_port = 1433

[windows]
user =
pwd =

[mail]
host = zheshiceshidemail.qq.com
port = 25
user =
pwd =
sender = zhejiushiceshidebuyaodangzhen@qq.com

获取连接串的组件mssql_get_db_connect.py

# -*- coding: utf-8 -*-

import sys
import os
import datetime
import configparser
import pymssql

pip3 install pymssql-2.1.4-cp37-cp37m-win_amd64.whl

pip3 install pymssql -i https://pypi.doubanio.com/simple

获取连接串信息

def mssql_get_db_connect(db_host, db_port):
db_host = db_host
db_port = db_port

db\_ps\_file = os.path.join(sys.path\[0\], "qqmssqltest\_db\_server\_conf.ini")  
config = configparser.ConfigParser()  
config.read(db\_ps\_file, encoding="utf-8")  
db\_user = config.get('sqlserver', 'db\_user')  
db\_pwd = config.get('sqlserver', 'db\_pwd')

conn = pymssql.connect(host=db\_host, port=db\_port, user=db\_user, password=db\_pwd, charset="utf8", login\_timeout=5, timeout=600, autocommit=True)

return conn

执行SQL语句的组件mysql_exec_sql.py

# -*- coding: utf-8 -*-

import mysql_get_db_connect

def mysql_exec_dml_sql(db_host, db_port, exec_sql):
conn = mysql_get_db_connect.mysql_get_db_connect(db_host, db_port)
with conn.cursor() as cursor_db:
cursor_db.execute(exec_sql)
conn.commit()

def mysql_exec_select_sql(db_host, db_port, exec_sql):
conn = mysql_get_db_connect.mysql_get_db_connect(db_host, db_port)
with conn.cursor() as cursor_db:
cursor_db.execute(exec_sql)
sql_rst = cursor_db.fetchall()

return sql\_rst

def mysql_exec_select_sql_include_colnames(db_host, db_port, exec_sql):
conn = mysql_get_db_connect.mysql_get_db_connect(db_host, db_port)
with conn.cursor() as cursor_db:
cursor_db.execute(exec_sql)
sql_rst = cursor_db.fetchall()
col_names = cursor_db.description
return sql_rst, col_names

发邮件的功能send_monitor_mail.py

# -*- coding: utf-8 -*-

pip3 install PyEmail

import smtplib
from email.mime.text import MIMEText
import configparser
import os
import sys

发送告警邮件

def send_monitor_mail(mail_subject, mail_body, mail_receivers="testwukongbaigujing@qq.com"):
db_ps_file = os.path.join(sys.path[0], "qqmssqltest_db_server_conf.ini")
config = configparser.ConfigParser()
config.read(db_ps_file, encoding="utf-8")
mail_host = config.get('mail', 'host')
mail_port = config.get('mail', 'port')
# mail_user = config.get('mail', 'user')
# mail_pwd = config.get('mail', 'pwd')
sender = config.get('mail', 'sender')
# receivers = config.get('mail', 'receivers')

# 发送HTML格式邮件  
message = MIMEText(mail\_body, 'html', 'utf-8')  
# message = MIMEText(mail\_body, 'plain', 'utf-8')  
message\['subject'\] = mail\_subject  
message\['From'\] = sender  
message\['To'\] = mail\_receivers

try:  
    smtpObj = smtplib.SMTP()  
    smtpObj.connect(mail\_host, mail\_port)          # 25 为 SMTP 端口号  
    # SMTP AUTH extension not supported by server.  
    # https://github.com/miguelgrinberg/microblog/issues/76  
    # smtpObj.ehlo()  
    # smtpObj.starttls()  
    # smtpObj.login(mail\_user, mail\_pwd)  
    smtpObj.sendmail(sender, mail\_receivers, message.as\_string())  
    smtpObj.quit()  
    print("邮件发送成功")  
except Exception as e:  
    print(e)  
# except smtplib.SMTPException:  
    # print("Error: 无法发送邮件")

3.主要功能代码

收集到的DB数据文件的信息保存到表mssql_dblogsize中,其建表的脚本如下:

CREATE TABLE [dbo].[mssql_dblogsize](
[id] [int] IDENTITY(1,1) NOT NULL,
[createtime] [datetime] NULL,
[vip] [nvarchar](100) NULL,
[port] [nvarchar](100) NULL,
[Environment] [nvarchar](200) NULL,
[Dbname] [varchar](200) NULL,
[Logical_Name] [varchar](200) NULL,
[Physical_Name] [varchar](1500) NULL,
[Size] [bigint] NULL,
PRIMARY KEY CLUSTERED
(
[id] ASC
)WITH (PAD_INDEX = OFF, STATISTICS_NORECOMPUTE = OFF, IGNORE_DUP_KEY = OFF, ALLOW_ROW_LOCKS = ON, ALLOW_PAGE_LOCKS = ON) ON [PRIMARY]
) ON [PRIMARY]
GO

ALTER TABLE [dbo].[mssql_dblogsize] ADD DEFAULT (getdate()) FOR [createtime]
GO

为了方便对表mssql_dblogsize的数据进行管理和展示,在其基础上抽象加工出了一个视图v_mssql_dblogsize,注意Size大小的转换(Size/128/1024 as SizeGB)

创建视图的脚本如下:

CREATE view [dbo].[v_mssql_dblogsize]
as
SELECT [id]
,[createtime]
,[vip]
,[port]
,[Environment]
,[Dbname]
,[Logical_Name]
,[Physical_Name]
,Size/128/1024 as SizeGB
FROM [dbo].[mssql_dblogsize]
where size >50*128*1024
and Physical_Name like '%ldf%'
GO

本测试实例使用的数据库为qqDB,监控的各个DB Server保存在了表QQDBServer中,注意Port 不一定为标准端口1433.

collect_mssql_dblogsize_info.py

# -*- coding: utf-8 -*-

import sys
import os
import configparser
import pymssql
import mssql_get_db_connect
import mssql_exec_sql
from datetime import datetime

def collect_mssql_dblogsize_info():

db\_ps\_file = os.path.join(sys.path\[0\], "qqmssqltest\_db\_server\_conf.ini")  
config = configparser.ConfigParser()  
config.read(db\_ps\_file, encoding="utf-8")  
m\_db\_host = config.get('sqlserver\_qq', 'db\_host')  
m\_db\_port = config.getint('sqlserver\_qq', 'db\_port')

# 获取需要遍历的DB列表  
exec\_sql\_1 = """  

SELECT IP, case Port when '1444,1433' then '1433' else Port end as Port, Environment
FROM qqDB.dbo.QQDBServer
where InUse =1 AND ServerType IN ('SQL')
and IP=VIP ;
"""
sql_rst_1 = mssql_exec_sql.mssql_exec_select_sql(m_db_host, m_db_port, exec_sql_1)
for j in sql_rst_1:
db_host_2 = j[0]
db_port_2 = j[1]
db_Environment = j[2]
exec_sql_2 = """
select '""" + db_host_2 + """' as vip, '""" + db_port_2 + """' as port, '""" + db_Environment + """' as Environment,DB_NAME(database_id) AS DatabaseName,
Name AS Logical_Name,
Physical_Name, size
FROM master.sys.master_files;
"""
try:
sql_rst_2 = mssql_exec_sql.mssql_exec_select_sql(db_host_2, db_port_2, exec_sql_2)
except Exception as e:
print(e)

    for k in sql\_rst\_2:  
       exec\_sql\_3 = """  
       insert into qqDB..mssql\_dblogsize(\[vip\], \[port\], \[Environment\], \[Dbname\], \[Logical\_Name\], \[Physical\_Name\], \[Size\])  
       values('%s', '%s', '%s', '%s', '%s', '%s', '%s');  
       """  
       conn = mssql\_get\_db\_connect.mssql\_get\_db\_connect(m\_db\_host, m\_db\_port)  
       with conn.cursor() as cursor\_db:  
           cursor\_db.execute(exec\_sql\_3 % (k\[0\], k\[1\], k\[2\], k\[3\], k\[4\], k\[5\], k\[6\] ))  
           conn.commit()  

collect_mssql_dblogsize_info()

告警邮件的功能实现为mssql_alert_dblogsize.py,此份代码的告警阈值设置的为50G,数据来自于视图v_mssql_dblogsize。

# -*- coding: utf-8 -*-

import sys
import os
import configparser
import pymssql
import mssql_get_db_connect
import mssql_exec_sql
import datetime
import send_monitor_mail
import pandas as pd

def mssql_alert_dblogsize():
mail_subject = "SQL Server DB Log Size Greater than 50G, please check!!! "
mail_receivers = "testDBAgrp@qtiantianq.com"

db\_ps\_file = os.path.join(sys.path\[0\], "qqmssqltest\_db\_server\_conf.ini")  
config = configparser.ConfigParser()  
config.read(db\_ps\_file, encoding="utf-8")  
m\_db\_host = config.get('sqlserver\_qq', 'db\_host')  
m\_db\_port = config.getint('sqlserver\_qq', 'db\_port')

# 获取需要遍历的DB列表

exec\_sql\_4 = """  
    SELECT \[vip\] as IP,\[port\],\[Environment\],\[Dbname\]  
  ,\[Logical\_Name\],\[Physical\_Name\],\[SizeGB\],\[createtime\]  

FROM qqDB.[dbo].[v_mssql_dblogsize]
order by VIP,Dbname;
"""
sql_rst_4, col_name = mssql_exec_sql.mssql_exec_select_sql_include_colnames(m_db_host, m_db_port, exec_sql_4)
# print(sql_rst_4)

if len(sql\_rst\_4):  
    mail\_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    columns = \[\]  
    for i in range(len(col\_name)):  
        columns.append(col\_name\[i\]\[0\])

    df = pd.DataFrame(columns=columns)  
    for i in range(len(sql\_rst\_4)):  
        df.loc\[i\] = list(sql\_rst\_4\[i\])

    mail\_body = df.to\_html(index=False, justify="left").replace('<th>', '<th style = "color:red; text-align:left; background-color: yellow">')  
    mail\_html = "<html><body><h4>" + "Deal All : " + "<br><h4>" + "以下数据库的db log文件,已大于50G.请及时检查,谢谢! " + "<br><h4>" + mail\_body + "</body></html>"

    send\_monitor\_mail.send\_monitor\_mail(mail\_subject=mail\_subject, mail\_body=mail\_html, mail\_receivers=mail\_receivers)

mssql_alert_dblogsize()

4.实现

定时任务是通过windows的计划任务来实现的,在此不做过多的叙述。告警邮件的部分截图如下:

5.附录

1.报错定位,判断是不是log文件过大

https://blog.csdn.net/weixin_30785593/article/details/99912405

2.关于为什么数据库log文件过大,我们可以参考以下分享的文章

https://blog.csdn.net/chinadm123/article/details/44941275

手机扫一扫

移动阅读更方便

阿里云服务器
腾讯云服务器
七牛云服务器