Skip to content

Commit ddef38d

Browse files
yksituiSecloud
authored and committed
feat(sqlserver): sqlserver巡检任务 #8033
1 parent 3e99a54 commit ddef38d

File tree

10 files changed

+986
-13
lines changed

10 files changed

+986
-13
lines changed

dbm-ui/backend/db_monitor/tpls/alarm/sqlserver/Sqlserver-进程存活.json

Lines changed: 80 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,10 @@
1414
"no_data_config": {
1515
"level": 2,
1616
"continuous": 10,
17-
"is_enabled": false,
18-
"agg_dimension": []
17+
"is_enabled": true,
18+
"agg_dimension": [
19+
"bk_target_service_instance_id"
20+
]
1921
},
2022
"target": [],
2123
"expression": "a",
@@ -88,14 +90,81 @@
8890
"connector": "and"
8991
}
9092
],
91-
"actions": [],
93+
"actions": [
94+
{
95+
"id": 168888,
96+
"config_id": 137317,
97+
"user_groups": [],
98+
"user_type": "main",
99+
"signal": [
100+
"abnormal"
101+
],
102+
"options": {
103+
"end_time": "23:59:59",
104+
"start_time": "00:00:00",
105+
"converge_config": {
106+
"count": 1,
107+
"condition": [
108+
{
109+
"value": [
110+
"self"
111+
],
112+
"dimension": "action_info"
113+
}
114+
],
115+
"timedelta": 60,
116+
"is_enabled": false,
117+
"converge_func": "skip_when_success",
118+
"need_biz_converge": true
119+
}
120+
},
121+
"relate_type": "ACTION",
122+
"config": {
123+
"id": 137317,
124+
"name": "dbm_autofix_http_callback",
125+
"desc": "",
126+
"bk_biz_id": "5005578",
127+
"plugin_id": "2",
128+
"execute_config": {
129+
"template_detail": {
130+
"need_poll": false,
131+
"notify_interval": 60,
132+
"interval_notify_mode": "standard",
133+
"method": "POST",
134+
"url": "",
135+
"headers": [],
136+
"authorize": {
137+
"auth_type": "bearer_token",
138+
"auth_config": {
139+
"token": ""
140+
}
141+
},
142+
"body": {
143+
"data_type": "raw",
144+
"params": [],
145+
"content": "{\"callback_message\": {{alarm.callback_message}},\"appointees\": \"{{alarm.appointees}}\"}",
146+
"content_type": "json"
147+
},
148+
"query_params": [],
149+
"failed_retry": {
150+
"is_enabled": true,
151+
"timeout": 10,
152+
"max_retry_times": 2,
153+
"retry_interval": 2
154+
}
155+
},
156+
"timeout": 600
157+
}
158+
}
159+
}
160+
],
92161
"notice": {
93-
"config_id": 47942,
162+
"config_id": 118363,
94163
"user_groups": [],
95164
"user_type": "main",
96165
"signal": [
97-
"no_data",
98-
"abnormal"
166+
"abnormal",
167+
"no_data"
99168
],
100169
"options": {
101170
"end_time": "23:59:59",
@@ -218,17 +287,17 @@
218287
"template": [
219288
{
220289
"signal": "abnormal",
221-
"message_tmpl": "{{content.level}}\n{{content.begin_time}}\n{{content.time}}\n{{content.duration}}\n{{content.target_type}}\n{{content.data_source}}\n{{content.content}}\n{{content.current_value}}\n{{content.biz}}\n{{content.target}}\n{{content.dimension}}\n{{content.detail}}\n{{content.assign_detail}}\n{{content.related_info}}",
290+
"message_tmpl": "{{content.level}}\n{{content.begin_time}}\n{{content.time}}\n{{content.duration}}\n{{content.target_type}}\n{{content.data_source}}\n{{content.content}}\n{{content.current_value}}\n{{content.biz}}\n{{content.target}}\n{{content.dimension}}\n{{content.detail}}\n{{content.assign_detail}}\n通知人:{{alarm.receivers}}\n{{content.related_info}}",
222291
"title_tmpl": "{{business.bk_biz_name}} - {{alarm.name}}{{alarm.display_type}}"
223292
},
224293
{
225294
"signal": "recovered",
226-
"message_tmpl": "{{content.level}}\n{{content.begin_time}}\n{{content.time}}\n{{content.duration}}\n{{content.target_type}}\n{{content.data_source}}\n{{content.content}}\n{{content.current_value}}\n{{content.biz}}\n{{content.target}}\n{{content.dimension}}\n{{content.detail}}\n{{content.assign_detail}}\n{{content.related_info}}",
295+
"message_tmpl": "{{content.level}}\n{{content.begin_time}}\n{{content.time}}\n{{content.duration}}\n{{content.target_type}}\n{{content.data_source}}\n{{content.content}}\n{{content.current_value}}\n{{content.biz}}\n{{content.target}}\n{{content.dimension}}\n{{content.detail}}\n{{content.assign_detail}}\n通知人:{{alarm.receivers}}\n{{content.related_info}}",
227296
"title_tmpl": "{{business.bk_biz_name}} - {{alarm.name}}{{alarm.display_type}}"
228297
},
229298
{
230299
"signal": "closed",
231-
"message_tmpl": "{{content.level}}\n{{content.begin_time}}\n{{content.time}}\n{{content.duration}}\n{{content.target_type}}\n{{content.data_source}}\n{{content.content}}\n{{content.current_value}}\n{{content.biz}}\n{{content.target}}\n{{content.dimension}}\n{{content.detail}}\n{{content.assign_detail}}\n{{content.related_info}}",
300+
"message_tmpl": "{{content.level}}\n{{content.begin_time}}\n{{content.time}}\n{{content.duration}}\n{{content.target_type}}\n{{content.data_source}}\n{{content.content}}\n{{content.current_value}}\n{{content.biz}}\n{{content.target}}\n{{content.dimension}}\n{{content.detail}}\n{{content.assign_detail}}\n通知人:{{alarm.receivers}}\n{{content.related_info}}",
232301
"title_tmpl": "{{business.bk_biz_name}} - {{alarm.name}}{{alarm.display_type}}"
233302
}
234303
]
@@ -250,8 +319,8 @@
250319
},
251320
"is_enabled": true,
252321
"monitor_indicator": "MAX(mssql_serveice_available)",
253-
"version": 2,
322+
"version": 3,
254323
"alert_source": "time_series",
255324
"custom_conditions": [],
256-
"export_at": "2024-04-02T16:01:58+08:00"
325+
"export_at": "2024-11-29T16:28:30+08:00"
257326
}

dbm-ui/backend/db_periodic_task/local_tasks/sqlserver/__init__.py

Whitespace-only changes.
Lines changed: 267 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,267 @@
1+
"""
2+
TencentBlueKing is pleased to support the open source community by making 蓝鲸智云-DB管理系统(BlueKing-BK-DBM) available.
3+
Copyright (C) 2017-2023 THL A29 Limited, a Tencent company. All rights reserved.
4+
Licensed under the MIT License (the "License"); you may not use this file except in compliance with the License.
5+
You may obtain a copy of the License at https://opensource.org/licenses/MIT
6+
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
7+
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
8+
specific language governing permissions and limitations under the License.
9+
"""
10+
11+
from backend.db_meta.enums import ClusterPhase, ClusterType, InstanceInnerRole, InstanceRole, InstanceStatus
12+
from backend.db_meta.models import Cluster, StorageInstance
13+
from backend.db_meta.models.storage_set_dtl import SqlserverClusterSyncMode
14+
from backend.db_report.models.sqlserver_check_report import (
15+
SqlserverCheckAppSettingReport,
16+
SqlserverCheckJobSyncReport,
17+
SqlserverCheckLinkServerReport,
18+
SqlserverCheckSysJobStatuReport,
19+
SqlserverCheckUserSyncReport,
20+
)
21+
from backend.flow.utils.sqlserver.sqlserver_bk_config import (
22+
get_module_infos,
23+
get_sqlserver_alarm_config,
24+
get_sqlserver_backup_config,
25+
)
26+
from backend.flow.utils.sqlserver.sqlserver_db_function import (
27+
check_ha_config,
28+
check_sys_job_status,
29+
fix_app_setting_data,
30+
get_app_setting_data,
31+
insert_sqlserver_config,
32+
)
33+
34+
35+
class CheckAppSettingData(object):
    """
    Periodic inspection of SQL Server clusters, with the DBM metadata taken as the source of truth.

    For every online cluster the task:
      * compares each running instance's app_setting data with the DBM metadata and tries to
        insert/fix the data when it is missing or different;
      * checks the system job status of each running instance;
      * for HA clusters, compares users, jobs and link servers between the master and each
        running slave.
    Every problem found is persisted as a row in the matching ``SqlserverCheck*Report`` model so
    that it can be surfaced to the DBAs.
    """

    def __init__(self):
        # Load all ONLINE SQL Server clusters (HA and single-node); instances and their
        # machines are prefetched because every check reads them.
        self.clusters = Cluster.objects.prefetch_related(
            "storageinstance_set",
            "storageinstance_set__machine",
        ).filter(phase=ClusterPhase.ONLINE, cluster_type__in=[ClusterType.SqlserverHA, ClusterType.SqlserverSingle])

    def check_task(self):
        """
        Entry point of the inspection: run every check against every loaded cluster.
        """
        # Function-scope import keeps this block self-contained (no file-level import change).
        import logging

        logger = logging.getLogger(__name__)

        for cluster in self.clusters:
            logger.info("sqlserver inspection: checking cluster %s", cluster.name)
            self.check_app_setting_data(cluster)
            self.check_job_is_disabled(cluster)

            # Master/slave consistency checks only make sense for HA clusters.
            if cluster.cluster_type != ClusterType.SqlserverHA:
                continue

            master = cluster.storageinstance_set.get(instance_inner_role=InstanceInnerRole.MASTER)
            running_slaves = cluster.storageinstance_set.filter(
                status=InstanceStatus.RUNNING, instance_inner_role=InstanceInnerRole.SLAVE
            )
            for s in running_slaves:
                self.check_user(master_instance=master, slave_instance=s, cluster=cluster)
                self.check_job(master_instance=master, slave_instance=s, cluster=cluster)
                self.check_link_server(master_instance=master, slave_instance=s, cluster=cluster)

    @staticmethod
    def fix_app_setting_data(cluster: Cluster, instance: StorageInstance, sync_mode: str, master: StorageInstance):
        """
        Repair inconsistent app_setting data on an instance and record the outcome.

        :param cluster: cluster the instance belongs to
        :param instance: instance whose app_setting data differs from the metadata
        :param sync_mode: expected synchronous mode ("" for non-HA clusters)
        :param master: master instance of the cluster
        :return: always True; the success/failure of the fix is stored in the report row
        """
        # Inside the method body this name resolves to the module-level helper imported from
        # sqlserver_db_function, not to this static method (class scope is not searched).
        status, msg = fix_app_setting_data(cluster=cluster, instance=instance, sync_mode=sync_mode, master=master)
        SqlserverCheckAppSettingReport.objects.create(
            cluster=cluster.name,
            cluster_type=cluster.cluster_type,
            instance_host=instance.machine.ip,
            instance_port=instance.port,
            is_inconsistent=1,
            is_fix=1 if status else 0,
            status=status,
            msg=msg,
        )
        return True

    @staticmethod
    def add_app_setting_data(cluster: Cluster, instance: StorageInstance):
        """
        Insert the app_setting data for an instance that has none, and record the outcome.

        :param cluster: cluster the instance belongs to
        :param instance: instance that is missing its app_setting data
        :return: always True; the success/failure of the insert is stored in the report row
        """
        fix_status = False
        msg = "fix failed"

        # Charset configured for the cluster's module
        charset = get_module_infos(
            bk_biz_id=cluster.bk_biz_id,
            db_module_id=cluster.db_module_id,
            cluster_type=ClusterType(cluster.cluster_type),
        )["charset"]

        # Backup configuration of the cluster
        backup_config = get_sqlserver_backup_config(
            bk_biz_id=cluster.bk_biz_id,
            db_module_id=cluster.db_module_id,
            cluster_domain=cluster.immute_domain,
        )

        # Alarm (customised) configuration of the cluster
        alarm_config = get_sqlserver_alarm_config(
            bk_biz_id=cluster.bk_biz_id,
            db_module_id=cluster.db_module_id,
            cluster_domain=cluster.immute_domain,
        )

        # Write the configuration; a failure must not abort the inspection of other instances,
        # but the reason is kept in the report instead of being silently dropped.
        try:
            fix_status = insert_sqlserver_config(
                cluster=cluster,
                storages=[instance],
                charset=charset,
                backup_config=backup_config,
                alarm_config=alarm_config,
            )
        except Exception as exc:  # best-effort by design: record and carry on
            msg = f"fix failed: {exc}"

        if fix_status:
            msg = "fix successfully"

        SqlserverCheckAppSettingReport.objects.create(
            cluster=cluster.name,
            cluster_type=cluster.cluster_type,
            instance_host=instance.machine.ip,
            instance_port=instance.port,
            is_inconsistent=1,
            is_fix=1 if fix_status else 0,
            status=fix_status,
            msg=msg,
        )
        return True

    def check_app_setting_data(self, cluster: Cluster):
        """
        Compare the app_setting data of every running instance with the DBM metadata.

        Missing data is inserted via ``add_app_setting_data``; differing data is repaired via
        ``fix_app_setting_data``. Instances whose data cannot be read are reported and skipped.

        :param cluster: cluster to inspect
        """
        master = cluster.storageinstance_set.get(instance_role__in=[InstanceRole.ORPHAN, InstanceRole.BACKEND_MASTER])
        if cluster.cluster_type == ClusterType.SqlserverHA:
            sync_mode = SqlserverClusterSyncMode.objects.get(cluster_id=cluster.id).sync_mode
        else:
            sync_mode = ""

        # Only instances in RUNNING status are inspected.
        for instance in cluster.storageinstance_set.filter(status=InstanceStatus.RUNNING):
            data, err = get_app_setting_data(instance=instance, bk_cloud_id=cluster.bk_cloud_id)
            if data is None:
                # A None result most likely means the instance could not be queried: record the
                # error and skip the comparison for this round.
                SqlserverCheckAppSettingReport.objects.create(
                    cluster=cluster.name,
                    cluster_type=cluster.cluster_type,
                    instance_host=instance.machine.ip,
                    instance_port=instance.port,
                    is_inconsistent=1,
                    is_fix=0,
                    status=False,
                    msg=err,
                )
                continue

            if not data:
                # Empty result: app_setting was never configured, so insert it.
                self.add_app_setting_data(cluster=cluster, instance=instance)

            elif (
                int(data["APP"]) != cluster.bk_biz_id
                or int(data["BK_BIZ_ID"]) != cluster.bk_biz_id
                or int(data["BK_CLOUD_ID"]) != cluster.bk_cloud_id
                or int(data["CLUSTER_ID"]) != cluster.id
                or data["CLUSTER_DOMAIN"] != cluster.immute_domain
                or int(data["PORT"]) != instance.port
                or data["ROLE"] != instance.instance_inner_role
                or data["SYNCHRONOUS_MODE"] != sync_mode
                or data["MASTER_IP"] != master.machine.ip
                or int(data["MASTER_PORT"]) != master.port
            ):
                # At least one field differs from the metadata: try to repair it.
                self.fix_app_setting_data(cluster=cluster, instance=instance, sync_mode=sync_mode, master=master)

    @staticmethod
    def _check_ha_sync(
        master_instance: StorageInstance,
        slave_instance: StorageInstance,
        cluster: Cluster,
        check_tag: str,
        report_model,
        flag_field: str,
    ):
        """
        Run one master/slave consistency check and store a report row when it fails.

        :param check_tag: tag forwarded to ``check_ha_config`` to select what is compared
        :param report_model: report model class that receives the row on failure
        :param flag_field: name of the model field that is set to 1 on the report row
        """
        status, msg = check_ha_config(
            master_instance=master_instance,
            slave_instance=slave_instance,
            bk_cloud_id=cluster.bk_cloud_id,
            check_tag=check_tag,
        )
        if status:
            # Consistent: nothing is recorded.
            return

        report_model.objects.create(
            cluster=cluster.name,
            cluster_type=cluster.cluster_type,
            instance_host=slave_instance.machine.ip,
            instance_port=slave_instance.port,
            status=status,
            msg=msg,
            **{flag_field: 1},
        )

    @staticmethod
    def check_user(master_instance: StorageInstance, slave_instance: StorageInstance, cluster: Cluster):
        """
        Check that the users on the slave are consistent with the master.
        """
        CheckAppSettingData._check_ha_sync(
            master_instance=master_instance,
            slave_instance=slave_instance,
            cluster=cluster,
            check_tag="user",
            report_model=SqlserverCheckUserSyncReport,
            flag_field="is_user_inconsistent",
        )

    @staticmethod
    def check_job(master_instance: StorageInstance, slave_instance: StorageInstance, cluster: Cluster):
        """
        Check that the business jobs on the slave are consistent with the master.
        """
        CheckAppSettingData._check_ha_sync(
            master_instance=master_instance,
            slave_instance=slave_instance,
            cluster=cluster,
            check_tag="job",
            report_model=SqlserverCheckJobSyncReport,
            flag_field="is_job_inconsistent",
        )

    @staticmethod
    def check_link_server(master_instance: StorageInstance, slave_instance: StorageInstance, cluster: Cluster):
        """
        Check that the link servers on the slave are consistent with the master.
        """
        # The tag used to be "job" (copy-paste from check_job), which re-ran the job comparison
        # and filed its result as a link-server inconsistency.
        # NOTE(review): "link_server" is assumed to be the tag check_ha_config expects - confirm.
        CheckAppSettingData._check_ha_sync(
            master_instance=master_instance,
            slave_instance=slave_instance,
            cluster=cluster,
            check_tag="link_server",
            report_model=SqlserverCheckLinkServerReport,
            flag_field="is_link_server_inconsistent",
        )

    @staticmethod
    def check_job_is_disabled(cluster: Cluster):
        """
        Check the system job status on every running instance of the cluster.

        :param cluster: cluster to inspect
        """
        # Only instances in RUNNING status are inspected.
        for instance in cluster.storageinstance_set.filter(status=InstanceStatus.RUNNING):
            status, msg = check_sys_job_status(cluster=cluster, instance=instance)
            if status:
                continue
            # Only abnormal results are recorded.
            SqlserverCheckSysJobStatuReport.objects.create(
                cluster=cluster.name,
                cluster_type=cluster.cluster_type,
                instance_host=instance.machine.ip,
                instance_port=instance.port,
                is_job_disable=1,
                status=status,
                msg=msg,
            )

0 commit comments

Comments
 (0)