Skip to content

Commit ded6ac2

Browse files
cyckeriSecloud
authored and committed
feat(redis): 内存分析 #12239
1 parent 3725d29 commit ded6ac2

File tree

5 files changed

+347
-36
lines changed

5 files changed

+347
-36
lines changed

dbm-services/redis/db-tools/dbactuator/pkg/atomjobs/atomsys/keystat.go

Lines changed: 94 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ import (
1818
"strconv"
1919
"strings"
2020
"sync"
21+
"syscall"
2122
"time"
2223

2324
"github.com/go-playground/validator/v10"
@@ -104,9 +105,36 @@ func (job *KeyStat) useLocalPlayLoadFile() (payload string, err error) {
104105
return payload, nil
105106
}
106107

108+
// tryLockFile 尝试获取文件锁. 返回文件锁对象.
109+
// 如果获取失败,则尝试等待10秒后重试,最多重试360*8=2880次,即8小时.
110+
// 重试时,每60次重试打印一次日志.
111+
112+
func (job *KeyStat) tryLockFile(workDir string, maxConcurrent int, retryTimes int) (lock *os.File, err error) {
113+
for i := 0; i < retryTimes; i++ {
114+
for j := 0; j < maxConcurrent; j++ {
115+
lockFile := filepath.Join(workDir, fmt.Sprintf("keystat.lock.%d", i))
116+
if _, err := os.Stat(lockFile); os.IsNotExist(err) {
117+
lock, err = os.Create(lockFile)
118+
if err != nil {
119+
return nil, err
120+
}
121+
err = syscall.Flock(int(lock.Fd()), syscall.LOCK_EX|syscall.LOCK_NB)
122+
if err != nil {
123+
return nil, err
124+
}
125+
return lock, nil
126+
}
127+
}
128+
time.Sleep(1 * time.Second)
129+
if i%60 == 1 {
130+
job.runtime.Logger.Info("try lock file failed, try again, retryTimes:(%d of %d)", i, retryTimes)
131+
}
132+
}
133+
return nil, fmt.Errorf("try lock file failed, retryTimes:(%d of %d)", retryTimes, retryTimes)
134+
}
135+
107136
// Init 初始化
108137
func (job *KeyStat) Init(m *jobruntime.JobGenericRuntime) error {
109-
110138
job.runtime = m
111139
var err error
112140
if s, err := job.useLocalPlayLoadFile(); err == nil {
@@ -153,16 +181,34 @@ func (job *KeyStat) Init(m *jobruntime.JobGenericRuntime) error {
153181
return nil
154182
}
155183

184+
// maxRetryTimes is the retry budget Run passes to tryLockFile; tryLockFile
// sleeps one second between rounds, so this bounds the wait at roughly 8 hours.
const maxRetryTimes = 3600 * 8
185+
// maxConcurrent is the number of lock slots Run passes to tryLockFile.
const maxConcurrent = 4
186+
156187
// Run 运行监听请求任务
157188
func (job *KeyStat) Run() (err error) {
189+
// 1. 尝试获取文件锁
190+
keystatDir := filepath.Join(consts.GetRedisBackupDir(), "dbbak/keystat")
191+
lock, err := job.tryLockFile(keystatDir, maxConcurrent, maxRetryTimes)
192+
if err != nil {
193+
job.runtime.Logger.Error("tryLockFile failed, err:%s", err)
194+
err = job.updateReportStatus(statusFailed, map[string]any{
195+
"error": err.Error(),
196+
})
197+
if err != nil {
198+
job.runtime.Logger.Error("update report status to failed failed, err:%s", err)
199+
}
200+
return err
201+
}
202+
defer lock.Close()
203+
158204
err = job.updateReportStatus(statusRunning, nil)
159205
if err != nil {
160206
job.runtime.Logger.Error("update report status failed, err:%s", err)
161207
// return err
162208
}
163209
job.runtime.Logger.Info("update report status success, status:%s", statusRunning)
164210
// 1. 创建工作目录
165-
workDir := filepath.Join(consts.GetRedisBackupDir(), "dbbak/keystat", job.runtime.UID)
211+
workDir := filepath.Join(keystatDir, job.runtime.UID)
166212
util.MkDirsIfNotExists([]string{workDir})
167213
util.LocalDirChownMysql(workDir)
168214
job.runtime.Logger.Info("KeyStat Run, workDir:%s", workDir)
@@ -173,6 +219,19 @@ func (job *KeyStat) Run() (err error) {
173219
return err
174220
}
175221

222+
// if >= 7.4 暂不支持
223+
224+
if versionInfo.Major >= 7 && versionInfo.Minor >= 4 {
225+
job.runtime.Logger.Error("redis version >= 7.4, version:%s, will not support", versionInfo.Str)
226+
err = job.updateReportStatus(statusFailed, map[string]any{
227+
"error": "redis version >= 7.4, will not support",
228+
})
229+
if err != nil {
230+
job.runtime.Logger.Error("update report status to failed failed, err:%s", err)
231+
}
232+
return err
233+
}
234+
176235
if versionInfo.Major >= 6 {
177236
job.runtime.Logger.Info("redis version >= 6, version:%s, will check atime", versionInfo.Str)
178237
job.atimeRequired = true
@@ -516,7 +575,11 @@ func (job *KeyStat) uploadReport(workDir string) (err error) {
516575
return errors.New("rankReportFile is empty")
517576
}
518577

519-
reportRows, rankRows, err := keystat_report.LoadReport(reportFile, rankReportFile)
578+
reportRows, err := keystat_report.LoadReport(reportFile)
579+
if err != nil {
580+
return err
581+
}
582+
rankRows, err := keystat_report.LoadRankReport(rankReportFile)
520583
if err != nil {
521584
return err
522585
}
@@ -577,6 +640,7 @@ class StateType(str, StructuredEnum):
577640
*/
578641

579642
const statusReady = "READY"
643+
const statusInqueue = "INQUEUE"
580644
const statusRunning = "RUNNING"
581645
const statusSuccess = "FINISHED"
582646
const statusFailed = "FAILED"
@@ -617,27 +681,38 @@ func (job *KeyStat) sendReportToDB(
617681
}
618682

619683
// 2. upload key report.
620-
ret, err := cli.Do(http.MethodPost, KeyStatReportItemUrl, map[string]any{
621-
"keystat_report_item": reportRows,
622-
"record_id": job.params.RecordId,
623-
"truncate": true,
624-
})
625-
if err != nil {
626-
return errors.New("upload key report failed, err:" + err.Error())
684+
// 这里改为分批上传,一次上传200条.
685+
for i := 0; i < len(reportRows); i += 200 {
686+
batchReportRows := reportRows[i:int(math.Min(float64(i+200), float64(len(reportRows))))]
687+
_, err := cli.Do(http.MethodPost, KeyStatReportItemUrl, map[string]any{
688+
"keystat_report_item": batchReportRows,
689+
"record_id": job.params.RecordId,
690+
"truncate": i == 0, // 第一次上传时,清空记录.
691+
})
692+
if err != nil {
693+
return errors.New("upload key report failed, err:" + err.Error())
694+
}
695+
job.runtime.Logger.Info("upload %d key report items batch %d success", len(batchReportRows), i/200+1)
627696
}
628-
job.runtime.Logger.Info("upload key report success, ret:%+v", ret)
697+
698+
job.runtime.Logger.Info("upload %d key report items success", len(reportRows))
629699

630700
// 3. upload rank report.
631-
ret, err = cli.Do(http.MethodPost, KeyStatRankReportUrl, map[string]any{
632-
"keystat_rank_item": rankRows,
633-
"record_id": job.params.RecordId,
634-
"truncate": true,
635-
})
636-
if err != nil {
637-
return errors.New("upload rank report failed, err:" + err.Error())
701+
// 这里改为分批上传,一次上传200条.
702+
for i := 0; i < len(rankRows); i += 200 {
703+
batchRankRows := rankRows[i:int(math.Min(float64(i+200), float64(len(rankRows))))]
704+
_, err := cli.Do(http.MethodPost, KeyStatRankReportUrl, map[string]any{
705+
"keystat_rank_item": batchRankRows,
706+
"record_id": job.params.RecordId,
707+
"truncate": i == 0, // 第一次上传时,清空记录.
708+
})
709+
if err != nil {
710+
return errors.New("upload rank report failed, err:" + err.Error())
711+
}
712+
job.runtime.Logger.Info("upload %d rank report items batch %d success", len(batchRankRows), i/200+1)
638713
}
639-
job.runtime.Logger.Info("upload rank report success, ret:%+v", ret)
640714

715+
job.runtime.Logger.Info("upload %d rank report success", len(rankRows))
641716
return nil
642717
}
643718

dbm-services/redis/db-tools/dbactuator/pkg/keystat_report/keystat_report.go

Lines changed: 93 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ package keystat_report
33
import (
44
"encoding/json"
55
"errors"
6+
"fmt"
67
"os"
78
)
89

@@ -41,15 +42,18 @@ type KeyStatReportItem struct {
4142
KeyName string `json:"key_name"`
4243
Class string `json:"key_class"`
4344
Count int `json:"count"`
44-
AvgTtl int64 `json:"avg_ttl" ` // ttl 需要当前时间比较.
45+
CountWithTtl int `json:"count_with_ttl"` // 不显示.
46+
AvgTtl int64 `json:"avg_ttl" ` // ttl 需要当前时间比较.
4547
AvgTtlHuman string `json:"avg_ttl_human"`
4648
MinIdleTime int64 `json:"min_idletime"`
4749
MinIdletimeHuman string `json:"min_idletime_human"`
50+
MinIdletimeShow string `json:"min_idletime_show,omitempty"`
4851
SharedObjectMinIdletimeHuman string `json:"so_min_idletime_human"`
49-
CountWithTtl int64 `json:"count_with_ttl"`
5052
MemberCountMax int `json:"member_max_count"`
5153
MemUsedBytes int64 `json:"mem_used_bytes"`
52-
MemUsedPct float64 `json:"mem_used_pct,omitempty"` // 需计算
54+
MemUsedPct float64 `json:"mem_used_pct,omitempty"`
55+
AvgKeyUsedBytes int64 `json:"avg_key_used_bytes" ` // 平均key占用字节数
56+
AvgKeyLength int64 `json:"avg_key_length"` // 平均key长度
5357
}
5458

5559
// RankKeyReportRow
@@ -67,8 +71,10 @@ type KeyStatRankItem struct {
6771
KeyReportReportBase
6872
LdbKey
6973
RankValue int64 `json:"rank_value"`
70-
KeyType string `json:"key_type"` // 从 LdbKey.Type 中解析出来.
71-
KeyName string `json:"key_name"` // 从 LdbKey.key 中解析出来.
74+
KeyType string `json:"key_type"` // 从 LdbKey.Type 中解析出来.
75+
KeyName string `json:"key_name"` // 从 LdbKey.key 中解析出来.
76+
TtlHuman string `json:"ttl_human,omitempty"` // 过期时间(带单位) 用于展示, 非必填. 从 LdbKey.Ttl 中解析出来.
77+
KeyLen int `json:"key_length"` // Key的长度. 从 LdbKey.Key 中解析出来.
7278
}
7379

7480
// LdbKey 用于解析 ldb 导出来的Key
@@ -78,38 +84,110 @@ type LdbKey struct {
7884
Ttl int64 `json:"ttl"` // second/1000
7985
Atime int64 `json:"atime"`
8086
Member int `json:"member"` // 成员的数量
81-
MemberLen int `json:"member_len"` // 成员的平均长度. 用于计算内存占用
87+
MemberLen int `json:"member_len"` // 成员的平均长度.
8288
ValueSize int `json:"value_size"` // Value的长度或者成员Value的长度.
8389
Db uint8 `json:"db"`
8490
SharedObject bool `json:"-"` // [0-9999] 的数字. 在这个范围内,它的Atime有可能是不准确的.
8591
MemorySize int `json:"memory_size"` // 基础内存占用, 复合类型中是采样计算结果。
8692
}
8793

8894
// loadReport 加载报告 返回集群报告记录, key报告和rank报告
89-
func LoadReport(reportFile, rankReportFile string) (
90-
keyReportRowItems []KeyStatReportItem, rankKeyReportRow map[string]RankKeyReportRow, err error) {
95+
func LoadReport(reportFile string) (keyReportRowItems []KeyStatReportItem, err error) {
9196
report, err := os.ReadFile(reportFile)
9297
if err != nil {
9398
err = errors.New("open reportFile failed, err:" + err.Error())
9499
return
95100
}
96101

97-
rankReport, err := os.ReadFile(rankReportFile)
102+
err = json.Unmarshal(report, &keyReportRowItems)
98103
if err != nil {
99-
err = errors.New("open rankReportFile failed, err:" + err.Error())
104+
err = errors.New("loadReport failed, err:" + err.Error())
100105
return
101106
}
102107

103-
err = json.Unmarshal(report, &keyReportRowItems)
104-
if err != nil {
105-
err = errors.New("loadReport failed, err:" + err.Error())
108+
// 计算CountWithTtl
109+
for i := range keyReportRowItems {
110+
updateAvgTtlHuman(&keyReportRowItems[i])
111+
}
112+
113+
return keyReportRowItems, nil
114+
}
115+
116+
// updateAvgTtlHuman 更新AvgTtlHuman. 显示在UI上.
117+
// 如果CountWithTtl为0, 则设置为"N/A".
118+
// 如果全都有过期时间, 则设置为"avg:xxx".
119+
// 否则设置为"count:xxx, avg:xxx".
120+
func updateAvgTtlHuman(item *KeyStatReportItem) {
121+
if item.CountWithTtl == 0 {
122+
item.AvgTtlHuman = "N/A"
106123
return
107124
}
125+
if item.CountWithTtl == item.Count {
126+
item.AvgTtlHuman = fmt.Sprintf("avg:%s", getTtlHuman(item.AvgTtl))
127+
} else {
128+
item.AvgTtlHuman = fmt.Sprintf("count:%d, avg:%s", item.CountWithTtl, getTtlHuman(item.AvgTtl))
129+
}
130+
}
131+
132+
// LoadRankReport 加载rank报告 返回rank报告
133+
func LoadRankReport(rankReportFile string) (rankKeyReportRow map[string]RankKeyReportRow, err error) {
134+
rankReport, err := os.ReadFile(rankReportFile)
135+
if err != nil {
136+
err = errors.New("open rankReportFile failed, err:" + err.Error())
137+
return nil, err
138+
}
108139

109140
err = json.Unmarshal(rankReport, &rankKeyReportRow)
110141
if err != nil {
111-
return nil, nil, errors.New("load rankReport failed, err:" + err.Error())
142+
return nil, err
143+
}
144+
// 将rank报告的Ttl转换为人类可读的时间格式.
145+
for t := range rankKeyReportRow {
146+
for i := range rankKeyReportRow[t].KeyList {
147+
item := rankKeyReportRow[t].KeyList[i]
148+
rankKeyReportRow[t].KeyList[i].TtlHuman = getTtlHuman(item.Ttl)
149+
rankKeyReportRow[t].KeyList[i].KeyLen = len(item.Key)
150+
}
151+
}
152+
153+
return rankKeyReportRow, nil
154+
}
155+
156+
// Unit sizes, in seconds, used by getTtlHuman. A month is taken as 30 days and
// a year as 365 days.
const (
	secondsPerYear  = int64(365 * 24 * 3600) // 31536000
	secondsPerMonth = int64(30 * 24 * 3600)  // 2592000
	secondsPerDay   = int64(24 * 3600)       // 86400
	secondsPerHour  = int64(3600)
)

// getTtlHuman converts a number of seconds into a short human-readable string
// using the largest unit that fits.
//
// Examples: 31536000 -> "1.0year", 2592000 -> "1.0mon", 86400 -> "1.0day",
// 3600 -> "1.0hour", 60 -> "60sec".
//
// Values of at least one hour are printed with one decimal place; values below
// one hour are whole seconds and are printed without a fractional part.
// -1 is rendered as "-"; any other negative value is returned as the plain
// decimal number so unexpected inputs stay visible.
func getTtlHuman(t int64) string {
	switch {
	case t == -1:
		return "-"
	case t < 0:
		// Other negative values are not interpreted; pass them through unchanged.
		return fmt.Sprintf("%d", t)
	case t >= secondsPerYear:
		return fmt.Sprintf("%.1f%s", float64(t)/float64(secondsPerYear), "year")
	case t >= secondsPerMonth:
		return fmt.Sprintf("%.1f%s", float64(t)/float64(secondsPerMonth), "mon")
	case t >= secondsPerDay:
		return fmt.Sprintf("%.1f%s", float64(t)/float64(secondsPerDay), "day")
	case t >= secondsPerHour:
		return fmt.Sprintf("%.1f%s", float64(t)/float64(secondsPerHour), "hour")
	default:
		// The input is an integer count of seconds, so a ".0" suffix would be noise.
		return fmt.Sprintf("%dsec", t)
	}
}

0 commit comments

Comments
 (0)