Commit e4da5f8

worker-stats command

1 parent e81aabf commit e4da5f8

3 files changed: +101 -32 lines changed

go/logic/coordinator.go

Lines changed: 65 additions & 29 deletions
@@ -30,13 +30,17 @@ type Coordinator struct {
 	// Atomic counter for number of active workers
 	busyWorkers atomic.Int64
 
+	// Mutex protecting currentCoordinates
 	currentCoordinatesMutex sync.Mutex
 	// The binlog coordinates of the low water mark transaction.
 	currentCoordinates mysql.BinlogCoordinates
 
 	// Mutex to protect the fields below
 	mu sync.Mutex
 
+	// list of workers
+	workers []*Worker
+
 	// The low water mark. This is the sequence number of the last job that has been committed.
 	lowWaterMark int64
 
@@ -56,13 +60,25 @@ type Coordinator struct {
 	finishedMigrating atomic.Bool
 }
 
+// Worker takes jobs from the Coordinator and applies the job's DML events.
 type Worker struct {
 	id          int
 	coordinator *Coordinator
+	eventQueue  chan *replication.BinlogEvent
 
-	executedJobs int
+	executedJobs     atomic.Int64
+	dmlEventsApplied atomic.Int64
+	waitTimeNs       atomic.Int64
+	busyTimeNs       atomic.Int64
+}
 
-	eventQueue chan *replication.BinlogEvent
+type stats struct {
+	dmlRate          float64
+	trxRate          float64
+	dmlEventsApplied int64
+	executedJobs     int64
+	busyTime         time.Duration
+	waitTime         time.Duration
 }
 
 func (w *Worker) ProcessEvents() error {
@@ -86,21 +102,15 @@ func (w *Worker) ProcessEvents() error {
 			// Wait for conditions to be met
 			waitChannel := w.coordinator.WaitForTransaction(gtidEvent.LastCommitted)
 			if waitChannel != nil {
-				//fmt.Printf("Worker %d - transaction %d waiting for transaction: %d\n", w.id, gtidEvent.SequenceNumber, gtidEvent.LastCommitted)
-				t := time.Now()
+				waitStart := time.Now()
 				<-waitChannel
-				timeWaited := time.Since(t)
-				w.coordinator.migrationContext.Log.Infof(
-					"Worker %d waited for transaction %d for: %d\n",
-					w.id, gtidEvent.LastCommitted, timeWaited)
+				timeWaited := time.Since(waitStart)
+				w.waitTimeNs.Add(timeWaited.Nanoseconds())
 			}
 
 			// Process the transaction
-
 			var changelogEvent *binlog.BinlogDMLEvent
-
 			dmlEvents := make([]*binlog.BinlogDMLEvent, 0, int(atomic.LoadInt64(&w.coordinator.migrationContext.DMLBatchSize)))
-
 		events:
 			for {
 				ev := <-w.eventQueue
@@ -166,21 +176,21 @@ func (w *Worker) ProcessEvents() error {
 					dmlEvents = append(dmlEvents, dmlEvent)
 
 					if len(dmlEvents) == cap(dmlEvents) {
-						err := w.coordinator.applier.ApplyDMLEventQueries(dmlEvents)
-						if err != nil {
-							//TODO(meiji163) add retry
+						if err := w.applyDMLEvents(dmlEvents); err != nil {
 							w.coordinator.migrationContext.Log.Errore(err)
-							w.coordinator.migrationContext.PanicAbort <- err
 						}
 						dmlEvents = dmlEvents[:0]
 					}
 				}
 			}
 		case *replication.XIDEvent:
 			if len(dmlEvents) > 0 {
-				w.coordinator.applier.ApplyDMLEventQueries(dmlEvents)
+				if err := w.applyDMLEvents(dmlEvents); err != nil {
+					w.coordinator.migrationContext.Log.Errore(err)
+				}
 			}
 
+			w.executedJobs.Add(1)
 			break events
 		}
 	}
@@ -190,19 +200,33 @@ func (w *Worker) ProcessEvents() error {
 		// Did we see a changelog event?
 		// Handle it now
 		if changelogEvent != nil {
+			// wait for all transactions before this point
 			waitChannel = w.coordinator.WaitForTransaction(gtidEvent.SequenceNumber - 1)
 			if waitChannel != nil {
+				waitStart := time.Now()
 				<-waitChannel
+				w.waitTimeNs.Add(time.Since(waitStart).Nanoseconds())
 			}
 			w.coordinator.HandleChangeLogEvent(changelogEvent)
 		}
 
-		w.executedJobs += 1
 		w.coordinator.workerQueue <- w
 		w.coordinator.busyWorkers.Add(-1)
 	}
 }
 
+func (w *Worker) applyDMLEvents(dmlEvents []*binlog.BinlogDMLEvent) error {
+	busyStart := time.Now()
+	err := w.coordinator.applier.ApplyDMLEventQueries(dmlEvents)
+	if err != nil {
+		//TODO(meiji163) add retry
+		return err
+	}
+	w.busyTimeNs.Add(time.Since(busyStart).Nanoseconds())
+	w.dmlEventsApplied.Add(int64(len(dmlEvents)))
+	return nil
+}
+
 func NewCoordinator(migrationContext *base.MigrationContext, applier *Applier, onChangelogEvent func(dmlEvent *binlog.BinlogDMLEvent) error) *Coordinator {
 	connectionConfig := migrationContext.InspectorConnectionConfig
 
@@ -338,7 +362,6 @@ func (c *Coordinator) ProcessEventsUntilDrained() error {
 		if c.finishedMigrating.Load() {
 			return nil
 		}
-		// c.migrationContext.Log.Infof("Received event: %T - %+v", ev.Event, ev.Event)
 
 		switch binlogEvent := ev.Event.(type) {
 		case *replication.GTIDEvent:
@@ -358,13 +381,10 @@ func (c *Coordinator) ProcessEventsUntilDrained() error {
 			worker := <-c.workerQueue
 			c.busyWorkers.Add(1)
 
-			// c.migrationContext.Log.Infof("Submitting job %d to worker", ev.Event.(*replication.GTIDEvent).SequenceNumber)
 			worker.eventQueue <- ev
 
 			ev = <-c.events
 
-			// c.migrationContext.Log.Infof("Received event: %T - %+v", ev.Event, ev.Event)
-
 			switch binlogEvent := ev.Event.(type) {
 			case *replication.QueryEvent:
 				if bytes.Equal([]byte("BEGIN"), binlogEvent.Query) {
@@ -381,9 +401,6 @@ func (c *Coordinator) ProcessEventsUntilDrained() error {
 		events:
 			for {
 				ev = <-c.events
-
-				// c.migrationContext.Log.Infof("Received event: %T - %+v", ev.Event, ev.Event)
-
 				switch ev.Event.(type) {
 				case *replication.RowsEvent:
 					worker.eventQueue <- ev
@@ -402,8 +419,6 @@ func (c *Coordinator) ProcessEventsUntilDrained() error {
 		busyWorkers := c.busyWorkers.Load()
 		if busyWorkers == 0 {
 			return nil
-		} else {
-			//c.migrationContext.Log.Infof("%d/%d workers are busy\n", busyWorkers, cap(c.workerQueue))
 		}
 	}
 }
@@ -414,11 +429,35 @@ func (c *Coordinator) InitializeWorkers(count int) {
 	c.workerQueue = make(chan *Worker, count)
 	for i := 0; i < count; i++ {
 		w := &Worker{id: i, coordinator: c, eventQueue: make(chan *replication.BinlogEvent, 1000)}
+
+		c.mu.Lock()
+		c.workers = append(c.workers, w)
+		c.mu.Unlock()
+
 		c.workerQueue <- w
 		go w.ProcessEvents()
 	}
 }
 
+func (c *Coordinator) GetWorkerStats() []stats {
+	c.mu.Lock()
+	defer c.mu.Unlock()
+	statSlice := make([]stats, 0, len(c.workers))
+	for _, w := range c.workers {
+		stat := stats{}
+		stat.dmlEventsApplied = w.dmlEventsApplied.Load()
+		stat.executedJobs = w.executedJobs.Load()
+		stat.busyTime = time.Duration(w.busyTimeNs.Load())
+		stat.waitTime = time.Duration(w.waitTimeNs.Load())
+		if stat.busyTime.Milliseconds() > 0 {
+			stat.dmlRate = 1000.0 * float64(stat.dmlEventsApplied) / float64(stat.busyTime.Milliseconds())
+			stat.trxRate = 1000.0 * float64(stat.executedJobs) / float64(stat.busyTime.Milliseconds())
+		}
+		statSlice = append(statSlice, stat)
+	}
+	return statSlice
+}
+
 func (c *Coordinator) WaitForTransaction(lastCommitted int64) chan struct{} {
 	c.mu.Lock()
 	defer c.mu.Unlock()
@@ -438,11 +477,8 @@ func (c *Coordinator) WaitForTransaction(lastCommitted int64) chan struct{} {
 }
 
 func (c *Coordinator) HandleChangeLogEvent(event *binlog.BinlogDMLEvent) {
-	//c.migrationContext.Log.Infof("Coordinator: Handling changelog event: %+v\n", event)
-
 	c.mu.Lock()
 	defer c.mu.Unlock()
-
 	c.onChangelogEvent(event)
}
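A note on the timing scheme above: each worker accumulates elapsed wall-clock time as nanoseconds in atomic.Int64 counters (waitTimeNs, busyTimeNs), and GetWorkerStats casts them back to time.Duration and derives per-second rates from the busy milliseconds. A minimal standalone sketch of that pattern, under the assumption that the sleep and the event count below are stand-ins, not gh-ost code:

package main

import (
	"fmt"
	"sync/atomic"
	"time"
)

func main() {
	var busyNs atomic.Int64

	// Worker side: time a unit of work and add the elapsed nanoseconds.
	start := time.Now()
	time.Sleep(25 * time.Millisecond) // stand-in for ApplyDMLEventQueries
	busyNs.Add(time.Since(start).Nanoseconds())

	// Stats side: a time.Duration is an int64 nanosecond count, so the
	// conversion back is a plain cast, as in GetWorkerStats.
	busy := time.Duration(busyNs.Load())
	dmlEventsApplied := int64(100) // hypothetical count for illustration

	// 1000.0 * events / busy-milliseconds yields events per second.
	if busy.Milliseconds() > 0 {
		rate := 1000.0 * float64(dmlEventsApplied) / float64(busy.Milliseconds())
		fmt.Printf("busy=%v dmlRate=%.2f/s\n", busy, rate)
	}
}

Storing raw nanoseconds in an atomic integer keeps the hot path lock-free; only GetWorkerStats takes the coordinator mutex, and only to walk the workers slice.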
go/logic/migrator.go

Lines changed: 29 additions & 2 deletions
@@ -456,6 +456,7 @@ waitForGhostTable:
 		return err
 	}
 	this.printStatus(ForcePrintStatusRule)
+	this.printWorkerStats()
 
 	if this.migrationContext.IsCountingTableRows() {
 		this.migrationContext.Log.Info("stopping query for exact row count, because that can accidentally lock out the cut over")
@@ -762,10 +763,13 @@ func (this *Migrator) atomicCutOver() (err error) {
 
 // initiateServer begins listening on unix socket/tcp for incoming interactive commands
 func (this *Migrator) initiateServer() (err error) {
-	var f printStatusFunc = func(rule PrintStatusRule, writer io.Writer) {
+	var printStatus printStatusFunc = func(rule PrintStatusRule, writer io.Writer) {
 		this.printStatus(rule, writer)
 	}
-	this.server = NewServer(this.migrationContext, this.hooksExecutor, f)
+	var printWorkers printWorkersFunc = func(writer io.Writer) {
+		this.printWorkerStats(writer)
+	}
+	this.server = NewServer(this.migrationContext, this.hooksExecutor, printStatus, printWorkers)
 	if err := this.server.BindSocketFile(); err != nil {
 		return err
 	}
@@ -1022,6 +1026,29 @@ func (this *Migrator) shouldPrintMigrationStatusHint(rule PrintStatusRule, elaps
 	return shouldPrint
 }
 
+// printWorkerStats prints cumulative stats from the trxCoordinator workers.
+func (this *Migrator) printWorkerStats(writers ...io.Writer) {
+	writers = append(writers, os.Stdout)
+	mw := io.MultiWriter(writers...)
+
+	busyWorkers := this.trxCoordinator.busyWorkers.Load()
+	totalWorkers := cap(this.trxCoordinator.workerQueue)
+	fmt.Fprintf(mw, "# %d/%d workers are busy\n", busyWorkers, totalWorkers)
+
+	stats := this.trxCoordinator.GetWorkerStats()
+	for id, stat := range stats {
+		fmt.Fprintf(mw,
+			"Worker %d; Waited: %s; Busy: %s; DML Applied: %d (%.2f/s), Trx Applied: %d (%.2f/s)\n",
+			id,
+			base.PrettifyDurationOutput(stat.waitTime),
+			base.PrettifyDurationOutput(stat.busyTime),
+			stat.dmlEventsApplied,
+			stat.dmlRate,
+			stat.executedJobs,
+			stat.trxRate)
+	}
+}
+
 // printStatus prints the progress status, and optionally additionally detailed
 // dump of configuration.
 // `rule` indicates the type of output expected.

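printWorkerStats appends os.Stdout to whatever writers it is given and wraps them in io.MultiWriter, so a single Fprintf reaches both the interactive client and the migrator's standard output. A self-contained illustration of that fan-out, where the bytes.Buffer is a stand-in for the client's socket connection:

package main

import (
	"bytes"
	"fmt"
	"io"
	"os"
)

func main() {
	var client bytes.Buffer // stand-in for the interactive client's connection
	writers := []io.Writer{&client}

	// Mirror printWorkerStats: always include stdout, then fan out.
	writers = append(writers, os.Stdout)
	mw := io.MultiWriter(writers...)

	fmt.Fprintf(mw, "# %d/%d workers are busy\n", 3, 8) // reaches both writers

	fmt.Printf("client received: %s", client.String())
}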
go/logic/server.go

Lines changed: 7 additions & 1 deletion
@@ -32,6 +32,7 @@ var (
 )
 
 type printStatusFunc func(PrintStatusRule, io.Writer)
+type printWorkersFunc func(io.Writer)
 
 // Server listens for requests on a socket file or via TCP
 type Server struct {
@@ -40,14 +41,16 @@ type Server struct {
 	tcpListener    net.Listener
 	hooksExecutor  *HooksExecutor
 	printStatus    printStatusFunc
+	printWorkers   printWorkersFunc
 	isCPUProfiling int64
 }
 
-func NewServer(migrationContext *base.MigrationContext, hooksExecutor *HooksExecutor, printStatus printStatusFunc) *Server {
+func NewServer(migrationContext *base.MigrationContext, hooksExecutor *HooksExecutor, printStatus printStatusFunc, printWorkers printWorkersFunc) *Server {
 	return &Server{
 		migrationContext: migrationContext,
 		hooksExecutor:    hooksExecutor,
 		printStatus:      printStatus,
+		printWorkers:     printWorkers,
 	}
 }
 
@@ -232,6 +235,9 @@ help # This message
 		return ForcePrintStatusOnlyRule, nil
 	case "info", "status":
 		return ForcePrintStatusAndHintRule, nil
+	case "worker-stats":
+		this.printWorkers(writer)
+		return NoPrintStatusRule, nil
 	case "cpu-profile":
 		cpuProfile, err := this.runCPUProfile(arg)
 		if err == nil {

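With the worker-stats case wired into the command switch, the stats can be requested over the same unix socket as other interactive commands such as status. A minimal client sketch, assuming a hypothetical socket path (gh-ost derives the real path from the migrated schema and table names):

package main

import (
	"fmt"
	"io"
	"net"
	"os"
)

func main() {
	// Hypothetical socket path; substitute the one your migration reports.
	conn, err := net.Dial("unix", "/tmp/gh-ost.test.sample_table.sock")
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
	defer conn.Close()

	// Send the new interactive command, newline-terminated.
	fmt.Fprintln(conn, "worker-stats")

	// Stream the per-worker stats the server writes back.
	io.Copy(os.Stdout, conn)
}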
0 commit comments

Comments
 (0)