Skip to content

Commit 3dc4930

Browse files
author
Shlomi Noach
authored
Merge branch 'master' into support-aliyun-rds
2 parents 619f982 + 3b7f1a7 commit 3dc4930

File tree

3 files changed

+50
-1
lines changed

3 files changed

+50
-1
lines changed

go/base/context.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,8 @@ type MigrationContext struct {
119119
CriticalLoadHibernateSeconds int64
120120
PostponeCutOverFlagFile string
121121
CutOverLockTimeoutSeconds int64
122+
CutOverExponentialBackoff bool
123+
ExponentialBackoffMaxInterval int64
122124
ForceNamedCutOverCommand bool
123125
PanicFlagFile string
124126
HooksPath string
@@ -342,6 +344,14 @@ func (this *MigrationContext) SetCutOverLockTimeoutSeconds(timeoutSeconds int64)
342344
return nil
343345
}
344346

347+
func (this *MigrationContext) SetExponentialBackoffMaxInterval(intervalSeconds int64) error {
348+
if intervalSeconds < 2 {
349+
return fmt.Errorf("Minimal maximum interval is 2sec. Timeout remains at %d", this.ExponentialBackoffMaxInterval)
350+
}
351+
this.ExponentialBackoffMaxInterval = intervalSeconds
352+
return nil
353+
}
354+
345355
func (this *MigrationContext) SetDefaultNumRetries(retries int64) {
346356
this.throttleMutex.Lock()
347357
defer this.throttleMutex.Unlock()

go/cmd/gh-ost/main.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,8 @@ func main() {
8383

8484
flag.BoolVar(&migrationContext.SwitchToRowBinlogFormat, "switch-to-rbr", false, "let this tool automatically switch binary log format to 'ROW' on the replica, if needed. The format will NOT be switched back. I'm too scared to do that, and wish to protect you if you happen to execute another migration while this one is running")
8585
flag.BoolVar(&migrationContext.AssumeRBR, "assume-rbr", false, "set to 'true' when you know for certain your server uses 'ROW' binlog_format. gh-ost is unable to tell, event after reading binlog_format, whether the replication process does indeed use 'ROW', and restarts replication to be certain RBR setting is applied. Such operation requires SUPER privileges which you might not have. Setting this flag avoids restarting replication and you can proceed to use gh-ost without SUPER privileges")
86+
flag.BoolVar(&migrationContext.CutOverExponentialBackoff, "cut-over-exponential-backoff", false, "Wait exponentially longer intervals between failed cut-over attempts. Wait intervals obey a maximum configurable with 'exponential-backoff-max-interval').")
87+
exponentialBackoffMaxInterval := flag.Int64("exponential-backoff-max-interval", 64, "Maximum number of seconds to wait between attempts when performing various operations with exponential backoff.")
8688
chunkSize := flag.Int64("chunk-size", 1000, "amount of rows to handle in each iteration (allowed range: 100-100,000)")
8789
dmlBatchSize := flag.Int64("dml-batch-size", 10, "batch size for DML events to apply in a single transaction (range 1-100)")
8890
defaultRetries := flag.Int64("default-retries", 60, "Default number of retries for various operations before panicking")
@@ -238,6 +240,9 @@ func main() {
238240
if err := migrationContext.SetCutOverLockTimeoutSeconds(*cutOverLockTimeoutSeconds); err != nil {
239241
log.Errore(err)
240242
}
243+
if err := migrationContext.SetExponentialBackoffMaxInterval(*exponentialBackoffMaxInterval); err != nil {
244+
log.Errore(err)
245+
}
241246

242247
log.Infof("starting gh-ost %+v", AppVersion)
243248
acceptSignals(migrationContext)

go/logic/migrator.go

Lines changed: 35 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,34 @@ func (this *Migrator) retryOperation(operation func() error, notFatalHint ...boo
149149
return err
150150
}
151151

152+
// `retryOperationWithExponentialBackoff` attempts running given function, waiting 2^(n-1)
153+
// seconds between each attempt, where `n` is the running number of attempts. Exits
154+
// as soon as the function returns with non-error, or as soon as `MaxRetries`
155+
// attempts are reached. Wait intervals between attempts obey a maximum of
156+
// `ExponentialBackoffMaxInterval`.
157+
func (this *Migrator) retryOperationWithExponentialBackoff(operation func() error, notFatalHint ...bool) (err error) {
158+
var interval int64
159+
maxRetries := int(this.migrationContext.MaxRetries())
160+
maxInterval := this.migrationContext.ExponentialBackoffMaxInterval
161+
for i := 0; i < maxRetries; i++ {
162+
newInterval := int64(math.Exp2(float64(i - 1)))
163+
if newInterval <= maxInterval {
164+
interval = newInterval
165+
}
166+
if i != 0 {
167+
time.Sleep(time.Duration(interval) * time.Second)
168+
}
169+
err = operation()
170+
if err == nil {
171+
return nil
172+
}
173+
}
174+
if len(notFatalHint) == 0 {
175+
this.migrationContext.PanicAbort <- err
176+
}
177+
return err
178+
}
179+
152180
// executeAndThrottleOnError executes a given function. If it errors, it
153181
// throttles.
154182
func (this *Migrator) executeAndThrottleOnError(operation func() error) (err error) {
@@ -372,7 +400,13 @@ func (this *Migrator) Migrate() (err error) {
372400
if err := this.hooksExecutor.onBeforeCutOver(); err != nil {
373401
return err
374402
}
375-
if err := this.retryOperation(this.cutOver); err != nil {
403+
var retrier func(func() error, ...bool) error
404+
if this.migrationContext.CutOverExponentialBackoff {
405+
retrier = this.retryOperationWithExponentialBackoff
406+
} else {
407+
retrier = this.retryOperation
408+
}
409+
if err := retrier(this.cutOver); err != nil {
376410
return err
377411
}
378412
atomic.StoreInt64(&this.migrationContext.CutOverCompleteFlag, 1)

0 commit comments

Comments
 (0)