Skip to content

Commit c488578

Browse files
committed
roachtest: add more retries to sql-stats/mixed-version
Previously, we had some HTTP requests to SQL Stats which did not retry. Retries have been added there. Additionally, we will retry on HTTP errors as well to deal with timeout errors and cluster network issues. This test is bringing nodes up and down during upgrades and a retry is sometimes necessary to make sure we don't get stuck waiting on a draining node. Resolves: cockroachdb#146699 Release note: None
1 parent dc81a93 commit c488578

File tree

1 file changed

+25
-3
lines changed

1 file changed

+25
-3
lines changed

pkg/cmd/roachtest/tests/mixed_version_sql_stats.go

Lines changed: 25 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -123,9 +123,25 @@ func runGetRequests(
123123
// We can ensure we request from in-memory tables by requesting stats from
124124
// a time range for which no data is available, as the in-memory table
125125
// is only used when no data is available from system tables.
126-
if err := s.requestSQLStatsFromEmptyInterval(ctx, roachNodes, statsType, addToTableSources); err != nil {
127-
return errors.Wrap(err, "failed to request stats for empty interval")
126+
rEmpty := retry.StartWithCtx(ctx, retry.Options{
127+
InitialBackoff: 10 * time.Second,
128+
MaxBackoff: 30 * time.Second,
129+
MaxRetries: 6, // 6 * 30s = 3m, which is well past the flush interval.
130+
})
131+
var lastErr error
132+
for rEmpty.Next() {
133+
if err := s.requestSQLStatsFromEmptyInterval(ctx, roachNodes, statsType, addToTableSources); err != nil {
134+
// We retry despite these errors because they could be due to cluster
135+
// operations which cause timeouts and instability.
136+
lastErr = errors.Wrap(err, "failed to request stats for empty interval")
137+
continue
138+
}
139+
break
140+
}
141+
if lastErr != nil {
142+
return lastErr
128143
}
144+
129145
inMemTable := crdbInternalStmtStatsCombined
130146
if statsType == serverpb.CombinedStatementsStatsRequest_TxnStatsOnly {
131147
inMemTable = crdbInternalTxnStatsCombined
@@ -144,14 +160,20 @@ func runGetRequests(
144160
for r.Next() {
145161
// Requesting data from the last two hours should return data from the persisted tables eventually.
146162
if err := s.requestSQLStatsFromLastTwoHours(ctx, roachNodes, statsType, addToTableSources); err != nil {
147-
return errors.Wrap(err, "failed to request stats for last two hours")
163+
// We retry despite these errors because they could be due to cluster
164+
// operations which cause timeouts and instability.
165+
lastErr = errors.Wrap(err, "failed to request stats for last two hours")
166+
continue
148167
}
149168
if len(foundTableSources) >= expectedTableCount {
150169
foundFlushedStats = true
151170
break
152171
}
153172
l.Printf("waiting for flushed stats...")
154173
}
174+
if lastErr != nil {
175+
return lastErr
176+
}
155177
if !foundFlushedStats {
156178
return errors.Newf("failed to find flushed stats, found: %v", maps.Keys(foundTableSources))
157179
}

0 commit comments

Comments
 (0)