Skip to content

Commit 6796a32

Browse files
authored
DAOS-17607 bio: SPDK I/O monitor (#17071)
Monitor in-flight SPDK I/Os. If any I/O isn't completed within a certain amount of time (120 seconds by default, configurable through the env var DAOS_SPDK_IO_TIMEOUT), we assume the SPDK I/O is stalled due to a hardware issue (or software bug); a RAS event will be raised and the corresponding device will be marked as faulty. Signed-off-by: Niu Yawei <[email protected]>
1 parent 2e560d8 commit 6796a32

File tree

8 files changed

+160
-16
lines changed

8 files changed

+160
-16
lines changed

src/bio/bio_buffer.c

Lines changed: 43 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -280,6 +280,7 @@ bio_iod_alloc(struct bio_io_context *ctxt, struct umem_instance *umem,
280280
return NULL;
281281

282282
D_ASSERT(type < BIO_IOD_TYPE_MAX);
283+
bio_io_lug_init(&biod->bd_io_lug);
283284
biod->bd_umem = umem;
284285
biod->bd_ctxt = ctxt;
285286
biod->bd_type = type;
@@ -336,6 +337,7 @@ bio_iod_free(struct bio_desc *biod)
336337
bio_sgl_fini(&biod->bd_sgls[i]);
337338

338339
D_FREE(biod->bd_bulk_hdls);
340+
bio_io_lug_fini(&biod->bd_io_lug);
339341

340342
D_FREE(biod);
341343
}
@@ -1041,8 +1043,7 @@ rw_completion(void *cb_arg, int err)
10411043

10421044
bxb = biod->bd_ctxt->bic_xs_blobstore;
10431045
D_ASSERT(bxb != NULL);
1044-
D_ASSERT(bxb->bxb_blob_rw > 0);
1045-
bxb->bxb_blob_rw--;
1046+
bio_io_lug_dequeue(bxb, &biod->bd_io_lug);
10461047

10471048
io_ctxt = biod->bd_ctxt;
10481049
D_ASSERT(io_ctxt != NULL);
@@ -1184,7 +1185,7 @@ nvme_rw(struct bio_desc *biod, struct bio_rsrvd_region *rg)
11841185

11851186
biod->bd_dma_issued = 1;
11861187
biod->bd_inflights++;
1187-
bxb->bxb_blob_rw++;
1188+
bio_io_lug_enqueue(xs_ctxt, bxb, &biod->bd_io_lug);
11881189
biod->bd_ctxt->bic_inflight_dmas++;
11891190

11901191
rw_cnt = (pg_cnt > bio_chk_sz) ? bio_chk_sz : pg_cnt;
@@ -1982,3 +1983,42 @@ bio_copy(struct bio_io_context *ioctxt, struct umem_instance *umem,
19821983

19831984
return rc;
19841985
}
1986+
1987+
#define IO_MONITOR_INTVL 1000000 /* us, 1 second */
1988+
1989+
void
1990+
bio_io_monitor(struct bio_xs_context *xs_ctxt, uint64_t now)
1991+
{
1992+
enum smd_dev_type st;
1993+
struct bio_xs_blobstore *bxb;
1994+
struct bio_io_lug *io_lug;
1995+
struct media_error_msg *mem;
1996+
1997+
if ((xs_ctxt->bxc_io_monitor_ts + IO_MONITOR_INTVL) > now)
1998+
return;
1999+
2000+
xs_ctxt->bxc_io_monitor_ts = now;
2001+
2002+
for (st = SMD_DEV_TYPE_DATA; st < SMD_DEV_TYPE_MAX; st++) {
2003+
bxb = xs_ctxt->bxc_xs_blobstores[st];
2004+
2005+
if (!bxb || d_list_empty(&bxb->bxb_pending_ios))
2006+
continue;
2007+
2008+
io_lug = d_list_entry(bxb->bxb_pending_ios.next, struct bio_io_lug, bil_link);
2009+
D_ASSERT(io_lug->bil_submit_ts != 0);
2010+
2011+
if ((io_lug->bil_submit_ts + bio_io_timeout) >= now)
2012+
continue;
2013+
2014+
D_ALLOC_PTR(mem);
2015+
if (mem == NULL) {
2016+
D_ERROR("Out of memory: NVMe stalled I/O report is skipped\n");
2017+
continue;
2018+
}
2019+
mem->mem_err_type = MET_IO_STALLED;
2020+
mem->mem_bs = bxb->bxb_blobstore;
2021+
mem->mem_tgt_id = xs_ctxt->bxc_tgt_id;
2022+
spdk_thread_send_msg(owner_thread(mem->mem_bs), bio_media_error, mem);
2023+
}
2024+
}

src/bio/bio_context.c

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
struct blob_cp_arg {
1414
spdk_blob_id bca_id;
1515
struct spdk_blob *bca_blob;
16+
struct bio_io_lug bca_io_lug;
1617
/*
1718
* Completion could run on different xstream when NVMe
1819
* device is shared by multiple xstreams.
@@ -36,6 +37,7 @@ blob_cp_arg_init(struct blob_cp_arg *ba)
3637
{
3738
int rc;
3839

40+
bio_io_lug_init(&ba->bca_io_lug);
3941
rc = ABT_eventual_create(0, &ba->bca_eventual);
4042
if (rc != ABT_SUCCESS)
4143
return dss_abterr2der(rc);
@@ -46,6 +48,7 @@ blob_cp_arg_init(struct blob_cp_arg *ba)
4648
static inline void
4749
blob_cp_arg_fini(struct blob_cp_arg *ba)
4850
{
51+
bio_io_lug_fini(&ba->bca_io_lug);
4952
ABT_eventual_free(&ba->bca_eventual);
5053
}
5154

@@ -164,8 +167,7 @@ blob_unmap_cb(void *arg, int rc)
164167

165168
bxb = bma->bma_ioc->bic_xs_blobstore;
166169
D_ASSERT(bxb != NULL);
167-
D_ASSERT(bxb->bxb_blob_rw > 0);
168-
bxb->bxb_blob_rw--;
170+
bio_io_lug_dequeue(bxb, &ba->bca_io_lug);
169171

170172
blob_common_cb(ba, rc);
171173
}
@@ -1232,7 +1234,7 @@ blob_unmap_sgl(struct bio_io_context *ioctxt, d_sg_list_t *unmap_sgl, uint32_t b
12321234
drain_inflight_ios(xs_ctxt, bxb);
12331235

12341236
ba->bca_inflights++;
1235-
bxb->bxb_blob_rw++;
1237+
bio_io_lug_enqueue(xs_ctxt, bxb, &ba->bca_io_lug);
12361238

12371239
pg_off = (uint64_t)unmap_iov->iov_buf;
12381240
pg_cnt = unmap_iov->iov_len;

src/bio/bio_internal.h

Lines changed: 63 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -278,7 +278,7 @@ struct bio_dev_health {
278278
void *bdh_intel_smart_buf; /*Intel SMART attributes*/
279279
uint64_t bdh_stat_age;
280280
unsigned int bdh_inflights;
281-
unsigned int bdh_stopping:1;
281+
unsigned int bdh_stopping : 1, bdh_io_stalled : 1;
282282
uint16_t bdh_vendor_id; /* PCI vendor ID */
283283

284284
/**
@@ -365,10 +365,21 @@ struct bio_blobstore {
365365
bb_faulty_done:1; /* Faulty reaction is done */
366366
};
367367

368+
struct bio_io_lug {
369+
/* Link to bio_xs_blobstore::bxb_pending_ios */
370+
d_list_t bil_link;
371+
/* When the I/O is submitted */
372+
uint64_t bil_submit_ts;
373+
/* Reference count */
374+
uint32_t bil_ref;
375+
};
376+
368377
/* Per-xstream blobstore */
369378
struct bio_xs_blobstore {
370379
/* In-flight blob read/write */
371380
unsigned int bxb_blob_rw;
381+
/* Pending I/Os */
382+
d_list_t bxb_pending_ios;
372383
/* spdk io channel */
373384
struct spdk_io_channel *bxb_io_channel;
374385
/* per bio blobstore */
@@ -381,13 +392,60 @@ struct bio_xs_blobstore {
381392
/* Per-xstream NVMe context */
382393
struct bio_xs_context {
383394
int bxc_tgt_id;
395+
uint64_t bxc_io_monitor_ts;
384396
struct spdk_thread *bxc_thread;
385397
struct bio_xs_blobstore *bxc_xs_blobstores[SMD_DEV_TYPE_MAX];
386398
struct bio_dma_buffer *bxc_dma_buf;
387399
unsigned int bxc_self_polling:1; /* for standalone VOS */
388400
unsigned int bxc_skip_draining : 1;
389401
};
390402

403+
static inline void
404+
bio_io_lug_init(struct bio_io_lug *io_lug)
405+
{
406+
D_INIT_LIST_HEAD(&io_lug->bil_link);
407+
io_lug->bil_submit_ts = 0;
408+
io_lug->bil_ref = 0;
409+
}
410+
411+
static inline void
412+
bio_io_lug_fini(struct bio_io_lug *io_lug)
413+
{
414+
D_ASSERT(io_lug->bil_ref == 0);
415+
D_ASSERT(d_list_empty(&io_lug->bil_link));
416+
}
417+
418+
static inline void
419+
bio_io_lug_dequeue(struct bio_xs_blobstore *bxb, struct bio_io_lug *io_lug)
420+
{
421+
D_ASSERT(bxb->bxb_blob_rw > 0);
422+
bxb->bxb_blob_rw--;
423+
424+
D_ASSERT(!d_list_empty(&io_lug->bil_link));
425+
D_ASSERT(io_lug->bil_submit_ts != 0);
426+
D_ASSERT(io_lug->bil_ref > 0);
427+
io_lug->bil_ref--;
428+
if (io_lug->bil_ref == 0)
429+
d_list_del_init(&io_lug->bil_link);
430+
}
431+
432+
static inline void
433+
bio_io_lug_enqueue(struct bio_xs_context *xs_ctxt, struct bio_xs_blobstore *bxb,
434+
struct bio_io_lug *io_lug)
435+
{
436+
bxb->bxb_blob_rw++;
437+
if (io_lug->bil_ref == 0) {
438+
if (xs_ctxt->bxc_io_monitor_ts)
439+
io_lug->bil_submit_ts = xs_ctxt->bxc_io_monitor_ts;
440+
else
441+
io_lug->bil_submit_ts = d_timeus_secdiff(0);
442+
443+
D_ASSERT(d_list_empty(&io_lug->bil_link));
444+
d_list_add_tail(&io_lug->bil_link, &bxb->bxb_pending_ios);
445+
}
446+
io_lug->bil_ref++;
447+
}
448+
391449
/* Per VOS instance I/O context */
392450
struct bio_io_context {
393451
d_list_t bic_link; /* link to bxb_io_ctxts */
@@ -437,6 +495,7 @@ struct bio_rsrvd_dma {
437495

438496
/* I/O descriptor */
439497
struct bio_desc {
498+
struct bio_io_lug bd_io_lug;
440499
struct umem_instance *bd_umem;
441500
struct bio_io_context *bd_ctxt;
442501
/* DMA buffers reserved by this io descriptor */
@@ -546,6 +605,7 @@ extern unsigned int bio_chk_cnt_max;
546605
extern unsigned int bio_numa_node;
547606
extern unsigned int bio_spdk_max_unmap_cnt;
548607
extern unsigned int bio_max_async_sz;
608+
extern unsigned int bio_io_timeout;
549609

550610
int xs_poll_completion(struct bio_xs_context *ctxt, unsigned int *inflights,
551611
uint64_t timeout);
@@ -583,6 +643,8 @@ int iod_add_region(struct bio_desc *biod, struct bio_dma_chunk *chk,
583643
uint64_t end, uint8_t media);
584644
int dma_buffer_grow(struct bio_dma_buffer *buf, unsigned int cnt);
585645
void iod_dma_wait(struct bio_desc *biod);
646+
void
647+
bio_io_monitor(struct bio_xs_context *xs_ctxt, uint64_t now);
586648

587649
static inline struct bio_dma_buffer *
588650
iod_dma_buf(struct bio_desc *biod)

src/bio/bio_monitor.c

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -680,7 +680,8 @@ get_spdk_health_info_completion(struct spdk_bdev_io *bdev_io, bool success,
680680
static bool
681681
is_bbs_faulty(struct bio_blobstore *bbs)
682682
{
683-
struct nvme_stats *dev_stats = &bbs->bb_dev_health.bdh_health_state;
683+
struct bio_dev_health *bdh = &bbs->bb_dev_health;
684+
struct nvme_stats *dev_stats = &bdh->bdh_health_state;
684685

685686
/*
686687
* Used for DAOS NVMe Recovery Tests. Will trigger bs faulty reaction
@@ -708,6 +709,12 @@ is_bbs_faulty(struct bio_blobstore *bbs)
708709
}
709710
}
710711

712+
/* Auto-faulty for stalled I/O is always enabled */
713+
if (bdh->bdh_io_stalled) {
714+
D_ERROR("I/O stalled on NVMe device " DF_UUID "\n", DP_UUID(bbs->bb_dev->bb_uuid));
715+
return true;
716+
}
717+
711718
if (!glb_criteria.fc_enabled)
712719
return false;
713720

src/bio/bio_recovery.c

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -697,11 +697,20 @@ bio_media_error(void *msg_arg)
697697
"Device: "DF_UUID" csum error logged from tgt_id:%d\n",
698698
DP_UUID(mem->mem_bs->bb_dev->bb_uuid), mem->mem_tgt_id);
699699
break;
700+
case MET_IO_STALLED:
701+
/* I/O stalling has already been reported for this device */
702+
if (bdh->bdh_io_stalled)
703+
goto out;
704+
bdh->bdh_io_stalled = 1;
705+
snprintf(err_str, DAOS_RAS_STR_FIELD_SIZE,
706+
"Device: " DF_UUID " stalled I/O logged from tgt_id:%d\n",
707+
DP_UUID(mem->mem_bs->bb_dev->bb_uuid), mem->mem_tgt_id);
708+
break;
700709
}
701710

702711
ras_notify_event(RAS_DEVICE_MEDIA_ERROR, err_str, RAS_TYPE_INFO, RAS_SEV_ERROR,
703712
NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL);
713+
out:
704714
auto_faulty_detect(mem->mem_bs);
705-
706715
D_FREE(mem);
707716
}

src/bio/bio_xstream.c

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@ unsigned int bio_spdk_subsys_timeout = 25000; /* ms */
6262
/* How many blob unmap calls can be called in a row */
6363
unsigned int bio_spdk_max_unmap_cnt = 32;
6464
unsigned int bio_max_async_sz = (1UL << 15) /* 32k */;
65+
unsigned int bio_io_timeout = 120000000; /* us, 120 seconds */
6566

6667
struct bio_nvme_data {
6768
ABT_mutex bd_mutex;
@@ -219,7 +220,7 @@ bio_nvme_init_ext(const char *nvme_conf, int numa_node, unsigned int mem_size,
219220
{
220221
char *env;
221222
int rc, fd;
222-
unsigned int size_mb = BIO_DMA_CHUNK_MB;
223+
unsigned int size_mb = BIO_DMA_CHUNK_MB, io_timeout_secs = 0;
223224

224225
if (tgt_nr <= 0) {
225226
D_ERROR("tgt_nr: %u should be > 0\n", tgt_nr);
@@ -277,6 +278,16 @@ bio_nvme_init_ext(const char *nvme_conf, int numa_node, unsigned int mem_size,
277278
d_getenv_uint("DAOS_MAX_ASYNC_SZ", &bio_max_async_sz);
278279
D_INFO("Max async data size is set to %u bytes\n", bio_max_async_sz);
279280

281+
d_getenv_uint("DAOS_SPDK_IO_TIMEOUT", &io_timeout_secs);
282+
if (io_timeout_secs > 0) {
283+
if (io_timeout_secs < 30 || io_timeout_secs > 300)
284+
D_WARN("DAOS_SPDK_IO_TIMEOUT(%u) is invalid. Min:30,Max:300,Default:120\n",
285+
io_timeout_secs);
286+
else
287+
bio_io_timeout = io_timeout_secs * 1000000; /* convert to us */
288+
}
289+
D_INFO("SPDK IO timeout set to %u us\n", bio_io_timeout);
290+
280291
/* Hugepages disabled */
281292
if (mem_size == 0) {
282293
D_INFO("Set per-xstream DMA buffer upper bound to %u %uMB chunks\n",
@@ -1241,6 +1252,7 @@ alloc_xs_blobstore(void)
12411252
if (bxb == NULL)
12421253
return NULL;
12431254

1255+
D_INIT_LIST_HEAD(&bxb->bxb_pending_ios);
12441256
D_INIT_LIST_HEAD(&bxb->bxb_io_ctxts);
12451257

12461258
return bxb;
@@ -1761,8 +1773,10 @@ bio_nvme_ctl(unsigned int cmd, void *arg)
17611773
static inline void
17621774
reset_media_errors(struct bio_blobstore *bbs)
17631775
{
1764-
struct nvme_stats *dev_stats = &bbs->bb_dev_health.bdh_health_state;
1776+
struct bio_dev_health *bdh = &bbs->bb_dev_health;
1777+
struct nvme_stats *dev_stats = &bdh->bdh_health_state;
17651778

1779+
bdh->bdh_io_stalled = 0;
17661780
dev_stats->bio_read_errs = 0;
17671781
dev_stats->bio_write_errs = 0;
17681782
dev_stats->bio_unmap_errs = 0;
@@ -1992,5 +2006,8 @@ bio_nvme_poll(struct bio_xs_context *ctxt)
19922006
bio_led_event_monitor(ctxt, now);
19932007
}
19942008

2009+
/* Detect stalled I/Os */
2010+
bio_io_monitor(ctxt, now);
2011+
19952012
return rc;
19962013
}

src/engine/sched.c

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1928,7 +1928,13 @@ need_nvme_poll(struct dss_xstream *dx, struct sched_cycle *cycle)
19281928

19291929
dmi = dss_get_module_info();
19301930
D_ASSERT(dmi != NULL);
1931-
return bio_need_nvme_poll(dmi->dmi_nvme_ctxt);
1931+
/*
1932+
* If SPDK I/O stalls indefinitely due to a hardware fault (or software bug),
1933+
* the resulting backlog of undrained I/Os will cause bio_need_nvme_poll() to
1934+
* consistently return true. To prevent starvation and ensure system progress,
1935+
* schedule the NVMe polling ULT and other ULTs in an interleaved fashion.
1936+
*/
1937+
return !cycle->sc_age_nvme && bio_need_nvme_poll(dmi->dmi_nvme_ctxt);
19321938
}
19331939

19341940
static ABT_unit

src/include/daos_srv/daos_engine.h

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -739,10 +739,11 @@ enum dss_init_state {
739739
};
740740

741741
enum dss_media_error_type {
742-
MET_WRITE = 0, /* write error */
743-
MET_READ, /* read error */
744-
MET_UNMAP, /* unmap error */
745-
MET_CSUM /* checksum error */
742+
MET_WRITE = 0, /* NVMe write error */
743+
MET_READ, /* NVMe read error */
744+
MET_UNMAP, /* NVMe unmap error */
745+
MET_CSUM, /* Checksum error */
746+
MET_IO_STALLED, /* NVMe I/O stalled */
746747
};
747748

748749
void dss_init_state_set(enum dss_init_state state);

0 commit comments

Comments
 (0)