From 6a242911eef1cc24ef0de5584889ca31a1720541 Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Mon, 26 Feb 2024 17:08:46 +0800 Subject: [PATCH 1/4] anolis: fs: record page or bio info while process is waiting on it cherry picked from devel-6.6 commit 19aa2fcdc9b7224c17546f83b4e26f3d13731f49. ANBZ: #34068 If one process context is stuck in wait_on_buffer(), lock_buffer(), lock_page(), wait_on_page_writeback() or wait_on_bit_io(), it's hard to tell the true reason, for example, whether this page is under io, or this page has just been locked too long by another process context. Normally an io request has multiple bios, and every bio contains multiple pages which will hold data to be read from or written to the device, so here we record page info or bio info in task_struct while a process calls lock_page(), lock_buffer(), wait_on_page_writeback(), wait_on_buffer() or wait_on_bit_io(), and we add a new proc interface: [lege@localhost linux]$ cat /proc/4516/wait_res 1 ffffd0969f95d3c0 4295369599 4295381596 Above info means that thread 4516 is waiting on a page, address is ffffd0969f95d3c0, and has waited for 11997ms. The first field denotes the type of resource being waited on, the second field the page address the process is waiting on, the third field the wait moment and the fourth field the current moment. In practice, if we find a process waiting on one page for too long a time, we can get the page's address by reading /proc/$pid/wait_res, and search this page address in every block device's /sys/kernel/debug/block/${devname}/rq_hang; if the search operation hits one, we can get the request and know why this io request hangs that long. 
Signed-off-by: Xiaoguang Wang [ change permission of wait_res to S_IRUSR, and record in folio_wait_writeback() ] Signed-off-by: Joseph Qi --- block/bio.c | 2 ++ fs/jbd2/transaction.c | 2 ++ fs/proc/base.c | 12 +++++++++++ include/linux/buffer_head.h | 10 ++++++++-- include/linux/sched.h | 40 +++++++++++++++++++++++++++++++++++++ mm/filemap.c | 2 ++ mm/page-writeback.c | 2 ++ 7 files changed, 68 insertions(+), 2 deletions(-) diff --git a/block/bio.c b/block/bio.c index 641ef0928d73..4d500beef061 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1510,7 +1510,9 @@ EXPORT_SYMBOL_GPL(bio_await); */ int submit_bio_wait(struct bio *bio) { + task_set_wait_res(TASK_WAIT_BIO, bio); bio_await(bio, NULL, NULL); + task_clear_wait_res(); return blk_status_to_errno(bio->bi_status); } EXPORT_SYMBOL(submit_bio_wait); diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index f8c238b3aadb..9a5a9970a8bf 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -1115,7 +1115,9 @@ do_get_write_access(handle_t *handle, struct journal_head *jh, if (buffer_shadow(bh)) { JBUFFER_TRACE(jh, "on shadow: sleep"); spin_unlock(&jh->b_state_lock); + task_set_wait_res(TASK_WAIT_FOLIO, bh->b_folio); wait_on_bit_io(&bh->b_state, BH_Shadow, TASK_UNINTERRUPTIBLE); + task_clear_wait_res(); goto repeat; } diff --git a/fs/proc/base.c b/fs/proc/base.c index d9acfa89c894..abf4903dafa9 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -603,6 +603,16 @@ static int proc_oom_score(struct seq_file *m, struct pid_namespace *ns, return 0; } +static int proc_wait_res(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) +{ + seq_printf(m, "%d %px %lu %lu\n", READ_ONCE(task->wait_res_type), + READ_ONCE(task->wait_folio), READ_ONCE(task->wait_moment), + jiffies); + + return 0; +} + struct limit_names { const char *name; const char *unit; @@ -3421,6 +3431,7 @@ static const struct pid_entry tgid_base_stuff[] = { ONE("ksm_merging_pages", S_IRUSR, 
proc_pid_ksm_merging_pages), ONE("ksm_stat", S_IRUSR, proc_pid_ksm_stat), #endif + ONE("wait_res", S_IRUSR, proc_wait_res), }; static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx) @@ -3755,6 +3766,7 @@ static const struct pid_entry tid_base_stuff[] = { ONE("ksm_merging_pages", S_IRUSR, proc_pid_ksm_merging_pages), ONE("ksm_stat", S_IRUSR, proc_pid_ksm_stat), #endif + ONE("wait_res", S_IRUSR, proc_wait_res), }; static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx) diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index e4939e33b4b5..5f1a1dcbbd20 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -416,8 +416,11 @@ map_bh(struct buffer_head *bh, struct super_block *sb, sector_t block) static inline void wait_on_buffer(struct buffer_head *bh) { might_sleep(); - if (buffer_locked(bh)) + if (buffer_locked(bh)) { + task_set_wait_res(TASK_WAIT_FOLIO, bh->b_folio); __wait_on_buffer(bh); + task_clear_wait_res(); + } } static inline int trylock_buffer(struct buffer_head *bh) @@ -428,8 +431,11 @@ static inline int trylock_buffer(struct buffer_head *bh) static inline void lock_buffer(struct buffer_head *bh) { might_sleep(); - if (!trylock_buffer(bh)) + if (!trylock_buffer(bh)) { + task_set_wait_res(TASK_WAIT_FOLIO, bh->b_folio); __lock_buffer(bh); + task_clear_wait_res(); + } } static inline void bh_readahead(struct buffer_head *bh, blk_opf_t op_flags) diff --git a/include/linux/sched.h b/include/linux/sched.h index 213033e7109a..3614e5f20d91 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1678,6 +1678,13 @@ struct task_struct { struct unwind_task_info unwind_info; #endif + int wait_res_type; + union { + struct folio *wait_folio; + struct bio *wait_bio; + }; + unsigned long wait_moment; + /* CPU-specific state of this task: */ struct thread_struct thread; @@ -1697,6 +1704,39 @@ struct task_struct { CK_KABI_RESERVE(8) } __attribute__ ((aligned (64))); +/* copy from 
jiffies.h to avoid circular dependency */ +extern unsigned long volatile __cacheline_aligned_in_smp jiffies; + +enum { + TASK_WAIT_FOLIO = 1, + TASK_WAIT_BIO, +}; + +static inline void task_set_wait_res(int type, void *res) +{ + switch (type) { + case TASK_WAIT_FOLIO: + current->wait_folio = (struct folio *)res; + break; + case TASK_WAIT_BIO: + current->wait_bio = (struct bio *)res; + break; + default: + current->wait_folio = NULL; + break; + } + + current->wait_res_type = type; + current->wait_moment = jiffies; +} + +static inline void task_clear_wait_res(void) +{ + current->wait_folio = NULL; + current->wait_res_type = 0; + current->wait_moment = 0; +} + #ifdef CONFIG_SCHED_PROXY_EXEC DECLARE_STATIC_KEY_TRUE(__sched_proxy_exec); static inline bool sched_proxy_exec(void) diff --git a/mm/filemap.c b/mm/filemap.c index 4e636647100c..4b3f2c183044 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1702,8 +1702,10 @@ EXPORT_SYMBOL(folio_end_writeback); */ void __folio_lock(struct folio *folio) { + task_set_wait_res(TASK_WAIT_FOLIO, folio); folio_wait_bit_common(folio, PG_locked, TASK_UNINTERRUPTIBLE, EXCLUSIVE); + task_clear_wait_res(); } EXPORT_SYMBOL(__folio_lock); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 88cd53d4ba09..2a6d361a2e96 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -3063,7 +3063,9 @@ void folio_wait_writeback(struct folio *folio) { while (folio_test_writeback(folio)) { trace_folio_wait_writeback(folio, folio_mapping(folio)); + task_set_wait_res(TASK_WAIT_FOLIO, folio); folio_wait_bit(folio, PG_writeback); + task_clear_wait_res(); } } EXPORT_SYMBOL_GPL(folio_wait_writeback); -- Gitee From 93de3ecf627019e8a33e17cec495506d0a6cecef Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Mon, 26 Feb 2024 17:21:45 +0800 Subject: [PATCH 2/4] anolis: blk: add iohang check function cherry picked from devel-6.6 commit 5c3c218d8bb840261cebe0c6598853397e3c38d5. 
ANBZ: #34068 Background: We do not have a dependable block layer interface to determine whether a block device has io requests which have not been completed for a somewhat long time. Currently we have the 'in_flight' interface, it counts the number of I/O requests that have been issued to the device driver but have not yet completed, and it does not include I/O requests that are in the queue but not yet issued to the device driver, which means it will not count io requests that have been stuck in the block layer. Also, if there are steady io requests issued to the device driver, 'in_flight' may always be non-zero, but you could not determine whether there is one io request which has not been completed for too long. Solution: To find io requests which have not been completed for too long, here add 3 new interfaces: /sys/block/vdb/queue/hang_threshold If one io request's running time has been greater than this value, count this io as hang. /sys/block/vdb/hang Show read/write io requests' hang counter. /sys/kernel/debug/block/vdb/rq_hang Show all hang io requests' detailed info, like below: ffff97db96301200 {.op=WRITE, .cmd_flags=SYNC, .rq_flags=STARTED| ELVPRIV|IO_STAT|STATS, .state=in_flight, .tag=30, .internal_tag=169, .start_time_ns=140634088407, .io_start_time_ns=140634102958, .current_time=146497371953, .bio = ffff97db91e8e000, .bio_pages = { ffffd096a0602540 }, .bio = ffff97db91e8ec00, .bio_pages = { ffffd096a070eec0 }, .bio = ffff97db91e8f600, .bio_pages = { ffffd096a0424cc0 }, .bio = ffff97db91e8f300, .bio_pages = { ffffd096a0600a80 }} With above info, we can easily see this request's latency distribution, and see next patch for bio_pages' usage. Note this feature needs CONFIG_BLK_DEBUG_FS to be enabled. 
Signed-off-by: Xiaoguang Wang [ remove ref get in blk_mq_check_rq_hang() since bi_iter() already does ] Signed-off-by: Joseph Qi --- block/blk-core.c | 1 + block/blk-mq-debugfs.c | 70 +++++++++++++++++++++++++++++++++++++++++ block/blk-mq.c | 28 +++++++++++++++++ block/blk-mq.h | 2 ++ block/blk-settings.c | 7 +++++ block/blk-sysfs.c | 23 ++++++++++++++ block/blk.h | 2 ++ block/genhd.c | 19 +++++++++++ block/partitions/core.c | 2 ++ include/linux/blkdev.h | 9 ++++++ 10 files changed, 163 insertions(+) diff --git a/block/blk-core.c b/block/blk-core.c index 474700ffaa1c..9b96acd218db 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -401,6 +401,7 @@ struct request_queue *blk_alloc_queue(struct queue_limits *lim, int node_id) return ERR_PTR(-ENOMEM); q->last_merge = NULL; + q->rq_hang_threshold = BLK_REQ_HANG_THRESHOLD; q->id = ida_alloc(&blk_queue_ida, GFP_KERNEL); if (q->id < 0) { diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 047ec887456b..0870e368445b 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -148,12 +148,36 @@ static ssize_t queue_state_write(void *data, const char __user *buf, return count; } +static void blk_mq_debugfs_rq_hang_show(struct seq_file *m, struct request *rq); + +static bool blk_mq_check_rq_hang(struct request *rq, void *priv) +{ + struct seq_file *m = priv; + u64 now = ktime_get_ns(); + u64 duration; + + duration = div_u64(now - rq->start_time_ns, NSEC_PER_MSEC); + if (duration >= READ_ONCE(rq->q->rq_hang_threshold)) + blk_mq_debugfs_rq_hang_show(m, rq); + + return true; +} + +static int queue_rq_hang_show(void *data, struct seq_file *m) +{ + struct request_queue *q = data; + + blk_mq_queue_tag_busy_iter(q, blk_mq_check_rq_hang, m); + return 0; +} + static const struct blk_mq_debugfs_attr blk_mq_debugfs_queue_attrs[] = { { "poll_stat", 0400, queue_poll_stat_show }, { "requeue_list", 0400, .seq_ops = &queue_requeue_list_seq_ops }, { "pm_only", 0600, queue_pm_only_show, NULL }, { "state", 0600, 
queue_state_show, queue_state_write }, { "zone_wplugs", 0400, queue_zone_wplugs_show, NULL }, + { "rq_hang", 0400, queue_rq_hang_show, NULL }, { }, }; @@ -297,6 +321,52 @@ int blk_mq_debugfs_rq_show(struct seq_file *m, void *v) } EXPORT_SYMBOL_GPL(blk_mq_debugfs_rq_show); +static void blk_mq_debugfs_rq_hang_show(struct seq_file *m, struct request *rq) +{ + const struct blk_mq_ops *const mq_ops = rq->q->mq_ops; + const unsigned int op = req_op(rq); + const char *op_str = blk_op_str(op); + struct bio *bio; + struct bio_vec *bvec; + struct bvec_iter_all iter_all; + + seq_printf(m, "%px {.op=", rq); + if (strcmp(op_str, "UNKNOWN") == 0) + seq_printf(m, "%u", op); + else + seq_printf(m, "%s", op_str); + seq_puts(m, ", .cmd_flags="); + blk_flags_show(m, rq->cmd_flags & ~REQ_OP_MASK, cmd_flag_name, + ARRAY_SIZE(cmd_flag_name)); + seq_puts(m, ", .rq_flags="); + blk_flags_show(m, (__force unsigned int)rq->rq_flags, rqf_name, + ARRAY_SIZE(rqf_name)); + seq_printf(m, ", .state=%s", blk_mq_rq_state_name(blk_mq_rq_state(rq))); + seq_printf(m, ", .tag=%d, .internal_tag=%d", rq->tag, + rq->internal_tag); + seq_printf(m, ", .start_time_ns=%llu", rq->start_time_ns); + seq_printf(m, ", .io_start_time_ns=%llu", rq->io_start_time_ns); + seq_printf(m, ", .current_time=%llu", ktime_get_ns()); + + __rq_for_each_bio(bio, rq) { + seq_printf(m, ", .bio = %px", bio); + seq_printf(m, ", .sector = %llu, .len=%u", + bio->bi_iter.bi_sector, bio->bi_iter.bi_size); + seq_puts(m, ", .bio_pages = { "); + bio_for_each_segment_all(bvec, bio, iter_all) { + struct page *page = bvec->bv_page; + + if (!page) + continue; + seq_printf(m, "%px ", page); + } + seq_puts(m, "}"); + } + if (mq_ops->show_rq) + mq_ops->show_rq(m, rq); + seq_puts(m, "}\n"); +} + static void *hctx_dispatch_start(struct seq_file *m, loff_t *pos) __acquires(&hctx->lock) { diff --git a/block/blk-mq.c b/block/blk-mq.c index 4c5c16cce4f8..bdc01e9bd8dc 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -112,6 +112,34 @@ void 
blk_mq_in_driver_rw(struct block_device *part, unsigned int inflight[2]) inflight[WRITE] = mi.inflight[WRITE]; } +struct mq_hang { + struct block_device *part; + unsigned int hang[2]; +}; + +static bool blk_mq_check_hang(struct request *rq, void *priv) +{ + struct mq_hang *mh = priv; + u64 now = ktime_get_ns(), duration; + + duration = div_u64(now - rq->start_time_ns, NSEC_PER_MSEC); + if ((duration >= READ_ONCE(rq->q->rq_hang_threshold)) && + (!bdev_partno(mh->part) || rq->part == mh->part)) + mh->hang[rq_data_dir(rq)]++; + + return true; +} + +void blk_mq_hang_rw(struct request_queue *q, struct block_device *part, + unsigned int hang[2]) +{ + struct mq_hang mh = { .part = part }; + + blk_mq_queue_tag_busy_iter(q, blk_mq_check_hang, &mh); + hang[0] = mh.hang[0]; + hang[1] = mh.hang[1]; +} + #ifdef CONFIG_LOCKDEP static bool blk_freeze_set_owner(struct request_queue *q, struct task_struct *owner) diff --git a/block/blk-mq.h b/block/blk-mq.h index 7fc651df78f4..aa1407e693e0 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -265,6 +265,8 @@ static inline bool blk_mq_hw_queue_mapped(struct blk_mq_hw_ctx *hctx) } void blk_mq_in_driver_rw(struct block_device *part, unsigned int inflight[2]); +void blk_mq_hang_rw(struct request_queue *q, struct block_device *part, + unsigned int hang[2]); static inline void blk_mq_put_dispatch_budget(struct request_queue *q, int budget_token) diff --git a/block/blk-settings.c b/block/blk-settings.c index 78c83817b9d3..625b38dc06e8 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -27,6 +27,13 @@ void blk_queue_rq_timeout(struct request_queue *q, unsigned int timeout) } EXPORT_SYMBOL_GPL(blk_queue_rq_timeout); +void blk_queue_rq_hang_threshold(struct request_queue *q, + unsigned int hang_threshold) +{ + WRITE_ONCE(q->rq_hang_threshold, hang_threshold); +} +EXPORT_SYMBOL_GPL(blk_queue_rq_hang_threshold); + /** * blk_set_stacking_limits - set default limits for stacking devices * @lim: the queue_limits structure to reset 
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index f22c1f253eb3..df4ee9663078 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -553,6 +553,27 @@ static ssize_t queue_io_timeout_store(struct gendisk *disk, const char *page, return count; } +static ssize_t queue_hang_threshold_show(struct gendisk *disk, char *page) +{ + return sysfs_emit(page, "%u\n", READ_ONCE(disk->queue->rq_hang_threshold)); +} + +static ssize_t queue_hang_threshold_store(struct gendisk *disk, const char *page, + size_t count) +{ + unsigned int hang_threshold; + int err; + struct request_queue *q = disk->queue; + + err = kstrtou32(page, 10, &hang_threshold); + if (err || hang_threshold == 0) + return -EINVAL; + + blk_queue_rq_hang_threshold(q, hang_threshold); + + return count; +} + static ssize_t queue_wc_show(struct gendisk *disk, char *page) { if (blk_queue_write_cache(disk->queue)) @@ -663,6 +684,7 @@ QUEUE_LIM_RO_ENTRY(queue_dax, "dax"); QUEUE_RW_ENTRY(queue_io_timeout, "io_timeout"); QUEUE_LIM_RO_ENTRY(queue_virt_boundary_mask, "virt_boundary_mask"); QUEUE_LIM_RO_ENTRY(queue_dma_alignment, "dma_alignment"); +QUEUE_RW_ENTRY(queue_hang_threshold, "hang_threshold"); /* legacy alias for logical_block_size: */ static const struct queue_sysfs_entry queue_hw_sector_size_entry = { @@ -775,6 +797,7 @@ static const struct attribute *const queue_attrs[] = { &queue_virt_boundary_mask_entry.attr, &queue_dma_alignment_entry.attr, &queue_ra_entry.attr, + &queue_hang_threshold_entry.attr, /* * Attributes which don't require locking. 
diff --git a/block/blk.h b/block/blk.h index ec4674cdf2ea..4e3f26205276 100644 --- a/block/blk.h +++ b/block/blk.h @@ -342,6 +342,8 @@ ssize_t part_stat_show(struct device *dev, struct device_attribute *attr, char *buf); ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr, char *buf); +ssize_t part_hang_show(struct device *dev, struct device_attribute *attr, + char *buf); ssize_t part_fail_show(struct device *dev, struct device_attribute *attr, char *buf); ssize_t part_fail_store(struct device *dev, struct device_attribute *attr, diff --git a/block/genhd.c b/block/genhd.c index 7d6854fd28e9..a9a1ae18f744 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1120,6 +1120,23 @@ ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr, return sysfs_emit(buf, "%8u %8u\n", inflight[READ], inflight[WRITE]); } +ssize_t part_hang_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct block_device *bdev = dev_to_bdev(dev); + struct request_queue *q = bdev_get_queue(bdev); + unsigned int hang[2] = {0, 0}; + + /* + * For now, we only support mq devices, since we haven't found a generic + * method to track reqs in single queue devices. 
+ */ + if (queue_is_mq(q)) + blk_mq_hang_rw(q, bdev, hang); + + return sysfs_emit(buf, "%8u %8u\n", hang[0], hang[1]); +} + static ssize_t disk_capability_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -1170,6 +1187,7 @@ static DEVICE_ATTR(discard_alignment, 0444, disk_discard_alignment_show, NULL); static DEVICE_ATTR(capability, 0444, disk_capability_show, NULL); static DEVICE_ATTR(stat, 0444, part_stat_show, NULL); static DEVICE_ATTR(inflight, 0444, part_inflight_show, NULL); +static DEVICE_ATTR(hang, 0444, part_hang_show, NULL); static DEVICE_ATTR(badblocks, 0644, disk_badblocks_show, disk_badblocks_store); static DEVICE_ATTR(diskseq, 0444, diskseq_show, NULL); static DEVICE_ATTR(partscan, 0444, partscan_show, NULL); @@ -1218,6 +1236,7 @@ static struct attribute *disk_attrs[] = { &dev_attr_capability.attr, &dev_attr_stat.attr, &dev_attr_inflight.attr, + &dev_attr_hang.attr, &dev_attr_badblocks.attr, &dev_attr_events.attr, &dev_attr_events_async.attr, diff --git a/block/partitions/core.c b/block/partitions/core.c index 5d5332ce586b..29d47c7d1dcc 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -213,6 +213,7 @@ static DEVICE_ATTR(alignment_offset, 0444, part_alignment_offset_show, NULL); static DEVICE_ATTR(discard_alignment, 0444, part_discard_alignment_show, NULL); static DEVICE_ATTR(stat, 0444, part_stat_show, NULL); static DEVICE_ATTR(inflight, 0444, part_inflight_show, NULL); +static DEVICE_ATTR(hang, 0444, part_hang_show, NULL); #ifdef CONFIG_FAIL_MAKE_REQUEST static struct device_attribute dev_attr_fail = __ATTR(make-it-fail, 0644, part_fail_show, part_fail_store); @@ -227,6 +228,7 @@ static struct attribute *part_attrs[] = { &dev_attr_discard_alignment.attr, &dev_attr_stat.attr, &dev_attr_inflight.attr, + &dev_attr_hang.attr, #ifdef CONFIG_FAIL_MAKE_REQUEST &dev_attr_fail.attr, #endif diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index e896b223d435..bf997eac2a2c 100644 --- a/include/linux/blkdev.h 
+++ b/include/linux/blkdev.h @@ -495,6 +495,12 @@ struct blk_independent_access_ranges { struct blk_independent_access_range ia_range[]; }; +/* + * default request hang threshold, unit is millisecond. If one request does + * not complete in this threshold time, consider this request as hang. + */ +#define BLK_REQ_HANG_THRESHOLD 5000 + struct request_queue { /* * The queue owner gets to use this for whatever they like. @@ -515,6 +521,7 @@ struct request_queue { unsigned long queue_flags; unsigned int __data_racy rq_timeout; + unsigned int rq_hang_threshold; unsigned int queue_depth; @@ -1169,6 +1176,8 @@ extern int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, void queue_limits_stack_bdev(struct queue_limits *t, struct block_device *bdev, sector_t offset, const char *pfx); extern void blk_queue_rq_timeout(struct request_queue *, unsigned int); +extern void blk_queue_rq_hang_threshold(struct request_queue *, + unsigned int hang_threshold); struct blk_independent_access_ranges * disk_alloc_independent_access_ranges(struct gendisk *disk, int nr_ia_ranges); -- Gitee From 4578ebafbfcd80e33936ca6a657372a114cc7aa3 Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Tue, 27 May 2025 19:51:52 +0800 Subject: [PATCH 3/4] anolis: block: add counter to track io request's d2c time cherry picked from devel-6.6 commit 22af05f48902a3a56aee4ae68ae42394713534ae. ANBZ: #34068 Indeed tool iostat's await is not good enough, which is somewhat sketchy and could not show request's latency on device driver's side. Here we add a new counter to track io request's d2c time, also with this patch, we can extend iostat to show this value easily. Note: I had checked how iostat is implemented, it just reads fields it needs, so iostat won't be affected by this change, so does tsar. 
Signed-off-by: Xiaoguang Wang Signed-off-by: Joseph Qi --- Documentation/admin-guide/iostats.rst | 6 ++++++ block/blk-mq.c | 4 ++++ block/genhd.c | 15 +++++++++++++-- include/linux/part_stat.h | 1 + 4 files changed, 24 insertions(+), 2 deletions(-) diff --git a/Documentation/admin-guide/iostats.rst b/Documentation/admin-guide/iostats.rst index 9453196ade51..6d675788362e 100644 --- a/Documentation/admin-guide/iostats.rst +++ b/Documentation/admin-guide/iostats.rst @@ -108,6 +108,12 @@ Field 16 -- # of flush requests completed Field 17 -- # of milliseconds spent flushing This is the total number of milliseconds spent by all flush requests. +Field 18 -- # of milliseconds spent reading on device driver's side + +Field 19 -- # of milliseconds spent writing on device driver's side + +Field 20 -- # of milliseconds spent discarding on device driver's side + To avoid introducing performance bottlenecks, no locks are held while modifying these counters. This implies that minor inaccuracies may be introduced when changes collide, so (for instance) adding up all the diff --git a/block/blk-mq.c b/block/blk-mq.c index bdc01e9bd8dc..4546a91ca5df 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1112,6 +1112,10 @@ static inline void blk_account_io_done(struct request *req, u64 now) part_stat_add(req->part, nsecs[sgrp], now - req->start_time_ns); part_stat_local_dec(req->part, in_flight[op_is_write(req_op(req))]); + if (req->rq_flags & RQF_STATS) { + part_stat_add(req->part, d2c_nsecs[sgrp], + now - req->io_start_time_ns); + } part_stat_unlock(); } } diff --git a/block/genhd.c b/block/genhd.c index a9a1ae18f744..594cd1e51c50 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -116,6 +116,7 @@ static void part_stat_read_all(struct block_device *part, for (group = 0; group < NR_STAT_GROUPS; group++) { stat->nsecs[group] += ptr->nsecs[group]; + stat->d2c_nsecs[group] += ptr->d2c_nsecs[group]; stat->sectors[group] += ptr->sectors[group]; stat->ios[group] += ptr->ios[group]; 
stat->merges[group] += ptr->merges[group]; @@ -1078,7 +1079,8 @@ ssize_t part_stat_show(struct device *dev, "%8lu %8lu %8llu %8u " "%8u %8u %8u " "%8lu %8lu %8llu %8u " - "%8lu %8u" + "%8lu %8u " + "%8u %8u %8u" "\n", stat.ios[STAT_READ], stat.merges[STAT_READ], @@ -1100,7 +1102,10 @@ ssize_t part_stat_show(struct device *dev, (unsigned long long)stat.sectors[STAT_DISCARD], (unsigned int)div_u64(stat.nsecs[STAT_DISCARD], NSEC_PER_MSEC), stat.ios[STAT_FLUSH], - (unsigned int)div_u64(stat.nsecs[STAT_FLUSH], NSEC_PER_MSEC)); + (unsigned int)div_u64(stat.nsecs[STAT_FLUSH], NSEC_PER_MSEC), + (unsigned int)div_u64(stat.d2c_nsecs[STAT_READ], NSEC_PER_MSEC), + (unsigned int)div_u64(stat.d2c_nsecs[STAT_WRITE], NSEC_PER_MSEC), + (unsigned int)div_u64(stat.d2c_nsecs[STAT_DISCARD], NSEC_PER_MSEC)); } /* @@ -1425,6 +1430,12 @@ static int diskstats_show(struct seq_file *seqf, void *v) seq_put_decimal_ull(seqf, " ", stat.ios[STAT_FLUSH]); seq_put_decimal_ull(seqf, " ", (unsigned int)div_u64(stat.nsecs[STAT_FLUSH], NSEC_PER_MSEC)); + seq_put_decimal_ull(seqf, " ", (unsigned int)div_u64(stat.d2c_nsecs[STAT_READ], + NSEC_PER_MSEC)); + seq_put_decimal_ull(seqf, " ", (unsigned int)div_u64(stat.d2c_nsecs[STAT_WRITE], + NSEC_PER_MSEC)); + seq_put_decimal_ull(seqf, " ", (unsigned int)div_u64(stat.d2c_nsecs[STAT_DISCARD], + NSEC_PER_MSEC)); seq_putc(seqf, '\n'); } rcu_read_unlock(); diff --git a/include/linux/part_stat.h b/include/linux/part_stat.h index 729415e91215..7440f274041b 100644 --- a/include/linux/part_stat.h +++ b/include/linux/part_stat.h @@ -7,6 +7,7 @@ struct disk_stats { u64 nsecs[NR_STAT_GROUPS]; + u64 d2c_nsecs[NR_STAT_GROUPS]; unsigned long sectors[NR_STAT_GROUPS]; unsigned long ios[NR_STAT_GROUPS]; unsigned long merges[NR_STAT_GROUPS]; -- Gitee From 5e7d1ae73980d7174d021d7b151d0d83014c67e9 Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Tue, 27 May 2025 19:51:52 +0800 Subject: [PATCH 4/4] anolis: block: fix request d2c time accounting cherry picked from devel-6.6 commit 
a2249a28b757736f2794433bad4e2c81b1c03bc8. ANBZ: #34068 After commit 8e15dfbd9ae2 ("blk-throttle: only enable blk-stat when BLK_DEV_THROTTLING_LOW"), QUEUE_FLAG_STATS is not set by default, so that request io_start_time_ns will always be 0. This will finally lead d2c stats not work as expected. Since accounting io_start_time_ns will call ktime_get_ns() and it will bring extra ~20ns per request, it may impact peak IOPS in fast device. In my test server with null_blk, it will drop ~2% peak IOPS. So introduce a new sysfs interface to control if it is enabled or not. Fixes: 22af05f48902 ("anolis: block: add counter to track io request's d2c time") Signed-off-by: Joseph Qi --- block/blk-mq.c | 4 ++-- block/blk-settings.c | 6 ++++++ block/blk-sysfs.c | 23 +++++++++++++++++++++++ include/linux/blkdev.h | 3 +++ 4 files changed, 34 insertions(+), 2 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 4546a91ca5df..cbb5ffa031c3 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1403,8 +1403,8 @@ void blk_mq_start_request(struct request *rq) trace_block_rq_issue(rq); - if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags) && - !blk_rq_is_passthrough(rq)) { + if ((test_bit(QUEUE_FLAG_STATS, &q->queue_flags) || + READ_ONCE(q->enable_d2c_stats)) && !blk_rq_is_passthrough(rq)) { rq->io_start_time_ns = blk_time_get_ns(); rq->stats_sectors = blk_rq_sectors(rq); rq->rq_flags |= RQF_STATS; diff --git a/block/blk-settings.c b/block/blk-settings.c index 625b38dc06e8..81eb9ec38a3f 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -34,6 +34,12 @@ void blk_queue_rq_hang_threshold(struct request_queue *q, } EXPORT_SYMBOL_GPL(blk_queue_rq_hang_threshold); +void blk_queue_d2c_stats(struct request_queue *q, bool enable) +{ + WRITE_ONCE(q->enable_d2c_stats, enable); +} +EXPORT_SYMBOL_GPL(blk_queue_d2c_stats); + /** * blk_set_stacking_limits - set default limits for stacking devices * @lim: the queue_limits structure to reset diff --git a/block/blk-sysfs.c 
b/block/blk-sysfs.c index df4ee9663078..9bba438a4851 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -574,6 +574,27 @@ static ssize_t queue_hang_threshold_store(struct gendisk *disk, const char *page return count; } +static ssize_t queue_d2c_stats_show(struct gendisk *disk, char *page) +{ + return sysfs_emit(page, "%u\n", READ_ONCE(disk->queue->enable_d2c_stats)); +} + +static ssize_t queue_d2c_stats_store(struct gendisk *disk, const char *page, + size_t count) +{ + struct request_queue *q = disk->queue; + bool enable; + int err; + + err = kstrtobool(page, &enable); + if (err) + return -EINVAL; + + blk_queue_d2c_stats(q, enable); + + return count; +} + static ssize_t queue_wc_show(struct gendisk *disk, char *page) { if (blk_queue_write_cache(disk->queue)) @@ -685,6 +706,7 @@ QUEUE_RW_ENTRY(queue_io_timeout, "io_timeout"); QUEUE_LIM_RO_ENTRY(queue_virt_boundary_mask, "virt_boundary_mask"); QUEUE_LIM_RO_ENTRY(queue_dma_alignment, "dma_alignment"); QUEUE_RW_ENTRY(queue_hang_threshold, "hang_threshold"); +QUEUE_RW_ENTRY(queue_d2c_stats, "d2c_stats"); /* legacy alias for logical_block_size: */ static const struct queue_sysfs_entry queue_hw_sector_size_entry = { @@ -798,6 +820,7 @@ static const struct attribute *const queue_attrs[] = { &queue_dma_alignment_entry.attr, &queue_ra_entry.attr, &queue_hang_threshold_entry.attr, + &queue_d2c_stats_entry.attr, /* * Attributes which don't require locking. 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index bf997eac2a2c..d18a276739bf 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -675,6 +675,8 @@ struct request_queue { */ struct mutex debugfs_mutex; + bool enable_d2c_stats; + CK_KABI_RESERVE(1) CK_KABI_RESERVE(2) CK_KABI_RESERVE(3) @@ -1178,6 +1180,7 @@ void queue_limits_stack_bdev(struct queue_limits *t, struct block_device *bdev, extern void blk_queue_rq_timeout(struct request_queue *, unsigned int); extern void blk_queue_rq_hang_threshold(struct request_queue *, unsigned int hang_threshold); +extern void blk_queue_d2c_stats(struct request_queue *, bool enable); struct blk_independent_access_ranges * disk_alloc_independent_access_ranges(struct gendisk *disk, int nr_ia_ranges); -- Gitee