From: Baisong Zhong <zhongbaisong@huawei.com>
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I4PY1Q
CVE: NA
--------------------------------
We add stub members to some structures ahead of time to maintain KABI consistency when later backports land.
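For background, one way such stubs preserve the layout is to place the new member inside an existing union, so no offset or size changes; below is a minimal illustrative sketch (struct example is invented, not from this patch):

/*
 * Illustration only: pre-adding a member without changing the layout.
 * Inside a union, the new member shares storage with an existing one,
 * so every later field keeps its offset and the struct keeps its size
 * (sizeof(struct llist_node) <= sizeof(struct list_head)).
 */
struct example {
	union {
		struct list_head	list;		/* existing member */
		struct llist_node	ll_node;	/* pre-added stub */
	};
	int			refcnt;		/* offset unchanged */
};

The sk_buff change below uses exactly this union trick; the other two hunks add or move members where the surrounding layout allows it.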
Signed-off-by: Baisong Zhong <zhongbaisong@huawei.com>
Reviewed-by: Wei Yongjun <weiyongjun1@huawei.com>
Reviewed-by: Yue Haibing <yuehaibing@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 include/linux/skbuff.h | 2 ++
 include/net/ipv6.h     | 2 +-
 include/net/sock.h     | 3 +++
 3 files changed, 6 insertions(+), 1 deletion(-)
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 9e3a454d2377..d485f17ff33a 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -36,6 +36,7 @@
 #include <linux/splice.h>
 #include <linux/in6.h>
 #include <linux/if_packet.h>
+#include <linux/llist.h>
 #include <net/flow.h>
 #include <net/page_pool.h>
 #include <linux/kabi.h>
@@ -732,6 +733,7 @@ struct sk_buff {
 		};
 		struct rb_node		rbnode; /* used in netem, ip4 defrag, and tcp stack */
 		struct list_head	list;
+		struct llist_node	ll_node;
 	};

 	union {
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index bd1f396cc9c7..c0273ae50296 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -344,9 +344,9 @@ struct ipcm6_cookie {
 	struct sockcm_cookie sockc;
 	__s16 hlimit;
 	__s16 tclass;
+	__u16 gso_size;
 	__s8  dontfrag;
 	struct ipv6_txoptions *opt;
-	__u16 gso_size;
 };

 static inline void ipcm6_init(struct ipcm6_cookie *ipc6)
diff --git a/include/net/sock.h b/include/net/sock.h
index 712bb7b09f96..b3d451878640 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -63,6 +63,7 @@

 #include <linux/atomic.h>
 #include <linux/refcount.h>
+#include <linux/llist.h>
 #include <net/dst.h>
 #include <net/checksum.h>
 #include <net/tcp_states.h>
@@ -405,6 +406,8 @@ struct sock {
 		struct sk_buff	*head;
 		struct sk_buff	*tail;
 	} sk_backlog;
+	struct llist_head	defer_list;
+
 #define sk_rmem_alloc sk_backlog.rmem_alloc

 	int			sk_forward_alloc;
From: Li Ruilin <liruilin4@huawei.com>
euleros inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I4LOJ6
CVE: NA
------------------------------
Add a framework to transfer I/O information to a userspace client and to process prefetch requests sent by that client. Create a char device named "acache" that connects kernel space and userspace. Information about all I/O requests is saved into a ring buffer and passed to the client when it reads from the device.

A prefetch request is treated like a normal I/O request. The differences are that a prefetch request does not need to return data to userspace, and no readahead part should be appended to it.

Add two parameters: acache_dev_size controls the size of the buffer that holds the I/O information, and acache_prefetch_workers controls the maximum number of threads used to process prefetch requests.
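To make the interface concrete, here is a minimal sketch of a hypothetical userspace client. It is not part of the patch: the struct layouts and the ACACHE_GET_METADATA ioctl mirror acache.h as added below, while the buffer size and the echo-back logic are illustrative assumptions.

/*
 * Hypothetical userspace client for /dev/acache (illustration only,
 * not part of this patch). Struct layouts mirror acache.h.
 */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

struct acache_info {
	uint64_t length;
	uint64_t offset;
	uint64_t start_time;
	uint32_t dev;		/* kernel dev_t is 32 bits */
	int32_t type;
};

struct acache_metadata {
	uint32_t magic;
	uint32_t conntype;
	uint32_t devsize;
};

#define ACACHE_GET_METADATA	_IOR('a', 1, struct acache_metadata)

int main(void)
{
	struct acache_metadata md;
	struct acache_info events[64];
	ssize_t n, i;
	int fd = open("/dev/acache", O_RDWR);

	if (fd < 0 || ioctl(fd, ACACHE_GET_METADATA, &md) < 0)
		return 1;
	printf("magic %u conntype %u devsize %u\n",
	       md.magic, md.conntype, md.devsize);

	/* Each read drains pending I/O records from the ring buffer. */
	n = read(fd, events, sizeof(events));
	for (i = 0; i < n / (ssize_t)sizeof(events[0]); i++) {
		/*
		 * A real client would run its prediction logic here;
		 * this sketch simply echoes every record back as a
		 * prefetch request for the same range.
		 */
		if (write(fd, &events[i], sizeof(events[i])) < 0)
			break;
	}
	close(fd);
	return 0;
}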
Signed-off-by: Li Ruilin <liruilin4@huawei.com>
Reviewed-by: Luan Jianhai <luanjianhai@huawei.com>
Reviewed-by: Peng Junyi <pengjunyi1@huawei.com>
Acked-by: Xie Xiuqi <xiexiuqi@huawei.com>
Signed-off-by: Cheng Jian <cj.chengjian@huawei.com>
Reviewed-by: Guangxing Deng <dengguangxing@huawei.com>
Reviewed-by: chao song <chao.song@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 drivers/md/bcache/Makefile    |   2 +-
 drivers/md/bcache/acache.c    | 473 ++++++++++++++++++++++++++++++++++
 drivers/md/bcache/acache.h    |  69 +++++
 drivers/md/bcache/btree.c     |   4 +-
 drivers/md/bcache/request.c   | 129 ++++++----
 drivers/md/bcache/request.h   |  32 +++
 drivers/md/bcache/super.c     |   4 +
 include/trace/events/bcache.h |  11 +
 8 files changed, 670 insertions(+), 54 deletions(-)
 create mode 100644 drivers/md/bcache/acache.c
 create mode 100644 drivers/md/bcache/acache.h
diff --git a/drivers/md/bcache/Makefile b/drivers/md/bcache/Makefile
index 5b87e59676b8..28f5294dd6cf 100644
--- a/drivers/md/bcache/Makefile
+++ b/drivers/md/bcache/Makefile
@@ -3,5 +3,5 @@
 obj-$(CONFIG_BCACHE)	+= bcache.o

 bcache-y		:= alloc.o bset.o btree.o closure.o debug.o extents.o\
-	io.o journal.o movinggc.o request.o stats.o super.o sysfs.o trace.o\
+	io.o journal.o movinggc.o request.o stats.o acache.o super.o sysfs.o trace.o\
 	util.o writeback.o features.o
diff --git a/drivers/md/bcache/acache.c b/drivers/md/bcache/acache.c
new file mode 100644
index 000000000000..ff3e120d9619
--- /dev/null
+++ b/drivers/md/bcache/acache.c
@@ -0,0 +1,473 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "acache.h"
+#include "request.h"
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/fs.h>
+#include <linux/errno.h>
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/init.h>
+#include <linux/cdev.h>
+#include <linux/io.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/circ_buf.h>
+#include <linux/list.h>
+
+#include <trace/events/bcache.h>
+
+#define DEV_NAME "acache"
+
+int acache_dev_size = (1024 * 4096 + 4096);
+
+module_param_named(acache_size, acache_dev_size, int, 0444);
+MODULE_PARM_DESC(acache_size, "size of the ring buffer in bytes");
+
+int acache_prefetch_workers = 1000;
+
+module_param_named(prefetch_workers, acache_prefetch_workers, int, 0444);
+MODULE_PARM_DESC(prefetch_workers, "number of workers for processing prefetch requests");
+
+struct prefetch_worker {
+	struct acache_info s;
+	struct work_struct work;
+	struct list_head list;
+};
+
+struct acache_device {
+	bool initialized;
+
+	dev_t devno;
+	struct cdev cdev;
+	struct class *class;
+	struct mem_reg *mem_regionp;
+
+	struct acache_info *readbuf;
+	struct acache_info *writebuf;
+
+	struct acache_circ *acache_info_circ;
+
+	struct workqueue_struct *wq;
+	struct prefetch_worker *prefetch_workers;
+	struct list_head prefetch_workers_free;
+	spinlock_t prefetch_workers_free_list_lock;
+} adev;
+
+#define MAX_TRANSFER_SIZE (1024 * 1024)
+
+static atomic_t acache_opened_dev = ATOMIC_INIT(0);
+static struct acache_metadata metadata;
+
+int acache_open(struct inode *inode, struct file *filp)
+{
+	struct mem_reg *dev;
+	int minor = MINOR(inode->i_rdev);
+
+	if (minor >= ACACHE_NR_DEVS)
+		return -ENODEV;
+	if (atomic_xchg(&acache_opened_dev, 1))
+		return -EPERM;
+
+	dev = &adev.mem_regionp[minor];
+	filp->private_data = dev;
+
+	return 0;
+}
+
+int acache_release(struct inode *inode, struct file *filp)
+{
+	atomic_dec(&acache_opened_dev);
+	return 0;
+}
+
+ssize_t read_circ_slice(struct acache_circ *circ, struct acache_info *buf,
+			size_t size)
+{
+	unsigned long first, todo, flags;
+
+	spin_lock_irqsave(&circ->lock, flags);
+
+	todo = CIRC_CNT(circ->head, circ->tail, circ->size);
+	if (todo == 0) {
+		spin_unlock_irqrestore(&circ->lock, flags);
+		return 0;
+	}
+	if (todo > size / sizeof(struct acache_info))
+		todo = size / sizeof(struct acache_info);
+
+	first = CIRC_CNT_TO_END(circ->head, circ->tail, circ->size);
+	if (first > todo)
+		first = todo;
+
+	memcpy(buf, circ->data + circ->tail, first * sizeof(struct acache_info));
+	if (first < todo)
+		memcpy(buf + first, circ->data,
+		       (todo - first) * sizeof(struct acache_info));
+	circ->tail = (circ->tail + todo) & (circ->size - 1);
+
+	spin_unlock_irqrestore(&circ->lock, flags);
+	return todo * sizeof(struct acache_info);
+}
+
+static ssize_t acache_read(struct file *filp, char __user *buf,
+			   size_t size, loff_t *ppos)
+{
+	long ret, cut;
+
+	if (metadata.conntype != ACACHE_READWRITE_CONN)
+		return -EINVAL;
+
+	if (size > MAX_TRANSFER_SIZE)
+		size = MAX_TRANSFER_SIZE;
+
+	ret = read_circ_slice(adev.acache_info_circ, adev.readbuf, size);
+	if (ret <= 0)
+		return ret;
+
+	cut = copy_to_user(buf, adev.readbuf, ret);
+	return ret - cut;
+}
+
+int process_one_request(struct acache_info *item);
+static void prefetch_worker_func(struct work_struct *work)
+{
+	struct prefetch_worker *sw =
+		container_of(work, struct prefetch_worker, work);
+
+	process_one_request(&sw->s);
+	spin_lock(&adev.prefetch_workers_free_list_lock);
+	list_add_tail(&sw->list, &adev.prefetch_workers_free);
+	spin_unlock(&adev.prefetch_workers_free_list_lock);
+}
+
+static int queue_prefetch_item(struct acache_info *s)
+{
+	struct prefetch_worker *sw;
+
+	spin_lock(&adev.prefetch_workers_free_list_lock);
+	sw = list_first_entry_or_null(&adev.prefetch_workers_free,
+				      struct prefetch_worker, list);
+	if (!sw) {
+		spin_unlock(&adev.prefetch_workers_free_list_lock);
+		return -1;
+	}
+	list_del_init(&sw->list);
+	spin_unlock(&adev.prefetch_workers_free_list_lock);
+
+	memcpy(&sw->s, s, sizeof(struct acache_info));
+	INIT_WORK(&sw->work, prefetch_worker_func);
+	queue_work(adev.wq, &sw->work);
+	return 0;
+}
+
+static ssize_t acache_write(struct file *filp, const char __user *buf,
+			    size_t size, loff_t *ppos)
+{
+	long cut;
+	int i;
+
+	if (metadata.conntype != ACACHE_READWRITE_CONN)
+		return -EINVAL;
+
+	if (size > MAX_TRANSFER_SIZE)
+		size = MAX_TRANSFER_SIZE;
+
+	cut = copy_from_user(adev.writebuf, buf, size);
+	for (i = 0; i < (size - cut) / sizeof(struct acache_info); i++) {
+		if (queue_prefetch_item(adev.writebuf + i))
+			break;
+	}
+	return i * sizeof(struct acache_info);
+}
+
+static long acache_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+{
+	switch (cmd) {
+	case ACACHE_GET_METADATA:
+		if (copy_to_user((struct acache_metadata __user *)arg,
+				 &metadata, sizeof(struct acache_metadata)))
+			return -EFAULT;
+		return 0;
+	default:
+		return -EINVAL;
+	}
+}
+
+static const struct file_operations acache_fops = {
+	.owner = THIS_MODULE,
+	.read = acache_read,
+	.write = acache_write,
+	.open = acache_open,
+	.release = acache_release,
+	.unlocked_ioctl = acache_ioctl,
+};
+
+void save_circ_item(struct acache_info *data)
+{
+	unsigned long flags;
+	struct acache_circ *circ = adev.acache_info_circ;
+
+	spin_lock_irqsave(&circ->lock, flags);
+	if (CIRC_SPACE(circ->head, circ->tail, circ->size) >= 1) {
+		memcpy(&circ->data[circ->head], data, sizeof(struct acache_info));
+		circ->head = (circ->head + 1) & (circ->size - 1);
+	} else {
+		pr_debug("ringbuffer is full; discard new request.\n");
+	}
+	spin_unlock_irqrestore(&circ->lock, flags);
+}
+
+void init_acache_circ(struct acache_circ **circ, void *startaddr)
+{
+	*circ = (struct acache_circ *)startaddr;
+	(*circ)->head = 0;
+	(*circ)->tail = 0;
+	(*circ)->size = ACACHE_CIRC_SIZE;
+	spin_lock_init(&(*circ)->lock);
+}
+
+static void acache_free_mem(void)
+{
+	int i;
+
+	for (i = 0; i < ACACHE_NR_DEVS; i++)
+		vfree(adev.mem_regionp[i].data);
+
+	if (adev.readbuf) {
+		vfree(adev.readbuf);
+		adev.readbuf = NULL;
+	}
+	if (adev.writebuf) {
+		vfree(adev.writebuf);
+		adev.writebuf = NULL;
+	}
+
+	kfree(adev.prefetch_workers);
+	adev.prefetch_workers = NULL;
+}
+
+int acache_prefetch_init(struct acache_device *adev)
+{
+	int i;
+
+	if (acache_prefetch_workers <= 0) {
+		pr_err("acache_prefetch_workers must be greater than zero\n");
+		return -1;
+	}
+	adev->prefetch_workers = kmalloc_array(acache_prefetch_workers,
+					       sizeof(struct prefetch_worker),
+					       GFP_KERNEL);
+	if (!adev->prefetch_workers)
+		goto fail_prefetch_workers_alloc;
+
+	INIT_LIST_HEAD(&adev->prefetch_workers_free);
+	spin_lock_init(&adev->prefetch_workers_free_list_lock);
+	for (i = 0; i < acache_prefetch_workers; i++) {
+		spin_lock(&adev->prefetch_workers_free_list_lock);
+		list_add_tail(&adev->prefetch_workers[i].list,
+			      &adev->prefetch_workers_free);
+		spin_unlock(&adev->prefetch_workers_free_list_lock);
+	}
+
+	adev->wq = alloc_workqueue("acache_prefetch", WQ_MEM_RECLAIM, 0);
+	if (!adev->wq)
+		goto fail_workqueue_alloc;
+
+	return 0;
+
+fail_workqueue_alloc:
+	kfree(adev->prefetch_workers);
+	adev->prefetch_workers = NULL;
+fail_prefetch_workers_alloc:
+	return -1;
+}
+
+int acache_dev_init(void)
+{
+	int ret;
+	int i;
+	int major;
+	struct device *dev;
+
+	major = alloc_chrdev_region(&adev.devno, 0, ACACHE_NR_DEVS, DEV_NAME);
+	if (major < 0) {
+		pr_err("failed to allocate chrdev region: %d\n", major);
+		ret = major;
+		goto fail_allocdev;
+	}
+
+	adev.class = class_create(THIS_MODULE, DEV_NAME);
+	if (IS_ERR(adev.class)) {
+		pr_err("failed to create acache class\n");
+		ret = -1;
+		goto fail_class;
+	}
+
+	if (acache_dev_size < PAGE_SIZE) {
+		pr_err("acache_dev_size should not be less than PAGE_SIZE\n");
+		ret = -1;
+		goto fail_dev_add;
+	}
+	metadata.devsize = acache_dev_size;
+	metadata.magic = ACACHE_MAGIC;
+	metadata.conntype = ACACHE_READWRITE_CONN;
+	cdev_init(&adev.cdev, &acache_fops);
+	adev.cdev.owner = THIS_MODULE;
+
+	ret = cdev_add(&adev.cdev, adev.devno, ACACHE_NR_DEVS);
+	if (ret < 0) {
+		pr_err("failed to add cdev\n");
+		goto fail_dev_add;
+	}
+
+	dev = device_create(adev.class, NULL, adev.devno, NULL, DEV_NAME);
+	if (IS_ERR(dev)) {
+		pr_err("could not create device\n");
+		ret = -1;
+		goto fail_device;
+	}
+
+	adev.readbuf = vmalloc(MAX_TRANSFER_SIZE);
+	adev.writebuf = vmalloc(MAX_TRANSFER_SIZE);
+	if (!adev.readbuf || !adev.writebuf) {
+		ret = -ENOMEM;
+		goto fail_malloc;
+	}
+
+	adev.initialized = true;
+	adev.mem_regionp =
+		kmalloc_array(ACACHE_NR_DEVS, sizeof(struct mem_reg), GFP_KERNEL);
+	if (!adev.mem_regionp) {
+		ret = -ENOMEM;
+		goto fail_malloc;
+	}
+	memset(adev.mem_regionp, 0, sizeof(struct mem_reg) * ACACHE_NR_DEVS);
+
+	for (i = 0; i < ACACHE_NR_DEVS; i++) {
+		adev.mem_regionp[i].size = ACACHE_DEV_SIZE;
+		adev.mem_regionp[i].data = vmalloc(ACACHE_DEV_SIZE);
+		if (!adev.mem_regionp[i].data) {
+			ret = -ENOMEM;
+			goto fail_memregion_data_malloc;
+		}
+		memset(adev.mem_regionp[i].data, 0, ACACHE_DEV_SIZE);
+	}
+
+	init_acache_circ(&adev.acache_info_circ, adev.mem_regionp[0].data);
+	if (acache_prefetch_init(&adev)) {
+		ret = -1;
+		goto fail_prefetch_init;
+	}
+
+	return 0;
+
+fail_prefetch_init:
+fail_memregion_data_malloc:
+	acache_free_mem();
+fail_malloc:
+	device_destroy(adev.class, adev.devno);
+fail_device:
+	cdev_del(&adev.cdev);
+fail_dev_add:
+	class_destroy(adev.class);
+fail_class:
+	unregister_chrdev_region(adev.devno, ACACHE_NR_DEVS);
+fail_allocdev:
+	return ret;
+}
+
+void acache_dev_exit(void)
+{
+	if (!adev.initialized)
+		return;
+
+	if (adev.wq) {
+		flush_workqueue(adev.wq);
+		destroy_workqueue(adev.wq);
+	}
+	device_destroy(adev.class, adev.devno);
+	cdev_del(&adev.cdev);
+	acache_free_mem();
+	kfree(adev.mem_regionp);
+	unregister_chrdev_region(adev.devno, ACACHE_NR_DEVS);
+	class_destroy(adev.class);
+	kfree(adev.prefetch_workers);
+}
+
+struct cached_dev *get_cached_device_by_dev(dev_t dev)
+{
+	struct cache_set *c, *tc;
+	struct cached_dev *dc, *t;
+
+	list_for_each_entry_safe(c, tc, &bch_cache_sets, list)
+		list_for_each_entry_safe(dc, t, &c->cached_devs, list)
+			if (dc->bdev->bd_dev == dev && cached_dev_get(dc))
+				return dc;
+
+	return NULL;
+}
+
+struct bio *get_bio_by_item(struct cached_dev *dc, struct acache_info *item)
+{
+	struct bio *bio;
+	uint64_t offset = item->offset + dc->sb.data_offset;
+
+	if (get_capacity(dc->bdev->bd_disk) < offset + (item->length >> 9)) {
+		pr_err("prefetch area exceeds the capacity of disk(%d:%d), end: %llx, capacity: %llx\n",
+		       MAJOR(dc->bdev->bd_dev), MINOR(dc->bdev->bd_dev),
+		       offset + (item->length >> 9),
+		       get_capacity(dc->bdev->bd_disk));
+		return NULL;
+	}
+
+	bio = bio_alloc_bioset(GFP_NOWAIT,
+			       DIV_ROUND_UP(item->length >> 9, PAGE_SECTORS),
+			       &dc->disk.bio_split);
+	if (!bio) {
+		bio = bio_alloc_bioset(GFP_NOWAIT,
+				       DIV_ROUND_UP(item->length >> 9, PAGE_SECTORS),
+				       NULL);
+		if (!bio)
+			return NULL;
+	}
+
+	bio_set_dev(bio, dc->bdev);
+	bio->bi_iter.bi_sector = item->offset + dc->sb.data_offset;
+	bio->bi_iter.bi_size = (item->length >> 9) << 9;
+
+	bch_bio_map(bio, NULL);
+	if (bch_bio_alloc_pages(bio, __GFP_NOWARN | GFP_NOIO))
+		goto out_put;
+
+	return bio;
+out_put:
+	bio_put(bio);
+	return NULL;
+}
+
+int process_one_request(struct acache_info *item)
+{
+	struct cached_dev *dc;
+	struct bio *cache_bio;
+	struct search *s;
+
+	dc = get_cached_device_by_dev(item->dev);
+	if (dc == NULL)
+		return -1;
+	cache_bio = get_bio_by_item(dc, item);
+	if (cache_bio == NULL) {
+		pr_err("acache: failed to alloc bio for prefetch\n");
+		goto put_dev;
+	}
+
+	s = search_alloc(cache_bio, &dc->disk, true);
+
+	trace_bcache_prefetch_request(&dc->disk, cache_bio);
+	cached_dev_read(dc, s);
+	return 0;
+
+put_dev:
+	cached_dev_put(dc);
+	return -1;
+}
diff --git a/drivers/md/bcache/acache.h b/drivers/md/bcache/acache.h
new file mode 100644
index 000000000000..dea6e8cb0a05
--- /dev/null
+++ b/drivers/md/bcache/acache.h
@@ -0,0 +1,69 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#ifndef _ACACHE_INTERFACE_H_
+#define _ACACHE_INTERFACE_H_
+
+#define ACACHE_NR_DEVS 1
+
+#define RING_SIZE
+
+#include "bcache.h"
+
+struct mem_reg {
+	char *data;
+	unsigned long size;
+};
+
+struct acache_info {
+	uint64_t length;
+	uint64_t offset;
+	uint64_t start_time;
+	dev_t dev;
+	int type;
+};
+
+enum acache_info_type {
+	ACACHE_INFO_READ = 0,
+	ACACHE_INFO_WRITE,
+	ACACHE_INFO_CACHE_INSERT,
+	ACACHE_INFO_LATENCY,
+};
+
+struct acache_circ {
+	spinlock_t lock;
+	int tail;
+	int head;
+	int size;
+	int item_size;
+	struct acache_info data[0];
+};
+
+struct acache_metadata {
+	uint32_t magic;
+	uint32_t conntype;
+	uint32_t devsize;
+};
+
+#define ACACHE_DEV_SIZE acache_dev_size
+#define ACACHE_MAGIC 2
+
+enum acache_conn_types {
+	ACACHE_NO_CONN = 0,
+	ACACHE_READWRITE_CONN = 2,
+};
+
+#define ACACHE_CIRC_SIZE \
+	({int i = (ACACHE_DEV_SIZE - sizeof(struct acache_circ))/sizeof(struct acache_info); \
+	  int bits = 0; \
+	  while (i > 0) {i >>= 1; bits++; } \
+	  1 << (bits - 1); })
+
+
+#define ACACHE_GET_METADATA	_IOR('a', 1, struct acache_metadata)
+
+int acache_dev_init(void);
+void acache_dev_exit(void);
+struct acache_info *fetch_circ_item(struct acache_circ *circ);
+void save_circ_item(struct acache_info *data);
+
+#endif
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index fe6dce125aba..96951e638f17 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -2122,12 +2122,12 @@ static bool btree_insert_key(struct btree *b, struct bkey *k,
 	BUG_ON(bkey_cmp(k, &b->key) > 0);

 	status = bch_btree_insert_key(&b->keys, k, replace_key);
+	trace_bcache_btree_insert_key(b, k, replace_key != NULL,
+				      status);
 	if (status != BTREE_INSERT_STATUS_NO_INSERT) {
 		bch_check_keys(&b->keys, "%u for %s", status,
 			       replace_key ? "replace" : "insert");

-		trace_bcache_btree_insert_key(b, k, replace_key != NULL,
-					      status);
 		return true;
 	} else
 		return false;
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 214326383145..5a64afd56b97 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -8,11 +8,13 @@
  */

#include "bcache.h" +#include "acache.h" #include "btree.h" #include "debug.h" #include "request.h" #include "writeback.h"
+#include <linux/time.h> #include <linux/module.h> #include <linux/hash.h> #include <linux/random.h> @@ -308,10 +310,18 @@ static void bch_data_insert_start(struct closure *cl) void bch_data_insert(struct closure *cl) { struct data_insert_op *op = container_of(cl, struct data_insert_op, cl); + struct acache_info msg;
trace_bcache_write(op->c, op->inode, op->bio, op->writeback, op->bypass);
+ msg.offset = op->bio->bi_iter.bi_sector; + msg.length = op->bio->bi_iter.bi_size; + msg.type = ACACHE_INFO_CACHE_INSERT; + msg.dev = bio_dev(op->bio); + msg.start_time = ktime_get_ns(); + save_circ_item(&msg); + bch_keylist_init(&op->insert_keys); bio_get(op->bio); bch_data_insert_start(cl); @@ -460,28 +470,6 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio)
/* Cache lookup */
-struct search { - /* Stack frame for bio_complete */ - struct closure cl; - - struct bbio bio; - struct bio *orig_bio; - struct bio *cache_miss; - struct bcache_device *d; - - unsigned int insert_bio_sectors; - unsigned int recoverable:1; - unsigned int write:1; - unsigned int read_dirty_data:1; - unsigned int cache_missed:1; - - struct hd_struct *part; - unsigned long start_time; - - struct btree_op op; - struct data_insert_op iop; -}; - static void bch_cache_read_endio(struct bio *bio) { struct bbio *b = container_of(bio, struct bbio, bio); @@ -540,6 +528,7 @@ static int cache_lookup_fn(struct btree_op *op, struct btree *b, struct bkey *k) return MAP_CONTINUE;
/* XXX: figure out best pointer - for multiple cache devices */ + ptr = 0;
PTR_BUCKET(b->c, k, ptr)->prio = INITIAL_PRIO; @@ -557,21 +546,25 @@ static int cache_lookup_fn(struct btree_op *op, struct btree *b, struct bkey *k) bch_cut_front(&KEY(s->iop.inode, n->bi_iter.bi_sector, 0), bio_key); bch_cut_back(&KEY(s->iop.inode, bio_end_sector(n), 0), bio_key);
- n->bi_end_io = bch_cache_read_endio; - n->bi_private = &s->cl; + if (!s->prefetch) { + n->bi_end_io = bch_cache_read_endio; + n->bi_private = &s->cl;
- /* - * The bucket we're reading from might be reused while our bio - * is in flight, and we could then end up reading the wrong - * data. - * - * We guard against this by checking (in cache_read_endio()) if - * the pointer is stale again; if so, we treat it as an error - * and reread from the backing device (but we don't pass that - * error up anywhere). - */ + /* + * The bucket we're reading from might be reused while our bio + * is in flight, and we could then end up reading the wrong + * data. + * + * We guard against this by checking (in cache_read_endio()) if + * the pointer is stale again; if so, we treat it as an error + * and reread from the backing device (but we don't pass that + * error up anywhere). + */
- __bch_submit_bbio(n, b->c); + __bch_submit_bbio(n, b->c); + } else { + bio_put(n); + } return n == bio ? MAP_DONE : MAP_CONTINUE; }
@@ -674,7 +667,12 @@ static void bio_complete(struct search *s)

 		trace_bcache_request_end(s->d, s->orig_bio);
 		s->orig_bio->bi_status = s->iop.status;
-		bio_endio(s->orig_bio);
+		if (s->prefetch) {
+			bio_free_pages(s->orig_bio);
+			bio_put(s->orig_bio);
+		} else {
+			bio_endio(s->orig_bio);
+		}
 		s->orig_bio = NULL;
 	}
 }
@@ -699,7 +697,7 @@ static void do_bio_hook(struct search *s,
 	bio_cnt_set(bio, 3);
 }

-static void search_free(struct closure *cl)
+void search_free(struct closure *cl)
 {
 	struct search *s = container_of(cl, struct search, cl);

@@ -713,8 +711,8 @@ static void search_free(struct closure *cl)
 	mempool_free(s, &s->iop.c->search);
 }

-static inline struct search *search_alloc(struct bio *bio,
-					  struct bcache_device *d)
+struct search *search_alloc(struct bio *bio,
+			    struct bcache_device *d, bool prefetch)
 {
 	struct search *s;

@@ -733,6 +731,7 @@ static inline struct search *search_alloc(struct bio *bio,
 	s->read_dirty_data = 0;
 	/* Count on the bcache device */
 	s->start_time = part_start_io_acct(d->disk, &s->part, bio);
+	s->prefetch = prefetch;
 	s->iop.c = d->c;
 	s->iop.bio = NULL;
 	s->iop.inode = d->id;
@@ -836,17 +835,22 @@ static void cached_dev_read_done(struct closure *cl)
 		s->iop.bio->bi_iter.bi_size = s->insert_bio_sectors << 9;
 		bch_bio_map(s->iop.bio, NULL);

-		bio_copy_data(s->cache_miss, s->iop.bio);
+		if (!s->prefetch)
+			bio_copy_data(s->cache_miss, s->iop.bio);
+		else
+			trace_bcache_prefetch_cache_miss(s->iop.bio);

 		bio_put(s->cache_miss);
 		s->cache_miss = NULL;
+
 	}

 	if (verify(dc) && s->recoverable && !s->read_dirty_data)
 		bch_data_verify(dc, s->orig_bio);

 	closure_get(&dc->disk.cl);
-	bio_complete(s);
+	if (!s->prefetch)
+		bio_complete(s);

 	if (s->iop.bio &&
 	    !test_bit(CACHE_SET_STOPPING, &s->iop.c->flags)) {
@@ -862,10 +866,19 @@ static void cached_dev_read_done_bh(struct closure *cl)
 	struct search *s = container_of(cl, struct search, cl);
 	struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);

-	bch_mark_cache_accounting(s->iop.c, s->d,
+	if (s->prefetch)
+		pr_debug("prefetch request; do not count cache_missed");
+	else
+		bch_mark_cache_accounting(s->iop.c, s->d,
 				  !s->cache_missed, s->iop.bypass);
 	trace_bcache_read(s->orig_bio, !s->cache_missed, s->iop.bypass);

+	if (!s->prefetch && !s->iop.status) {
+		s->smp.type = ACACHE_INFO_LATENCY;
+		s->smp.start_time = ktime_get_ns() - s->smp.start_time;
+		save_circ_item(&s->smp);
+	}
+
 	if (s->iop.status)
 		continue_at_nobarrier(cl, cached_dev_read_error, bcache_wq);
 	else if (s->iop.bio || verify(dc))
@@ -891,8 +904,9 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s,
 	}

 	if (!(bio->bi_opf & REQ_RAHEAD) &&
-	    !(bio->bi_opf & (REQ_META|REQ_PRIO)) &&
-	    s->iop.c->gc_stats.in_use < CUTOFF_CACHE_READA)
+	    !(bio->bi_opf & (REQ_META|REQ_PRIO)) &&
+	    s->iop.c->gc_stats.in_use < CUTOFF_CACHE_READA &&
+	    !s->prefetch)
 		reada = min_t(sector_t, dc->readahead >> 9,
 			      get_capacity(bio->bi_disk) - bio_end_sector(bio));

@@ -943,14 +957,18 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s,
 out_put:
 	bio_put(cache_bio);
 out_submit:
-	miss->bi_end_io = backing_request_endio;
-	miss->bi_private = &s->cl;
-	/* I/O request sent to backing device */
-	closure_bio_submit(s->iop.c, miss, &s->cl);
+	if (!s->prefetch) {
+		miss->bi_end_io = backing_request_endio;
+		miss->bi_private = &s->cl;
+		/* I/O request sent to backing device */
+		closure_bio_submit(s->iop.c, miss, &s->cl);
+	} else {
+		bio_put(miss);
+	}
 	return ret;
 }

-static void cached_dev_read(struct cached_dev *dc, struct search *s)
+void cached_dev_read(struct cached_dev *dc, struct search *s)
 {
 	struct closure *cl = &s->cl;

@@ -1197,7 +1215,7 @@ blk_qc_t cached_dev_submit_bio(struct bio *bio)
 		bio->bi_iter.bi_sector += dc->sb.data_offset;

 	if (cached_dev_get(dc)) {
-		s = search_alloc(bio, d);
+		s = search_alloc(bio, d, false);
 		trace_bcache_request_start(s->d, bio);

 		if (!bio->bi_iter.bi_size) {
@@ -1211,6 +1229,15 @@ blk_qc_t cached_dev_submit_bio(struct bio *bio)
 		} else {
 			s->iop.bypass = check_should_bypass(dc, bio);

+			if (!s->iop.bypass && bio->bi_iter.bi_size && !rw) {
+				s->smp.offset = bio->bi_iter.bi_sector - dc->sb.data_offset;
+				s->smp.length = bio->bi_iter.bi_size;
+				s->smp.type = rw;
+				s->smp.dev = dc->bdev->bd_dev;
+				s->smp.start_time = ktime_get_ns();
+				save_circ_item(&s->smp);
+			}
+
 			if (rw)
 				cached_dev_write(dc, s);
 			else
@@ -1281,7 +1308,7 @@ blk_qc_t flash_dev_submit_bio(struct bio *bio)
 		return BLK_QC_T_NONE;
 	}

-	s = search_alloc(bio, d);
+	s = search_alloc(bio, d, false);
 	cl = &s->cl;
 	bio = &s->bio.bio;
diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h
index 82b38366a95d..21678037d215 100644
--- a/drivers/md/bcache/request.h
+++ b/drivers/md/bcache/request.h
@@ -1,6 +1,8 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 #ifndef _BCACHE_REQUEST_H_
 #define _BCACHE_REQUEST_H_
+#include "btree.h"
+#include "acache.h"

 struct data_insert_op {
 	struct closure cl;
@@ -44,4 +46,34 @@ blk_qc_t flash_dev_submit_bio(struct bio *bio);

 extern struct kmem_cache *bch_search_cache;

+struct search {
+	/* Stack frame for bio_complete */
+	struct closure cl;
+
+	struct bbio bio;
+	struct bio *orig_bio;
+	struct bio *cache_miss;
+	struct bcache_device *d;
+
+	unsigned int insert_bio_sectors;
+	unsigned int recoverable:1;
+	unsigned int write:1;
+	unsigned int read_dirty_data:1;
+	unsigned int cache_missed:1;
+
+	struct hd_struct *part;
+	unsigned long start_time;
+	/* for prefetch, we do not need to copy data to the bio */
+	bool prefetch;
+	struct list_head list_node;
+	wait_queue_head_t wqh;
+	struct acache_info smp;
+
+	struct btree_op op;
+	struct data_insert_op iop;
+};
+
+void search_free(struct closure *cl);
+struct search *search_alloc(struct bio *bio, struct bcache_device *d, bool prefetch);
+void cached_dev_read(struct cached_dev *dc, struct search *s);
+
 #endif /* _BCACHE_REQUEST_H_ */
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 81f1cc5b3499..a849fc51cd88 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -8,6 +8,7 @@
  */

#include "bcache.h" +#include "acache.h" #include "btree.h" #include "debug.h" #include "extents.h" @@ -2846,6 +2847,7 @@ static void bcache_exit(void)
if (bcache_major) unregister_blkdev(bcache_major, "bcache"); + acache_dev_exit(); unregister_reboot_notifier(&reboot); mutex_destroy(&bch_register_lock); } @@ -2932,6 +2934,8 @@ static int __init bcache_init(void)
bch_debug_init(); closure_debug_init(); + if (acache_dev_init()) + goto err;
bcache_is_reboot = false;
diff --git a/include/trace/events/bcache.h b/include/trace/events/bcache.h
index e41c611d6d3b..f7be8c6e7cff 100644
--- a/include/trace/events/bcache.h
+++ b/include/trace/events/bcache.h
@@ -75,6 +75,12 @@ DECLARE_EVENT_CLASS(btree_node,
 	TP_printk("bucket %zu", __entry->bucket)
 );

+/* acache.c */
+DEFINE_EVENT(bcache_request, bcache_prefetch_request,
+	TP_PROTO(struct bcache_device *d, struct bio *bio),
+	TP_ARGS(d, bio)
+);
+
 /* request.c */

 DEFINE_EVENT(bcache_request, bcache_request_start,
@@ -120,6 +126,11 @@ DEFINE_EVENT(bcache_bio, bcache_bypass_congested,
 	TP_ARGS(bio)
 );

+DEFINE_EVENT(bcache_bio, bcache_prefetch_cache_miss,
+	TP_PROTO(struct bio *bio),
+	TP_ARGS(bio)
+);
+
 TRACE_EVENT(bcache_read,
 	TP_PROTO(struct bio *bio, bool hit, bool bypass),
 	TP_ARGS(bio, hit, bypass),
Hi Zengkai, Jianhai, Ruilin,
As discussed earlier, this bcache series conflicts heavily with mainline backporting, so let's revert it for now.
You can adapt the patches afterwards; even better, push them to the upstream community.
--
Thanks,
Xie XiuQi
From: Li Ruilin <liruilin4@huawei.com>
euleros inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I4LOJ6
CVE: NA
------------------------------
Provide a switch named read_bypass. If enabled, all read requests bypass the cache. This option can be useful when userspace prefetch is enabled and the cache device has low capacity.
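As a usage illustration (not part of the patch): once the sysfs attribute added below exists, the switch could be toggled programmatically; the /sys/block/bcache0 path is an assumption for a typical single-device setup.

/*
 * Hypothetical helper: enable read_bypass for a bcache device via the
 * sysfs attribute this patch adds. The device name is an assumption.
 */
#include <stdio.h>

static int set_read_bypass(const char *bcache_dev, int enable)
{
	char path[256];
	FILE *f;

	snprintf(path, sizeof(path),
		 "/sys/block/%s/bcache/read_bypass", bcache_dev);
	f = fopen(path, "w");
	if (!f)
		return -1;
	fprintf(f, "%d\n", enable ? 1 : 0);
	fclose(f);
	return 0;
}

int main(void)
{
	return set_read_bypass("bcache0", 1);
}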
Signed-off-by: Li Ruilin <liruilin4@huawei.com>
Reviewed-by: Luan Jianhai <luanjianhai@huawei.com>
Reviewed-by: Peng Junyi <pengjunyi1@huawei.com>
Acked-by: Xie Xiuqi <xiexiuqi@huawei.com>
Signed-off-by: Cheng Jian <cj.chengjian@huawei.com>
Reviewed-by: Guangxing Deng <dengguangxing@huawei.com>
Reviewed-by: chao song <chao.song@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 Documentation/admin-guide/bcache.rst | 4 ++++
 drivers/md/bcache/bcache.h           | 2 ++
 drivers/md/bcache/request.c          | 6 ++++--
 drivers/md/bcache/super.c            | 1 +
 drivers/md/bcache/sysfs.c            | 6 ++++++
 5 files changed, 17 insertions(+), 2 deletions(-)
diff --git a/Documentation/admin-guide/bcache.rst b/Documentation/admin-guide/bcache.rst index 8d3a2d045c0a..c75aebf5821a 100644 --- a/Documentation/admin-guide/bcache.rst +++ b/Documentation/admin-guide/bcache.rst @@ -441,6 +441,10 @@ sequential_cutoff most recent 128 IOs are tracked so sequential IO can be detected even when it isn't all done at once.
+read_bypass + If enbale, all IO will bypass the cache. This option could be useful when we + enable userspace prefetch and the cache device is low capacity. + sequential_merge If non zero, bcache keeps a list of the last 128 requests submitted to compare against all new requests to determine which new requests are sequential diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index e8bf4f752e8b..8b10bd5df364 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -376,6 +376,8 @@ struct cached_dev { unsigned char writeback_percent; unsigned int writeback_delay;
+ unsigned int read_bypass; + uint64_t writeback_rate_target; int64_t writeback_rate_proportional; int64_t writeback_rate_integral; diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 5a64afd56b97..2e9ff76b877b 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -852,7 +852,7 @@ static void cached_dev_read_done(struct closure *cl) if (!s->prefetch) bio_complete(s);
- if (s->iop.bio && + if (s->iop.bio && (!dc->read_bypass || s->prefetch) && !test_bit(CACHE_SET_STOPPING, &s->iop.c->flags)) { BUG_ON(!s->iop.replace); closure_call(&s->iop.cl, bch_data_insert, NULL, cl); @@ -897,12 +897,14 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s,
s->cache_missed = 1;
- if (s->cache_miss || s->iop.bypass) { + if (s->cache_miss || s->iop.bypass || + (dc->read_bypass && !s->prefetch)) { miss = bio_next_split(bio, sectors, GFP_NOIO, &s->d->bio_split); ret = miss == bio ? MAP_DONE : MAP_CONTINUE; goto out_submit; }
+ /* if called from do_readahead, no need to do this */ if (!(bio->bi_opf & REQ_RAHEAD) && !(bio->bi_opf & (REQ_META|REQ_PRIO)) && s->iop.c->gc_stats.in_use < CUTOFF_CACHE_READA && diff --git a/drivers/md/bcache/super.c index a849fc51cd88..e96174ca10d1 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1439,6 +1439,7 @@ static int cached_dev_init(struct cached_dev *dc, unsigned int block_size) bch_cache_accounting_init(&dc->accounting, &dc->disk.cl);
dc->sequential_cutoff = 4 << 20; + dc->read_bypass = 0;
for (io = dc->io; io < dc->io + RECENT_IO; io++) { list_add(&io->lru, &dc->io_lru); diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 554e3afc9b68..39c1e7a544e5 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -108,6 +108,7 @@ rw_attribute(congested_read_threshold_us); rw_attribute(congested_write_threshold_us);
rw_attribute(sequential_cutoff); +rw_attribute(read_bypass); rw_attribute(data_csum); rw_attribute(cache_mode); rw_attribute(readahead_cache_policy); @@ -252,6 +253,7 @@ SHOW(__bch_cached_dev) var_printf(partial_stripes_expensive, "%u");
var_hprint(sequential_cutoff); + var_print(read_bypass); var_hprint(readahead);
sysfs_print(running, atomic_read(&dc->running)); @@ -346,6 +348,9 @@ STORE(__cached_dev) sysfs_strtoul_clamp(sequential_cutoff, dc->sequential_cutoff, 0, UINT_MAX); + sysfs_strtoul_clamp(read_bypass, + dc->read_bypass, + 0, 1); d_strtoi_h(readahead);
if (attr == &sysfs_clear_stats) @@ -511,6 +516,7 @@ static struct attribute *bch_cached_dev_files[] = { &sysfs_stripe_size, &sysfs_partial_stripes_expensive, &sysfs_sequential_cutoff, + &sysfs_read_bypass, &sysfs_clear_stats, &sysfs_running, &sysfs_state,
From: Li Ruilin liruilin4@huawei.com
euleros inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4LOJ6 CVE: NA
------------------------------
Add a list to save all prefetch requests. When an IO request comes in, check whether it overlaps any in-flight prefetch request. If it does, block the request until the prefetch request ends.
Add a switch to control whether this blocking is enabled. If it is not, count the overlapping IO request as a fake hit for performance analysis.
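The overlap test used by the kernel-side lookup is the usual half-open interval check on sectors; a self-contained sketch of the predicate (the actual patch compares bi_sector against bio_end_sector()):

#include <stdbool.h>
#include <stdint.h>

/* Two requests overlap iff each one starts before the other one ends;
 * this mirrors the comparison in __inflight_list_lookup_locked(). */
static bool sectors_overlap(uint64_t a_start, uint64_t a_sectors,
                            uint64_t b_start, uint64_t b_sectors)
{
        return a_start < b_start + b_sectors &&
               b_start < a_start + a_sectors;
}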
Signed-off-by: Li Ruilin liruilin4@huawei.com Reviewed-by: Luan Jianhai luanjianhai@huawei.com Reviewed-by: Peng Junyi pengjunyi1@huawei.com Acked-by: Xie Xiuqi xiexiuqi@huawei.com Signed-off-by: Cheng Jian cj.chengjian@huawei.com Reviewed-by: Guangxing Deng dengguangxing@huawei.com Reviewed-by: chao song chao.song@huawei.com Reviewed-by: chao song chao.song@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/md/bcache/acache.c | 113 ++++++++++++++++++++++++++++++++++ drivers/md/bcache/acache.h | 10 +++ drivers/md/bcache/bcache.h | 1 + drivers/md/bcache/request.c | 8 +++ drivers/md/bcache/stats.c | 13 ++++ drivers/md/bcache/stats.h | 3 + drivers/md/bcache/super.c | 1 + drivers/md/bcache/sysfs.c | 6 ++ include/trace/events/bcache.h | 11 ++++ 9 files changed, 166 insertions(+)
diff --git a/drivers/md/bcache/acache.c b/drivers/md/bcache/acache.c index ff3e120d9619..a3f5c4f1ba7c 100644 --- a/drivers/md/bcache/acache.c +++ b/drivers/md/bcache/acache.c @@ -31,6 +31,12 @@ int acache_prefetch_workers = 1000; module_param_named(prefetch_workers, acache_prefetch_workers, int, 0444); MODULE_PARM_DESC(prefetch_workers, "num of workers for processing prefetch requests");
+struct inflight_list_head { + struct list_head entry; + spinlock_t io_lock; + bool initialized; +}; + struct prefetch_worker { struct acache_info s; struct work_struct work; @@ -50,6 +56,8 @@ struct acache_device {
struct acache_circ *acache_info_circ;
+ struct inflight_list_head inflight_list; + struct workqueue_struct *wq; struct prefetch_worker *prefetch_workers; struct list_head prefetch_workers_free; @@ -295,6 +303,7 @@ int acache_dev_init(void) int major; struct device *dev;
+ inflight_list_ops.init(); major = alloc_chrdev_region(&adev.devno, 0, ACACHE_NR_DEVS, DEV_NAME); if (major < 0) { pr_err("failed to allocate chrdev region: %d\n", major); @@ -377,6 +386,7 @@ int acache_dev_init(void) fail_class: unregister_chrdev_region(adev.devno, ACACHE_NR_DEVS); fail_allocdev: + inflight_list_ops.exit(); return ret; }
@@ -395,9 +405,112 @@ void acache_dev_exit(void) kfree(adev.mem_regionp); unregister_chrdev_region(adev.devno, ACACHE_NR_DEVS); class_destroy(adev.class); + inflight_list_ops.exit(); kfree(adev.prefetch_workers); }
+static struct search *__inflight_list_lookup_locked(struct search *s) +{ + struct search *iter; + struct bio *bio, *sbio; + + if (!adev.inflight_list.initialized) + return NULL; + sbio = &s->bio.bio; + list_for_each_entry(iter, &adev.inflight_list.entry, list_node) { + bio = &iter->bio.bio; + if (sbio->bi_disk == bio->bi_disk && + sbio->bi_iter.bi_sector < bio_end_sector(bio) && + bio_end_sector(sbio) > bio->bi_iter.bi_sector) { + return iter; + } + } + return NULL; +} + +static void inflight_list_init(void) +{ + INIT_LIST_HEAD(&adev.inflight_list.entry); + spin_lock_init(&adev.inflight_list.io_lock); + adev.inflight_list.initialized = true; +} + +static void inflight_list_exit(void) +{ + if (!list_empty(&adev.inflight_list.entry)) + pr_err("exiting with inflight list not empty\n"); +} + +static int inflight_list_insert(struct search *s) +{ + if (!adev.inflight_list.initialized) + return -1; + + init_waitqueue_head(&s->wqh); + spin_lock(&adev.inflight_list.io_lock); + list_add_tail(&s->list_node, &adev.inflight_list.entry); + spin_unlock(&adev.inflight_list.io_lock); + + trace_bcache_inflight_list_insert(s->d, s->orig_bio); + return 0; +} + +static int inflight_list_remove(struct search *s) +{ + if (!adev.inflight_list.initialized) + return -1; + + spin_lock(&adev.inflight_list.io_lock); + list_del_init(&s->list_node); + spin_unlock(&adev.inflight_list.io_lock); + + wake_up_interruptible_all(&s->wqh); + + trace_bcache_inflight_list_remove(s->d, s->orig_bio); + return 0; +} + +static bool inflight_list_wait(struct search *s) +{ + struct search *pfs = NULL; + struct cached_dev *dc; + DEFINE_WAIT(wqe); + + if (!adev.inflight_list.initialized) + return false; + + spin_lock(&adev.inflight_list.io_lock); + pfs = __inflight_list_lookup_locked(s); + if (pfs == NULL) { + spin_unlock(&adev.inflight_list.io_lock); + return false; + } + + dc = container_of(pfs->d, struct cached_dev, disk); + if (!dc->inflight_block_enable) { + spin_unlock(&adev.inflight_list.io_lock); + return true; + } + + prepare_to_wait(&pfs->wqh, &wqe, TASK_INTERRUPTIBLE); + + /* hold the lock until we are on the waitqueue, so pfs cannot be removed and its wakeup cannot be missed */ + spin_unlock(&adev.inflight_list.io_lock); + schedule(); + + finish_wait(&pfs->wqh, &wqe); + + return true; +} + +const struct inflight_queue_ops inflight_list_ops = { + .init = inflight_list_init, + .exit = inflight_list_exit, + .insert = inflight_list_insert, + .remove = inflight_list_remove, + .wait = inflight_list_wait, +}; + struct cached_dev *get_cached_device_by_dev(dev_t dev) { struct cache_set *c, *tc; diff --git a/drivers/md/bcache/acache.h index dea6e8cb0a05..3c6453d0c4da 100644 --- a/drivers/md/bcache/acache.h +++ b/drivers/md/bcache/acache.h @@ -66,4 +66,14 @@ void acache_dev_exit(void); struct acache_info *fetch_circ_item(struct acache_circ *circ); void save_circ_item(struct acache_info *data);
+struct inflight_queue_ops { + void (*init)(void); + void (*exit)(void); + + int (*insert)(struct search *s); + int (*remove)(struct search *s); + bool (*wait)(struct search *s); +}; +extern const struct inflight_queue_ops inflight_list_ops; + #endif diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 8b10bd5df364..53e07c958924 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -376,6 +376,7 @@ struct cached_dev { unsigned char writeback_percent; unsigned int writeback_delay;
+ unsigned int inflight_block_enable; unsigned int read_bypass;
uint64_t writeback_rate_target; diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 2e9ff76b877b..fd381da32464 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -706,6 +706,9 @@ void search_free(struct closure *cl) if (s->iop.bio) bio_put(s->iop.bio);
+ if (s->prefetch) + inflight_list_ops.remove(s); + bio_complete(s); closure_debug_destroy(cl); mempool_free(s, &s->iop.c->search); @@ -974,6 +977,11 @@ void cached_dev_read(struct cached_dev *dc, struct search *s) { struct closure *cl = &s->cl;
+ if (s->prefetch) + inflight_list_ops.insert(s); + else if (inflight_list_ops.wait(s)) + bch_mark_cache_prefetch_fake_hit(s->iop.c, s->d); + closure_call(&s->iop.cl, cache_lookup, NULL, cl); continue_at(cl, cached_dev_read_done_bh, NULL); } diff --git a/drivers/md/bcache/stats.c b/drivers/md/bcache/stats.c index 503aafe188dc..c7a6c93aa9e9 100644 --- a/drivers/md/bcache/stats.c +++ b/drivers/md/bcache/stats.c @@ -48,6 +48,7 @@ read_attribute(cache_bypass_misses); read_attribute(cache_hit_ratio); read_attribute(cache_readaheads); read_attribute(cache_miss_collisions); +read_attribute(cache_prefetch_fake_hits); read_attribute(bypassed);
SHOW(bch_stats) @@ -66,6 +67,7 @@ SHOW(bch_stats)
var_print(cache_readaheads); var_print(cache_miss_collisions); + var_print(cache_prefetch_fake_hits); sysfs_hprint(bypassed, var(sectors_bypassed) << 9); #undef var return 0; @@ -88,6 +90,7 @@ static struct attribute *bch_stats_files[] = { &sysfs_cache_hit_ratio, &sysfs_cache_readaheads, &sysfs_cache_miss_collisions, + &sysfs_cache_prefetch_fake_hits, &sysfs_bypassed, NULL }; @@ -147,6 +150,7 @@ static void scale_stats(struct cache_stats *stats, unsigned long rescale_at) scale_stat(&stats->cache_bypass_misses); scale_stat(&stats->cache_readaheads); scale_stat(&stats->cache_miss_collisions); + scale_stat(&stats->cache_prefetch_fake_hits); scale_stat(&stats->sectors_bypassed); } } @@ -170,6 +174,7 @@ static void scale_accounting(struct timer_list *t) move_stat(cache_bypass_misses); move_stat(cache_readaheads); move_stat(cache_miss_collisions); + move_stat(cache_prefetch_fake_hits); move_stat(sectors_bypassed);
scale_stats(&acc->total, 0); @@ -225,6 +230,14 @@ void bch_mark_cache_miss_collision(struct cache_set *c, struct bcache_device *d) atomic_inc(&c->accounting.collector.cache_miss_collisions); }
+void bch_mark_cache_prefetch_fake_hit(struct cache_set *c, struct bcache_device *d) +{ + struct cached_dev *dc = container_of(d, struct cached_dev, disk); + + atomic_inc(&dc->accounting.collector.cache_prefetch_fake_hits); + atomic_inc(&c->accounting.collector.cache_prefetch_fake_hits); +} + void bch_mark_sectors_bypassed(struct cache_set *c, struct cached_dev *dc, int sectors) { diff --git a/drivers/md/bcache/stats.h b/drivers/md/bcache/stats.h index abfaabf7e7fc..302b76e982b4 100644 --- a/drivers/md/bcache/stats.h +++ b/drivers/md/bcache/stats.h @@ -9,6 +9,7 @@ struct cache_stat_collector { atomic_t cache_bypass_misses; atomic_t cache_readaheads; atomic_t cache_miss_collisions; + atomic_t cache_prefetch_fake_hits; atomic_t sectors_bypassed; };
@@ -21,6 +22,7 @@ struct cache_stats { unsigned long cache_bypass_misses; unsigned long cache_readaheads; unsigned long cache_miss_collisions; + unsigned long cache_prefetch_fake_hits; unsigned long sectors_bypassed;
unsigned int rescale; @@ -58,6 +60,7 @@ void bch_mark_cache_accounting(struct cache_set *c, struct bcache_device *d, void bch_mark_cache_readahead(struct cache_set *c, struct bcache_device *d); void bch_mark_cache_miss_collision(struct cache_set *c, struct bcache_device *d); +void bch_mark_cache_prefetch_fake_hit(struct cache_set *c, struct bcache_device *d); void bch_mark_sectors_bypassed(struct cache_set *c, struct cached_dev *dc, int sectors); diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index e96174ca10d1..38afb2a58f14 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1439,6 +1439,7 @@ static int cached_dev_init(struct cached_dev *dc, unsigned int block_size) bch_cache_accounting_init(&dc->accounting, &dc->disk.cl);
dc->sequential_cutoff = 4 << 20; + dc->inflight_block_enable = 1; dc->read_bypass = 0;
for (io = dc->io; io < dc->io + RECENT_IO; io++) { diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 39c1e7a544e5..515539520428 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -109,6 +109,7 @@ rw_attribute(congested_write_threshold_us);
rw_attribute(sequential_cutoff); rw_attribute(read_bypass); +rw_attribute(inflight_block_enable); rw_attribute(data_csum); rw_attribute(cache_mode); rw_attribute(readahead_cache_policy); @@ -253,6 +254,7 @@ SHOW(__bch_cached_dev) var_printf(partial_stripes_expensive, "%u");
var_hprint(sequential_cutoff); + var_print(inflight_block_enable); var_print(read_bypass); var_hprint(readahead);
@@ -351,6 +353,9 @@ STORE(__cached_dev) sysfs_strtoul_clamp(read_bypass, dc->read_bypass, 0, 1); + sysfs_strtoul_clamp(inflight_block_enable, + dc->inflight_block_enable, + 0, 1); d_strtoi_h(readahead);
if (attr == &sysfs_clear_stats) @@ -517,6 +522,7 @@ static struct attribute *bch_cached_dev_files[] = { &sysfs_partial_stripes_expensive, &sysfs_sequential_cutoff, &sysfs_read_bypass, + &sysfs_inflight_block_enable, &sysfs_clear_stats, &sysfs_running, &sysfs_state, diff --git a/include/trace/events/bcache.h b/include/trace/events/bcache.h index f7be8c6e7cff..38986cdf52cc 100644 --- a/include/trace/events/bcache.h +++ b/include/trace/events/bcache.h @@ -81,6 +81,17 @@ DEFINE_EVENT(bcache_request, bcache_prefetch_request, TP_ARGS(d, bio) );
+/* acache.c */ +DEFINE_EVENT(bcache_request, bcache_inflight_list_insert, + TP_PROTO(struct bcache_device *d, struct bio *bio), + TP_ARGS(d, bio) +); + +DEFINE_EVENT(bcache_request, bcache_inflight_list_remove, + TP_PROTO(struct bcache_device *d, struct bio *bio), + TP_ARGS(d, bio) +); + /* request.c */
DEFINE_EVENT(bcache_request, bcache_request_start,
From: Li Ruilin liruilin4@huawei.com
euleros inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4LOJ6 CVE: NA
------------------------------
In writearound cache mode, a read request that quickly follows a write request may overwrite the invalidating bkey inserted by the write request.
The function bch_data_insert() is invoked asynchronously while the bio is submitted to the backing block device, so a read request may be submitted after bch_data_insert() has finished yet complete before the backing bio ends. Such a read fetches pre-write data from the backing block device and inserts that stale data into the cache device. In writearound cache mode, however, bcache will not invalidate the data again, so subsequent reads will read the stale data from the cache, causing data corruption.
This patch delays the invalidation until the backing bio ends to avoid this corruption.
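To make the window concrete, the interleaving being closed looks like this, written as an annotated timeline in comment form (the function names are the ones touched by the diff below):

/*
 * Writearound write, before this patch:
 *
 *   cached_dev_write()
 *       bch_data_insert()       invalidates the written range in the cache
 *       closure_bio_submit()    backing write still in flight
 *   read of the same range     misses, fetches pre-write data from the
 *                              backing device, inserts it into the cache
 *   backing write ends         cache now holds stale data for good
 *
 * After this patch, bch_data_insert() runs from cached_dev_write_complete(),
 * i.e. only once the backing bio has ended, so the read above can no longer
 * repopulate the cache with pre-write data.
 */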
Signed-off-by: Li Ruilin liruilin4@huawei.com Reviewed-by: Luan Jianhai luanjianhai@huawei.com Reviewed-by: Peng Junyi pengjunyi1@huawei.com Acked-by: Xie Xiuqi xiexiuqi@huawei.com Signed-off-by: Cheng Jian cj.chengjian@huawei.com Reviewed-by: Guangxing Deng dengguangxing@huawei.com Reviewed-by: chao song chao.song@huawei.com Reviewed-by: chao song chao.song@huawei.com Reviewed-by: chao song chao.song@huawei.com Reviewed-by: chao song chao.song@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/md/bcache/request.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-)
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index fd381da32464..04a779573fdd 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -993,8 +993,11 @@ static void cached_dev_write_complete(struct closure *cl) struct search *s = container_of(cl, struct search, cl); struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
+ if (!s->iop.bypass) + closure_call(&s->iop.cl, bch_data_insert, NULL, cl); + up_read_non_owner(&dc->writeback_lock); - cached_dev_bio_complete(cl); + continue_at(cl, cached_dev_bio_complete, NULL); }
static void cached_dev_write(struct cached_dev *dc, struct search *s) @@ -1077,7 +1080,8 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s) }
insert_data: - closure_call(&s->iop.cl, bch_data_insert, NULL, cl); + if (!s->iop.bypass) + closure_call(&s->iop.cl, bch_data_insert, NULL, cl); continue_at(cl, cached_dev_write_complete, NULL); }
From: Li Ruilin liruilin4@huawei.com
euleros inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4LOJ6 CVE: NA
------------------------------
Add a sample for the new prefetch framework on bcache. The sample program simply reads every read request received by the bcache device and sends a prefetch request for each one. The length of the prefetch request equals that of the read request, and its position immediately follows the read request.
Signed-off-by: Li Ruilin liruilin4@huawei.com Reviewed-by: Luan Jianhai luanjianhai@huawei.com Reviewed-by: Peng Junyi pengjunyi1@huawei.com Acked-by: Xie Xiuqi xiexiuqi@huawei.com Signed-off-by: Cheng Jian cj.chengjian@huawei.com Reviewed-by: Guangxing Deng dengguangxing@huawei.com Reviewed-by: chao song chao.song@huawei.com Reviewed-by: chao song chao.song@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- samples/acache_client/Makefile | 13 +++ samples/acache_client/connect.c | 144 ++++++++++++++++++++++++++++++++ samples/acache_client/connect.h | 74 ++++++++++++++++ samples/acache_client/main.c | 133 +++++++++++++++++++++++++++++ 4 files changed, 364 insertions(+) create mode 100644 samples/acache_client/Makefile create mode 100644 samples/acache_client/connect.c create mode 100644 samples/acache_client/connect.h create mode 100644 samples/acache_client/main.c
diff --git a/samples/acache_client/Makefile b/samples/acache_client/Makefile new file mode 100644 index 000000000000..13e5485b3d2f --- /dev/null +++ b/samples/acache_client/Makefile @@ -0,0 +1,13 @@ +.PHONY: client clean + +CC = $(CROSS_COMPILE)gcc +CFLAGS = -Wall -g + + +OBJ = main.o connect.o +client: ${OBJ} + $(CC) $(CFLAGS) $^ -o acache_client +.c.o: + $(CC) $(CFLAGS) -c $< -o $@ +clean: + rm -f *.o acache_client diff --git a/samples/acache_client/connect.c b/samples/acache_client/connect.c new file mode 100644 index 000000000000..2dd442415ee2 --- /dev/null +++ b/samples/acache_client/connect.c @@ -0,0 +1,144 @@ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <fcntl.h> +#include <sys/mman.h> +#include <sys/stat.h> +#include <sys/ioctl.h> +#include <unistd.h> +#include <errno.h> + +#include "connect.h" + +static int ACACHE_READWRITE_CAPACITY = 4096; +static struct connection readwrite_conn; +static struct readwrite_conn_metadata { + int initialized; + int fd; +} private; + +void *initialize(struct connection *self) +{ + long ret; + + private.fd = open(acache_path, O_RDWR | O_SYNC); + if (private.fd == -1) { + fprintf(stderr, "error opening device: %s\n", strerror(errno)); + exit(-1); + } + + struct acache_metadata { + uint32_t magic; + uint32_t conntype; + uint32_t devsize; + } acache_metadata; +#define ACACHE_GET_METADATA _IOR('a', 1, struct acache_metadata) + ret = ioctl(private.fd, ACACHE_GET_METADATA, &acache_metadata); + if (ret) { + fprintf(stderr, "error getting device memory length: %s\n", strerror(errno)); + exit(-1); + } + if (acache_metadata.magic != ACACHE_MAGIC) { + fprintf(stderr, "version not match; client: %u kernel: %u\n", + ACACHE_MAGIC, acache_metadata.magic); + exit(-1); + } + if (acache_metadata.conntype != ACACHE_READWRITE_CONN) { + fprintf(stderr, "connect type not match; client: %u kernel: %u\n", + ACACHE_READWRITE_CONN, acache_metadata.conntype); + exit(-1); + } + printf("got dev size %u\n", acache_metadata.devsize); + private.initialized = 1; + + return (void *)&private; +} + +struct readwrite_conn_metadata* get_metadata(struct connection *self) +{ + struct readwrite_conn_metadata *metadata; + + if (self == NULL) { + fprintf(stderr, "connection uninitialized\n"); + return NULL; + } + + metadata = (struct readwrite_conn_metadata *)self->private; + + if (metadata->initialized == 0) { + fprintf(stderr, "connection uninitialized\n"); + return NULL; + } + return metadata; + } + +int send_items(struct connection *self, struct acache_info *infos, + size_t count) +{ + long ret; + struct readwrite_conn_metadata *metadata = get_metadata(self); + + if (!metadata) { + return 0; + } + ret = write(metadata->fd, (void*)infos, count * sizeof(struct acache_info)); + if (ret < 0) { + fprintf(stderr, "error writing data: %ld\n", ret); + return 0; + } + if (ret % sizeof(struct acache_info)) { + fprintf(stderr, "error writing data: data length is not multiple of sizeof(struct acache_info): %ld %ld\n", + ret, sizeof(struct acache_info)); + return 0; + } + return ret / sizeof(struct acache_info); +} + +int fetch_items(struct connection *self, struct acache_info *infos, + size_t count) +{ + long ret; + struct readwrite_conn_metadata *metadata = get_metadata(self); + + if (!metadata) { + return 0; + } + ret = read(metadata->fd, (void*)infos, count * sizeof(struct acache_info)); + if (ret < 0) { + fprintf(stderr, "error reading data: %ld\n", ret); + return 0; + } + if (ret % sizeof(struct acache_info)) { + fprintf(stderr, "error reading data: data length is not multiple of sizeof(struct acache_info): %ld %ld\n", + ret, sizeof(struct acache_info)); + return 0; + } + return ret / sizeof(struct acache_info); +} + +int get_capacity(struct connection *self) { + return ACACHE_READWRITE_CAPACITY; +} + +int close_conn(struct connection *self) +{ + struct readwrite_conn_metadata *metadata = get_metadata(self); + + if (!metadata) { + return 0; + } + close(metadata->fd); + return 0; + +} + +struct connection *initialize_conn_rw(void) +{ + readwrite_conn.ops.close = close_conn; + readwrite_conn.ops.initialize = initialize; + readwrite_conn.ops.send_items = send_items; + readwrite_conn.ops.fetch_items = fetch_items; + readwrite_conn.ops.get_capacity = get_capacity; + readwrite_conn.private = initialize(&readwrite_conn); + return &readwrite_conn; +} diff --git a/samples/acache_client/connect.h b/samples/acache_client/connect.h new file mode 100644 index 000000000000..b0357c78c8c4 --- /dev/null +++ b/samples/acache_client/connect.h @@ -0,0 +1,74 @@ +#ifndef ACACHE_CONNECT_H +#define ACACHE_CONNECT_H +#include <stdint.h> + +#define ACACHE_MAGIC 2 +enum acache_conn_types { + ACACHE_NO_CONN = 0, + ACACHE_RINGBUFFER_CONN, + ACACHE_READWRITE_CONN, +}; +#define acache_path "/dev/acache" + +struct acache_info { + uint64_t length; + uint64_t offset; + uint64_t start_time; + uint32_t dev; + int opcode; +}; + +struct connection; +struct connection_operations { + + /* + * initialize connection + * parameters: none + * return values: + * - void *: private data for connection + */ + void *(*initialize)(struct connection *self); + /* + * send_items sends items to the peer side + * parameters: + * - infos: data to send + * - count: data length + * return values: + * - number of sent items + */ + int (*send_items)(struct connection *self, struct acache_info *infos, + size_t count); + /* + * fetch_items receives items from the peer side + * parameters: + * - infos: buffer to place received items + * - count: length of buffer + * return values: + * - number of received items + */ + int (*fetch_items)(struct connection *self, struct acache_info *infos, + size_t count); + /* + * close closes the connection + */ + int (*close)(struct connection *self); + + /* + * get_capacity returns the number of items that can be sent or received at once + */ + int (*get_capacity)(struct connection *self); + +}; + +struct connection { + /* + * private data for a specific connection + */ + void *private; + struct connection_operations ops; +}; + +struct connection *initialize_conn_rw(void); + +#endif + diff --git a/samples/acache_client/main.c b/samples/acache_client/main.c new file mode 100644 index 000000000000..929c70798cfb --- /dev/null +++ b/samples/acache_client/main.c @@ -0,0 +1,133 @@ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <fcntl.h> +#include <sys/mman.h> +#include <sys/stat.h> +#include <signal.h> +#include <unistd.h> +#include <errno.h> + +#include "connect.h" + +/* + * dev_t in userspace is 8 bytes long but 4 bytes long in the kernel; + * work around this + */ +#define MINORBITS 20 +#define MINORMASK ((1U << MINORBITS) - 1) +#define MKDEV(ma, mi) ((ma)<<MINORBITS | (mi)) +#define MAJOR(dev) ((unsigned int) ((dev) >> MINORBITS)) +#define MINOR(dev) ((unsigned int) ((dev) & MINORMASK)) + +struct acache_info *inbuf, *outbuf; +struct connection *conn; + +void print_infos(const char *prefix, struct acache_info *infos, size_t length) +{ + size_t i; + struct acache_info *info; + + for (i = 0; i < length; i++) { + info = infos + i; + + printf("%4s,%20lu,%8u,%8u,%15lu,%12lu\n", + prefix, info->start_time, MAJOR(info->dev), + MINOR(info->dev), info->offset, info->length); + } +} + +int malloc_buffers(struct acache_info **inbuf, struct acache_info **outbuf, + size_t capacity) +{ + /* prepare buffers to store incoming or outgoing items */ + *inbuf = (struct acache_info *)malloc(sizeof(struct acache_info) * capacity); + *outbuf = (struct acache_info *)malloc(sizeof(struct acache_info) * capacity); + + if (!*inbuf || !*outbuf) { + fprintf(stderr, "error malloc memory: %s, size: %lu, %lu\n", + strerror(errno), + sizeof(struct acache_info) * capacity, + sizeof(struct acache_info) * capacity); + return -errno; + } + return 0; +} + +void free_buffer(struct acache_info **buf) +{ + if (buf && *buf) { + free(*buf); + *buf = NULL; + } +} + +void elegant_exit(int sig) { + printf("exiting..."); + free_buffer(&inbuf); + free_buffer(&outbuf); + conn->ops.close(conn); + exit(0); +} + +int main(int argc, char **argv) +{ + int debug = 0; + int ret; + int outbuf_tail; + size_t capacity; + + conn = initialize_conn_rw(); + + if (conn == NULL) { + fprintf(stderr, "error initializing connection\n"); + return -1; + } + + if (argc > 1 && strcmp("-d", argv[1]) == 0) + debug = 1; + + /* prepare buffers to store incoming or outgoing items */ + capacity = conn->ops.get_capacity(conn); + ret = malloc_buffers(&inbuf, &outbuf, capacity); + + if (ret < 0) + return ret; + + if (debug) { + printf("%4s,%20s,%8s,%8s,%15s,%12s\n", + "op","time(ns)","majorDev","minorDev","offset(B)","length(B)"); + } + /* main loop */ + if (signal(SIGINT, elegant_exit) == SIG_ERR) { + fprintf(stderr, "error handling SIGINT: %s\n", strerror(errno)); + } + if (signal(SIGTERM, elegant_exit) == SIG_ERR) { + fprintf(stderr, "error handling SIGTERM: %s\n", strerror(errno)); + } + while (1) { + unsigned int i, inlen; + + inlen = conn->ops.fetch_items(conn, inbuf, capacity); + if (!inlen) { + usleep(100); + continue; + } + + outbuf_tail = 0; + for (i = 0; i < inlen; i++) { + /* customize prefetch strategy here */ + memcpy(outbuf + outbuf_tail, inbuf + i, sizeof(struct acache_info)); + outbuf[outbuf_tail].offset += outbuf[outbuf_tail].length >> 9; + outbuf_tail++; + } + if (debug) { + print_infos("R", inbuf, inlen); + print_infos("P", outbuf, outbuf_tail); + } + if (outbuf_tail) { + conn->ops.send_items(conn, outbuf, outbuf_tail); + } + } + return 0; +}
From: Li Ruilin liruilin4@huawei.com
euleros inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4LOJ6 CVE: NA
------------------------------
The recently pushed bugfix patch "Delay to invalidate cache data in writearound write" has a copy&paste bug, which causes bypassed write requests to never invalidate data in the cache device, causing data corruption. This patch fixes that corruption. It also ensures that the writeback lock is released only after the data insert.
Fixes: 6a1d9c41b367 ("bcache: Delay to invalidate cache data in writearound write") Signed-off-by: Li Ruilin liruilin4@huawei.com Signed-off-by: Song Chao chao.song@huawei.com Reviewed-by: Peng Junyi pengjunyi1@huawei.com Acked-by: Xie Xiuqi xiexiuqi@huawei.com Signed-off-by: Cheng Jian cj.chengjian@huawei.com Reviewed-by: Guangxing Deng dengguangxing@huawei.com Reviewed-by: chao song chao.song@huawei.com Reviewed-by: chao song chao.song@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/md/bcache/request.c | 15 +++++++++++---- drivers/md/bcache/request.h | 13 +++++++++++++ 2 files changed, 24 insertions(+), 4 deletions(-)
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 04a779573fdd..bad70906e8a2 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -735,6 +735,8 @@ struct search *search_alloc(struct bio *bio, /* Count on the bcache device */ s->start_time = part_start_io_acct(d->disk, &s->part, bio); s->prefetch = prefetch; + s->write_inval_data_putoff = false; + s->iop.c = d->c; s->iop.bio = NULL; s->iop.inode = d->id; @@ -755,6 +757,10 @@ static void cached_dev_bio_complete(struct closure *cl) struct search *s = container_of(cl, struct search, cl); struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
+ /* ensure this lock is released after data_insert */ + if (s->write_inval_data_putoff) + up_read_non_owner(&dc->writeback_lock); + cached_dev_put(dc); search_free(cl); } @@ -993,10 +999,10 @@ static void cached_dev_write_complete(struct closure *cl) struct search *s = container_of(cl, struct search, cl); struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
- if (!s->iop.bypass) + if (s->write_inval_data_putoff) closure_call(&s->iop.cl, bch_data_insert, NULL, cl); - - up_read_non_owner(&dc->writeback_lock); + else + up_read_non_owner(&dc->writeback_lock); continue_at(cl, cached_dev_bio_complete, NULL); }
@@ -1048,6 +1054,7 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s) bio->bi_end_io = backing_request_endio; closure_bio_submit(s->iop.c, bio, cl);
+ s->write_inval_data_putoff = true; } else if (s->iop.writeback) { bch_writeback_add(dc); s->iop.bio = bio; @@ -1080,7 +1087,7 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s) }
insert_data: - if (!s->iop.bypass) + if (!s->write_inval_data_putoff) closure_call(&s->iop.cl, bch_data_insert, NULL, cl); continue_at(cl, cached_dev_write_complete, NULL); } diff --git a/drivers/md/bcache/request.h index 21678037d215..42bf280d4625 100644 --- a/drivers/md/bcache/request.h +++ b/drivers/md/bcache/request.h @@ -65,6 +65,19 @@ struct search { unsigned long start_time; /* for prefetch, we do not need copy data to bio */ bool prefetch; + /* + * bch_data_insert() is invoked asynchronously while the bio is + * submitted to the backing block device, so a read request may be + * submitted after bch_data_insert() has finished and still complete + * before the backing bio ends. Such a read fetches pre-write data + * from the backing device and inserts it into the cache; in + * writearound cache mode bcache will not invalidate that data again, + * so later reads hit stale data in the cache, causing corruption. + * We therefore put off the invalidation; this switch marks that it + * has been deferred until the backing bio completes. + */ + bool write_inval_data_putoff; struct list_head list_node; wait_queue_head_t wqh; struct acache_info smp;
From: Li Ruilin liruilin4@huawei.com
euleros/rtos inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4LOJ6 CVE: NA
--------------------------------
commit 6947676c374 ("bcache: add a framework to perform prefetch") collects data insert info, which includes device info taken from the bio. However, the bio created by write_moving here has no device info, causing a null pointer dereference.
[ 1497.991768] BUG: unable to handle kernel NULL pointer dereference at 0000000000000000 [ 1497.991869] PGD 0 P4D 0 [ 1497.991912] Oops: 0000 [#1] SMP PTI [ 1497.991962] CPU: 2 PID: 733 Comm: kworker/2:3 Not tainted 4.19.90+ #33 [ 1497.992030] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1ubuntu1 04/01/2014 [ 1497.992137] Workqueue: bcache_gc write_moving [bcache] [ 1497.992219] RIP: 0010:bch_data_insert+0x4c/0x140 [bcache] ... [ 1497.993367] Call Trace: [ 1497.993427] ? cached_dev_read_error+0x140/0x140 [bcache] [ 1497.993526] write_moving+0x19e/0x1b0 [bcache] [ 1497.993621] process_one_work+0x1fd/0x440 [ 1497.993742] worker_thread+0x34/0x410 [ 1497.993811] kthread+0x121/0x140 [ 1497.993873] ? process_one_work+0x440/0x440 [ 1497.993946] ? kthread_create_worker_on_cpu+0x70/0x70 [ 1497.994043] ret_from_fork+0x35/0x40
Signed-off-by: Li Ruilin liruilin4@huawei.com Review-by: Song Chao chao.song@huawei.com Review-by: Xu Wei xuwei56@huawei.com Signed-off-by: Li Ruilin liruilin4@huawei.com Acked-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Cheng Jian cj.chengjian@huawei.com Reviewed-by: Guangxing Deng dengguangxing@huawei.com Reviewed-by: chao song chao.song@huawei.com Reviewed-by: chao song chao.song@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/md/bcache/request.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-)
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index bad70906e8a2..66605e7dcc42 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -315,12 +315,14 @@ void bch_data_insert(struct closure *cl) trace_bcache_write(op->c, op->inode, op->bio, op->writeback, op->bypass);
- msg.offset = op->bio->bi_iter.bi_sector; - msg.length = op->bio->bi_iter.bi_size; - msg.type = ACACHE_INFO_CACHE_INSERT; - msg.dev = bio_dev(op->bio); - msg.start_time = ktime_get_ns(); - save_circ_item(&msg); + if (op->bio->bi_disk) { + msg.offset = op->bio->bi_iter.bi_sector; + msg.length = op->bio->bi_iter.bi_size; + msg.type = ACACHE_INFO_CACHE_INSERT; + msg.dev = bio_dev(op->bio); + msg.start_time = ktime_get_ns(); + save_circ_item(&msg); + }
bch_keylist_init(&op->insert_keys); bio_get(op->bio);
From: Li Ruilin liruilin4@huawei.com
euleros/rtos inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4LOJ6 CVE: NA
--------------------------------
Before this patch, bypassed I/O requests did not record a start time, so latency data computed from the start time was wrong. This patch makes the start time always be recorded to fix this.
Signed-off-by: Li Ruilin liruilin4@huawei.com Reviewed-by: Song Chao chao.song@huawei.com Acked-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Cheng Jian cj.chengjian@huawei.com Reviewed-by: Guangxing Deng dengguangxing@huawei.com Reviewed-by: chao song chao.song@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/md/bcache/request.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-)
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 66605e7dcc42..d0999a56bcae 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -1252,14 +1252,13 @@ blk_qc_t cached_dev_submit_bio(struct bio *bio) } else { s->iop.bypass = check_should_bypass(dc, bio);
- if (!s->iop.bypass && bio->bi_iter.bi_size && !rw) { - s->smp.offset = bio->bi_iter.bi_sector - dc->sb.data_offset; - s->smp.length = bio->bi_iter.bi_size; - s->smp.type = rw; - s->smp.dev = dc->bdev->bd_dev; - s->smp.start_time = ktime_get_ns(); + s->smp.offset = bio->bi_iter.bi_sector - dc->sb.data_offset; + s->smp.length = bio->bi_iter.bi_size; + s->smp.type = rw; + s->smp.dev = dc->bdev->bd_dev; + s->smp.start_time = ktime_get_ns(); + if (!s->iop.bypass && bio->bi_iter.bi_size && !rw) save_circ_item(&s->smp); - }
if (rw) cached_dev_write(dc, s);
From: Yao Jin jinyao5@huawei.com
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4OAFT CVE: NA
-------------------------------------------------
perf script failed to print the phys_addr for SPE profiling. A 'dummy' event is added by SPE profiling, but it doesn't have the PHYS_ADDR attribute set, so perf script exits with an error.
Now, as is already done for 'addr', use evsel__do_check_stype() to check the sample type.
Before:
# perf record -e arm_spe_0/branch_filter=0,ts_enable=1,pa_enable=1,load_filter=1,jitter=0,store_filter=0,min_latency=0,event_filter=2/ -p 4064384 -- sleep 3 # perf script -F pid,tid,addr,phys_addr Samples for 'dummy:u' event do not have PHYS_ADDR attribute set. Cannot print 'phys_addr' field.
After:
# perf record -e arm_spe_0/branch_filter=0,ts_enable=1,pa_enable=1,load_filter=1,jitter=0,store_filter=0,min_latency=0,event_filter=2/ -p 4064384 -- sleep 3 # perf script -F pid,tid,addr,phys_addr 4064384/4064384 ffff802f921be0d0 2f921be0d0 4064384/4064384 ffff802f921be0d0 2f921be0d0
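The relaxed helper differs from evsel__check_stype() only in tolerating a user-requested field that some events never carry, which is exactly the SPE 'dummy' event case above. A rough paraphrase of the decision (a sketch of the logic, not the literal perf source):

#include <stdbool.h>
#include <stdint.h>

/* 0: the field can be printed, or silently skipped for this event;
 * -1: perf script aborts, as it did before this patch. */
static int check_sample_bit(uint64_t sample_type, uint64_t wanted_bit,
                            bool field_set_by_user, bool allow_user_set)
{
        if (sample_type & wanted_bit)
                return 0;       /* the event records this field */
        if (field_set_by_user && allow_user_set)
                return 0;       /* e.g. the dummy event: just skip it */
        return -1;              /* field requested but never present */
}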
Signed-off-by: Yao Jin jinyao5@huawei.com Signed-off-by: Wei Li liwei391@huawei.com Reviewed-by: Yang Jihong yangjihong1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- tools/perf/builtin-script.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index 1d727387cb20..cf6dcc51375e 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -496,7 +496,7 @@ static int evsel__check_attr(struct evsel *evsel, struct perf_session *session) return -EINVAL;
if (PRINT_FIELD(PHYS_ADDR) && - evsel__check_stype(evsel, PERF_SAMPLE_PHYS_ADDR, "PHYS_ADDR", PERF_OUTPUT_PHYS_ADDR)) + evsel__do_check_stype(evsel, PERF_SAMPLE_PHYS_ADDR, "PHYS_ADDR", PERF_OUTPUT_PHYS_ADDR, allow_user_set)) return -EINVAL;
return 0;
From: Xingang Wang wangxingang5@huawei.com
ascend inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4L735 CVE: NA
---------------------------------------------------
To support device tree boot for arm64 mpam, the dts driver needs to call functions that are currently marked __init. Remove the __init macro from the related functions.
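The underlying issue: a platform driver's .probe() lives in regular .text and may run long after init memory has been freed, so any call from it into an __init function is a section mismatch. A hypothetical illustration (both names below are made up for this example):

#include <linux/init.h>
#include <linux/platform_device.h>

int __init demo_mpam_setup(void);   /* hypothetical __init-annotated setup */

static int demo_probe(struct platform_device *pdev)
{
        /* modpost warns: reference from .text to .init.text; at runtime
         * this could jump into already-freed init memory. */
        return demo_mpam_setup();
}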
Signed-off-by: Xingang Wang wangxingang5@huawei.com Reviewed-by: Wang ShaoBo bobo.shaobowang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/arm64/include/asm/resctrl.h | 2 +- arch/arm64/kernel/mpam/mpam_device.c | 14 +++++++------- arch/arm64/kernel/mpam/mpam_internal.h | 2 +- arch/arm64/kernel/mpam/mpam_resctrl.c | 2 +- fs/resctrlfs.c | 2 +- include/linux/arm_mpam.h | 14 +++++++------- 6 files changed, 18 insertions(+), 18 deletions(-)
diff --git a/arch/arm64/include/asm/resctrl.h b/arch/arm64/include/asm/resctrl.h index f43fee368098..1175c3515c92 100644 --- a/arch/arm64/include/asm/resctrl.h +++ b/arch/arm64/include/asm/resctrl.h @@ -421,7 +421,7 @@ int resctrl_update_groups_config(struct rdtgroup *rdtgrp);
#define RESCTRL_MAX_CLOSID 32
-int __init resctrl_group_init(void); +int resctrl_group_init(void);
void post_resctrl_mount(void);
diff --git a/arch/arm64/kernel/mpam/mpam_device.c b/arch/arm64/kernel/mpam/mpam_device.c index 890db6a0ccaf..e887d32c8451 100644 --- a/arch/arm64/kernel/mpam/mpam_device.c +++ b/arch/arm64/kernel/mpam/mpam_device.c @@ -534,7 +534,7 @@ static void mpam_disable_irqs(void) * Scheduled by mpam_discovery_complete() once all devices have been created. * Also scheduled when new devices are probed when new CPUs come online. */ -static void __init mpam_enable(struct work_struct *work) +static void mpam_enable(struct work_struct *work) { int err; unsigned long flags; @@ -761,7 +761,7 @@ static struct mpam_class * __init mpam_class_get(u8 level_idx, * class/component structures may be allocated. * Returns the new device, or an ERR_PTR(). */ -struct mpam_device * __init +struct mpam_device * __mpam_device_create(u8 level_idx, enum mpam_class_types type, int component_id, const struct cpumask *fw_affinity, phys_addr_t hwpage_address) @@ -810,7 +810,7 @@ __mpam_device_create(u8 level_idx, enum mpam_class_types type, return dev; }
-void __init mpam_device_set_error_irq(struct mpam_device *dev, u32 irq, +void mpam_device_set_error_irq(struct mpam_device *dev, u32 irq, u32 flags) { unsigned long irq_save_flags; @@ -821,7 +821,7 @@ void __init mpam_device_set_error_irq(struct mpam_device *dev, u32 irq, spin_unlock_irqrestore(&dev->lock, irq_save_flags); }
-void __init mpam_device_set_overflow_irq(struct mpam_device *dev, u32 irq, +void mpam_device_set_overflow_irq(struct mpam_device *dev, u32 irq, u32 flags) { unsigned long irq_save_flags; @@ -864,7 +864,7 @@ static inline u16 mpam_cpu_max_pmg(void) /* * prepare for initializing devices. */ -int __init mpam_discovery_start(void) +int mpam_discovery_start(void) { if (!mpam_cpus_have_feature()) return -EOPNOTSUPP; @@ -1094,7 +1094,7 @@ static int mpam_cpu_offline(unsigned int cpu) return 0; }
-int __init mpam_discovery_complete(void) +int mpam_discovery_complete(void) { int ret = 0;
@@ -1111,7 +1111,7 @@ int __init mpam_discovery_complete(void) return ret; }
-void __init mpam_discovery_failed(void) +void mpam_discovery_failed(void) { struct mpam_class *class, *tmp;
diff --git a/arch/arm64/kernel/mpam/mpam_internal.h b/arch/arm64/kernel/mpam/mpam_internal.h index cfaef82428aa..7b84ea54975a 100644 --- a/arch/arm64/kernel/mpam/mpam_internal.h +++ b/arch/arm64/kernel/mpam/mpam_internal.h @@ -329,7 +329,7 @@ int mpam_resctrl_setup(void); struct raw_resctrl_resource * mpam_get_raw_resctrl_resource(u32 level);
-int __init mpam_resctrl_init(void); +int mpam_resctrl_init(void);
int mpam_resctrl_set_default_cpu(unsigned int cpu); void mpam_resctrl_clear_default_cpu(unsigned int cpu); diff --git a/arch/arm64/kernel/mpam/mpam_resctrl.c b/arch/arm64/kernel/mpam/mpam_resctrl.c index 0bfcd0b6a032..86752a7a71a8 100644 --- a/arch/arm64/kernel/mpam/mpam_resctrl.c +++ b/arch/arm64/kernel/mpam/mpam_resctrl.c @@ -2167,7 +2167,7 @@ static int __init mpam_setup(char *str) } __setup("mpam", mpam_setup);
-int __init mpam_resctrl_init(void) +int mpam_resctrl_init(void) { mpam_init_padding();
diff --git a/fs/resctrlfs.c b/fs/resctrlfs.c index 7ca9fe3ee4a4..a18933a11437 100644 --- a/fs/resctrlfs.c +++ b/fs/resctrlfs.c @@ -1156,7 +1156,7 @@ static int __init resctrl_group_setup_root(void) * * Return: 0 on success or -errno */ -int __init resctrl_group_init(void) +int resctrl_group_init(void) { int ret = 0;
diff --git a/include/linux/arm_mpam.h b/include/linux/arm_mpam.h index 5c061e5383ad..d32c553ae473 100644 --- a/include/linux/arm_mpam.h +++ b/include/linux/arm_mpam.h @@ -16,7 +16,7 @@ enum mpam_class_types { MPAM_CLASS_UNKNOWN, /* Everything else, e.g. TLBs etc */ };
-struct mpam_device * __init +struct mpam_device * __mpam_device_create(u8 level_idx, enum mpam_class_types type, int component_id, const struct cpumask *fw_affinity, phys_addr_t hwpage_address); @@ -54,9 +54,9 @@ mpam_device_create_memory(int nid, phys_addr_t hwpage_address) return __mpam_device_create(~0, MPAM_CLASS_MEMORY, nid, &dev_affinity, hwpage_address); } -int __init mpam_discovery_start(void); -int __init mpam_discovery_complete(void); -void __init mpam_discovery_failed(void); +int mpam_discovery_start(void); +int mpam_discovery_complete(void); +void mpam_discovery_failed(void);
enum mpam_enable_type { MPAM_ENABLE_DENIED = 0, @@ -71,12 +71,12 @@ extern enum mpam_enable_type mpam_enabled; #define mpam_irq_flags_to_acpi(x) ((x & MPAM_IRQ_MODE_LEVEL) ? \ ACPI_LEVEL_SENSITIVE : ACPI_EDGE_SENSITIVE)
-void __init mpam_device_set_error_irq(struct mpam_device *dev, u32 irq, +void mpam_device_set_error_irq(struct mpam_device *dev, u32 irq, u32 flags); -void __init mpam_device_set_overflow_irq(struct mpam_device *dev, u32 irq, +void mpam_device_set_overflow_irq(struct mpam_device *dev, u32 irq, u32 flags);
-static inline int __init mpam_register_device_irq(struct mpam_device *dev, +static inline int mpam_register_device_irq(struct mpam_device *dev, u32 overflow_interrupt, u32 overflow_flags, u32 error_interrupt, u32 error_flags) {
From: Xingang Wang wangxingang5@huawei.com
ascend inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4L735 CVE: NA
---------------------------------------------------
For now, only ACPI boot is supported for the arm64 mpam init. This introduces device tree support: treat the mpam device as a platform device, add a platform driver, and use the OF interfaces to parse the dts node. Add a common init function that calls the device tree or ACPI init procedure, according to whether ACPI is disabled and to the boot arguments.
Signed-off-by: Xingang Wang wangxingang5@huawei.com Reviewed-by: Wang ShaoBo bobo.shaobowang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/arm64/kernel/mpam/mpam_device.c | 170 ++++++++++++++++++++++++++ arch/arm64/kernel/mpam/mpam_resctrl.c | 2 + drivers/acpi/arm64/mpam.c | 6 - include/linux/arm_mpam.h | 3 + 4 files changed, 175 insertions(+), 6 deletions(-)
diff --git a/arch/arm64/kernel/mpam/mpam_device.c b/arch/arm64/kernel/mpam/mpam_device.c index e887d32c8451..85b5c415fdc2 100644 --- a/arch/arm64/kernel/mpam/mpam_device.c +++ b/arch/arm64/kernel/mpam/mpam_device.c @@ -32,6 +32,8 @@ #include <linux/cpu.h> #include <linux/cacheinfo.h> #include <linux/arm_mpam.h> +#include <linux/of.h> +#include <linux/of_platform.h>
#include "mpam_resource.h" #include "mpam_device.h" @@ -1698,3 +1700,171 @@ void mpam_component_get_config(struct mpam_component *comp, { mpam_component_get_config_local(comp, args, result); } + +#define ARM_MPAM_PDEV_NAME "arm-mpam" + +static const struct of_device_id arm_mpam_of_device_ids[] = { + {.compatible = "arm,mpam"}, + { } +}; + +static int of_mpam_parse_irq(struct platform_device *pdev, + struct mpam_device *dev) +{ + struct device_node *node = pdev->dev.of_node; + u32 overflow_interrupt, overflow_flags; + u32 error_interrupt, error_interrupt_flags; + + of_property_read_u32(node, "overflow-interrupt", &overflow_interrupt); + of_property_read_u32(node, "overflow-flags", &overflow_flags); + of_property_read_u32(node, "error-interrupt", &error_interrupt); + of_property_read_u32(node, "error-interrupt-flags", + &error_interrupt_flags); + + return mpam_register_device_irq(dev, + overflow_interrupt, overflow_flags, + error_interrupt, error_interrupt_flags); +} + +static int of_mpam_parse_cache(struct platform_device *pdev) +{ + struct mpam_device *dev; + struct device_node *node = pdev->dev.of_node; + int cache_level, cache_id; + struct resource *res; + + if (of_property_read_u32(node, "cache-level", &cache_level)) { + dev_err(&pdev->dev, "missing cache level property\n"); + return -EINVAL; + } + + if (of_property_read_u32(node, "cache-id", &cache_id)) { + dev_err(&pdev->dev, "missing cache id property\n"); + return -EINVAL; + } + + /* Base address */ + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + if (!res) { + dev_err(&pdev->dev, "missing io resource property\n"); + return -EINVAL; + } + + dev = mpam_device_create_cache(cache_level, cache_id, NULL, res->start); + if (IS_ERR(dev)) { + dev_err(&pdev->dev, "Failed to create cache node\n"); + return -EINVAL; + } + + return of_mpam_parse_irq(pdev, dev); +} + +static int of_mpam_parse_memory(struct platform_device *pdev) +{ + struct mpam_device *dev; + struct device_node *node = pdev->dev.of_node; + int numa_id; + struct resource *res; + + if (of_property_read_u32(node, "numa-node-id", &numa_id)) { + dev_err(&pdev->dev, "missing numa node id property\n"); + return -EINVAL; + } + + /* Base address */ + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + if (!res) { + dev_err(&pdev->dev, "missing io resource property\n"); + return -EINVAL; + } + + dev = mpam_device_create_memory(numa_id, res->start); + if (IS_ERR(dev)) { + dev_err(&pdev->dev, "Failed to create memory node\n"); + return -EINVAL; + } + + return of_mpam_parse_irq(pdev, dev); +} + +static int of_mpam_parse(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct device_node *node = dev->of_node; + enum mpam_class_types type; + + if (!node || !of_match_node(arm_mpam_of_device_ids, pdev->dev.of_node)) + return -EINVAL; + + if (of_property_read_u32(dev->of_node, "type", &type)) { + dev_err(dev, "missing type property\n"); + return -EINVAL; + } + + switch (type) { + case MPAM_CLASS_CACHE: + return of_mpam_parse_cache(pdev); + case MPAM_CLASS_MEMORY: + return of_mpam_parse_memory(pdev); + default: + pr_warn_once("Unknown node type %u.\n", type); + return -EINVAL; + /* fall through */ + case MPAM_CLASS_SMMU: + /* not yet supported */ + /* fall through */ + case MPAM_CLASS_UNKNOWN: + break; + } + + return 0; +} + +static int arm_mpam_device_probe(struct platform_device *pdev) +{ + int ret; + + if (!cpus_have_const_cap(ARM64_HAS_MPAM)) + return 0; + + if (!acpi_disabled || mpam_enabled != MPAM_ENABLE_OF) + return 0; + + ret = mpam_discovery_start(); + 
if (ret) + return ret; + + ret = of_mpam_parse(pdev); + + if (ret) { + mpam_discovery_failed(); + } else { + ret = mpam_discovery_complete(); + if (!ret) + pr_info("Successfully init mpam by DT.\n"); + } + + return ret; +} + +static struct platform_driver arm_mpam_driver = { + .driver = { + .name = ARM_MPAM_PDEV_NAME, + .of_match_table = arm_mpam_of_device_ids, + }, + .probe = arm_mpam_device_probe, +}; + +static int __init arm_mpam_driver_init(void) +{ + if (acpi_disabled) + return platform_driver_register(&arm_mpam_driver); + else + return acpi_mpam_parse(); +} + +/* + * We want to run after cacheinfo_sysfs_init() has caused the cacheinfo + * structures to be populated. That runs as a device_initcall. + */ +device_initcall_sync(arm_mpam_driver_init); diff --git a/arch/arm64/kernel/mpam/mpam_resctrl.c b/arch/arm64/kernel/mpam/mpam_resctrl.c index 86752a7a71a8..53789acaae20 100644 --- a/arch/arm64/kernel/mpam/mpam_resctrl.c +++ b/arch/arm64/kernel/mpam/mpam_resctrl.c @@ -2162,6 +2162,8 @@ static int __init mpam_setup(char *str) { if (!strcmp(str, "=acpi")) mpam_enabled = MPAM_ENABLE_ACPI; + else if (!strcmp(str, "=of")) + mpam_enabled = MPAM_ENABLE_OF;
return 1; } diff --git a/drivers/acpi/arm64/mpam.c b/drivers/acpi/arm64/mpam.c index 51419473f63b..6f4572193eb2 100644 --- a/drivers/acpi/arm64/mpam.c +++ b/drivers/acpi/arm64/mpam.c @@ -240,9 +240,3 @@ int __init acpi_mpam_parse(void)
return ret; } - -/* - * We want to run after cacheinfo_sysfs_init() has caused the cacheinfo - * structures to be populated. That runs as a device_initcall. - */ -device_initcall_sync(acpi_mpam_parse); diff --git a/include/linux/arm_mpam.h b/include/linux/arm_mpam.h index d32c553ae473..01498a5c06ba 100644 --- a/include/linux/arm_mpam.h +++ b/include/linux/arm_mpam.h @@ -61,6 +61,7 @@ void mpam_discovery_failed(void); enum mpam_enable_type { MPAM_ENABLE_DENIED = 0, MPAM_ENABLE_ACPI, + MPAM_ENABLE_OF, };
extern enum mpam_enable_type mpam_enabled; @@ -115,4 +116,6 @@ static inline int mpam_register_device_irq(struct mpam_device *dev, return ret; }
+int __init acpi_mpam_parse(void); + #endif
From: Xingang Wang wangxingang5@huawei.com
ascend inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4L735 CVE: NA
---------------------------------------------------
Add a devicetree bindings document for arm64/mpam; it gives a basic description of the main properties of the mpam devicetree definition.
Signed-off-by: Xingang Wang wangxingang5@huawei.com Reviewed-by: Wang ShaoBo bobo.shaobowang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- .../devicetree/bindings/arm/arm,mpam.txt | 54 +++++++++++++++++++ 1 file changed, 54 insertions(+) create mode 100644 Documentation/devicetree/bindings/arm/arm,mpam.txt
diff --git a/Documentation/devicetree/bindings/arm/arm,mpam.txt b/Documentation/devicetree/bindings/arm/arm,mpam.txt new file mode 100644 index 000000000000..65c1e6809685 --- /dev/null +++ b/Documentation/devicetree/bindings/arm/arm,mpam.txt @@ -0,0 +1,54 @@ +Memory System Resource Partitioning and Monitoring (MPAM), for Armv8-A +---------------------------------------------------------- + +MPAM is used to limit memory bandwidth and cache usage on Arm platforms. +The required properties for the driver are: + compatible = "arm,mpam"; /* MPAM for Arm */ + reg = <>; /* mpam device base register */ + +The property type must be included; it indicates the type of mpam device +for the node. There are several types of mpam device: + MPAM_CLASS_SMMU = 0, + MPAM_CLASS_CACHE, /* Well known caches, e.g. L2 */ + MPAM_CLASS_MEMORY, /* Main memory */ + MPAM_CLASS_UNKNOWN, /* Everything else, e.g. TLBs etc */ + +The type of memory is set as: + type = <2>; +The type of cache is set as: + type = <1>; + +MPAM supports interrupts for error and overflow; the error-interrupt and +overflow-interrupt are defined in "Memory System Resource Partitioning +and Monitoring (MPAM), for Armv8-A", MPAM interrupts (section 8.8). + overflow-interrupt = <0>; + overflow-flags = <0>; + error-interrupt = <0>; + error-interrupt-flags = <0>; + +Example: + +mpam_memory0 { + compatible = "arm,mpam"; + reg = <0x0 0x10000000 0x0 0x10000>; + type = <2>; /* memory type */ + numa-node-id = <0>; + overflow-interrupt = <0>; + overflow-flags = <0>; + error-interrupt = <0>; + error-interrupt-flags = <0>; + not-ready-max = <0>; +}; + +mpam_cache0 { + compatible = "arm,mpam"; + reg = <0x0 0x20000000 0x0 0x10000>; + type = <1>; /* cache type */ + cache-id = <0>; + cache-level = <3>; + overflow-interrupt = <0>; + overflow-flags = <0>; + error-interrupt = <0>; + error-interrupt-flags = <0>; + not-ready-max = <0>; +};
From: Xu Qiang xuqiang36@huawei.com
ascend inclusion category: bugfix Bugzilla: https://gitee.com/openeuler/kernel/issues/I4K2U5 CVE: N/A
---------------------------------------
When designing the Ascend chip, HiSilicon connected the serial port interrupt signal lines to the mbigen device; the mbigen writes the GICD_SETSPI_NSR register to trigger the SPI interrupt. This can result in the serial port dropping interrupts.
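The workaround in the diff below re-triggers anything that arrived while the handler was running: rewriting IMSC forces the UART's interrupt output to deassert and, if status bits are still pending, reassert, which (going by the Kconfig help text added here) is what makes mbigen emit a fresh GICD_SETSPI_NSR write rather than dropping the condition. An annotated excerpt of the hunk:

/* At the end of pl011_int(), with uap->port.lock still held: */
if (pl011_enable_hisi_wkrd) {
        pl011_write(0, uap, REG_IMSC);       /* mask all: interrupt line drops */
        pl011_write(uap->im, uap, REG_IMSC); /* restore mask: the line rises
                                              * again if anything is still
                                              * pending, so mbigen forwards
                                              * a new SPI message */
}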
Signed-off-by: Xu Qiang xuqiang36@huawei.com Signed-off-by: Wang Wensheng wangwensheng4@huawei.com Reviewed-by: Hanjun Guo guohanjun@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/tty/serial/Kconfig | 18 +++++++++ drivers/tty/serial/amba-pl011.c | 66 +++++++++++++++++++++++++++++++++ 2 files changed, 84 insertions(+)
diff --git a/drivers/tty/serial/Kconfig b/drivers/tty/serial/Kconfig index 28f22e58639c..f66710bd4733 100644 --- a/drivers/tty/serial/Kconfig +++ b/drivers/tty/serial/Kconfig @@ -73,6 +73,24 @@ config SERIAL_AMBA_PL011_CONSOLE your boot loader (lilo or loadlin) about how to pass options to the kernel at boot time.)
+if ASCEND_FEATURES + +config SERIAL_ATTACHED_MBIGEN + bool "Serial port interrupt signal lines connected to the mbigen" + depends on SERIAL_AMBA_PL011=y + default n + help + Say Y here when the interrupt signal line of the serial port is + connected to the mbigen. The mbigen device has the function of + clearing interrupts automatically. However, the interrupt processing + function of the serial port driver may process multiple interrupts + at a time. The mbigen device cannot adapt to this scenario. + As a result, interrupts are lost, because it may discard interrupts. + + If unsure, say N. + +endif + config SERIAL_EARLYCON_ARM_SEMIHOST bool "Early console using ARM semihosting" depends on ARM64 || ARM diff --git a/drivers/tty/serial/amba-pl011.c b/drivers/tty/serial/amba-pl011.c index 51ca2d4a8bb3..6c2180ced867 100644 --- a/drivers/tty/serial/amba-pl011.c +++ b/drivers/tty/serial/amba-pl011.c @@ -1466,6 +1466,65 @@ static void check_apply_cts_event_workaround(struct uart_amba_port *uap) pl011_read(uap, REG_ICR); }
+#ifdef CONFIG_SERIAL_ATTACHED_MBIGEN +struct workaround_oem_info { + char oem_id[ACPI_OEM_ID_SIZE + 1]; + char oem_table_id[ACPI_OEM_TABLE_ID_SIZE + 1]; + u32 oem_revision; +}; + +static bool pl011_enable_hisi_wkrd; +static struct workaround_oem_info pl011_wkrd_info[] = { + { + .oem_id = "HISI ", + .oem_table_id = "HIP08 ", + .oem_revision = 0x300, + }, { + .oem_id = "HISI ", + .oem_table_id = "HIP08 ", + .oem_revision = 0x301, + }, { + .oem_id = "HISI ", + .oem_table_id = "HIP08 ", + .oem_revision = 0x400, + }, { + .oem_id = "HISI ", + .oem_table_id = "HIP08 ", + .oem_revision = 0x401, + }, { + .oem_id = "HISI ", + .oem_table_id = "HIP08 ", + .oem_revision = 0x402, + } +}; + +static void pl011_check_hisi_workaround(void) +{ + struct acpi_table_header *tbl; + acpi_status status = AE_OK; + int i; + + status = acpi_get_table(ACPI_SIG_MADT, 0, &tbl); + if (ACPI_FAILURE(status) || !tbl) + return; + + for (i = 0; i < ARRAY_SIZE(pl011_wkrd_info); i++) { + if (!memcmp(pl011_wkrd_info[i].oem_id, tbl->oem_id, ACPI_OEM_ID_SIZE) && + !memcmp(pl011_wkrd_info[i].oem_table_id, tbl->oem_table_id, ACPI_OEM_TABLE_ID_SIZE) && + pl011_wkrd_info[i].oem_revision == tbl->oem_revision) { + pl011_enable_hisi_wkrd = true; + break; + } + } +} + +#else + +#define pl011_enable_hisi_wkrd 0 +static inline void pl011_check_hisi_workaround(void){ } + +#endif + static irqreturn_t pl011_int(int irq, void *dev_id) { struct uart_amba_port *uap = dev_id; @@ -1503,6 +1562,11 @@ static irqreturn_t pl011_int(int irq, void *dev_id) handled = 1; }
+ if (pl011_enable_hisi_wkrd) { + pl011_write(0, uap, REG_IMSC); + pl011_write(uap->im, uap, REG_IMSC); + } + spin_unlock_irqrestore(&uap->port.lock, flags);
return IRQ_RETVAL(handled); @@ -1680,6 +1744,8 @@ static int pl011_hwinit(struct uart_port *port) if (plat->init) plat->init(); } + + pl011_check_hisi_workaround(); return 0; }
From: Xiongfeng Wang wangxiongfeng2@huawei.com
mainline inclusion from mainline-v5.16-rc3 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4HYY4 CVE: NA
-------------------------------
When I hot-added a CPU, I found that the 'cpufreq' directory was not created below /sys/devices/system/cpu/cpuX/.
It is because get_cpu_device() failed in add_cpu_dev_symlink().
cpufreq_add_dev() is the .add_dev callback of a CPU subsys interface. It is called when the CPU device is registered with the system. The call chain is as follows:
register_cpu()
  ->device_register()
    ->device_add()
      ->bus_probe_device()
        ->cpufreq_add_dev()
But we can only get the CPU device via get_cpu_device() after the device has been registered; until then it returns NULL.
Since we already have the CPU device in cpufreq_add_dev(), pass it to add_cpu_dev_symlink().
I noticed that the 'kobj' of the CPU device has already been added into the system before cpufreq_add_dev() is called.
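In condensed form (a sketch assembled from the diff below, omitting the policy-creation path that cpufreq_add_dev() also handles): the .add_dev callback already holds the device being probed, so it is passed down instead of being looked up again.

	static int cpufreq_add_dev(struct device *dev, struct subsys_interface *sif)
	{
		struct cpufreq_policy *policy = per_cpu(cpufreq_cpu_data, dev->id);

		/*
		 * 'dev' is the CPU device currently being registered;
		 * reusing it avoids get_cpu_device(), which still
		 * returns NULL at this point in the hot-add path.
		 */
		if (policy)
			add_cpu_dev_symlink(policy, dev->id, dev);
		return 0;
	}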
Fixes: 2f0ba790df51 ("cpufreq: Fix creation of symbolic links to policy directories") Signed-off-by: Xiongfeng Wang wangxiongfeng2@huawei.com Acked-by: Viresh Kumar viresh.kumar@linaro.org Cc: All applicable stable@vger.kernel.org Signed-off-by: Rafael J. Wysocki rafael.j.wysocki@intel.com Signed-off-by: Xiongfeng Wang wangxiongfeng2@huawei.com Reviewed-by: Hanjun Guo guohanjun@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/cpufreq/cpufreq.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-)
diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index ebee0ad559fa..8e159fb6af9c 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -1004,10 +1004,9 @@ static struct kobj_type ktype_cpufreq = { .release = cpufreq_sysfs_release, };
-static void add_cpu_dev_symlink(struct cpufreq_policy *policy, unsigned int cpu) +static void add_cpu_dev_symlink(struct cpufreq_policy *policy, unsigned int cpu, + struct device *dev) { - struct device *dev = get_cpu_device(cpu); - if (unlikely(!dev)) return;
@@ -1391,7 +1390,7 @@ static int cpufreq_online(unsigned int cpu) if (new_policy) { for_each_cpu(j, policy->related_cpus) { per_cpu(cpufreq_cpu_data, j) = policy; - add_cpu_dev_symlink(policy, j); + add_cpu_dev_symlink(policy, j, get_cpu_device(j)); }
policy->min_freq_req = kzalloc(2 * sizeof(*policy->min_freq_req), @@ -1553,7 +1552,7 @@ static int cpufreq_add_dev(struct device *dev, struct subsys_interface *sif) /* Create sysfs link on CPU registration */ policy = per_cpu(cpufreq_cpu_data, cpu); if (policy) - add_cpu_dev_symlink(policy, cpu); + add_cpu_dev_symlink(policy, cpu, dev);
return 0; }
From: Xiongfeng Wang wangxiongfeng2@huawei.com
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4HYY4 CVE: NA
-------------------------------------------------
The per-CPU variable cpc_desc_ptr is initialized in acpi_cppc_processor_probe() when the processor devices are present and added into the system. But when cpu_possible_mask and cpu_present_mask are not equal, only the cpc_desc_ptr entries for CPUs in cpu_present_mask are initialized, which causes acpi_get_psd_map() to fail in cppc_cpufreq_init().
To fix this issue, we parse the _PSD method for all possible CPUs to get the P-State topology and modify acpi_get_psd_map() to rely on this information.
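The core flow, condensed from the diff below (error handling and cleanup omitted): a cpc_desc is preallocated for every possible CPU, and the whole ACPI namespace is walked so that processor objects belonging to not-yet-present CPUs are parsed as well.

	cpc_pptr = kcalloc(num_possible_cpus(), sizeof(void *), GFP_KERNEL);
	for_each_possible_cpu(i)
		cpc_pptr[i] = kzalloc(sizeof(struct cpc_desc), GFP_KERNEL);

	/*
	 * acpi_parse_cpc() maps each processor object to its logical
	 * CPU id and evaluates _PSD into the preallocated descriptor.
	 */
	ret = acpi_walk_namespace(ACPI_TYPE_DEVICE, ACPI_ROOT_OBJECT,
				  ACPI_UINT32_MAX, acpi_parse_cpc, NULL,
				  cpc_pptr, (void **)&parsed_core_num);

	/* Build the P-State topology from the parsed descriptors. */
	ret = __acpi_get_psd_map(all_cpu_data, cpc_pptr);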
Signed-off-by: Xiongfeng Wang wangxiongfeng@huawei.com Reviewed-by: Keqian Zhu zhukeqian1@huawei.com Reviewed-by: Hanjun Guo guohanjun@huawei.com Reviewed-by: Cheng Jian cj.chengjian@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com
Conflicts: drivers/acpi/cppc_acpi.c Signed-off-by: Xiongfeng Wang wangxiongfeng@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/acpi/cppc_acpi.c | 91 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 88 insertions(+), 3 deletions(-)
diff --git a/drivers/acpi/cppc_acpi.c b/drivers/acpi/cppc_acpi.c index 0a2da06e9d8b..dc8ac435dea1 100644 --- a/drivers/acpi/cppc_acpi.c +++ b/drivers/acpi/cppc_acpi.c @@ -411,7 +411,7 @@ static int acpi_get_psd(struct cpc_desc *cpc_ptr, acpi_handle handle) * * Return: 0 for success or negative value for err. */ -int acpi_get_psd_map(struct cppc_cpudata **all_cpu_data) +static int __acpi_get_psd_map(struct cppc_cpudata **all_cpu_data, struct cpc_desc **cpc_pptr) { int count_target; int retval = 0; @@ -434,7 +434,7 @@ int acpi_get_psd_map(struct cppc_cpudata **all_cpu_data) continue;
pr = all_cpu_data[i]; - cpc_ptr = per_cpu(cpc_desc_ptr, i); + cpc_ptr = cpc_pptr[i]; if (!cpc_ptr) { retval = -EFAULT; goto err_ret; @@ -459,7 +459,7 @@ int acpi_get_psd_map(struct cppc_cpudata **all_cpu_data) if (i == j) continue;
- match_cpc_ptr = per_cpu(cpc_desc_ptr, j); + match_cpc_ptr = cpc_pptr[j]; if (!match_cpc_ptr) { retval = -EFAULT; goto err_ret; @@ -509,6 +509,91 @@ int acpi_get_psd_map(struct cppc_cpudata **all_cpu_data) free_cpumask_var(covered_cpus); return retval; } + +static acpi_status acpi_parse_cpc(acpi_handle handle, u32 lvl, void *data, + void **ret_p) +{ + struct acpi_device *adev = NULL; + struct cpc_desc *cpc_ptr, **cpc_pptr; + acpi_status status = AE_OK; + const int device_declaration = 1; + unsigned long long uid; + phys_cpuid_t phys_id; + int logical_id, ret; + int *parsed_core_num = (int *)ret_p; + + if (acpi_bus_get_device(handle, &adev)) + return AE_OK; + + if (strcmp(acpi_device_hid(adev), ACPI_PROCESSOR_DEVICE_HID)) + return AE_OK; + + status = acpi_evaluate_integer(handle, METHOD_NAME__UID, NULL, &uid); + if (ACPI_FAILURE(status)) + return AE_OK; + phys_id = acpi_get_phys_id(handle, device_declaration, uid); + if (invalid_phys_cpuid(phys_id)) + return AE_OK; + logical_id = acpi_map_cpuid(phys_id, uid); + if (logical_id < 0) + return AE_OK; + + cpc_pptr = (struct cpc_desc **)data; + cpc_ptr = cpc_pptr[logical_id]; + cpc_ptr->cpu_id = logical_id; + + ret = acpi_get_psd(cpc_ptr, handle); + if (ret) + return ret; + + (*parsed_core_num)++; + + return AE_OK; +} + +int acpi_get_psd_map(struct cppc_cpudata **all_cpu_data) +{ + struct cpc_desc **cpc_pptr, *cpc_ptr; + int parsed_core_num = 0; + int i, ret; + + cpc_pptr = kcalloc(num_possible_cpus(), sizeof(void *), GFP_KERNEL); + if (!cpc_pptr) + return -ENOMEM; + for_each_possible_cpu(i) { + cpc_pptr[i] = kzalloc(sizeof(struct cpc_desc), GFP_KERNEL); + if (!cpc_pptr[i]) { + ret = -ENOMEM; + goto out; + } + } + + /* + * We cannot use acpi_get_devices() to walk the processor devices + * because some processor devices may not be present. + */ + ret = acpi_walk_namespace(ACPI_TYPE_DEVICE, ACPI_ROOT_OBJECT, + ACPI_UINT32_MAX, acpi_parse_cpc, NULL, + cpc_pptr, (void **)&parsed_core_num); + if (ret) + goto out; + if (parsed_core_num != num_possible_cpus()) { + ret = -EINVAL; + goto out; + } + + ret = __acpi_get_psd_map(all_cpu_data, cpc_pptr); + +out: + for_each_possible_cpu(i) { + cpc_ptr = cpc_pptr[i]; + if (cpc_ptr) + kfree(cpc_ptr); + } + kfree(cpc_pptr); + + return ret; +} EXPORT_SYMBOL_GPL(acpi_get_psd_map);
static int register_pcc_channel(int pcc_ss_idx)
From: Ignacy Gawędzki ignacy.gawedzki@green-communications.fr
mainline inclusion from mainline-v5.16-rc7 commit ebb966d3bdfed581ecccbb4a7432341baf7619b4 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4PVZR?from=project-issue CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?h...
--------------------------------
In commit 5648b5e1169f ("netfilter: nfnetlink_queue: fix OOB when mac header was cleared"), the test for non-empty MAC header introduced in commit 2c38de4c1f8da7 ("netfilter: fix looped (broad|multi)cast's MAC handling") has been replaced with a test for a set MAC header.
This breaks the case when the MAC header has been reset (using skb_reset_mac_header), as is the case with looped-back multicast packets. As a result, the packets ending up in NFQUEUE get a bogus hwaddr interpreted from the first bytes of the IP header.
This patch adds a test for a non-empty MAC header in addition to the test for a set MAC header. The same two tests are also implemented in nfnetlink_log.c, where the initial code of commit 2c38de4c1f8da7 ("netfilter: fix looped (broad|multi)cast's MAC handling") has not been touched, but where supposedly the same situation may happen.
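Combined, the two tests read as follows (a helper-style sketch for illustration only; the patch open-codes the condition). skb_mac_header_len() is the distance from the MAC header to the network header, which is zero right after skb_reset_mac_header() on a looped-back packet:

	static bool mac_header_usable(const struct sk_buff *skb)
	{
		/* The MAC header must have been set AND be non-empty. */
		return skb_mac_header_was_set(skb) &&
		       skb_mac_header_len(skb) != 0;
	}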
Fixes: 5648b5e1169f ("netfilter: nfnetlink_queue: fix OOB when mac header was cleared") Signed-off-by: Ignacy Gawędzki ignacy.gawedzki@green-communications.fr Reviewed-by: Florian Westphal fw@strlen.de Signed-off-by: Pablo Neira Ayuso pablo@netfilter.org Signed-off-by: Huang Guobin huangguobin4@huawei.com Reviewed-by: Wei Yongjun weiyongjun1@huawei.com Reviewed-by: Yue Haibing yuehaibing@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- net/netfilter/nfnetlink_log.c | 3 ++- net/netfilter/nfnetlink_queue.c | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-)
diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index b35e8d9a5b37..33c13edbca4b 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -557,7 +557,8 @@ __build_packet_message(struct nfnl_log_net *log, goto nla_put_failure;
if (indev && skb->dev && - skb->mac_header != skb->network_header) { + skb_mac_header_was_set(skb) && + skb_mac_header_len(skb) != 0) { struct nfulnl_msg_packet_hw phw; int len;
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index 98994fe677fe..b0358f30947e 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -562,7 +562,8 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, goto nla_put_failure;
if (indev && entskb->dev && - skb_mac_header_was_set(entskb)) { + skb_mac_header_was_set(entskb) && + skb_mac_header_len(entskb) != 0) { struct nfqnl_msg_packet_hw phw; int len;