Kernel
Threads by month
- ----- 2026 -----
- March
- February
- January
- ----- 2025 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2024 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2023 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2022 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2021 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2020 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2019 -----
- December
- 47 participants
- 23063 discussions
hulk inclusion
category: feature
bugzilla: https://atomgit.com/openeuler/kernel/issues/8613
--------------------------------
This is a userspace tool for MFS to provide: programmable strategy
framework. Some basic strategies have added in test, and more
flexible strategies can be written based on this framework.
Signed-off-by: Hongbo Li <lihongbo22(a)huawei.com>
---
tools/mfs/maio-utils/CMakeLists.txt | 101 ++++
tools/mfs/maio-utils/README.md | 17 +
tools/mfs/maio-utils/build/build.sh | 91 +++
.../mfs/maio-utils/include/client/fs_client.h | 17 +
tools/mfs/maio-utils/include/inner/inner.h | 37 ++
tools/mfs/maio-utils/include/loader/loader.h | 10 +
tools/mfs/maio-utils/include/maio.h | 53 ++
.../maio-utils/include/parser/req_parser.h | 13 +
tools/mfs/maio-utils/include/policy/policy.h | 17 +
tools/mfs/maio-utils/include/uapi/mfs.h | 62 ++
tools/mfs/maio-utils/src/CMakeLists.txt | 42 ++
.../mfs/maio-utils/src/client/CMakeLists.txt | 17 +
tools/mfs/maio-utils/src/client/fs_client.c | 359 ++++++++++++
.../mfs/maio-utils/src/loader/CMakeLists.txt | 18 +
tools/mfs/maio-utils/src/loader/loader.c | 144 +++++
tools/mfs/maio-utils/src/main.c | 156 +++++
.../mfs/maio-utils/src/parser/CMakeLists.txt | 16 +
tools/mfs/maio-utils/src/parser/req_parser.c | 313 ++++++++++
.../mfs/maio-utils/src/policy/CMakeLists.txt | 16 +
tools/mfs/maio-utils/src/policy/policy.c | 114 ++++
tools/mfs/maio-utils/test/Makefile | 26 +
tools/mfs/maio-utils/test/README.md | 61 ++
tools/mfs/maio-utils/test/strategy_demo.c | 45 ++
.../mfs/maio-utils/test/strategy_generate.cpp | 296 ++++++++++
.../mfs/maio-utils/test/strategy_prefetch.cpp | 151 +++++
tools/mfs/maio-utils/test/strategy_prefetch.h | 21 +
tools/mfs/maio-utils/test/strategy_replay.cpp | 540 ++++++++++++++++++
tools/mfs/maio-utils/test/strategy_template.h | 21 +
tools/mfs/maio-utils/test/strategy_toend.c | 46 ++
tools/mfs/maio-utils/test/trace_analyse.py | 87 +++
tools/mfs/maio-utils/test/trace_gen.py | 116 ++++
31 files changed, 3023 insertions(+)
create mode 100644 tools/mfs/maio-utils/CMakeLists.txt
create mode 100644 tools/mfs/maio-utils/README.md
create mode 100644 tools/mfs/maio-utils/build/build.sh
create mode 100644 tools/mfs/maio-utils/include/client/fs_client.h
create mode 100644 tools/mfs/maio-utils/include/inner/inner.h
create mode 100644 tools/mfs/maio-utils/include/loader/loader.h
create mode 100644 tools/mfs/maio-utils/include/maio.h
create mode 100644 tools/mfs/maio-utils/include/parser/req_parser.h
create mode 100644 tools/mfs/maio-utils/include/policy/policy.h
create mode 100644 tools/mfs/maio-utils/include/uapi/mfs.h
create mode 100644 tools/mfs/maio-utils/src/CMakeLists.txt
create mode 100644 tools/mfs/maio-utils/src/client/CMakeLists.txt
create mode 100644 tools/mfs/maio-utils/src/client/fs_client.c
create mode 100644 tools/mfs/maio-utils/src/loader/CMakeLists.txt
create mode 100644 tools/mfs/maio-utils/src/loader/loader.c
create mode 100644 tools/mfs/maio-utils/src/main.c
create mode 100644 tools/mfs/maio-utils/src/parser/CMakeLists.txt
create mode 100644 tools/mfs/maio-utils/src/parser/req_parser.c
create mode 100644 tools/mfs/maio-utils/src/policy/CMakeLists.txt
create mode 100644 tools/mfs/maio-utils/src/policy/policy.c
create mode 100644 tools/mfs/maio-utils/test/Makefile
create mode 100644 tools/mfs/maio-utils/test/README.md
create mode 100644 tools/mfs/maio-utils/test/strategy_demo.c
create mode 100644 tools/mfs/maio-utils/test/strategy_generate.cpp
create mode 100644 tools/mfs/maio-utils/test/strategy_prefetch.cpp
create mode 100644 tools/mfs/maio-utils/test/strategy_prefetch.h
create mode 100644 tools/mfs/maio-utils/test/strategy_replay.cpp
create mode 100644 tools/mfs/maio-utils/test/strategy_template.h
create mode 100644 tools/mfs/maio-utils/test/strategy_toend.c
create mode 100644 tools/mfs/maio-utils/test/trace_analyse.py
create mode 100644 tools/mfs/maio-utils/test/trace_gen.py
diff --git a/tools/mfs/maio-utils/CMakeLists.txt b/tools/mfs/maio-utils/CMakeLists.txt
new file mode 100644
index 000000000000..f710e6c15f79
--- /dev/null
+++ b/tools/mfs/maio-utils/CMakeLists.txt
@@ -0,0 +1,101 @@
+#*******************************************************************************
+#*******************************************************************************
+
+cmake_minimum_required(VERSION 3.0)
+cmake_policy(SET CMP0057 NEW)
+
+project(maio)
+
+#*******************************************************************************
+#*******************************************************************************
+
+add_definitions(-D__LINUX_USR__)
+add_definitions(-D__USE_GNU)
+add_definitions(-D__POSIX_SOURCE)
+add_definitions(-D_GNU_SOURCE)
+add_definitions(-D_FILE_OFFSET_BITS=64)
+
+#*******************************************************************************
+#*******************************************************************************
+
+include(TestBigEndian)
+
+TEST_BIG_ENDIAN(BIGENDIAN)
+ if (${BIGENDIAN})
+ message(">>> Big Endian")
+ add_definitions(-D_CMAKE_BIGENDIAN)
+ else()
+ message(">>> Little Endian")
+ endif(${BIGENDIAN})
+
+#*******************************************************************************
+#*******************************************************************************
+
+EXECUTE_PROCESS(COMMAND uname -m COMMAND tr -d '\n' OUTPUT_VARIABLE ARCH)
+if(${ARCH} STREQUAL "x86_64" OR ${ARCH} STREQUAL "aarch64")
+ message(STATUS "Architecture: ${ARCH}")
+else()
+ message(STATUS "Invaild Architecture: ${ARCH}")
+ return()
+endif()
+
+#*******************************************************************************
+#*******************************************************************************
+
+set(BASE_FLAGS "-g -Wall -Werror -fno-strict-overflow -rdynamic")
+set(SP_C_FLAGS "-std=gnu99")
+set(SP_C++_FLAGS "-std=c++11")
+set(SP_RELEASE_FLAGS "-O2")
+set(SP_DEBUG_FLAGS "-O0")
+set(SEC_REQ_FLAGS "-fstack-protector-all -fPIE -pie -fPIC -Wl,-z,relro,-z,now,-z,noexecstack")
+set(SEC_SUG_BASE_FLAGS "-Wformat=2")
+set(SEC_SUG_SP_C_FLAGS "-Wshadow")
+
+set(C_FLAGS "${BASE_FLAGS} ${SP_C_FLAGS}")
+set(C++_FLAGS "${BASE_FLAGS} ${SP_C++_FLAGS}")
+set(SEC_C_FLAGS "${SEC_REQ_FLAGS} ${SEC_SUG_BASE_FLAGS} ${SEC_SUG_SP_C_FLAGS}")
+set(SEC_C++_FLAGS "${SEC_REQ_FLAGS} ${SEC_SUG_BASE_FLAGS}")
+set(DEPC_FLAGS "-Wno-error=deprecated-declarations -Wno-deprecated-declarations")
+
+if(CMAKE_BUILD_TYPE MATCHES "Release")
+ message(">>> Release Mode")
+ set(CMAKE_C_FLAGS "${C_FLAGS} ${SP_RELEASE_FLAGS} ${SEC_C_FLAGS}")
+ set(CMAKE_CXX_FLAGS "${C++_FLAGS} ${SP_RELEASE_FLAGS} ${SEC_C++_FLAGS} ${DEPC_FLAGS}")
+else()
+ message(">>> Debug Mode")
+ add_definitions(-D_DEBUG)
+ set(CMAKE_C_FLAGS "${C_FLAGS} ${SP_DEBUG_FLAGS} ${SEC_C_FLAGS}")
+ set(CMAKE_CXX_FLAGS "${C++_FLAGS} ${SP_DEBUG_FLAGS} ${SEC_C++_FLAGS} ${DEPC_FLAGS}")
+ set(COVERAGE_FLAGS "-coverage")
+ set(COVERAGE_LINK "gcov")
+endif()
+
+message(STATUS ">>> Source Dir: " ${PROJECT_SOURCE_DIR})
+message(STATUS ">>> Binary Dir: " ${PROJECT_BINARY_DIR})
+message(STATUS ">>> C_FLAGS : " ${CMAKE_C_FLAGS})
+message(STATUS ">>> CXX_FLAGS : " ${CMAKE_CXX_FLAGS})
+
+#*******************************************************************************
+#*******************************************************************************
+
+set(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/bin)
+
+#*******************************************************************************
+#*******************************************************************************
+
+if(DEFINED THIRD_PARTY)
+ message(">>> third_party: ${THIRD_PARTY}")
+else()
+ set(THIRD_PARTY ${PROJECT_SOURCE_DIR}/lib)
+endif()
+
+set(INFRA_DIR ${PROJECT_SOURCE_DIR}/infrastructure)
+set(SECUREC_DIR ${THIRD_PARTY}/securec)
+set(ZLOG_DIR ${THIRD_PARTY}/zlog)
+set(LIBNUMA_DIR ${THIRD_PARTY}/libnuma)
+
+#*******************************************************************************
+#*******************************************************************************
+
+add_subdirectory(infrastructure)
+add_subdirectory(src)
diff --git a/tools/mfs/maio-utils/README.md b/tools/mfs/maio-utils/README.md
new file mode 100644
index 000000000000..4c5a66a83c60
--- /dev/null
+++ b/tools/mfs/maio-utils/README.md
@@ -0,0 +1,17 @@
+# maio-utils
+
+#### How to build
+
+1. prepare thirdparty by using:
+```
+git clone -b threadpool_bind https://github.com/hb-lee/infrastructure.git infrastructure
+git clone -b 1.2.18 https://github.com/HardySimpson/zlog.git thirdparty/zlog
+git clone https://github.com/chriszt/securec.git thirdparty/securec
+git clone -b v2.0.19 https://github.com/numactl/numactl.git thirdparty/libnuma
+```
+2. cd build
+3. bash build.sh prepare # only once.
+4. bash build.sh build
+
+#### How to run
+Run with `./maio -m ${MNTPOINT}`, here ${MNTPOINT} means the mfs mount point. If you want to use some strategies, you can run with `./maio -m ${MNTPOINT} -s ${STRATEGYLIB}` where ${STRATEGYLIB} is your strategy library. Some examples are provided in `test` directory.
diff --git a/tools/mfs/maio-utils/build/build.sh b/tools/mfs/maio-utils/build/build.sh
new file mode 100644
index 000000000000..91b49d3d6bf8
--- /dev/null
+++ b/tools/mfs/maio-utils/build/build.sh
@@ -0,0 +1,91 @@
+#!/bin/bash
+
+LOCAL_PATH="$(readlink -e "$(dirname "$0")")"
+PRJ_PATH="$(readlink -e "${LOCAL_PATH}/..")"
+THIRD_SUB="${PRJ_PATH}/thirdparty"
+Lib_PATH="${PRJ_PATH}/lib"
+
+ARCH=$(uname -m)
+
+function build_zlog() {
+ local zlog_dir=${THIRD_SUB}/zlog
+ local build_path=${zlog_dir}/build
+ mkdir ${build_path}
+ mkdir -p ${Lib_PATH}/zlog
+ cd ${zlog_dir}
+ make PREFIX=`pwd`/build/zlog
+ make PREFIX=`pwd`/build/zlog install
+ cp -rdp ${build_path}/zlog/include ${Lib_PATH}/zlog
+ cp -rdp ${build_path}/zlog/lib ${Lib_PATH}/zlog
+ cd -
+}
+
+function build_securec() {
+ local securec_dir=${THIRD_SUB}/securec
+ local securec_lib=${securec_dir}/build/securec/lib
+ mkdir -p ${securec_lib}
+ mkdir -p ${Lib_PATH}/securec
+ cd ${securec_dir}
+ if [ "${ARCH}" = "aarch64" ]; then
+ make -f aarch64-so.mk
+ else
+ make -f x86-so.mk
+ fi
+ cp -rdp include build/securec
+ cp libsecurec.so build/securec/lib
+ cp -rdp build/securec/include ${Lib_PATH}/securec
+ cp -rdp build/securec/lib ${Lib_PATH}/securec
+ cd -
+}
+
+function build_libnuma() {
+ local libnuma_dir=${THIRD_SUB}/libnuma
+ local build_path=${libnuma_dir}/build
+ mkdir ${build_path}
+ mkdir -p ${Lib_PATH}/libnuma
+ cd ${libnuma_dir}
+ ./autogen.sh
+ ./configure --prefix=`pwd`/build
+ make -j4
+ make install
+ cp -rdp build/include ${Lib_PATH}/libnuma
+ cp -rdp build/lib ${Lib_PATH}/libnuma
+ cd -
+}
+
+function prepare_thirdparty() {
+ build_jsoncpp
+ build_zlog
+ build_securec
+ build_libnuma
+}
+
+function build_program() {
+ local BUILD_TYPE=$1
+
+ set -x
+ if [ "${BUILD_TYPE}" == "release" ]; then
+ cmake .. -DCMAKE_BUILD_TYPE=Release
+ else
+ cmake .. -DCMAKE_BUILD_TYPE=Debug
+ fi
+ set +x
+
+ make clean
+ make VERBOSE=1 -j
+}
+
+function main() {
+ local ACTION="$1"
+ local PARAMETER="$2"
+
+ if [ "${ACTION}" == "prepare" ]; then
+ prepare_thirdparty
+ fi
+ if [ "$ACTION" == "build" ]; then
+ build_program $PARAMETER
+ fi
+}
+
+main $@
+exit $?
diff --git a/tools/mfs/maio-utils/include/client/fs_client.h b/tools/mfs/maio-utils/include/client/fs_client.h
new file mode 100644
index 000000000000..803179ade14c
--- /dev/null
+++ b/tools/mfs/maio-utils/include/client/fs_client.h
@@ -0,0 +1,17 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2025 Huawei Technologies Co., Ltd
+ */
+#ifndef _FS_CLIENT_H_
+#define _FS_CLIENT_H_
+
+#include <stdint.h>
+
+int fs_evict_by_path(const char*path, uint64_t off, uint64_t len);
+int fs_load_by_path(const char *path, uint64_t off, uint64_t len);
+int fs_load_by_fd(int fd, uint64_t off, uint64_t len);
+int fs_sync_by_path(const char *path, void *buf, uint64_t off, uint64_t len);
+
+int fs_client_init(const char *source);
+void fs_client_exit();
+
+#endif
diff --git a/tools/mfs/maio-utils/include/inner/inner.h b/tools/mfs/maio-utils/include/inner/inner.h
new file mode 100644
index 000000000000..c10f40a3712c
--- /dev/null
+++ b/tools/mfs/maio-utils/include/inner/inner.h
@@ -0,0 +1,37 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2025 Huawei Technologies Co., Ltd
+ */
+#ifndef _INNER_H_
+#define _INNER_H_
+
+#include <time.h>
+#include <stdint.h>
+
+static inline uint64_t _get_ts()
+{
+ struct timespec now = {0};
+ (void)clock_gettime(CLOCK_MONOTONIC_COARSE, &now);
+ return (uint64_t)now.tv_sec;
+}
+
+/* io context for loader and writeback */
+struct io_context {
+ uint32_t id; /* msg id */
+ uint64_t off; /* io offset in fd */
+ uint64_t len; /* io length */
+ uint16_t numaid; /* numa node of this io, -1 means non-bind */
+ int fd; /* io file handle */
+ char *path; /* fullpath of this io */
+ void *buf; /* flighting buffer for writeback */
+
+ uint64_t seq;
+
+ void *private;
+ void (*end_io)(int ret, void *private);
+};
+
+void loader_update_curseq(uint64_t seq);
+int loader_evict_submit(struct io_context *ioctx);
+int loader_io_submit(struct io_context *ioctx);
+
+#endif
diff --git a/tools/mfs/maio-utils/include/loader/loader.h b/tools/mfs/maio-utils/include/loader/loader.h
new file mode 100644
index 000000000000..ff280989f9f9
--- /dev/null
+++ b/tools/mfs/maio-utils/include/loader/loader.h
@@ -0,0 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2025 Huawei Technologies Co., Ltd
+ */
+#ifndef _LOADER_H_
+#define _LOADER_H_
+
+int loader_init();
+void loader_exit();
+
+#endif
diff --git a/tools/mfs/maio-utils/include/maio.h b/tools/mfs/maio-utils/include/maio.h
new file mode 100644
index 000000000000..a508d5b1fa27
--- /dev/null
+++ b/tools/mfs/maio-utils/include/maio.h
@@ -0,0 +1,53 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2025 Huawei Technologies Co., Ltd
+ */
+#ifndef _MAIO_H_
+#define _MAIO_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdint.h>
+
+struct maio_entry {
+ char *fpath;
+ uint64_t toff;
+ uint64_t tlen;
+ uint8_t tnuma;
+ uint64_t seq;
+};
+
+struct maio {
+ /* IN */
+ int fd;
+ uint64_t off;
+ uint64_t len;
+ int pid;
+ uint8_t op;
+ uint64_t ts;
+ uint32_t cksz;
+
+ /* OUT */
+ uint16_t flags;
+#define MAIO_WITH_SEQ 0x0001
+ uint64_t curseq;
+ struct maio_entry entries[];
+};
+
+struct maio_operation {
+ int max_io;
+ int (*init) (void);
+ void (*exit) (void);
+ int (*load) (struct maio **io);
+ int (*evict) (struct maio **io);
+};
+
+char *_get_fullpath(char *fpath, int fd);
+void maio_preload(struct maio *maio, int nio);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/tools/mfs/maio-utils/include/parser/req_parser.h b/tools/mfs/maio-utils/include/parser/req_parser.h
new file mode 100644
index 000000000000..908e7297558a
--- /dev/null
+++ b/tools/mfs/maio-utils/include/parser/req_parser.h
@@ -0,0 +1,13 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2025 Huawei Technologies Co., Ltd
+ */
+#ifndef __REQ_PARSER_H__
+#define __REQ_PARSER_H__
+
+#include <stdint.h>
+
+int parser_init(uint32_t fs_mode, const char *mntpoint);
+void parser_destory();
+int maio_parse_req(void *buf, int size);
+
+#endif
diff --git a/tools/mfs/maio-utils/include/policy/policy.h b/tools/mfs/maio-utils/include/policy/policy.h
new file mode 100644
index 000000000000..aef05c956b3b
--- /dev/null
+++ b/tools/mfs/maio-utils/include/policy/policy.h
@@ -0,0 +1,17 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2025 Huawei Technologies Co., Ltd
+ */
+#ifndef _POLICY_H_
+#define _POLICY_H_
+
+#include "maio.h"
+
+int policy_max_io();
+int policy_load(struct maio **io);
+int policy_evict(struct maio **io);
+int policy_register(const char *path);
+void policy_unregister();
+int policy_init();
+void policy_exit();
+
+#endif
diff --git a/tools/mfs/maio-utils/include/uapi/mfs.h b/tools/mfs/maio-utils/include/uapi/mfs.h
new file mode 100644
index 000000000000..a7d5882b5e50
--- /dev/null
+++ b/tools/mfs/maio-utils/include/uapi/mfs.h
@@ -0,0 +1,62 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+
+#ifndef _UAPI_LINUX_MFS_H
+#define _UAPI_LINUX_MFS_H
+
+#include <linux/types.h>
+#include <linux/ioctl.h>
+
+enum mfs_opcode {
+ MFS_OP_READ = 0,
+ MFS_OP_FAULT,
+ MFS_OP_FAROUND,
+};
+
+enum {
+ MFS_MODE_NONE = 0,
+ MFS_MODE_LOCAL,
+ MFS_MODE_REMOTE,
+};
+
+struct mfs_ioc_ra {
+ __u64 off;
+ __u64 len;
+};
+
+struct mfs_ioc_done {
+ __u32 id;
+ __u32 ret;
+};
+
+struct mfs_ioc_rpath {
+ __u16 max;
+ __u16 len;
+ __u8 d[];
+};
+
+#define MFS_IOC_RA _IOW(0xbc, 1, struct mfs_ioc_ra)
+#define MFS_IOC_DONE _IOW(0xbc, 2, struct mfs_ioc_done)
+#define MFS_IOC_RPATH _IOWR(0xbc, 3, struct mfs_ioc_rpath)
+
+struct mfs_ioc_fsinfo {
+ __u8 mode; /* 0: none, 1: local, 2: remote */
+};
+
+#define MFS_IOC_FSINFO _IOR(0xbd, 1, struct mfs_ioc_fsinfo)
+
+struct mfs_msg {
+ __u8 version;
+ __u8 opcode;
+ __u16 len;
+ __u32 fd;
+ __u32 id;
+ __u8 data[];
+};
+
+struct mfs_read {
+ __u64 off;
+ __u64 len;
+ __s32 pid;
+};
+
+#endif /* _UAPI_LINUX_MFS_H */
diff --git a/tools/mfs/maio-utils/src/CMakeLists.txt b/tools/mfs/maio-utils/src/CMakeLists.txt
new file mode 100644
index 000000000000..fd053a8bd081
--- /dev/null
+++ b/tools/mfs/maio-utils/src/CMakeLists.txt
@@ -0,0 +1,42 @@
+#*******************************************************************************
+#*******************************************************************************
+
+include_directories(${PROJECT_SOURCE_DIR}/include)
+include_directories(${PROJECT_SOURCE_DIR}/include/inner)
+include_directories(${PROJECT_SOURCE_DIR}/include/uapi)
+include_directories(${PROJECT_SOURCE_DIR}/include/client)
+include_directories(${PROJECT_SOURCE_DIR}/include/loader)
+include_directories(${PROJECT_SOURCE_DIR}/include/policy)
+include_directories(${PROJECT_SOURCE_DIR}/include/monitor)
+include_directories(${PROJECT_SOURCE_DIR}/include/parser)
+
+include_directories(${INFRA_DIR}/include)
+include_directories(${SECUREC_DIR}/include)
+include_directories(${ZLOG_DIR}/include)
+include_directories(${LIBNUMA_DIR}/include)
+
+#*******************************************************************************
+#*******************************************************************************
+
+link_directories(${PROJECT_BINARY_DIR}/lib)
+link_directories(${SECUREC_DIR}/lib)
+link_directories(${ZLOG_DIR}/lib)
+link_directories(${LIBNUMA_DIR}/lib)
+
+#*******************************************************************************
+#*******************************************************************************
+
+add_subdirectory(client)
+add_subdirectory(loader)
+add_subdirectory(policy)
+add_subdirectory(parser)
+
+add_executable(maio-utils main.c)
+target_link_libraries(maio-utils "-Wl,--whole-archive"
+ client
+ loader
+ policy
+ parser
+ "-Wl,--no-whole-archive"
+ infra
+ dl)
diff --git a/tools/mfs/maio-utils/src/client/CMakeLists.txt b/tools/mfs/maio-utils/src/client/CMakeLists.txt
new file mode 100644
index 000000000000..0fb67d64eb03
--- /dev/null
+++ b/tools/mfs/maio-utils/src/client/CMakeLists.txt
@@ -0,0 +1,17 @@
+#*******************************************************************************
+#*******************************************************************************
+
+include_directories(${CMAKE_CURRENT_SOURCE_DIR})
+
+#*******************************************************************************
+#*******************************************************************************
+
+#aux_source_directory(. CLIENT_SRC)
+set(CLIENT_SRC fs_client.c)
+
+#*******************************************************************************
+#*******************************************************************************
+
+add_library(client ${CLIENT_SRC})
+target_link_libraries(client infra
+ securec)
diff --git a/tools/mfs/maio-utils/src/client/fs_client.c b/tools/mfs/maio-utils/src/client/fs_client.c
new file mode 100644
index 000000000000..5d359b8f22e1
--- /dev/null
+++ b/tools/mfs/maio-utils/src/client/fs_client.c
@@ -0,0 +1,359 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2025 Huawei Technologies Co., Ltd
+ */
+#include "fs_client.h"
+#include "mfs.h"
+#include "inner.h"
+#include "hashmap.h"
+#include "hashfunc.h"
+#include "stimer.h"
+#include "atomic.h"
+#include "log.h"
+#include "list.h"
+#include "spinlock.h"
+#include "sysdef.h"
+#include "securec.h"
+
+#include <fcntl.h>
+#include <errno.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/ioctl.h>
+
+#define TIMER_CYCLE 10000 /* 10s */
+#define FD_TMOUT 3600 /* 60s */
+
+struct fs_client {
+ int rootfd;
+ hashmap_t *map;
+ stimer_t *timer;
+ list_head_t head;
+ spinlock_t lock;
+ uint64_t tmout;
+};
+
+struct fs_client g_client = {0};
+
+typedef struct {
+ uint32_t plen;
+ char *path;
+} fdkey_t;
+
+typedef struct {
+ fdkey_t key;
+ int fd;
+ int ref;
+ uint64_t ts;
+ list_head_t link;
+ hashlink_t hash;
+} fd_t;
+
+static inline int fd_cmp(void *first, void *second)
+{
+ fdkey_t *key1 = (fdkey_t *)first;
+ fdkey_t *key2 = (fdkey_t *)second;
+
+ if (key1->plen != key2->plen)
+ return -1;
+
+ return strcmp(key1->path, key2->path);
+}
+
+static inline uint32_t fd_hash(void *args)
+{
+ fdkey_t *key = (fdkey_t *)args;
+ uint64_t hash = hashstr(key->path, key->plen);
+
+ return (uint32_t)hash;
+}
+
+static inline void fd_inc(void *args, hashlink_t *link)
+{
+ fd_t *f = container_of(link, fd_t, hash);
+ int ref = atomic_s32_inc(&f->ref);
+ if (ref <= 0) {
+ log_error("fd(%d) for path(%s) with invalid ref(%d)",
+ f->fd, f->key.path, ref);
+ sys_assert(0);
+ }
+}
+
+static inline int fd_dec(void *args, hashlink_t *link)
+{
+ fd_t *f = container_of(link, fd_t, hash);
+ int ref = atomic_s32_dec(&f->ref);
+
+ if (ref < 0) {
+ log_error("fd(%d) for path(%s) with invalid ref(%d)",
+ f->fd, f->key.path, ref);
+ sys_assert(0);
+ }
+ /* update access timestamp for the last one */
+ if (ref == 0)
+ f->ts = _get_ts();
+ return -1;
+}
+
+static inline int fd_expire(void *args, hashlink_t *link)
+{
+ fd_t *f = container_of(link, fd_t, hash);
+ int ref = atomic_s32_fetch(&f->ref);
+ uint64_t cur;
+
+ if (ref > 0)
+ return -1;
+ if (ref < 0) {
+ log_error("fd(%d) for path(%s) with invalid ref(%d)",
+ f->fd, f->key.path, ref);
+ sys_assert(0);
+ }
+ cur = _get_ts();
+ return ((cur - f->ts) >= g_client.tmout) ? 0: -1;
+}
+
+static fd_t *fd_alloc(const char *path)
+{
+ fd_t *f = (fd_t *)malloc(sizeof(fd_t));
+ if (!f) {
+ log_error("alloc fd_t fail");
+ return NULL;
+ }
+
+ f->key.path = strdup(path);
+ if (!f->key.path) {
+ log_error("strdup(%s) fail", path);
+ free(f);
+ return NULL;
+ }
+
+ f->key.plen = strlen(path) + 1;
+ f->fd = -1;
+ f->ref = 1;
+ f->hash.key = &f->key;
+ list_init(&f->link);
+ return f;
+}
+
+static void fd_free(fd_t *f)
+{
+ if (f->fd > 0) {
+ int ret = close(f->fd);
+ if (ret != 0) {
+ log_error("close(%d) fail, err(%s)", f->fd, strerror(errno));
+ }
+ f->fd = -1;
+ }
+
+ if (f->key.path) {
+ free(f->key.path);
+ f->key.path = NULL;
+ }
+ list_del(&f->link);
+ free(f);
+}
+
+static int fd_fetch(const char *path, fd_t **f_res, int dirfd)
+{
+ fdkey_t key;
+ key.plen = strlen(path) + 1;
+ key.path = (char *)path;
+
+ /* fd in cache */
+ hashlink_t *data = NULL;
+ if (EEXIST == hashmap_search(g_client.map, &key, &data, NULL, fd_inc)) {
+ *f_res = container_of(data, fd_t, hash);
+ (*f_res)->ts = _get_ts();
+ return 0;
+ }
+
+ /* new fd */
+ fd_t *f = fd_alloc(path);
+ if (!f)
+ return -ENOMEM;
+
+ /* open file */
+ if (dirfd > 0)
+ f->fd = openat(dirfd, path, O_RDONLY, S_IRUSR | S_IRGRP);
+ else
+ f->fd = open(path, O_RDONLY, S_IRUSR | S_IRGRP);
+ if (f->fd < 0) {
+ if (errno != ENOENT)
+ log_error("open(%d,%s) fail, err(%s)", dirfd, path, strerror(errno));
+ fd_free(f);
+ return errno;
+ }
+
+ /* add to fd cache */
+ hashlink_t *old = NULL;
+ if (EEXIST == hashmap_insert(g_client.map, &f->hash, &old, NULL, fd_inc)) {
+ fd_free(f);
+ f = container_of(old, fd_t, hash);
+ }
+
+ spinlock_lock(&g_client.lock);
+ list_add_tail(&f->link, &g_client.head);
+ spinlock_unlock(&g_client.lock);
+ f->ts = _get_ts();
+ *f_res = f;
+ return 0;
+}
+
+static inline void fd_restore(fd_t *f)
+{
+ (void)hashmap_protect(g_client.map, &f->key, NULL, fd_dec);
+}
+
+static void _fd_gc(void *args)
+{
+ list_head_t head, *curr, *next;
+ hashlink_t *data = NULL;
+
+ list_init(&head);
+ spinlock_lock(&g_client.lock);
+ list_splice(&g_client.head, &head);
+ spinlock_unlock(&g_client.lock);
+
+ list_foreach_safe(curr, next, &head)
+ {
+ fd_t *f = container_of(curr, fd_t, link);
+ if (atomic_s32_fetch(&f->ref) != 0)
+ continue;
+ /* try to destroy */
+ if (0 != hashmap_delete(g_client.map, &f->key, &data, NULL, fd_expire))
+ continue;
+ list_del(&f->link);
+ log_info("path:%s with fd:%d is expire.", f->key.path, f->fd);
+ fd_free(f);
+ }
+
+ spinlock_lock(&g_client.lock);
+ list_splice(&head, &g_client.head);
+ spinlock_unlock(&g_client.lock);
+}
+
+int fs_evict_by_path(const char*path, uint64_t off, uint64_t len)
+{
+ int ret;
+ fd_t *f;
+ uint64_t step = 4096 * 4;
+
+ ret = fd_fetch(path, &f, 0);
+ if (ret)
+ return ret;
+
+ while (len) {
+ if (step > len)
+ step = len;
+ ret = posix_fadvise(f->fd, off, len, POSIX_FADV_DONTNEED);
+ if (ret < 0)
+ break;
+ len -= step;
+ off += step;
+ }
+
+ fd_restore(f);
+ return ret;
+}
+
+int fs_load_by_path(const char *path, uint64_t off, uint64_t len)
+{
+ int ret;
+ fd_t *f;
+ uint64_t step = 16 * 4096;
+ char *buf = NULL;
+
+ ret = fd_fetch(path, &f, 0);
+ if (ret)
+ return ret;
+
+ buf = (char *)malloc(step);
+
+ while (len) {
+ if (step > len)
+ step = len;
+ ret = pread(f->fd, buf, step, off);
+ if (ret <= 0)
+ break;
+ len -= step;
+ off += step;
+ }
+ free(buf);
+ fd_restore(f);
+ return ret;
+}
+
+int fs_load_by_fd(int fd, uint64_t off, uint64_t len)
+{
+ struct mfs_ioc_ra ra;
+
+ ra.off = off;
+ ra.len = len;
+ return ioctl(fd, MFS_IOC_RA, &ra);
+}
+
+int fs_sync_by_path(const char *path, void *buf, uint64_t off, uint64_t len)
+{
+ ssize_t rsize = 0, ret;
+ uint64_t pos = 0;
+ fd_t *f;
+
+ ret = fd_fetch(path + 1, &f, g_client.rootfd);
+ if (ret)
+ return ret;
+
+ do {
+ ret = pread(f->fd, buf + pos, len - pos, off + pos);
+ if (ret == 0)
+ break;
+ if (ret < 0) {
+ log_error("failed to read:%s", strerror(errno));
+ fd_restore(f);
+ return -1;
+ }
+ pos += ret;
+ rsize += ret;
+ } while (rsize < len);
+ fd_restore(f);
+ return 0;
+}
+
+int fs_client_init(const char *source)
+{
+ int ret;
+
+ g_client.rootfd = -1;
+ if (source) {
+ g_client.rootfd = open(source, O_RDONLY | O_DIRECTORY);
+ if (g_client.rootfd <= 0) {
+ log_error("open %s failed", source);
+ return -1;
+ }
+ }
+ ret = hashmap_create(1024, fd_cmp, fd_hash, &g_client.map);
+ if (ret) {
+ log_error("client fd map alloc failed");
+ close(g_client.rootfd);
+ return ret;
+ }
+
+ g_client.timer = stimer_create("LfsGc", TIMER_CYCLE, NULL, _fd_gc);
+ if (!g_client.timer) {
+ log_error("timer create failed");
+ hashmap_destroy(g_client.map, NULL, NULL);
+ close(g_client.rootfd);
+ return -1;
+ }
+ list_init(&g_client.head);
+ spinlock_init(&g_client.lock);
+ g_client.tmout = FD_TMOUT;
+ return 0;
+}
+
+void fs_client_exit()
+{
+ spinlock_destroy(&g_client.lock);
+ stimer_destroy(g_client.timer);
+ hashmap_destroy(g_client.map, NULL, NULL);
+ close(g_client.rootfd);
+}
+
diff --git a/tools/mfs/maio-utils/src/loader/CMakeLists.txt b/tools/mfs/maio-utils/src/loader/CMakeLists.txt
new file mode 100644
index 000000000000..08e1b637be26
--- /dev/null
+++ b/tools/mfs/maio-utils/src/loader/CMakeLists.txt
@@ -0,0 +1,18 @@
+#*******************************************************************************
+#*******************************************************************************
+
+include_directories(${CMAKE_CURRENT_SOURCE_DIR})
+
+#*******************************************************************************
+#*******************************************************************************
+
+aux_source_directory(. LOADER_SRC)
+
+#*******************************************************************************
+#*******************************************************************************
+
+add_library(loader ${LOADER_SRC})
+target_link_libraries(loader infra
+ numa
+ securec)
+
diff --git a/tools/mfs/maio-utils/src/loader/loader.c b/tools/mfs/maio-utils/src/loader/loader.c
new file mode 100644
index 000000000000..61c5302204fc
--- /dev/null
+++ b/tools/mfs/maio-utils/src/loader/loader.c
@@ -0,0 +1,144 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2025 Huawei Technologies Co., Ltd
+ */
+#include "loader.h"
+
+#include "fs_client.h"
+#include "threadpool.h"
+#include "log.h"
+#include "atomic.h"
+#include "inner.h"
+#include "sysdef.h"
+
+#include "securec.h"
+#include "numa.h"
+#include <errno.h>
+
+struct loader_mgr {
+ int numa_num;
+ threadpool_t **numa_thp;
+ uint64_t curseq;
+};
+
+struct loader_mgr g_local_mgr;
+
+static threadpool_t *choose_worker_pool(struct io_context *ioctx)
+{
+ static uint64_t cid = 0;
+ int nid;
+
+ if (ioctx->numaid == (uint16_t)-1)
+ nid = cid++ % g_local_mgr.numa_num;
+ else
+ nid = ioctx->numaid % g_local_mgr.numa_num;
+ return g_local_mgr.numa_thp[nid];
+}
+
+static void handle_io(void *args)
+{
+ struct io_context *ioctx = (struct io_context *)args;
+ int ret;
+
+ if (g_local_mgr.curseq > ioctx->seq) {
+ if (ioctx->end_io)
+ ioctx->end_io(0, ioctx->private);
+ return;
+ }
+ ret = ioctx->path ? fs_load_by_path(ioctx->path, ioctx->off, ioctx->len)
+ : fs_load_by_fd(ioctx->fd, ioctx->off, ioctx->len);
+ if (ioctx->end_io)
+ ioctx->end_io(ret, ioctx->private);
+}
+
+int loader_io_submit(struct io_context *ioctx)
+{
+ threadpool_t *worker_pools = choose_worker_pool(ioctx);
+
+ threadpool_submit(worker_pools, ioctx, handle_io);
+ return 0;
+}
+
+void loader_update_curseq(uint64_t seq)
+{
+ uint64_t curseq = atomic_u64_fetch(&g_local_mgr.curseq);
+ uint64_t old;
+
+ while (seq > curseq) {
+ if (atomic_u64_cas(&g_local_mgr.curseq, curseq, seq, &old))
+ break;
+ curseq = old;
+ }
+}
+
+static void handle_evict(void *args)
+{
+ struct io_context *ioctx = (struct io_context *)args;
+ int ret;
+
+ ret = ioctx->path ? fs_evict_by_path(ioctx->path, ioctx->off, ioctx->len) : 0;
+ if (ioctx->end_io)
+ ioctx->end_io(ret, ioctx->private);
+}
+
+int loader_evict_submit(struct io_context *ioctx)
+{
+ threadpool_t *worker_pools = choose_worker_pool(ioctx);
+
+ threadpool_submit(worker_pools, ioctx, handle_evict);
+ return 0;
+}
+
+static void _free_loader()
+{
+ int i;
+
+ if (!g_local_mgr.numa_thp)
+ return;
+
+ for (i = 0; i < g_local_mgr.numa_num; i++)
+ threadpool_destroy(g_local_mgr.numa_thp[i]);
+ free(g_local_mgr.numa_thp);
+ g_local_mgr.numa_thp = NULL;
+}
+
+int loader_init()
+{
+ int i, ret, wnum = 8;
+ char name[THD_NAME + 1] = {0};
+
+ if (numa_available() < 0) {
+ log_error("Current system does not support NUAM api");
+ return -1;
+ }
+
+ g_local_mgr.numa_num = numa_max_node() + 1;
+ g_local_mgr.curseq = 0;
+ ret = 0;
+ do {
+ g_local_mgr.numa_thp =
+ calloc(g_local_mgr.numa_num, sizeof(threadpool_t *));
+ if (!g_local_mgr.numa_thp) {
+ log_error("alloc for numa threadpool failed");
+ ret = -1;
+ break;
+ }
+ for (i = 0; i < g_local_mgr.numa_num; i++) {
+ sprintf_s(name, sizeof(name), "Loader-%d", i);
+ g_local_mgr.numa_thp[i] = threadpool_create(name, wnum, i);
+ if (!g_local_mgr.numa_thp[i]) {
+ log_error("alloc Loader-%d failed", i);
+ ret = -1;
+ break;
+ }
+ }
+ } while (0);
+
+ if (ret != 0)
+ _free_loader();
+ return ret;
+}
+
+void loader_exit()
+{
+ _free_loader();
+}
diff --git a/tools/mfs/maio-utils/src/main.c b/tools/mfs/maio-utils/src/main.c
new file mode 100644
index 000000000000..854c5b298521
--- /dev/null
+++ b/tools/mfs/maio-utils/src/main.c
@@ -0,0 +1,156 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2025 Huawei Technologies Co., Ltd
+ */
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <getopt.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <poll.h>
+
+#include "loader.h"
+#include "log.h"
+#include "maio.h"
+#include "policy.h"
+#include "req_parser.h"
+#include "fs_client.h"
+#include "mfs.h"
+#include "securec.h"
+
+#include <sys/ioctl.h>
+#include <sys/statfs.h>
+
+#define MAX_BUF_SIZE 1024
+#define MFS_SUPER_MAGIC 0x85428370
+
+static int read_req(int fd)
+{
+ char *buf;
+ int ret;
+
+ buf = calloc(MAX_BUF_SIZE, sizeof(char));
+ if (!buf) {
+ log_error("failed to alloc buf\n");
+ return -1;
+ }
+ ret = read(fd, buf, MAX_BUF_SIZE);
+ if (ret <= 0) {
+ if (ret < 0)
+ log_error("failed to read, ret:%d\n", ret);
+ return -1;
+ }
+ return maio_parse_req(buf, ret);
+}
+
+static void dev_monitor(int dfd)
+{
+ struct pollfd pfd;
+ int ret;
+
+ pfd.fd = dfd;
+ pfd.events = POLLIN;
+ while (1) {
+ ret = poll(&pfd, 1, -1);
+ if (ret < 0) {
+ log_error("poll failed\n");
+ return;
+ }
+ if (ret == 0 || !(pfd.revents & POLLIN)) {
+ log_error("poll events error, ret:%d, revents:%x\n", ret, pfd.revents);
+ continue;
+ }
+ while(!read_req(pfd.fd)) {}
+ }
+}
+
+int main(int argc, char *argv[])
+{
+ const char *mntpoint = NULL, *strategylib = NULL;
+ struct mfs_ioc_fsinfo fsinfo = {0};
+ int dfd, ret, opt;
+ struct statfs buf;
+ char devname[10];
+
+ while ((opt = getopt(argc, argv, "m:s:")) != -1) {
+ switch (opt) {
+ case 'm':
+ mntpoint = optarg;
+ break;
+ case 's':
+ strategylib = optarg;
+ break;
+ default:
+ fprintf(stderr, "Usage: %s -m ${mnt} [-s ${strategylib}]\n", argv[0]);
+ return -EINVAL;
+ }
+ }
+ if (!mntpoint) {
+ fprintf(stderr, "mount point should specify\n");
+ fprintf(stderr, "Usage: %s -m ${mnt} [-s ${strategylib}]\n", argv[0]);
+ return -1;
+ }
+
+ ret = statfs(mntpoint, &buf);
+ if (ret) {
+ log_error("statfs %s failed, errstr:%s\n", mntpoint, strerror(errno));
+ return -1;
+ }
+ if (buf.f_type != MFS_SUPER_MAGIC) {
+ log_error("fstype(%x) is invalid, please check the mountpoint", buf.f_type);
+ return -1;
+ }
+
+ sprintf_s(devname, sizeof(devname), "/dev/mfs%ld", buf.f_spare[0]);
+ /* Open with O_CLOEXEC to avoid potential fd leaks */
+ dfd = open(devname, O_RDWR | O_CLOEXEC);
+ if (dfd < 0) {
+ log_error("open %s failed errstr:%s\n", devname, strerror(errno));
+ return -1;
+ }
+
+ ret = ioctl(dfd, MFS_IOC_FSINFO, (unsigned long)&fsinfo);
+ if (ret < 0) {
+ log_error("ioctl get fsinfo failed, ret:%d\n", ret);
+ goto close_fd;
+ }
+ if (fsinfo.mode != 1) {
+ log_error("fs runing mode(%d) is not supported", fsinfo.mode);
+ goto close_fd;
+ }
+ ret = loader_init();
+ if (ret) {
+ log_error("loader init failed");
+ goto close_fd;
+ }
+ ret = fs_client_init(NULL);
+ if (ret) {
+ log_error("clients init failed");
+ goto free_loader;
+ }
+ ret = parser_init(fsinfo.mode, mntpoint);
+ if (ret) {
+ log_error("parser init failed");
+ goto free_client;
+ }
+ ret = policy_init();
+ if (ret) {
+ log_error("policy init failed");
+ goto free_parser;
+ }
+ ret = policy_register(strategylib);
+ if (ret) {
+ log_info("register new policy failed, use default policy");
+ }
+
+ dev_monitor(dfd);
+free_parser:
+ parser_destory();
+free_client:
+ fs_client_exit();
+free_loader:
+ loader_exit();
+close_fd:
+ close(dfd);
+ return ret;
+}
diff --git a/tools/mfs/maio-utils/src/parser/CMakeLists.txt b/tools/mfs/maio-utils/src/parser/CMakeLists.txt
new file mode 100644
index 000000000000..72e5849da8e4
--- /dev/null
+++ b/tools/mfs/maio-utils/src/parser/CMakeLists.txt
@@ -0,0 +1,16 @@
+#*******************************************************************************
+#*******************************************************************************
+
+include_directories(${CMAKE_CURRENT_SOURCE_DIR})
+
+#*******************************************************************************
+#*******************************************************************************
+
+aux_source_directory(. PARSER_SRC)
+
+#*******************************************************************************
+#*******************************************************************************
+
+add_library(parser ${PARSER_SRC})
+target_link_libraries(parser infra
+ securec)
diff --git a/tools/mfs/maio-utils/src/parser/req_parser.c b/tools/mfs/maio-utils/src/parser/req_parser.c
new file mode 100644
index 000000000000..83a975276bc2
--- /dev/null
+++ b/tools/mfs/maio-utils/src/parser/req_parser.c
@@ -0,0 +1,313 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2025 Huawei Technologies Co., Ltd
+ */
+
+#include "inner.h"
+#include "log.h"
+#include "mfs.h"
+#include "policy.h"
+#include "securec.h"
+#include "threadpool.h"
+
+#include <stdbool.h>
+#include <errno.h>
+#include <sys/ioctl.h>
+#include <sys/stat.h>
+
+struct parser_mgr {
+ uint32_t fs_mode;
+ uint32_t chunk_size;
+ const char *mntpoint;
+ threadpool_t *parser;
+};
+
+struct parser_mgr g_parser_mgr;
+
+#define PARSER_NUM 8
+#define MAX_PATH_SIZE 1024
+
+#define __round_mask(x, y) ((__typeof__(x))((y)-1))
+#define round_up(x, y) ((((x)-1) | __round_mask(x, y))+1)
+#define round_down(x, y) ((x) & ~__round_mask(x, y))
+
+#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))
+#define MIN(x, y) ((x) < (y) ? (x) : (y))
+
+bool is_power_of_2(unsigned long n)
+{
+ return (n != 0 && ((n & (n - 1)) == 0));
+}
+
+void ctx_cb(int ret, void *args)
+{
+ struct io_context *ctx = (struct io_context *)args;
+
+ if (ctx->path)
+ free(ctx->path);
+ if (ctx->buf)
+ free(ctx->buf);
+ free(ctx);
+}
+
+void _submit_local_io(int fd, struct maio_entry *entry)
+{
+ struct io_context *ioctx;
+
+ ioctx = calloc(1, sizeof(struct io_context));
+ if (!ioctx) {
+ log_error("ioctx alloc failed");
+ return;
+ }
+ ioctx->off = entry->toff;
+ ioctx->len = entry->tlen;
+ ioctx->numaid = entry->tnuma;
+ ioctx->fd = fd;
+ ioctx->path = entry->fpath;
+ ioctx->seq = entry->seq;
+ ioctx->private = ioctx;
+ ioctx->end_io = ctx_cb;
+ loader_io_submit(ioctx);
+}
+
+char *get_file_path(int fd)
+{
+ struct mfs_ioc_rpath *rp;
+ char *path;
+ int ret;
+
+ rp = malloc(sizeof(struct mfs_ioc_rpath) + MAX_PATH_SIZE);
+ if (!rp) {
+ log_error("rpath alloc failed");
+ return NULL;
+ }
+ rp->max = MAX_PATH_SIZE;
+ ret = ioctl(fd, MFS_IOC_RPATH, (unsigned long)rp);
+ if (ret) {
+ log_error("realpath failed for fd:%d, ret:%d", fd, ret);
+ free(rp);
+ return NULL;
+ }
+ rp->d[rp->len] = '\0';
+
+ path = malloc(strlen(g_parser_mgr.mntpoint) + rp->len);
+ if (!path) {
+ log_error("malloc path %s %s failed", g_parser_mgr.mntpoint, (const char *)rp->d);
+ return NULL;
+ }
+ sprintf(path, "%s%s", g_parser_mgr.mntpoint, (const char *)rp->d);
+ free(rp);
+ return path;
+}
+
+char *_get_fullpath(char *fpath, int fd)
+{
+ struct mfs_ioc_rpath *rp;
+ char *path;
+ int ret;
+
+ if (fpath)
+ return fpath;
+ rp = malloc(sizeof(struct mfs_ioc_rpath) + MAX_PATH_SIZE);
+ if (!rp) {
+ log_error("rpath alloc failed");
+ return NULL;
+ }
+ rp->max = MAX_PATH_SIZE;
+ ret = ioctl(fd, MFS_IOC_RPATH, (unsigned long)rp);
+ if (ret) {
+ log_error("realpath failed for fd:%d, ret:%d", fd, ret);
+ free(rp);
+ return NULL;
+ }
+ rp->d[rp->len] = '\0';
+ path = strdup((const char *)rp->d);
+ if (!path)
+ log_error("strdup path(%s) failed", (const char *)rp->d);
+ free(rp);
+ return path;
+}
+
+void maio_preload(struct maio *maio, int nio)
+{
+ struct maio_entry *entry;
+
+ for (int i = 0; i < nio; ++i) {
+ entry = &maio->entries[i];
+ _submit_local_io(0, entry);
+ }
+}
+
+uint8_t pid_to_numaid(int pid)
+{
+ return (uint8_t)-1;
+}
+
+void _process_read_io(uint32_t id, struct maio *maio, int nio)
+{
+ struct maio_entry *entry;
+ int i;
+
+ /* process the successor io */
+ for (i = 0; i < nio; i++) {
+ entry = &maio->entries[i];
+ _submit_local_io(maio->fd, entry);
+ }
+}
+
+void _submit_evict(struct maio_entry *entry)
+{
+ struct io_context *ioctx;
+
+ ioctx = calloc(1, sizeof(struct io_context));
+ if (!ioctx) {
+ log_error("ioctx alloc failed");
+ return;
+ }
+ ioctx->off = entry->toff;
+ ioctx->len = entry->tlen;
+ ioctx->numaid = -1;
+ ioctx->path = entry->fpath;
+ ioctx->private = ioctx;
+ ioctx->end_io = ctx_cb;
+ loader_evict_submit(ioctx);
+}
+
+void _process_evict(struct mfs_msg *msg)
+{
+ struct maio_entry *entry;
+ struct mfs_read *read;
+ uint64_t soff, eoff;
+ struct maio *maio;
+ int ret;
+
+ if (g_parser_mgr.fs_mode != MFS_MODE_LOCAL)
+ return;
+
+ maio = calloc(1, sizeof(struct maio));
+ if (!maio) {
+ log_error("failed to alloc maio");
+ return;
+ }
+
+ read = (void*)msg->data;
+ maio->cksz = g_parser_mgr.chunk_size;
+ soff = round_down(read->off, maio->cksz);
+ eoff = round_up(read->off + read->len, maio->cksz);
+ maio->fd = msg->fd;
+ maio->off = read->off;
+ maio->len = eoff - soff;
+ maio->pid = read->pid;
+ maio->op = msg->opcode;
+
+ ret = policy_evict(&maio);
+ if (ret <= 0)
+ goto out;
+
+ for (int i = 0; i < ret; ++i) {
+ entry = &maio->entries[i];
+ _submit_evict(entry);
+ }
+out:
+ free(maio);
+}
+
+void _parse_read_req(struct mfs_msg *msg)
+{
+ struct mfs_read *read;
+ struct maio *maio;
+ uint64_t soff, eoff, ts;
+ int ret, io_num = policy_max_io();
+
+ _process_evict(msg);
+
+ /* alloc maio slots */
+ maio = calloc(1, sizeof(struct maio) + io_num * sizeof(struct maio_entry));
+ if (!maio) {
+ log_error("failed to alloc maio, num:%d", io_num);
+ return;
+ }
+ read = (void *)msg->data;
+ maio->cksz = g_parser_mgr.chunk_size;
+ soff = round_down(read->off, maio->cksz);
+ eoff = round_up(read->off + read->len, maio->cksz);
+ ts = _get_ts();
+ maio->fd = msg->fd;
+ maio->off = soff;
+ maio->len = eoff - soff;
+ maio->pid = read->pid;
+ maio->op = msg->opcode;
+ maio->ts = ts;
+
+ /* fill slots from policy */
+ ret = policy_load(&maio);
+ if (ret < 0) {
+ char *path = _get_fullpath(NULL, maio->fd);
+ if (path) {
+ log_warn("io on path:%s%s offset:%lu length:%lu dropped",
+ g_parser_mgr.mntpoint, path, read->off, read->len);
+ free(path);
+ }
+ free(maio);
+ return;
+ }
+
+ if (maio->flags & MAIO_WITH_SEQ)
+ loader_update_curseq(maio->curseq);
+
+ /* keep the input parameter */
+ maio->fd = msg->fd;
+ maio->off = soff;
+ maio->len = eoff - soff;
+ maio->cksz = g_parser_mgr.chunk_size;
+ maio->pid = read->pid;
+ maio->op = msg->opcode;
+ maio->ts = ts;
+
+ /* process each maio in slots */
+ _process_read_io(msg->id, maio, ret);
+ free(maio);
+}
+
+void _parse_one_req(void *buf)
+{
+ struct mfs_msg *msg = (struct mfs_msg *)buf;
+
+ if (msg->opcode == MFS_OP_READ || msg->opcode == MFS_OP_FAULT)
+ _parse_read_req(msg);
+
+ free(buf);
+}
+
+int maio_parse_req(void *buf, int size)
+{
+ struct mfs_msg *msg = (struct mfs_msg *)buf;
+
+ if (msg->len != size)
+ return -1;
+
+ threadpool_submit(g_parser_mgr.parser, buf, _parse_one_req);
+ return 0;
+}
+
+int parser_init(uint32_t fs_mode, const char *mntpoint)
+{
+ g_parser_mgr.fs_mode = fs_mode;
+ g_parser_mgr.mntpoint = strdup(mntpoint);
+ g_parser_mgr.chunk_size = 4096;
+ if (!is_power_of_2(g_parser_mgr.chunk_size)) {
+ log_error("chunk size must be power of 2");
+ return -1;
+ }
+ g_parser_mgr.parser = threadpool_create("ReqParser", PARSER_NUM, -1);
+ if (!g_parser_mgr.parser) {
+ log_error("failed to alloc parser threadpool");
+ return -1;
+ }
+
+ return 0;
+}
+
+void parser_destory()
+{
+ threadpool_destroy(g_parser_mgr.parser);
+}
diff --git a/tools/mfs/maio-utils/src/policy/CMakeLists.txt b/tools/mfs/maio-utils/src/policy/CMakeLists.txt
new file mode 100644
index 000000000000..714023475e07
--- /dev/null
+++ b/tools/mfs/maio-utils/src/policy/CMakeLists.txt
@@ -0,0 +1,16 @@
+#*******************************************************************************
+#*******************************************************************************
+
+include_directories(${CMAKE_CURRENT_SOURCE_DIR})
+
+#*******************************************************************************
+#*******************************************************************************
+
+aux_source_directory(. POLICY_SRC)
+
+#*******************************************************************************
+#*******************************************************************************
+
+add_library(policy ${POLICY_SRC})
+target_link_libraries(policy infra
+ securec)
diff --git a/tools/mfs/maio-utils/src/policy/policy.c b/tools/mfs/maio-utils/src/policy/policy.c
new file mode 100644
index 000000000000..404358ff9b46
--- /dev/null
+++ b/tools/mfs/maio-utils/src/policy/policy.c
@@ -0,0 +1,114 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2025 Huawei Technologies Co., Ltd
+ */
+#include "policy.h"
+
+#include "maio.h"
+#include "log.h"
+#include "securec.h"
+#include <dlfcn.h>
+
+typedef struct maio_operation* (*strategy_init)();
+
+struct policy_mgr {
+ void *handle;
+ struct maio_operation *strategy;
+};
+
+struct policy_mgr g_policy;
+
+static int default_init(void)
+{
+ return 0;
+}
+
+static void default_exit(void)
+{
+}
+
+static int default_load(struct maio **io)
+{
+ return 0;
+}
+
+static int default_evict(struct maio **io)
+{
+ return 0;
+}
+
+struct maio_operation default_ops = {
+ .max_io = 1,
+ .init = default_init,
+ .exit = default_exit,
+ .load = default_load,
+ .evict = default_evict,
+};
+
+static void set_default_strategy()
+{
+ g_policy.strategy = &default_ops;
+}
+
+int policy_max_io(void)
+{
+ return g_policy.strategy->max_io;
+}
+
+/* return the filled num of io */
+int policy_load(struct maio **io)
+{
+ return g_policy.strategy->load(io);
+}
+
+int policy_evict(struct maio **io)
+{
+ return g_policy.strategy->evict(io);
+}
+
+int policy_register(const char *path)
+{
+ strategy_init sinit;
+
+ if (!path || strlen(path) == 0)
+ return -1;
+
+ g_policy.handle = dlopen(path, RTLD_LAZY);
+ if (!g_policy.handle) {
+ log_error("dlopen failed:%s", dlerror());
+ return -1;
+ }
+
+ sinit = dlsym(g_policy.handle, "register_strategy");
+ if (!sinit) {
+ log_error("dlsym failed:%s", dlerror());
+ dlclose(g_policy.handle);
+ g_policy.handle = NULL;
+ return -1;
+ }
+ g_policy.strategy = sinit();
+ if (g_policy.strategy->init())
+ return -1;
+ log_info("register strategy:%s done.", path);
+ return 0;
+}
+
+void policy_unregister()
+{
+ if (g_policy.handle) {
+ g_policy.strategy->exit();
+ dlclose(g_policy.handle);
+ g_policy.handle = NULL;
+ }
+ set_default_strategy();
+}
+
+int policy_init()
+{
+ g_policy.handle = NULL;
+ set_default_strategy();
+ return 0;
+}
+
+void policy_exit()
+{
+}
diff --git a/tools/mfs/maio-utils/test/Makefile b/tools/mfs/maio-utils/test/Makefile
new file mode 100644
index 000000000000..404d9cf3d9a4
--- /dev/null
+++ b/tools/mfs/maio-utils/test/Makefile
@@ -0,0 +1,26 @@
+# SPDX-License-Identifier: GPL-2.0
+
+CC := gcc
+CFLAGS := -fPIC -Wall -Werror -O2
+CPPFLAGS := -fPIC -Wall -Wno-ignored-attributes -Werror -O2 -std=c++11 -g
+LDFLAGS := -shared
+
+CFLAGS += $(addprefix -I,../include)
+CPPFLAGS += $(addprefix -I,../include)
+
+C_SRCS := $(wildcard *.c)
+CPP_SRCS := $(wildcard *.cpp)
+TARGETS := $(C_SRCS:.c=.so) $(CPP_SRCS:.cpp=.so)
+
+all: $(TARGETS)
+
+%.so: %.c
+ $(CC) $(CFLAGS) $(LDFLAGS) -o $@ $<
+
+%.so: %.cpp
+ $(CC) $(CPPFLAGS) $(LDFLAGS) -o $@ $<
+
+clean:
+ rm -f $(TARGETS)
+
+.PHONY: all clean
diff --git a/tools/mfs/maio-utils/test/README.md b/tools/mfs/maio-utils/test/README.md
new file mode 100644
index 000000000000..245fc622c075
--- /dev/null
+++ b/tools/mfs/maio-utils/test/README.md
@@ -0,0 +1,61 @@
+Here, we provide some strategies library for some cases.
+
+- strategy_demo
+A simple strategy library shows how to write a demo.
+
+- strategy_prefetch
+A prefetch strategy library to prefetch data.
+
+- strategy_toend
+A readahead strategy until the end.
+
+- strategy_generate
+A tools using maio-util to generate the io trace.
+
+- strategy_replay
+A strategy to replay the io trace.
+
+### How to build
+
+Using `make -j` to build the libraries.
+
+### How to use
+
+Set the library path as the `-s` parameter's value to run with maio-utils. Here we show usage of the advanced replay strategy:
+
+- generate the io trace
+
+> assume you have mounted the mfs at /mnt/mfs, and you are in build directory.
+
+This is the tracing period.
+
+```
+export MAIO_TRACE_FILE=trace.txt
+export MAIO_MNTPOINT=/mnt/mfs # /mnt/mfs is the MFS mountpoint
+./bin/maio-utils -m /mnt/mfs -s ../test/strategy_generate.so
+```
+Then, you can trigger the io. After finishing, you would get the trace file with the content likes:
+
+```
+xxx/model-00001-of-000008.safetensors 0 32768 0
+xxx/model-00001-of-000008.safetensors 32768 32768 0
+xxx/model-00001-of-000008.safetensors 65536 32768 0
+xxx/model-00001-of-000008.safetensors 98304 32768 0
+xxx/model-00001-of-000008.safetensors 131072 32768 0
+...
+
+```
+
+The format is "<path> <offset> <length> <numaid>".
+
+- replay the io trace
+
+> assume you have mounted the mfs at /mnt/mfs and begin to launch the model.
+
+This is the running period.
+
+```
+export MAIO_TRACE_FILE=trace.txt
+export MAIO_MNTPOINT=/mnt/mfs # /mnt/mfs is the MFS mountpoint
+./bin/maio-utils -m /mnt/mfs -s ../test/strategy_replay.so
+```
diff --git a/tools/mfs/maio-utils/test/strategy_demo.c b/tools/mfs/maio-utils/test/strategy_demo.c
new file mode 100644
index 000000000000..54554b609f85
--- /dev/null
+++ b/tools/mfs/maio-utils/test/strategy_demo.c
@@ -0,0 +1,45 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2025 Huawei Technologies Co., Ltd
+ */
+#include "maio.h"
+#include <stddef.h>
+
+int demo_init(void)
+{
+ return 0;
+}
+
+void demo_exit(void)
+{
+}
+
+int demo_load(struct maio **io)
+{
+ struct maio_entry *entry;
+
+ entry = (*io)->entries;
+ entry[0].toff = (*io)->off + (*io)->len;
+ entry[0].tlen = (*io)->cksz;
+ entry[0].fpath = NULL;
+ entry[0].tnuma = -1;
+ return 1;
+}
+
+int demo_evict(struct maio **io)
+{
+ ((void)io);
+ return 0;
+}
+
+const struct maio_operation demo_strategy = {
+ .max_io = 1,
+ .init = demo_init,
+ .exit = demo_exit,
+ .load = demo_load,
+ .evict = demo_evict,
+};
+
+struct maio_operation *register_strategy()
+{
+ return (struct maio_operation *)&demo_strategy;
+}
diff --git a/tools/mfs/maio-utils/test/strategy_generate.cpp b/tools/mfs/maio-utils/test/strategy_generate.cpp
new file mode 100644
index 000000000000..929be30e6a42
--- /dev/null
+++ b/tools/mfs/maio-utils/test/strategy_generate.cpp
@@ -0,0 +1,296 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2025 Huawei Technologies Co., Ltd
+ */
+#include "maio.h"
+#include "strategy_template.h"
+
+#include <array>
+#include <atomic>
+#include <cstring>
+#include <condition_variable>
+#include <fstream>
+#include <sstream>
+#include <iostream>
+#include <mutex>
+#include <queue>
+#include <thread>
+#include <unordered_map>
+#include <vector>
+#include <unistd.h>
+
+using namespace std;
+
+#define MAIO_MNTPOINT "MAIO_MNTPOINT"
+#define MAIO_TRACE_FILE "MAIO_TRACE_FILE"
+#define CACHEDIR "cachedir"
+
+#define MAXLINE 1024
+
+/* Data struct definition */
+template<typename T>
+class SafeQueue {
+private:
+ queue<T> data_queue;
+ mutex mtx;
+ condition_variable cv;
+
+public:
+ // 添加数据到队列
+ void push(T item) {
+ lock_guard<mutex> lock(mtx);
+ data_queue.push(move(item));
+ cv.notify_one();
+ }
+
+ // 从队列取出数据(阻塞直到有数据)
+ T pop() {
+ unique_lock<mutex> lock(mtx);
+ cv.wait(lock, [this]{ return !data_queue.empty(); });
+ T item = move(data_queue.front());
+ data_queue.pop();
+ return item;
+ }
+
+ // 检查队列是否为空
+ bool empty() {
+ lock_guard<mutex> lock(mtx);
+ return data_queue.empty();
+ }
+};
+
+struct io {
+ string name;
+ uint64_t off;
+ uint64_t len;
+ uint8_t npu;
+
+ io(const string& _name, uint64_t _off, uint64_t _len, uint8_t _npu)
+ : name(_name), off(_off), len(_len), npu(_npu) {}
+};
+
+/* Global variable definition */
+unordered_map<int, char*> fd_map; /* map fd to path */
+mutex fd_mutex;
+string cachedir;
+
+vector<io> iovec; /* io traces */
+mutex iovec_mutex;
+SafeQueue<vector<io>> iovec_queue;
+
+ofstream tfile; /* trace file */
+
+thread writer;
+
+unordered_map<int, int> process_npu_map;
+mutex pnpu_mutex;
+
+atomic<int> npu_smi_enable(-1);
+
+/* fd_map helper function */
+char* fd_path_find(int iofd) {
+ lock_guard<mutex> lock(fd_mutex);
+ auto it = fd_map.find(iofd);
+ if (it != fd_map.end()) {
+ return it->second;
+ } else {
+ char *path = _get_fullpath(NULL, iofd);
+ fd_map.insert({iofd, path});
+ return path;
+ }
+}
+
+/* iovec helper function */
+void io_insert_data(string name, uint64_t off, uint64_t len, uint8_t npu)
+{
+ lock_guard<mutex> lock(iovec_mutex);
+ iovec.emplace_back(name, off, len, npu);
+
+ vector<io> copy_iovec = move(iovec);
+ iovec.clear();
+
+ iovec_queue.push(move(copy_iovec));
+}
+
+void get_mount_option(const char* mntpoint, const char* option)
+{
+ ifstream mounts("/proc/mounts");
+ string line, target_mp;
+
+ if (!mounts.is_open()) {
+ cerr << "Error: could not open /proc/mounts" << endl;
+ return;
+ }
+
+ while (getline(mounts, line)) {
+ istringstream iss(line);
+ string device, mp, fs_type, options;
+
+ if (!(iss >> device >> mp >> fs_type >> options))
+ continue;
+
+ if (mp.back() != '/')
+ mp += '/';
+ target_mp = mntpoint;
+ if (target_mp.back() != '/')
+ target_mp += '/';
+
+ if (target_mp == mp) {
+ istringstream opt_stream(options);
+ string opt;
+
+ while (getline(opt_stream, opt, ',')) {
+ if (opt.find(option) == 0) {
+ if (opt[strlen(option)] == '=') {
+ cachedir = opt.substr(strlen(option) + 1);
+ }
+ }
+ }
+ break;
+ }
+ }
+}
+
+void writer_thread() {
+ while (true) {
+ vector<io> iovec_to_write = iovec_queue.pop();
+
+ for (const auto &item : iovec_to_write)
+ tfile << cachedir << item.name << " " << item.off << " " << item.len << " " << (unsigned int)item.npu << endl;
+ tfile.flush();
+ }
+}
+
+string exec_cmd(const char* cmd) {
+ array<char, 128> buffer;
+ string result;
+ unique_ptr<FILE, decltype(&pclose)> pipe(popen(cmd, "r"), pclose);
+
+ if (!pipe) {
+ throw std::runtime_error("popen() failed!");
+ }
+ while (fgets(buffer.data(), buffer.size(), pipe.get()) != nullptr) {
+ result += buffer.data();
+ }
+ return result;
+}
+
+bool isNumber(const std::string& str) {
+ return !str.empty() && str.find_first_not_of("0123456789") == std::string::npos;
+}
+
+void get_npu_info(string &npuinfo)
+{
+ istringstream iss(npuinfo);
+ string line;
+
+ while (getline(iss, line)) {
+ istringstream lineStream(line);
+ string token;
+ vector<string> tokens;
+ while (getline(lineStream, token, ' ')) {
+ if (!token.empty()) {
+ tokens.push_back(token);
+ }
+ }
+ if (tokens.size() >= 3) {
+ if (!isNumber(tokens[1]) || !isNumber(tokens[4]))
+ continue;
+ int npuId = stoi(tokens[1]);
+ int processId = stoi(tokens[4]);
+ process_npu_map[processId] = npuId;
+ }
+ }
+}
+
+int process_npu_find(int processid) {
+ lock_guard<mutex> lock(pnpu_mutex);
+ auto it = process_npu_map.find(processid);
+
+ if (npu_smi_enable.load() == -1) {
+ if (exec_cmd("command -v npu-smi").empty())
+ npu_smi_enable.store(0);
+ else
+ npu_smi_enable.store(1);
+ }
+
+ if (npu_smi_enable.load() == 0)
+ return 0;
+
+ if (it != process_npu_map.end()) {
+ return it->second;
+ } else {
+ string result = exec_cmd("npu-smi info");
+ get_npu_info(result);
+ auto it = process_npu_map.find(processid);
+ if (it != process_npu_map.end())
+ return it->second;
+ else
+ return 0;
+ }
+}
+
+int strategy_init(void)
+{
+ char *tfilename = getenv(MAIO_TRACE_FILE);
+ char *mntpoint = getenv(MAIO_MNTPOINT);
+
+ if(!tfilename || !mntpoint) {
+ printf("no MAIO_TRACE_FILE or MAIO_MNTPOINT env variable\n");
+ return -1;
+ }
+ get_mount_option(mntpoint, CACHEDIR);
+ string stfilename = tfilename;
+ tfile.open(stfilename, ios::trunc);
+ writer = thread(writer_thread);
+ if (!tfile.is_open()) {
+ cerr << "Error: could not open" << stfilename << endl;
+ return -1;
+ }
+ return 0;
+}
+
+void strategy_exit(void)
+{
+ lock_guard<mutex> lock(iovec_mutex);
+ vector<io> copy_iovec = move(iovec);
+ iovec.clear();
+ iovec_queue.push(move(copy_iovec));
+
+ tfile.close();
+ writer.join();
+}
+
+int strategy_load(struct maio **io)
+{
+ char *path;
+ string pathStr;
+ int ret = 0;
+
+ if (!tfile.is_open())
+ return -1;
+
+ path = fd_path_find((*io)->fd);
+ pathStr.assign(path);
+ io_insert_data(pathStr, (*io)->off, (*io)->len, process_npu_find((*io)->pid));
+
+ return ret;
+}
+
+int strategy_evict(struct maio **io)
+{
+ ((void)io);
+ return 0;
+}
+
+const struct maio_operation maio_strategy = {
+ .max_io = 1,
+ .init = strategy_init,
+ .exit = strategy_exit,
+ .load = strategy_load,
+ .evict = strategy_evict,
+};
+
+struct maio_operation *register_strategy()
+{
+ return (struct maio_operation *)&maio_strategy;
+}
diff --git a/tools/mfs/maio-utils/test/strategy_prefetch.cpp b/tools/mfs/maio-utils/test/strategy_prefetch.cpp
new file mode 100644
index 000000000000..c8a990925e3e
--- /dev/null
+++ b/tools/mfs/maio-utils/test/strategy_prefetch.cpp
@@ -0,0 +1,151 @@
+#include "strategy_prefetch.h"
+#include "maio.h"
+#include <iostream>
+#include <map>
+#include <pthread.h>
+#include <cstring>
+#include <string>
+#include <stdint.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <dirent.h>
+#include <fcntl.h>
+#include <unistd.h>
+
+using namespace std;
+
+struct prefetch_mgr {
+ map<string, uint64_t> files;
+};
+
+struct prefetch_mgr g_prefetch;
+
+int get_files(const char *parent)
+{
+ DIR *dir;
+ struct dirent *entry;
+ string filepath;
+ struct stat buf;
+
+ if ((dir = opendir(parent)) == NULL) {
+ cout<<"opening directory failed"<<endl;
+ return -1;
+ }
+
+ while ((entry = readdir(dir)) != NULL) {
+ if (strcmp(entry->d_name, ".") == 0 || strcmp(entry->d_name, "..") == 0)
+ continue;
+
+ filepath = string(parent) + "/" + string(entry->d_name);
+ if (stat(filepath.c_str(), &buf) == -1) {
+ cout<<"stat path:"<<filepath<<" failed"<<endl;
+ continue;
+ }
+ if (S_ISDIR(buf.st_mode))
+ continue;
+ cout<<"Find file:"<<filepath<<" size:"<<buf.st_size<<endl;
+ g_prefetch.files.insert(pair<string, uint64_t>(filepath, buf.st_size));
+ }
+
+ map<string, uint64_t>::iterator it;
+ for (it = g_prefetch.files.begin(); it != g_prefetch.files.end(); ++it) {
+ cout<<"path:"<<it->first<<" size:"<<it->second<<endl;
+ }
+ return 0;
+}
+
+int prefetch_init(void)
+{
+ char *parent = getenv("MODEL_WEIGHT_DIR");
+ if (!parent) {
+ cout<<"please set the env MODEL_WEIGHT_DIR"<<endl;
+ return -1;
+ }
+
+ if (get_files(parent) != 0) {
+ cout<<"get all files failed"<<endl;
+ return -1;
+ }
+ return 0;
+}
+
+void prefetch_exit(void)
+{
+}
+
+struct thread_ctx {
+ char *path;
+ uint64_t off;
+ uint64_t len;
+};
+
+void *fault(void *arg)
+{
+ struct thread_ctx *ctx = (struct thread_ctx *)arg;
+ int fd = open(ctx->path, O_RDONLY);
+ if (fd < 0) {
+ cout<<"open file:%s"<<ctx->path<<" failed"<<endl;
+ free(ctx);
+ return NULL;
+ }
+
+ cout<<"FAULT path:"<<ctx->path<<" fd:"<<fd<<" off:"<<ctx->off<<" len:"<<ctx->len<<endl;
+ void *addr = mmap(NULL, ctx->len, PROT_READ, MAP_SHARED, fd, 0);
+ uint64_t idx;
+ char tmp, total, *buffer = (char *)addr;
+ total = 'B';
+ for (idx = 0; idx < ctx->len; idx += 4096) {
+ tmp = buffer[idx];
+ total += tmp;
+ }
+ cout<<"Fault calculate total:"<<total<<endl;
+ munmap(addr, ctx->len);
+ close(fd);
+ free(ctx);
+ return NULL;
+}
+
+int prefetch_load(struct maio **io)
+{
+ map<string, uint64_t>::iterator it;
+ struct thread_ctx *ctx;
+ pthread_t t0;
+ int ret;
+
+ for (it = g_prefetch.files.begin(); it != g_prefetch.files.end(); ++it) {
+ ctx = (struct thread_ctx *)malloc(sizeof(struct thread_ctx));
+ if (!ctx) {
+ cout<<"malloc ctx failed"<<endl;
+ continue;
+ }
+ ctx->path = strdup(it->first.c_str());
+ ctx->off = 0;
+ ctx->len = it->second;
+ ret = pthread_create(&t0, NULL, fault, ctx);
+ if (ret == 0)
+ pthread_detach(t0);
+ }
+ g_prefetch.files.clear();
+
+ return 0;
+}
+
+int prefetch_evict(struct maio **io)
+{
+ ((void)io);
+ return 0;
+}
+
+const struct maio_operation prefetch_strategy = {
+ .max_io = 1,
+ .init = prefetch_init,
+ .exit = prefetch_exit,
+ .load = prefetch_load,
+ .evict = prefetch_evict,
+};
+
+struct maio_operation *register_strategy()
+{
+ return (struct maio_operation *)&prefetch_strategy;
+}
+
diff --git a/tools/mfs/maio-utils/test/strategy_prefetch.h b/tools/mfs/maio-utils/test/strategy_prefetch.h
new file mode 100644
index 000000000000..2c2475dc91f0
--- /dev/null
+++ b/tools/mfs/maio-utils/test/strategy_prefetch.h
@@ -0,0 +1,21 @@
+#ifndef STRATEGY_PREFETCH_H
+#define STRATEGY_PREFETCH_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int prefetch_init(void);
+void prefetch_exit(void);
+int prefetch_load(struct maio *io);
+int prefetch_evict(struct maio **io);
+
+extern const struct maio_operation prefetch_strategy;
+
+struct maio_operation *register_strategy();
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/tools/mfs/maio-utils/test/strategy_replay.cpp b/tools/mfs/maio-utils/test/strategy_replay.cpp
new file mode 100644
index 000000000000..85c1c7defedd
--- /dev/null
+++ b/tools/mfs/maio-utils/test/strategy_replay.cpp
@@ -0,0 +1,540 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2025 Huawei Technologies Co., Ltd
+ */
+#include "maio.h"
+#include "strategy_template.h"
+
+#include <atomic>
+#include <array>
+#include <cstring>
+#include <fstream>
+#include <iostream>
+#include <map>
+#include <mutex>
+#include <sstream>
+#include <string>
+#include <thread>
+#include <unordered_map>
+#include <vector>
+#include <pthread.h>
+#include <fcntl.h>
+#include <unistd.h>
+
+using namespace std;
+
+#define MAIO_MNTPOINT "MAIO_MNTPOINT"
+#define MAIO_TRACE_FILE "MAIO_TRACE_FILE"
+#define CACHEDIR "cachedir"
+#define MAXLINE 1024
+
+/* Data structure definition */
+enum {
+ IO_NONE = 0,
+ IO_LOADED,
+ IO_EVICTED,
+};
+
+struct io {
+ string name;
+ uint64_t off;
+ uint64_t len;
+ uint8_t npu;
+ int flag;
+
+ explicit io(const string& _name, uint64_t _off, uint64_t _len, uint8_t _npu)
+ : name(_name), off(_off), len(_len), npu(_npu), flag(IO_NONE) {}
+};
+
+
+struct ioKey {
+ string name;
+ uint64_t off;
+ uint64_t len;
+ uint8_t npu;
+
+ ioKey(const string& _name, uint64_t _off, uint64_t _len, uint8_t _npu)
+ : name(_name), off(_off), len(_len), npu(_npu) {}
+
+ bool operator<(const ioKey &other) const {
+ return name < other.name || (name == other.name && off < other.off) ||
+ (name == other.name && off == other.off && len < other.len);
+ }
+};
+
+vector<io> iovec; /* io traces */
+map<ioKey, int> io_to_index;
+unordered_map<int, char*> fd_map;
+mutex fd_mutex;
+string cachedir;
+
+unordered_map<int, int> npu_numa_map;
+
+unordered_map<int, int> process_npu_map;
+mutex pnpu_mutex;
+
+/* for evict */
+#define EVICT_DIST 10000
+#define EVICT_IO_MAX 5000
+thread_local size_t evict_idx = 0;
+
+atomic<int> npu_smi_enable(-1);
+atomic<uint64_t> memory_usage(0);
+
+uint64_t memory_limit;
+uint64_t memory_limit_bound;
+string usage_in_bytes_path = "memory.usage_in_bytes";
+string limit_in_bytes_path = "memory.limit_in_bytes";
+string cgroup_mem="";
+
+string get_cgroup_path()
+{
+ fstream cgroup_file("/proc/self/cgroup");
+ if (!cgroup_file.is_open()) {
+ cerr << "Failed to open /proc/self/cgroup" << endl;
+ return "";
+ }
+
+ string line;
+ string cgroup_path;
+ while (getline(cgroup_file, line)) {
+ istringstream iss(line);
+ string hierarchy, controller, path;
+ if (!getline(iss, hierarchy, ':'))
+ continue;
+ if (!getline(iss, controller, ':'))
+ continue;
+ if (!getline(iss, path, ':'))
+ continue;
+ if (controller == "memory") {
+ cgroup_path = path;
+ break;
+ }
+ }
+ cgroup_file.close();
+ return cgroup_path;
+}
+
+/* read from /sys/fs/cgroup/memory/memory.[usage_in_bytes|limit_in_bytes] */
+uint64_t read_cgroup_memory(const string &file_path)
+{
+ if (cgroup_mem == "")
+ return -1;
+ string whole_path = "/sys/fs/cgroup/memory"+cgroup_mem+"/"+file_path;
+ ifstream cgroup_file(whole_path);
+
+ if(!cgroup_file.is_open()) {
+ cerr << "Failed to open " << file_path << endl;
+ return -1;
+ }
+
+ string line;
+ while(getline(cgroup_file, line)) {
+ try {
+ return stoll(line);
+ } catch (...) {
+ cerr << "Failed to parse line" << line << endl;
+ }
+ }
+
+ cgroup_file.close();
+ return -1;
+}
+
+/* mount helper */
+void get_mount_option(const char* mntpoint, const char* option)
+{
+ ifstream mounts("/proc/mounts");
+ string line, target_mp;
+
+ if (!mounts.is_open()) {
+ cerr << "Error: could not open /proc/mounts" << endl;
+ return;
+ }
+
+ while (getline(mounts, line)) {
+ istringstream iss(line);
+ string device, mp, fs_type, options;
+
+ if (!(iss >> device >> mp >> fs_type >> options))
+ continue;
+
+ if (mp.back() != '/')
+ mp += '/';
+ target_mp = mntpoint;
+ if (target_mp.back() != '/')
+ target_mp += '/';
+
+ if (target_mp == mp) {
+ istringstream opt_stream(options);
+ string opt;
+
+ while (getline(opt_stream, opt, ',')) {
+ if (opt.find(option) == 0) {
+ if (opt[strlen(option)] == '=') {
+ cachedir = opt.substr(strlen(option) + 1);
+ }
+ }
+ }
+ break;
+ }
+ }
+}
+
+/* fd_map helper function */
+char* fd_path_find(int iofd) {
+ lock_guard<mutex> lock(fd_mutex);
+ auto it = fd_map.find(iofd);
+ if (it != fd_map.end()) {
+ return it->second;
+ } else {
+ char *path = _get_fullpath(NULL, iofd);
+ size_t len = strlen(path) + cachedir.length() + 1;
+ char *realpath = (char*)malloc(len);
+ snprintf(realpath, len, "%s%s", cachedir.c_str(), path);
+ fd_map.insert({iofd, realpath});
+ return realpath;
+ }
+}
+
+string exec_cmd(const char* cmd) {
+ array<char, 128> buffer;
+ string result = "";
+ unique_ptr<FILE, decltype(&pclose)> pipe(popen(cmd, "r"), pclose);
+
+ if (!pipe) {
+ throw std::runtime_error("popen() failed!");
+ }
+ while (fgets(buffer.data(), buffer.size(), pipe.get()) != nullptr) {
+ result += buffer.data();
+ }
+ return result;
+}
+
+bool isNumber(const std::string& str) {
+ return !str.empty() && str.find_first_not_of("0123456789") == std::string::npos;
+}
+
+/* cpu_numa_map builder from lscpu */
+unordered_map<string, int> build_cpu_numa_map() {
+ unordered_map<string, int> cpuNumaMap;
+ string result = exec_cmd("lscpu");
+ istringstream iss(result);
+ string line;
+
+ while (getline(iss, line)) {
+ if (line.find("NUMA node") != string::npos) {
+ istringstream lineStream(line);
+ string key, value;
+ int numaNode;
+ getline(lineStream, value, ':');
+ getline(lineStream, key);
+ /* what if numa is more than 10 */
+ value = value.substr(value.find("NUMA node") + 9, 1);
+ if (!isNumber(value))
+ continue;
+ numaNode = stoi(value);
+ key.erase(0, key.find_first_not_of(" \t"));
+ key.erase(key.find_last_not_of(" \t") + 1);
+ cpuNumaMap[key]=numaNode;
+ }
+ }
+ return cpuNumaMap;
+}
+
+/* npu_numa_map builder, from npu-smi info -t topo */
+void build_npu_map_map()
+{
+ if (npu_smi_enable.load() == -1) {
+ if (exec_cmd("command -v npu-smi").empty())
+ npu_smi_enable.store(0);
+ else
+ npu_smi_enable.store(1);
+ }
+
+ if (npu_smi_enable.load() == 0)
+ return;
+
+ unordered_map<string, int> cpuNumaMap = build_cpu_numa_map();
+ string result = exec_cmd("npu-smi info -t topo");
+ istringstream iss(result);
+ string line;
+ size_t target_token_size = 0;
+
+ cout << "NPU\tCPU_AFFINITY\tNUMA_NODE" << endl;
+ while (getline(iss, line)) {
+ istringstream lineStream(line);
+ string npuStr, numaStr, token;
+ vector<string> tokens;
+
+ while (lineStream >> token) {
+ tokens.push_back(token);
+ }
+
+ if (tokens.size() >= 2 && tokens.back() == "Affinity") {
+ target_token_size = tokens.size() - 1 + 1;
+ continue;
+ }
+
+ if (tokens.size() != target_token_size)
+ continue;
+
+ int npuId = stoi(tokens[0].substr(3));
+
+ numaStr = tokens.back();
+ cout << npuId << "\t" << numaStr << "\t\t" << cpuNumaMap[numaStr] << endl;
+ npu_numa_map[npuId] = cpuNumaMap[numaStr];
+ }
+}
+
+/* process_npu_map builder, from npu-smi info */
+void get_npu_info(string &npuinfo)
+{
+ istringstream iss(npuinfo);
+ string line;
+
+ while (getline(iss, line)) {
+ istringstream lineStream(line);
+ string token;
+ vector<string> tokens;
+ while (getline(lineStream, token, ' ')) {
+ if (!token.empty()) {
+ tokens.push_back(token);
+ }
+ }
+ if (tokens.size() >= 3) {
+ if (!isNumber(tokens[1]) || !isNumber(tokens[4]))
+ continue;
+ int npuId = stoi(tokens[1]);
+ int processId = stoi(tokens[4]);
+ process_npu_map[processId] = npuId;
+ }
+ }
+}
+
+/* process_npu_map helper */
+int process_npu_find(int processid) {
+ lock_guard<mutex> lock(pnpu_mutex);
+ auto it = process_npu_map.find(processid);
+
+ if (npu_smi_enable.load() == 0)
+ return 0;
+
+ if (it != process_npu_map.end()) {
+ return it->second;
+ } else {
+ string result = exec_cmd("npu-smi info");
+ get_npu_info(result);
+ auto it = process_npu_map.find(processid);
+ if (it != process_npu_map.end())
+ return it->second;
+ else
+ return 0;
+ }
+}
+
+int strategy_init(void)
+{
+ const char *mntpoint = getenv(MAIO_MNTPOINT);
+ char *tfilename = getenv(MAIO_TRACE_FILE);
+ if(!tfilename || !mntpoint) {
+ fprintf(stderr, "no MAIO_TRACE_FILE or MAIO_MNTPOINT env variable\n");
+ return -1;
+ }
+ ifstream tfile(tfilename);
+ string line;
+
+ cgroup_mem=get_cgroup_path();
+ memory_limit = read_cgroup_memory(limit_in_bytes_path);
+ memory_limit_bound = memory_limit * 4 / 5;
+ cout << "memory_limit: " << memory_limit << " memory_limit_bound: " << memory_limit_bound << endl;
+ get_mount_option(mntpoint, CACHEDIR);
+ build_npu_map_map();
+
+ if(!tfile.is_open()) {
+ cerr << "Failed to open file:" << tfilename << endl;
+ return -1;
+ }
+
+ while(getline(tfile, line)) {
+ istringstream iss(line);
+
+ string name;
+ uint64_t off;
+ uint64_t len;
+ unsigned int npu;
+ if (iss >> name >> off >> len >> npu) {
+ iovec.emplace_back(name, off, len, npu);
+ io_to_index[ioKey(name, off, len, npu)] = iovec.size() - 1;
+ } else {
+ cerr << "Failed to parse line:" << line << endl;
+ }
+ }
+
+ tfile.close();
+ return 0;
+}
+
+void strategy_exit(void)
+{
+}
+
+int strategy_load(struct maio **io)
+{
+ int npuid = process_npu_find((*io)->pid);
+ struct maio_entry *entry;
+ string pathStr;
+ char *path;
+ uint64_t idx = iovec.size();
+ int ret = 0;
+ struct maio *new_io;
+
+ path = fd_path_find((*io)->fd);
+ pathStr.assign(path);
+
+ ioKey key(pathStr, (*io)->off, (*io)->len, npuid);
+ auto io_it = io_to_index.lower_bound(key);
+ if (io_it == io_to_index.end()) {
+ entry = (*io)->entries;
+ entry[0].toff = (*io)->off + (*io)->len;
+ entry[0].tlen = 0;
+ entry[0].fpath = NULL;
+ entry[0].tnuma = npu_numa_map[npuid];
+ return 1;
+ }
+ if (key < io_it->first) {
+ if (io_it != io_to_index.begin()) {
+ --io_it;
+ if (io_it->first.name != pathStr) {
+ ++io_it;
+ }
+ }
+ }
+
+ idx = io_it->second;
+ (*io)->flags |= MAIO_WITH_SEQ;
+ (*io)->curseq = idx;
+ entry = (*io)->entries;
+ int ios = 0;
+ uint64_t memory_usage = read_cgroup_memory(usage_in_bytes_path);
+ vector<int> io_extra;
+
+ /* fill ios below max_io */
+ while (idx < iovec.size()) {
+ if (iovec[idx].flag == IO_LOADED) {
+ idx++;
+ continue;
+ }
+ memory_usage += iovec[idx].len / 2;
+ if (memory_usage > memory_limit_bound)
+ break;
+ if (ios >= maio_strategy.max_io) {
+ io_extra.push_back(idx);
+ goto next_io;
+ }
+ entry[ios].seq = idx;
+ if (pathStr == iovec[idx].name) {
+ entry[ios].toff = iovec[idx].off;
+ entry[ios].tlen = iovec[idx].len;
+ entry[ios].tnuma = npu_numa_map[npuid];
+ } else {
+ entry[ios].fpath = strdup(iovec[idx].name.c_str());
+ entry[ios].toff = iovec[idx].off;
+ entry[ios].tlen = iovec[idx].len;
+ entry[ios].tnuma = npu_numa_map[iovec[idx].npu];
+ }
+next_io:
+ iovec[idx].flag = IO_LOADED;
+ idx++;
+ ios++;
+ }
+
+ /* fill ios above max_io */
+ if (ios > maio_strategy.max_io) {
+ new_io = (struct maio *)realloc(*io, sizeof(struct maio) + ios * sizeof(struct maio_entry));
+ if (!new_io) {
+ printf("failed to alloc ios:%d maio\n", ios);
+ return -1;
+ }
+ *io = new_io;
+ entry = (*io)->entries;
+ for (int i = maio_strategy.max_io; i < ios; ++i) {
+ int io_idx = io_extra[i - maio_strategy.max_io];
+ entry[i].seq = io_idx;
+ if (pathStr == iovec[io_idx].name) {
+ entry[i].toff = iovec[io_idx].off;
+ entry[i].tlen = iovec[io_idx].len;
+ entry[i].tnuma = npu_numa_map[npuid];
+ } else {
+ entry[i].fpath = strdup(iovec[io_idx].name.c_str());
+ entry[i].toff = iovec[io_idx].off;
+ entry[i].tlen = iovec[io_idx].len;
+ entry[i].tnuma = npu_numa_map[iovec[io_idx].npu];
+ }
+ }
+ }
+
+ // printf("file:%s off:%lu len:%lu generate_ios:%d from %d",path, (*io)->off, (*io)->len, ios, io_it->second);
+ ret = ios;
+ return ret;
+}
+
+int strategy_evict(struct maio **io)
+{
+ string pathStr;
+ char *path;
+ path = fd_path_find((*io)->fd);
+ pathStr.assign(path);
+ ioKey key(pathStr, (*io)->off, (*io)->len, (*io)->pid);
+ auto io_it = io_to_index.lower_bound(key);
+ int dist = 0;
+ size_t i = 0, io_idx = io_it->second;
+ struct maio_entry *entry;
+ struct maio *new_io;
+ uint64_t evict_memory = 0;
+ uint64_t memory_usage = read_cgroup_memory(usage_in_bytes_path);
+
+ if (memory_usage < (memory_limit_bound * 3 / 4))
+ return 0;
+ dist = io_idx - evict_idx;
+ if (dist < 0)
+ return 0;
+ if (static_cast<size_t>(dist) < iovec.size() / 8 && io_it != io_to_index.end())
+ return 0;
+ size_t evict_win = 0;
+ if (io_it != io_to_index.end()) {
+ evict_win = min(iovec.size() / 10, io_idx - 1);
+ } else {
+ evict_win = dist;
+ }
+
+ new_io = (struct maio*)realloc(*io, sizeof(struct maio) + evict_win * sizeof(struct maio_entry));
+ if (!new_io)
+ return -1;
+ *io = new_io;
+ entry = new_io->entries;
+
+ for (i = 0; i < evict_win && evict_idx < iovec.size(); ++i) {
+ evict_memory += iovec[evict_idx].len;
+ if (evict_memory > memory_limit_bound / 8)
+ break;
+ entry[i].fpath = strdup(iovec[evict_idx].name.c_str());
+ entry[i].toff = iovec[evict_idx].off;
+ entry[i].tlen = iovec[evict_idx].len;
+ evict_idx++;
+ iovec[evict_idx].flag = IO_EVICTED;
+ }
+ return i;
+}
+
+const struct maio_operation maio_strategy = {
+ .max_io = 500,
+ .init = strategy_init,
+ .exit = strategy_exit,
+ .load = strategy_load,
+ .evict = strategy_evict,
+};
+
+struct maio_operation *register_strategy()
+{
+ return (struct maio_operation *)&maio_strategy;
+}
diff --git a/tools/mfs/maio-utils/test/strategy_template.h b/tools/mfs/maio-utils/test/strategy_template.h
new file mode 100644
index 000000000000..6599b68754c9
--- /dev/null
+++ b/tools/mfs/maio-utils/test/strategy_template.h
@@ -0,0 +1,21 @@
+#ifndef STRATEGY_TEMPLATE_H
+#define STRATEGY_TEMPLATE_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int strategy_init(void);
+void strategy_exit(void);
+int strategy_load(struct maio **io);
+int strategy_evict(struct maio *io);
+
+extern const struct maio_operation maio_strategy;
+
+struct maio_operation *register_strategy();
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/tools/mfs/maio-utils/test/strategy_toend.c b/tools/mfs/maio-utils/test/strategy_toend.c
new file mode 100644
index 000000000000..f2208281f8ee
--- /dev/null
+++ b/tools/mfs/maio-utils/test/strategy_toend.c
@@ -0,0 +1,46 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2025 Huawei Technologies Co., Ltd
+ */
+#include "maio.h"
+#include <stddef.h>
+
+int demo_init(void)
+{
+ return 0;
+}
+
+void demo_exit(void)
+{
+}
+
+int demo_load(struct maio **io)
+{
+ struct maio_entry *entry;
+
+ entry = (*io)->entries;
+ entry[0].toff = (*io)->off + (*io)->len;
+ /* Read Ahead until the end */
+ entry[0].tlen = 0;
+ entry[0].fpath = NULL;
+ entry[0].tnuma = -1;
+ return 1;
+}
+
+int demo_evict(struct maio **io)
+{
+ ((void)io);
+ return 0;
+}
+
+const struct maio_operation demo_strategy = {
+ .max_io = 1,
+ .init = demo_init,
+ .exit = demo_exit,
+ .load = demo_load,
+ .evict = demo_evict,
+};
+
+struct maio_operation *register_strategy()
+{
+ return (struct maio_operation *)&demo_strategy;
+}
diff --git a/tools/mfs/maio-utils/test/trace_analyse.py b/tools/mfs/maio-utils/test/trace_analyse.py
new file mode 100644
index 000000000000..c25d1ce364df
--- /dev/null
+++ b/tools/mfs/maio-utils/test/trace_analyse.py
@@ -0,0 +1,87 @@
+def merge_intervals(intervals):
+ """
+ 合并区间
+ :param intervals: 区间列表,每个区间是一个元组 (start, end)
+ :return: 合并后的区间列表
+ """
+ if not intervals:
+ return []
+
+ # 按起始位置排序
+ intervals.sort(key=lambda x: x[0])
+
+ merged = [intervals[0]]
+ for current in intervals[1:]:
+ last_merged = merged[-1]
+ if current[0] <= last_merged[1]: # 有重叠
+ merged[-1] = (last_merged[0], max(last_merged[1], current[1]))
+ else:
+ merged.append(current)
+
+ return merged
+
+def process_file(input_file):
+ """
+ 处理输入文件
+ :param input_file: 输入文件路径
+ :return: 每个文件的合并区间和总大小
+ """
+ from collections import defaultdict
+
+ # 使用字典按文件名分组
+ file_intervals = defaultdict(list)
+
+ with open(input_file, 'r') as f:
+ for line in f:
+ parts = line.strip().split()
+ if len(parts) < 3:
+ print(f"跳过无效行: {line.strip()}")
+ continue
+
+ # 解析文件名、起始位置、大小
+ filename, start, size = parts[:3]
+ start = int(start)
+ size = int(size)
+
+ # 计算区间
+ end = start + size
+ file_intervals[filename].append((start, end))
+
+ # 合并每个文件的区间
+ merged_results = {}
+ for filename, intervals in file_intervals.items():
+ merged_intervals = merge_intervals(intervals)
+ # 计算合并后的区间总大小
+ total_size = sum(end - start for start, end in merged_intervals)
+ merged_results[filename] = {
+ "merged_intervals": merged_intervals,
+ "total_size": total_size
+ }
+
+ return merged_results
+
+def main():
+ import sys
+
+ if len(sys.argv) != 2:
+ print("用法: python script.py <输入文件>")
+ sys.exit(1)
+
+ input_file = sys.argv[1]
+
+ merged_results = process_file(input_file)
+
+ whole_result=0
+
+ for filename, result in merged_results.items():
+ print(f"文件: {filename}")
+ # print("合并后的区间:")
+ # for interval in result["merged_intervals"]:
+ # print(f"起始位置: {interval[0]}, 结束位置: {interval[1]}")
+ print(f"所有区间内数据的总大小: {result['total_size']} 字节")
+ print("-" * 40)
+ whole_result+=result['total_size']
+ print(f"所有文件内数据的总大小: {whole_result} 字节")
+
+if __name__ == "__main__":
+ main()
diff --git a/tools/mfs/maio-utils/test/trace_gen.py b/tools/mfs/maio-utils/test/trace_gen.py
new file mode 100644
index 000000000000..d050fa48801c
--- /dev/null
+++ b/tools/mfs/maio-utils/test/trace_gen.py
@@ -0,0 +1,116 @@
+import argparse
+import os
+import sys
+
+FILENAME_IDX = 0
+OFF_IDX = 1
+LEN_IDX = 2
+NUMA_IDX = 3
+OFF_BOUND = 1024 * 1024 * 4096
+
+class ReplayHintGenerator(object):
+ def __init__(self, trace_file, output_file):
+ self.output_file = output_file
+ self.trace_file = trace_file
+ self.data_list = []
+
+ def generate_hint(self):
+ with open(self.output_file, 'w') as f:
+ for data in self.data_list:
+ data_off = data[OFF_IDX]
+ data_len = data[LEN_IDX]
+ print(data_off)
+ step = (512 * 1024 * 1024)
+ while data_len:
+ if data_len < step:
+ step = data_len
+ print(f"{data[FILENAME_IDX]} {data_off} {step} {data[NUMA_IDX]}", file=f)
+ data_len-=step
+ data_off+=step
+
+ def parse_trace(self):
+ file_map_index={}
+ with open(self.trace_file, 'r') as f:
+ for line in f:
+ parts = line.strip().split(' ')
+
+ filename, off, length, numaid = parts
+ off = int(off)
+ length = int(length)
+ numaid = int(numaid)
+ if self.data_list:
+ tail_data = self.data_list[-1]
+ '''
+ 首先尝试跟末尾的数据合并:
+ 如若跟末尾的数据是同一个文件跟numaid,
+ 两者之间的距离小于设定值,
+ 就进行合并
+ '''
+ if (tail_data[FILENAME_IDX] == filename and tail_data[NUMA_IDX] == numaid):
+ tail_off = tail_data[OFF_IDX]
+ tail_end = tail_data[OFF_IDX] + tail_data[LEN_IDX]
+ end = off + length
+ abs_off = max(abs(off - tail_end), abs(tail_off - end))
+ if abs_off < OFF_BOUND:
+ tail_data[OFF_IDX] = min(tail_off, off)
+ tail_data[LEN_IDX] = max(tail_end, end) - tail_data[OFF_IDX]
+ continue
+ else:
+ '''
+ 否则跟之前同一文件和numaid的数据进行合并
+ 如若两者之间距离小于设定值,
+ 就进行合并
+ '''
+ key = (filename, numaid)
+ if key in file_map_index:
+ index = file_map_index[key]
+ data = self.data_list[index]
+ data_off = data[OFF_IDX]
+ data_end = data[OFF_IDX] + data[LEN_IDX]
+ end = off + length
+ abs_off = max(abs(off - data_end), abs(data_off - end))
+ if abs_off < OFF_BOUND:
+ data[OFF_IDX] = min(data_off, off)
+ data[LEN_IDX] = max(data_end, end) - data[OFF_IDX]
+ continue
+ self.data_list.append([filename, off, length, numaid])
+ file_map_index[(filename, numaid)] = len(self.data_list) - 1
+
+
+def main(agrv):
+ """
+ Parse the argument
+ """
+ parser = argparse.ArgumentParser('model trace parser')
+ parser.add_argument('--trace_file',
+ required=True,
+ type=str,
+ help='trace source')
+ parser.add_argument('--output_file',
+ required=True,
+ type=str,
+ help='output of hint file')
+ try:
+ args = parser.parse_args()
+ trace_file = args.trace_file
+ output_file = args.output_file
+ if not os.path.exists(trace_file):
+ print("{trace_file} not exist")
+ return -1
+
+ hint_generator=ReplayHintGenerator(trace_file, output_file)
+ hint_generator.parse_trace()
+ hint_generator.generate_hint()
+ return 0
+ except Exception as e:
+ print(f"Parse model trace exception:{str(e)}")
+ return -1
+
+if __name__ == '__main__':
+ try:
+ ret = main(sys.argv[1:])
+ except Exception as main_e:
+ print(str(main_e))
+ ret = -1
+ sys.exit(ret)
+
--
2.34.1
2
1
[PATCH OLK-6.6] bpf, arm64: Force 8-byte alignment for JIT buffer to prevent atomic tearing
by Pu Lehui 19 Mar '26
by Pu Lehui 19 Mar '26
19 Mar '26
From: Fuad Tabba <tabba(a)google.com>
mainline inclusion
from mainline-v7.0-rc2
commit ef06fd16d48704eac868441d98d4ef083d8f3d07
category: bugfix
bugzilla: https://atomgit.com/openeuler/kernel/issues/8762
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?…
--------------------------------
struct bpf_plt contains a u64 target field. Currently, the BPF JIT
allocator requests an alignment of 4 bytes (sizeof(u32)) for the JIT
buffer.
Because the base address of the JIT buffer can be 4-byte aligned (e.g.,
ending in 0x4 or 0xc), the relative padding logic in build_plt() fails
to ensure that target lands on an 8-byte boundary.
This leads to two issues:
1. UBSAN reports misaligned-access warnings when dereferencing the
structure.
2. More critically, target is updated concurrently via WRITE_ONCE() in
bpf_arch_text_poke() while the JIT'd code executes ldr. On arm64,
64-bit loads/stores are only guaranteed to be single-copy atomic if
they are 64-bit aligned. A misaligned target risks a torn read,
causing the JIT to jump to a corrupted address.
Fix this by increasing the allocation alignment requirement to 8 bytes
(sizeof(u64)) in bpf_jit_binary_pack_alloc(). This anchors the base of
the JIT buffer to an 8-byte boundary, allowing the relative padding math
in build_plt() to correctly align the target field.
Fixes: b2ad54e1533e ("bpf, arm64: Implement bpf_arch_text_poke() for arm64")
Signed-off-by: Fuad Tabba <tabba(a)google.com>
Acked-by: Will Deacon <will(a)kernel.org>
Link: https://lore.kernel.org/r/20260226075525.233321-1-tabba@google.com
Signed-off-by: Alexei Starovoitov <ast(a)kernel.org>
Conflicts:
arch/arm64/net/bpf_jit_comp.c
[The conflicts were due to not merge commit 1dad391daef1]
Signed-off-by: Pu Lehui <pulehui(a)huawei.com>
---
arch/arm64/net/bpf_jit_comp.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c
index f98ae8bf4e19..2494a5c96489 100644
--- a/arch/arm64/net/bpf_jit_comp.c
+++ b/arch/arm64/net/bpf_jit_comp.c
@@ -1648,7 +1648,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
extable_offset = round_up(prog_size + PLT_TARGET_SIZE, extable_align);
image_size = extable_offset + extable_size;
header = bpf_jit_binary_alloc(image_size, &image_ptr,
- sizeof(u32), jit_fill_hole);
+ sizeof(u64), jit_fill_hole);
if (header == NULL) {
prog = orig_prog;
goto out_off;
--
2.34.1
2
1
[PATCH OLK-6.6] mm/shrinker: Implementation of register_shrinker() and unregister_shrinker()
by Zhang Qilong 19 Mar '26
by Zhang Qilong 19 Mar '26
19 Mar '26
hulk inclusion
category: feature
bugzilla: https://atomgit.com/openeuler/kernel/issues/8743
--------------------------------
The commit 6a317cb001d1 ("mm: shrinker: remove old APIs") removed
old shrinker API. In order to maintain the compatibility of the
KABI interface, it is necessary to adapt the old interface to
connect to the new shrinking function.
Signed-off-by: Zhang Qilong <zhangqilong3(a)huawei.com>
---
include/linux/shrinker.h | 5 +++--
mm/vmscan.c | 44 +++++++++++++++++++++++++++++++++++-----
2 files changed, 42 insertions(+), 7 deletions(-)
diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h
index 7091c5d21a9f..3467b18a8dbb 100644
--- a/include/linux/shrinker.h
+++ b/include/linux/shrinker.h
@@ -72,10 +72,11 @@ struct shrink_control {
};
#define SHRINK_STOP (~0UL)
#define SHRINK_EMPTY (~0UL - 1)
+struct shrinker;
/*
* A callback you can register to apply pressure to ageable caches.
*
* @count_objects should return the number of freeable items in the cache. If
* there are no objects to free, it should return SHRINK_EMPTY, while 0 is
@@ -118,11 +119,11 @@ struct shrinker_v2 {
struct dentry *debugfs_entry;
#endif
/* objs pending delete, per node */
atomic_long_t *nr_deferred;
- KABI_RESERVE(1)
+ KABI_USE(1, struct shrinker *v1)
KABI_RESERVE(2)
/*
* The reference count of this shrinker. Registered shrinker have an
* initial refcount of 1, then the lookup operations are now allowed
@@ -161,11 +162,11 @@ struct shrinker {
const char *name;
struct dentry *debugfs_entry;
#endif
atomic_long_t *nr_deferred;
- KABI_RESERVE(1)
+ KABI_USE(1, struct shrinker_v2 *v2)
KABI_RESERVE(2)
};
#define DEFAULT_SEEKS 2 /* A good number if you don't know better. */
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 9e4aedbfd85e..cf6b9e6dc8cd 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -7608,22 +7608,56 @@ void check_move_unevictable_folios(struct folio_batch *fbatch)
count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned);
}
}
EXPORT_SYMBOL_GPL(check_move_unevictable_folios);
+static unsigned long shrinker_v2_to_v1_count_objects(struct shrinker_v2 *shk_v2,
+ struct shrink_control *sc)
+{
+ if (shk_v2->v1)
+ return shk_v2->v1->count_objects(shk_v2->v1, sc);
+
+ return 0;
+}
+
+static unsigned long shrinker_v2_to_v1_scan_objects(struct shrinker_v2 *shk_v2,
+ struct shrink_control *sc)
+{
+ if (shk_v2->v1)
+ return shk_v2->v1->count_objects(shk_v2->v1, sc);
+
+ return SHRINK_EMPTY;
+}
+
int register_shrinker(struct shrinker *shrinker, const char *fmt, ...)
{
- pr_warn_once("%s interface is deprecated. Please use instead shrinker_register()\n",
- __func__);
- return -ENOSYS;
+ struct shrinker_v2 *shk_v2;
+
+ shk_v2 = shrinker_alloc(shrinker->flags, fmt);
+ if (!shk_v2)
+ return -ENOMEM;
+
+ shk_v2->count_objects = shrinker_v2_to_v1_count_objects;
+ shk_v2->scan_objects = shrinker_v2_to_v1_scan_objects;
+ shk_v2->batch = shrinker->batch;
+ shk_v2->seeks = shrinker->seeks;
+ shk_v2->flags |= shrinker->flags;
+ shk_v2->v1 = shrinker;
+
+ shrinker_register(shk_v2);
+ shrinker->v2 = shk_v2;
+
+ return 0;
}
EXPORT_SYMBOL(register_shrinker);
void unregister_shrinker(struct shrinker *shrinker)
{
- pr_warn_once("%s interface is deprecated. Please use instead shrinker_free()\n",
- __func__);
+ if (shrinker->v2) {
+ shrinker_free(shrinker->v2);
+ shrinker->v2 = NULL;
+ }
}
EXPORT_SYMBOL(unregister_shrinker);
void synchronize_shrinkers(void)
{
--
2.43.0
2
1
[PATCH OLK-5.10] mm/shrinker: Implementation of register_shrinker() and unregister_shrinker()
by Zhang Qilong 19 Mar '26
by Zhang Qilong 19 Mar '26
19 Mar '26
hulk inclusion
category: feature
bugzilla: https://atomgit.com/openeuler/kernel/issues/8743
--------------------------------
The commit 6a317cb001d1 ("mm: shrinker: remove old APIs") removed
old shrinker API. In order to maintain the compatibility of the
KABI interface, it is necessary to adapt the old interface to
connect to the new shrinking function.
Signed-off-by: Zhang Qilong <zhangqilong3(a)huawei.com>
---
include/linux/shrinker.h | 5 +++--
mm/vmscan.c | 44 +++++++++++++++++++++++++++++++++++-----
2 files changed, 42 insertions(+), 7 deletions(-)
diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h
index 7091c5d21a9f..3467b18a8dbb 100644
--- a/include/linux/shrinker.h
+++ b/include/linux/shrinker.h
@@ -72,10 +72,11 @@ struct shrink_control {
};
#define SHRINK_STOP (~0UL)
#define SHRINK_EMPTY (~0UL - 1)
+struct shrinker;
/*
* A callback you can register to apply pressure to ageable caches.
*
* @count_objects should return the number of freeable items in the cache. If
* there are no objects to free, it should return SHRINK_EMPTY, while 0 is
@@ -118,11 +119,11 @@ struct shrinker_v2 {
struct dentry *debugfs_entry;
#endif
/* objs pending delete, per node */
atomic_long_t *nr_deferred;
- KABI_RESERVE(1)
+ KABI_USE(1, struct shrinker *v1)
KABI_RESERVE(2)
/*
* The reference count of this shrinker. Registered shrinker have an
* initial refcount of 1, then the lookup operations are now allowed
@@ -161,11 +162,11 @@ struct shrinker {
const char *name;
struct dentry *debugfs_entry;
#endif
atomic_long_t *nr_deferred;
- KABI_RESERVE(1)
+ KABI_USE(1, struct shrinker_v2 *v2)
KABI_RESERVE(2)
};
#define DEFAULT_SEEKS 2 /* A good number if you don't know better. */
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 9e4aedbfd85e..cf6b9e6dc8cd 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -7608,22 +7608,56 @@ void check_move_unevictable_folios(struct folio_batch *fbatch)
count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned);
}
}
EXPORT_SYMBOL_GPL(check_move_unevictable_folios);
+static unsigned long shrinker_v2_to_v1_count_objects(struct shrinker_v2 *shk_v2,
+ struct shrink_control *sc)
+{
+ if (shk_v2->v1)
+ return shk_v2->v1->count_objects(shk_v2->v1, sc);
+
+ return 0;
+}
+
+static unsigned long shrinker_v2_to_v1_scan_objects(struct shrinker_v2 *shk_v2,
+ struct shrink_control *sc)
+{
+ if (shk_v2->v1)
+ return shk_v2->v1->count_objects(shk_v2->v1, sc);
+
+ return SHRINK_EMPTY;
+}
+
int register_shrinker(struct shrinker *shrinker, const char *fmt, ...)
{
- pr_warn_once("%s interface is deprecated. Please use instead shrinker_register()\n",
- __func__);
- return -ENOSYS;
+ struct shrinker_v2 *shk_v2;
+
+ shk_v2 = shrinker_alloc(shrinker->flags, fmt);
+ if (!shk_v2)
+ return -ENOMEM;
+
+ shk_v2->count_objects = shrinker_v2_to_v1_count_objects;
+ shk_v2->scan_objects = shrinker_v2_to_v1_scan_objects;
+ shk_v2->batch = shrinker->batch;
+ shk_v2->seeks = shrinker->seeks;
+ shk_v2->flags |= shrinker->flags;
+ shk_v2->v1 = shrinker;
+
+ shrinker_register(shk_v2);
+ shrinker->v2 = shk_v2;
+
+ return 0;
}
EXPORT_SYMBOL(register_shrinker);
void unregister_shrinker(struct shrinker *shrinker)
{
- pr_warn_once("%s interface is deprecated. Please use instead shrinker_free()\n",
- __func__);
+ if (shrinker->v2) {
+ shrinker_free(shrinker->v2);
+ shrinker->v2 = NULL;
+ }
}
EXPORT_SYMBOL(unregister_shrinker);
void synchronize_shrinkers(void)
{
--
2.43.0
2
1
您好!
Kernel 邀请您参加 2026-03-20 14:00 召开的WeLink会议(自动录制)
会议主题:openEuler Kernel SIG双周例会
会议发起人: LiaoTao_Wave
会议内容:
1. 进展update
2. 议题征集中
(新增议题可直接填报至会议纪要看板)
会议链接:https://meeting.huaweicloud.com:36443/#/j/986250028
会议纪要:https://etherpad.openeuler.org/p/Kernel-meetings
更多资讯尽在:https://www.openeuler.org/zh/
Hello!
Kernel invites you to attend the WeLink conference(auto recording) will be held at 2026-03-20 14:00
The subject of the conference is openEuler Kernel SIG双周例会
The sponsor of the conference is LiaoTao_Wave
Summary:
1. 进展update
2. 议题征集中
(新增议题可直接填报至会议纪要看板)
You can join the meeting at https://meeting.huaweicloud.com:36443/#/j/986250028
Add topics at https://etherpad.openeuler.org/p/Kernel-meetings
More information: https://www.openeuler.org/en/
1
0
From: Yang Yingliang <yangyingliang(a)huawei.com>
hulk inclusion
category: bugfix
bugzilla: https://atomgit.com/openeuler/kernel/issues/8748
CVE: NA
--------------------------------------------------
The memory stat is not done in real time, it have some gap with
real value. In CPU-less NUMA node, the values of MemTotal and
MemFree can be nearly equal, the gap make cause MemFree is bigger
than MemTotal, it leads MemUsed is negative which print as a large
positive number.
cat /sys/devices/system/node/node17/meminfo
Node 17 MemTotal: 4194304 kB
Node 17 MemFree: 4195552 kB
Node 17 MemUsed: 18446744073709550368 kB
Node 17 Active: 52 kB
Node 17 Inactive: 320 kB
Node 17 Active(anon): 0 kB
Node 17 Inactive(anon): 0 kB
Node 17 Active(file): 52 kB
Node 17 Inactive(file): 320 kB
Node 17 Unevictable: 0 kB
Node 17 Mlocked: 0 kB
Node 17 Dirty: 0 kB
Node 17 Writeback: 0 kB
Node 17 FilePages: 372 kB
Node 17 Mapped: 320 kB
Node 17 AnonPages: 0 kB
Node 17 Shmem: 0 kB
Node 17 KernelStack: 0 kB
Node 17 PageTables: 0 kB
Node 17 NFS_Unstable: 0 kB
Node 17 Bounce: 0 kB
Node 17 WritebackTmp: 0 kB
Node 17 KReclaimable: 0 kB
Node 17 Slab: 0 kB
Node 17 SReclaimable: 0 kB
Node 17 SUnreclaim: 0 kB
Node 17 AnonHugePages: 79872 kB
Node 17 ShmemHugePages: 0 kB
Node 17 ShmemPmdMapped: 0 kB
Node 17 FileHugePages: 0 kB
Node 17 FilePmdMapped: 0 kB
Node 17 HugePages_Total: 0
Node 17 HugePages_Free: 0
Node 17 HugePages_Surp: 0
To avoid this exception, make MemFree equals MemTotal, when MemFree
is bigger than MemTotal
Signed-off-by: Yang Yingliang <yangyingliang(a)huawei.com>
---
mm/page_alloc.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 72d5303f8000..02db21de93aa 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5837,6 +5837,7 @@ void si_meminfo_node(struct sysinfo *val, int nid)
val->totalram = managed_pages;
val->sharedram = node_page_state(pgdat, NR_SHMEM);
val->freeram = sum_zone_node_page_state(nid, NR_FREE_PAGES);
+ val->freeram = min(val->freeram, val->totalram);
#ifdef CONFIG_HIGHMEM
for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
struct zone *zone = &pgdat->node_zones[zone_type];
--
2.33.0
2
1
From: Yang Yingliang <yangyingliang(a)huawei.com>
The memory stat is not done in real time, it have some gap with
real value. In CPU-less NUMA node, the values of MemTotal and
MemFree can be nearly equal, the gap make cause MemFree is bigger
than MemTotal, it leads MemUsed is negative which print as a large
positive number.
cat /sys/devices/system/node/node17/meminfo
Node 17 MemTotal: 4194304 kB
Node 17 MemFree: 4195552 kB
Node 17 MemUsed: 18446744073709550368 kB
Node 17 Active: 52 kB
Node 17 Inactive: 320 kB
Node 17 Active(anon): 0 kB
Node 17 Inactive(anon): 0 kB
Node 17 Active(file): 52 kB
Node 17 Inactive(file): 320 kB
Node 17 Unevictable: 0 kB
Node 17 Mlocked: 0 kB
Node 17 Dirty: 0 kB
Node 17 Writeback: 0 kB
Node 17 FilePages: 372 kB
Node 17 Mapped: 320 kB
Node 17 AnonPages: 0 kB
Node 17 Shmem: 0 kB
Node 17 KernelStack: 0 kB
Node 17 PageTables: 0 kB
Node 17 NFS_Unstable: 0 kB
Node 17 Bounce: 0 kB
Node 17 WritebackTmp: 0 kB
Node 17 KReclaimable: 0 kB
Node 17 Slab: 0 kB
Node 17 SReclaimable: 0 kB
Node 17 SUnreclaim: 0 kB
Node 17 AnonHugePages: 79872 kB
Node 17 ShmemHugePages: 0 kB
Node 17 ShmemPmdMapped: 0 kB
Node 17 FileHugePages: 0 kB
Node 17 FilePmdMapped: 0 kB
Node 17 HugePages_Total: 0
Node 17 HugePages_Free: 0
Node 17 HugePages_Surp: 0
To avoid this exception, make MemFree equals MemTotal, when MemFree
is bigger than MemTotal
Signed-off-by: Yang Yingliang <yangyingliang(a)huawei.com>
---
mm/page_alloc.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 72d5303f8000..02db21de93aa 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5837,6 +5837,7 @@ void si_meminfo_node(struct sysinfo *val, int nid)
val->totalram = managed_pages;
val->sharedram = node_page_state(pgdat, NR_SHMEM);
val->freeram = sum_zone_node_page_state(nid, NR_FREE_PAGES);
+ val->freeram = min(val->freeram, val->totalram);
#ifdef CONFIG_HIGHMEM
for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
struct zone *zone = &pgdat->node_zones[zone_type];
--
2.33.0
2
1
17 Mar '26
hulk inclusion
category: bugfix
bugzilla: https://atomgit.com/openeuler/kernel/issues/8696
Reference: https://lore.kernel.org/netdev/20251215145537.5085-1-m.lobanov@rosa.ru/
----------------------------------------
A reproducible rcuref - imbalanced put() warning is observed under
IPv6 L2TP (pppol2tp) traffic with blackhole routes, indicating an
imbalance in dst reference counting for routes cached in
sk->sk_dst_cache and pointing to a subtle lifetime/synchronization
issue between the helpers that validate and drop cached dst entries.
rcuref - imbalanced put()
WARNING: CPU: 0 PID: 899 at lib/rcuref.c:266 rcuref_put_slowpath+0x1ce/0x240 lib/rcuref.c:266
Modules linked in:
CPSocket connected tcp:127.0.0.1:48148,server=on <-> 127.0.0.1:33750
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.3-debian-1.16.3-2 04/01/2014
RIP: 0010:rcuref_put_slowpath+0x1ce/0x240 lib/rcuref.c:266
Call Trace:
<TASK>
__rcuref_put include/linux/rcuref.h:97 [inline]
rcuref_put include/linux/rcuref.h:153 [inline]
dst_release+0x291/0x310 net/core/dst.c:167
__sk_dst_check+0x2d4/0x350 net/core/sock.c:604
__inet6_csk_dst_check net/ipv6/inet6_connection_sock.c:76 [inline]
inet6_csk_route_socket+0x6ed/0x10c0 net/ipv6/inet6_connection_sock.c:104
inet6_csk_xmit+0x12f/0x740 net/ipv6/inet6_connection_sock.c:121
l2tp_xmit_queue net/l2tp/l2tp_core.c:1214 [inline]
l2tp_xmit_core net/l2tp/l2tp_core.c:1309 [inline]
l2tp_xmit_skb+0x1404/0x1910 net/l2tp/l2tp_core.c:1325
pppol2tp_sendmsg+0x3ca/0x550 net/l2tp/l2tp_ppp.c:302
sock_sendmsg_nosec net/socket.c:729 [inline]
__sock_sendmsg net/socket.c:744 [inline]
____sys_sendmsg+0xab2/0xc70 net/socket.c:2609
___sys_sendmsg+0x11d/0x1c0 net/socket.c:2663
__sys_sendmmsg+0x188/0x450 net/socket.c:2749
__do_sys_sendmmsg net/socket.c:2778 [inline]
__se_sys_sendmmsg net/socket.c:2775 [inline]
__x64_sys_sendmmsg+0x98/0x100 net/socket.c:2775
do_syscall_x64 arch/x86/entry/common.c:52 [inline]
do_syscall_64+0x64/0x140 arch/x86/entry/common.c:83
entry_SYSCALL_64_after_hwframe+0x76/0x7e
RIP: 0033:0x7fe6960ec719
</TASK>
The race occurs between the lockless UDPv6 transmit path
(udpv6_sendmsg() -> sk_dst_check()) and the locked L2TP/pppol2tp
transmit path (pppol2tp_sendmsg() -> l2tp_xmit_skb() ->
... -> inet6_csk_xmit() → __sk_dst_check()), when both handle
the same obsolete dst from sk->sk_dst_cache: the UDPv6 side takes
an extra reference and atomically steals and releases the cached
dst, while the L2TP side, using a stale cached pointer, still
calls dst_release() on it, and together these updates produce
an extra final dst_release() on that dst, triggering
rcuref - imbalanced put().
The Race Condition:
Initial:
sk->sk_dst_cache = dst
ref(dst) = 1
Thread 1: sk_dst_check() Thread 2: __sk_dst_check()
------------------------ ----------------------------
sk_dst_get(sk):
rcu_read_lock()
dst = rcu_dereference(sk->sk_dst_cache)
rcuref_get(dst) succeeds
rcu_read_unlock()
// ref = 2
dst = __sk_dst_get(sk)
// reads same dst from sk_dst_cache
// ref still = 2 (no extra get)
[both see dst obsolete & check() == NULL]
sk_dst_reset(sk):
old = xchg(&sk->sk_dst_cache, NULL)
// old = dst
dst_release(old)
// drop cached ref
// ref: 2 -> 1
RCU_INIT_POINTER(sk->sk_dst_cache, NULL)
// cache already NULL after xchg
dst_release(dst)
// ref: 1 -> 0
dst_release(dst)
// tries to drop its own ref after final put
// rcuref_put_slowpath() -> "rcuref - imbalanced put()"
Make L2TP's IPv6 transmit path stop using inet6_csk_xmit()
(and thus __sk_dst_check()) and instead open-code the same
routing and transmit sequence using the existing sk_dst_check()
with dst reference counting and ip6_dst_lookup_flow(). The new code builds
a flowi6 from the socket fields in the same way as
inet6_csk_route_socket(), then performs dst lookup using
sk_dst_check() with proper reference counting, attaches the
resulting dst to the skb via skb_dst_set(), and finally invokes
ip6_xmit() for transmission. This makes both the UDPv6 and L2TP
IPv6 paths use the same dst-cache handling logic for a given
socket and removes the possibility that the lockless sk_dst_check()
and the locked __sk_dst_check() concurrently drop the same cached
dst and trigger the rcuref - imbalanced put() warning under
concurrent traffic.
Found by Linux Verification Center (linuxtesting.org) with Syzkaller.
Fixes: b0270e91014d ("ipv4: add a sock pointer to ip_queue_xmit()")
Signed-off-by: Li Xiasong <lixiasong1(a)huawei.com>
---
net/l2tp/l2tp_core.c | 60 +++++++++++++++++++++++++++++++++++++++++---
1 file changed, 57 insertions(+), 3 deletions(-)
diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c
index 97350350478d..80a20c079a9a 100644
--- a/net/l2tp/l2tp_core.c
+++ b/net/l2tp/l2tp_core.c
@@ -995,19 +995,73 @@ static int l2tp_build_l2tpv3_header(struct l2tp_session *session, void *buf)
return bufp - optr;
}
+#if IS_ENABLED(CONFIG_IPV6)
+static int l2tp_xmit_ipv6(struct sock *sk, struct sk_buff *skb)
+{
+ struct ipv6_pinfo *np = inet6_sk(sk);
+ struct inet_sock *inet = inet_sk(sk);
+ struct in6_addr *final_p, final;
+ struct dst_entry *dst;
+ struct flowi6 fl6;
+ int err;
+
+ memset(&fl6, 0, sizeof(fl6));
+ fl6.flowi6_proto = sk->sk_protocol;
+ fl6.daddr = sk->sk_v6_daddr;
+ fl6.saddr = np->saddr;
+ fl6.flowlabel = np->flow_label;
+ IP6_ECN_flow_xmit(sk, fl6.flowlabel);
+
+ fl6.flowi6_oif = sk->sk_bound_dev_if;
+ fl6.flowi6_mark = sk->sk_mark;
+ fl6.fl6_sport = inet->inet_sport;
+ fl6.fl6_dport = inet->inet_dport;
+ fl6.flowi6_uid = sk->sk_uid;
+
+ security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6));
+
+ rcu_read_lock();
+ final_p = fl6_update_dst(&fl6, rcu_dereference(np->opt), &final);
+ rcu_read_unlock();
+
+ dst = sk_dst_check(sk, np->dst_cookie);
+ if (!dst) {
+ dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p);
+ if (IS_ERR(dst)) {
+ sk->sk_err_soft = -PTR_ERR(dst);
+ sk->sk_route_caps = 0;
+ kfree_skb(skb);
+ return PTR_ERR(dst);
+ }
+
+ ip6_dst_store(sk, dst_clone(dst), NULL, NULL);
+ }
+
+ skb_dst_set(skb, dst);
+ fl6.daddr = sk->sk_v6_daddr;
+
+ rcu_read_lock();
+ err = ip6_xmit(sk, skb, &fl6, sk->sk_mark, rcu_dereference(np->opt),
+ np->tclass, sk->sk_priority);
+ rcu_read_unlock();
+ return err;
+}
+#endif
+
/* Queue the packet to IP for output: tunnel socket lock must be held */
static int l2tp_xmit_queue(struct l2tp_tunnel *tunnel, struct sk_buff *skb, struct flowi *fl)
{
int err;
+ struct sock *sk = tunnel->sock;
skb->ignore_df = 1;
skb_dst_drop(skb);
#if IS_ENABLED(CONFIG_IPV6)
- if (l2tp_sk_is_v6(tunnel->sock))
- err = inet6_csk_xmit(tunnel->sock, skb, NULL);
+ if (l2tp_sk_is_v6(sk))
+ err = l2tp_xmit_ipv6(sk, skb);
else
#endif
- err = ip_queue_xmit(tunnel->sock, skb, fl);
+ err = ip_queue_xmit(sk, skb, fl);
return err >= 0 ? NET_XMIT_SUCCESS : NET_XMIT_DROP;
}
--
2.34.1
2
1
17 Mar '26
hulk inclusion
category: bugfix
bugzilla: https://atomgit.com/openeuler/kernel/issues/8696
Reference: https://lore.kernel.org/netdev/20251215145537.5085-1-m.lobanov@rosa.ru/
----------------------------------------
A reproducible rcuref - imbalanced put() warning is observed under
IPv6 L2TP (pppol2tp) traffic with blackhole routes, indicating an
imbalance in dst reference counting for routes cached in
sk->sk_dst_cache and pointing to a subtle lifetime/synchronization
issue between the helpers that validate and drop cached dst entries.
rcuref - imbalanced put()
WARNING: CPU: 0 PID: 899 at lib/rcuref.c:266 rcuref_put_slowpath+0x1ce/0x240 lib/rcuref.c:266
Modules linked in:
CPSocket connected tcp:127.0.0.1:48148,server=on <-> 127.0.0.1:33750
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.3-debian-1.16.3-2 04/01/2014
RIP: 0010:rcuref_put_slowpath+0x1ce/0x240 lib/rcuref.c:266
Call Trace:
<TASK>
__rcuref_put include/linux/rcuref.h:97 [inline]
rcuref_put include/linux/rcuref.h:153 [inline]
dst_release+0x291/0x310 net/core/dst.c:167
__sk_dst_check+0x2d4/0x350 net/core/sock.c:604
__inet6_csk_dst_check net/ipv6/inet6_connection_sock.c:76 [inline]
inet6_csk_route_socket+0x6ed/0x10c0 net/ipv6/inet6_connection_sock.c:104
inet6_csk_xmit+0x12f/0x740 net/ipv6/inet6_connection_sock.c:121
l2tp_xmit_queue net/l2tp/l2tp_core.c:1214 [inline]
l2tp_xmit_core net/l2tp/l2tp_core.c:1309 [inline]
l2tp_xmit_skb+0x1404/0x1910 net/l2tp/l2tp_core.c:1325
pppol2tp_sendmsg+0x3ca/0x550 net/l2tp/l2tp_ppp.c:302
sock_sendmsg_nosec net/socket.c:729 [inline]
__sock_sendmsg net/socket.c:744 [inline]
____sys_sendmsg+0xab2/0xc70 net/socket.c:2609
___sys_sendmsg+0x11d/0x1c0 net/socket.c:2663
__sys_sendmmsg+0x188/0x450 net/socket.c:2749
__do_sys_sendmmsg net/socket.c:2778 [inline]
__se_sys_sendmmsg net/socket.c:2775 [inline]
__x64_sys_sendmmsg+0x98/0x100 net/socket.c:2775
do_syscall_x64 arch/x86/entry/common.c:52 [inline]
do_syscall_64+0x64/0x140 arch/x86/entry/common.c:83
entry_SYSCALL_64_after_hwframe+0x76/0x7e
RIP: 0033:0x7fe6960ec719
</TASK>
The race occurs between the lockless UDPv6 transmit path
(udpv6_sendmsg() -> sk_dst_check()) and the locked L2TP/pppol2tp
transmit path (pppol2tp_sendmsg() -> l2tp_xmit_skb() ->
... -> inet6_csk_xmit() → __sk_dst_check()), when both handle
the same obsolete dst from sk->sk_dst_cache: the UDPv6 side takes
an extra reference and atomically steals and releases the cached
dst, while the L2TP side, using a stale cached pointer, still
calls dst_release() on it, and together these updates produce
an extra final dst_release() on that dst, triggering
rcuref - imbalanced put().
The Race Condition:
Initial:
sk->sk_dst_cache = dst
ref(dst) = 1
Thread 1: sk_dst_check() Thread 2: __sk_dst_check()
------------------------ ----------------------------
sk_dst_get(sk):
rcu_read_lock()
dst = rcu_dereference(sk->sk_dst_cache)
rcuref_get(dst) succeeds
rcu_read_unlock()
// ref = 2
dst = __sk_dst_get(sk)
// reads same dst from sk_dst_cache
// ref still = 2 (no extra get)
[both see dst obsolete & check() == NULL]
sk_dst_reset(sk):
old = xchg(&sk->sk_dst_cache, NULL)
// old = dst
dst_release(old)
// drop cached ref
// ref: 2 -> 1
RCU_INIT_POINTER(sk->sk_dst_cache, NULL)
// cache already NULL after xchg
dst_release(dst)
// ref: 1 -> 0
dst_release(dst)
// tries to drop its own ref after final put
// rcuref_put_slowpath() -> "rcuref - imbalanced put()"
Make L2TP's IPv6 transmit path stop using inet6_csk_xmit()
(and thus __sk_dst_check()) and instead open-code the same
routing and transmit sequence using the existing sk_dst_check()
with dst reference counting and ip6_dst_lookup_flow(). The new code builds
a flowi6 from the socket fields in the same way as
inet6_csk_route_socket(), then performs dst lookup using
sk_dst_check() with proper reference counting, attaches the
resulting dst to the skb via skb_dst_set(), and finally invokes
ip6_xmit() for transmission. This makes both the UDPv6 and L2TP
IPv6 paths use the same dst-cache handling logic for a given
socket and removes the possibility that the lockless sk_dst_check()
and the locked __sk_dst_check() concurrently drop the same cached
dst and trigger the rcuref - imbalanced put() warning under
concurrent traffic.
Found by Linux Verification Center (linuxtesting.org) with Syzkaller.
Fixes: b0270e91014d ("ipv4: add a sock pointer to ip_queue_xmit()")
Signed-off-by: Li Xiasong <lixiasong1(a)huawei.com>
---
net/l2tp/l2tp_core.c | 60 +++++++++++++++++++++++++++++++++++++++++---
1 file changed, 57 insertions(+), 3 deletions(-)
diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c
index f85b383811d5..79a5f635b049 100644
--- a/net/l2tp/l2tp_core.c
+++ b/net/l2tp/l2tp_core.c
@@ -1000,19 +1000,73 @@ static int l2tp_build_l2tpv3_header(struct l2tp_session *session, void *buf)
return bufp - optr;
}
+#if IS_ENABLED(CONFIG_IPV6)
+static int l2tp_xmit_ipv6(struct sock *sk, struct sk_buff *skb)
+{
+ struct ipv6_pinfo *np = inet6_sk(sk);
+ struct inet_sock *inet = inet_sk(sk);
+ struct in6_addr *final_p, final;
+ struct dst_entry *dst;
+ struct flowi6 fl6;
+ int err;
+
+ memset(&fl6, 0, sizeof(fl6));
+ fl6.flowi6_proto = sk->sk_protocol;
+ fl6.daddr = sk->sk_v6_daddr;
+ fl6.saddr = np->saddr;
+ fl6.flowlabel = np->flow_label;
+ IP6_ECN_flow_xmit(sk, fl6.flowlabel);
+
+ fl6.flowi6_oif = sk->sk_bound_dev_if;
+ fl6.flowi6_mark = sk->sk_mark;
+ fl6.fl6_sport = inet->inet_sport;
+ fl6.fl6_dport = inet->inet_dport;
+ fl6.flowi6_uid = sk->sk_uid;
+
+ security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6));
+
+ rcu_read_lock();
+ final_p = fl6_update_dst(&fl6, rcu_dereference(np->opt), &final);
+ rcu_read_unlock();
+
+ dst = sk_dst_check(sk, np->dst_cookie);
+ if (!dst) {
+ dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p);
+ if (IS_ERR(dst)) {
+ WRITE_ONCE(sk->sk_err_soft, -PTR_ERR(dst));
+ sk->sk_route_caps = 0;
+ kfree_skb(skb);
+ return PTR_ERR(dst);
+ }
+
+ ip6_dst_store(sk, dst_clone(dst), NULL, NULL);
+ }
+
+ skb_dst_set(skb, dst);
+ fl6.daddr = sk->sk_v6_daddr;
+
+ rcu_read_lock();
+ err = ip6_xmit(sk, skb, &fl6, sk->sk_mark, rcu_dereference(np->opt),
+ np->tclass, sk->sk_priority);
+ rcu_read_unlock();
+ return err;
+}
+#endif
+
/* Queue the packet to IP for output: tunnel socket lock must be held */
static int l2tp_xmit_queue(struct l2tp_tunnel *tunnel, struct sk_buff *skb, struct flowi *fl)
{
int err;
+ struct sock *sk = tunnel->sock;
skb->ignore_df = 1;
skb_dst_drop(skb);
#if IS_ENABLED(CONFIG_IPV6)
- if (l2tp_sk_is_v6(tunnel->sock))
- err = inet6_csk_xmit(tunnel->sock, skb, NULL);
+ if (l2tp_sk_is_v6(sk))
+ err = l2tp_xmit_ipv6(sk, skb);
else
#endif
- err = ip_queue_xmit(tunnel->sock, skb, fl);
+ err = ip_queue_xmit(sk, skb, fl);
return err >= 0 ? NET_XMIT_SUCCESS : NET_XMIT_DROP;
}
--
2.34.1
2
1
Li Xiasong (1):
mptcp: fix soft lockup in mptcp_recvmsg()
Paolo Abeni (1):
mptcp: fix MSG_PEEK stream corruption
net/mptcp/protocol.c | 44 +++++++++++++++++++++++++++++++++-----------
1 file changed, 33 insertions(+), 11 deletions(-)
--
2.34.1
2
3