From 810f52e443a226a73ba20a3dc36cf51107645173 Mon Sep 17 00:00:00 2001 From: Pengda Yang Date: Wed, 15 Mar 2023 16:58:31 +0800 Subject: [PATCH 001/198] limit the field width of 'scanf' Fixes: #2121 Signed-off-by: Pengda Yang --- criu/proc_parse.c | 6 +++--- test/zdtm/lib/fs.c | 2 +- test/zdtm/static/apparmor.c | 2 +- test/zdtm/static/apparmor_stacking.c | 2 +- test/zdtm/static/cgroup01.c | 2 +- test/zdtm/static/cgroup02.c | 2 +- test/zdtm/static/change_mnt_context.c | 2 +- test/zdtm/static/file_locks01.c | 2 +- test/zdtm/static/file_locks02.c | 2 +- test/zdtm/static/file_locks03.c | 2 +- test/zdtm/static/file_locks04.c | 2 +- test/zdtm/static/netns-dev.c | 2 +- test/zdtm/static/ofd_file_locks.c | 2 +- 13 files changed, 15 insertions(+), 15 deletions(-) diff --git a/criu/proc_parse.c b/criu/proc_parse.c index 92655a484d..55aefac7d7 100644 --- a/criu/proc_parse.c +++ b/criu/proc_parse.c @@ -1418,7 +1418,7 @@ static int parse_mountinfo_ent(char *str, struct mount_info *new, char **fsname) goto err; new->mountpoint[0] = '.'; - ret = sscanf(str, "%i %i %u:%u %ms %s %ms %n", &new->mnt_id, &new->parent_mnt_id, &kmaj, &kmin, &new->root, + ret = sscanf(str, "%i %i %u:%u %ms %4094s %ms %n", &new->mnt_id, &new->parent_mnt_id, &kmaj, &kmin, &new->root, new->mountpoint + 1, &opt, &n); if (ret != 7) goto err; @@ -2216,10 +2216,10 @@ static int parse_file_lock_buf(char *buf, struct file_lock *fl, bool is_blocked) char fl_flag[10], fl_type[15], fl_option[10]; if (is_blocked) { - num = sscanf(buf, "%lld: -> %s %s %s %d %x:%x:%ld %lld %s", &fl->fl_id, fl_flag, fl_type, fl_option, + num = sscanf(buf, "%lld: -> %9s %14s %9s %d %x:%x:%ld %lld %31s", &fl->fl_id, fl_flag, fl_type, fl_option, &fl->fl_owner, &fl->maj, &fl->min, &fl->i_no, &fl->start, fl->end); } else { - num = sscanf(buf, "%lld:%s %s %s %d %x:%x:%ld %lld %s", &fl->fl_id, fl_flag, fl_type, fl_option, + num = sscanf(buf, "%lld:%9s %14s %9s %d %x:%x:%ld %lld %31s", &fl->fl_id, fl_flag, fl_type, fl_option, &fl->fl_owner, 
&fl->maj, &fl->min, &fl->i_no, &fl->start, fl->end); } diff --git a/test/zdtm/lib/fs.c b/test/zdtm/lib/fs.c index bf8cd9cd31..efcc7a1d08 100644 --- a/test/zdtm/lib/fs.c +++ b/test/zdtm/lib/fs.c @@ -54,7 +54,7 @@ mnt_info_t *get_cwd_mnt_info(void) while (fgets(str, sizeof(str), f)) { char *hyphen = strchr(str, '-'); - ret = sscanf(str, "%i %i %u:%u %s %s", &mnt_id, &parent_mnt_id, &kmaj, &kmin, root, mountpoint); + ret = sscanf(str, "%i %i %u:%u %4095s %4095s", &mnt_id, &parent_mnt_id, &kmaj, &kmin, root, mountpoint); if (ret != 6 || !hyphen) goto err; ret = sscanf(hyphen + 1, " %ms", &fsname); diff --git a/test/zdtm/static/apparmor.c b/test/zdtm/static/apparmor.c index 713ffaa469..dc16368217 100644 --- a/test/zdtm/static/apparmor.c +++ b/test/zdtm/static/apparmor.c @@ -59,7 +59,7 @@ int checkprofile(void) return -1; } - len = fscanf(f, "%[^ \n]s", profile); + len = fscanf(f, "%1023[^ \n]s", profile); fclose(f); if (len != 1) { fail("wrong number of items scanned %d", len); diff --git a/test/zdtm/static/apparmor_stacking.c b/test/zdtm/static/apparmor_stacking.c index 76de8b8b49..0bc36048cf 100644 --- a/test/zdtm/static/apparmor_stacking.c +++ b/test/zdtm/static/apparmor_stacking.c @@ -56,7 +56,7 @@ static int checkprofile(pid_t pid, char *expected) return -1; } - len = fscanf(f, "%[^ \n]s", profile); + len = fscanf(f, "%1023[^ \n]s", profile); fclose(f); if (len != 1) { fail("wrong number of items scanned %d", len); diff --git a/test/zdtm/static/cgroup01.c b/test/zdtm/static/cgroup01.c index bc8515264d..7bfb677623 100644 --- a/test/zdtm/static/cgroup01.c +++ b/test/zdtm/static/cgroup01.c @@ -79,7 +79,7 @@ int main(int argc, char **argv) if (!s) continue; - sscanf(paux, "%*d %*d %*d:%*d %*s %s", aux); + sscanf(paux, "%*d %*d %*d:%*d %*s %1023s", aux); test_msg("found cgroup at %s\n", aux); for (i = 0; i < 2; i++) { diff --git a/test/zdtm/static/cgroup02.c b/test/zdtm/static/cgroup02.c index 6229a8a089..8a925c0a43 100644 --- a/test/zdtm/static/cgroup02.c +++ 
b/test/zdtm/static/cgroup02.c @@ -75,7 +75,7 @@ bool test_exists(char *mountinfo_line, char *path) char aux[1024], paux[1024]; struct stat st; - sscanf(mountinfo_line, "%*d %*d %*d:%*d %*s %s", aux); + sscanf(mountinfo_line, "%*d %*d %*d:%*d %*s %1023s", aux); test_msg("found cgroup at %s\n", aux); ssprintf(paux, "%s/%s", aux, path); diff --git a/test/zdtm/static/change_mnt_context.c b/test/zdtm/static/change_mnt_context.c index 6d436014b3..8787ae5cf9 100644 --- a/test/zdtm/static/change_mnt_context.c +++ b/test/zdtm/static/change_mnt_context.c @@ -46,7 +46,7 @@ int main(int argc, char **argv) if (!pos) continue; - result = sscanf(pos, " - %*s %*s %s", opts); + result = sscanf(pos, " - %*s %*s %1023s", opts); if (result != 1) { fail("Not able to sscanf line from mountinfo"); goto out; diff --git a/test/zdtm/static/file_locks01.c b/test/zdtm/static/file_locks01.c index beea171f5d..bfdca51d93 100644 --- a/test/zdtm/static/file_locks01.c +++ b/test/zdtm/static/file_locks01.c @@ -107,7 +107,7 @@ static int check_file_lock(int fd, char *expected_type, char *expected_option, u memset(fl_type, 0, sizeof(fl_type)); memset(fl_option, 0, sizeof(fl_option)); - num = sscanf(buf, "%*s %*d:%s %s %s %d %x:%x:%ld %*d %*s", fl_flag, fl_type, fl_option, &fl_owner, &maj, + num = sscanf(buf, "%*s %*d:%15s %15s %15s %d %x:%x:%ld %*d %*s", fl_flag, fl_type, fl_option, &fl_owner, &maj, &min, &i_no); if (num < 7) { pr_err("Invalid lock info\n"); diff --git a/test/zdtm/static/file_locks02.c b/test/zdtm/static/file_locks02.c index d2049ebaa2..ae4827de97 100644 --- a/test/zdtm/static/file_locks02.c +++ b/test/zdtm/static/file_locks02.c @@ -41,7 +41,7 @@ static int check_file_lock(pid_t pid, pid_t child, int fd, char *expected_type, memset(fl_type, 0, sizeof(fl_type)); memset(fl_option, 0, sizeof(fl_option)); - num = sscanf(buf, "%*s %*d:%s %s %s %d", fl_flag, fl_type, fl_option, &fl_owner); + num = sscanf(buf, "%*s %*d:%15s %15s %15s %d", fl_flag, fl_type, fl_option, &fl_owner); if (num < 4) 
{ pr_perror("Invalid lock info."); break; diff --git a/test/zdtm/static/file_locks03.c b/test/zdtm/static/file_locks03.c index 35ef41a21b..228e668925 100644 --- a/test/zdtm/static/file_locks03.c +++ b/test/zdtm/static/file_locks03.c @@ -41,7 +41,7 @@ static int check_file_lock(pid_t pid, pid_t child, int fd, char *expected_type, memset(fl_type, 0, sizeof(fl_type)); memset(fl_option, 0, sizeof(fl_option)); - num = sscanf(buf, "%*s %*d:%s %s %s %d", fl_flag, fl_type, fl_option, &fl_owner); + num = sscanf(buf, "%*s %*d:%15s %15s %15s %d", fl_flag, fl_type, fl_option, &fl_owner); if (num < 4) { pr_perror("Invalid lock info."); break; diff --git a/test/zdtm/static/file_locks04.c b/test/zdtm/static/file_locks04.c index 11d224fa70..7e0d2654e1 100644 --- a/test/zdtm/static/file_locks04.c +++ b/test/zdtm/static/file_locks04.c @@ -34,7 +34,7 @@ static int check_file_locks(pid_t child_pid, int fd, int child_fd) continue; test_msg("c: %s", buf); - num = sscanf(buf, "%*s %*d:%s %s %s %d %*02x:%*02x:%*d %*d %*s", fl_flag, fl_type, fl_option, + num = sscanf(buf, "%*s %*d:%15s %15s %15s %d %*02x:%*02x:%*d %*d %*s", fl_flag, fl_type, fl_option, &fl_owner); if (num < 4) { diff --git a/test/zdtm/static/netns-dev.c b/test/zdtm/static/netns-dev.c index 1e6ee1dea5..f268f2fece 100644 --- a/test/zdtm/static/netns-dev.c +++ b/test/zdtm/static/netns-dev.c @@ -414,7 +414,7 @@ static int check_stable_secret(struct test_conf *tc) return -1; } - ret = fscanf(fp, "%s", val); + ret = fscanf(fp, "%200s", val); if (ret != 1) { pr_perror("fscanf"); fclose(fp); diff --git a/test/zdtm/static/ofd_file_locks.c b/test/zdtm/static/ofd_file_locks.c index 68b6f22f52..a68fa38eeb 100644 --- a/test/zdtm/static/ofd_file_locks.c +++ b/test/zdtm/static/ofd_file_locks.c @@ -16,7 +16,7 @@ static int parse_ofd_lock(char *buf, struct flock *lck) if (strncmp(buf, "lock:\t", 6) != 0) return 1; /* isn't lock, skip record */ - num = sscanf(buf, "%*s %*d: %s %s %s %*d %*x:%*x:%*d %lld %s", fl_flag, fl_type, fl_option, 
&start, fl_end); + num = sscanf(buf, "%*s %*d: %9s %14s %9s %*d %*x:%*x:%*d %lld %31s", fl_flag, fl_type, fl_option, &start, fl_end); if (num < 4) { pr_err("Invalid lock info %s\n", buf); From e9cceed87b2a8fe8631b3e946b76a7310ffb03b8 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Thu, 26 Sep 2024 12:24:52 +0100 Subject: [PATCH 002/198] amdgpu: remove exec permissions on source files This patch fixes the following warnings that appear when building an RPM package: + /usr/lib/rpm/redhat/brp-mangle-shebangs *** WARNING: ./usr/src/debug/criu-4.0-1.fc42.x86_64/plugins/amdgpu/amdgpu_plugin_util.c is executable but has no shebang, removing executable bit *** WARNING: ./usr/src/debug/criu-4.0-1.fc42.x86_64/plugins/amdgpu/amdgpu_plugin_util.h is executable but has no shebang, removing executable bit Signed-off-by: Radostin Stoyanov --- plugins/amdgpu/amdgpu_plugin_util.c | 0 plugins/amdgpu/amdgpu_plugin_util.h | 0 2 files changed, 0 insertions(+), 0 deletions(-) mode change 100755 => 100644 plugins/amdgpu/amdgpu_plugin_util.c mode change 100755 => 100644 plugins/amdgpu/amdgpu_plugin_util.h diff --git a/plugins/amdgpu/amdgpu_plugin_util.c b/plugins/amdgpu/amdgpu_plugin_util.c old mode 100755 new mode 100644 diff --git a/plugins/amdgpu/amdgpu_plugin_util.h b/plugins/amdgpu/amdgpu_plugin_util.h old mode 100755 new mode 100644 From 9463371787f394cc871bc7256c3349d8bad54298 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Thu, 26 Sep 2024 10:59:32 +0100 Subject: [PATCH 003/198] Makefile.config: set CR_PLUGIN_DEFAULT variable By default, CRIU uses the path "/usr/lib/criu" to install and load plugins at runtime. This path is defined by the `PLUGINDIR` variable in Makefile.install and `CR_PLUGIN_DEFAULT` in `criu/include/plugin.h`. However, some distribution packages might install the CRIU plugins at "/usr/lib64/criu" instead. This patch updates the makefile to align the path defined by `CR_PLUGIN_DEFAULT` with the value of `PLUGINDIR`. 
Signed-off-by: Radostin Stoyanov --- Makefile.config | 4 ++++ plugins/amdgpu/Makefile | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/Makefile.config b/Makefile.config index 52c250b21c..5ab689d411 100644 --- a/Makefile.config +++ b/Makefile.config @@ -59,6 +59,10 @@ endif export LIBS += $(LIBS_FEATURES) +ifneq ($(PLUGINDIR),) + FEATURE_DEFINES += -DCR_PLUGIN_DEFAULT="\"$(PLUGINDIR)\"" +endif + CONFIG_FILE = .config $(CONFIG_FILE): diff --git a/plugins/amdgpu/Makefile b/plugins/amdgpu/Makefile index 7d3388b80e..a20d1d1639 100644 --- a/plugins/amdgpu/Makefile +++ b/plugins/amdgpu/Makefile @@ -15,7 +15,7 @@ DEPS_NOK := ; __nmk_dir ?= ../../scripts/nmk/scripts/ include $(__nmk_dir)msg.mk -PLUGIN_CFLAGS := -g -Wall -Werror -D _GNU_SOURCE -shared -nostartfiles -fPIC -DCR_PLUGIN_DEFAULT="$(PLUGINDIR)" +PLUGIN_CFLAGS := -g -Wall -Werror -D _GNU_SOURCE -shared -nostartfiles -fPIC PLUGIN_LDFLAGS := -lpthread -lrt -ldrm -ldrm_amdgpu ifeq ($(CONFIG_AMDGPU),y) From a1db7627b900bbda7625b31b0051936adc950734 Mon Sep 17 00:00:00 2001 From: Bhavik Sachdev Date: Thu, 13 Jun 2024 20:00:09 +0530 Subject: [PATCH 004/198] images: Add protobuf definition for pidfd We only use the last pid from the list in NSpid entry (from /proc//fdinfo/) while restoring pidfds. The last pid refers to the pid of the process in the most deeply nested pid namespace. Since CRIU does not currently support nested pid namespaces, this entry is the one we want. After Linux 6.9, inode numbers can be used to compare pidfds. pidfds referring to the same process will have the same inode numbers. We use inode numbers to restore pidfds that point to dead processes. 
Signed-off-by: Bhavik Sachdev --- images/Makefile | 1 + images/fdinfo.proto | 3 +++ images/pidfd.proto | 13 +++++++++++++ 3 files changed, 17 insertions(+) create mode 100644 images/pidfd.proto diff --git a/images/Makefile b/images/Makefile index ca85b1a213..855d894da6 100644 --- a/images/Makefile +++ b/images/Makefile @@ -73,6 +73,7 @@ proto-obj-y += bpfmap-file.o proto-obj-y += bpfmap-data.o proto-obj-y += apparmor.o proto-obj-y += rseq.o +proto-obj-y += pidfd.o CFLAGS += -iquote $(obj)/ diff --git a/images/fdinfo.proto b/images/fdinfo.proto index 88f1c11860..32ec13cf48 100644 --- a/images/fdinfo.proto +++ b/images/fdinfo.proto @@ -17,6 +17,7 @@ import "ext-file.proto"; import "sk-unix.proto"; import "fifo.proto"; import "pipe.proto"; +import "pidfd.proto"; import "tty.proto"; import "memfd.proto"; import "bpfmap-file.proto"; @@ -42,6 +43,7 @@ enum fd_types { TIMERFD = 17; MEMFD = 18; BPFMAP = 19; + PIDFD = 20; /* Any number above the real used. Not stored to image */ CTL_TTY = 65534; @@ -78,4 +80,5 @@ message file_entry { optional tty_file_entry tty = 19; optional memfd_file_entry memfd = 20; optional bpfmap_file_entry bpf = 21; + optional pidfd_entry pidfd = 22; } diff --git a/images/pidfd.proto b/images/pidfd.proto new file mode 100644 index 0000000000..a9da3e4543 --- /dev/null +++ b/images/pidfd.proto @@ -0,0 +1,13 @@ +// SPDX-License-Identifier: MIT + +syntax = "proto2"; + +import "fown.proto"; + +message pidfd_entry { + required uint32 id = 1; + required uint32 ino = 2; + required uint32 flags = 3; + required int32 nspid = 4; + required fown_entry fown = 5; +} From 99ec62028b827af0849b860f84e6076aa4109afb Mon Sep 17 00:00:00 2001 From: Bhavik Sachdev Date: Thu, 13 Jun 2024 21:18:51 +0530 Subject: [PATCH 005/198] criu: Support C/R of pidfds Process file descriptors (pidfds) were introduced to provide a stable handle on a process. They solve the problem of pid recycling. 
For a detailed explanation, see https://lwn.net/Articles/801319/ and http://www.corsix.org/content/what-is-a-pidfd Before Linux 6.9, anonymous inodes were used for the implementation of pidfds. So, we detect them in a fashion similar to other fd types that use anonymous inodes, by calling `readlink()`. Linux 6.9 introduced pidfs (a file system for pidfds). In 6.9, `S_ISREG()` returned true for pidfds, but this changed again with 6.10. (https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/fs/pidfs.c?h=v6.11-rc2#n285) After this change, pidfs inodes have no file type in st_mode in userspace. We use `PID_FS_MAGIC` to detect pidfds for kernels >= 6.9. Hence, the check for pidfds occurs before the check for regular files. For pidfds that refer to dead processes, we lose the pid of the process, as the Pid and NSpid fields in /proc/<pid>/fdinfo/<fd> change to -1. So, we create a temporary process for each unique inode and open pidfds that refer to this process. After all pidfds have been opened, we kill this temporary process. This commit does not include support for pidfds that point to a specific thread, i.e. pidfds opened with the `PIDFD_THREAD` flag. 
Fixes: #2258 Signed-off-by: Bhavik Sachdev --- criu/Makefile.crtools | 1 + criu/cr-restore.c | 3 +- criu/files.c | 17 +++ criu/image-desc.c | 1 + criu/include/fs-magic.h | 4 + criu/include/image-desc.h | 1 + criu/include/magic.h | 1 + criu/include/pidfd.h | 16 ++ criu/include/protobuf-desc.h | 1 + criu/pidfd.c | 287 +++++++++++++++++++++++++++++++++++ criu/proc_parse.c | 29 ++++ criu/protobuf-desc.c | 1 + 12 files changed, 361 insertions(+), 1 deletion(-) create mode 100644 criu/include/pidfd.h create mode 100644 criu/pidfd.c diff --git a/criu/Makefile.crtools b/criu/Makefile.crtools index 3ddf45cd70..ba6132d2f7 100644 --- a/criu/Makefile.crtools +++ b/criu/Makefile.crtools @@ -101,6 +101,7 @@ obj-$(CONFIG_COMPAT) += vdso-compat.o CFLAGS_REMOVE_vdso-compat.o += $(CFLAGS-ASAN) $(CFLAGS-GCOV) obj-y += pidfd-store.o obj-y += hugetlb.o +obj-y += pidfd.o PROTOBUF_GEN := scripts/protobuf-gen.sh diff --git a/criu/cr-restore.c b/criu/cr-restore.c index 4d4dfbe6fe..d5b6c8037a 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -79,6 +79,7 @@ #include "timens.h" #include "bpfmap.h" #include "apparmor.h" +#include "pidfd.h" #include "parasite-syscall.h" #include "files-reg.h" @@ -280,7 +281,7 @@ static struct collect_image_info *cinfos_files[] = { &unix_sk_cinfo, &fifo_cinfo, &pipe_cinfo, &nsfile_cinfo, &packet_sk_cinfo, &netlink_sk_cinfo, &eventfd_cinfo, &epoll_cinfo, &epoll_tfd_cinfo, &signalfd_cinfo, &tunfile_cinfo, &timerfd_cinfo, &inotify_cinfo, &inotify_mark_cinfo, &fanotify_cinfo, - &fanotify_mark_cinfo, &ext_file_cinfo, &memfd_cinfo, + &fanotify_mark_cinfo, &ext_file_cinfo, &memfd_cinfo, &pidfd_cinfo }; /* These images are required to restore namespaces */ diff --git a/criu/files.c b/criu/files.c index 3b653e24be..a57fb860fb 100644 --- a/criu/files.c +++ b/criu/files.c @@ -49,6 +49,7 @@ #include "kerndat.h" #include "fdstore.h" #include "bpfmap.h" +#include "pidfd.h" #include "protobuf.h" #include "util.h" @@ -544,6 +545,8 @@ static int dump_one_file(struct pid 
*pid, int fd, int lfd, struct fd_opts *opts, ops = &signalfd_dump_ops; else if (is_timerfd_link(link)) ops = &timerfd_dump_ops; + else if (is_pidfd_link(link)) + ops = &pidfd_dump_ops; #ifdef CONFIG_HAS_LIBBPF else if (is_bpfmap_link(link)) ops = &bpfmap_dump_ops; @@ -554,6 +557,11 @@ static int dump_one_file(struct pid *pid, int fd, int lfd, struct fd_opts *opts, return do_dump_gen_file(&p, lfd, ops, e); } + if (p.fs_type == PID_FS_MAGIC) { + ops = &pidfd_dump_ops; + return do_dump_gen_file(&p, lfd, ops, e); + } + if (S_ISREG(p.stat.st_mode) || S_ISDIR(p.stat.st_mode) || S_ISLNK(p.stat.st_mode)) { if (fill_fdlink(lfd, &p, &link)) return -1; @@ -1778,6 +1786,9 @@ static int collect_one_file(void *o, ProtobufCMessage *base, struct cr_img *i) case FD_TYPES__MEMFD: ret = collect_one_file_entry(fe, fe->memfd->id, &fe->memfd->base, &memfd_cinfo); break; + case FD_TYPES__PIDFD: + ret = collect_one_file_entry(fe, fe->pidfd->id, &fe->pidfd->base, &pidfd_cinfo); + break; #ifdef CONFIG_HAS_LIBBPF case FD_TYPES__BPFMAP: ret = collect_one_file_entry(fe, fe->bpf->id, &fe->bpf->base, &bpfmap_cinfo); @@ -1800,5 +1811,11 @@ int prepare_files(void) { init_fdesc_hash(); init_sk_info_hash(); + + if (init_dead_pidfd_hash()) { + pr_err("Could not initialise hash map for dead pidfds\n"); + return -1; + } + return collect_image(&files_cinfo); } diff --git a/criu/image-desc.c b/criu/image-desc.c index d65d9c0986..2d87c73815 100644 --- a/criu/image-desc.c +++ b/criu/image-desc.c @@ -107,6 +107,7 @@ struct cr_fd_desc_tmpl imgset_template[CR_FD_MAX] = { FD_ENTRY_F(BPFMAP_FILE, "bpfmap-file", O_NOBUF), FD_ENTRY_F(BPFMAP_DATA, "bpfmap-data", O_NOBUF), FD_ENTRY(APPARMOR, "apparmor"), + FD_ENTRY(PIDFD, "pidfd"), [CR_FD_STATS] = { .fmt = "stats-%s", diff --git a/criu/include/fs-magic.h b/criu/include/fs-magic.h index ad34f48915..ffc0455d5f 100644 --- a/criu/include/fs-magic.h +++ b/criu/include/fs-magic.h @@ -57,4 +57,8 @@ #define OVERLAYFS_SUPER_MAGIC 0x794c7630 #endif +#ifndef PID_FS_MAGIC 
+#define PID_FS_MAGIC 0x50494446 +#endif + #endif /* __CR_FS_MAGIC_H__ */ diff --git a/criu/include/image-desc.h b/criu/include/image-desc.h index 9f369be645..79e1ac1113 100644 --- a/criu/include/image-desc.h +++ b/criu/include/image-desc.h @@ -113,6 +113,7 @@ enum { CR_FD_PIPES, CR_FD_TTY_FILES, CR_FD_MEMFD_FILE, + CR_FD_PIDFD, CR_FD_AUTOFS, diff --git a/criu/include/magic.h b/criu/include/magic.h index 0e8c37234e..6f0aff26d8 100644 --- a/criu/include/magic.h +++ b/criu/include/magic.h @@ -100,6 +100,7 @@ #define BPFMAP_FILE_MAGIC 0x57506142 /* Alapayevsk */ #define BPFMAP_DATA_MAGIC 0x64324033 /* Arkhangelsk */ #define APPARMOR_MAGIC 0x59423047 /* Nikolskoye */ +#define PIDFD_MAGIC 0x54435556 /* Ufa */ #define IFADDR_MAGIC RAW_IMAGE_MAGIC #define ROUTE_MAGIC RAW_IMAGE_MAGIC diff --git a/criu/include/pidfd.h b/criu/include/pidfd.h new file mode 100644 index 0000000000..4d2d71700e --- /dev/null +++ b/criu/include/pidfd.h @@ -0,0 +1,16 @@ +#ifndef __CR_PIDFD_H__ +#define __CR_PIDFD_H__ + +#include "files.h" +#include "pidfd.pb-c.h" + +extern const struct fdtype_ops pidfd_dump_ops; +extern struct collect_image_info pidfd_cinfo; +extern int is_pidfd_link(char *link); +extern int init_dead_pidfd_hash(void); +struct pidfd_dump_info { + PidfdEntry pidfe; + pid_t pid; +}; + +#endif /* __CR_PIDFD_H__ */ diff --git a/criu/include/protobuf-desc.h b/criu/include/protobuf-desc.h index 3824de101f..c4241be557 100644 --- a/criu/include/protobuf-desc.h +++ b/criu/include/protobuf-desc.h @@ -70,6 +70,7 @@ enum { PB_BPFMAP_FILE, PB_BPFMAP_DATA, PB_APPARMOR, + PB_PIDFD, /* PB_AUTOGEN_STOP */ diff --git a/criu/pidfd.c b/criu/pidfd.c new file mode 100644 index 0000000000..fdf5dec60e --- /dev/null +++ b/criu/pidfd.c @@ -0,0 +1,287 @@ +#include "common/lock.h" +#include "imgset.h" +#include "pidfd.h" +#include "fdinfo.h" +#include "pidfd.pb-c.h" +#include "protobuf.h" +#include "pstree.h" +#include +#include +#include +#include "common/bug.h" +#include "rst-malloc.h" + +#undef LOG_PREFIX 
+#define LOG_PREFIX "pidfd: " + +#ifndef PIDFD_THREAD +#define PIDFD_THREAD O_EXCL +#endif + +struct pidfd_info { + PidfdEntry *pidfe; + struct file_desc d; +}; + +struct dead_pidfd { + unsigned int ino; + int pid; + size_t count; + mutex_t pidfd_lock; + struct hlist_node hash; +}; + +#define DEAD_PIDFD_HASH_SIZE 32 +static struct hlist_head dead_pidfd_hash[DEAD_PIDFD_HASH_SIZE]; +static mutex_t *dead_pidfd_hash_lock; + +int init_dead_pidfd_hash(void) +{ + for (int i = 0; i < DEAD_PIDFD_HASH_SIZE; i++) + INIT_HLIST_HEAD(&dead_pidfd_hash[i]); + + dead_pidfd_hash_lock = shmalloc(sizeof(*dead_pidfd_hash_lock)); + if (!dead_pidfd_hash_lock) + return -1; + + mutex_init(dead_pidfd_hash_lock); + + return 0; +} + +static struct dead_pidfd *lookup_dead_pidfd(unsigned int ino) +{ + struct dead_pidfd *dead; + struct hlist_head *chain; + + mutex_lock(dead_pidfd_hash_lock); + chain = &dead_pidfd_hash[ino % DEAD_PIDFD_HASH_SIZE]; + hlist_for_each_entry(dead, chain, hash) { + if (dead->ino == ino) { + mutex_unlock(dead_pidfd_hash_lock); + return dead; + } + } + mutex_unlock(dead_pidfd_hash_lock); + + return NULL; +} + +int is_pidfd_link(char *link) +{ + /* + * pidfs was introduced in Linux 6.9 + * before which anonymous-inodes were used + */ + return is_anon_link_type(link, "[pidfd]"); +} + +static void pr_info_pidfd(char *action, PidfdEntry *pidfe) +{ + pr_info("%s: id %#08x flags %u NSpid %d ino %u\n", + action, pidfe->id, pidfe->flags, pidfe->nspid, pidfe->ino + ); +} + +static int dump_one_pidfd(int pidfd, u32 id, const struct fd_parms *p) +{ + struct pidfd_dump_info pidfd_info = {.pidfe = PIDFD_ENTRY__INIT}; + FileEntry fe = FILE_ENTRY__INIT; + + if (parse_fdinfo(pidfd, FD_TYPES__PIDFD, &pidfd_info)) + return -1; + + if (p->flags & PIDFD_THREAD) { + pr_err("PIDFD_THREAD flag is currently not supported\n"); + return -1; + } + + /* + * Check if the pid pidfd refers to is part of process tree + * This ensures the process will exist on restore. 
+ */ + if (pidfd_info.pid != -1 && !pstree_item_by_real(pidfd_info.pid)) { + pr_err("pidfd pid %d is not a part of process tree..\n", + pidfd_info.pid); + return -1; + } + + pidfd_info.pidfe.id = id; + pidfd_info.pidfe.flags = (p->flags & ~O_RDWR); + pidfd_info.pidfe.fown = (FownEntry *)&p->fown; + + fe.type = FD_TYPES__PIDFD; + fe.id = pidfd_info.pidfe.id; + fe.pidfd = &pidfd_info.pidfe; + + pr_info_pidfd("Dumping", &pidfd_info.pidfe); + return pb_write_one(img_from_set(glob_imgset, CR_FD_FILES), &fe, PB_FILE); +} + +const struct fdtype_ops pidfd_dump_ops = { + .type = FD_TYPES__PIDFD, + .dump = dump_one_pidfd, +}; + +static int pidfd_open(pid_t pid, int flags) +{ + return syscall(__NR_pidfd_open, pid, flags); +} + +static int create_tmp_process(void) +{ + int tmp_process; + tmp_process = fork(); + if (tmp_process < 0) { + pr_perror("Could not fork"); + return -1; + } else if (tmp_process == 0) { + while(1) + sleep(1); + } + return tmp_process; +} + +static int free_dead_pidfd(struct dead_pidfd *dead) +{ + int status; + + if (kill(dead->pid, SIGKILL) < 0) { + pr_perror("Could not kill temporary process with pid: %d", + dead->pid); + goto err; + } + + if (waitpid(dead->pid, &status, 0) != dead->pid) { + pr_perror("Could not wait on temporary process with pid: %d", + dead->pid); + goto err; + } + + if (!WIFSIGNALED(status)) { + pr_err("Expected temporary process to be terminated by a signal\n"); + goto err; + } + + if (WTERMSIG(status) != SIGKILL) { + pr_err("Expected temporary process to be terminated by SIGKILL\n"); + goto err; + } + + mutex_lock(dead_pidfd_hash_lock); + hlist_del(&dead->hash); + mutex_unlock(dead_pidfd_hash_lock); + return 0; +err: + return -1; +} + +static int open_one_pidfd(struct file_desc *d, int *new_fd) +{ + struct pidfd_info *info; + struct dead_pidfd *dead = NULL; + int pidfd; + + info = container_of(d, struct pidfd_info, d); + if (info->pidfe->nspid != -1) { + pidfd = pidfd_open(info->pidfe->nspid, info->pidfe->flags); + if (pidfd < 0) { 
+ pr_perror("Could not open pidfd for %d", info->pidfe->nspid); + goto err_close; + } + goto out; + } + + dead = lookup_dead_pidfd(info->pidfe->ino); + BUG_ON(!dead); + + mutex_lock(&dead->pidfd_lock); + BUG_ON(dead->count == 0); + dead->count--; + if (dead->pid == -1) { + dead->pid = create_tmp_process(); + if (dead->pid < 0) { + mutex_unlock(&dead->pidfd_lock); + goto err_close; + } + } + + pidfd = pidfd_open(dead->pid, info->pidfe->flags); + if (pidfd < 0) { + pr_perror("Could not open pidfd for %d", info->pidfe->nspid); + mutex_unlock(&dead->pidfd_lock); + goto err_close; + } + + if (dead->count == 0) { + if (free_dead_pidfd(dead)) { + pr_err("Failed to delete dead_pidfd struct\n"); + mutex_unlock(&dead->pidfd_lock); + close(pidfd); + goto err_close; + } + } + mutex_unlock(&dead->pidfd_lock); + +out: + if (rst_file_params(pidfd, info->pidfe->fown, info->pidfe->flags)) { + goto err_close; + } + + *new_fd = pidfd; + return 0; +err_close: + pr_err("Can't create pidfd %#08x NSpid: %d flags: %u\n", + info->pidfe->id, info->pidfe->nspid, info->pidfe->flags); + return -1; +} + +static struct file_desc_ops pidfd_desc_ops = { + .type = FD_TYPES__PIDFD, + .open = open_one_pidfd +}; + +static int collect_one_pidfd(void *obj, ProtobufCMessage *msg, struct cr_img *i) +{ + struct dead_pidfd *dead; + struct pidfd_info *info = obj; + + info->pidfe = pb_msg(msg, PidfdEntry); + pr_info_pidfd("Collected ", info->pidfe); + + if (info->pidfe->nspid != -1) + goto out; + + dead = lookup_dead_pidfd(info->pidfe->ino); + if (dead) { + mutex_lock(&dead->pidfd_lock); + dead->count++; + mutex_unlock(&dead->pidfd_lock); + goto out; + } + + dead = shmalloc(sizeof(*dead)); + if (!dead) { + pr_err("Could not allocate shared memory..\n"); + return -1; + } + + INIT_HLIST_NODE(&dead->hash); + dead->ino = info->pidfe->ino; + dead->count = 1; + dead->pid = -1; + mutex_init(&dead->pidfd_lock); + + mutex_lock(dead_pidfd_hash_lock); + hlist_add_head(&dead->hash, &dead_pidfd_hash[dead->ino % 
DEAD_PIDFD_HASH_SIZE]); + mutex_unlock(dead_pidfd_hash_lock); +out: + return file_desc_add(&info->d, info->pidfe->id, &pidfd_desc_ops); +} + +struct collect_image_info pidfd_cinfo = { + .fd_type = CR_FD_PIDFD, + .pb_type = PB_PIDFD, + .priv_size = sizeof(struct pidfd_info), + .collect = collect_one_pidfd, +}; diff --git a/criu/proc_parse.c b/criu/proc_parse.c index 55aefac7d7..95ebe3a411 100644 --- a/criu/proc_parse.c +++ b/criu/proc_parse.c @@ -42,10 +42,12 @@ #include "fault-injection.h" #include "memfd.h" #include "hugetlb.h" +#include "pidfd.h" #include "protobuf.h" #include "images/fdinfo.pb-c.h" #include "images/mnt.pb-c.h" +#include "pidfd.pb-c.h" #include "plugin.h" #include @@ -2165,6 +2167,33 @@ static int parse_fdinfo_pid_s(int pid, int fd, int type, void *arg) if (ret) goto parse_err; + entry_met = true; + continue; + } + if (fdinfo_field(str, "ino") || fdinfo_field(str, "NSpid") || fdinfo_field(str, "Pid")) { + struct pidfd_dump_info *pidfd_info = arg; + + if (type != FD_TYPES__PIDFD) + continue; + + if (fdinfo_field(str, "ino")) { + ret = sscanf(str, "%*s %u", &pidfd_info->pidfe.ino); + if (ret != 1) + goto parse_err; + } else if (fdinfo_field(str, "Pid")) { + ret = sscanf(str, "%*s %d", &pidfd_info->pid); + if (ret != 1) + goto parse_err; + } else if (fdinfo_field(str, "NSpid")) { + char *last; + + last = strrchr(str, '\t'); + if (!last || sscanf(last, "%d", &pidfd_info->pidfe.nspid) != 1) { + pr_err("Unable to parse: %s\n", str); + goto parse_err; + } + } + entry_met = true; continue; } diff --git a/criu/protobuf-desc.c b/criu/protobuf-desc.c index ff16b9f5be..e0dbfccc21 100644 --- a/criu/protobuf-desc.c +++ b/criu/protobuf-desc.c @@ -68,6 +68,7 @@ #include "images/bpfmap-file.pb-c.h" #include "images/bpfmap-data.pb-c.h" #include "images/apparmor.pb-c.h" +#include "images/pidfd.pb-c.h" struct cr_pb_message_desc cr_pb_descs[PB_MAX]; From f9fdfcacdcbdc7599d88db99626f73fe5e8c2b3d Mon Sep 17 00:00:00 2001 From: Bhavik Sachdev Date: Tue, 9 Jul 2024 
19:58:29 +0530 Subject: [PATCH 006/198] zdtm: Check pidfd fdinfo entry is consistent Ensures that entries in /proc//fdinfo/ are same. Signed-off-by: Bhavik Sachdev --- test/zdtm/static/Makefile | 1 + test/zdtm/static/pidfd_self.c | 140 ++++++++++++++++++++++++++++++++++ 2 files changed, 141 insertions(+) create mode 100644 test/zdtm/static/pidfd_self.c diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index 1e891f0ba4..a2e852d73c 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -53,6 +53,7 @@ TST_NOFILE := \ shm \ shm-mp \ ptrace_sig \ + pidfd_self \ pipe00 \ pipe01 \ pipe02 \ diff --git a/test/zdtm/static/pidfd_self.c b/test/zdtm/static/pidfd_self.c new file mode 100644 index 0000000000..2730ee123d --- /dev/null +++ b/test/zdtm/static/pidfd_self.c @@ -0,0 +1,140 @@ +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check pidfd /proc/self/fdinfo/ entry remains consistent after checkpoint/restore\n"; +const char *test_author = "Bhavik Sachdev "; + +struct pidfd_status { + unsigned int flags; + pid_t pid; +}; + +static int pidfd_open(pid_t pid, unsigned int flags) +{ + return syscall(__NR_pidfd_open, pid, flags); +} + +static int pidfd_send_signal(int pidfd, int sig, siginfo_t* info, unsigned int flags) +{ + return syscall(__NR_pidfd_send_signal, pidfd, sig, info, flags); +} + +static void show_pidfd(char *prefix, struct pidfd_status *s) +{ + test_msg("\n\t%s\n\tflags: 0%o\n\tpid: %d\n", prefix, s->flags, s->pid); +} + +static int parse_self_fdinfo(int pidfd, struct pidfd_status *s) +{ + char buf[256]; + int ret = -1; + FILE *f; + + sprintf(buf, "/proc/self/fdinfo/%d", pidfd); + f = fopen(buf, "r"); + if (!f) { + perror("Can't open /proc/self/fdinfo/ to parse"); + return -1; + } + + memset(s, 0, sizeof(*s)); + + /* + * flags: file access mode (octal) 02000002 => [O_RDWR | O_CLOEXEC] + * pid: the pid to which we have pidfd open + */ + while (fgets(buf, sizeof(buf), f)) { + if (!fgets(buf, 
sizeof(buf), f)) + goto parse_err; + + if (sscanf(buf, "flags: 0%o", &s->flags) != 1) { + goto parse_err; + } + + if (!fgets(buf, sizeof(buf), f)) + goto parse_err; + if (!fgets(buf, sizeof(buf), f)) + goto parse_err; + + if (!fgets(buf, sizeof(buf), f)) + goto parse_err; + + if (sscanf(buf, "Pid: %d", &s->pid) != 1) + goto parse_err; + ret = 0; + break; + } + + if (ret) + goto parse_err; +err: + fclose(f); + return ret; + +parse_err: + pr_perror("Format error"); + goto err; +} + +static int check_pidfd(int fd, struct pidfd_status *old) +{ + struct pidfd_status new; + + if (parse_self_fdinfo(fd, &new)) + return -1; + + show_pidfd("restored", &new); + + if (old->flags != new.flags || old->pid != new.pid) + return -1; + + return 0; +} + +int main(int argc, char* argv[]) +{ + struct pidfd_status old; + int pidfd, ret; + + test_init(argc, argv); + + pidfd = pidfd_open(getpid(), 0); + if (pidfd < 0) { + pr_perror("pidfd_open failed"); + return 1; + } + + parse_self_fdinfo(pidfd, &old); + + show_pidfd("old", &old); + + if (pidfd_send_signal(pidfd, 0, NULL, 0)) { + pr_perror("Could not send signal"); + return 1; + } + + test_daemon(); + test_waitsig(); + + ret = check_pidfd(pidfd, &old); + if (ret) { + fail(); + goto err; + } + + if (pidfd_send_signal(pidfd, 0, NULL, 0)) { + pr_perror("Could not send signal"); + fail(); + goto err; + } + + pass(); + close(pidfd); + return 0; +err: + close(pidfd); + return 1; +} From 487853ff215b37b33d2e45116d7f14580687487a Mon Sep 17 00:00:00 2001 From: Bhavik Sachdev Date: Tue, 9 Jul 2024 20:01:00 +0530 Subject: [PATCH 007/198] zdtm: Check pidfd can send signal after C/R Ensure `pidfd_send_signal()` syscall works as expected after C/R. 
Signed-off-by: Bhavik Sachdev --- test/zdtm/static/Makefile | 1 + test/zdtm/static/pidfd_child.c | 66 ++++++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+) create mode 100644 test/zdtm/static/pidfd_child.c diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index a2e852d73c..0268ae4927 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -54,6 +54,7 @@ TST_NOFILE := \ shm-mp \ ptrace_sig \ pidfd_self \ + pidfd_child \ pipe00 \ pipe01 \ pipe02 \ diff --git a/test/zdtm/static/pidfd_child.c b/test/zdtm/static/pidfd_child.c new file mode 100644 index 0000000000..ec559605dc --- /dev/null +++ b/test/zdtm/static/pidfd_child.c @@ -0,0 +1,66 @@ +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Checks pidfd sends signal to child process after restore\n"; +const char *test_author = "Bhavik Sachdev "; + +static int pidfd_open(pid_t pid, unsigned int flags) +{ + return syscall(__NR_pidfd_open, pid, flags); +} + +static int pidfd_send_signal(int pidfd, int sig, siginfo_t* info, unsigned int flags) +{ + return syscall(__NR_pidfd_send_signal, pidfd, sig, info, flags); +} + +int main(int argc, char* argv[]) +{ + int pidfd, status; + pid_t child; + + test_init(argc, argv); + + child = fork(); + if (child < 0) { + pr_perror("Unable to fork a new process"); + return 1; + } else if (child == 0) { + test_waitsig(); + return 0; + } + + pidfd = pidfd_open(child, 0); + if (pidfd < 0) { + pr_perror("pidfd_open failed"); + return 1; + } + + test_daemon(); + test_waitsig(); + + if (pidfd_send_signal(pidfd, SIGTERM, NULL, 0)) { + fail("Could not send signal"); + goto err_close; + } + + if (waitpid(child, &status, 0) != child) { + pr_perror("waitpid()"); + goto err_close; + } + + if (status != 0) { + fail("%d:%d:%d:%d", WIFEXITED(status), WEXITSTATUS(status), WIFSIGNALED(status), WTERMSIG(status)); + goto err_close; + } + + pass(); + close(pidfd); + return 0; +err_close: + close(pidfd); + return 1; +} From 
032a822a281f558781a561759efe3a0fc4228375 Mon Sep 17 00:00:00 2001 From: Bhavik Sachdev Date: Mon, 8 Jul 2024 22:25:00 +0530 Subject: [PATCH 008/198] zdtm: Check pidfd can kill descendant processes Validate that pidfds can be used to send signals to different processes after C/R using the `pidfd_send_signal()` syscall. Signed-off-by: Bhavik Sachdev --- test/zdtm/static/Makefile | 1 + test/zdtm/static/pidfd_kill.c | 128 ++++++++++++++++++++++++++++++++++ 2 files changed, 129 insertions(+) create mode 100644 test/zdtm/static/pidfd_kill.c diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index 0268ae4927..ab45b580af 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -55,6 +55,7 @@ TST_NOFILE := \ ptrace_sig \ pidfd_self \ pidfd_child \ + pidfd_kill \ pipe00 \ pipe01 \ pipe02 \ diff --git a/test/zdtm/static/pidfd_kill.c b/test/zdtm/static/pidfd_kill.c new file mode 100644 index 0000000000..6232d033aa --- /dev/null +++ b/test/zdtm/static/pidfd_kill.c @@ -0,0 +1,128 @@ +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Kill child and grandchild process using pidfds\n"; +const char *test_author = "Bhavik Sachdev "; + +static int pidfd_open(pid_t pid, unsigned int flags) +{ + return syscall(__NR_pidfd_open, pid, flags); +} + +static int pidfd_send_signal(int pidfd, int sig, siginfo_t* info, unsigned int flags) +{ + return syscall(__NR_pidfd_send_signal, pidfd, sig, info, flags); +} + +static int wait_for_child(int child) +{ + int status; + if (waitpid(child, &status, 0) != child) { + pr_perror("waitpid()"); + return 1; + } + + if (status != 0) { + test_msg("%d:%d:%d:%d", WIFEXITED(status), WEXITSTATUS(status), + WIFSIGNALED(status), WTERMSIG(status)); + } + + return 0; +} + +int main(int argc, char* argv[]) +{ + #define READ 0 + #define WRITE 1 + + int child, gchild, cpidfd, gpidfd, gchild_pid, ret; + int p[2]; + + if (pipe(p)) { + pr_perror("pipe"); + return 1; + } + + test_init(argc, argv); + + child
= fork(); + if (child < 0) { + pr_perror("fork"); + return 1; + } + + if (child == 0) { + gchild = fork(); + if (gchild < 0) { + pr_perror("fork"); + return 1; + } + + if (gchild == 0) { + test_waitsig(); + return 0; + } + + close(p[READ]); + if (write(p[WRITE], &gchild, sizeof(gchild)) + != sizeof(gchild)) { + pr_perror("write"); + return 1; + } + close(p[WRITE]); + + test_waitsig(); + return wait_for_child(gchild); + } + + cpidfd = pidfd_open(child, 0); + if (cpidfd < 0) { + pr_perror("pidfd_open"); + return 1; + } + + close(p[WRITE]); + if (read(p[READ], &gchild_pid, sizeof(gchild_pid)) + != sizeof(gchild_pid)) { + pr_perror("read"); + return 1; + } + close(p[READ]); + + gpidfd = pidfd_open(gchild_pid, 0); + if (gpidfd < 0) { + pr_perror("pidfd_open"); + return 1; + } + + test_daemon(); + test_waitsig(); + + if (pidfd_send_signal(gpidfd, SIGKILL, NULL, 0)) { + pr_perror("Could not send signal"); + goto fail_close; + } + + if (pidfd_send_signal(cpidfd, SIGKILL, NULL, 0)) { + pr_perror("Could not send signal"); + goto fail_close; + } + + ret = wait_for_child(child); + if (ret) + goto fail_close; + + pass(); + close(cpidfd); + close(gpidfd); + return 0; + +fail_close: + fail(); + close(cpidfd); + close(gpidfd); + return 1; +} From 643e160210dbeadf161c1dd122604e1115b79927 Mon Sep 17 00:00:00 2001 From: Bhavik Sachdev Date: Thu, 25 Jul 2024 01:12:36 +0530 Subject: [PATCH 009/198] zdtm: Check dead pidfd is restored correctly After, C/R of pidfds that point to dead processes their inodes might change. But if two pidfds point to same dead process they should continue to do so after C/R. This test ensures that this happens by calling `statx()` on pidfds after C/R and then comparing their inode numbers. Support for comparing pidfds by using `statx()` and inode numbers was introduced alongside pidfs. So if `f_type` of pidfd is not equal to `PID_FS_MAGIC` then we skip this test. 
signed-off-by: Bhavik Sachdev --- test/zdtm/static/Makefile | 1 + test/zdtm/static/pidfd_dead.c | 244 ++++++++++++++++++++++++++++++++++ 2 files changed, 245 insertions(+) create mode 100644 test/zdtm/static/pidfd_dead.c diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index ab45b580af..20e4bc2721 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -54,6 +54,7 @@ TST_NOFILE := \ shm-mp \ ptrace_sig \ pidfd_self \ + pidfd_dead \ pidfd_child \ pidfd_kill \ pipe00 \ diff --git a/test/zdtm/static/pidfd_dead.c b/test/zdtm/static/pidfd_dead.c new file mode 100644 index 0000000000..9c825899d1 --- /dev/null +++ b/test/zdtm/static/pidfd_dead.c @@ -0,0 +1,244 @@ +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check C/R of pidfds that point to dead processes\n"; +const char *test_author = "Bhavik Sachdev "; + +#ifndef PID_FS_MAGIC +#define PID_FS_MAGIC 0x50494446 +#endif + +/* + * main + * `- child + * `- grandchild + * + * main opens a pidfd for both child and grandchild. + * Before C/R we kill both child and grandchild. + * We end up with two unique dead pidfds. 
+ */ + +static long get_fs_type(int lfd) +{ + struct statfs fst; + + if (fstatfs(lfd, &fst)) { + return -1; + } + return fst.f_type; +} + +static int pidfd_open(pid_t pid, unsigned int flags) +{ + return syscall(__NR_pidfd_open, pid, flags); +} + +static int pidfd_send_signal(int pidfd, int sig, siginfo_t* info, unsigned int flags) +{ + return syscall(__NR_pidfd_send_signal, pidfd, sig, info, flags); +} + +static int open_pidfd_pair(int pidfd[2], int pid) +{ + pidfd[0] = pidfd_open(pid, 0); + if (pidfd[0] < 0) { + pr_perror("pidfd_open() failed"); + return 1; + } + + pidfd[1] = pidfd_open(pid, 0); + if (pidfd[1] < 0) { + close(pidfd[0]); + pr_perror("pidfd_open() failed"); + return 1; + } + return 0; +} + +static int compare_pidfds(int pidfd[2]) +{ + /* + * After linux 6.9 we can compare inode numbers + * to determine if two pidfds point to the same process. + * While the inode number may change before and after C/R + * pidfds pointing to the same pid should have the same inode number. + */ + struct statx stats[2]; + statx(pidfd[0], "", AT_EMPTY_PATH, STATX_ALL, &stats[0]); + statx(pidfd[1], "", AT_EMPTY_PATH, STATX_ALL, &stats[1]); + if (stats[0].stx_ino != stats[1].stx_ino) + return 1; + return 0; +} + +static int check_for_pidfs(void) +{ + long type; + int pidfd = pidfd_open(getpid(), 0); + if (pidfd < 0) { + pr_perror("pidfd open() failed"); + return -1; + } + type = get_fs_type(pidfd); + close(pidfd); + return type == PID_FS_MAGIC; +} + +int main(int argc, char* argv[]) +{ + #define READ 0 + #define WRITE 1 + + int child, ret, gchild, p[2], status; + int cpidfd[2], gpidfd[2]; + struct statx stats[2]; + + test_init(argc, argv); + + ret = check_for_pidfs(); + if (ret < 0) + return 1; + + if (ret == 0) { + test_daemon(); + test_waitsig(); + skip("Test requires pidfs. 
skipping..."); + pass(); + return 0; + } + + if (pipe(p)) { + pr_perror("pipe"); + return 1; + } + + child = test_fork(); + if (child < 0) { + pr_perror("fork"); + return 1; + } else if (child == 0) { + int gchild = test_fork(); + close(p[READ]); + if (gchild < 0) { + pr_perror("fork"); + return 1; + } else if (gchild == 0) { + close(p[WRITE]); + while(1) + sleep(1000); + } else { + if (write(p[WRITE], &gchild, sizeof(int)) != sizeof(int)) { + pr_perror("write"); + return 1; + } + close(p[WRITE]); + if (waitpid(gchild, &status, 0) != gchild) { + pr_perror("waitpid"); + return 1; + } + + if (!WIFSIGNALED(status)) { + fail("Expected grandchild to be terminated by a signal"); + return 1; + } + + if (WTERMSIG(status) != SIGKILL) { + fail("Expected grandchild to be terminated by SIGKILL"); + return 1; + } + + return 0; + } + } + + ret = open_pidfd_pair(cpidfd, child); + if (ret) + return 1; + + close(p[WRITE]); + if (read(p[READ], &gchild, sizeof(int)) != sizeof(int)) { + pr_perror("write"); + return 1; + } + close(p[READ]); + + ret = open_pidfd_pair(gpidfd, gchild); + if (ret) + return 1; + + /* + * We kill grandchild and child processes only after opening pidfds. 
+ */ + if (pidfd_send_signal(gpidfd[0], SIGKILL, NULL, 0)) { + pr_perror("pidfd_send_signal"); + goto fail_close; + } + + if (waitpid(child, &status, 0) != child) { + pr_perror("waitpid"); + goto fail_close; + } + + if (!WIFEXITED(status)) { + fail("Expected child to exit normally"); + goto fail_close; + } + + if (WEXITSTATUS(status) != 0) { + fail("Expected child to exit with 0"); + goto fail_close; + } + usleep(1000); + + if (kill(gchild, 0) != -1 && errno != ESRCH) { + fail("Expected grand child to not exist"); + goto fail_close; + } + + if (kill(child, 0) != -1 && errno != ESRCH) { + fail("Expected child to not exist"); + goto fail_close; + } + + test_daemon(); + test_waitsig(); + + ret = compare_pidfds(cpidfd); + if (ret) { + fail("inodes not same for same pid"); + goto fail_close; + } + + ret = compare_pidfds(gpidfd); + if (ret) { + fail("inodes not same for same pid"); + goto fail_close; + } + + statx(cpidfd[0], "", AT_EMPTY_PATH, STATX_ALL, &stats[0]); + statx(gpidfd[0], "", AT_EMPTY_PATH, STATX_ALL, &stats[1]); + if (stats[0].stx_ino == stats[1].stx_ino) { + fail("pidfds pointing to diff pids should have diff inodes"); + goto fail_close; + } + + pass(); + close(cpidfd[0]); + close(cpidfd[1]); + close(gpidfd[0]); + close(gpidfd[1]); + return 0; + +fail_close: + close(cpidfd[0]); + close(cpidfd[1]); + close(gpidfd[0]); + close(gpidfd[1]); + return 1; +} From 98c49d0c4f4b4eeb1fa71af12ffdeb1a6f3408fe Mon Sep 17 00:00:00 2001 From: Bhavik Sachdev Date: Fri, 16 Aug 2024 21:20:57 +0530 Subject: [PATCH 010/198] zdtm: Check fd from pidfd_getfd is C/Red correctly We get the read end of a pipe using `pidfd_getfd` and check if we can read from it after C/R. 
signed-off-by: Bhavik Sachdev --- test/zdtm/static/Makefile | 1 + test/zdtm/static/fd_from_pidfd.c | 108 +++++++++++++++++++++++++++++++ 2 files changed, 109 insertions(+) create mode 100644 test/zdtm/static/fd_from_pidfd.c diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index 20e4bc2721..f4dbb1d96a 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -57,6 +57,7 @@ TST_NOFILE := \ pidfd_dead \ pidfd_child \ pidfd_kill \ + fd_from_pidfd \ pipe00 \ pipe01 \ pipe02 \ diff --git a/test/zdtm/static/fd_from_pidfd.c b/test/zdtm/static/fd_from_pidfd.c new file mode 100644 index 0000000000..1f863d6c0e --- /dev/null +++ b/test/zdtm/static/fd_from_pidfd.c @@ -0,0 +1,108 @@ +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check if fd obtained from pidfd_get_fd is C/R correctly\n"; +const char *test_author = "Bhavik Sachdev "; + +static int pidfd_open(pid_t pid, unsigned int flags) +{ + return syscall(__NR_pidfd_open, pid, flags); +} + +static int pidfd_getfd(int pidfd, int targetfd, unsigned int flags) +{ + return syscall(__NR_pidfd_getfd, pidfd, targetfd, flags); +} + +static int pidfd_send_signal(int pidfd, int sig, siginfo_t* info, unsigned int flags) +{ + return syscall(__NR_pidfd_send_signal, pidfd, sig, info, flags); +} + +int main(int argc, char* argv[]) +{ + #define READ 0 + #define WRITE 1 + + int pidfd, child, p[2], child_read, read_data, status; + int data = 42; + + test_init(argc, argv); + + if (pipe(p)) { + pr_perror("pipe"); + return 1; + } + + child = fork(); + if (child < 0) { + pr_perror("fork"); + return 1; + } + + if (child == 0) { + close(p[WRITE]); + test_waitsig(); + return 0; + } + + pidfd = pidfd_open(child, 0); + if (pidfd < 0) { + pr_perror("pidfd_open failed"); + return 1; + } + + close(p[READ]); + if (write(p[WRITE], &data, sizeof(data)) != sizeof(data)) { + pr_perror("write"); + return 1; + } + close(p[WRITE]); + + child_read = pidfd_getfd(pidfd, p[READ], 0); + if 
(child_read < 0) { + pr_perror("pidfd_getfd"); + return 1; + } + + test_daemon(); + test_waitsig(); + + if (read(child_read, &read_data, sizeof(read_data)) != sizeof(read_data)) { + pr_perror("read"); + goto err_close; + } + + if (read_data != data) { + fail("data from fd obtained using pidfd_getfd incorrect"); + goto err_close; + } + + if (pidfd_send_signal(pidfd, SIGTERM, NULL, 0)) { + pr_perror("Could not send signal"); + goto err_close; + } + + if (waitpid(child, &status, 0) != child) { + pr_perror("waitpid()"); + return 1; + } + + if (status != 0) { + fail("%d:%d:%d:%d", WIFEXITED(status), WEXITSTATUS(status), WIFSIGNALED(status), WTERMSIG(status)); + return 1; + } + + pass(); + close(child_read); + close(pidfd); + return 0; +err_close: + close(child_read); + close(pidfd); + return 1; +} From b88d40e33466986145e1de5cd8ccc31c82366d04 Mon Sep 17 00:00:00 2001 From: Bhavik Sachdev Date: Mon, 26 Aug 2024 20:56:14 +0530 Subject: [PATCH 011/198] zdtm: Check pidfd for thread is valid after C/R We open a pidfd to a thread using `PIDFD_THREAD` flag and after C/R ensure that we can send signals using it with `PIDFD_SIGNAL_THREAD`. 
signed-off-by: Bhavik Sachdev --- test/zdtm/static/Makefile | 1 + test/zdtm/static/pidfd_of_thread.c | 114 ++++++++++++++++++++++++++ test/zdtm/static/pidfd_of_thread.desc | 1 + 3 files changed, 116 insertions(+) create mode 100644 test/zdtm/static/pidfd_of_thread.c create mode 100644 test/zdtm/static/pidfd_of_thread.desc diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index f4dbb1d96a..44ac64fe57 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -54,6 +54,7 @@ TST_NOFILE := \ shm-mp \ ptrace_sig \ pidfd_self \ + pidfd_of_thread \ pidfd_dead \ pidfd_child \ pidfd_kill \ diff --git a/test/zdtm/static/pidfd_of_thread.c b/test/zdtm/static/pidfd_of_thread.c new file mode 100644 index 0000000000..d232c7ac1d --- /dev/null +++ b/test/zdtm/static/pidfd_of_thread.c @@ -0,0 +1,114 @@ +#include +#include +#include +#include + +#include "zdtmtst.h" +#include "lock.h" + +const char *test_doc = "Check C/R of pidfds that point to threads\n"; +const char *test_author = "Bhavik Sachdev "; + +/* see also: https://codebrowser.dev/glibc/glibc/sysdeps/unix/sysv/linux/tst-clone3.c.html */ + +#ifndef PIDFD_THREAD +#define PIDFD_THREAD O_EXCL +#endif + +#ifndef PIDFD_SIGNAL_THREAD +#define PIDFD_SIGNAL_THREAD (1UL << 0) +#endif + +#ifndef PID_FS_MAGIC +#define PID_FS_MAGIC 0x50494446 +#endif + +static long get_fs_type(int lfd) +{ + struct statfs fst; + + if (fstatfs(lfd, &fst)) { + return -1; + } + return fst.f_type; +} + +static int pidfd_open(pid_t pid, unsigned int flags) +{ + return syscall(__NR_pidfd_open, pid, flags); +} + +static int pidfd_send_signal(int pidfd, int sig, siginfo_t* info, unsigned int flags) +{ + return syscall(__NR_pidfd_send_signal, pidfd, sig, info, flags); +} + +static int thread_func(void *a) +{ + test_waitsig(); + return 0; +} + +#define CTID_INIT_VAL 1 + +int main(int argc, char* argv[]) +{ + char st[64 * 1024] __attribute__ ((aligned)); + pid_t tid; + int pidfd, test_pidfd; + futex_t exited; + + int clone_flags = 
CLONE_THREAD; + clone_flags |= CLONE_VM | CLONE_SIGHAND; + clone_flags |= CLONE_CHILD_CLEARTID; + + test_init(argc, argv); + + test_pidfd = pidfd_open(getpid(), 0); + if (test_pidfd < 0) { + pr_perror("pidfd_open() failed"); + return 1; + } + + /* PIDFD_THREAD, PIDFD_SIGNAL_THREAD are supported only with pidfs */ + if (get_fs_type(test_pidfd) != PID_FS_MAGIC) { + test_daemon(); + test_waitsig(); + skip("pidfs not supported."); + close(test_pidfd); + return 0; + } + close(test_pidfd); + + futex_set(&exited, CTID_INIT_VAL); + + tid = clone(thread_func, st + sizeof(st), clone_flags, NULL, NULL, NULL, &(exited.raw)); + if (tid == -1) { + pr_perror("clone() failed"); + return 1; + } + + test_msg("Successfully created a thread with tid: %d\n", tid); + pidfd = pidfd_open(tid, PIDFD_THREAD); + if (pidfd < 0) { + pr_perror("pidfd_open() failed"); + return 1; + } + + test_daemon(); + test_waitsig(); + + if (pidfd_send_signal(pidfd, SIGTERM, NULL, PIDFD_SIGNAL_THREAD)) { + pr_perror("pidfd_send_signal() failed"); + fail(); + close(pidfd); + return 1; + } + + test_msg("Waiting for thread to exit\n"); + futex_wait_until(&exited, 0); + + pass(); + close(pidfd); + return 0; +} diff --git a/test/zdtm/static/pidfd_of_thread.desc b/test/zdtm/static/pidfd_of_thread.desc new file mode 100644 index 0000000000..802caed655 --- /dev/null +++ b/test/zdtm/static/pidfd_of_thread.desc @@ -0,0 +1 @@ +{'flags': 'noauto crfail'} From 0e780145a502acfd7511ac1ecfc2f488f7254c39 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Mon, 14 Oct 2024 12:39:18 +0100 Subject: [PATCH 012/198] make/lint: use 'ruff check ' The command `ruff ` has been deprecated and removed: https://astral.sh/blog/ruff-v0.5.0#removed-deprecated-features Signed-off-by: Radostin Stoyanov --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 97b4dc2118..46d9adef32 100644 --- a/Makefile +++ b/Makefile @@ -437,7 +437,7 @@ help: ruff: @ruff --version - ruff ${RUFF_FLAGS} 
--config=scripts/ruff.toml \ + ruff check ${RUFF_FLAGS} --config=scripts/ruff.toml \ test/zdtm.py \ test/inhfd/*.py \ test/others/rpc/config_file.py \ From 87b5ac9d9f8e7a72c049a65dac1bfb93e3475800 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Mon, 14 Oct 2024 13:58:41 +0100 Subject: [PATCH 013/198] pycriu: fix lint errors This patch fixes the following errors reported by ruff: lib/pycriu/images/pb2dict.py:307:24: E721 Use `is` and `is not` for type comparisons, or `isinstance()` for isinstance checks | 305 | elif field.type in _basic_cast: 306 | cast = _basic_cast[field.type] 307 | if pretty and (cast == int): | ^^^^^^^^^^^ E721 308 | if is_hex: 309 | # Fields that have (criu).hex = true option set | lib/pycriu/images/pb2dict.py:379:13: E721 Use `is` and `is not` for type comparisons, or `isinstance()` for isinstance checks | 377 | elif field.type in _basic_cast: 378 | cast = _basic_cast[field.type] 379 | if (cast == int) and is_string(value): | ^^^^^^^^^^^ E721 380 | if _marked_as_dev(field): 381 | return encode_dev(field, value) | Signed-off-by: Radostin Stoyanov --- lib/pycriu/images/pb2dict.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/pycriu/images/pb2dict.py b/lib/pycriu/images/pb2dict.py index 0d1a246927..e3dd95ac0a 100644 --- a/lib/pycriu/images/pb2dict.py +++ b/lib/pycriu/images/pb2dict.py @@ -304,7 +304,7 @@ def _pb2dict_cast(field, value, pretty=False, is_hex=False): return field.enum_type.values_by_number.get(value, None).name elif field.type in _basic_cast: cast = _basic_cast[field.type] - if pretty and (cast == int): + if pretty and cast is int: if is_hex: # Fields that have (criu).hex = true option set # should be stored in hex string format. 
@@ -376,7 +376,7 @@ def _dict2pb_cast(field, value): return field.enum_type.values_by_name.get(value, None).number elif field.type in _basic_cast: cast = _basic_cast[field.type] - if (cast == int) and is_string(value): + if cast is int and is_string(value): if _marked_as_dev(field): return encode_dev(field, value) From adf2c5be9616ff54ec1fe43463aaa767ef56e3fc Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Fri, 4 Oct 2024 12:14:29 +0100 Subject: [PATCH 014/198] images/inventory: add field for enabled plugins This patch extends the inventory image with a `plugins` field that contains an array of plugins which were used during checkpoint, for example, to save GPU state. In particular, the CUDA and AMDGPU plugins are added to this field only when the checkpoint contains GPU state. This makes it possible to disable unnecessary plugins during restore, show appropriate error messages if required CRIU plugins are missing, and migrate a process that does not use a GPU from a GPU-enabled system to a CPU-only environment. We use the `optional plugins_entry` for backwards compatibility. This entry allows us to distinguish between an *empty* and a *missing* field: - When the field is missing, it indicates that the checkpoint was created with a previous version of CRIU, and all plugins should be *enabled* during restore. - When the field is empty, it indicates that no plugins were used during checkpointing. Thus, all plugins can be *disabled* during restore.
Signed-off-by: Radostin Stoyanov --- criu/cr-restore.c | 6 +- criu/image.c | 124 +++++++++++++++++++++++++++++++++ criu/include/image.h | 4 ++ criu/plugin.c | 3 + images/inventory.proto | 8 +++ plugins/amdgpu/amdgpu_plugin.c | 31 +++++++++ plugins/cuda/cuda_plugin.c | 22 +++++- 7 files changed, 193 insertions(+), 5 deletions(-) diff --git a/criu/cr-restore.c b/criu/cr-restore.c index d5b6c8037a..646300bdb8 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -2354,12 +2354,12 @@ int cr_restore_tasks(void) if (init_service_fd()) return 1; - if (cr_plugin_init(CR_PLUGIN_STAGE__RESTORE)) - return -1; - if (check_img_inventory(/* restore = */ true) < 0) goto err; + if (cr_plugin_init(CR_PLUGIN_STAGE__RESTORE)) + return -1; + if (init_stats(RESTORE_STATS)) goto err; diff --git a/criu/image.c b/criu/image.c index 9fb390ab7e..9589167fb1 100644 --- a/criu/image.c +++ b/criu/image.c @@ -26,6 +26,14 @@ TaskKobjIdsEntry *root_ids; u32 root_cg_set; Lsmtype image_lsm; +struct inventory_plugin { + struct list_head node; + char *name; +}; + +struct list_head inventory_plugins_list = LIST_HEAD_INIT(inventory_plugins_list); +static int n_inventory_plugins; + int check_img_inventory(bool restore) { int ret = -1; @@ -99,6 +107,19 @@ int check_img_inventory(bool restore) } else { opts.network_lock_method = he->network_lock_method; } + + if (!he->plugins_entry) { + /* backwards compatibility: if the 'plugins_entry' field is missing, + * all plugins should be enabled during restore. + */ + n_inventory_plugins = -1; + } else { + PluginsEntry *pe = he->plugins_entry; + for (int i = 0; i < pe->n_plugins; i++) { + if (add_inventory_plugin(pe->plugins[i])) + goto out_err; + } + } } ret = 0; @@ -110,8 +131,92 @@ int check_img_inventory(bool restore) return ret; } +/** + * Check if the 'plugins' field in the inventory image contains + * the specified plugin name. If found, the plugin is removed + * from the linked list. 
+ */ +bool check_and_remove_inventory_plugin(const char *name, size_t n) +{ + if (n_inventory_plugins == -1) + return true; /* backwards compatibility */ + + if (n_inventory_plugins > 0) { + struct inventory_plugin *p, *tmp; + + list_for_each_entry_safe(p, tmp, &inventory_plugins_list, node) { + if (!strncmp(name, p->name, n)) { + xfree(p->name); + list_del(&p->node); + xfree(p); + n_inventory_plugins--; + return true; + } + } + } + + return false; +} + +/** + * We expect during restore all loaded plugins to be removed from + * the inventory_plugins_list. If the list is not empty, show an + * error message for each missing plugin. + */ +int check_inventory_plugins(void) +{ + struct inventory_plugin *p; + + if (n_inventory_plugins <= 0) + return 0; + + list_for_each_entry(p, &inventory_plugins_list, node) { + pr_err("Missing required plugin: %s\n", p->name); + } + + return -1; +} + +/** + * Add plugin name to the inventory image. These values + * can be used to identify required plugins during restore. 
+ */ +int add_inventory_plugin(const char *name) +{ + struct inventory_plugin *p; + + p = xmalloc(sizeof(struct inventory_plugin)); + if (p == NULL) + return -1; + + p->name = xstrdup(name); + if (!p->name) { + xfree(p); + return -1; + } + list_add(&p->node, &inventory_plugins_list); + n_inventory_plugins++; + + return 0; +} + +void free_inventory_plugins_list(void) +{ + struct inventory_plugin *p, *tmp; + + if (!list_empty(&inventory_plugins_list)) { + list_for_each_entry_safe(p, tmp, &inventory_plugins_list, node) { + xfree(p->name); + list_del(&p->node); + xfree(p); + } + } + n_inventory_plugins = 0; +} + int write_img_inventory(InventoryEntry *he) { + PluginsEntry pe = PLUGINS_ENTRY__INIT; struct cr_img *img; int ret; @@ -121,8 +226,27 @@ int write_img_inventory(InventoryEntry *he) if (!img) return -1; + if (!list_empty(&inventory_plugins_list)) { + struct inventory_plugin *p; + int i = 0; + + pe.n_plugins = n_inventory_plugins; + pe.plugins = xmalloc(n_inventory_plugins * sizeof(char *)); + if (!pe.plugins) + return -1; + + list_for_each_entry(p, &inventory_plugins_list, node) { + pe.plugins[i] = p->name; + i++; + } + } + he->plugins_entry = &pe; + ret = pb_write_one(img, he, PB_INVENTORY); + free_inventory_plugins_list(); + xfree(pe.plugins); + xfree(he->root_ids); close_image(img); if (ret < 0) diff --git a/criu/include/image.h b/criu/include/image.h index a17aae35c2..afa7d5e12f 100644 --- a/criu/include/image.h +++ b/criu/include/image.h @@ -177,4 +177,8 @@ extern int read_img_str(struct cr_img *, char **pstr, int size); extern void close_image(struct cr_img *); +extern int add_inventory_plugin(const char *name); +extern int check_inventory_plugins(void); +extern bool check_and_remove_inventory_plugin(const char *name, size_t n); + #endif /* __CR_IMAGE_H__ */ diff --git a/criu/plugin.c b/criu/plugin.c index 58b5ea5bfe..65e79a0692 100644 --- a/criu/plugin.c +++ b/criu/plugin.c @@ -256,6 +256,9 @@ int cr_plugin_init(int stage) goto err; } + if (stage == 
CR_PLUGIN_STAGE__RESTORE && check_inventory_plugins()) + goto err; + exit_code = 0; err: closedir(d); diff --git a/images/inventory.proto b/images/inventory.proto index a735bad1d0..7f655031bc 100644 --- a/images/inventory.proto +++ b/images/inventory.proto @@ -10,6 +10,13 @@ enum lsmtype { APPARMOR = 2; } +// It is not possible to distinguish between an empty repeated field +// and an unset repeated field. To solve this problem and provide backwards +// compatibility, we use the 'plugins_entry' message. +message plugins_entry { + repeated string plugins = 12; +}; + message inventory_entry { required uint32 img_version = 1; optional bool fdinfo_per_id = 2; @@ -21,4 +28,5 @@ message inventory_entry { optional uint32 pre_dump_mode = 9; optional bool tcp_close = 10; optional uint32 network_lock_method = 11; + optional plugins_entry plugins_entry = 12; } diff --git a/plugins/amdgpu/amdgpu_plugin.c b/plugins/amdgpu/amdgpu_plugin.c index b56ba6d140..96c0861628 100644 --- a/plugins/amdgpu/amdgpu_plugin.c +++ b/plugins/amdgpu/amdgpu_plugin.c @@ -60,6 +60,10 @@ static LIST_HEAD(update_vma_info_list); size_t kfd_max_buffer_size; +bool plugin_added_to_inventory = false; + +bool plugin_disabled = false; + /**************************************************************************************************/ /* Call ioctl, restarting if it is interrupted */ @@ -332,6 +336,13 @@ void getenv_size_t(const char *var, size_t *value) int amdgpu_plugin_init(int stage) { + if (stage == CR_PLUGIN_STAGE__RESTORE) { + if (!check_and_remove_inventory_plugin(CR_PLUGIN_DESC.name, strlen(CR_PLUGIN_DESC.name))) { + plugin_disabled = true; + return 0; + } + } + pr_info("initialized: %s (AMDGPU/KFD)\n", CR_PLUGIN_DESC.name); topology_init(&src_topology); @@ -365,6 +376,9 @@ int amdgpu_plugin_init(int stage) void amdgpu_plugin_fini(int stage, int ret) { + if (plugin_disabled) + return; + pr_info("finished %s (AMDGPU/KFD)\n", CR_PLUGIN_DESC.name); if (stage == CR_PLUGIN_STAGE__RESTORE) @@ -414,6 +428,14
@@ int amdgpu_plugin_handle_device_vma(int fd, const struct stat *st_buf) if (ret) pr_perror("%s(), Can't handle VMAs of input device", __func__); + if (!ret && !plugin_added_to_inventory) { + ret = add_inventory_plugin(CR_PLUGIN_DESC.name); + if (ret) + pr_err("Failed to add AMDGPU plugin to inventory image\n"); + else + plugin_added_to_inventory = true; + } + return ret; } CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__HANDLE_DEVICE_VMA, amdgpu_plugin_handle_device_vma) @@ -1540,6 +1562,9 @@ int amdgpu_plugin_restore_file(int id) size_t img_size; FILE *img_fp = NULL; + if (plugin_disabled) + return -ENOTSUP; + pr_info("Initialized kfd plugin restorer with ID = %d\n", id); snprintf(img_path, sizeof(img_path), IMG_KFD_FILE, id); @@ -1746,6 +1771,9 @@ int amdgpu_plugin_update_vmamap(const char *in_path, const uint64_t addr, const char *p_end; bool is_kfd = false, is_renderD = false; + if (plugin_disabled) + return -ENOTSUP; + plugin_log_msg("Enter %s\n", __func__); strncpy(path, in_path, sizeof(path)); @@ -1805,6 +1833,9 @@ int amdgpu_plugin_resume_devices_late(int target_pid) struct kfd_ioctl_criu_args args = { 0 }; int fd, exit_code = 0; + if (plugin_disabled) + return -ENOTSUP; + pr_info("Inside %s for target pid = %d\n", __func__, target_pid); fd = open(AMDGPU_KFD_DEVICE, O_RDWR | O_CLOEXEC); diff --git a/plugins/cuda/cuda_plugin.c b/plugins/cuda/cuda_plugin.c index 23c3f4b1ab..c4fc67fa9f 100644 --- a/plugins/cuda/cuda_plugin.c +++ b/plugins/cuda/cuda_plugin.c @@ -38,6 +38,8 @@ */ bool plugin_disabled = false; +bool plugin_added_to_inventory = false; + struct pid_info { int pid; char checkpointed; @@ -319,7 +321,7 @@ int cuda_plugin_checkpoint_devices(int pid) k_rtsigset_t save_sigset; if (plugin_disabled) { - return 0; + return -ENOTSUP; } restore_tid = get_cuda_restore_tid(pid); @@ -354,6 +356,15 @@ int cuda_plugin_checkpoint_devices(int pid) pr_err("Failed to restore process after error %s on pid %d\n", msg_buf, pid); } } + + if (!status && 
!plugin_added_to_inventory) { + status = add_inventory_plugin(CR_PLUGIN_DESC.name); + if (status) + pr_err("Failed to add CUDA plugin to inventory image\n"); + else + plugin_added_to_inventory = true; + } + interrupt: int_ret = interrupt_restore_thread(restore_tid, &save_sigset); @@ -367,7 +378,7 @@ int cuda_plugin_pause_devices(int pid) char msg_buf[CUDA_CKPT_BUF_SIZE]; if (plugin_disabled) { - return 0; + return -ENOTSUP; } restore_tid = get_cuda_restore_tid(pid); @@ -463,6 +474,13 @@ int cuda_plugin_init(int stage) { int ret; + if (stage == CR_PLUGIN_STAGE__RESTORE) { + if (!check_and_remove_inventory_plugin(CR_PLUGIN_DESC.name, strlen(CR_PLUGIN_DESC.name))) { + plugin_disabled = true; + return 0; + } + } + if (!fault_injected(FI_PLUGIN_CUDA_FORCE_ENABLE) && access("/dev/nvidiactl", F_OK)) { pr_info("/dev/nvidiactl doesn't exist. The CUDA plugin is disabled.\n"); plugin_disabled = true; From e6ce8f4054f3a21de589cf63cbf4e7c94ccdaf77 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Mon, 14 Oct 2024 13:36:22 +0100 Subject: [PATCH 015/198] zdtm: add inventory test plugins This patch adds two test plugins to verify that CRIU plugins listed in the inventory image are enabled, while those that are not listed can be disabled. 
Signed-off-by: Radostin Stoyanov --- scripts/ci/run-ci-tests.sh | 1 + test/plugins/Makefile | 16 +++++++++++++++- test/plugins/inventory_test_disabled_plugin.c | 17 +++++++++++++++++ test/plugins/inventory_test_enabled_plugin.c | 17 +++++++++++++++++ test/zdtm.py | 2 +- 5 files changed, 51 insertions(+), 2 deletions(-) create mode 100644 test/plugins/inventory_test_disabled_plugin.c create mode 100644 test/plugins/inventory_test_enabled_plugin.c diff --git a/scripts/ci/run-ci-tests.sh b/scripts/ci/run-ci-tests.sh index 38b7b5097f..b472e954c2 100755 --- a/scripts/ci/run-ci-tests.sh +++ b/scripts/ci/run-ci-tests.sh @@ -362,5 +362,6 @@ make -C plugins/amdgpu/ test_topology_remap ./test/zdtm.py run -t zdtm/static/maps00 -t zdtm/static/maps02 --criu-plugin cuda ./test/zdtm.py run -t zdtm/static/maps00 -t zdtm/static/maps02 --criu-plugin amdgpu ./test/zdtm.py run -t zdtm/static/maps00 -t zdtm/static/maps02 --criu-plugin amdgpu cuda +./test/zdtm.py run -t zdtm/static/busyloop00 --criu-plugin inventory_test_enabled inventory_test_disabled ./test/zdtm.py run -t zdtm/static/sigpending -t zdtm/static/pthread00 --mocked-cuda-checkpoint --fault 138 diff --git a/test/plugins/Makefile b/test/plugins/Makefile index 7827b655c4..4f620ad503 100644 --- a/test/plugins/Makefile +++ b/test/plugins/Makefile @@ -1,5 +1,13 @@ SRC_DIR := ../../plugins -PLUGIN_TARGETS := amdgpu_plugin.so cuda_plugin.so +PLUGIN_TARGETS := inventory_test_enabled_plugin.so inventory_test_disabled_plugin.so amdgpu_plugin.so cuda_plugin.so + +ARCH := x86 + +PLUGIN_INCLUDE := -iquote../../include +PLUGIN_INCLUDE += -iquote../../criu/include +PLUGIN_INCLUDE += -iquote../../criu/arch/$(ARCH)/include/ +PLUGIN_INCLUDE += -iquote../../ +PLUGIN_CFLAGS := -g -Wall -Werror -shared -nostartfiles -fPIC # Silent make rules. 
Q := @ @@ -12,6 +20,12 @@ amdgpu_plugin.so: $(SRC_DIR)/amdgpu/amdgpu_plugin.so cuda_plugin.so: $(SRC_DIR)/cuda/cuda_plugin.so $(Q) cp $< $@ +inventory_test_enabled_plugin.so: inventory_test_enabled_plugin.c + $(Q) $(CC) $(PLUGIN_CFLAGS) $< -o $@ $(PLUGIN_INCLUDE) + +inventory_test_disabled_plugin.so: inventory_test_disabled_plugin.c + $(Q) $(CC) $(PLUGIN_CFLAGS) $< -o $@ $(PLUGIN_INCLUDE) + clean: $(Q) $(RM) $(PLUGIN_TARGETS) diff --git a/test/plugins/inventory_test_disabled_plugin.c b/test/plugins/inventory_test_disabled_plugin.c new file mode 100644 index 0000000000..468fe924b1 --- /dev/null +++ b/test/plugins/inventory_test_disabled_plugin.c @@ -0,0 +1,17 @@ +#include "criu-plugin.h" +#include "image.h" + +int inventory_test_disabled_plugin_init(int stage) +{ + if (stage == CR_PLUGIN_STAGE__RESTORE) + return check_and_remove_inventory_plugin(CR_PLUGIN_DESC.name, strlen(CR_PLUGIN_DESC.name)); + + return 0; +} + +void inventory_test_disabled_plugin_fini(int stage, int ret) +{ + return; +} + +CR_PLUGIN_REGISTER("inventory_test_disabled_plugin", inventory_test_disabled_plugin_init, inventory_test_disabled_plugin_fini) \ No newline at end of file diff --git a/test/plugins/inventory_test_enabled_plugin.c b/test/plugins/inventory_test_enabled_plugin.c new file mode 100644 index 0000000000..89e684e2ac --- /dev/null +++ b/test/plugins/inventory_test_enabled_plugin.c @@ -0,0 +1,17 @@ +#include "criu-plugin.h" +#include "image.h" + +int inventory_test_enabled_plugin_init(int stage) +{ + if (stage == CR_PLUGIN_STAGE__RESTORE) + return !check_and_remove_inventory_plugin(CR_PLUGIN_DESC.name, strlen(CR_PLUGIN_DESC.name)); + + return add_inventory_plugin(CR_PLUGIN_DESC.name); +} + +void inventory_test_enabled_plugin_fini(int stage, int ret) +{ + return; +} + +CR_PLUGIN_REGISTER("inventory_test_enabled_plugin", inventory_test_enabled_plugin_init, inventory_test_enabled_plugin_fini) \ No newline at end of file diff --git a/test/zdtm.py b/test/zdtm.py index 6b2132cc30..37ebe63b7b 
100755 --- a/test/zdtm.py +++ b/test/zdtm.py @@ -2877,7 +2877,7 @@ def get_cli_args(): rp.add_argument("--preload-libfault", action="store_true", help="Run criu with library preload to simulate special cases") rp.add_argument("--criu-plugin", help="Run tests with CRIU plugin", - choices=['amdgpu', 'cuda'], + choices=['amdgpu', 'cuda', 'inventory_test_enabled', 'inventory_test_disabled'], nargs='+', default=None) rp.add_argument("--mocked-cuda-checkpoint", From d8be857b4b5bf43f19f957ca86f518ad7c3ff12a Mon Sep 17 00:00:00 2001 From: Bhavik Sachdev Date: Wed, 9 Oct 2024 09:50:28 +0100 Subject: [PATCH 016/198] pidfd: block SIGCHLD during tmp process creation This patch blocks SIGCHLD during temporary process creation to prevent a race condition between kill() and waitpid() where sigchld_handler() causes `criu restore` to fail with an error. Fixes: #2490 Signed-off-by: Bhavik Sachdev Signed-off-by: Radostin Stoyanov --- criu/pidfd.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/criu/pidfd.c b/criu/pidfd.c index fdf5dec60e..3ea3c93094 100644 --- a/criu/pidfd.c +++ b/criu/pidfd.c @@ -145,6 +145,20 @@ static int create_tmp_process(void) static int free_dead_pidfd(struct dead_pidfd *dead) { int status; + sigset_t blockmask, oldmask; + + /* + * Block SIGCHLD to prevent interfering from sigchld_handler() + * and to properly handle the tmp process termination without + * a race condition. A similar approach is used in cr_system(). 
+ */ + sigemptyset(&oldmask); + sigemptyset(&blockmask); + sigaddset(&blockmask, SIGCHLD); + if (sigprocmask(SIG_BLOCK, &blockmask, &oldmask) == -1) { + pr_perror("Cannot set mask of blocked signals"); + goto err; + } if (kill(dead->pid, SIGKILL) < 0) { pr_perror("Could not kill temporary process with pid: %d", @@ -158,6 +172,12 @@ static int free_dead_pidfd(struct dead_pidfd *dead) goto err; } + /* Restore the original signal mask after tmp process has terminated */ + if (sigprocmask(SIG_SETMASK, &oldmask, NULL) == -1) { + pr_perror("Cannot clear blocked signals"); + goto err; + } + if (!WIFSIGNALED(status)) { pr_err("Expected temporary process to be terminated by a signal\n"); goto err; From 0d2d23b6d099e8f451ee6c78b4e3e1654b0a26a3 Mon Sep 17 00:00:00 2001 From: Haorong Lu Date: Tue, 1 Aug 2023 11:49:50 -0700 Subject: [PATCH 017/198] include: add common header files for riscv64 Co-authored-by: Yixue Zhao Co-authored-by: stove Signed-off-by: Haorong Lu --- - rebased - imported a page_size() type fix (authored by Cryolitia PukNgae) Signed-off-by: PukNgae Cryolitia Signed-off-by: Alexander Mikhalitsyn --- include/common/arch/riscv64/asm/atomic.h | 109 ++++++++++++++++++ include/common/arch/riscv64/asm/bitops.h | 50 ++++++++ include/common/arch/riscv64/asm/bitsperlong.h | 6 + include/common/arch/riscv64/asm/linkage.h | 23 ++++ include/common/arch/riscv64/asm/page.h | 44 +++++++ 5 files changed, 232 insertions(+) create mode 100644 include/common/arch/riscv64/asm/atomic.h create mode 100644 include/common/arch/riscv64/asm/bitops.h create mode 100644 include/common/arch/riscv64/asm/bitsperlong.h create mode 100644 include/common/arch/riscv64/asm/linkage.h create mode 100644 include/common/arch/riscv64/asm/page.h diff --git a/include/common/arch/riscv64/asm/atomic.h b/include/common/arch/riscv64/asm/atomic.h new file mode 100644 index 0000000000..4b08bd9fdb --- /dev/null +++ b/include/common/arch/riscv64/asm/atomic.h @@ -0,0 +1,109 @@ +#ifndef __CR_ATOMIC_H__ +#define 
__CR_ATOMIC_H__ + +typedef struct { + int counter; +} atomic_t; + +/* Copied from the Linux header arch/riscv/include/asm/barrier.h */ + +#define nop() __asm__ __volatile__("nop") + +#define RISCV_FENCE(p, s) __asm__ __volatile__("fence " #p "," #s : : : "memory") + +/* These barriers need to enforce ordering on both devices or memory. */ +#define mb() RISCV_FENCE(iorw, iorw) +#define rmb() RISCV_FENCE(ir, ir) +#define wmb() RISCV_FENCE(ow, ow) + +/* These barriers do not need to enforce ordering on devices, just memory. */ +#define __smp_mb() RISCV_FENCE(rw, rw) +#define __smp_rmb() RISCV_FENCE(r, r) +#define __smp_wmb() RISCV_FENCE(w, w) + +#define __smp_store_release(p, v) \ + do { \ + compiletime_assert_atomic_type(*p); \ + RISCV_FENCE(rw, w); \ + WRITE_ONCE(*p, v); \ + } while (0) + +#define __smp_load_acquire(p) \ + ({ \ + typeof(*p) ___p1 = READ_ONCE(*p); \ + compiletime_assert_atomic_type(*p); \ + RISCV_FENCE(r, rw); \ + ___p1; \ + }) + +/* Copied from the Linux kernel header arch/riscv/include/asm/atomic.h */ + +static inline int atomic_read(const atomic_t *v) +{ + return (*(volatile int *)&(v)->counter); +} + +static inline void atomic_set(atomic_t *v, int i) +{ + v->counter = i; +} + +#define atomic_get atomic_read + +static inline int atomic_add_return(int i, atomic_t *v) +{ + int result; + + asm volatile("amoadd.w.aqrl %1, %2, %0" : "+A"(v->counter), "=r"(result) : "r"(i) : "memory"); + __smp_mb(); + return result + i; +} + +static inline int atomic_sub_return(int i, atomic_t *v) +{ + return atomic_add_return(-i, v); +} + +static inline int atomic_inc(atomic_t *v) +{ + return atomic_add_return(1, v) - 1; +} + +static inline int atomic_add(int val, atomic_t *v) +{ + return atomic_add_return(val, v) - val; +} + +static inline int atomic_dec(atomic_t *v) +{ + return atomic_sub_return(1, v) + 1; +} + +/* true if the result is 0, or false for all other cases. 
*/ +#define atomic_dec_and_test(v) (atomic_sub_return(1, v) == 0) +#define atomic_dec_return(v) (atomic_sub_return(1, v)) + +#define atomic_inc_return(v) (atomic_add_return(1, v)) + +static inline int atomic_cmpxchg(atomic_t *ptr, int old, int new) +{ + unsigned long tmp; + int oldval; + + __smp_mb(); + + asm volatile("1:\n" + " lr.w %1, %2\n" + " bne %1, %3, 2f\n" + " sc.w %0, %4, %2\n" + " bnez %0, 1b\n" + "2:" + : "=&r"(tmp), "=&r"(oldval), "+A"(ptr->counter) + : "r"(old), "r"(new) + : "memory"); + + __smp_mb(); + return oldval; +} + +#endif /* __CR_ATOMIC_H__ */ diff --git a/include/common/arch/riscv64/asm/bitops.h b/include/common/arch/riscv64/asm/bitops.h new file mode 100644 index 0000000000..400cc3e155 --- /dev/null +++ b/include/common/arch/riscv64/asm/bitops.h @@ -0,0 +1,50 @@ +#ifndef __CR_ASM_BITOPS_H__ +#define __CR_ASM_BITOPS_H__ + +#include "common/compiler.h" +#include "common/asm-generic/bitops.h" + +#define BITS_PER_LONG 64 + +#define BIT_MASK(nr) ((1##UL) << ((nr) % BITS_PER_LONG)) +#define BIT_WORD(nr) ((nr) / BITS_PER_LONG) + +#define __AMO(op) "amo" #op ".d" + +#define __test_and_op_bit_ord(op, mod, nr, addr, ord) \ + ({ \ + unsigned long __res, __mask; \ + __mask = BIT_MASK(nr); \ + __asm__ __volatile__(__AMO(op) #ord " %0, %2, %1" \ + : "=r"(__res), "+A"(addr[BIT_WORD(nr)]) \ + : "r"(mod(__mask)) \ + : "memory"); \ + ((__res & __mask) != 0); \ + }) + +#define __op_bit_ord(op, mod, nr, addr, ord) \ + __asm__ __volatile__(__AMO(op) #ord " zero, %1, %0" \ + : "+A"(addr[BIT_WORD(nr)]) \ + : "r"(mod(BIT_MASK(nr))) \ + : "memory"); + +#define __test_and_op_bit(op, mod, nr, addr) __test_and_op_bit_ord(op, mod, nr, addr, .aqrl) +#define __op_bit(op, mod, nr, addr) __op_bit_ord(op, mod, nr, addr, ) + +/* Bitmask modifiers */ +#define __NOP(x) (x) +#define __NOT(x) (~(x)) + +/** + * test_and_set_bit - Set a bit and return its old value + * @nr: Bit to set + * @addr: Address to count from + * + * This operation may be reordered on other architectures 
than x86. + */ +static inline int test_and_set_bit(int nr, volatile unsigned long *addr) +{ + return __test_and_op_bit(or, __NOP, nr, addr); +} + +#endif /* __CR_ASM_BITOPS_H__ */ diff --git a/include/common/arch/riscv64/asm/bitsperlong.h b/include/common/arch/riscv64/asm/bitsperlong.h new file mode 100644 index 0000000000..d95727d193 --- /dev/null +++ b/include/common/arch/riscv64/asm/bitsperlong.h @@ -0,0 +1,6 @@ +#ifndef __CR_BITSPERLONG_H__ +#define __CR_BITSPERLONG_H__ + +#define BITS_PER_LONG 64 + +#endif /* __CR_BITSPERLONG_H__ */ diff --git a/include/common/arch/riscv64/asm/linkage.h b/include/common/arch/riscv64/asm/linkage.h new file mode 100644 index 0000000000..c6d40f7508 --- /dev/null +++ b/include/common/arch/riscv64/asm/linkage.h @@ -0,0 +1,23 @@ +#ifndef __CR_LINKAGE_H__ +#define __CR_LINKAGE_H__ + +#ifdef __ASSEMBLY__ + +#define __ALIGN .align 4, 0x00 +#define __ALIGN_STR ".align 4, 0x00" + +#define GLOBAL(name) \ + .globl name; \ +name: + +#define ENTRY(name) \ + .globl name; \ + .type name, @function; \ + __ALIGN; \ +name: + +#define END(sym) .size sym, .- sym + +#endif /* __ASSEMBLY__ */ + +#endif /* __CR_LINKAGE_H__ */ diff --git a/include/common/arch/riscv64/asm/page.h b/include/common/arch/riscv64/asm/page.h new file mode 100644 index 0000000000..5113cb6db6 --- /dev/null +++ b/include/common/arch/riscv64/asm/page.h @@ -0,0 +1,44 @@ +#ifndef __CR_ASM_PAGE_H__ +#define __CR_ASM_PAGE_H__ + +#define ARCH_HAS_LONG_PAGES + +#ifndef CR_NOGLIBC +#include /* ffsl() */ +#include /* _SC_PAGESIZE */ + +extern unsigned __page_size; +extern unsigned __page_shift; + +static inline unsigned page_size(void) +{ + if (!__page_size) + __page_size = sysconf(_SC_PAGESIZE); + return __page_size; +} + +static inline unsigned page_shift(void) +{ + if (!__page_shift) + __page_shift = (ffsl(page_size()) - 1); + return __page_shift; +} + +/* + * Don't add ifdefs for PAGE_SIZE: if any header defines it as a constant + * on aarch64, then we need refrain using PAGE_SIZE in 
criu and use + * page_size() across sources (as it may differ on aarch64). + */ +#define PAGE_SIZE page_size() +#define PAGE_MASK (~(PAGE_SIZE - 1)) +#define PAGE_SHIFT page_shift() + +#define PAGE_PFN(addr) ((addr) / PAGE_SIZE) + +#else /* CR_NOGLIBC */ + +extern unsigned long page_size(void); +#define PAGE_SIZE page_size() + +#endif /* CR_NOGLIBC */ +#endif /* __CR_ASM_PAGE_H__ */ From 7fd95a509dd4e3d2759f58399c96ee33d97f1e24 Mon Sep 17 00:00:00 2001 From: Haorong Lu Date: Tue, 1 Aug 2023 11:58:26 -0700 Subject: [PATCH 018/198] compel: add riscv64 support Co-authored-by: Yixue Zhao Co-authored-by: stove Signed-off-by: Haorong Lu --- - rebased - added a membarrier() to syscall table (fix authored by Cryolitia PukNgae) Signed-off-by: PukNgae Cryolitia Signed-off-by: Alexander Mikhalitsyn --- Makefile | 6 +- compel/Makefile | 4 +- .../riscv64/plugins/include/asm/prologue.h | 35 +++ .../plugins/include/asm/syscall-types.h | 28 +++ .../arch/riscv64/plugins/include/features.h | 4 + .../arch/riscv64/plugins/std/parasite-head.S | 7 + .../plugins/std/syscalls/Makefile.syscalls | 59 +++++ .../plugins/std/syscalls/gen-sys-exec-tbl.pl | 43 ++++ .../plugins/std/syscalls/gen-syscalls.pl | 99 ++++++++ .../plugins/std/syscalls/syscall-aux.S | 37 +++ .../plugins/std/syscalls/syscall-aux.h | 3 + .../plugins/std/syscalls/syscall-common.S | 17 ++ .../riscv64/plugins/std/syscalls/syscall.def | 125 ++++++++++ .../riscv64/plugins/std/syscalls/syscalls.S | 112 +++++++++ compel/arch/riscv64/scripts/compel-pack.lds.S | 32 +++ compel/arch/riscv64/src/lib/cpu.c | 78 ++++++ compel/arch/riscv64/src/lib/handle-elf-host.c | 1 + compel/arch/riscv64/src/lib/handle-elf.c | 32 +++ compel/arch/riscv64/src/lib/include/cpu.h | 0 .../arch/riscv64/src/lib/include/handle-elf.h | 12 + compel/arch/riscv64/src/lib/include/syscall.h | 8 + .../src/lib/include/uapi/asm/breakpoints.h | 15 ++ .../riscv64/src/lib/include/uapi/asm/cpu.h | 7 + .../riscv64/src/lib/include/uapi/asm/fpu.h | 4 + 
.../src/lib/include/uapi/asm/infect-types.h | 52 ++++ .../include/uapi/asm/instruction_formats.h | 26 ++ .../lib/include/uapi/asm/processor-flags.h | 4 + .../src/lib/include/uapi/asm/sigframe.h | 68 ++++++ compel/arch/riscv64/src/lib/infect.c | 222 ++++++++++++++++++ compel/src/main.c | 3 + scripts/nmk/scripts/include.mk | 1 + 31 files changed, 1141 insertions(+), 3 deletions(-) create mode 100644 compel/arch/riscv64/plugins/include/asm/prologue.h create mode 100644 compel/arch/riscv64/plugins/include/asm/syscall-types.h create mode 100644 compel/arch/riscv64/plugins/include/features.h create mode 100644 compel/arch/riscv64/plugins/std/parasite-head.S create mode 100644 compel/arch/riscv64/plugins/std/syscalls/Makefile.syscalls create mode 100755 compel/arch/riscv64/plugins/std/syscalls/gen-sys-exec-tbl.pl create mode 100755 compel/arch/riscv64/plugins/std/syscalls/gen-syscalls.pl create mode 100644 compel/arch/riscv64/plugins/std/syscalls/syscall-aux.S create mode 100644 compel/arch/riscv64/plugins/std/syscalls/syscall-aux.h create mode 100644 compel/arch/riscv64/plugins/std/syscalls/syscall-common.S create mode 100644 compel/arch/riscv64/plugins/std/syscalls/syscall.def create mode 100644 compel/arch/riscv64/plugins/std/syscalls/syscalls.S create mode 100644 compel/arch/riscv64/scripts/compel-pack.lds.S create mode 100644 compel/arch/riscv64/src/lib/cpu.c create mode 120000 compel/arch/riscv64/src/lib/handle-elf-host.c create mode 100644 compel/arch/riscv64/src/lib/handle-elf.c create mode 100644 compel/arch/riscv64/src/lib/include/cpu.h create mode 100644 compel/arch/riscv64/src/lib/include/handle-elf.h create mode 100644 compel/arch/riscv64/src/lib/include/syscall.h create mode 100644 compel/arch/riscv64/src/lib/include/uapi/asm/breakpoints.h create mode 100644 compel/arch/riscv64/src/lib/include/uapi/asm/cpu.h create mode 100644 compel/arch/riscv64/src/lib/include/uapi/asm/fpu.h create mode 100644 compel/arch/riscv64/src/lib/include/uapi/asm/infect-types.h 
create mode 100644 compel/arch/riscv64/src/lib/include/uapi/asm/instruction_formats.h create mode 100644 compel/arch/riscv64/src/lib/include/uapi/asm/processor-flags.h create mode 100644 compel/arch/riscv64/src/lib/include/uapi/asm/sigframe.h create mode 100644 compel/arch/riscv64/src/lib/infect.c diff --git a/Makefile b/Makefile index 46d9adef32..60b78a0749 100644 --- a/Makefile +++ b/Makefile @@ -19,7 +19,7 @@ endif # # Supported Architectures -ifneq ($(filter-out x86 arm aarch64 ppc64 s390 mips loongarch64,$(ARCH)),) +ifneq ($(filter-out x86 arm aarch64 ppc64 s390 mips loongarch64 riscv64,$(ARCH)),) $(error "The architecture $(ARCH) isn't supported") endif @@ -84,6 +84,10 @@ ifeq ($(ARCH),loongarch64) DEFINES := -DCONFIG_LOONGARCH64 endif +ifeq ($(ARCH),riscv64) + DEFINES := -DCONFIG_RISCV64 +endif + # # CFLAGS_PIE: # diff --git a/compel/Makefile b/compel/Makefile index 78ec4826af..c0b8a82a07 100644 --- a/compel/Makefile +++ b/compel/Makefile @@ -32,8 +32,8 @@ ifeq ($(ARCH),x86) lib-y += arch/$(ARCH)/src/lib/thread_area.o endif -# handle_elf() has no support of ELF relocations on ARM (yet?) -ifneq ($(filter arm aarch64 loongarch64,$(ARCH)),) +# handle_elf() has no support of ELF relocations on ARM and RISCV64 (yet?) 
+ifneq ($(filter arm aarch64 loongarch64 riscv64,$(ARCH)),) CFLAGS += -DNO_RELOCS HOSTCFLAGS += -DNO_RELOCS endif diff --git a/compel/arch/riscv64/plugins/include/asm/prologue.h b/compel/arch/riscv64/plugins/include/asm/prologue.h new file mode 100644 index 0000000000..5c22b7b062 --- /dev/null +++ b/compel/arch/riscv64/plugins/include/asm/prologue.h @@ -0,0 +1,35 @@ +#ifndef __ASM_PROLOGUE_H__ +#define __ASM_PROLOGUE_H__ + +#ifndef __ASSEMBLY__ + +#include +#include +#include + +#include + +#define sys_recv(sockfd, ubuf, size, flags) sys_recvfrom(sockfd, ubuf, size, flags, NULL, NULL) + +typedef struct prologue_init_args { + struct sockaddr_un ctl_sock_addr; + unsigned int ctl_sock_addr_len; + + unsigned int arg_s; + void *arg_p; + + void *sigframe; +} prologue_init_args_t; + +#endif /* __ASSEMBLY__ */ + +/* + * Reserve enough space for sigframe. + * + * FIXME: This size should rather be taken from the sigframe header. + */ +#define PROLOGUE_SGFRAME_SIZE 4096 + +#define PROLOGUE_INIT_ARGS_SIZE 1024 + +#endif /* __ASM_PROLOGUE_H__ */ \ No newline at end of file diff --git a/compel/arch/riscv64/plugins/include/asm/syscall-types.h b/compel/arch/riscv64/plugins/include/asm/syscall-types.h new file mode 100644 index 0000000000..b9740a9ee5 --- /dev/null +++ b/compel/arch/riscv64/plugins/include/asm/syscall-types.h @@ -0,0 +1,28 @@ +#ifndef COMPEL_ARCH_SYSCALL_TYPES_H__ +#define COMPEL_ARCH_SYSCALL_TYPES_H__ + +#define SA_RESTORER 0x04000000 + +typedef void rt_signalfn_t(int, siginfo_t *, void *); +typedef rt_signalfn_t *rt_sighandler_t; + +typedef void rt_restorefn_t(void); +typedef rt_restorefn_t *rt_sigrestore_t; + +#define _KNSIG 64 // number of signals +#define _NSIG_BPW 64 // number of signals per word + +#define _KNSIG_WORDS (_KNSIG / _NSIG_BPW) + +typedef struct { + unsigned long sig[_KNSIG_WORDS]; +} k_rtsigset_t; + +typedef struct { + rt_sighandler_t rt_sa_handler; + unsigned long rt_sa_flags; + rt_sigrestore_t rt_sa_restorer; + k_rtsigset_t rt_sa_mask; +}
rt_sigaction_t; + +#endif /* COMPEL_ARCH_SYSCALL_TYPES_H__ */ \ No newline at end of file diff --git a/compel/arch/riscv64/plugins/include/features.h b/compel/arch/riscv64/plugins/include/features.h new file mode 100644 index 0000000000..274cee52a3 --- /dev/null +++ b/compel/arch/riscv64/plugins/include/features.h @@ -0,0 +1,4 @@ +#ifndef __COMPEL_ARCH_FEATURES_H +#define __COMPEL_ARCH_FEATURES_H + +#endif /* __COMPEL_ARCH_FEATURES_H */ \ No newline at end of file diff --git a/compel/arch/riscv64/plugins/std/parasite-head.S b/compel/arch/riscv64/plugins/std/parasite-head.S new file mode 100644 index 0000000000..3e9d272e39 --- /dev/null +++ b/compel/arch/riscv64/plugins/std/parasite-head.S @@ -0,0 +1,7 @@ +#include "common/asm/linkage.h" + + .section .head.text, "ax" +ENTRY(__export_parasite_head_start) + jal parasite_service + ebreak +END(__export_parasite_head_start) \ No newline at end of file diff --git a/compel/arch/riscv64/plugins/std/syscalls/Makefile.syscalls b/compel/arch/riscv64/plugins/std/syscalls/Makefile.syscalls new file mode 100644 index 0000000000..5af35bcb40 --- /dev/null +++ b/compel/arch/riscv64/plugins/std/syscalls/Makefile.syscalls @@ -0,0 +1,59 @@ +ccflags-y += -iquote $(PLUGIN_ARCH_DIR)/std/syscalls/ +asflags-y += -iquote $(PLUGIN_ARCH_DIR)/std/syscalls/ + +sys-types := $(obj)/include/uapi/std/syscall-types.h +sys-codes := $(obj)/include/uapi/std/syscall-codes.h +sys-proto := $(obj)/include/uapi/std/syscall.h + +sys-def := $(PLUGIN_ARCH_DIR)/std/syscalls/syscall.def +sys-asm-common-name := std/syscalls/syscall-common.S +sys-asm-common := $(PLUGIN_ARCH_DIR)/$(sys-asm-common-name) +sys-asm-types := $(obj)/include/uapi/std/asm/syscall-types.h +sys-exec-tbl = $(PLUGIN_ARCH_DIR)/std/sys-exec-tbl.c + +sys-gen := $(PLUGIN_ARCH_DIR)/std/syscalls/gen-syscalls.pl +sys-gen-tbl := $(PLUGIN_ARCH_DIR)/std/syscalls/gen-sys-exec-tbl.pl + +sys-asm := ./$(PLUGIN_ARCH_DIR)/std/syscalls/syscalls.S +std-lib-y += $(sys-asm:.S=).o + +ifeq ($(ARCH),arm) +arch_bits 
:= 32 +else +arch_bits := 64 +endif + +sys-exec-tbl := sys-exec-tbl.c + +$(sys-asm) $(sys-types) $(sys-codes) $(sys-proto): $(sys-gen) $(sys-def) $(sys-asm-common) $(sys-asm-types) + $(E) " GEN " $@ + $(Q) perl \ + $(sys-gen) \ + $(sys-def) \ + $(sys-codes) \ + $(sys-proto) \ + $(sys-asm) \ + $(sys-asm-common-name) \ + $(sys-types) \ + $(arch_bits) + +$(sys-asm:.S=).o: $(sys-asm) + +$(sys-exec-tbl): $(sys-gen-tbl) $(sys-def) + $(E) " GEN " $@ + $(Q) perl \ + $(sys-gen-tbl) \ + $(sys-def) \ + $(sys-exec-tbl) \ + $(arch_bits) + +$(sys-asm-types): $(PLUGIN_ARCH_DIR)/include/asm/syscall-types.h + $(call msg-gen, $@) + $(Q) ln -s ../../../../../../$(PLUGIN_ARCH_DIR)/include/asm/syscall-types.h $(sys-asm-types) + $(Q) ln -s ../../../../../$(PLUGIN_ARCH_DIR)/std/syscalls/syscall-aux.S $(obj)/include/uapi/std/syscall-aux.S + $(Q) ln -s ../../../../../$(PLUGIN_ARCH_DIR)/std/syscalls/syscall-aux.h $(obj)/include/uapi/std/syscall-aux.h + +std-headers-deps += $(sys-asm) $(sys-codes) $(sys-proto) $(sys-asm-types) $(sys-codes) +mrproper-y += $(std-headers-deps) +mrproper-y += $(obj)/include/uapi/std/syscall-aux.S +mrproper-y += $(obj)/include/uapi/std/syscall-aux.h \ No newline at end of file diff --git a/compel/arch/riscv64/plugins/std/syscalls/gen-sys-exec-tbl.pl b/compel/arch/riscv64/plugins/std/syscalls/gen-sys-exec-tbl.pl new file mode 100755 index 0000000000..61a807eb60 --- /dev/null +++ b/compel/arch/riscv64/plugins/std/syscalls/gen-sys-exec-tbl.pl @@ -0,0 +1,43 @@ +#!/usr/bin/perl + +use strict; +use warnings; + +my $in = $ARGV[0]; +my $tblout = $ARGV[1]; +my $bits = $ARGV[2]; + +my $code = "code$bits"; + +open TBLOUT, ">", $tblout or die $!; +open IN, "<", $in or die $!; + +print TBLOUT "/* Autogenerated, don't edit */\n"; +print TBLOUT "static struct syscall_exec_desc sc_exec_table[] = {\n"; + +for () { + if ($_ =~ /\#/) { + next; + } + + my $sys_name; + my $sys_num; + + if (/(?\S+)\s+(?\S+)\s+(?\d+|\!)\s+(?(?:\d+|\!))\s+\((?.+)\)/) { + $sys_name = $+{alias}; + } elsif 
(/(?\S+)\s+(?\d+|\!)\s+(?(?:\d+|\!))\s+\((?.+)\)/) { + $sys_name = $+{name}; + } else { + unlink $tblout; + die "Invalid syscall definition file: invalid entry $_\n"; + } + + $sys_num = $+{$code}; + + if ($sys_num ne "!") { + print TBLOUT "SYSCALL($sys_name, $sys_num)\n"; + } +} + +print TBLOUT " { }, /* terminator */"; +print TBLOUT "};" \ No newline at end of file diff --git a/compel/arch/riscv64/plugins/std/syscalls/gen-syscalls.pl b/compel/arch/riscv64/plugins/std/syscalls/gen-syscalls.pl new file mode 100755 index 0000000000..a53f1962f1 --- /dev/null +++ b/compel/arch/riscv64/plugins/std/syscalls/gen-syscalls.pl @@ -0,0 +1,99 @@ +#!/usr/bin/perl + +use strict; +use warnings; + +my $in = $ARGV[0]; +my $codesout = $ARGV[1]; +my $codes = $ARGV[1]; +$codes =~ s/.*include\/uapi\//compel\/plugins\//g; +my $protosout = $ARGV[2]; +my $protos = $ARGV[2]; +$protos =~ s/.*include\/uapi\//compel\/plugins\//g; +my $asmout = $ARGV[3]; +my $asmcommon = $ARGV[4]; +my $prototypes = $ARGV[5]; +$prototypes =~ s/.*include\/uapi\//compel\/plugins\//g; +my $bits = $ARGV[6]; + +my $codesdef = $codes; +$codesdef =~ tr/.\-\//_/; +my $protosdef = $protos; +$protosdef =~ tr/.\-\//_/; +my $code = "code$bits"; +my $need_aux = 0; + +unlink $codesout; +unlink $protosout; +unlink $asmout; + +open CODESOUT, ">", $codesout or die $!; +open PROTOSOUT, ">", $protosout or die $!; +open ASMOUT, ">", $asmout or die $!; +open IN, "<", $in or die $!; + +print CODESOUT <<"END"; +/* Autogenerated, don't edit */ +#ifndef $codesdef +#define $codesdef +END + +print PROTOSOUT <<"END"; +/* Autogenerated, don't edit */ +#ifndef $protosdef +#define $protosdef +#include <$prototypes> +#include <$codes> +END + +print ASMOUT <<"END"; +/* Autogenerated, don't edit */ +#include <$codes> +#include "$asmcommon" +END + + +for () { + if ($_ =~ /\#/) { + next; + } + + my $code_macro; + my $sys_macro; + my $sys_name; + + if (/(?\S+)\s+(?\S+)\s+(?\d+|\!)\s+(?(?:\d+|\!))\s+\((?.+)\)/) { + $code_macro = "__NR_$+{name}"; + 
$sys_macro = "SYS_$+{name}"; + $sys_name = "sys_$+{alias}"; + } elsif (/(?\S+)\s+(?\d+|\!)\s+(?(?:\d+|\!))\s+\((?.+)\)/) { + $code_macro = "__NR_$+{name}"; + $sys_macro = "SYS_$+{name}"; + $sys_name = "sys_$+{name}"; + } else { + unlink $codesout; + unlink $protosout; + unlink $asmout; + + die "Invalid syscall definition file: invalid entry $_\n"; + } + + if ($+{$code} ne "!") { + print CODESOUT "#ifndef $code_macro\n#define $code_macro $+{$code}\n#endif\n"; + print CODESOUT "#ifndef $sys_macro\n#define $sys_macro $code_macro\n#endif\n"; + print ASMOUT "syscall $sys_name, $code_macro\n"; + + } else { + $need_aux = 1; + } + + print PROTOSOUT "extern long $sys_name($+{args});\n"; +} + +if ($need_aux == 1) { + print ASMOUT "#include \n"; + print CODESOUT "#include \n"; +} + +print CODESOUT "#endif /* $codesdef */"; +print PROTOSOUT "#endif /* $protosdef */"; \ No newline at end of file diff --git a/compel/arch/riscv64/plugins/std/syscalls/syscall-aux.S b/compel/arch/riscv64/plugins/std/syscalls/syscall-aux.S new file mode 100644 index 0000000000..04160b7ac1 --- /dev/null +++ b/compel/arch/riscv64/plugins/std/syscalls/syscall-aux.S @@ -0,0 +1,37 @@ +/** + * This source contains emulation of syscalls + * that are not implemented in the riscv64 Linux kernel + */ + +ENTRY(sys_open) + add a3, x0, a2 + add a2, x0, a1 + add a1, x0, a0 + addi a0, x0, -100 + j sys_openat +END(sys_open) + + +ENTRY(sys_mkdir) + add a3,x0, a2 + add a2, x0, a1 + add a1, x0, a0 + addi a0, x0, -100 + j sys_mkdirat +END(sys_mkdir) + + +ENTRY(sys_rmdir) + addi a2, x0, 0x200 // flags = AT_REMOVEDIR + add a1, x0, a0 + addi a0, x0, -100 + j sys_unlinkat +END(sys_rmdir) + + +ENTRY(sys_unlink) + addi a2, x0, 0 // flags = 0 + add a1, x0, a0 + addi a0, x0, -100 + j sys_unlinkat +END(sys_unlink) \ No newline at end of file diff --git a/compel/arch/riscv64/plugins/std/syscalls/syscall-aux.h b/compel/arch/riscv64/plugins/std/syscalls/syscall-aux.h new file mode 100644 index 0000000000..881765bbba --- /dev/null 
+++ b/compel/arch/riscv64/plugins/std/syscalls/syscall-aux.h @@ -0,0 +1,3 @@ +#ifndef __NR_openat +#define __NR_openat 56 +#endif \ No newline at end of file diff --git a/compel/arch/riscv64/plugins/std/syscalls/syscall-common.S b/compel/arch/riscv64/plugins/std/syscalls/syscall-common.S new file mode 100644 index 0000000000..fdef3b47a8 --- /dev/null +++ b/compel/arch/riscv64/plugins/std/syscalls/syscall-common.S @@ -0,0 +1,17 @@ +#include "common/asm/linkage.h" + +syscall_common: + ecall + ret + +.macro syscall name, nr + ENTRY(\name) + li a7, \nr + j syscall_common + END(\name) +.endm + +ENTRY(__cr_restore_rt) + li a7, __NR_rt_sigreturn + ecall +END(__cr_restore_rt) \ No newline at end of file diff --git a/compel/arch/riscv64/plugins/std/syscalls/syscall.def b/compel/arch/riscv64/plugins/std/syscalls/syscall.def new file mode 100644 index 0000000000..17f763e903 --- /dev/null +++ b/compel/arch/riscv64/plugins/std/syscalls/syscall.def @@ -0,0 +1,125 @@ +# +# System calls table, please make sure the table consists of only the syscalls +# really used somewhere in the project. +# +# The template is (name and arguments are optional if you need only __NR_x +# defined, but no real entry point in syscalls lib). +# +# name/alias code64 code32 arguments +# ----------------------------------------------------------------------- +# +read 63 3 (int fd, void *buf, unsigned long count) +write 64 4 (int fd, const void *buf, unsigned long count) +open ! 5 (const char *filename, unsigned long flags, unsigned long mode) +close 57 6 (int fd) +lseek 62 19 (int fd, unsigned long offset, unsigned long origin) +mmap 222 ! 
(void *addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long offset) +mprotect 226 125 (const void *addr, unsigned long len, unsigned long prot) +munmap 215 91 (void *addr, unsigned long len) +brk 214 45 (void *addr) +rt_sigaction sigaction 134 174 (int signum, const rt_sigaction_t *act, rt_sigaction_t *oldact, size_t sigsetsize) +rt_sigprocmask sigprocmask 135 175 (int how, k_rtsigset_t *set, k_rtsigset_t *old, size_t sigsetsize) +rt_sigreturn 139 173 (void) +ioctl 29 54 (unsigned int fd, unsigned int cmd, unsigned long arg) +pread64 67 180 (unsigned int fd, char *buf, size_t count, loff_t pos) +ptrace 117 26 (long request, pid_t pid, void *addr, void *data) +mremap 216 163 (unsigned long addr, unsigned long old_len, unsigned long new_len, unsigned long flag, unsigned long new_addr) +mincore 232 219 (void *addr, unsigned long size, unsigned char *vec) +madvise 233 220 (unsigned long start, size_t len, int behavior) +shmat 196 305 (int shmid, void *shmaddr, int shmflag) +pause 1061 29 (void) +nanosleep 101 162 (struct timespec *req, struct timespec *rem) +getitimer 102 105 (int which, const struct itimerval *val) +setitimer 103 104 (int which, const struct itimerval *val, struct itimerval *old) +getpid 172 20 (void) +socket 198 281 (int domain, int type, int protocol) +connect 203 283 (int sockfd, struct sockaddr *addr, int addrlen) +sendto 206 290 (int sockfd, void *buff, size_t len, unsigned int flags, struct sockaddr *addr, int addr_len) +recvfrom 207 292 (int sockfd, void *ubuf, size_t size, unsigned int flags, struct sockaddr *addr, int *addr_len) +sendmsg 211 296 (int sockfd, const struct msghdr *msg, int flags) +recvmsg 212 297 (int sockfd, struct msghdr *msg, int flags) +shutdown 210 293 (int sockfd, int how) +bind 235 282 (int sockfd, const struct sockaddr *addr, int addrlen) +setsockopt 208 294 (int sockfd, int level, int optname, const void *optval, socklen_t optlen) +getsockopt 209 295 (int sockfd, int level, 
int optname, const void *optval, socklen_t *optlen) +clone 220 120 (unsigned long flags, void *child_stack, void *parent_tid, unsigned long newtls, void *child_tid) +exit 93 1 (unsigned long error_code) +wait4 260 114 (int pid, int *status, int options, struct rusage *ru) +waitid 95 280 (int which, pid_t pid, struct siginfo *infop, int options, struct rusage *ru) +kill 129 37 (long pid, int sig) +fcntl 25 55 (int fd, int type, long arg) +flock 32 143 (int fd, unsigned long cmd) +mkdir ! 39 (const char *name, int mode) +rmdir ! 40 (const char *name) +unlink ! 10 (char *pathname) +readlinkat 78 332 (int fd, const char *path, char *buf, int bufsize) +umask 166 60 (int mask) +getgroups 158 205 (int gsize, unsigned int *groups) +setgroups 159 206 (int gsize, unsigned int *groups) +setresuid 147 164 (int uid, int euid, int suid) +getresuid 148 165 (int *uid, int *euid, int *suid) +setresgid 149 170 (int gid, int egid, int sgid) +getresgid 150 171 (int *gid, int *egid, int *sgid) +getpgid 155 132 (pid_t pid) +setfsuid 151 138 (int fsuid) +setfsgid 152 139 (int fsgid) +getsid 156 147 (void) +capget 90 184 (struct cap_header *h, struct cap_data *d) +capset 91 185 (struct cap_header *h, struct cap_data *d) +rt_sigqueueinfo 138 178 (pid_t pid, int sig, siginfo_t *info) +setpriority 140 97 (int which, int who, int nice) +sched_setscheduler 119 156 (int pid, int policy, struct sched_param *p) +sigaltstack 132 186 (const void *uss, void *uoss) +personality 92 136 (unsigned int personality) +prctl 167 172 (int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5) +arch_prctl ! 
17 (int option, unsigned long addr) +setrlimit 164 75 (int resource, struct krlimit *rlim) +mount 40 21 (char *dev_nmae, char *dir_name, char *type, unsigned long flags, void *data) +umount2 39 52 (char *name, int flags) +gettid 178 224 (void) +futex 98 240 (uint32_t *uaddr, int op, uint32_t val, struct timespec *utime, uint32_t *uaddr2, uint32_t val3) +set_tid_address 96 256 (int *tid_addr) +restart_syscall 128 0 (void) +timer_create 107 257 (clockid_t which_clock, struct sigevent *timer_event_spec, kernel_timer_t *created_timer_id) +timer_settime 110 258 (kernel_timer_t timer_id, int flags, const struct itimerspec *new_setting, struct itimerspec *old_setting) +timer_gettime 108 259 (int timer_id, const struct itimerspec *setting) +timer_getoverrun 109 260 (int timer_id) +timer_delete 111 261 (kernel_timer_t timer_id) +clock_gettime 113 263 (const clockid_t which_clock, const struct timespec *tp) +exit_group 94 248 (int error_code) +set_robust_list 99 338 (struct robust_list_head *head, size_t len) +get_robust_list 100 339 (int pid, struct robust_list_head **head_ptr, size_t *len_ptr) +signalfd4 74 355 (int fd, k_rtsigset_t *mask, size_t sizemask, int flags) +rt_tgsigqueueinfo 240 363 (pid_t tgid, pid_t pid, int sig, siginfo_t *info) +vmsplice 75 343 (int fd, const struct iovec *iov, unsigned long nr_segs, unsigned int flags) +timerfd_settime 86 353 (int ufd, int flags, const struct itimerspec *utmr, struct itimerspec *otmr) +fanotify_init 262 367 (unsigned int flags, unsigned int event_f_flags) +fanotify_mark 263 368 (int fanotify_fd, unsigned int flags, uint64_t mask, int dfd, const char *pathname) +open_by_handle_at 265 371 (int mountdirfd, struct file_handle *handle, int flags) +setns 268 375 (int fd, int nstype) +kcmp 272 378 (pid_t pid1, pid_t pid2, int type, unsigned long idx1, unsigned long idx2) +openat 56 322 (int dirfd, const char *pathname, int flags, mode_t mode) +mkdirat 34 323 (int dirfd, const char *pathname, mode_t mode) +unlinkat 35 328 (int 
dirfd, const char *pathname, int flags) +memfd_create 279 385 (const char *name, unsigned int flags) +io_setup 0 243 (unsigned nr_events, aio_context_t *ctx) +io_submit 2 246 (aio_context_t ctx_id, long nr, struct iocb **iocbpp) +io_getevents 4 245 (aio_context_t ctx, long min_nr, long nr, struct io_event *evs, struct timespec *tmo) +seccomp 277 383 (unsigned int op, unsigned int flags, const char *uargs) +gettimeofday 169 78 (struct timeval *tv, struct timezone *tz) +preadv_raw 69 361 (int fd, struct iovec *iov, unsigned long nr, unsigned long pos_l, unsigned long pos_h) +userfaultfd 282 388 (int flags) +fallocate 47 352 (int fd, int mode, loff_t offset, loff_t len) +cacheflush ! 983042 (void *start, void *end, int flags) +ppoll 73 336 (struct pollfd *fds, unsigned int nfds, const struct timespec *tmo, const sigset_t *sigmask, size_t sigsetsize) +fsopen 430 430 (char *fsname, unsigned int flags) +fsconfig 431 431 (int fd, unsigned int cmd, const char *key, const char *value, int aux) +fsmount 432 432 (int fd, unsigned int flags, unsigned int attr_flags) +clone3 435 435 (struct clone_args *uargs, size_t size) +pidfd_open 434 434 (pid_t pid, unsigned int flags) +pidfd_getfd 438 438 (int pidfd, int targetfd, unsigned int flags) +rseq 293 293 (void *rseq, uint32_t rseq_len, int flags, uint32_t sig) +move_mount 429 429 (int from_dfd, const char *from_pathname, int to_dfd, const char *to_pathname, int flags) +open_tree 428 428 (int dirfd, const char *pathname, unsigned int flags) +openat2 437 437 (int dirfd, char *pathname, struct open_how *how, size_t size) +membarrier 283 283 (int cmd, unsigned int flags, int cpu_id) diff --git a/compel/arch/riscv64/plugins/std/syscalls/syscalls.S b/compel/arch/riscv64/plugins/std/syscalls/syscalls.S new file mode 100644 index 0000000000..715da46122 --- /dev/null +++ b/compel/arch/riscv64/plugins/std/syscalls/syscalls.S @@ -0,0 +1,112 @@ +/* Autogenerated, don't edit */ +#include +#include "std/syscalls/syscall-common.S" +syscall 
sys_read, __NR_read +syscall sys_write, __NR_write +syscall sys_close, __NR_close +syscall sys_lseek, __NR_lseek +syscall sys_mmap, __NR_mmap +syscall sys_mprotect, __NR_mprotect +syscall sys_munmap, __NR_munmap +syscall sys_brk, __NR_brk +syscall sys_sigaction, __NR_rt_sigaction +syscall sys_sigprocmask, __NR_rt_sigprocmask +syscall sys_rt_sigreturn, __NR_rt_sigreturn +syscall sys_ioctl, __NR_ioctl +syscall sys_pread64, __NR_pread64 +syscall sys_ptrace, __NR_ptrace +syscall sys_mremap, __NR_mremap +syscall sys_mincore, __NR_mincore +syscall sys_madvise, __NR_madvise +syscall sys_shmat, __NR_shmat +syscall sys_pause, __NR_pause +syscall sys_nanosleep, __NR_nanosleep +syscall sys_getitimer, __NR_getitimer +syscall sys_setitimer, __NR_setitimer +syscall sys_getpid, __NR_getpid +syscall sys_socket, __NR_socket +syscall sys_connect, __NR_connect +syscall sys_sendto, __NR_sendto +syscall sys_recvfrom, __NR_recvfrom +syscall sys_sendmsg, __NR_sendmsg +syscall sys_recvmsg, __NR_recvmsg +syscall sys_shutdown, __NR_shutdown +syscall sys_bind, __NR_bind +syscall sys_setsockopt, __NR_setsockopt +syscall sys_getsockopt, __NR_getsockopt +syscall sys_clone, __NR_clone +syscall sys_exit, __NR_exit +syscall sys_wait4, __NR_wait4 +syscall sys_waitid, __NR_waitid +syscall sys_kill, __NR_kill +syscall sys_fcntl, __NR_fcntl +syscall sys_flock, __NR_flock +syscall sys_readlinkat, __NR_readlinkat +syscall sys_umask, __NR_umask +syscall sys_getgroups, __NR_getgroups +syscall sys_setgroups, __NR_setgroups +syscall sys_setresuid, __NR_setresuid +syscall sys_getresuid, __NR_getresuid +syscall sys_setresgid, __NR_setresgid +syscall sys_getresgid, __NR_getresgid +syscall sys_getpgid, __NR_getpgid +syscall sys_setfsuid, __NR_setfsuid +syscall sys_setfsgid, __NR_setfsgid +syscall sys_getsid, __NR_getsid +syscall sys_capget, __NR_capget +syscall sys_capset, __NR_capset +syscall sys_rt_sigqueueinfo, __NR_rt_sigqueueinfo +syscall sys_setpriority, __NR_setpriority +syscall sys_sched_setscheduler, 
__NR_sched_setscheduler +syscall sys_sigaltstack, __NR_sigaltstack +syscall sys_personality, __NR_personality +syscall sys_prctl, __NR_prctl +syscall sys_setrlimit, __NR_setrlimit +syscall sys_mount, __NR_mount +syscall sys_umount2, __NR_umount2 +syscall sys_gettid, __NR_gettid +syscall sys_futex, __NR_futex +syscall sys_set_tid_address, __NR_set_tid_address +syscall sys_restart_syscall, __NR_restart_syscall +syscall sys_timer_create, __NR_timer_create +syscall sys_timer_settime, __NR_timer_settime +syscall sys_timer_gettime, __NR_timer_gettime +syscall sys_timer_getoverrun, __NR_timer_getoverrun +syscall sys_timer_delete, __NR_timer_delete +syscall sys_clock_gettime, __NR_clock_gettime +syscall sys_exit_group, __NR_exit_group +syscall sys_set_robust_list, __NR_set_robust_list +syscall sys_get_robust_list, __NR_get_robust_list +syscall sys_signalfd4, __NR_signalfd4 +syscall sys_rt_tgsigqueueinfo, __NR_rt_tgsigqueueinfo +syscall sys_vmsplice, __NR_vmsplice +syscall sys_timerfd_settime, __NR_timerfd_settime +syscall sys_fanotify_init, __NR_fanotify_init +syscall sys_fanotify_mark, __NR_fanotify_mark +syscall sys_open_by_handle_at, __NR_open_by_handle_at +syscall sys_setns, __NR_setns +syscall sys_kcmp, __NR_kcmp +syscall sys_openat, __NR_openat +syscall sys_mkdirat, __NR_mkdirat +syscall sys_unlinkat, __NR_unlinkat +syscall sys_memfd_create, __NR_memfd_create +syscall sys_io_setup, __NR_io_setup +syscall sys_io_submit, __NR_io_submit +syscall sys_io_getevents, __NR_io_getevents +syscall sys_seccomp, __NR_seccomp +syscall sys_gettimeofday, __NR_gettimeofday +syscall sys_preadv_raw, __NR_preadv_raw +syscall sys_userfaultfd, __NR_userfaultfd +syscall sys_fallocate, __NR_fallocate +syscall sys_ppoll, __NR_ppoll +syscall sys_fsopen, __NR_fsopen +syscall sys_fsconfig, __NR_fsconfig +syscall sys_fsmount, __NR_fsmount +syscall sys_clone3, __NR_clone3 +syscall sys_pidfd_open, __NR_pidfd_open +syscall sys_pidfd_getfd, __NR_pidfd_getfd +syscall sys_rseq, __NR_rseq +syscall 
sys_move_mount, __NR_move_mount +syscall sys_open_tree, __NR_open_tree +syscall sys_openat2, __NR_openat2 +#include diff --git a/compel/arch/riscv64/scripts/compel-pack.lds.S b/compel/arch/riscv64/scripts/compel-pack.lds.S new file mode 100644 index 0000000000..a61235b443 --- /dev/null +++ b/compel/arch/riscv64/scripts/compel-pack.lds.S @@ -0,0 +1,32 @@ +OUTPUT_ARCH(riscv) +EXTERN(__export_parasite_head_start) + +SECTIONS +{ + .crblob 0x0 : { + *(.head.text) + ASSERT(DEFINED(__export_parasite_head_start), + "Symbol __export_parasite_head_start is missing"); + *(.text*) + . = ALIGN(32); + *(.data*) + . = ALIGN(32); + *(.rodata*) + . = ALIGN(32); + *(.bss*) + . = ALIGN(32); + *(.got*) + . = ALIGN(32); + *(.toc*) + . = ALIGN(32); + } =0x00000000, + + /DISCARD/ : { + *(.debug*) + *(.comment*) + *(.note*) + *(.group*) + *(.eh_frame*) + *(*) + } +} \ No newline at end of file diff --git a/compel/arch/riscv64/src/lib/cpu.c b/compel/arch/riscv64/src/lib/cpu.c new file mode 100644 index 0000000000..9a0291f701 --- /dev/null +++ b/compel/arch/riscv64/src/lib/cpu.c @@ -0,0 +1,78 @@ +#include +#include + +#include "compel-cpu.h" + +#include "common/bitops.h" + +#include "log.h" + +#undef LOG_PREFIX +#define LOG_PREFIX "cpu: " + +static compel_cpuinfo_t rt_info; + +static void fetch_rt_cpuinfo(void) +{ + static bool rt_info_done = false; + + if (!rt_info_done) { + compel_cpuid(&rt_info); + rt_info_done = true; + } +} + +void compel_set_cpu_cap(compel_cpuinfo_t *info, unsigned int feature) +{ +} +void compel_clear_cpu_cap(compel_cpuinfo_t *info, unsigned int feature) +{ +} +int compel_test_cpu_cap(compel_cpuinfo_t *info, unsigned int feature) +{ + return 0; +} +int compel_test_fpu_cap(compel_cpuinfo_t *info, unsigned int feature) +{ + return 0; +} +int compel_cpuid(compel_cpuinfo_t *info) +{ + return 0; +} + +bool compel_cpu_has_feature(unsigned int feature) +{ + fetch_rt_cpuinfo(); + return compel_test_cpu_cap(&rt_info, feature); +} + +bool compel_fpu_has_feature(unsigned int 
feature) +{ + fetch_rt_cpuinfo(); + return compel_test_fpu_cap(&rt_info, feature); +} + +uint32_t compel_fpu_feature_size(unsigned int feature) +{ + fetch_rt_cpuinfo(); + return 0; +} + +uint32_t compel_fpu_feature_offset(unsigned int feature) +{ + fetch_rt_cpuinfo(); + return 0; +} + +void compel_cpu_clear_feature(unsigned int feature) +{ + fetch_rt_cpuinfo(); + return compel_clear_cpu_cap(&rt_info, feature); +} + +void compel_cpu_copy_cpuinfo(compel_cpuinfo_t *c) +{ + fetch_rt_cpuinfo(); + memcpy(c, &rt_info, sizeof(rt_info)); +} \ No newline at end of file diff --git a/compel/arch/riscv64/src/lib/handle-elf-host.c b/compel/arch/riscv64/src/lib/handle-elf-host.c new file mode 120000 index 0000000000..fe4611886d --- /dev/null +++ b/compel/arch/riscv64/src/lib/handle-elf-host.c @@ -0,0 +1 @@ +handle-elf.c \ No newline at end of file diff --git a/compel/arch/riscv64/src/lib/handle-elf.c b/compel/arch/riscv64/src/lib/handle-elf.c new file mode 100644 index 0000000000..22420bc782 --- /dev/null +++ b/compel/arch/riscv64/src/lib/handle-elf.c @@ -0,0 +1,32 @@ +#include +#include + +#include "handle-elf.h" +#include "piegen.h" +#include "log.h" + +static const unsigned char __maybe_unused elf_ident_64_le[EI_NIDENT] = { + 0x7f, 0x45, 0x4c, 0x46, 0x02, 0x01, 0x01, 0x00, /* clang-format */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, +}; + +static const unsigned char __maybe_unused elf_ident_64_be[EI_NIDENT] = { + 0x7f, 0x45, 0x4c, 0x46, 0x02, 0x02, 0x01, 0x00, /* clang-format */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, +}; + +int handle_binary(void *mem, size_t size) +{ + const unsigned char *elf_ident = +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + elf_ident_64_le; +#else + elf_ident_64_be; +#endif + + if (memcmp(mem, elf_ident, sizeof(elf_ident_64_le)) == 0) + return handle_elf_riscv64(mem, size); + + pr_err("Unsupported Elf format detected\n"); + return -EINVAL; +} \ No newline at end of file diff --git a/compel/arch/riscv64/src/lib/include/cpu.h 
b/compel/arch/riscv64/src/lib/include/cpu.h new file mode 100644 index 0000000000..e69de29bb2 diff --git a/compel/arch/riscv64/src/lib/include/handle-elf.h b/compel/arch/riscv64/src/lib/include/handle-elf.h new file mode 100644 index 0000000000..5827705833 --- /dev/null +++ b/compel/arch/riscv64/src/lib/include/handle-elf.h @@ -0,0 +1,12 @@ +#ifndef COMPEL_HANDLE_ELF_H__ +#define COMPEL_HANDLE_ELF_H__ + +#include "elf64-types.h" + +#define __handle_elf handle_elf_riscv64 +#define ELF_RISCV +#define arch_is_machine_supported(e_machine) (e_machine == EM_RISCV) + +extern int handle_elf_riscv64(void *mem, size_t size); + +#endif /* COMPEL_HANDLE_ELF_H__ */ \ No newline at end of file diff --git a/compel/arch/riscv64/src/lib/include/syscall.h b/compel/arch/riscv64/src/lib/include/syscall.h new file mode 100644 index 0000000000..53f10525d9 --- /dev/null +++ b/compel/arch/riscv64/src/lib/include/syscall.h @@ -0,0 +1,8 @@ +#ifndef __COMPEL_SYSCALL_H__ +#define __COMPEL_SYSCALL_H__ +#define __NR(syscall, compat) \ + ({ \ + (void)compat; \ + __NR_##syscall; \ + }) +#endif \ No newline at end of file diff --git a/compel/arch/riscv64/src/lib/include/uapi/asm/breakpoints.h b/compel/arch/riscv64/src/lib/include/uapi/asm/breakpoints.h new file mode 100644 index 0000000000..f2ba799cbc --- /dev/null +++ b/compel/arch/riscv64/src/lib/include/uapi/asm/breakpoints.h @@ -0,0 +1,15 @@ +#ifndef __COMPEL_BREAKPOINTS_H__ +#define __COMPEL_BREAKPOINTS_H__ +#define ARCH_SI_TRAP TRAP_BRKPT + +static inline int ptrace_set_breakpoint(pid_t pid, void *addr) +{ + return 0; +} + +static inline int ptrace_flush_breakpoints(pid_t pid) +{ + return 0; +} + +#endif \ No newline at end of file diff --git a/compel/arch/riscv64/src/lib/include/uapi/asm/cpu.h b/compel/arch/riscv64/src/lib/include/uapi/asm/cpu.h new file mode 100644 index 0000000000..ac58567e38 --- /dev/null +++ b/compel/arch/riscv64/src/lib/include/uapi/asm/cpu.h @@ -0,0 +1,7 @@ +#ifndef UAPI_COMPEL_ASM_CPU_H__ +#define 
UAPI_COMPEL_ASM_CPU_H__ + +typedef struct { +} compel_cpuinfo_t; + +#endif /* UAPI_COMPEL_ASM_CPU_H__ */ \ No newline at end of file diff --git a/compel/arch/riscv64/src/lib/include/uapi/asm/fpu.h b/compel/arch/riscv64/src/lib/include/uapi/asm/fpu.h new file mode 100644 index 0000000000..a74decc231 --- /dev/null +++ b/compel/arch/riscv64/src/lib/include/uapi/asm/fpu.h @@ -0,0 +1,4 @@ +#ifndef __CR_ASM_FPU_H__ +#define __CR_ASM_FPU_H__ + +#endif /* __CR_ASM_FPU_H__ */ \ No newline at end of file diff --git a/compel/arch/riscv64/src/lib/include/uapi/asm/infect-types.h b/compel/arch/riscv64/src/lib/include/uapi/asm/infect-types.h new file mode 100644 index 0000000000..192810cac0 --- /dev/null +++ b/compel/arch/riscv64/src/lib/include/uapi/asm/infect-types.h @@ -0,0 +1,52 @@ +#ifndef UAPI_COMPEL_ASM_TYPES_H__ +#define UAPI_COMPEL_ASM_TYPES_H__ + +#include +#include +#include +#include + +#define SIGMAX 64 +#define SIGMAX_OLD 31 + +/* + * Copied from the Linux kernel header arch/riscv/include/uapi/asm/ptrace.h + * + * A thread RISC-V CPU context + */ +typedef struct user_regs_struct user_regs_struct_t; +typedef struct __riscv_d_ext_state user_fpregs_struct_t; + +#define __compel_arch_fetch_thread_area(tid, th) 0 +#define compel_arch_fetch_thread_area(tctl) 0 +#define compel_arch_get_tls_task(ctl, tls) +#define compel_arch_get_tls_thread(tctl, tls) + +#define REG_RES(registers) ((uint64_t)(registers).a0) +#define REG_IP(registers) ((uint64_t)(registers).pc) +#define SET_REG_IP(registers, val) ((registers).pc = (val)) + +/* + * REG_SP is also defined in riscv64-linux-gnu/include/sys/ucontext.h + * with a different meaning, and it's not used in CRIU. So we have to + * undefine it here. 
+ */ +#ifdef REG_SP +#undef REG_SP +#endif + +#define REG_SP(registers) ((uint64_t)((registers).sp)) + +#define REG_SYSCALL_NR(registers) ((uint64_t)(registers).a7) + +#define user_regs_native(pregs) true + +#define ARCH_SI_TRAP TRAP_BRKPT + +#define __NR(syscall, compat) \ + ({ \ + (void)compat; \ + __NR_##syscall; \ + }) + +#endif /* UAPI_COMPEL_ASM_TYPES_H__ */ \ No newline at end of file diff --git a/compel/arch/riscv64/src/lib/include/uapi/asm/instruction_formats.h b/compel/arch/riscv64/src/lib/include/uapi/asm/instruction_formats.h new file mode 100644 index 0000000000..e231d0465b --- /dev/null +++ b/compel/arch/riscv64/src/lib/include/uapi/asm/instruction_formats.h @@ -0,0 +1,26 @@ +#ifndef COMPEL_RELOCATIONS_H__ +#define COMPEL_RELOCATIONS_H__ + +#include + +static inline uint32_t riscv_b_imm(uint32_t val) +{ + return (val & 0x00001000) << 19 | (val & 0x000007e0) << 20 | (val & 0x0000001e) << 7 | (val & 0x00000800) >> 4; +} + +static inline uint32_t riscv_i_imm(uint32_t val) +{ + return val << 20; +} + +static inline uint32_t riscv_u_imm(uint32_t val) +{ + return val & 0xfffff000; +} + +static inline uint32_t riscv_j_imm(uint32_t val) +{ + return (val & 0x00100000) << 11 | (val & 0x000007fe) << 20 | (val & 0x00000800) << 9 | (val & 0x000ff000); +} + +#endif /* COMPEL_RELOCATIONS_H__ */ \ No newline at end of file diff --git a/compel/arch/riscv64/src/lib/include/uapi/asm/processor-flags.h b/compel/arch/riscv64/src/lib/include/uapi/asm/processor-flags.h new file mode 100644 index 0000000000..e40fb6fce7 --- /dev/null +++ b/compel/arch/riscv64/src/lib/include/uapi/asm/processor-flags.h @@ -0,0 +1,4 @@ +#ifndef UAPI_COMPEL_ASM_PROCESSOR_FLAGS_H__ +#define UAPI_COMPEL_ASM_PROCESSOR_FLAGS_H__ + +#endif /* UAPI_COMPEL_ASM_PROCESSOR_FLAGS_H__ */ \ No newline at end of file diff --git a/compel/arch/riscv64/src/lib/include/uapi/asm/sigframe.h b/compel/arch/riscv64/src/lib/include/uapi/asm/sigframe.h new file mode 100644 index 0000000000..761a08f62c --- /dev/null +++ 
b/compel/arch/riscv64/src/lib/include/uapi/asm/sigframe.h @@ -0,0 +1,68 @@ +#ifndef UAPI_COMPEL_ASM_SIGFRAME_H__ +#define UAPI_COMPEL_ASM_SIGFRAME_H__ + +#include + +#include + +#include + +/* Copied from the kernel header arch/riscv/include/uapi/asm/sigcontext.h */ +/* + * Signal context structure + * + * This contains the context saved before a signal handler is invoked; + * it is restored by sys_sigreturn / sys_rt_sigreturn. + */ +// struct sigcontext { +// struct user_regs_struct sc_regs; +// union __riscv_fp_state sc_fpregs; +// /* +// * 4K + 128 reserved for vector state and future expansion. +// * This space is enough to store the vector context whose VLENB +// * is less or equal to 128. +// * (The size of the vector context is 4144 byte as VLENB is 128) +// */ +// __u8 __reserved[4224] __attribute__((__aligned__(16))); +// }; + +#define rt_sigcontext sigcontext + +#include + +/* Copied from the kernel source arch/riscv/kernel/signal.c */ +struct rt_sigframe { + siginfo_t info; + ucontext_t uc; //ucontext_t structure holds the user context, e.g., the signal mask, GP regs +}; + +/* + generates inline assembly code for triggering the rt_sigreturn system call. + used to return from a signal handler back to the normal execution flow of the process. 
+*/ +/* clang-format off */ +#define ARCH_RT_SIGRETURN(new_sp, rt_sigframe) \ + asm volatile( \ + "mv sp, %0\n" \ + "li a7, "__stringify(__NR_rt_sigreturn)" \n" \ + "ecall\n" \ + : \ + : "r"(new_sp) \ + : "a7", "memory") +/* clang-format on */ + +#define RT_SIGFRAME_UC(rt_sigframe) (&rt_sigframe->uc) +#define RT_SIGFRAME_REGIP(rt_sigframe) ((long unsigned int)(rt_sigframe)->uc.uc_mcontext.__gregs[REG_PC]) +#define RT_SIGFRAME_HAS_FPU(rt_sigframe) 1 +#define RT_SIGFRAME_OFFSET(rt_sigframe) 0 + +// #define RT_SIGFRAME_SIGCONTEXT(rt_sigframe) ((struct cr_sigcontext *)&(rt_sigframe)->uc.uc_mcontext) +// #define RT_SIGFRAME_AUX_CONTEXT(rt_sigframe) ((struct sigcontext *)&(RT_SIGFRAME_SIGCONTEXT(rt_sigframe)->__reserved)) +// #define RT_SIGFRAME_FPU(rt_sigframe) (&RT_SIGFRAME_AUX_CONTEXT(rt_sigframe)->fpsimd) + +#define rt_sigframe_erase_sigset(sigframe) \ + memset(&sigframe->uc.uc_sigmask, 0, sizeof(k_rtsigset_t)) // erase the signal mask +#define rt_sigframe_copy_sigset(sigframe, from) \ + memcpy(&sigframe->uc.uc_sigmask, from, sizeof(k_rtsigset_t)) // copy the signal mask + +#endif /* UAPI_COMPEL_ASM_SIGFRAME_H__ */ \ No newline at end of file diff --git a/compel/arch/riscv64/src/lib/infect.c b/compel/arch/riscv64/src/lib/infect.c new file mode 100644 index 0000000000..01395a205a --- /dev/null +++ b/compel/arch/riscv64/src/lib/infect.c @@ -0,0 +1,222 @@ +#include +#include +#include +#include +#include +#include +#include "common/page.h" +#include "uapi/compel/asm/infect-types.h" +#include "log.h" +#include "errno.h" +#include "infect.h" +#include "infect-priv.h" + +unsigned __page_size = 0; +unsigned __page_shift = 0; + +/* + * Injected syscall instruction + */ +const char code_syscall[] = { + 0x73, 0x00, 0x00, 0x00, /* ecall */ + 0x73, 0x00, 0x10, 0x00 /* ebreak */ +}; + +static const int code_syscall_aligned = round_up(sizeof(code_syscall), sizeof(long)); + +static inline void __always_unused __check_code_syscall(void) +{ + BUILD_BUG_ON(code_syscall_aligned != 
BUILTIN_SYSCALL_SIZE); + BUILD_BUG_ON(!is_log2(sizeof(code_syscall))); +} + +int sigreturn_prep_regs_plain(struct rt_sigframe *sigframe, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs) +{ + sigframe->uc.uc_mcontext.__gregs[0] = regs->pc; + sigframe->uc.uc_mcontext.__gregs[1] = regs->ra; + sigframe->uc.uc_mcontext.__gregs[2] = regs->sp; + sigframe->uc.uc_mcontext.__gregs[3] = regs->gp; + sigframe->uc.uc_mcontext.__gregs[4] = regs->tp; + sigframe->uc.uc_mcontext.__gregs[5] = regs->t0; + sigframe->uc.uc_mcontext.__gregs[6] = regs->t1; + sigframe->uc.uc_mcontext.__gregs[7] = regs->t2; + sigframe->uc.uc_mcontext.__gregs[8] = regs->s0; + sigframe->uc.uc_mcontext.__gregs[9] = regs->s1; + sigframe->uc.uc_mcontext.__gregs[10] = regs->a0; + sigframe->uc.uc_mcontext.__gregs[11] = regs->a1; + sigframe->uc.uc_mcontext.__gregs[12] = regs->a2; + sigframe->uc.uc_mcontext.__gregs[13] = regs->a3; + sigframe->uc.uc_mcontext.__gregs[14] = regs->a4; + sigframe->uc.uc_mcontext.__gregs[15] = regs->a5; + sigframe->uc.uc_mcontext.__gregs[16] = regs->a6; + sigframe->uc.uc_mcontext.__gregs[17] = regs->a7; + sigframe->uc.uc_mcontext.__gregs[18] = regs->s2; + sigframe->uc.uc_mcontext.__gregs[19] = regs->s3; + sigframe->uc.uc_mcontext.__gregs[20] = regs->s4; + sigframe->uc.uc_mcontext.__gregs[21] = regs->s5; + sigframe->uc.uc_mcontext.__gregs[22] = regs->s6; + sigframe->uc.uc_mcontext.__gregs[23] = regs->s7; + sigframe->uc.uc_mcontext.__gregs[24] = regs->s8; + sigframe->uc.uc_mcontext.__gregs[25] = regs->s9; + sigframe->uc.uc_mcontext.__gregs[26] = regs->s10; + sigframe->uc.uc_mcontext.__gregs[27] = regs->s11; + sigframe->uc.uc_mcontext.__gregs[28] = regs->t3; + sigframe->uc.uc_mcontext.__gregs[29] = regs->t4; + sigframe->uc.uc_mcontext.__gregs[30] = regs->t5; + sigframe->uc.uc_mcontext.__gregs[31] = regs->t6; + + memcpy(sigframe->uc.uc_mcontext.__fpregs.__d.__f, fpregs->f, sizeof(fpregs->f)); + sigframe->uc.uc_mcontext.__fpregs.__d.__fcsr = fpregs->fcsr; + + return 0; +} + +int 
sigreturn_prep_fpu_frame_plain(struct rt_sigframe *sigframe, struct rt_sigframe *rsigframe) +{ + return 0; +} + +int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct_t *ext_regs, save_regs_t save, + void *arg, __maybe_unused unsigned long flags) +{ + user_fpregs_struct_t tmp, *fpsimd = ext_regs ? ext_regs : &tmp; + struct iovec iov; + int ret = -1; + + pr_info("Dumping FPU registers for %d\n", pid); + + iov.iov_base = fpsimd; + iov.iov_len = sizeof(*fpsimd); + if ((ret = ptrace(PTRACE_GETREGSET, pid, NT_PRFPREG, &iov))) { + pr_perror("Failed to obtain FPU registers for %d", pid); + return -1; + } + + ret = save(arg, regs, fpsimd); + return ret; +} + +int compel_set_task_ext_regs(pid_t pid, user_fpregs_struct_t *ext_regs) +{ + struct iovec iov; + + pr_info("Restoring GP/FPU registers for %d\n", pid); + + iov.iov_base = ext_regs; + iov.iov_len = sizeof(*ext_regs); + if (ptrace(PTRACE_SETREGSET, pid, NT_PRFPREG, &iov)) { + pr_perror("Failed to set FPU registers for %d", pid); + return -1; + } + return 0; +} + +int compel_syscall(struct parasite_ctl *ctl, int nr, long *ret, unsigned long arg1, unsigned long arg2, + unsigned long arg3, unsigned long arg4, unsigned long arg5, unsigned long arg6) +{ + user_regs_struct_t regs = ctl->orig.regs; + int err; + + regs.a7 = (unsigned long)nr; + regs.a0 = arg1; + regs.a1 = arg2; + regs.a2 = arg3; + regs.a3 = arg4; + regs.a4 = arg5; + regs.a5 = arg6; + regs.a6 = 0; + + err = compel_execute_syscall(ctl, &regs, code_syscall); + + *ret = regs.a0; + return err; +} + +/* + * Calling the mmap system call in the context of the target (victim) process using the compel_syscall function. + * Used during the infection process to allocate memory for the parasite code.
+*/ +void *remote_mmap(struct parasite_ctl *ctl, void *addr, size_t length, int prot, int flags, int fd, off_t offset) +{ + long map; + int err; + + err = compel_syscall(ctl, __NR_mmap, &map, (unsigned long)addr, length, prot, flags, fd, offset); + if (err < 0 || (long)map < 0) + map = 0; + + return (void *)map; +} + +void parasite_setup_regs(unsigned long new_ip, void *stack, user_regs_struct_t *regs) +{ + regs->pc = new_ip; + if (stack) + regs->sp = (unsigned long)stack; +} + +bool arch_can_dump_task(struct parasite_ctl *ctl) +{ + /* + * TODO: Add proper check here. + */ + return true; +} + +/* + * Fetch the signal alternate stack (sigaltstack), + * sas is a separate memory area for the signal handler to run on, + * avoiding potential issues with the main process stack +*/ +int arch_fetch_sas(struct parasite_ctl *ctl, struct rt_sigframe *s) +{ + long ret; + int err; + + err = compel_syscall(ctl, __NR_sigaltstack, &ret, 0, (unsigned long)&s->uc.uc_stack, 0, 0, 0, 0); + return err ? err : ret; +} + +/* + * Task size is the maximum virtual address space size that a process can occupy in the memory + * Refer to linux kernel arch/riscv/include/asm/pgtable.h, + * task size is: + * - 0x9fc00000 (~2.5GB) for RV32. + * - 0x4000000000 ( 256GB) for RV64 using SV39 mmu + * - 0x800000000000 ( 128TB) for RV64 using SV48 mmu + * + * Note that PGDIR_SIZE must evenly divide TASK_SIZE since "RISC-V + * Instruction Set Manual Volume II: Privileged Architecture" states that + * "load and store effective addresses, which are 64bits, must have bits + * 63–48 all equal to bit 47, or else a page-fault exception will occur." 
+*/ +#define TASK_SIZE 0x800000000000UL // hardcoded for SV48 MMU + +unsigned long compel_task_size(void) +{ + return TASK_SIZE; +} + +/* + * Get task registers (overwrites weak function) + */ +int ptrace_get_regs(int pid, user_regs_struct_t *regs) +{ + struct iovec iov; + + iov.iov_base = regs; + iov.iov_len = sizeof(user_regs_struct_t); + return ptrace(PTRACE_GETREGSET, pid, NT_PRSTATUS, &iov); +} + +/* + * Set task registers (overwrites weak function) + */ +int ptrace_set_regs(int pid, user_regs_struct_t *regs) +{ + struct iovec iov; + + iov.iov_base = regs; + iov.iov_len = sizeof(user_regs_struct_t); + return ptrace(PTRACE_SETREGSET, pid, NT_PRSTATUS, &iov); +} diff --git a/compel/src/main.c b/compel/src/main.c index bc16c0ab41..21e06d7dd4 100644 --- a/compel/src/main.c +++ b/compel/src/main.c @@ -60,6 +60,9 @@ static const flags_t flags = { #elif defined CONFIG_LOONGARCH64 .arch = "loongarch64", .cflags = COMPEL_CFLAGS_PIE, +#elif defined CONFIG_RISCV64 + .arch = "riscv64", + .cflags = COMPEL_CFLAGS_PIE, #else #error "CONFIG_ not defined, or unsupported ARCH" #endif diff --git a/scripts/nmk/scripts/include.mk b/scripts/nmk/scripts/include.mk index 55c5be307f..603c322cfa 100644 --- a/scripts/nmk/scripts/include.mk +++ b/scripts/nmk/scripts/include.mk @@ -21,6 +21,7 @@ ARCH ?= $(shell echo $(SUBARCH) | sed \ -e s/mips.*/mips/ \ -e s/sh[234].*/sh/ \ -e s/aarch64.*/aarch64/ \ + -e s/riscv64.*/riscv64/ \ -e s/loongarch64.*/loongarch64/) export SUBARCH ARCH From 1a42f63d30b59ea819cfda6858240a46a9e61007 Mon Sep 17 00:00:00 2001 From: Haorong Lu Date: Tue, 1 Aug 2023 11:59:13 -0700 Subject: [PATCH 019/198] images: add riscv64 core image Co-authored-by: Yixue Zhao Co-authored-by: stove Signed-off-by: Haorong Lu --- images/Makefile | 1 + images/core-riscv64.proto | 53 +++++++++++++++++++++++++++++++++++++++ images/core.proto | 3 +++ 3 files changed, 57 insertions(+) create mode 100644 images/core-riscv64.proto diff --git a/images/Makefile b/images/Makefile index 
855d894da6..1e40b8a8f0 100644 --- a/images/Makefile +++ b/images/Makefile @@ -7,6 +7,7 @@ proto-obj-y += core-arm.o proto-obj-y += core-aarch64.o proto-obj-y += core-ppc64.o proto-obj-y += core-s390.o +proto-obj-y += core-riscv64.o proto-obj-y += cpuinfo.o proto-obj-y += inventory.o proto-obj-y += fdinfo.o diff --git a/images/core-riscv64.proto b/images/core-riscv64.proto new file mode 100644 index 0000000000..1ddfdd8bd8 --- /dev/null +++ b/images/core-riscv64.proto @@ -0,0 +1,53 @@ +// SPDX-License-Identifier: MIT + +syntax = "proto2"; + +import "opts.proto"; + +// Refer to riscv-gnu-toolchain/linux-headers/include/asm/ptrace.h +message user_riscv64_regs_entry { + required uint64 pc = 1; + required uint64 ra = 2; + required uint64 sp = 3; + required uint64 gp = 4; + required uint64 tp = 5; + required uint64 t0 = 6; + required uint64 t1 = 7; + required uint64 t2 = 8; + required uint64 s0 = 9; + required uint64 s1 = 10; + required uint64 a0 = 11; + required uint64 a1 = 12; + required uint64 a2 = 13; + required uint64 a3 = 14; + required uint64 a4 = 15; + required uint64 a5 = 16; + required uint64 a6 = 17; + required uint64 a7 = 18; + required uint64 s2 = 19; + required uint64 s3 = 20; + required uint64 s4 = 21; + required uint64 s5 = 22; + required uint64 s6 = 23; + required uint64 s7 = 24; + required uint64 s8 = 25; + required uint64 s9 = 26; + required uint64 s10 = 27; + required uint64 s11 = 28; + required uint64 t3 = 29; + required uint64 t4 = 30; + required uint64 t5 = 31; + required uint64 t6 = 32; +} + +message user_riscv64_d_ext_entry { + repeated uint64 f = 1; + required uint32 fcsr = 2; +} + +message thread_info_riscv64 { + required uint64 clear_tid_addr = 1[(criu).hex = true]; + required uint64 tls = 2; + required user_riscv64_regs_entry gpregs = 3[(criu).hex = true]; + required user_riscv64_d_ext_entry fpsimd = 4; +} diff --git a/images/core.proto b/images/core.proto index 5b07b5c448..1fa23868be 100644 --- a/images/core.proto +++ b/images/core.proto @@ 
-9,6 +9,7 @@ import "core-ppc64.proto"; import "core-s390.proto"; import "core-mips.proto"; import "core-loongarch64.proto"; +import "core-riscv64.proto"; import "rlimit.proto"; import "timer.proto"; @@ -126,6 +127,7 @@ message core_entry { S390 = 5; MIPS = 6; LOONGARCH64 = 7; + RISCV64 = 8; } required march mtype = 1; @@ -136,6 +138,7 @@ message core_entry { optional thread_info_s390 ti_s390 = 10; optional thread_info_mips ti_mips = 11; optional thread_info_loongarch64 ti_loongarch64 = 12; + optional thread_info_riscv64 ti_riscv64 = 13; optional task_core_entry tc = 3; optional task_kobj_ids_entry ids = 4; From 35b30774fc51e2b15198f2e7bd9d3f8e0118c257 Mon Sep 17 00:00:00 2001 From: Haorong Lu Date: Tue, 1 Aug 2023 12:06:00 -0700 Subject: [PATCH 020/198] criu: add riscv64 support to parasite and restorer Co-authored-by: Yixue Zhao Co-authored-by: stove Signed-off-by: Haorong Lu --- criu/arch/riscv64/Makefile | 8 + criu/arch/riscv64/cpu.c | 40 ++++ criu/arch/riscv64/crtools.c | 171 ++++++++++++++++++ criu/arch/riscv64/include/asm/dump.h | 15 ++ criu/arch/riscv64/include/asm/int.h | 6 + criu/arch/riscv64/include/asm/kerndat.h | 7 + .../riscv64/include/asm/parasite-syscall.h | 6 + criu/arch/riscv64/include/asm/parasite.h | 16 ++ criu/arch/riscv64/include/asm/restore.h | 29 +++ criu/arch/riscv64/include/asm/restorer.h | 150 +++++++++++++++ .../arch/riscv64/include/asm/thread_pointer.h | 27 +++ criu/arch/riscv64/include/asm/types.h | 40 ++++ criu/arch/riscv64/include/asm/vdso.h | 28 +++ criu/arch/riscv64/restorer.c | 14 ++ criu/arch/riscv64/sigframe.c | 8 + criu/arch/riscv64/vdso-lookup.S | 15 ++ criu/arch/riscv64/vdso-pie.c | 159 ++++++++++++++++ criu/pie/Makefile | 8 + criu/pie/Makefile.library | 4 + 19 files changed, 751 insertions(+) create mode 100644 criu/arch/riscv64/Makefile create mode 100644 criu/arch/riscv64/cpu.c create mode 100644 criu/arch/riscv64/crtools.c create mode 100644 criu/arch/riscv64/include/asm/dump.h create mode 100644 
criu/arch/riscv64/include/asm/int.h
 create mode 100644 criu/arch/riscv64/include/asm/kerndat.h
 create mode 100644 criu/arch/riscv64/include/asm/parasite-syscall.h
 create mode 100644 criu/arch/riscv64/include/asm/parasite.h
 create mode 100644 criu/arch/riscv64/include/asm/restore.h
 create mode 100644 criu/arch/riscv64/include/asm/restorer.h
 create mode 100644 criu/arch/riscv64/include/asm/thread_pointer.h
 create mode 100644 criu/arch/riscv64/include/asm/types.h
 create mode 100644 criu/arch/riscv64/include/asm/vdso.h
 create mode 100644 criu/arch/riscv64/restorer.c
 create mode 100644 criu/arch/riscv64/sigframe.c
 create mode 100644 criu/arch/riscv64/vdso-lookup.S
 create mode 100644 criu/arch/riscv64/vdso-pie.c

diff --git a/criu/arch/riscv64/Makefile b/criu/arch/riscv64/Makefile
new file mode 100644
index 0000000000..d198954712
--- /dev/null
+++ b/criu/arch/riscv64/Makefile
@@ -0,0 +1,8 @@
+builtin-name := crtools.built-in.o
+
+ldflags-y += -r
+
+obj-y += cpu.o
+obj-y += crtools.o
+obj-y += sigframe.o
+obj-y += vdso-lookup.o
\ No newline at end of file
diff --git a/criu/arch/riscv64/cpu.c b/criu/arch/riscv64/cpu.c
new file mode 100644
index 0000000000..97a883b8ce
--- /dev/null
+++ b/criu/arch/riscv64/cpu.c
@@ -0,0 +1,40 @@
+#undef LOG_PREFIX
+#define LOG_PREFIX "cpu: "
+
+#include
+#include "cpu.h"
+
+int cpu_init(void)
+{
+	return 0;
+}
+
+int cpu_dump_cpuinfo(void)
+{
+	return 0;
+}
+
+int cpu_validate_cpuinfo(void)
+{
+	return 0;
+}
+
+int cpu_dump_cpuinfo_single(void)
+{
+	return -ENOTSUP;
+}
+
+int cpu_validate_image_cpuinfo_single(void)
+{
+	return -ENOTSUP;
+}
+
+int cpuinfo_dump(void)
+{
+	return -ENOTSUP;
+}
+
+int cpuinfo_check(void)
+{
+	return -ENOTSUP;
+}
diff --git a/criu/arch/riscv64/crtools.c b/criu/arch/riscv64/crtools.c
new file mode 100644
index 0000000000..b2d6d29512
--- /dev/null
+++ b/criu/arch/riscv64/crtools.c
@@ -0,0 +1,171 @@
+#include
+#include
+
+#include
+
+#include "types.h"
+#include
+
+#include
+#include "asm/restorer.h"
+#include "common/compiler.h"
+#include
+#include "asm/dump.h"
+#include "protobuf.h"
+#include "images/core.pb-c.h"
+#include "images/creds.pb-c.h"
+#include "parasite-syscall.h"
+#include "log.h"
+#include "util.h"
+#include "cpu.h"
+#include "restorer.h"
+#include "compel/infect.h"
+
+#define assign_reg(dst, src, e) dst->e = (__typeof__(dst->e))(src)->e
+
+int save_task_regs(void *x, user_regs_struct_t *regs, user_fpregs_struct_t *fpsimd)
+{
+	int i;
+	CoreEntry *core = x;
+
+	// Copy pc and the 31 general-purpose registers (ra..t6) into the image entry
+	assign_reg(core->ti_riscv64->gpregs, regs, pc);
+	assign_reg(core->ti_riscv64->gpregs, regs, ra);
+	assign_reg(core->ti_riscv64->gpregs, regs, sp);
+	assign_reg(core->ti_riscv64->gpregs, regs, gp);
+	assign_reg(core->ti_riscv64->gpregs, regs, tp);
+	assign_reg(core->ti_riscv64->gpregs, regs, t0);
+	assign_reg(core->ti_riscv64->gpregs, regs, t1);
+	assign_reg(core->ti_riscv64->gpregs, regs, t2);
+	assign_reg(core->ti_riscv64->gpregs, regs, s0);
+	assign_reg(core->ti_riscv64->gpregs, regs, s1);
+	assign_reg(core->ti_riscv64->gpregs, regs, a0);
+	assign_reg(core->ti_riscv64->gpregs, regs, a1);
+	assign_reg(core->ti_riscv64->gpregs, regs, a2);
+	assign_reg(core->ti_riscv64->gpregs, regs, a3);
+	assign_reg(core->ti_riscv64->gpregs, regs, a4);
+	assign_reg(core->ti_riscv64->gpregs, regs, a5);
+	assign_reg(core->ti_riscv64->gpregs, regs, a6);
+	assign_reg(core->ti_riscv64->gpregs, regs, a7);
+	assign_reg(core->ti_riscv64->gpregs, regs, s2);
+	assign_reg(core->ti_riscv64->gpregs, regs, s3);
+	assign_reg(core->ti_riscv64->gpregs, regs, s4);
+	assign_reg(core->ti_riscv64->gpregs, regs, s5);
+	assign_reg(core->ti_riscv64->gpregs, regs, s6);
+	assign_reg(core->ti_riscv64->gpregs, regs, s7);
+	assign_reg(core->ti_riscv64->gpregs, regs, s8);
+	assign_reg(core->ti_riscv64->gpregs, regs, s9);
+	assign_reg(core->ti_riscv64->gpregs, regs, s10);
+	assign_reg(core->ti_riscv64->gpregs, regs, s11);
+	assign_reg(core->ti_riscv64->gpregs, regs, t3);
+	assign_reg(core->ti_riscv64->gpregs, regs, t4);
+	assign_reg(core->ti_riscv64->gpregs, regs, t5);
+	assign_reg(core->ti_riscv64->gpregs, regs, t6);
+
+	// Copy the 32 floating-point registers f[0..31] and then fcsr
+	for (i = 0; i < 32; ++i)
+		assign_reg(core->ti_riscv64->fpsimd, fpsimd, f[i]);
+	assign_reg(core->ti_riscv64->fpsimd, fpsimd, fcsr);
+
+	return 0;
+}
+
+int arch_alloc_thread_info(CoreEntry *core)
+{
+	ThreadInfoRiscv64 *ti_riscv64;
+	UserRiscv64RegsEntry *gpregs;
+	UserRiscv64DExtEntry *fpsimd;
+
+	ti_riscv64 = xmalloc(sizeof(*ti_riscv64));
+	if (!ti_riscv64)
+		goto err;
+	thread_info_riscv64__init(ti_riscv64);
+	core->ti_riscv64 = ti_riscv64; /* attached early so arch_free_thread_info() can release it */
+
+	gpregs = xmalloc(sizeof(*gpregs));
+	if (!gpregs)
+		goto err;
+	user_riscv64_regs_entry__init(gpregs);
+
+	ti_riscv64->gpregs = gpregs;
+
+	fpsimd = xmalloc(sizeof(*fpsimd));
+	if (!fpsimd)
+		goto err;
+	user_riscv64_d_ext_entry__init(fpsimd);
+	ti_riscv64->fpsimd = fpsimd;
+	fpsimd->f = xmalloc(32 * sizeof(fpsimd->f[0]));
+	if (!fpsimd->f)
+		goto err;
+	fpsimd->n_f = 32; /* set the element count only once the array exists */
+
+	return 0;
+err:
+	return -1;
+}
+
+void arch_free_thread_info(CoreEntry *core)
+{
+	if (core->ti_riscv64) {
+		if (core->ti_riscv64->fpsimd) {
+			xfree(core->ti_riscv64->fpsimd->f);
+			xfree(core->ti_riscv64->fpsimd);
+		}
+		xfree(core->ti_riscv64->gpregs);
+		xfree(core->ti_riscv64);
+		core->ti_riscv64 = NULL;
+	}
+}
+
+int restore_fpu(struct rt_sigframe *sigframe, CoreEntry *core)
+{
+	int i;
+	UserRiscv64DExtEntry *fpsimd = core->ti_riscv64->fpsimd;
+
+	if (fpsimd->n_f != 32)
+		return 1; /* image does not carry exactly 32 FP registers */
+
+	for (i = 0; i < 32; ++i)
+		sigframe->uc.uc_mcontext.__fpregs.__d.__f[i] = fpsimd->f[i];
+	sigframe->uc.uc_mcontext.__fpregs.__d.__fcsr = fpsimd->fcsr;
+
+	return 0;
+}
+
+int restore_gpregs(struct rt_sigframe *f, UserRiscv64RegsEntry *r)
+{
+	f->uc.uc_mcontext.__gregs[0] = r->pc;
+	f->uc.uc_mcontext.__gregs[1] = r->ra;
+	f->uc.uc_mcontext.__gregs[2] = r->sp;
+	f->uc.uc_mcontext.__gregs[3] = r->gp;
+	f->uc.uc_mcontext.__gregs[4] = r->tp;
+	f->uc.uc_mcontext.__gregs[5] = r->t0;
+	f->uc.uc_mcontext.__gregs[6] = r->t1;
+	f->uc.uc_mcontext.__gregs[7] = r->t2;
+	f->uc.uc_mcontext.__gregs[8] = r->s0;
+	f->uc.uc_mcontext.__gregs[9] = r->s1;
+	f->uc.uc_mcontext.__gregs[10] = r->a0;
+	f->uc.uc_mcontext.__gregs[11] = r->a1;
+	f->uc.uc_mcontext.__gregs[12] = r->a2;
+	f->uc.uc_mcontext.__gregs[13] = r->a3;
+	f->uc.uc_mcontext.__gregs[14] = r->a4;
+	f->uc.uc_mcontext.__gregs[15] = r->a5;
+	f->uc.uc_mcontext.__gregs[16] = r->a6;
+	f->uc.uc_mcontext.__gregs[17] = r->a7;
+	f->uc.uc_mcontext.__gregs[18] = r->s2;
+	f->uc.uc_mcontext.__gregs[19] = r->s3;
+	f->uc.uc_mcontext.__gregs[20] = r->s4;
+	f->uc.uc_mcontext.__gregs[21] = r->s5;
+	f->uc.uc_mcontext.__gregs[22] = r->s6;
+	f->uc.uc_mcontext.__gregs[23] = r->s7;
+	f->uc.uc_mcontext.__gregs[24] = r->s8;
+	f->uc.uc_mcontext.__gregs[25] = r->s9;
+	f->uc.uc_mcontext.__gregs[26] = r->s10;
+	f->uc.uc_mcontext.__gregs[27] = r->s11;
+	f->uc.uc_mcontext.__gregs[28] = r->t3;
+	f->uc.uc_mcontext.__gregs[29] = r->t4;
+	f->uc.uc_mcontext.__gregs[30] = r->t5;
+	f->uc.uc_mcontext.__gregs[31] = r->t6;
+
+	return 0;
+}
diff --git a/criu/arch/riscv64/include/asm/dump.h b/criu/arch/riscv64/include/asm/dump.h
new file mode 100644
index 0000000000..c2988f9bf6
--- /dev/null
+++ b/criu/arch/riscv64/include/asm/dump.h
@@ -0,0 +1,15 @@
+#ifndef __CR_ASM_DUMP_H__
+#define __CR_ASM_DUMP_H__
+
+extern int save_task_regs(void *, user_regs_struct_t *, user_fpregs_struct_t *);
+extern int arch_alloc_thread_info(CoreEntry *core);
+extern void arch_free_thread_info(CoreEntry *core);
+
+static inline void core_put_tls(CoreEntry *core, tls_t tls)
+{
+	core->ti_riscv64->tls = tls;
+}
+
+#define get_task_futex_robust_list_compat(pid, info) -1
+
+#endif
diff --git a/criu/arch/riscv64/include/asm/int.h b/criu/arch/riscv64/include/asm/int.h
new file mode 100644
index 0000000000..642804e9b4
--- /dev/null
+++ b/criu/arch/riscv64/include/asm/int.h
@@ -0,0 +1,6 @@
+#ifndef __CR_ASM_INT_H__
+#define __CR_ASM_INT_H__
+
+#include "asm-generic/int.h"
+
+#endif /* __CR_ASM_INT_H__ */
diff
--git a/criu/arch/riscv64/include/asm/kerndat.h b/criu/arch/riscv64/include/asm/kerndat.h new file mode 100644 index 0000000000..bb70cf6cf5 --- /dev/null +++ b/criu/arch/riscv64/include/asm/kerndat.h @@ -0,0 +1,7 @@ +#ifndef __CR_ASM_KERNDAT_H__ +#define __CR_ASM_KERNDAT_H__ + +#define kdat_compatible_cr() 0 +#define kdat_can_map_vdso() 0 + +#endif /* __CR_ASM_KERNDAT_H__ */ diff --git a/criu/arch/riscv64/include/asm/parasite-syscall.h b/criu/arch/riscv64/include/asm/parasite-syscall.h new file mode 100644 index 0000000000..6008c37923 --- /dev/null +++ b/criu/arch/riscv64/include/asm/parasite-syscall.h @@ -0,0 +1,6 @@ +#ifndef __CR_ASM_PARASITE_SYSCALL_H__ +#define __CR_ASM_PARASITE_SYSCALL_H__ + +struct parasite_ctl; + +#endif diff --git a/criu/arch/riscv64/include/asm/parasite.h b/criu/arch/riscv64/include/asm/parasite.h new file mode 100644 index 0000000000..4798cfd8ab --- /dev/null +++ b/criu/arch/riscv64/include/asm/parasite.h @@ -0,0 +1,16 @@ +#ifndef __ASM_PARASITE_H__ +#define __ASM_PARASITE_H__ + +/* + * This function is used to retrieve the value of the thread pointer (tp) + * in RISC-V architecture, which is typically used for thread-local storage (TLS). + * The value is then stored in the provided tls_t pointer. 
+ */
+static inline void arch_get_tls(tls_t *ptls)
+{
+	tls_t tls;
+	asm("mv %0, tp" : "=r"(tls));
+	*ptls = tls;
+}
+
+#endif
diff --git a/criu/arch/riscv64/include/asm/restore.h b/criu/arch/riscv64/include/asm/restore.h
new file mode 100644
index 0000000000..e4f25a57b6
--- /dev/null
+++ b/criu/arch/riscv64/include/asm/restore.h
@@ -0,0 +1,29 @@
+#ifndef __CR_ASM_RESTORE_H__
+#define __CR_ASM_RESTORE_H__
+
+#include "asm/restorer.h"
+
+#include "images/core.pb-c.h"
+
+/* clang-format off */
+#define JUMP_TO_RESTORER_BLOB(new_sp, restore_task_exec_start,	\
+			      task_args)			\
+	asm volatile(						\
+			"and sp, %0, ~15 \n"			\
+			"mv a0, %2 \n"				\
+			"jr %1 \n"				\
+			:					\
+			: "r"(new_sp),				\
+			  "r"(restore_task_exec_start),		\
+			  "r"(task_args)			\
+			: "a0", "memory")
+/* clang-format on */
+
+static inline void core_get_tls(CoreEntry *pcore, tls_t *ptls)
+{
+	*ptls = pcore->ti_riscv64->tls;
+}
+
+int restore_fpu(struct rt_sigframe *sigframe, CoreEntry *core);
+
+#endif
diff --git a/criu/arch/riscv64/include/asm/restorer.h b/criu/arch/riscv64/include/asm/restorer.h
new file mode 100644
index 0000000000..45fe847a9e
--- /dev/null
+++ b/criu/arch/riscv64/include/asm/restorer.h
@@ -0,0 +1,150 @@
+#ifndef __CR_ASM_RESTORER_H__
+#define __CR_ASM_RESTORER_H__
+
+#include
+
+#include "asm/types.h"
+#include "images/core.pb-c.h"
+
+#include
+
+// kernel arg order for clone
+// unsigned long clone_flags,
+// unsigned long newsp,
+// int __user * parent_tidptr,
+// unsigned long tls,
+// int __user * child_tidptr
+/* clang-format off */
+#define RUN_CLONE_RESTORE_FN(ret, clone_flags, new_sp, parent_tid,	\
+			     thread_args, clone_restore_fn)		\
+	asm volatile(							\
+			"clone_emul: \n"				\
+			"ld a1, %2 \n"					\
+			"andi a1, a1, ~15 \n"				\
+			"addi a1, a1, -16 \n"				\
+			"sd %5, 0(a1) \n"				\
+			"sd %6, 8(a1) \n"				\
+			"mv a0, %1 \n"					\
+			"mv a2, %3 \n"					\
+			"mv a3, %4 \n"					\
+			"li a7, "__stringify(__NR_clone)" \n"		\
+			"ecall \n"					\
+									\
+			"beqz a0, thread_run \n"			\
+									\
+			"mv %0, a0 \n"					\
+			"j clone_end \n"				\
+									\
+			"thread_run: \n"				\
+			"ld a1, 0(sp) \n"				\
+			"ld a0, 8(sp) \n"				\
+			"jr a1 \n"					\
+									\
+			"clone_end: \n"					\
+			: "=r"(ret)					\
+			: "r"(clone_flags),				\
+			  "m"(new_sp),					\
+			  "r"(&parent_tid),				\
+			  "r"(&thread_args[i].pid),			\
+			  "r"(clone_restore_fn),			\
+			  "r"(&thread_args[i])				\
+			: "a0", "a1", "a2", "a3", "a7", "memory")
+
+/*
+ * Based on sysdeps/unix/sysv/linux/riscv/clone.S
+ *
+ * int clone(int (*fn)(void *arg),            a0
+ *           void *child_stack,               a1
+ *           int flags,                       a2
+ *           void *arg,                       a3
+ *           pid_t *ptid,                     a4
+ *           struct user_desc *tls,           a5
+ *           pid_t *ctid);                    a6
+ *
+ * int clone3(struct clone_args *args,        a0
+ *            size_t size);                   a1
+ *
+ * Always consult the CLONE3 wrappers for other architectures
+ * for additional details.
+ *
+ */
+#define RUN_CLONE3_RESTORE_FN(ret, clone_args, size, args,		\
+			      clone_restore_fn)				\
+	asm volatile(							\
+	/* In contrast to the clone() wrapper above this does not put
+	 * the thread function and its arguments on the child stack,
+	 * but uses registers to pass these parameters to the child process.
+	 * Based on the glibc clone() wrapper at
+	 * sysdeps/unix/sysv/linux/riscv/clone.S.
+	 */								\
+			"clone3_emul: \n"				\
+	/*
+	 * Stash the child's parameters in temporaries before the syscall:
+	 * t0 holds the thread function and t1 the thread argument.
+	 * a0, a1 and a7 cannot be used: they are loaded for the syscall below.
+	 */								\
+			"mv t0, %3 /* clone_restore_fn */ \n"		\
+			"mv t1, %4 /* args */ \n"			\
+			"mv a0, %1 /* &clone_args */ \n"		\
+			"mv a1, %2 /* size */ \n"			\
+			/* Load syscall number */			\
+			"li a7, "__stringify(__NR_clone3)" \n"		\
+			/* Do the syscall */				\
+			"ecall \n"					\
+									\
+			"beqz a0, clone3_thread_run \n"			\
+									\
+			"mv %0, a0 \n"					\
+			"j clone3_end \n"				\
+									\
+			"clone3_thread_run: \n"				\
+			/* Move args to a0 */				\
+			"mv a0, t1 \n"					\
+			/* Jump to clone_restore_fn */			\
+			"jr t0 \n"					\
+									\
+			"clone3_end: \n"				\
+			: "=r"(ret)					\
+			: "r"(&clone_args),				\
+			  "r"(size),					\
+			  "r"(clone_restore_fn),			\
+			  "r"(args)					\
+			: "a0", "a1", "a7", "t0", "t1", "memory")
+
+#define ARCH_FAIL_CORE_RESTORE						\
+	asm volatile(							\
+			"mv sp, %0 \n"					\
+			"li a0, 0 \n"					\
+			"jr x0 \n"					\
+			:						\
+			: "r"(ret)					\
+			: "sp", "a0", "memory")
+/* clang-format on */
+
+#define arch_map_vdso(map, compat) -1
+
+int restore_gpregs(struct rt_sigframe *f, UserRiscv64RegsEntry *r);
+int restore_nonsigframe_gpregs(UserRiscv64RegsEntry *r);
+
+static inline void restore_tls(tls_t *ptls)
+{
+	asm("mv tp, %0" : : "r"(*ptls));
+}
+
+static inline void *alloc_compat_syscall_stack(void)
+{
+	return NULL;
+}
+static inline void free_compat_syscall_stack(void *stack32)
+{
+}
+static inline int arch_compat_rt_sigaction(void *stack, int sig, void *act)
+{
+	return -1;
+}
+static inline int set_compat_robust_list(uint32_t head_ptr, uint32_t len)
+{
+	return -1;
+}
+
+#endif
\ No newline at end of file
diff --git a/criu/arch/riscv64/include/asm/thread_pointer.h b/criu/arch/riscv64/include/asm/thread_pointer.h
new file mode 100644
index 0000000000..f7e07066a5
--- /dev/null
+++ b/criu/arch/riscv64/include/asm/thread_pointer.h
@@ -0,0 +1,27 @@
+/* __thread_pointer definition. Generic version.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + . */ + +#ifndef _SYS_THREAD_POINTER_H +#define _SYS_THREAD_POINTER_H + +static inline void *__criu_thread_pointer(void) +{ + return __builtin_thread_pointer(); +} + +#endif /* _SYS_THREAD_POINTER_H */ diff --git a/criu/arch/riscv64/include/asm/types.h b/criu/arch/riscv64/include/asm/types.h new file mode 100644 index 0000000000..83bb5f65ff --- /dev/null +++ b/criu/arch/riscv64/include/asm/types.h @@ -0,0 +1,40 @@ +#ifndef __CR_ASM_TYPES_H__ +#define __CR_ASM_TYPES_H__ + +#include +#include +#include +#include "images/core.pb-c.h" + +#include "page.h" +#include "bitops.h" +#include "asm/int.h" + +#include + +#define core_is_compat(core) false + +typedef UserRiscv64RegsEntry UserRegsEntry; + +#define CORE_ENTRY__MARCH CORE_ENTRY__MARCH__RISCV64 + +#define CORE_THREAD_ARCH_INFO(core) core->ti_riscv64 + +#define TI_SP(core) ((core)->ti_riscv64->gpregs->sp) + +#define TI_IP(core) ((core)->ti_riscv64->gpregs->pc) + +static inline void *decode_pointer(uint64_t v) +{ + return (void *)v; +} +static inline uint64_t encode_pointer(void *p) +{ + return (uint64_t)p; +} + +#define AT_VECTOR_SIZE 64 +typedef uint64_t auxv_t; +typedef uint64_t tls_t; + +#endif /* __CR_ASM_TYPES_H__ */ diff --git a/criu/arch/riscv64/include/asm/vdso.h b/criu/arch/riscv64/include/asm/vdso.h new file mode 100644 index 0000000000..322149c6ef --- /dev/null +++ 
b/criu/arch/riscv64/include/asm/vdso.h @@ -0,0 +1,28 @@ +#ifndef __CR_ASM_VDSO_H__ +#define __CR_ASM_VDSO_H__ + +#include "asm/int.h" +#include "common/compiler.h" +#include "asm-generic/vdso.h" + +/* + * This is a minimal amount of symbols + * we should support at the moment. + */ +#define VDSO_SYMBOL_MAX 6 +#define VDSO_SYMBOL_GTOD 2 + +#define ARCH_VDSO_SYMBOLS_LIST \ + const char *rv64_vdso_symbol1 = "__vdso_clock_getres"; \ + const char *rv64_vdso_symbol2 = "__vdso_clock_gettime"; \ + const char *rv64_vdso_symbol3 = "__vdso_gettimeofday"; \ + const char *rv64_vdso_symbol4 = "__vdso_getcpu"; \ + const char *rv64_vdso_symbol5 = "__vdso_flush_icache"; \ + const char *rv64_vdso_symbol6 = "__vdso_rt_sigreturn"; + +#define ARCH_VDSO_SYMBOLS \ + rv64_vdso_symbol1, rv64_vdso_symbol2, rv64_vdso_symbol3, rv64_vdso_symbol4, rv64_vdso_symbol5, rv64_vdso_symbol6 + +extern void write_intraprocedure_branch(unsigned long to, unsigned long from); + +#endif /* __CR_ASM_VDSO_H__ */ \ No newline at end of file diff --git a/criu/arch/riscv64/restorer.c b/criu/arch/riscv64/restorer.c new file mode 100644 index 0000000000..d605f048dd --- /dev/null +++ b/criu/arch/riscv64/restorer.c @@ -0,0 +1,14 @@ +#include + +#include "restorer.h" +#include "asm/restorer.h" + +#include +#include "log.h" +#include +#include "cpu.h" + +int restore_nonsigframe_gpregs(UserRiscv64RegsEntry *r) +{ + return 0; +} diff --git a/criu/arch/riscv64/sigframe.c b/criu/arch/riscv64/sigframe.c new file mode 100644 index 0000000000..8096fab663 --- /dev/null +++ b/criu/arch/riscv64/sigframe.c @@ -0,0 +1,8 @@ +#include "asm/types.h" +#include +#include "asm/sigframe.h" + +int sigreturn_prep_fpu_frame(struct rt_sigframe *sigframe, struct rt_sigframe *rsigframe) +{ + return 0; +} diff --git a/criu/arch/riscv64/vdso-lookup.S b/criu/arch/riscv64/vdso-lookup.S new file mode 100644 index 0000000000..50d4ecf088 --- /dev/null +++ b/criu/arch/riscv64/vdso-lookup.S @@ -0,0 +1,15 @@ +#include "common/asm/linkage.h" + +.section 
.text
+
+/* Expects t0 to hold the index into the lookup table. */
+GLOBAL(riscv_vdso_lookup)
+	/* Get the beginning of the lookup table */
+	la t1, riscv_vdso_lookup_end
+	/* Scale the index */
+	slli t0, t0, 3
+	add t1, t0, t1
+	ld t2, 0(t1)
+	jr t2
+
+GLOBAL(riscv_vdso_lookup_end)
\ No newline at end of file
diff --git a/criu/arch/riscv64/vdso-pie.c b/criu/arch/riscv64/vdso-pie.c
new file mode 100644
index 0000000000..aa9272fb56
--- /dev/null
+++ b/criu/arch/riscv64/vdso-pie.c
@@ -0,0 +1,159 @@
+#include
+
+#include "asm/types.h"
+
+#include
+#include
+#include
+#include
+#include "atomic.h"
+#include "parasite-vdso.h"
+#include "log.h"
+#include "common/bug.h"
+
+#ifdef LOG_PREFIX
+#undef LOG_PREFIX
+#endif
+#define LOG_PREFIX "vdso: "
+
+/* These symbols are defined in vdso-lookup.S */
+extern char *riscv_vdso_lookup, *riscv_vdso_lookup_end;
+
+/*
+ * li t0, INDEX
+ * jal x0, riscv_vdso_lookup
+ */
+#define TRAMP_CALL_SIZE (2 * sizeof(uint32_t))
+
+static inline void invalidate_caches(void)
+{
+	// We're supposed to use the VDSO as the officially sanctioned ABI. But oh well.
+	// a0-a2 carry the syscall arguments and a0 the result, so all are clobbered.
+	__smp_mb();
+	asm volatile("li a0, 0\n"
+		     "li a1, 0\n"
+		     "li a2, 1\n" /* SYS_RISCV_FLUSH_ICACHE_ALL */
+		     "li a7, 259\n" /* __NR_riscv_flush_icache */
+		     "ecall\n"
+		     :
+		     :
+		     : "a0", "a1", "a2", "a7", "memory");
+}
+
+static inline size_t vdso_trampoline_size(void)
+{
+	return (size_t)&riscv_vdso_lookup_end - (size_t)&riscv_vdso_lookup;
+}
+
+static uint64_t put_trampoline(uint64_t at, struct vdso_symtable *sym)
+{
+	int i, j;
+	uint64_t total_size, trampoline_size;
+	uint64_t trampoline = 0;
+
+	/* First of all we have to find a place where to put the trampoline
+	 * code.
+	 */
+	trampoline_size = vdso_trampoline_size();
+	total_size = trampoline_size + VDSO_SYMBOL_MAX * sizeof(uint64_t);
+
+	for (i = 0; i < ARRAY_SIZE(sym->symbols); i++) {
+		if (vdso_symbol_empty(&sym->symbols[i]))
+			continue;
+
+		pr_debug("Checking '%s' at %lx\n", sym->symbols[i].name, sym->symbols[i].offset);
+
+		/* find the nearest following symbol we are interested in */
+		for (j = 0; j < ARRAY_SIZE(sym->symbols); j++) {
+			if (i == j || vdso_symbol_empty(&sym->symbols[j]))
+				continue;
+
+			if (sym->symbols[j].offset <= sym->symbols[i].offset)
+				/* this symbol is above the current one */
+				continue;
+
+			if ((sym->symbols[i].offset + TRAMP_CALL_SIZE) > sym->symbols[j].offset) {
+				/* we have a major issue here since we cannot
+				 * even put the trampoline call for this symbol
+				 */
+				pr_err("Can't handle small vDSO symbol %s\n", sym->symbols[i].name);
+				return 0;
+			}
+
+			if (trampoline)
+				/* no need to put it twice */
+				continue;
+
+			if ((sym->symbols[j].offset - (sym->symbols[i].offset + TRAMP_CALL_SIZE)) <= total_size)
+				/* not enough place */
+				continue;
+
+			/* We can put the trampoline there */
+			trampoline = at + sym->symbols[i].offset;
+			trampoline += TRAMP_CALL_SIZE;
+
+			pr_debug("Putting vDSO trampoline in %s at %lx\n", sym->symbols[i].name, trampoline);
+			memcpy((void *)trampoline, &riscv_vdso_lookup, trampoline_size);
+			invalidate_caches();
+			return trampoline;
+		}
+	}
+
+	return 0;
+}
+
+static inline void put_trampoline_call(uint64_t from, uint64_t to, uint64_t trampoline, unsigned int idx)
+{
+	size_t trampoline_size = vdso_trampoline_size();
+	uint64_t *lookup_table = NULL;
+	/*
+	 * Instruction templates with zero immediates, patched below:
+	 *   addi t0, x0, INDEX   (li t0, INDEX); I-type imm[11:0] is bits 31:20
+	 *   jal  x0, riscv_vdso_lookup
+	 */
+	uint32_t trampoline_call[2] = {
+		0x00000293,
+		0x0000006f,
+	};
+	const size_t insts_len = ARRAY_SIZE(trampoline_call);
+	uint32_t *call_addr = (uint32_t *)from;
+	// Offset from the jal instruction to the lookup trampoline.
+	ssize_t trampoline_offset = trampoline - (from + sizeof(uint32_t));
+
+	trampoline_call[0] = trampoline_call[0] | (idx << 20);
+	trampoline_call[1] = trampoline_call[1] | riscv_j_imm(trampoline_offset);
+
+	for (unsigned int i = 0; i < insts_len; i++) {
+		call_addr[i] = trampoline_call[i];
+	}
+
+	// Set the lookup table pointer for this vdso symbol.
+	lookup_table = (uint64_t *)(trampoline + trampoline_size);
+	lookup_table[idx] = to;
+}
+
+int vdso_redirect_calls(uint64_t base_to, uint64_t base_from, struct vdso_symtable *to, struct vdso_symtable *from,
+			bool __always_unused compat_vdso)
+{
+	unsigned int i, valid_idx = 0;
+
+	uint64_t trampoline = (uint64_t)put_trampoline(base_from, from);
+	if (!trampoline)
+		return 1;
+
+	for (i = 0; i < ARRAY_SIZE(to->symbols); i++) {
+		if (vdso_symbol_empty(&from->symbols[i]))
+			continue;
+
+		pr_debug("br: %lx/%lx -> %lx/%lx (index %d) '%s'\n", base_from, from->symbols[i].offset, base_to,
+			 to->symbols[i].offset, i, from->symbols[i].name);
+
+		put_trampoline_call(base_from + from->symbols[i].offset, base_to + to->symbols[i].offset, trampoline,
+				    valid_idx);
+		valid_idx++;
+	}
+
+	invalidate_caches();
+
+	return 0;
+}
\ No newline at end of file
diff --git a/criu/pie/Makefile b/criu/pie/Makefile
index 912fab24ba..60c7f1e944 100644
--- a/criu/pie/Makefile
+++ b/criu/pie/Makefile
@@ -23,6 +23,10 @@ ifeq ($(ARCH),x86)
 	ccflags-y += -mshstk
 endif
 
+ifeq ($(ARCH),riscv64)
+	ccflags-y += -fno-stack-protector
+endif
+
 LDS := compel/arch/$(ARCH)/scripts/compel-pack.lds.S
 
 restorer-obj-y += parasite-vdso.o ./$(ARCH_DIR)/vdso-pie.o
@@ -43,6 +47,10 @@ ifeq ($(ARCH),ppc64)
 	restorer-obj-y += ./$(ARCH_DIR)/vdso-trampoline.o
 endif
 
+ifeq ($(ARCH),riscv64)
+	restorer-obj-y += ./$(ARCH_DIR)/vdso-lookup.o
+endif
+
 define gen-pie-rules
 $(1)-obj-y += $(1).o
 $(1)-obj-e += pie.lib.a
diff --git a/criu/pie/Makefile.library b/criu/pie/Makefile.library
index da2a2fab3e..d96a7ac32d 100644
--- a/criu/pie/Makefile.library
+++ b/criu/pie/Makefile.library
@@ -27,3 +27,7 @@ CFLAGS += $(CFLAGS_PIE) ifeq ($(ARCH),mips) CFLAGS += -fno-stack-protector -DCR_NOGLIBC -mno-abicalls -fno-pic endif + +ifeq ($(ARCH),riscv64) + ccflags-y += -fno-stack-protector +endif \ No newline at end of file From 663678222c35a170e0a7b5bb56105a08a628bea3 Mon Sep 17 00:00:00 2001 From: Haorong Lu Date: Tue, 1 Aug 2023 12:09:16 -0700 Subject: [PATCH 021/198] zdtm: add riscv64 support Signed-off-by: Haorong Lu --- .../lib/arch/riscv64/include/asm/atomic.h | 107 ++++++++++++++++++ test/zdtm/lib/test.c | 2 +- test/zdtm/static/fanotify00.c | 2 +- test/zdtm/static/netns-nf.desc | 2 +- test/zdtm/static/netns-nft-ipt.desc | 2 +- .../static/socket-tcp-closed-last-ack.desc | 4 +- test/zdtm/static/socket-tcp-reseted.desc | 6 +- test/zdtm/static/socket-tcp-syn-sent.desc | 4 +- 8 files changed, 118 insertions(+), 11 deletions(-) create mode 100644 test/zdtm/lib/arch/riscv64/include/asm/atomic.h diff --git a/test/zdtm/lib/arch/riscv64/include/asm/atomic.h b/test/zdtm/lib/arch/riscv64/include/asm/atomic.h new file mode 100644 index 0000000000..a4faf13221 --- /dev/null +++ b/test/zdtm/lib/arch/riscv64/include/asm/atomic.h @@ -0,0 +1,107 @@ +#ifndef __CR_ATOMIC_H__ +#define __CR_ATOMIC_H__ + +typedef uint32_t atomic_t; + +/* Copied from the Linux header arch/riscv/include/asm/barrier.h */ + +#define nop() __asm__ __volatile__("nop") + +#define RISCV_FENCE(p, s) __asm__ __volatile__("fence " #p "," #s : : : "memory") + +/* These barriers need to enforce ordering on both devices or memory. */ +#define mb() RISCV_FENCE(iorw, iorw) +#define rmb() RISCV_FENCE(ir, ir) +#define wmb() RISCV_FENCE(ow, ow) + +/* These barriers do not need to enforce ordering on devices, just memory. 
*/ +#define __smp_mb() RISCV_FENCE(rw, rw) +#define __smp_rmb() RISCV_FENCE(r, r) +#define __smp_wmb() RISCV_FENCE(w, w) + +#define __smp_store_release(p, v) \ + do { \ + compiletime_assert_atomic_type(*p); \ + RISCV_FENCE(rw, w); \ + WRITE_ONCE(*p, v); \ + } while (0) + +#define __smp_load_acquire(p) \ + ({ \ + typeof(*p) ___p1 = READ_ONCE(*p); \ + compiletime_assert_atomic_type(*p); \ + RISCV_FENCE(r, rw); \ + ___p1; \ + }) + +/* Copied from the Linux kernel header arch/riscv/include/asm/atomic.h */ + +static inline int atomic_read(const atomic_t *v) +{ + return (*(volatile int *)v); +} + +static inline void atomic_set(atomic_t *v, int i) +{ + *v = i; +} + +#define atomic_get atomic_read + +static inline int atomic_add_return(int i, atomic_t *v) +{ + int result; + + asm volatile("amoadd.w.aqrl %1, %2, %0" : "+A"(*v), "=r"(result) : "r"(i) : "memory"); + __smp_mb(); + return result + i; +} + +static inline int atomic_sub_return(int i, atomic_t *v) +{ + return atomic_add_return(-i, v); +} + +static inline int atomic_inc(atomic_t *v) +{ + return atomic_add_return(1, v) - 1; +} + +static inline int atomic_add(int val, atomic_t *v) +{ + return atomic_add_return(val, v) - val; +} + +static inline int atomic_dec(atomic_t *v) +{ + return atomic_sub_return(1, v) + 1; +} + +/* true if the result is 0, or false for all other cases. 
*/ +#define atomic_dec_and_test(v) (atomic_sub_return(1, v) == 0) +#define atomic_dec_return(v) (atomic_sub_return(1, v)) + +#define atomic_inc_return(v) (atomic_add_return(1, v)) + +static inline int atomic_cmpxchg(atomic_t *ptr, int old, int new) +{ + unsigned long tmp; + int oldval; + + __smp_mb(); + + asm volatile("1:\n" + " lr.w %1, %2\n" + " bne %1, %3, 2f\n" + " sc.w %0, %4, %2\n" + " bnez %0, 1b\n" + "2:" + : "=&r"(tmp), "=&r"(oldval), "+A"(*ptr) + : "r"(old), "r"(new) + : "memory"); + + __smp_mb(); + return oldval; +} + +#endif /* __CR_ATOMIC_H__ */ diff --git a/test/zdtm/lib/test.c b/test/zdtm/lib/test.c index a5ba38b2dd..95017e42ef 100644 --- a/test/zdtm/lib/test.c +++ b/test/zdtm/lib/test.c @@ -406,7 +406,7 @@ pid_t sys_clone_unified(unsigned long flags, void *child_stack, void *parent_tid { #ifdef __x86_64__ return (pid_t)syscall(__NR_clone, flags, child_stack, parent_tid, child_tid, newtls); -#elif (__i386__ || __arm__ || __aarch64__ || __powerpc64__ || __mips__ || __loongarch64) +#elif (__i386__ || __arm__ || __aarch64__ || __powerpc64__ || __mips__ || __loongarch64 || __riscv) return (pid_t)syscall(__NR_clone, flags, child_stack, parent_tid, newtls, child_tid); #elif __s390x__ return (pid_t)syscall(__NR_clone, child_stack, flags, parent_tid, child_tid, newtls); diff --git a/test/zdtm/static/fanotify00.c b/test/zdtm/static/fanotify00.c index 69ead43e7f..0400cc74bb 100644 --- a/test/zdtm/static/fanotify00.c +++ b/test/zdtm/static/fanotify00.c @@ -22,7 +22,7 @@ #elif defined(__PPC64__) #define __NR_fanotify_init 323 #define __NR_fanotify_mark 324 -#elif __aarch64__ +#elif (__aarch64__ || __riscv) #define __NR_fanotify_init 262 #define __NR_fanotify_mark 263 #elif __s390x__ diff --git a/test/zdtm/static/netns-nf.desc b/test/zdtm/static/netns-nf.desc index e7e73b1ae1..c99696d1cf 100644 --- a/test/zdtm/static/netns-nf.desc +++ b/test/zdtm/static/netns-nf.desc @@ -1,6 +1,6 @@ { 'deps': [ '/bin/sh', '/sbin/iptables|/usr/sbin/iptables', - 
'/lib/xtables/libxt_standard.so|/usr/lib64/xtables/libxt_standard.so|/usr/lib/iptables/libxt_standard.so|/usr/lib/powerpc64le-linux-gnu/xtables/libxt_standard.so|/usr/lib/x86_64-linux-gnu/xtables/libxt_standard.so|/usr/lib/s390x-linux-gnu/xtables/libxt_standard.so|/usr/lib/xtables/libxt_standard.so|/usr/lib/aarch64-linux-gnu/xtables/libxt_standard.so', + '/lib/xtables/libxt_standard.so|/usr/lib64/xtables/libxt_standard.so|/usr/lib/iptables/libxt_standard.so|/usr/lib/powerpc64le-linux-gnu/xtables/libxt_standard.so|/usr/lib/x86_64-linux-gnu/xtables/libxt_standard.so|/usr/lib/s390x-linux-gnu/xtables/libxt_standard.so|/usr/lib/xtables/libxt_standard.so|/usr/lib/aarch64-linux-gnu/xtables/libxt_standard.so|/usr/lib/riscv64-linux-gnu/xtables/libxt_standard.so', '/usr/bin/diff'], 'flags': 'suid', 'flavor': 'ns uns'} diff --git a/test/zdtm/static/netns-nft-ipt.desc b/test/zdtm/static/netns-nft-ipt.desc index 4120f74d61..6d04589b31 100644 --- a/test/zdtm/static/netns-nft-ipt.desc +++ b/test/zdtm/static/netns-nft-ipt.desc @@ -2,7 +2,7 @@ 'deps': [ '/bin/sh', '/usr/sbin/nft', '/sbin/iptables|/usr/sbin/iptables', - '/lib/xtables/libxt_standard.so|/usr/lib64/xtables/libxt_standard.so|/usr/lib/iptables/libxt_standard.so|/usr/lib/powerpc64le-linux-gnu/xtables/libxt_standard.so|/usr/lib/x86_64-linux-gnu/xtables/libxt_standard.so|/usr/lib/s390x-linux-gnu/xtables/libxt_standard.so|/usr/lib/xtables/libxt_standard.so|/usr/lib/aarch64-linux-gnu/xtables/libxt_standard.so', + '/lib/xtables/libxt_standard.so|/usr/lib64/xtables/libxt_standard.so|/usr/lib/iptables/libxt_standard.so|/usr/lib/powerpc64le-linux-gnu/xtables/libxt_standard.so|/usr/lib/x86_64-linux-gnu/xtables/libxt_standard.so|/usr/lib/s390x-linux-gnu/xtables/libxt_standard.so|/usr/lib/xtables/libxt_standard.so|/usr/lib/aarch64-linux-gnu/xtables/libxt_standard.so|/usr/lib/riscv64-linux-gnu/xtables/libxt_standard.so', '/usr/bin/diff'], 'flags': 'suid', 'flavor': 'ns uns'} diff --git 
a/test/zdtm/static/socket-tcp-closed-last-ack.desc b/test/zdtm/static/socket-tcp-closed-last-ack.desc index d4cfe50643..309854fa53 100644 --- a/test/zdtm/static/socket-tcp-closed-last-ack.desc +++ b/test/zdtm/static/socket-tcp-closed-last-ack.desc @@ -1,7 +1,7 @@ { 'deps': [ '/bin/sh', '/sbin/iptables|/usr/sbin/iptables', - '/lib/xtables/libxt_tcp.so|/usr/lib64/xtables/libxt_tcp.so|/usr/lib/powerpc64le-linux-gnu/xtables/libxt_tcp.so|/usr/lib/x86_64-linux-gnu/xtables/libxt_tcp.so|/usr/lib/s390x-linux-gnu/xtables/libxt_tcp.so|/usr/lib/xtables/libxt_tcp.so|/usr/lib/aarch64-linux-gnu/xtables/libxt_tcp.so', - '/lib/xtables/libxt_standard.so|/usr/lib64/xtables/libxt_standard.so|/usr/lib/powerpc64le-linux-gnu/xtables/libxt_standard.so|/usr/lib/x86_64-linux-gnu/xtables/libxt_standard.so|/usr/lib/s390x-linux-gnu/xtables/libxt_standard.so|/usr/lib/xtables/libxt_standard.so|/usr/lib/aarch64-linux-gnu/xtables/libxt_standard.so', + '/lib/xtables/libxt_tcp.so|/usr/lib64/xtables/libxt_tcp.so|/usr/lib/powerpc64le-linux-gnu/xtables/libxt_tcp.so|/usr/lib/x86_64-linux-gnu/xtables/libxt_tcp.so|/usr/lib/s390x-linux-gnu/xtables/libxt_tcp.so|/usr/lib/xtables/libxt_tcp.so|/usr/lib/aarch64-linux-gnu/xtables/libxt_tcp.so|/usr/lib/riscv64-linux-gnu/xtables/libxt_tcp.so', + '/lib/xtables/libxt_standard.so|/usr/lib64/xtables/libxt_standard.so|/usr/lib/powerpc64le-linux-gnu/xtables/libxt_standard.so|/usr/lib/x86_64-linux-gnu/xtables/libxt_standard.so|/usr/lib/s390x-linux-gnu/xtables/libxt_standard.so|/usr/lib/xtables/libxt_standard.so|/usr/lib/aarch64-linux-gnu/xtables/libxt_standard.so|/usr/lib/riscv64-linux-gnu/xtables/libxt_standard.so', ], 'opts': '--tcp-established', 'flags': 'suid nouser samens', diff --git a/test/zdtm/static/socket-tcp-reseted.desc b/test/zdtm/static/socket-tcp-reseted.desc index 3ebdfeef88..4aa48ad874 100644 --- a/test/zdtm/static/socket-tcp-reseted.desc +++ b/test/zdtm/static/socket-tcp-reseted.desc @@ -1,8 +1,8 @@ { 'deps': [ '/bin/sh', 
'/sbin/iptables|/usr/sbin/iptables', - '/lib/xtables/libxt_tcp.so|/usr/lib64/xtables/libxt_tcp.so|/usr/lib/powerpc64le-linux-gnu/xtables/libxt_tcp.so|/usr/lib/x86_64-linux-gnu/xtables/libxt_tcp.so|/usr/lib/xtables/libxt_tcp.so|/usr/lib/s390x-linux-gnu/xtables/libxt_tcp.so|/usr/lib/aarch64-linux-gnu/xtables/libxt_tcp.so', - '/lib/xtables/libxt_standard.so|/usr/lib64/xtables/libxt_standard.so|/usr/lib/powerpc64le-linux-gnu/xtables/libxt_standard.so|/usr/lib/x86_64-linux-gnu/xtables/libxt_standard.so|/usr/lib/xtables/libxt_standard.so|/usr/lib/s390x-linux-gnu/xtables/libxt_standard.so|/usr/lib/aarch64-linux-gnu/xtables/libxt_standard.so', - '/lib/xtables/libipt_REJECT.so|/usr/lib64/xtables/libipt_REJECT.so|/usr/lib/powerpc64le-linux-gnu/xtables/libipt_REJECT.so|/usr/lib/x86_64-linux-gnu/xtables/libipt_REJECT.so|/usr/lib/xtables/libipt_REJECT.so|/usr/lib/s390x-linux-gnu/xtables/libipt_REJECT.so|/usr/lib/aarch64-linux-gnu/xtables/libipt_REJECT.so', + '/lib/xtables/libxt_tcp.so|/usr/lib64/xtables/libxt_tcp.so|/usr/lib/powerpc64le-linux-gnu/xtables/libxt_tcp.so|/usr/lib/x86_64-linux-gnu/xtables/libxt_tcp.so|/usr/lib/xtables/libxt_tcp.so|/usr/lib/s390x-linux-gnu/xtables/libxt_tcp.so|/usr/lib/aarch64-linux-gnu/xtables/libxt_tcp.so|/usr/lib/riscv64-linux-gnu/xtables/libxt_tcp.so', + '/lib/xtables/libxt_standard.so|/usr/lib64/xtables/libxt_standard.so|/usr/lib/powerpc64le-linux-gnu/xtables/libxt_standard.so|/usr/lib/x86_64-linux-gnu/xtables/libxt_standard.so|/usr/lib/xtables/libxt_standard.so|/usr/lib/s390x-linux-gnu/xtables/libxt_standard.so|/usr/lib/aarch64-linux-gnu/xtables/libxt_standard.so|/usr/lib/riscv64-linux-gnu/xtables/libxt_standard.so', + 
'/lib/xtables/libipt_REJECT.so|/usr/lib64/xtables/libipt_REJECT.so|/usr/lib/powerpc64le-linux-gnu/xtables/libipt_REJECT.so|/usr/lib/x86_64-linux-gnu/xtables/libipt_REJECT.so|/usr/lib/xtables/libipt_REJECT.so|/usr/lib/s390x-linux-gnu/xtables/libipt_REJECT.so|/usr/lib/aarch64-linux-gnu/xtables/libipt_REJECT.so|/usr/lib/riscv64-linux-gnu/xtables/libipt_REJECT.so', ], 'opts': '--tcp-established', 'flags': 'suid nouser samens', diff --git a/test/zdtm/static/socket-tcp-syn-sent.desc b/test/zdtm/static/socket-tcp-syn-sent.desc index 4cc23c8fc7..71cd26d727 100644 --- a/test/zdtm/static/socket-tcp-syn-sent.desc +++ b/test/zdtm/static/socket-tcp-syn-sent.desc @@ -1,7 +1,7 @@ { 'deps': [ '/bin/sh', '/sbin/iptables|/usr/sbin/iptables', - '/lib/xtables/libxt_tcp.so|/usr/lib64/xtables/libxt_tcp.so|/usr/lib/powerpc64le-linux-gnu/xtables/libxt_tcp.so|/usr/lib/x86_64-linux-gnu/xtables/libxt_tcp.so|/usr/lib/xtables/libxt_tcp.so|/usr/lib/s390x-linux-gnu/xtables/libxt_tcp.so|/usr/lib/aarch64-linux-gnu/xtables/libxt_tcp.so', - '/lib/xtables/libxt_standard.so|/usr/lib64/xtables/libxt_standard.so|/usr/lib/powerpc64le-linux-gnu/xtables/libxt_standard.so|/usr/lib/x86_64-linux-gnu/xtables/libxt_standard.so|/usr/lib/xtables/libxt_standard.so|/usr/lib/s390x-linux-gnu/xtables/libxt_standard.so|/usr/lib/aarch64-linux-gnu/xtables/libxt_standard.so', + '/lib/xtables/libxt_tcp.so|/usr/lib64/xtables/libxt_tcp.so|/usr/lib/powerpc64le-linux-gnu/xtables/libxt_tcp.so|/usr/lib/x86_64-linux-gnu/xtables/libxt_tcp.so|/usr/lib/xtables/libxt_tcp.so|/usr/lib/s390x-linux-gnu/xtables/libxt_tcp.so|/usr/lib/aarch64-linux-gnu/xtables/libxt_tcp.so|/usr/lib/riscv64-linux-gnu/xtables/libxt_tcp.so', + 
'/lib/xtables/libxt_standard.so|/usr/lib64/xtables/libxt_standard.so|/usr/lib/powerpc64le-linux-gnu/xtables/libxt_standard.so|/usr/lib/x86_64-linux-gnu/xtables/libxt_standard.so|/usr/lib/xtables/libxt_standard.so|/usr/lib/s390x-linux-gnu/xtables/libxt_standard.so|/usr/lib/aarch64-linux-gnu/xtables/libxt_standard.so|/usr/lib/riscv64-linux-gnu/xtables/libxt_standard.so', ], 'opts': '--tcp-established', 'flags': 'suid nouser samens', From 986376929e2eb316c411db1ab38fd626b6decf48 Mon Sep 17 00:00:00 2001 From: Haorong Lu Date: Tue, 1 Aug 2023 12:10:46 -0700 Subject: [PATCH 022/198] ci: add workflow for riscv64 Signed-off-by: Haorong Lu --- .github/workflows/cross-compile-daily.yml | 2 +- .github/workflows/cross-compile.yml | 1 + .../build/Dockerfile.riscv64-stable-cross.hdr | 5 ++ .../Dockerfile.riscv64-stable-cross.tmpl | 57 +++++++++++++++++++ scripts/build/Makefile | 2 +- scripts/ci/riscv64-cross/amd64-sources.list | 10 ++++ scripts/ci/riscv64-cross/riscv64-sources.list | 42 ++++++++++++++ 7 files changed, 117 insertions(+), 2 deletions(-) create mode 100644 scripts/build/Dockerfile.riscv64-stable-cross.hdr create mode 100644 scripts/build/Dockerfile.riscv64-stable-cross.tmpl create mode 100644 scripts/ci/riscv64-cross/amd64-sources.list create mode 100644 scripts/ci/riscv64-cross/riscv64-sources.list diff --git a/.github/workflows/cross-compile-daily.yml b/.github/workflows/cross-compile-daily.yml index b8c8c86d48..c709cca00a 100644 --- a/.github/workflows/cross-compile-daily.yml +++ b/.github/workflows/cross-compile-daily.yml @@ -10,7 +10,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - target: [armv7-stable-cross, aarch64-stable-cross, ppc64-stable-cross, mips64el-stable-cross] + target: [armv7-stable-cross, aarch64-stable-cross, ppc64-stable-cross, mips64el-stable-cross, riscv64-stable-cross] branches: [criu-dev, master] steps: diff --git a/.github/workflows/cross-compile.yml b/.github/workflows/cross-compile.yml index 06b8128231..96672b2946 100644 --- 
a/.github/workflows/cross-compile.yml +++ b/.github/workflows/cross-compile.yml @@ -21,6 +21,7 @@ jobs: aarch64-stable-cross, ppc64-stable-cross, mips64el-stable-cross, + riscv64-stable-cross, ] include: - experimental: true diff --git a/scripts/build/Dockerfile.riscv64-stable-cross.hdr b/scripts/build/Dockerfile.riscv64-stable-cross.hdr new file mode 100644 index 0000000000..d4c4140233 --- /dev/null +++ b/scripts/build/Dockerfile.riscv64-stable-cross.hdr @@ -0,0 +1,5 @@ +FROM ubuntu:jammy + +ENV ARCH=riscv64 +ENV DEBIAN_ARCH=riscv64 +ENV CROSS_TRIPLET=riscv64-linux-gnu diff --git a/scripts/build/Dockerfile.riscv64-stable-cross.tmpl b/scripts/build/Dockerfile.riscv64-stable-cross.tmpl new file mode 100644 index 0000000000..39a0c33c6c --- /dev/null +++ b/scripts/build/Dockerfile.riscv64-stable-cross.tmpl @@ -0,0 +1,57 @@ +COPY scripts/ci/apt-install /bin/apt-install + +# Add the cross compiler sources +RUN apt-get clean -y && apt-get update -y && apt-get install -y --no-install-recommends gnupg2 + +RUN apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 871920D1991BC93C 8D69674688B6CB36 B523E5F3FC4E5F2C + +COPY scripts/ci/riscv64-cross/amd64-sources.list /etc/apt/sources.list + +COPY scripts/ci/riscv64-cross/riscv64-sources.list /etc/apt/sources.list.d/ + +RUN dpkg --add-architecture ${DEBIAN_ARCH} && \ + apt-get update -y + +# Install required packages +RUN apt-get install -y --no-install-recommends \ + build-essential \ + pkg-config \ + git \ + crossbuild-essential-${DEBIAN_ARCH} \ + libc6-dev-${DEBIAN_ARCH}-cross \ + libc6-${DEBIAN_ARCH}-cross \ + libbz2-dev:${DEBIAN_ARCH} \ + libexpat1-dev:${DEBIAN_ARCH} \ + ncurses-dev:${DEBIAN_ARCH} \ + libssl-dev:${DEBIAN_ARCH} \ + protobuf-c-compiler \ + protobuf-compiler \ + python3-protobuf \ + libnl-3-dev:${DEBIAN_ARCH} \ + libprotobuf-dev:${DEBIAN_ARCH} \ + libnet-dev:${DEBIAN_ARCH} \ + libprotobuf-c-dev:${DEBIAN_ARCH} \ + libcap-dev:${DEBIAN_ARCH} \ + libaio-dev:${DEBIAN_ARCH} \ + libnl-route-3-dev:${DEBIAN_ARCH} \ 
+ libnftables-dev:${DEBIAN_ARCH} \ + libgnutls28-dev:${DEBIAN_ARCH} \ + iproute2:${DEBIAN_ARCH} + +ENV CROSS_COMPILE=${CROSS_TRIPLET}- \ + CROSS_ROOT=/usr/${CROSS_TRIPLET} \ + AS=/usr/bin/${CROSS_TRIPLET}-as \ + AR=/usr/bin/${CROSS_TRIPLET}-ar \ + CC=/usr/bin/${CROSS_TRIPLET}-gcc \ + CPP=/usr/bin/${CROSS_TRIPLET}-cpp \ + CXX=/usr/bin/${CROSS_TRIPLET}-g++ \ + LD=/usr/bin/${CROSS_TRIPLET}-ld \ + FC=/usr/bin/${CROSS_TRIPLET}-gfortran + +ENV PATH="${PATH}:${CROSS_ROOT}/bin" \ + PKG_CONFIG_PATH=/usr/lib/${CROSS_TRIPLET}/pkgconfig + +COPY . /criu +WORKDIR /criu + +RUN make mrproper && date && make -j $(nproc) zdtm && date diff --git a/scripts/build/Makefile b/scripts/build/Makefile index bc4a59db1c..3893152270 100644 --- a/scripts/build/Makefile +++ b/scripts/build/Makefile @@ -1,5 +1,5 @@ ARCHES := x86_64 fedora-asan fedora-rawhide armv7hf centos8 -STABLE_CROSS_ARCHES := armv7-stable-cross aarch64-stable-cross ppc64-stable-cross mips64el-stable-cross +STABLE_CROSS_ARCHES := armv7-stable-cross aarch64-stable-cross ppc64-stable-cross mips64el-stable-cross riscv64-stable-cross UNSTABLE_CROSS_ARCHES := armv7-unstable-cross aarch64-unstable-cross ppc64-unstable-cross mips64el-unstable-cross NON_CLANG := $(UNSTABLE_CROSS_ARCHES) $(STABLE_CROSS_ARCHES) CREATE_DOCKERFILES := $(ARCHES) $(NON_CLANG) diff --git a/scripts/ci/riscv64-cross/amd64-sources.list b/scripts/ci/riscv64-cross/amd64-sources.list new file mode 100644 index 0000000000..72dad920c2 --- /dev/null +++ b/scripts/ci/riscv64-cross/amd64-sources.list @@ -0,0 +1,10 @@ +deb [arch=amd64] http://archive.ubuntu.com/ubuntu/ jammy main restricted +deb [arch=amd64] http://archive.ubuntu.com/ubuntu/ jammy-updates main restricted +deb [arch=amd64] http://archive.ubuntu.com/ubuntu/ jammy universe +deb [arch=amd64] http://archive.ubuntu.com/ubuntu/ jammy-updates universe +deb [arch=amd64] http://archive.ubuntu.com/ubuntu/ jammy multiverse +deb [arch=amd64] http://archive.ubuntu.com/ubuntu/ jammy-updates multiverse +deb 
[arch=amd64] http://archive.ubuntu.com/ubuntu/ jammy-backports main restricted universe multiverse +deb [arch=amd64] http://security.ubuntu.com/ubuntu/ jammy-security main restricted +deb [arch=amd64] http://security.ubuntu.com/ubuntu/ jammy-security universe +deb [arch=amd64] http://security.ubuntu.com/ubuntu/ jammy-security multiverse \ No newline at end of file diff --git a/scripts/ci/riscv64-cross/riscv64-sources.list b/scripts/ci/riscv64-cross/riscv64-sources.list new file mode 100644 index 0000000000..67b8067b6b --- /dev/null +++ b/scripts/ci/riscv64-cross/riscv64-sources.list @@ -0,0 +1,42 @@ +# See http://help.ubuntu.com/community/UpgradeNotes for how to upgrade to +# newer versions of the distribution. +deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ jammy main restricted +# deb-src http://ports.ubuntu.com/ubuntu-ports/ jammy main restricted + +## Major bug fix updates produced after the final release of the +## distribution. +deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ jammy-updates main restricted +# deb-src http://ports.ubuntu.com/ubuntu-ports/ jammy-updates main restricted + +## N.B. software from this repository is ENTIRELY UNSUPPORTED by the Ubuntu +## team. Also, please note that software in universe WILL NOT receive any +## review or updates from the Ubuntu security team. +deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ jammy universe +# deb-src http://ports.ubuntu.com/ubuntu-ports/ jammy universe +deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ jammy-updates universe +# deb-src http://ports.ubuntu.com/ubuntu-ports/ jammy-updates universe + +## N.B. software from this repository is ENTIRELY UNSUPPORTED by the Ubuntu +## team, and may not be under a free licence. Please satisfy yourself as to +## your rights to use the software. Also, please note that software in +## multiverse WILL NOT receive any review or updates from the Ubuntu +## security team. 
+deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ jammy multiverse +# deb-src http://ports.ubuntu.com/ubuntu-ports/ jammy multiverse +deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ jammy-updates multiverse +# deb-src http://ports.ubuntu.com/ubuntu-ports/ jammy-updates multiverse + +## N.B. software from this repository may not have been tested as +## extensively as that contained in the main release, although it includes +## newer versions of some applications which may provide useful features. +## Also, please note that software in backports WILL NOT receive any review +## or updates from the Ubuntu security team. +deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ jammy-backports main restricted universe multiverse +# deb-src http://ports.ubuntu.com/ubuntu-ports/ jammy-backports main restricted universe multiverse + +deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ jammy-security main restricted +# deb-src http://ports.ubuntu.com/ubuntu-ports/ jammy-security main restricted +deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ jammy-security universe +# deb-src http://ports.ubuntu.com/ubuntu-ports/ jammy-security universe +deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ jammy-security multiverse +# deb-src http://ports.ubuntu.com/ubuntu-ports/ jammy-security multiverse \ No newline at end of file From f6baf8143b8b6490c7fca5d7a9cf948b5f5ed02c Mon Sep 17 00:00:00 2001 From: Cryolitia PukNgae Date: Mon, 14 Oct 2024 01:35:44 +0800 Subject: [PATCH 023/198] include: don't use GCC's __builtin_ffs on riscv64 Link: https://github.com/SerenityOS/serenity/commit/e300da4db42e2484d98f4982d03150d83436304e Signed-off-by: PukNgae Cryolitia --- - cherry-picked Signed-off-by: Alexander Mikhalitsyn --- include/common/arch/riscv64/asm/bitops.h | 111 ++++++++++++++++++++++- 1 file changed, 110 insertions(+), 1 deletion(-) diff --git a/include/common/arch/riscv64/asm/bitops.h b/include/common/arch/riscv64/asm/bitops.h index 
400cc3e155..eabab27c71 100644 --- a/include/common/arch/riscv64/asm/bitops.h +++ b/include/common/arch/riscv64/asm/bitops.h @@ -2,7 +2,116 @@ #define __CR_ASM_BITOPS_H__ #include "common/compiler.h" -#include "common/asm-generic/bitops.h" +#include "common/asm/bitsperlong.h" + +#define DIV_ROUND_UP(n, d) (((n) + (d)-1) / (d)) +#define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, BITS_PER_LONG) + +#define DECLARE_BITMAP(name, bits) unsigned long name[BITS_TO_LONGS(bits)] +#define BITMAP_SIZE(name) (sizeof(name) * CHAR_BIT) + +#if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 1) +/* Technically wrong, but this avoids compilation errors on some gcc + versions. */ +#define BITOP_ADDR(x) "=m"(*(volatile long *)(x)) +#else +#define BITOP_ADDR(x) "+m"(*(volatile long *)(x)) +#endif + +#define ADDR BITOP_ADDR(addr) + +static inline void set_bit(int nr, volatile unsigned long *addr) +{ + addr += nr / BITS_PER_LONG; + *addr |= (1UL << (nr % BITS_PER_LONG)); +} + +static inline void change_bit(int nr, volatile unsigned long *addr) +{ + addr += nr / BITS_PER_LONG; + *addr ^= (1UL << (nr % BITS_PER_LONG)); +} + +static inline int test_bit(int nr, volatile const unsigned long *addr) +{ + addr += nr / BITS_PER_LONG; + return (*addr & (1UL << (nr % BITS_PER_LONG))) ? -1 : 0; +} + +static inline void clear_bit(int nr, volatile unsigned long *addr) +{ + addr += nr / BITS_PER_LONG; + *addr &= ~(1UL << (nr % BITS_PER_LONG)); +} + +/** + * __ffs - find first set bit in word + * @word: The word to search + * + * Undefined if no bit exists, so code should check against 0 first. + */ +static inline unsigned long __ffs(unsigned long word) +{ + int p = 0; + + for (; p < 8*sizeof(word); ++p) { + if (word & 1) { + break; + } + + word >>= 1; + } + + return p; +} + +#define BITOP_WORD(nr) ((nr) / BITS_PER_LONG) + +/* + * Find the next set bit in a memory region. 
+ */ +static inline unsigned long find_next_bit(const unsigned long *addr, unsigned long size, unsigned long offset) +{ + const unsigned long *p = addr + BITOP_WORD(offset); + unsigned long result = offset & ~(BITS_PER_LONG - 1); + unsigned long tmp; + + if (offset >= size) + return size; + size -= result; + offset %= BITS_PER_LONG; + if (offset) { + tmp = *(p++); + tmp &= (~0UL << offset); + if (size < BITS_PER_LONG) + goto found_first; + if (tmp) + goto found_middle; + size -= BITS_PER_LONG; + result += BITS_PER_LONG; + } + while (size & ~(BITS_PER_LONG - 1)) { + if ((tmp = *(p++))) + goto found_middle; + result += BITS_PER_LONG; + size -= BITS_PER_LONG; + } + if (!size) + return result; + tmp = *p; + +found_first: + tmp &= (~0UL >> (BITS_PER_LONG - size)); + if (tmp == 0UL) /* Are any bits set? */ + return result + size; /* Nope. */ +found_middle: + return result + __ffs(tmp); +} + +#define for_each_bit(i, bitmask) \ + for (i = find_next_bit(bitmask, BITMAP_SIZE(bitmask), 0); i < BITMAP_SIZE(bitmask); \ + i = find_next_bit(bitmask, BITMAP_SIZE(bitmask), i + 1)) + #define BITS_PER_LONG 64 From f5dec056ad3960ff2494697fb84cf263b2dcf644 Mon Sep 17 00:00:00 2001 From: Liu Hua Date: Sat, 12 Oct 2024 15:29:40 +0800 Subject: [PATCH 024/198] uffd: Disable image deduplication after fork After a fork, both the child and parent processes may trigger a page fault (#PF) at the same virtual address, referencing the same position in the page image. If deduplication is enabled, the last process to trigger the page fault will fail. Therefore, deduplication should be disabled after a fork to prevent this issue. 
Signed-off-by: Liu Hua --- criu/include/pagemap.h | 5 +++++ criu/pagemap.c | 11 ++++++++++- criu/uffd.c | 2 ++ 3 files changed, 17 insertions(+), 1 deletion(-) diff --git a/criu/include/pagemap.h b/criu/include/pagemap.h index 8c71805598..3ae15deb9c 100644 --- a/criu/include/pagemap.h +++ b/criu/include/pagemap.h @@ -58,6 +58,9 @@ struct page_read { /* Whether or not pages can be read in PIE code */ bool pieok; + /* Whether or not disable image deduplication*/ + bool disable_dedup; + /* Private data of reader */ struct cr_img *pmi; struct cr_img *pi; @@ -112,6 +115,8 @@ int pagemap_render_iovec(struct list_head *from, struct task_restore_args *ta); */ extern void dup_page_read(struct page_read *src, struct page_read *dst); +extern void page_read_disable_dedup(struct page_read *pr); + extern int dedup_one_iovec(struct page_read *pr, unsigned long base, unsigned long len); static inline unsigned long pagemap_len(PagemapEntry *pe) diff --git a/criu/pagemap.c b/criu/pagemap.c index 83f69bba37..85bb922596 100644 --- a/criu/pagemap.c +++ b/criu/pagemap.c @@ -261,7 +261,7 @@ static int read_local_page(struct page_read *pr, unsigned long vaddr, unsigned l break; } - if (opts.auto_dedup) { + if (opts.auto_dedup && !pr->disable_dedup) { ret = punch_hole(pr, pr->pi_off, len, false); if (ret == -1) return -1; @@ -792,6 +792,7 @@ int open_page_read_at(int dfd, unsigned long img_id, struct page_read *pr, int p pr->bunch.iov_base = NULL; pr->pmes = NULL; pr->pieok = false; + pr->disable_dedup = false; pr->pmi = open_image_at(dfd, i_typ, O_RSTR, img_id); if (!pr->pmi) @@ -852,6 +853,14 @@ int open_page_read(unsigned long img_id, struct page_read *pr, int pr_flags) #define DUP_IDS_BASE 1000 +void page_read_disable_dedup(struct page_read *pr) +{ + pr_debug("disable dedup, id: %d\n", pr->id); + pr->disable_dedup = true; + if (pr->parent) + page_read_disable_dedup(pr->parent); +} + void dup_page_read(struct page_read *src, struct page_read *dst) { static int dup_ids = 1; diff --git 
a/criu/uffd.c b/criu/uffd.c index e07b21b69c..98c2b7e075 100644 --- a/criu/uffd.c +++ b/criu/uffd.c @@ -1098,6 +1098,8 @@ static int handle_fork(struct lazy_pages_info *parent_lpi, struct uffd_msg *msg) lpi_get(lpi->parent); + page_read_disable_dedup(&parent_lpi->pr); + page_read_disable_dedup(&lpi->pr); return 1; out: From dcc3b496193d3c64778c726f460ad99858e8cbe8 Mon Sep 17 00:00:00 2001 From: Lorenzo Fontana Date: Fri, 18 Oct 2024 18:51:18 +0200 Subject: [PATCH 025/198] criu: Initialize util before service worker starts When restoring dumps in new mount + pid namespaces where multiple dumps share the same network namespace, CRIU may fail due to conflicting unix socket names. This happens because the service worker creates sockets using a pattern that includes criu_run_id, but util_init() is called after cr_service_work() starts. The socket naming pattern "crtools-fd-%d-%d" uses the restore PID and criu_run_id, however criu_run_id is always 0 when not initialized, leading to conflicts when multiple restores run simultaneously either in the same CRIU process or because of multiple CRIU processes doing the same operation in different PID namespaces. Fix this by: - Moving util_init() before cr_service_work() starts - Adding a second util_init() call in the service worker fork to ensure unique IDs across multiple worker runs - Making sure that dump and restore operations have util_init() called early to generate unique socket names With this fix, socket names always include the namespace ID, preventing conflicts when multiple processes with the same pid share a network namespace. 
Fixes #2499 [ avagin: minor code changes ] Signed-off-by: Lorenzo Fontana Signed-off-by: Andrei Vagin --- criu/cr-service.c | 8 ++++++++ criu/crtools.c | 10 +++++++--- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/criu/cr-service.c b/criu/cr-service.c index 61a04c5ffe..b9d11ced22 100644 --- a/criu/cr-service.c +++ b/criu/cr-service.c @@ -1310,6 +1310,14 @@ int cr_service_work(int sk) int ret = -1; CriuReq *msg = 0; + /* + * util_init initializes criu_run_id and compel_run_id so that sockets + * are generated with an unique name identifying the specific process + * even in cases where multiple processes with the same pid in + * different pid namespaces are sharing the same network namespace. + */ + util_init(); + more: opts.mode = CR_SWRK; diff --git a/criu/crtools.c b/criu/crtools.c index 94657f4186..6f493850b9 100644 --- a/criu/crtools.c +++ b/criu/crtools.c @@ -169,7 +169,13 @@ int main(int argc, char *argv[], char *envp[]) pr_err("unknown command: %s\n", argv[optind]); goto usage; } - + /* + * util_init initializes criu_run_id and compel_run_id so that sockets + * are generated with an unique name identifying the specific process + * even in cases where multiple processes with the same pid in + * different pid namespaces are sharing the same network namespace. + */ + util_init(); if (opts.mode == CR_SWRK) { if (argc != optind + 2) { fprintf(stderr, "Usage: criu swrk \n"); @@ -254,8 +260,6 @@ int main(int argc, char *argv[], char *envp[]) return 1; } - util_init(); - if (log_init(opts.output)) return 1; From 216d804aabf08e23824bc2adbe68e471da28b1f7 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Mon, 4 Nov 2024 19:30:26 +0000 Subject: [PATCH 026/198] seize: fix error handling for check_freezer_cgroup When `check_freezer_cgroup()` has non-zero return value, `goto err` calls `return ret`. However, the value of `ret` has been set to `0` in the lines above and CRIU does not handle the error properly. 
This problem is related to https://github.com/checkpoint-restore/criu/issues/2508 Signed-off-by: Radostin Stoyanov --- criu/seize.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/criu/seize.c b/criu/seize.c index edeb57cc8a..ab394f9ca5 100644 --- a/criu/seize.c +++ b/criu/seize.c @@ -1009,7 +1009,7 @@ static int cgroup_version(void) int collect_pstree(void) { pid_t pid = root_item->pid->real; - int ret = -1; + int ret, exit_code = -1; struct proc_status_creds creds; struct pstree_item *iter; @@ -1069,7 +1069,6 @@ int collect_pstree(void) if (opts.freeze_cgroup && !freeze_cgroup_disabled && freezer_wait_processes()) { - ret = -1; goto err; } @@ -1081,12 +1080,12 @@ int collect_pstree(void) goto err; } - ret = 0; + exit_code = 0; timing_stop(TIME_FREEZING); timing_start(TIME_FROZEN); err: /* Freezing stage finished in time - disable timer. */ alarm(0); - return ret; + return exit_code; } From f8f0e1df76100ab039d39d35017c30c0550a99fd Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Mon, 4 Nov 2024 19:57:30 +0000 Subject: [PATCH 027/198] seize: enable support for frozen containers Container runtimes like CRI-O and containerd utilize the freezer cgroup to create a consistent snapshot of container root filesystem (rootfs) changes. In this case, the container is frozen before invoking CRIU. After CRIU successfully completes, a copy of the container rootfs diff is saved, and the container is then unfrozen. However, the `cuda-checkpoint` tool is not able to perform a 'lock' action on frozen threads. To support GPU checkpointing with these container runtimes, we need to unfreeze the cgroup and return it to its original state once the checkpointing is complete. 
To reflect this new behavior, the following changes are applied: - `dont_use_freeze_cgroup(void)` -> `set_compel_interrupt_only_mode(void)` - `bool freeze_cgroup_disabled` -> `bool compel_interrupt_only_mode` - `check_freezer_cgroup(void)` -> `prepare_freezer_for_interrupt_only_mode(void)` Note that when `compel_interrupt_only_mode` is set to `true`, `compel_interrupt_task()` is used instead of `freeze_processes()` to prevent tasks from running during `criu dump`. Fixes: #2508 Signed-off-by: Radostin Stoyanov --- criu/fault-injection.c | 4 +-- criu/include/fault-injection.h | 2 +- criu/include/seize.h | 2 +- criu/seize.c | 46 +++++++++++++++++++--------------- plugins/cuda/cuda_plugin.c | 2 +- test/jenkins/criu-fault.sh | 2 +- 6 files changed, 32 insertions(+), 26 deletions(-) diff --git a/criu/fault-injection.c b/criu/fault-injection.c index 2272e6d842..5dd9acf601 100644 --- a/criu/fault-injection.c +++ b/criu/fault-injection.c @@ -24,8 +24,8 @@ int fault_injection_init(void) fi_strategy = start; switch (fi_strategy) { - case FI_DISABLE_FREEZE_CGROUP: - dont_use_freeze_cgroup(); + case FI_COMPEL_INTERRUPT_ONLY_MODE: + set_compel_interrupt_only_mode(); break; default: break; diff --git a/criu/include/fault-injection.h b/criu/include/fault-injection.h index 59adf05b9e..e987c18ce3 100644 --- a/criu/include/fault-injection.h +++ b/criu/include/fault-injection.h @@ -21,7 +21,7 @@ enum faults { FI_CORRUPT_EXTREGS = 134, FI_DONT_USE_PAGEMAP_SCAN = 135, FI_DUMP_CRASH = 136, - FI_DISABLE_FREEZE_CGROUP = 137, + FI_COMPEL_INTERRUPT_ONLY_MODE = 137, FI_PLUGIN_CUDA_FORCE_ENABLE = 138, FI_MAX, }; diff --git a/criu/include/seize.h b/criu/include/seize.h index f5ea76b16c..64e8d2d12f 100644 --- a/criu/include/seize.h +++ b/criu/include/seize.h @@ -9,6 +9,6 @@ extern bool alarm_timeouted(void); extern char *task_comm_info(pid_t pid, char *comm, size_t size); extern char *__task_comm_info(pid_t pid); -extern void dont_use_freeze_cgroup(void); +extern void 
set_compel_interrupt_only_mode(void); #endif diff --git a/criu/seize.c b/criu/seize.c index ab394f9ca5..9bd1832d9b 100644 --- a/criu/seize.c +++ b/criu/seize.c @@ -25,17 +25,17 @@ #include "xmalloc.h" #include "util.h" -static bool freeze_cgroup_disabled; +static bool compel_interrupt_only_mode; /* * Disables the use of freeze cgroups for process seizing, even if explicitly - * requested via the --freeze-cgroup option. This is necessary for plugins - * (e.g., CUDA) that do not function correctly when processes are frozen using - * cgroups. + * requested via the --freeze-cgroup option or already set in a frozen state. + * This is necessary for plugins (e.g., CUDA) that do not function correctly + * when processes are frozen using cgroups. */ -void __attribute__((used)) dont_use_freeze_cgroup(void) +void __attribute__((used)) set_compel_interrupt_only_mode(void) { - freeze_cgroup_disabled = true; + compel_interrupt_only_mode = true; } char *task_comm_info(pid_t pid, char *comm, size_t size) @@ -410,7 +410,7 @@ static int freezer_detach(void) { int i; - if (!opts.freeze_cgroup || freeze_cgroup_disabled) + if (!opts.freeze_cgroup || compel_interrupt_only_mode) return 0; for (i = 0; i < processes_to_wait && processes_to_wait_pids; i++) { @@ -505,29 +505,35 @@ static int log_unfrozen_stacks(char *root) return 0; } -static int check_freezer_cgroup(void) +static int prepare_freezer_for_interrupt_only_mode(void) { enum freezer_state state = THAWED; int fd; + int exit_code = -1; - BUG_ON(!freeze_cgroup_disabled); + BUG_ON(!compel_interrupt_only_mode); fd = freezer_open(); if (fd < 0) return -1; state = get_freezer_state(fd); - close(fd); if (state == FREEZER_ERROR) { - return -1; + goto err; } + origin_freezer_state = state == FREEZING ? 
FROZEN : state; + if (state != THAWED) { - pr_err("One or more plugins are incompatible with the freezer cgroup in the FROZEN state.\n"); - return -1; + pr_warn("unfreezing cgroup for plugin compatibility\n"); + if (freezer_write_state(fd, THAWED)) + goto err; } - return 0; + exit_code = 0; +err: + close(fd); + return exit_code; } static int freeze_processes(void) @@ -681,7 +687,7 @@ static int collect_children(struct pstree_item *item) goto free; } - if (!opts.freeze_cgroup || freeze_cgroup_disabled) + if (!opts.freeze_cgroup || compel_interrupt_only_mode) /* fails when meets a zombie */ __ignore_value(compel_interrupt_task(pid)); @@ -869,7 +875,7 @@ static int collect_threads(struct pstree_item *item) pr_info("\tSeizing %d's %d thread\n", item->pid->real, pid); - if ((!opts.freeze_cgroup || freeze_cgroup_disabled) && + if ((!opts.freeze_cgroup || compel_interrupt_only_mode) && compel_interrupt_task(pid)) continue; @@ -926,7 +932,7 @@ static int collect_loop(struct pstree_item *item, int (*collect)(struct pstree_i { int attempts = NR_ATTEMPTS, nr_inprogress = 1; - if (opts.freeze_cgroup && !freeze_cgroup_disabled) + if (opts.freeze_cgroup && !compel_interrupt_only_mode) attempts = 1; /* @@ -1032,11 +1038,11 @@ int collect_pstree(void) pr_debug("Detected cgroup V%d freezer\n", cgroup_v2 ? 
2 : 1); - if (opts.freeze_cgroup && !freeze_cgroup_disabled) { + if (opts.freeze_cgroup && !compel_interrupt_only_mode) { if (freeze_processes()) goto err; } else { - if (opts.freeze_cgroup && check_freezer_cgroup()) + if (opts.freeze_cgroup && prepare_freezer_for_interrupt_only_mode()) goto err; if (compel_interrupt_task(pid)) { set_cr_errno(ESRCH); @@ -1067,7 +1073,7 @@ int collect_pstree(void) if (ret < 0) goto err; - if (opts.freeze_cgroup && !freeze_cgroup_disabled && + if (opts.freeze_cgroup && !compel_interrupt_only_mode && freezer_wait_processes()) { goto err; } diff --git a/plugins/cuda/cuda_plugin.c b/plugins/cuda/cuda_plugin.c index c4fc67fa9f..3d624750e6 100644 --- a/plugins/cuda/cuda_plugin.c +++ b/plugins/cuda/cuda_plugin.c @@ -509,7 +509,7 @@ int cuda_plugin_init(int stage) INIT_LIST_HEAD(&cuda_pids); } - dont_use_freeze_cgroup(); + set_compel_interrupt_only_mode(); return 0; } diff --git a/test/jenkins/criu-fault.sh b/test/jenkins/criu-fault.sh index fc0eddc2b2..8cb71d8ca7 100755 --- a/test/jenkins/criu-fault.sh +++ b/test/jenkins/criu-fault.sh @@ -40,7 +40,7 @@ fi # also check for the main thread corruption ./test/zdtm.py run -t zdtm/static/fpu00 --fault 134 -f h --norst || fail -# check dont_use_freeze_cgroup +# check set_compel_interrupt_only_mode ./test/zdtm.py run -t zdtm/static/env00 --freezecg zdtm:t --fault 137 ./test/zdtm.py run -t zdtm/static/env00 --freezecg zdtm:t --fault 137 --norst From 31b38d662d26f2ef9a6a36e9aeaed23de71bb578 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Fri, 8 Nov 2024 13:41:20 +0000 Subject: [PATCH 028/198] ci: test interrupt-only mode with frozen cgroup Signed-off-by: Radostin Stoyanov --- test/jenkins/criu-fault.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/jenkins/criu-fault.sh b/test/jenkins/criu-fault.sh index 8cb71d8ca7..6ee7ce33a8 100755 --- a/test/jenkins/criu-fault.sh +++ b/test/jenkins/criu-fault.sh @@ -43,6 +43,8 @@ fi # check set_compel_interrupt_only_mode ./test/zdtm.py run -t 
zdtm/static/env00 --freezecg zdtm:t --fault 137 ./test/zdtm.py run -t zdtm/static/env00 --freezecg zdtm:t --fault 137 --norst +# check set_compel_interrupt_only_mode when test cgroup is frozen +./test/zdtm.py run -t zdtm/static/env00 --freezecg zdtm:f --fault 137 if ./test/zdtm.py run -t zdtm/static/vfork00 --fault 136 --report report -f h ; then fail From 26dcc216c2336f432cacbd5d4f126d7066f8c271 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Sat, 2 Nov 2024 08:29:43 +0000 Subject: [PATCH 029/198] cuda: fix check for GPU device availability The check for `/dev/nvidiactl` to determine if the CUDA plugin can be used is unreliable because in some cases the default path for driver installation is different [1]. This patch changes the logic to check if a GPU device is available in `/proc/driver/nvidia/gpus/`. This approach is similar to `torch.cuda.is_available()` and it is a more accurate indicator. The subsequent check for support of the `cuda-checkpoint --action` option would confirm if the driver supports checkpoint/restore. 
[1] https://github.com/NVIDIA/gpu-operator Fixes: #2509 Signed-off-by: Radostin Stoyanov --- plugins/cuda/cuda_plugin.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/plugins/cuda/cuda_plugin.c b/plugins/cuda/cuda_plugin.c index 3d624750e6..718db30251 100644 --- a/plugins/cuda/cuda_plugin.c +++ b/plugins/cuda/cuda_plugin.c @@ -470,6 +470,20 @@ int cuda_plugin_resume_devices_late(int pid) } CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__RESUME_DEVICES_LATE, cuda_plugin_resume_devices_late) +/** + * Check if a CUDA device is available on the system + */ +static bool is_cuda_device_available(void) +{ + const char *gpu_path = "/proc/driver/nvidia/gpus/"; + struct stat sb; + + if (stat(gpu_path, &sb) != 0) + return false; + + return S_ISDIR(sb.st_mode); +} + int cuda_plugin_init(int stage) { int ret; @@ -481,8 +495,8 @@ int cuda_plugin_init(int stage) } } - if (!fault_injected(FI_PLUGIN_CUDA_FORCE_ENABLE) && access("/dev/nvidiactl", F_OK)) { - pr_info("/dev/nvidiactl doesn't exist. The CUDA plugin is disabled.\n"); + if (!fault_injected(FI_PLUGIN_CUDA_FORCE_ENABLE) && !is_cuda_device_available()) { + pr_info("No GPU device found; CUDA plugin is disabled\n"); plugin_disabled = true; return 0; } From 6f0ec7def65ba535fcb572906dee5cd4c9f8046f Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Wed, 6 Nov 2024 22:08:24 +0530 Subject: [PATCH 030/198] pidfd: one process creates a helper and opens all fds to it Currently, the `waitpid()` call on the tmp process can be made by a process which is not its parent. This causes restore to fail. This patch instead selects one process to create the tmp process and open all the fds that point to it. These fds are sent to the correct process(es). 
Fixes: #2496 Signed-off-by: Andrei Vagin Signed-off-by: Bhavik Sachdev --- criu/files.c | 7 +-- criu/include/pidfd.h | 2 +- criu/pidfd.c | 124 +++++++++++++++++++++---------------------- 3 files changed, 62 insertions(+), 71 deletions(-) diff --git a/criu/files.c b/criu/files.c index a57fb860fb..31e705bcc5 100644 --- a/criu/files.c +++ b/criu/files.c @@ -1811,11 +1811,6 @@ int prepare_files(void) { init_fdesc_hash(); init_sk_info_hash(); - - if (init_dead_pidfd_hash()) { - pr_err("Could not initialise hash map for dead pidfds\n"); - return -1; - } - + init_dead_pidfd_hash(); return collect_image(&files_cinfo); } diff --git a/criu/include/pidfd.h b/criu/include/pidfd.h index 4d2d71700e..bcc0fb45ab 100644 --- a/criu/include/pidfd.h +++ b/criu/include/pidfd.h @@ -7,7 +7,7 @@ extern const struct fdtype_ops pidfd_dump_ops; extern struct collect_image_info pidfd_cinfo; extern int is_pidfd_link(char *link); -extern int init_dead_pidfd_hash(void); +extern void init_dead_pidfd_hash(void); struct pidfd_dump_info { PidfdEntry pidfe; pid_t pid; diff --git a/criu/pidfd.c b/criu/pidfd.c index 3ea3c93094..53b9bcf71a 100644 --- a/criu/pidfd.c +++ b/criu/pidfd.c @@ -21,32 +21,26 @@ struct pidfd_info { PidfdEntry *pidfe; struct file_desc d; + + struct dead_pidfd *dead; + struct pidfd_info *next; }; struct dead_pidfd { unsigned int ino; - int pid; - size_t count; - mutex_t pidfd_lock; + int creator_id; + struct hlist_node hash; + struct pidfd_info *list; }; #define DEAD_PIDFD_HASH_SIZE 32 static struct hlist_head dead_pidfd_hash[DEAD_PIDFD_HASH_SIZE]; -static mutex_t *dead_pidfd_hash_lock; -int init_dead_pidfd_hash(void) +void init_dead_pidfd_hash(void) { for (int i = 0; i < DEAD_PIDFD_HASH_SIZE; i++) INIT_HLIST_HEAD(&dead_pidfd_hash[i]); - - dead_pidfd_hash_lock = shmalloc(sizeof(*dead_pidfd_hash_lock)); - if (!dead_pidfd_hash_lock) - return -1; - - mutex_init(dead_pidfd_hash_lock); - - return 0; } static struct dead_pidfd *lookup_dead_pidfd(unsigned int ino) @@ -54,15 +48,12 @@ 
static struct dead_pidfd *lookup_dead_pidfd(unsigned int ino) struct dead_pidfd *dead; struct hlist_head *chain; - mutex_lock(dead_pidfd_hash_lock); chain = &dead_pidfd_hash[ino % DEAD_PIDFD_HASH_SIZE]; hlist_for_each_entry(dead, chain, hash) { if (dead->ino == ino) { - mutex_unlock(dead_pidfd_hash_lock); return dead; } } - mutex_unlock(dead_pidfd_hash_lock); return NULL; } @@ -142,7 +133,7 @@ static int create_tmp_process(void) return tmp_process; } -static int free_dead_pidfd(struct dead_pidfd *dead) +static int kill_helper(pid_t pid) { int status; sigset_t blockmask, oldmask; @@ -160,15 +151,13 @@ static int free_dead_pidfd(struct dead_pidfd *dead) goto err; } - if (kill(dead->pid, SIGKILL) < 0) { - pr_perror("Could not kill temporary process with pid: %d", - dead->pid); + if (kill(pid, SIGKILL) < 0) { + pr_perror("Could not kill temporary process with pid: %d", pid); goto err; } - if (waitpid(dead->pid, &status, 0) != dead->pid) { - pr_perror("Could not wait on temporary process with pid: %d", - dead->pid); + if (waitpid(pid, &status, 0) != pid) { + pr_perror("Could not wait on temporary process with pid: %d", pid); goto err; } @@ -188,9 +177,6 @@ static int free_dead_pidfd(struct dead_pidfd *dead) goto err; } - mutex_lock(dead_pidfd_hash_lock); - hlist_del(&dead->hash); - mutex_unlock(dead_pidfd_hash_lock); return 0; err: return -1; @@ -198,8 +184,9 @@ static int free_dead_pidfd(struct dead_pidfd *dead) static int open_one_pidfd(struct file_desc *d, int *new_fd) { - struct pidfd_info *info; + struct pidfd_info *info, *child; struct dead_pidfd *dead = NULL; + pid_t pid; int pidfd; info = container_of(d, struct pidfd_info, d); @@ -215,34 +202,44 @@ static int open_one_pidfd(struct file_desc *d, int *new_fd) dead = lookup_dead_pidfd(info->pidfe->ino); BUG_ON(!dead); - mutex_lock(&dead->pidfd_lock); - BUG_ON(dead->count == 0); - dead->count--; - if (dead->pid == -1) { - dead->pid = create_tmp_process(); - if (dead->pid < 0) { - mutex_unlock(&dead->pidfd_lock); - 
goto err_close; + if (info->dead && info->dead->creator_id != info->pidfe->id) { + int ret = recv_desc_from_peer(&info->d, &pidfd); + if (ret != 0) { + if (ret != 1) + pr_err("Can't get fd\n"); + return ret; } + goto out; } - pidfd = pidfd_open(dead->pid, info->pidfe->flags); - if (pidfd < 0) { - pr_perror("Could not open pidfd for %d", info->pidfe->nspid); - mutex_unlock(&dead->pidfd_lock); + pid = create_tmp_process(); + if (pid < 0) goto err_close; - } - if (dead->count == 0) { - if (free_dead_pidfd(dead)) { - pr_err("Failed to delete dead_pidfd struct\n"); - mutex_unlock(&dead->pidfd_lock); - close(pidfd); + for (child = dead->list; child; child = child->next) { + if (child == info) + continue; + pidfd = pidfd_open(pid, child->pidfe->flags); + if (pidfd < 0) { + pr_perror("Could not open pidfd for %d", child->pidfe->nspid); goto err_close; } + + if (send_desc_to_peer(pidfd, &child->d)) { + pr_perror("Can't send file descriptor"); + close(pidfd); + return -1; + } + close(pidfd); } - mutex_unlock(&dead->pidfd_lock); + pidfd = pidfd_open(pid, info->pidfe->flags); + if (pidfd < 0) { + pr_perror("Could not open pidfd for %d", info->pidfe->nspid); + goto err_close; + } + if (kill_helper(pid)) + goto err_close; out: if (rst_file_params(pidfd, info->pidfe->fown, info->pidfe->flags)) { goto err_close; @@ -269,32 +266,31 @@ static int collect_one_pidfd(void *obj, ProtobufCMessage *msg, struct cr_img *i) info->pidfe = pb_msg(msg, PidfdEntry); pr_info_pidfd("Collected ", info->pidfe); + info->dead = NULL; if (info->pidfe->nspid != -1) goto out; dead = lookup_dead_pidfd(info->pidfe->ino); - if (dead) { - mutex_lock(&dead->pidfd_lock); - dead->count++; - mutex_unlock(&dead->pidfd_lock); - goto out; - } - - dead = shmalloc(sizeof(*dead)); if (!dead) { - pr_err("Could not allocate shared memory..\n"); - return -1; + dead = xmalloc(sizeof(*dead)); + if (!dead) { + pr_err("Could not allocate memory..\n"); + return -1; + } + + INIT_HLIST_NODE(&dead->hash); + dead->list = NULL; + 
dead->ino = info->pidfe->ino; + dead->creator_id = info->pidfe->id; + hlist_add_head(&dead->hash, &dead_pidfd_hash[dead->ino % DEAD_PIDFD_HASH_SIZE]); } - INIT_HLIST_NODE(&dead->hash); - dead->ino = info->pidfe->ino; - dead->count = 1; - dead->pid = -1; - mutex_init(&dead->pidfd_lock); + info->dead = dead; + info->next = dead->list; + dead->list = info; + if (dead->creator_id > info->pidfe->id) + dead->creator_id = info->pidfe->id; - mutex_lock(dead_pidfd_hash_lock); - hlist_add_head(&dead->hash, &dead_pidfd_hash[dead->ino % DEAD_PIDFD_HASH_SIZE]); - mutex_unlock(dead_pidfd_hash_lock); out: return file_desc_add(&info->d, info->pidfe->id, &pidfd_desc_ops); } From 223a8f1e8640bc99c7fe4d3aaadef8af1480ac2b Mon Sep 17 00:00:00 2001 From: Bhavik Sachdev Date: Wed, 6 Nov 2024 22:10:08 +0530 Subject: [PATCH 031/198] zdtm: Check many processes with common dead pidfd We have multiple processes open a pidfd to a common dead process. After C/R we check whether the inode numbers of these pidfds are equal.
Signed-off-by: Bhavik Sachdev --- test/zdtm/static/Makefile | 1 + test/zdtm/static/pidfd_diffdead.c | 228 ++++++++++++++++++++++++++++++ 2 files changed, 229 insertions(+) create mode 100644 test/zdtm/static/pidfd_diffdead.c diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index 44ac64fe57..71a1b6a535 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -56,6 +56,7 @@ TST_NOFILE := \ pidfd_self \ pidfd_of_thread \ pidfd_dead \ + pidfd_diffdead \ pidfd_child \ pidfd_kill \ fd_from_pidfd \ diff --git a/test/zdtm/static/pidfd_diffdead.c b/test/zdtm/static/pidfd_diffdead.c new file mode 100644 index 0000000000..5bc1911a51 --- /dev/null +++ b/test/zdtm/static/pidfd_diffdead.c @@ -0,0 +1,228 @@ +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check C/R of processes that point to a common dead pidfd\n"; +const char *test_author = "Bhavik Sachdev "; + +#ifndef PID_FS_MAGIC +#define PID_FS_MAGIC 0x50494446 +#endif + +/* + * main + * `- child + * `- grandchild + * + * main and child open a pidfd for grandchild. + * Before C/R we kill grandchild. + * We end up with two pidfds in two different processes that point to the same dead process.
+ */ + +static long get_fs_type(int lfd) +{ + struct statfs fst; + + if (fstatfs(lfd, &fst)) { + return -1; + } + return fst.f_type; +} + +static int pidfd_open(pid_t pid, unsigned int flags) +{ + return syscall(__NR_pidfd_open, pid, flags); +} + +static int pidfd_send_signal(int pidfd, int sig, siginfo_t *info, unsigned int flags) +{ + return syscall(__NR_pidfd_send_signal, pidfd, sig, info, flags); +} + +static int check_for_pidfs(void) +{ + long type; + int pidfd = pidfd_open(getpid(), 0); + if (pidfd < 0) { + pr_perror("pidfd open() failed"); + return -1; + } + type = get_fs_type(pidfd); + close(pidfd); + return type == PID_FS_MAGIC; +} + +int main(int argc, char *argv[]) +{ +#define READ 0 +#define WRITE 1 + + int child, ret, gchild, status; + struct statx stat; + task_waiter_t t; + unsigned long long ino; + + /* + * We use the inop pipe to send the inode number of the + * pidfd opened in the child to the main process for + * comparison. + */ + int p[2]; + int pidfd; + + test_init(argc, argv); + task_waiter_init(&t); + + ret = check_for_pidfs(); + if (ret < 0) + return 1; + + if (ret == 0) { + test_daemon(); + test_waitsig(); + skip("Test requires pidfs. 
skipping..."); + pass(); + return 0; + } + + if (pipe(p)) { + pr_perror("pipe"); + return 1; + } + + child = test_fork(); + if (child < 0) { + pr_perror("fork"); + return 1; + } else if (child == 0) { + int gchild; + gchild = test_fork(); + if (gchild < 0) { + pr_perror("fork"); + return 1; + } else if (gchild == 0) { + close(p[READ]); + close(p[WRITE]); + while (1) + sleep(1000); + } else { + if (write(p[WRITE], &gchild, sizeof(int)) != sizeof(int)) { + pr_perror("write"); + return 1; + } + + pidfd = pidfd_open(gchild, 0); + if (pidfd < 0) { + pr_perror("pidfd_open"); + return 1; + } + + if (waitpid(gchild, &status, 0) != gchild) { + pr_perror("waitpid"); + return 1; + } + + if (!WIFSIGNALED(status)) { + fail("Expected grandchild to be terminated by a signal"); + return 1; + } + + if (WTERMSIG(status) != SIGKILL) { + fail("Expected grandchild to be terminated by SIGKILL"); + return 1; + } + task_waiter_complete(&t, 1); + + test_waitsig(); + + if (statx(pidfd, "", AT_EMPTY_PATH, STATX_ALL, &stat) < 0) { + pr_perror("statx"); + return 1; + } + + close(p[WRITE]); + if (read(p[READ], &ino, sizeof(ino)) != sizeof(ino)) { + pr_perror("read"); + return 1; + } + close(p[READ]); + close(pidfd); + + /* ino number should be same because both pidfds were for the same process */ + if (ino != stat.stx_ino) { + exit(1); + } + exit(0); + } + } + + if (read(p[READ], &gchild, sizeof(int)) != sizeof(int)) { + pr_perror("write"); + return 1; + } + + pidfd = pidfd_open(gchild, 0); + if (pidfd < 0) { + pr_perror("pidfd_open"); + return 1; + } + + /* + * We kill grandchild process only after opening pidfd. 
+ */ + if (pidfd_send_signal(pidfd, SIGKILL, NULL, 0)) { + pr_perror("pidfd_send_signal"); + return 1; + } + + /* Wait for child to waitpid on gchild */ + task_waiter_wait4(&t, 1); + + test_daemon(); + test_waitsig(); + + close(p[READ]); + if (statx(pidfd, "", AT_EMPTY_PATH, STATX_ALL, &stat) < 0) { + pr_perror("statx"); + goto err; + } + + /* Send inode number of pidfd to child for comparison */ + if (write(p[WRITE], &stat.stx_ino, sizeof(stat.stx_ino)) != sizeof(stat.stx_ino)) { + pr_perror("write"); + goto err; + } + close(p[WRITE]); + + if (kill(child, SIGTERM)) { + pr_perror("kill"); + goto err; + } + + if (waitpid(child, &status, 0) != child) { + pr_perror("waitpid"); + goto err; + } + + if (!WIFEXITED(status)) { + fail("Expected child to terminate normally"); + goto err; + } + + if (WEXITSTATUS(status) != 0) { + fail("Child failed"); + goto err; + } + + pass(); + close(pidfd); + return 0; +err: + close(pidfd); + return 1; +} From d6e5e7677f70b2feab7917c768a64a3a04485c1c Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Tue, 12 Nov 2024 13:04:31 +0000 Subject: [PATCH 032/198] cuda: enable checkpoint support for paused tasks If a CUDA process is already in a "locked" or "checkpointed" state during criu dump, the CUDA plugin currently fails with an error because it attempts an unnecessary "lock" action using the cuda-checkpoint tool. This patch extends the CUDA plugin to handle such cases by first verifying the initial state of the CUDA processes and skipping unnecessary "lock" and "checkpoint" actions when a process has been locked or checkpointed before CRIU is invoked. In particular, CUDA tasks may already be in a "locked" or "checkpointed" state to ensure consistent checkpoint/restore for distributed workloads, such as model training, where multiple containers run across different cluster nodes. 
Another use case for this functionality is optimizing resource utilization, where CUDA tasks with low-priority are preempted immediately to release GPU resources needed by high-priority tasks, and the paused workloads are later resumed or migrated to another node. Signed-off-by: Radostin Stoyanov --- plugins/cuda/cuda_plugin.c | 155 +++++++++++++++++++++++++++---------- 1 file changed, 116 insertions(+), 39 deletions(-) diff --git a/plugins/cuda/cuda_plugin.c b/plugins/cuda/cuda_plugin.c index 718db30251..7764cf3c75 100644 --- a/plugins/cuda/cuda_plugin.c +++ b/plugins/cuda/cuda_plugin.c @@ -26,6 +26,13 @@ #define ACTION_RESTORE "restore" #define ACTION_UNLOCK "unlock" +typedef enum { + CUDA_TASK_RUNNING = 0, + CUDA_TASK_LOCKED, + CUDA_TASK_CHECKPOINTED, + CUDA_TASK_UNKNOWN = -1 +} cuda_task_state_t; + #define CUDA_CKPT_BUF_SIZE (128) #ifdef LOG_PREFIX @@ -43,6 +50,7 @@ bool plugin_added_to_inventory = false; struct pid_info { int pid; char checkpointed; + cuda_task_state_t initial_task_state; struct list_head list; }; @@ -62,7 +70,7 @@ static void dealloc_pid_buffer(struct list_head *pid_buf) } } -static int add_pid_to_buf(struct list_head *pid_buf, int pid) +static int add_pid_to_buf(struct list_head *pid_buf, int pid, cuda_task_state_t state) { struct pid_info *new = xmalloc(sizeof(*new)); @@ -72,25 +80,12 @@ static int add_pid_to_buf(struct list_head *pid_buf, int pid) new->pid = pid; new->checkpointed = 0; + new->initial_task_state = state; list_add_tail(&new->list, pid_buf); return 0; } -static int update_checkpointed_pid(struct list_head *pid_buf, int pid) -{ - struct pid_info *info; - - list_for_each_entry(info, pid_buf, list) { - if (info->pid == pid) { - info->checkpointed = 1; - return 0; - } - } - - return -1; -} - static int launch_cuda_checkpoint(const char **args, char *buf, int buf_size) { #define READ 0 @@ -231,6 +226,37 @@ static int get_cuda_restore_tid(int root_pid) return atoi(pid_out); } +static cuda_task_state_t get_task_state_enum(const char 
*state_str) +{ + if (strncmp(state_str, "running", 7) == 0) + return CUDA_TASK_RUNNING; + + if (strncmp(state_str, "locked", 6) == 0) + return CUDA_TASK_LOCKED; + + if (strncmp(state_str, "checkpointed", 12) == 0) + return CUDA_TASK_CHECKPOINTED; + + pr_err("Unknown CUDA state: %s\n", state_str); + return CUDA_TASK_UNKNOWN; +} + +static cuda_task_state_t get_cuda_state(pid_t pid) +{ + char pid_buf[16]; + char state_str[CUDA_CKPT_BUF_SIZE]; + const char *args[] = { CUDA_CHECKPOINT, "--get-state", "--pid", pid_buf, NULL }; + + snprintf(pid_buf, sizeof(pid_buf), "%d", pid); + + if (launch_cuda_checkpoint(args, state_str, sizeof(state_str))) { + pr_err("Failed to launch cuda-checkpoint to retrieve state: %s\n", state_str); + return CUDA_TASK_UNKNOWN; + } + + return get_task_state_enum(state_str); +} + static int cuda_process_checkpoint_action(int pid, const char *action, unsigned int timeout, char *msg_buf, int buf_size) { @@ -319,6 +345,8 @@ int cuda_plugin_checkpoint_devices(int pid) int int_ret; int status; k_rtsigset_t save_sigset; + struct pid_info *task_info; + bool pid_found = false; if (plugin_disabled) { return -ENOTSUP; @@ -336,6 +364,26 @@ int cuda_plugin_checkpoint_devices(int pid) return 0; } + /* Check if the process is already in a checkpointed state */ + list_for_each_entry(task_info, &cuda_pids, list) { + if (task_info->pid == pid) { + if (task_info->initial_task_state == CUDA_TASK_CHECKPOINTED) { + pr_info("pid %d already in a checkpointed state\n", pid); + return 0; + } + pid_found = true; + break; + } + } + + if (pid_found == false) { + /* We return an error here. The task should be restored + * to its original state at cuda_plugin_fini(). 
+ */ + pr_err("Failed to track pid %d\n", pid); + return -1; + } + pr_info("Checkpointing CUDA devices on pid %d restore_tid %d\n", pid, restore_tid); /* We need to resume the checkpoint thread to prepare the mappings for * checkpointing @@ -348,22 +396,8 @@ int cuda_plugin_checkpoint_devices(int pid) pr_err("CHECKPOINT_DEVICES failed with %s\n", msg_buf); goto interrupt; } - status = update_checkpointed_pid(&cuda_pids, pid); - if (status) { - pr_err("Failed to track checkpointed pid %d\n", pid); - status = cuda_process_checkpoint_action(pid, ACTION_RESTORE, 0, msg_buf, sizeof(msg_buf)); - if (status) { - pr_err("Failed to restore process after error %s on pid %d\n", msg_buf, pid); - } - } - if (!status && !plugin_added_to_inventory) { - status = add_inventory_plugin(CR_PLUGIN_DESC.name); - if (status) - pr_err("Failed to add CUDA plugin to inventory image\n"); - else - plugin_added_to_inventory = true; - } + task_info->checkpointed = 1; interrupt: int_ret = interrupt_restore_thread(restore_tid, &save_sigset); @@ -376,6 +410,7 @@ int cuda_plugin_pause_devices(int pid) { int restore_tid; char msg_buf[CUDA_CKPT_BUF_SIZE]; + cuda_task_state_t task_state; if (plugin_disabled) { return -ENOTSUP; @@ -388,6 +423,34 @@ int cuda_plugin_pause_devices(int pid) return 0; } + task_state = get_cuda_state(restore_tid); + if (task_state == CUDA_TASK_UNKNOWN) { + pr_err("Failed to get CUDA state for PID %d\n", restore_tid); + return -1; + } + + if (!plugin_added_to_inventory) { + if (add_inventory_plugin(CR_PLUGIN_DESC.name)) { + pr_err("Failed to add CUDA plugin to inventory image\n"); + return -1; + } + plugin_added_to_inventory = true; + } + + if (task_state == CUDA_TASK_LOCKED) { + pr_info("pid %d already in a locked state\n", pid); + /* Leave this PID in a "locked" state at resume_device() */ + add_pid_to_buf(&cuda_pids, pid, CUDA_TASK_LOCKED); + return 0; + } + + if (task_state == CUDA_TASK_CHECKPOINTED) { + /* We need to skip this PID in cuda_plugin_checkpoint_devices(), + * 
and leave it in a "checkpointed" state at resume_device(). */ + add_pid_to_buf(&cuda_pids, pid, CUDA_TASK_CHECKPOINTED); + return 0; + } + pr_info("pausing devices on pid %d\n", pid); int status = cuda_process_checkpoint_action(pid, ACTION_LOCK, opts.timeout * 1000, msg_buf, sizeof(msg_buf)); if (status) { @@ -397,7 +460,7 @@ int cuda_plugin_pause_devices(int pid) return -1; } - if (add_pid_to_buf(&cuda_pids, pid)) { + if (add_pid_to_buf(&cuda_pids, pid, CUDA_TASK_RUNNING)) { pr_err("unable to track paused pid %d\n", pid); goto unlock; } @@ -412,7 +475,7 @@ int cuda_plugin_pause_devices(int pid) } CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__PAUSE_DEVICES, cuda_plugin_pause_devices) -int resume_device(int pid, int checkpointed) +int resume_device(int pid, int checkpointed, cuda_task_state_t initial_task_state) { char msg_buf[CUDA_CKPT_BUF_SIZE]; int status; @@ -420,6 +483,11 @@ int resume_device(int pid, int checkpointed) int int_ret; k_rtsigset_t save_sigset; + if (initial_task_state == CUDA_TASK_UNKNOWN) { + pr_info("skip resume for PID %d (unknown state)\n", pid); + return 0; + } + int restore_tid = get_cuda_restore_tid(pid); if (restore_tid == -1) { pr_info("No need to resume devices on pid %d\n", pid); @@ -439,7 +507,8 @@ int resume_device(int pid, int checkpointed) return -1; } - if (checkpointed) { + if (checkpointed && (initial_task_state == CUDA_TASK_RUNNING || initial_task_state == CUDA_TASK_LOCKED)) { + /* If the process was "locked" or "running" before checkpointing it, we need to restore it */ status = cuda_process_checkpoint_action(pid, ACTION_RESTORE, 0, msg_buf, sizeof(msg_buf)); if (status) { pr_err("RESUME_DEVICES RESTORE failed with %s\n", msg_buf); @@ -448,10 +517,13 @@ int resume_device(int pid, int checkpointed) } } - status = cuda_process_checkpoint_action(pid, ACTION_UNLOCK, 0, msg_buf, sizeof(msg_buf)); - if (status) { - pr_err("RESUME_DEVICES UNLOCK failed with %s\n", msg_buf); - ret = -1; + if (initial_task_state == CUDA_TASK_RUNNING) { + /* If
the process was "running" before we paused it, we need to unlock it */ + status = cuda_process_checkpoint_action(pid, ACTION_UNLOCK, 0, msg_buf, sizeof(msg_buf)); + if (status) { + pr_err("RESUME_DEVICES UNLOCK failed with %s\n", msg_buf); + ret = -1; + } } interrupt: @@ -466,7 +538,12 @@ int cuda_plugin_resume_devices_late(int pid) return -ENOTSUP; } - return resume_device(pid, 1); + /* RESUME_DEVICES_LATE is used during `criu restore`. + * Here, we assume that users expect the target process + * to be in a "running" state after restore, even if it was + * in a "locked" or "checkpointed" state during `criu dump`. + */ + return resume_device(pid, 1, CUDA_TASK_RUNNING); } CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__RESUME_DEVICES_LATE, cuda_plugin_resume_devices_late) @@ -542,7 +619,7 @@ void cuda_plugin_fini(int stage, int ret) if (stage == CR_PLUGIN_STAGE__DUMP && (opts.final_state == TASK_ALIVE || ret != 0)) { struct pid_info *info; list_for_each_entry(info, &cuda_pids, list) { - resume_device(info->pid, info->checkpointed); + resume_device(info->pid, info->checkpointed, info->initial_task_state); } } if (stage == CR_PLUGIN_STAGE__DUMP) { From dd6b580b43b8a678a22190a42b76d971b4352bfe Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Tue, 12 Nov 2024 15:14:51 +0000 Subject: [PATCH 033/198] test: add get-state to mocked cuda-checkpoint tool Signed-off-by: Radostin Stoyanov --- test/cuda-checkpoint/cuda-checkpoint.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/test/cuda-checkpoint/cuda-checkpoint.c b/test/cuda-checkpoint/cuda-checkpoint.c index f35a4b41df..3b7ce8b9ff 100644 --- a/test/cuda-checkpoint/cuda-checkpoint.c +++ b/test/cuda-checkpoint/cuda-checkpoint.c @@ -11,6 +11,7 @@ int main(int argc, char *argv[]) int option_index = 0; static struct option long_options[] = { { "pid", required_argument, 0, 'p' }, + { "get-state", no_argument, 0, 's' }, { "get-restore-tid", no_argument, 0, 'g' }, { "action", required_argument, 0, 'a' }, { "timeout", 
required_argument, 0, 't' }, @@ -31,6 +32,9 @@ int main(int argc, char *argv[]) case 'a': case 't': break; + case 's': + printf("running\n"); + break; case 'h': printf("--action - execute an action"); break; From 7a8ed9e210e8746eb93e179dae0206d645da7db3 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Sun, 17 Nov 2024 16:10:20 +0000 Subject: [PATCH 034/198] compel: fix gitignore and remove autogenerated code We don't need to have compel/arch/riscv64/plugins/std/syscalls/syscalls.S tracked in git. It is autogenerated. We also need to update our .gitignore to ignore autogenerated files with syscall tables. Signed-off-by: Alexander Mikhalitsyn --- compel/.gitignore | 3 + .../riscv64/plugins/std/syscalls/syscalls.S | 112 ------------------ 2 files changed, 3 insertions(+), 112 deletions(-) delete mode 100644 compel/arch/riscv64/plugins/std/syscalls/syscalls.S diff --git a/compel/.gitignore b/compel/.gitignore index eab3337d6b..5e770a86c7 100644 --- a/compel/.gitignore +++ b/compel/.gitignore @@ -4,6 +4,9 @@ arch/arm/plugins/std/syscalls/syscalls.S arch/aarch64/plugins/std/syscalls/syscalls.S arch/s390/plugins/std/syscalls/syscalls.S arch/ppc64/plugins/std/syscalls/syscalls.S +arch/mips/plugins/std/syscalls/syscalls-64.S +arch/loongarch64/plugins/std/syscalls/syscalls-64.S +arch/riscv64/plugins/std/syscalls/syscalls.S include/version.h plugins/include/uapi/std/asm/syscall-types.h plugins/include/uapi/std/syscall-64.h diff --git a/compel/arch/riscv64/plugins/std/syscalls/syscalls.S b/compel/arch/riscv64/plugins/std/syscalls/syscalls.S deleted file mode 100644 index 715da46122..0000000000 --- a/compel/arch/riscv64/plugins/std/syscalls/syscalls.S +++ /dev/null @@ -1,112 +0,0 @@ -/* Autogenerated, don't edit */ -#include -#include "std/syscalls/syscall-common.S" -syscall sys_read, __NR_read -syscall sys_write, __NR_write -syscall sys_close, __NR_close -syscall sys_lseek, __NR_lseek -syscall sys_mmap, __NR_mmap -syscall sys_mprotect, __NR_mprotect -syscall 
sys_munmap, __NR_munmap -syscall sys_brk, __NR_brk -syscall sys_sigaction, __NR_rt_sigaction -syscall sys_sigprocmask, __NR_rt_sigprocmask -syscall sys_rt_sigreturn, __NR_rt_sigreturn -syscall sys_ioctl, __NR_ioctl -syscall sys_pread64, __NR_pread64 -syscall sys_ptrace, __NR_ptrace -syscall sys_mremap, __NR_mremap -syscall sys_mincore, __NR_mincore -syscall sys_madvise, __NR_madvise -syscall sys_shmat, __NR_shmat -syscall sys_pause, __NR_pause -syscall sys_nanosleep, __NR_nanosleep -syscall sys_getitimer, __NR_getitimer -syscall sys_setitimer, __NR_setitimer -syscall sys_getpid, __NR_getpid -syscall sys_socket, __NR_socket -syscall sys_connect, __NR_connect -syscall sys_sendto, __NR_sendto -syscall sys_recvfrom, __NR_recvfrom -syscall sys_sendmsg, __NR_sendmsg -syscall sys_recvmsg, __NR_recvmsg -syscall sys_shutdown, __NR_shutdown -syscall sys_bind, __NR_bind -syscall sys_setsockopt, __NR_setsockopt -syscall sys_getsockopt, __NR_getsockopt -syscall sys_clone, __NR_clone -syscall sys_exit, __NR_exit -syscall sys_wait4, __NR_wait4 -syscall sys_waitid, __NR_waitid -syscall sys_kill, __NR_kill -syscall sys_fcntl, __NR_fcntl -syscall sys_flock, __NR_flock -syscall sys_readlinkat, __NR_readlinkat -syscall sys_umask, __NR_umask -syscall sys_getgroups, __NR_getgroups -syscall sys_setgroups, __NR_setgroups -syscall sys_setresuid, __NR_setresuid -syscall sys_getresuid, __NR_getresuid -syscall sys_setresgid, __NR_setresgid -syscall sys_getresgid, __NR_getresgid -syscall sys_getpgid, __NR_getpgid -syscall sys_setfsuid, __NR_setfsuid -syscall sys_setfsgid, __NR_setfsgid -syscall sys_getsid, __NR_getsid -syscall sys_capget, __NR_capget -syscall sys_capset, __NR_capset -syscall sys_rt_sigqueueinfo, __NR_rt_sigqueueinfo -syscall sys_setpriority, __NR_setpriority -syscall sys_sched_setscheduler, __NR_sched_setscheduler -syscall sys_sigaltstack, __NR_sigaltstack -syscall sys_personality, __NR_personality -syscall sys_prctl, __NR_prctl -syscall sys_setrlimit, __NR_setrlimit -syscall 
sys_mount, __NR_mount -syscall sys_umount2, __NR_umount2 -syscall sys_gettid, __NR_gettid -syscall sys_futex, __NR_futex -syscall sys_set_tid_address, __NR_set_tid_address -syscall sys_restart_syscall, __NR_restart_syscall -syscall sys_timer_create, __NR_timer_create -syscall sys_timer_settime, __NR_timer_settime -syscall sys_timer_gettime, __NR_timer_gettime -syscall sys_timer_getoverrun, __NR_timer_getoverrun -syscall sys_timer_delete, __NR_timer_delete -syscall sys_clock_gettime, __NR_clock_gettime -syscall sys_exit_group, __NR_exit_group -syscall sys_set_robust_list, __NR_set_robust_list -syscall sys_get_robust_list, __NR_get_robust_list -syscall sys_signalfd4, __NR_signalfd4 -syscall sys_rt_tgsigqueueinfo, __NR_rt_tgsigqueueinfo -syscall sys_vmsplice, __NR_vmsplice -syscall sys_timerfd_settime, __NR_timerfd_settime -syscall sys_fanotify_init, __NR_fanotify_init -syscall sys_fanotify_mark, __NR_fanotify_mark -syscall sys_open_by_handle_at, __NR_open_by_handle_at -syscall sys_setns, __NR_setns -syscall sys_kcmp, __NR_kcmp -syscall sys_openat, __NR_openat -syscall sys_mkdirat, __NR_mkdirat -syscall sys_unlinkat, __NR_unlinkat -syscall sys_memfd_create, __NR_memfd_create -syscall sys_io_setup, __NR_io_setup -syscall sys_io_submit, __NR_io_submit -syscall sys_io_getevents, __NR_io_getevents -syscall sys_seccomp, __NR_seccomp -syscall sys_gettimeofday, __NR_gettimeofday -syscall sys_preadv_raw, __NR_preadv_raw -syscall sys_userfaultfd, __NR_userfaultfd -syscall sys_fallocate, __NR_fallocate -syscall sys_ppoll, __NR_ppoll -syscall sys_fsopen, __NR_fsopen -syscall sys_fsconfig, __NR_fsconfig -syscall sys_fsmount, __NR_fsmount -syscall sys_clone3, __NR_clone3 -syscall sys_pidfd_open, __NR_pidfd_open -syscall sys_pidfd_getfd, __NR_pidfd_getfd -syscall sys_rseq, __NR_rseq -syscall sys_move_mount, __NR_move_mount -syscall sys_open_tree, __NR_open_tree -syscall sys_openat2, __NR_openat2 -#include From 1452c76f6541721a77e886a41c6ee63bb5571c0c Mon Sep 17 00:00:00 2001 From: 
Alexander Mikhalitsyn Date: Sun, 17 Nov 2024 18:32:03 +0000 Subject: [PATCH 035/198] compel/arch/riscv64: properly implement compel_task_size() We need to dynamically calculate TASK_SIZE depending on the MMU of the RISC-V system. [We use an analogous approach on aarch64/ppc64le.] This change was tested on a physical machine: StarFive VisionFive 2 isa : rv64imafdc_zicntr_zicsr_zifencei_zihpm_zca_zcd_zba_zbb mmu : sv39 uarch : sifive,u74-mc mvendorid : 0x489 marchid : 0x8000000000000007 mimpid : 0x4210427 hart isa : rv64imafdc_zicntr_zicsr_zifencei_zihpm_zca_zcd_zba_zbb Signed-off-by: Alexander Mikhalitsyn --- compel/arch/riscv64/src/lib/infect.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/compel/arch/riscv64/src/lib/infect.c b/compel/arch/riscv64/src/lib/infect.c index 01395a205a..861fe3b2f2 100644 --- a/compel/arch/riscv64/src/lib/infect.c +++ b/compel/arch/riscv64/src/lib/infect.c @@ -181,20 +181,22 @@ int arch_fetch_sas(struct parasite_ctl *ctl, struct rt_sigframe *s) * Task size is the maximum virtual address space size that a process can occupy in the memory * Refer to linux kernel arch/riscv/include/asm/pgtable.h, * task size is: - * - 0x9fc00000 (~2.5GB) for RV32. - * - 0x4000000000 ( 256GB) for RV64 using SV39 mmu - * - 0x800000000000 ( 128TB) for RV64 using SV48 mmu - * - * Note that PGDIR_SIZE must evenly divide TASK_SIZE since "RISC-V - * Instruction Set Manual Volume II: Privileged Architecture" states that - * "load and store effective addresses, which are 64bits, must have bits - * 63–48 all equal to bit 47, or else a page-fault exception will occur." -*/ -#define TASK_SIZE 0x800000000000UL // hardcoded for SV48 MMU + * - 0x9fc00000 (~2.5GB) for RV32.
+ * - 0x4000000000 ( 256GB) for RV64 using SV39 mmu + * - 0x800000000000 ( 128TB) for RV64 using SV48 mmu + * - 0x100000000000000 ( 64PB) for RV64 using SV57 mmu + */ +#define TASK_SIZE_MIN (1UL << 38) +#define TASK_SIZE_MAX (1UL << 56) unsigned long compel_task_size(void) { - return TASK_SIZE; + unsigned long task_size; + + for (task_size = TASK_SIZE_MIN; task_size < TASK_SIZE_MAX; task_size <<= 1) + if (munmap((void *)task_size, page_size())) + break; + return task_size; } /* From beff27eca1ea2f92ee7d023884479f14571f119a Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Sat, 23 Nov 2024 22:29:45 +0000 Subject: [PATCH 036/198] pidfd: add missing include MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix for the following error when building CRIU on Rocky Linux 8 criu/pidfd.c: In function ‘pidfd_open’: criu/pidfd.c:119:17: error: ‘__NR_pidfd_open’ undeclared (first use in this function); did you mean ‘pidfd_open’? return syscall(__NR_pidfd_open, pid, flags); ^~~~~~~~~~~~~~~ pidfd_open criu/pidfd.c:119:17: note: each undeclared identifier is reported only once for each function it appears in criu/pidfd.c:120:1: error: control reaches end of non-void function [-Werror=return-type] } ^ criu/pidfd.c: At top level: cc1: error: unrecognized command line option ‘-Wno-unknown-warning-option’ [-Werror] cc1: error: unrecognized command line option ‘-Wno-dangling-pointer’ [-Werror] cc1: all warnings being treated as errors Signed-off-by: Radostin Stoyanov --- criu/pidfd.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/criu/pidfd.c b/criu/pidfd.c index 53b9bcf71a..ae32025b09 100644 --- a/criu/pidfd.c +++ b/criu/pidfd.c @@ -11,6 +11,8 @@ #include "common/bug.h" #include "rst-malloc.h" +#include "compel/plugins/std/syscall-codes.h" + #undef LOG_PREFIX #define LOG_PREFIX "pidfd: " From 058572e91dea1d8ac9c345e69e08a58e8abfacbb Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Thu, 5 Dec 2024 22:17:38 +0000 Subject: [PATCH 
037/198] vdso: handle vvar_vclock vma-s The vvar_vclock was introduced by [1]. Basically, the old vvar vma has been split into two parts. In terms of C/R, these two vma-s can still be treated as one. [1] e93d2521b27f ("x86/vdso: Split virtual clock pages into dedicated mapping") Signed-off-by: Andrei Vagin --- criu/include/util-vdso.h | 1 + criu/pie/parasite-vdso.c | 19 ++++++++++++++++++- criu/proc_parse.c | 23 +++++++++++++++++++---- criu/vdso.c | 28 +++++++++++++++++++++------- 4 files changed, 59 insertions(+), 12 deletions(-) diff --git a/criu/include/util-vdso.h b/criu/include/util-vdso.h index c4386cf8ed..9fd9a6de4a 100644 --- a/criu/include/util-vdso.h +++ b/criu/include/util-vdso.h @@ -30,6 +30,7 @@ struct vdso_symbol { struct vdso_symtable { unsigned long vdso_size; unsigned long vvar_size; + unsigned long vvar_vclock_size; struct vdso_symbol symbols[VDSO_SYMBOL_MAX]; bool vdso_before_vvar; /* order of vdso/vvar pair */ }; diff --git a/criu/pie/parasite-vdso.c b/criu/pie/parasite-vdso.c index 355007fa92..f3ad3107fe 100644 --- a/criu/pie/parasite-vdso.c +++ b/criu/pie/parasite-vdso.c @@ -45,6 +45,7 @@ static int remap_one(char *who, unsigned long *from, unsigned long to, size_t si static int park_at(struct vdso_maps *rt, unsigned long vdso, unsigned long vvar) { unsigned long vvar_size = rt->sym.vvar_size; + unsigned long vvar_vclock_size = rt->sym.vvar_vclock_size; unsigned long vdso_size = rt->sym.vdso_size; int ret; @@ -54,8 +55,24 @@ static int park_at(struct vdso_maps *rt, unsigned long vdso, unsigned long vvar) std_log_set_gettimeofday(NULL); /* stop using vdso for timings */ - if (vvar) + if (vvar) { + /* + * v6.13-rc1~172^2~9 splits the vvar vma in two parts vvar and + * vvar_vclock. The last one is mapped right after the first + * one.
+ */ + if (vvar_vclock_size) { + unsigned long from; + + vvar_size -= vvar_vclock_size; + from = rt->vvar_start + vvar_size; + + ret = remap_one("rt-vvar", &from, vvar + vvar_size, vvar_vclock_size); + if (ret) + return ret; + } ret = remap_one("rt-vvar", &rt->vvar_start, vvar, vvar_size); + } if (!ret) vdso_update_gtod_addr(rt); diff --git a/criu/proc_parse.c b/criu/proc_parse.c index 95ebe3a411..6c4303e7dd 100644 --- a/criu/proc_parse.c +++ b/criu/proc_parse.c @@ -579,7 +579,8 @@ static int handle_vma(pid_t pid, struct vma_area *vma_area, const char *file_pat } else if (!strcmp(file_path, "[vdso]")) { if (handle_vdso_vma(vma_area)) goto err; - } else if (!strcmp(file_path, "[vvar]")) { + } else if (!strcmp(file_path, "[vvar]") || + !strcmp(file_path, "[vvar_vclock]")) { if (handle_vvar_vma(vma_area)) goto err; } else if (!strcmp(file_path, "[heap]")) { @@ -771,7 +772,7 @@ static int task_size_check(pid_t pid, VmaEntry *entry) int parse_smaps(pid_t pid, struct vm_area_list *vma_area_list, dump_filemap_t dump_filemap) { - struct vma_area *vma_area = NULL; + struct vma_area *vma_area = NULL, *prev_vma_area = NULL; unsigned long start, end, pgoff, prev_end = 0; char r, w, x, s; int ret = -1, vm_file_fd = -1; @@ -813,8 +814,22 @@ int parse_smaps(pid_t pid, struct vm_area_list *vma_area_list, dump_filemap_t du continue; } - if (vma_area && vma_list_add(vma_area, vma_area_list, &prev_end, &vfi, &prev_vfi)) - goto err; + if (vma_area && vma_area_is(vma_area, VMA_AREA_VVAR) && + prev_vma_area && vma_area_is(prev_vma_area, VMA_AREA_VVAR)) { + if (prev_vma_area->e->end != vma_area->e->start) { + pr_err("two nonconsecutive vvar vma-s: " + "%" PRIx64 "-%" PRIx64 " %" PRIx64 "-%" PRIx64 "\n", + prev_vma_area->e->start, prev_vma_area->e->end, + vma_area->e->start, vma_area->e->end); + goto err; + } + /* Merge all vvar vma-s into one. 
*/ + prev_vma_area->e->end = vma_area->e->end; + } else { + if (vma_area && vma_list_add(vma_area, vma_area_list, &prev_end, &vfi, &prev_vfi)) + goto err; + prev_vma_area = vma_area; + } if (eof) break; diff --git a/criu/vdso.c b/criu/vdso.c index 7de2fae784..d4d3511314 100644 --- a/criu/vdso.c +++ b/criu/vdso.c @@ -310,7 +310,7 @@ static int vdso_parse_maps(pid_t pid, struct vdso_maps *s) while (1) { unsigned long start, end; - char *has_vdso, *has_vvar; + char *has_vdso, *has_vvar, *has_vvar_vclock; buf = breadline(&f); if (buf == NULL) @@ -318,13 +318,19 @@ static int vdso_parse_maps(pid_t pid, struct vdso_maps *s) if (IS_ERR(buf)) goto err; - has_vdso = strstr(buf, "[vdso]"); - if (!has_vdso) + has_vvar = NULL; + has_vvar_vclock = NULL; + do { + has_vdso = strstr(buf, "[vdso]"); + if (has_vdso) + break; has_vvar = strstr(buf, "[vvar]"); - else - has_vvar = NULL; + if (has_vvar) + break; + has_vvar_vclock = strstr(buf, "[vvar_vclock]"); + } while (0); - if (!has_vdso && !has_vvar) + if (!has_vdso && !has_vvar && !has_vvar_vclock) continue; if (sscanf(buf, "%lx-%lx", &start, &end) != 2) { @@ -339,13 +345,21 @@ static int vdso_parse_maps(pid_t pid, struct vdso_maps *s) } s->vdso_start = start; s->sym.vdso_size = end - start; - } else { + } else if (has_vvar) { if (s->vvar_start != VVAR_BAD_ADDR) { pr_err("Got second VVAR entry\n"); goto err; } s->vvar_start = start; s->sym.vvar_size = end - start; + } else { + if (s->vvar_start == VDSO_BAD_ADDR || + s->vvar_start + s->sym.vvar_size != start) { + pr_err("VVAR and VVAR_VCLOCK entries are not subsequent\n"); + goto err; + } + s->sym.vvar_vclock_size = end - start; + s->sym.vvar_size += s->sym.vvar_vclock_size; } } From 6d1da6148298e772e8ebabd613f68eccdb214a9e Mon Sep 17 00:00:00 2001 From: Jesus Ramos Date: Tue, 10 Dec 2024 12:11:57 -0800 Subject: [PATCH 038/198] cuda: Fix return value from CHECKPOINT_DEVICES hook so that dump's fail properly cuda-checkpoint returns the positive CUDA error code when it runs into an 
issue and passing that along as the return value would cause errors to get ignored Signed-off-by: Jesus Ramos --- plugins/cuda/cuda_plugin.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/plugins/cuda/cuda_plugin.c b/plugins/cuda/cuda_plugin.c index 7764cf3c75..e78828b189 100644 --- a/plugins/cuda/cuda_plugin.c +++ b/plugins/cuda/cuda_plugin.c @@ -402,7 +402,7 @@ int cuda_plugin_checkpoint_devices(int pid) interrupt: int_ret = interrupt_restore_thread(restore_tid, &save_sigset); - return status != 0 ? status : int_ret; + return status != 0 ? -1 : int_ret; } CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__CHECKPOINT_DEVICES, cuda_plugin_checkpoint_devices); From d46cbf76ed51b3adb9e9575a8f9ae27476b659d6 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Sat, 7 Dec 2024 00:08:59 +0000 Subject: [PATCH 039/198] test/java: increase the ghost file limit Right now, this test fails with this error: Error (criu/files-reg.c:1031): Can't dump ghost file /criu/test/javaTests/omrvmem_000000626_Mlm48x of 2097152 size, increase limit Signed-off-by: Andrei Vagin --- scripts/build/Dockerfile.openj9-ubuntu | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/build/Dockerfile.openj9-ubuntu b/scripts/build/Dockerfile.openj9-ubuntu index c2cf20a36b..e190c27929 100644 --- a/scripts/build/Dockerfile.openj9-ubuntu +++ b/scripts/build/Dockerfile.openj9-ubuntu @@ -24,9 +24,10 @@ RUN apt-install protobuf-c-compiler \ gcc \ maven +RUN mkdir -p /etc/criu && echo 'ghost-limit 16777216' > /etc/criu/default.conf COPY .
/criu WORKDIR /criu RUN make mrproper && make -j $(nproc) CC="$CC" -ENTRYPOINT mvn -q -f test/javaTests/pom.xml test +ENTRYPOINT mvn -f test/javaTests/pom.xml test From 32d5a766eeb8a41e52a1c0234bbf2c5757f64b4b Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Fri, 13 Dec 2024 09:03:42 -0800 Subject: [PATCH 040/198] test: run scm06 in the ns and uns flavors The kernel releases a test socket asynchronously, so the restore can fail if it is executed before the kernel actually destroys the socket. Fixes #2537 Signed-off-by: Andrei Vagin --- test/zdtm/static/scm06.desc | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/test/zdtm/static/scm06.desc b/test/zdtm/static/scm06.desc index 2eac7e654b..38cc3be519 100644 --- a/test/zdtm/static/scm06.desc +++ b/test/zdtm/static/scm06.desc @@ -1 +1,4 @@ -{'flags': 'suid'} +# This test isn't executed in the host flavor (in the same network namespace), +# because the kernel releases a test socket asynchronously, so the restore +# can fail if it is executed before the kernel actually destroys the socket. +{'flags': 'suid', 'flavor': 'ns uns'} From f314ca5e1f330f0fbb4897117a5792547efa4e5c Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Mon, 16 Dec 2024 16:38:31 -0800 Subject: [PATCH 041/198] criu/seize.c: clang-format it Done using clang-format 19.1.5 with .clang-format obtained via scripts/fetch-clang-format.sh. Signed-off-by: Kir Kolyshkin --- criu/seize.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/criu/seize.c b/criu/seize.c index 9bd1832d9b..529fff5626 100644 --- a/criu/seize.c +++ b/criu/seize.c @@ -87,7 +87,10 @@ static const char frozen[] = "FROZEN"; static const char freezing[] = "FREEZING"; static const char thawed[] = "THAWED"; -enum freezer_state { FREEZER_ERROR = -1, THAWED, FROZEN, FREEZING }; +enum freezer_state { FREEZER_ERROR = -1, + THAWED, + FROZEN, + FREEZING }; /* Track if we are running on cgroup v2 system.
*/ static bool cgroup_v2 = false; From 9c3c095cfeef56731027e04369ed444a53d34363 Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Thu, 12 Dec 2024 17:29:34 -0800 Subject: [PATCH 042/198] freeze_processes: fix logic There are a few issues with the freeze_processes logic: 1. Commit 9fae23fbe2 grossly (by 1000x) miscalculated the number of attempts required, as a result, we are seeing something like this: > (00.000340) freezing processes: 100000 attempts with 100 ms steps > (00.000351) freezer.state=THAWED > (00.000358) freezer.state=FREEZING > (00.100446) freezer.state=FREEZING > ...close to 100 lines skipped... > (09.915110) freezer.state=FREEZING > (10.000432) Error (criu/cr-dump.c:1467): Timeout reached. Try to interrupt: 0 > (10.000563) freezer.state=FREEZING For 10s with 100ms steps we only need 100 attempts, not 100000. 2. When the timeout is hit, the "failed to freeze cgroup" error is not printed, and the log_unfrozen_stacks is not called either. 3. The nanosleep at the last iteration is useless (this was hidden by issue 1 above, as the timeout was hit first). Fix all these. While at it, 4. Amend the error message with the number of attempts, sleep duration, and timeout. 5. Modify the "freezing cgroup" debug message to be in sync with the above error. Was: > freezing processes: 100000 attempts with 100 ms steps Now: > freezing cgroup some/name: 100 x 100ms attempts, timeout: 10s Signed-off-by: Kir Kolyshkin --- criu/seize.c | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/criu/seize.c b/criu/seize.c index 529fff5626..6701446aec 100644 --- a/criu/seize.c +++ b/criu/seize.c @@ -545,7 +545,8 @@ static int freeze_processes(void) enum freezer_state state = THAWED; static const unsigned long step_ms = 100; - unsigned long nr_attempts = (opts.timeout * 1000000) / step_ms; + /* Since opts.timeout is in seconds, multiply it by 1000 to convert to milliseconds. 
*/ + unsigned long nr_attempts = (opts.timeout * 1000) / step_ms; unsigned long i = 0; const struct timespec req = { @@ -554,14 +555,12 @@ static int freeze_processes(void) }; if (unlikely(!nr_attempts)) { - /* - * If timeout is turned off, lets - * wait for at least 10 seconds. - */ - nr_attempts = (10 * 1000000) / step_ms; + /* If the timeout is 0, wait for at least 10 seconds. */ + nr_attempts = (10 * 1000) / step_ms; } - pr_debug("freezing processes: %lu attempts with %lu ms steps\n", nr_attempts, step_ms); + pr_debug("freezing cgroup %s: %lu x %lums attempts, timeout: %us\n", + opts.freeze_cgroup, nr_attempts, step_ms, opts.timeout); fd = freezer_open(); if (fd < 0) @@ -588,22 +587,22 @@ static int freeze_processes(void) * not read @tasks pids while freezer in * transition stage. */ - for (; i <= nr_attempts; i++) { + while (1) { state = get_freezer_state(fd); if (state == FREEZER_ERROR) { close(fd); return -1; } - if (state == FROZEN) + if (state == FROZEN || i++ == nr_attempts || alarm_timeouted()) break; - if (alarm_timeouted()) - goto err; + nanosleep(&req, NULL); } - if (i > nr_attempts) { - pr_err("Unable to freeze cgroup %s\n", opts.freeze_cgroup); + if (state != FROZEN) { + pr_err("Unable to freeze cgroup %s (%lu x %lums attempts, timeout: %us)\n", + opts.freeze_cgroup, i, step_ms, opts.timeout); if (!pr_quelled(LOG_DEBUG)) log_unfrozen_stacks(opts.freeze_cgroup); goto err; From 7c66617d0eb6c563faf31ae9a174e0463cb1bbb9 Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Thu, 12 Dec 2024 17:34:17 -0800 Subject: [PATCH 043/198] freeze_processes: implement kludges for cgroup v1 Cgroup v1 freezer has always been problematic, failing to freeze a cgroup. In runc, we have implemented a few kludges to increase the chance of succeeding, but those are used when runc freezes a cgroup for its own purposes (for "runc pause" and to modify device properties for cgroup v1). When criu is used, it fails to freeze a cgroup from time to time (see [1], [2]). 
Let's try adding kludges similar to ones in runc. Alas, I have absolutely no way to test this, so please review carefully. [1]: https://github.com/opencontainers/runc/issues/4273 [2]: https://github.com/opencontainers/runc/issues/4457 Signed-off-by: Kir Kolyshkin --- criu/seize.c | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/criu/seize.c b/criu/seize.c index 6701446aec..829d7c2783 100644 --- a/criu/seize.c +++ b/criu/seize.c @@ -539,6 +539,34 @@ static int prepare_freezer_for_interrupt_only_mode(void) return exit_code; } +static void cgroupv1_freezer_kludges(int fd, int iter, const struct timespec *req) { + /* As per older kernel docs (freezer-subsystem.txt before + * the kernel commit ef9fe980c6fcc1821), if FREEZING is seen, + * userspace should either retry or thaw. While current + * kernel cgroup v1 docs no longer mention a need to retry, + * even recent kernels can't reliably freeze a cgroup v1. + * + * Let's keep asking the kernel to freeze from time to time. + * In addition, do occasional thaw/sleep/freeze. + * + * This is still a game of chances (the real fix belongs to the kernel) + * but these kludges might improve the probability of success. + * + * Cgroup v2 does not have this problem. 
+ */ + switch (iter % 32) { + case 9: + case 20: + freezer_write_state(fd, FROZEN); + break; + case 31: + freezer_write_state(fd, THAWED); + nanosleep(req, NULL); + freezer_write_state(fd, FROZEN); + break; + } +} + static int freeze_processes(void) { int fd, exit_code = -1; @@ -597,6 +625,9 @@ static int freeze_processes(void) if (state == FROZEN || i++ == nr_attempts || alarm_timeouted()) break; + if (!cgroup_v2) + cgroupv1_freezer_kludges(fd, i, &req); + nanosleep(&req, NULL); } From 6991ea1ff94faa562eb1b3594027737d0fc95b40 Mon Sep 17 00:00:00 2001 From: Liu Chao Date: Thu, 19 Dec 2024 08:16:36 +0000 Subject: [PATCH 044/198] cr: Task CapAmb support Signed-off-by: Liu Chao --- criu/cr-restore.c | 2 ++ criu/include/parasite.h | 1 + criu/include/prctl.h | 9 +++++++++ criu/include/proc_parse.h | 1 + criu/include/restorer.h | 1 + criu/parasite-syscall.c | 3 +++ criu/pie/parasite.c | 13 +++++++++++++ criu/pie/restorer.c | 16 ++++++++++++++++ criu/proc_parse.c | 11 +++++++++-- criu/pstree.c | 3 +++ images/creds.proto | 2 ++ 11 files changed, 60 insertions(+), 2 deletions(-) diff --git a/criu/cr-restore.c b/criu/cr-restore.c index 646300bdb8..ddca6b8ece 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -2992,6 +2992,7 @@ static struct thread_creds_args *rst_prep_creds_args(CredsEntry *ce, unsigned lo args->creds.cap_eff = NULL; args->creds.cap_prm = NULL; args->creds.cap_bnd = NULL; + args->creds.cap_amb = NULL; args->creds.groups = NULL; args->creds.lsm_profile = NULL; @@ -2999,6 +3000,7 @@ static struct thread_creds_args *rst_prep_creds_args(CredsEntry *ce, unsigned lo copy_caps(args->cap_eff, ce->cap_eff, ce->n_cap_eff); copy_caps(args->cap_prm, ce->cap_prm, ce->n_cap_prm); copy_caps(args->cap_bnd, ce->cap_bnd, ce->n_cap_bnd); + copy_caps(args->cap_amb, ce->cap_amb, ce->n_cap_amb); if (ce->n_groups && !groups_match(ce->groups, ce->n_groups)) { unsigned int *groups; diff --git a/criu/include/parasite.h b/criu/include/parasite.h index 1244220f67..b33d6710f8 
100644 --- a/criu/include/parasite.h +++ b/criu/include/parasite.h @@ -148,6 +148,7 @@ struct parasite_dump_creds { u32 cap_prm[CR_CAP_SIZE]; u32 cap_eff[CR_CAP_SIZE]; u32 cap_bnd[CR_CAP_SIZE]; + u32 cap_amb[CR_CAP_SIZE]; int uids[4]; int gids[4]; diff --git a/criu/include/prctl.h b/criu/include/prctl.h index 4c2a548b16..f5f23c9692 100644 --- a/criu/include/prctl.h +++ b/criu/include/prctl.h @@ -36,6 +36,15 @@ #ifndef PR_SET_NO_NEW_PRIVS #define PR_SET_NO_NEW_PRIVS 38 #endif +#ifndef PR_CAP_AMBIENT +#define PR_CAP_AMBIENT 47 +#endif +#ifndef PR_CAP_AMBIENT_IS_SET +#define PR_CAP_AMBIENT_IS_SET 1 +#endif +#ifndef PR_CAP_AMBIENT_RAISE +#define PR_CAP_AMBIENT_RAISE 2 +#endif #ifndef PR_SET_MM #define PR_SET_MM 35 diff --git a/criu/include/proc_parse.h b/criu/include/proc_parse.h index 0c334a190d..0bd79bf553 100644 --- a/criu/include/proc_parse.h +++ b/criu/include/proc_parse.h @@ -81,6 +81,7 @@ struct proc_status_creds { u32 cap_prm[PROC_CAP_SIZE]; u32 cap_eff[PROC_CAP_SIZE]; u32 cap_bnd[PROC_CAP_SIZE]; + u32 cap_amb[PROC_CAP_SIZE]; }; #define INVALID_UID ((uid_t)-1) diff --git a/criu/include/restorer.h b/criu/include/restorer.h index 3fb5322a4b..a4fb7ea794 100644 --- a/criu/include/restorer.h +++ b/criu/include/restorer.h @@ -75,6 +75,7 @@ struct thread_creds_args { u32 cap_prm[CR_CAP_SIZE]; u32 cap_eff[CR_CAP_SIZE]; u32 cap_bnd[CR_CAP_SIZE]; + u32 cap_amb[CR_CAP_SIZE]; char *lsm_profile; unsigned int *groups; diff --git a/criu/parasite-syscall.c b/criu/parasite-syscall.c index a88f8a66f2..6db9d21fee 100644 --- a/criu/parasite-syscall.c +++ b/criu/parasite-syscall.c @@ -103,16 +103,19 @@ static int alloc_groups_copy_creds(CredsEntry *ce, struct parasite_dump_creds *c BUILD_BUG_ON(sizeof(ce->cap_prm[0]) != sizeof(c->cap_prm[0])); BUILD_BUG_ON(sizeof(ce->cap_eff[0]) != sizeof(c->cap_eff[0])); BUILD_BUG_ON(sizeof(ce->cap_bnd[0]) != sizeof(c->cap_bnd[0])); + BUILD_BUG_ON(sizeof(ce->cap_amb[0]) != sizeof(c->cap_amb[0])); BUG_ON(ce->n_cap_inh != CR_CAP_SIZE); 
BUG_ON(ce->n_cap_prm != CR_CAP_SIZE); BUG_ON(ce->n_cap_eff != CR_CAP_SIZE); BUG_ON(ce->n_cap_bnd != CR_CAP_SIZE); + BUG_ON(ce->n_cap_amb != CR_CAP_SIZE); memcpy(ce->cap_inh, c->cap_inh, sizeof(c->cap_inh[0]) * CR_CAP_SIZE); memcpy(ce->cap_prm, c->cap_prm, sizeof(c->cap_prm[0]) * CR_CAP_SIZE); memcpy(ce->cap_eff, c->cap_eff, sizeof(c->cap_eff[0]) * CR_CAP_SIZE); memcpy(ce->cap_bnd, c->cap_bnd, sizeof(c->cap_bnd[0]) * CR_CAP_SIZE); + memcpy(ce->cap_amb, c->cap_amb, sizeof(c->cap_amb[0]) * CR_CAP_SIZE); if (c->no_new_privs > 0) { ce->no_new_privs = c->no_new_privs; diff --git a/criu/pie/parasite.c b/criu/pie/parasite.c index e151ed6563..1bc03dc2a0 100644 --- a/criu/pie/parasite.c +++ b/criu/pie/parasite.c @@ -324,6 +324,7 @@ static int dump_creds(struct parasite_dump_creds *args) args->cap_prm[i] = data[i].prm; args->cap_inh[i] = data[i].inh; args->cap_bnd[i] = 0; + args->cap_amb[i] = 0; for (j = 0; j < 32; j++) { if (j + i * 32 > args->cap_last_cap) @@ -336,6 +337,18 @@ static int dump_creds(struct parasite_dump_creds *args) if (ret) args->cap_bnd[i] |= (1 << j); } + + for (j = 0; j < 32; j++) { + if (j + i * 32 > args->cap_last_cap) + break; + ret = sys_prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_IS_SET, j + i * 32, 0, 0); + if (ret < 0) { + pr_err("Unable to read ambient capability %d: %d\n", j + i * 32, ret); + return -1; + } + if (ret) + args->cap_amb[i] |= (1 << j); + } } args->no_new_privs = sys_prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0); diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c index 51ed6ed4c8..0a6a7977c9 100644 --- a/criu/pie/restorer.c +++ b/criu/pie/restorer.c @@ -347,6 +347,22 @@ static int restore_creds(struct thread_creds_args *args, int procfd, int lsm_typ return -1; } + for (b = 0; b < CR_CAP_SIZE; b++) { + for (i = 0; i < 32; i++) { + if (b * 32 + i > args->cap_last_cap) + break; + if ((args->cap_amb[b] & (1 << i)) == 0) + /* don't set */ + continue; + ret = sys_prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, i + b * 32, 0, 0); + if (!ret) + continue; 
+ pr_err("Unable to raise ambient capability %d: %d\n", i + b * 32, ret); + return -1; + } + } + + if (lsm_type != LSMTYPE__SELINUX) { /* * SELinux does not support setting the process context for diff --git a/criu/proc_parse.c b/criu/proc_parse.c index 6c4303e7dd..0fa9b7ba56 100644 --- a/criu/proc_parse.c +++ b/criu/proc_parse.c @@ -1071,7 +1071,7 @@ int parse_pid_status(pid_t pid, struct seize_task_status *ss, void *data) if (bfdopenr(&f)) return -1; - while (done < 13) { + while (done < 14) { str = breadline(&f); if (str == NULL) break; @@ -1155,6 +1155,13 @@ int parse_pid_status(pid_t pid, struct seize_task_status *ss, void *data) continue; } + if (!strncmp(str, "CapAmb:", 7)) { + if (cap_parse(str + 8, cr->cap_amb)) + goto err_parse; + done++; + continue; + } + if (!strncmp(str, "Seccomp:", 8)) { if (sscanf(str + 9, "%d", &cr->s.seccomp_mode) != 1) { goto err_parse; @@ -1198,7 +1205,7 @@ int parse_pid_status(pid_t pid, struct seize_task_status *ss, void *data) } /* seccomp and nspids are optional */ - expected_done = (parsed_seccomp ? 12 : 11); + expected_done = (parsed_seccomp ? 13 : 12); if (kdat.has_nspid) expected_done++; if (done == expected_done) diff --git a/criu/pstree.c b/criu/pstree.c index 8c44e71343..41df846eda 100644 --- a/criu/pstree.c +++ b/criu/pstree.c @@ -63,6 +63,7 @@ CoreEntry *core_entry_alloc(int th, int tsk) sz += CR_CAP_SIZE * sizeof(ce->cap_prm[0]); sz += CR_CAP_SIZE * sizeof(ce->cap_eff[0]); sz += CR_CAP_SIZE * sizeof(ce->cap_bnd[0]); + sz += CR_CAP_SIZE * sizeof(ce->cap_amb[0]); /* * @groups are dynamic and allocated * on demand. 
@@ -122,10 +123,12 @@ CoreEntry *core_entry_alloc(int th, int tsk) ce->n_cap_prm = CR_CAP_SIZE; ce->n_cap_eff = CR_CAP_SIZE; ce->n_cap_bnd = CR_CAP_SIZE; + ce->n_cap_amb = CR_CAP_SIZE; ce->cap_inh = xptr_pull_s(&m, CR_CAP_SIZE * sizeof(ce->cap_inh[0])); ce->cap_prm = xptr_pull_s(&m, CR_CAP_SIZE * sizeof(ce->cap_prm[0])); ce->cap_eff = xptr_pull_s(&m, CR_CAP_SIZE * sizeof(ce->cap_eff[0])); ce->cap_bnd = xptr_pull_s(&m, CR_CAP_SIZE * sizeof(ce->cap_bnd[0])); + ce->cap_amb = xptr_pull_s(&m, CR_CAP_SIZE * sizeof(ce->cap_amb[0])); if (arch_alloc_thread_info(core)) { xfree(core); diff --git a/images/creds.proto b/images/creds.proto index 220ed38587..932a40ccff 100644 --- a/images/creds.proto +++ b/images/creds.proto @@ -25,4 +25,6 @@ message creds_entry { optional string lsm_sockcreate = 16; optional bytes apparmor_data = 17; optional uint32 no_new_privs = 18; + + repeated uint32 cap_amb = 19; } From d4d393701772e5b79afc1d77d98f223fec29036e Mon Sep 17 00:00:00 2001 From: Liu Chao Date: Fri, 3 Jan 2025 03:33:27 +0000 Subject: [PATCH 045/198] zdtm: Check CapAmb is restored correctly after C/R This test sets CapAmb according to CapPrm and CapInh and check CapAmb after C/R. 
Signed-off-by: Liu Chao --- test/zdtm/static/Makefile | 1 + test/zdtm/static/caps01.c | 168 +++++++++++++++++++++++++++++++++++ test/zdtm/static/caps01.desc | 1 + 3 files changed, 170 insertions(+) create mode 100644 test/zdtm/static/caps01.c create mode 100644 test/zdtm/static/caps01.desc diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index 71a1b6a535..78f96430e8 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -8,6 +8,7 @@ TST_NOFILE := \ sleeping00 \ pid00 \ caps00 \ + caps01 \ wait00 \ zombie00 \ zombie01 \ diff --git a/test/zdtm/static/caps01.c b/test/zdtm/static/caps01.c new file mode 100644 index 0000000000..0f8a7101e2 --- /dev/null +++ b/test/zdtm/static/caps01.c @@ -0,0 +1,168 @@ +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check that CapAmb are preserved"; +const char *test_author = "Liu Chao "; + +struct cap_hdr { + unsigned int version; + int pid; +}; + +struct cap_data { + unsigned int eff; + unsigned int prm; + unsigned int inh; +}; + +#define _LINUX_CAPABILITY_VERSION_3 0x20080522 +#define _LINUX_CAPABILITY_U32S_3 2 +#define CAP_DAC_OVERRIDE 1 +#define PR_CAP_AMBIENT 47 +#define PR_CAP_AMBIENT_IS_SET 1 +#define PR_CAP_AMBIENT_RAISE 2 +#define PR_CAP_AMBIENT_LOWER 3 + +int capget(struct cap_hdr *hdrp, struct cap_data *datap); +int capset(struct cap_hdr *hdrp, const struct cap_data *datap); + +static int cap_last_cap = 63; + +int main(int argc, char **argv) +{ + task_waiter_t t; + int pid, result_pipe[2]; + unsigned int amb[_LINUX_CAPABILITY_U32S_3]; + unsigned int amb_2[_LINUX_CAPABILITY_U32S_3]; + char res = 'x'; + FILE *f; + + test_init(argc, argv); + task_waiter_init(&t); + + f = fopen("/proc/sys/kernel/cap_last_cap", "r"); + if (f) { + if (fscanf(f, "%d", &cap_last_cap) != 1) { + pr_perror("Unable to read cal_last_cap"); + fclose(f); + return 1; + } + fclose(f); + } else + test_msg("/proc/sys/kernel/cap_last_cap is not available\n"); + + if 
(pipe(result_pipe)) { + pr_perror("Can't create pipe"); + return 1; + } + + pid = test_fork(); + if (pid == 0) { + int b, i, ret; + struct cap_hdr hdr; + struct cap_data data[_LINUX_CAPABILITY_U32S_3]; + + hdr.version = _LINUX_CAPABILITY_VERSION_3; + hdr.pid = 0; + + if (capget(&hdr, data) < 0) { + pr_perror("capget"); + return -1; + } + + hdr.version = _LINUX_CAPABILITY_VERSION_3; + hdr.pid = 0; + + data[0].eff &= ~((1 << CAP_CHOWN) | (1 << CAP_DAC_OVERRIDE)); + data[0].prm &= ~(1 << CAP_DAC_OVERRIDE); + data[0].inh = data[0].prm; + data[1].inh = data[1].prm; + + if (capset(&hdr, data) < 0) { + pr_perror("capset"); + return -1; + } + + for (b = 0; b < _LINUX_CAPABILITY_U32S_3; b++) { + amb[b] = data[b].prm; + for (i = 0; i < 32; i++) { + if (b * 32 + i > cap_last_cap) + break; + if ((amb[b] & (1 << i)) > 0) + ret = prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, i + b * 32, 0, 0); + else + ret = prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_LOWER, i + b * 32, 0, 0); + if (ret) { + pr_perror("Unable to set ambient capability %d to %d: %d", i + b * 32, amb[b] & (1 << i), ret); + return -1; + } + } + } + + task_waiter_complete_current(&t); + task_waiter_wait4(&t, getppid()); + + for (b = 0; b < _LINUX_CAPABILITY_U32S_3; b++) { + amb_2[b] = 0; + for (i = 0; i < 32; i++) { + if (b * 32 + i > cap_last_cap) + break; + ret = prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_IS_SET, i + b * 32, 0, 0); + if (ret < 0) { + pr_perror("Unable to read ambient capability %d: %d", i + b * 32, ret); + goto bad; + } + + amb_2[b] |= (ret << i); + } + } + + for (b = 0; b < _LINUX_CAPABILITY_U32S_3; b++) { + if (amb[b] != amb_2[b]) { + res = '1'; + goto bad; + } + } + + res = '0'; + bad: + write(result_pipe[1], &res, 1); + + if (res != '0') { + write(result_pipe[1], amb, sizeof(amb)); + write(result_pipe[1], amb_2, sizeof(amb_2)); + } + + close(result_pipe[0]); + close(result_pipe[1]); + _exit(0); + } + + task_waiter_wait4(&t, pid); + + test_daemon(); + test_waitsig(); + + task_waiter_complete_current(&t); + + 
read(result_pipe[0], &res, 1); + + if (res == '0') + pass(); + else { + read(result_pipe[0], amb, sizeof(amb)); + read(result_pipe[0], amb_2, sizeof(amb_2)); + test_msg("amb[]=%08x, %08x\n", amb[0], amb[1]); + test_msg("amb[]=%08x, %08x\n", amb_2[0], amb_2[1]); + fail("Fail: %c", res); + } + close(result_pipe[0]); + close(result_pipe[1]); + + return 0; +} diff --git a/test/zdtm/static/caps01.desc b/test/zdtm/static/caps01.desc new file mode 100644 index 0000000000..2eac7e654b --- /dev/null +++ b/test/zdtm/static/caps01.desc @@ -0,0 +1 @@ +{'flags': 'suid'} From ea2ddb886a4cb285a41e4df2b6cc309460748543 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Thu, 16 Jan 2025 07:52:42 +0000 Subject: [PATCH 046/198] util: added cleanup_file attribute. Signed-off-by: Adrian Reber --- criu/include/util.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/criu/include/util.h b/criu/include/util.h index ae293a68c8..4793f7f20e 100644 --- a/criu/include/util.h +++ b/criu/include/util.h @@ -406,6 +406,14 @@ static inline void cleanup_freep(void *p) free(*pp); } +#define cleanup_file __attribute__((cleanup(cleanup_filep))) +static inline void cleanup_filep(FILE **f) +{ + FILE *file = *f; + if (file) + (void)fclose(file); +} + extern int run_command(char *buf, size_t buf_size, int (*child_fn)(void *), void *args); /* From 27a5b9aa87d6f54ff837a406d4ab6a20ab170302 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Tue, 17 Dec 2024 08:52:46 +0100 Subject: [PATCH 047/198] net: redirect nftables stdout and stderr to CRIU's log file When using the nftables network locking backend and restoring a process a second time the network locking has already been deleted by the first restore. The second restore will print out to the console text like: Error: Could not process rule: No such file or directory delete table inet CRIU-202621 With this change CRIU's log FD is used by libnftables stdout and stderr. 
Signed-off-by: Adrian Reber --- criu/net.c | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/criu/net.c b/criu/net.c index eee3311087..efd52db327 100644 --- a/criu/net.c +++ b/criu/net.c @@ -3066,9 +3066,43 @@ static int iptables_restore(bool ipv6, char *buf, int size) return ret; } +#if defined(CONFIG_HAS_NFTABLES_LIB_API_0) || defined(CONFIG_HAS_NFTABLES_LIB_API_1) +static inline FILE *redirect_nftables_output(struct nft_ctx *nft) +{ + FILE *fp; + int fd; + + fd = dup(log_get_fd()); + if (fd < 0) { + pr_perror("dup() to redirect nftables output failed"); + return NULL; + } + + fp = fdopen(fd, "w"); + if (!fp) { + pr_perror("fdopen() to redirect nftables output failed"); + return NULL; + } + + /** + * Without setvbuf() the output from libnftables will be + * somewhere in the log file, probably at the end. + * With setvbuf() potential output will be at the correct + * position. + */ + setvbuf(fp, NULL, _IONBF, 0); + + nft_ctx_set_output(nft, fp); + nft_ctx_set_error(nft, fp); + + return fp; +} +#endif + static inline int nftables_lock_network_internal(void) { #if defined(CONFIG_HAS_NFTABLES_LIB_API_0) || defined(CONFIG_HAS_NFTABLES_LIB_API_1) + cleanup_file FILE *fp = NULL; struct nft_ctx *nft; int ret = 0; char table[32]; @@ -3081,6 +3115,10 @@ static inline int nftables_lock_network_internal(void) if (!nft) return -1; + fp = redirect_nftables_output(nft); + if (!fp) + goto out; + snprintf(buf, sizeof(buf), "create table %s", table); if (NFT_RUN_CMD(nft, buf)) goto err2; @@ -3168,6 +3206,7 @@ static inline int nftables_network_unlock(void) { #if defined(CONFIG_HAS_NFTABLES_LIB_API_0) || defined(CONFIG_HAS_NFTABLES_LIB_API_1) int ret = 0; + cleanup_file FILE *fp = NULL; struct nft_ctx *nft; char table[32]; char buf[128]; @@ -3179,6 +3218,10 @@ static inline int nftables_network_unlock(void) if (!nft) return -1; + fp = redirect_nftables_output(nft); + if (!fp) + return -1; + snprintf(buf, sizeof(buf), "delete table %s", 
table); if (NFT_RUN_CMD(nft, buf)) ret = -1; From ca90d8e7ebf53bdbf4eb7c786b23c9f0e110e514 Mon Sep 17 00:00:00 2001 From: Yuanhong Peng Date: Thu, 19 Dec 2024 14:30:41 +0800 Subject: [PATCH 048/198] seize: Adjust the position of the log message Based on the code, the `ret` variable at this point does not represent the task state, so this log message should be moved to a position after the `compel_wait_task()` function. Signed-off-by: Yuanhong Peng --- criu/seize.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/criu/seize.c b/criu/seize.c index 829d7c2783..007e8e580d 100644 --- a/criu/seize.c +++ b/criu/seize.c @@ -707,8 +707,6 @@ static int collect_children(struct pstree_item *item) goto free; } - pr_info("Seized task %d, state %d\n", pid, ret); - c = alloc_pstree_item(); if (c == NULL) { ret = -1; @@ -746,6 +744,8 @@ static int collect_children(struct pstree_item *item) if (ret == TASK_STOPPED) c->pid->stop_signo = compel_parse_stop_signo(pid); + pr_info("Seized task %d, state %d\n", pid, ret); + c->pid->real = pid; c->parent = item; c->pid->state = ret; From 10ffad218889d57fa7a3687c6e222a30f3069aa0 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Tue, 21 Jan 2025 15:05:42 +0100 Subject: [PATCH 049/198] files-reg: fix buffer overflow on aarch64 Running the zdtm/static/unlink_regular00 test on Ubuntu 24.04 on aarch64 results in following error: # ./zdtm.py run -t zdtm/static/unlink_regular00 -k always userns is supported === Run 1/1 ================ zdtm/static/unlink_regular00 ==================== Run zdtm/static/unlink_regular00 in ns ==================== Skipping rtc at root Start test Test is SUID ./unlink_regular00 --pidfile=unlink_regular00.pid --outfile=unlink_regular00.out --dirname=unlink_regular00.test Run criu dump *** buffer overflow detected ***: terminated ############# Test zdtm/static/unlink_regular00 FAIL at CRIU dump ############## Test output: ================================ <<< ================================ Send the 9 
signal to 47 Wait for zdtm/static/unlink_regular00(47) to die for 0.100000 ##################################### FAIL ##################################### According to the backtrace: #0 __pthread_kill_implementation (threadid=281473158467616, signo=signo@entry=6, no_tid=no_tid@entry=0) at ./nptl/pthread_kill.c:44 #1 0x0000ffff93477690 in __pthread_kill_internal (signo=6, threadid=) at ./nptl/pthread_kill.c:78 #2 0x0000ffff9342cb3c in __GI_raise (sig=sig@entry=6) at ../sysdeps/posix/raise.c:26 #3 0x0000ffff93417e00 in __GI_abort () at ./stdlib/abort.c:79 #4 0x0000ffff9346abf0 in __libc_message_impl (fmt=fmt@entry=0xffff93552a78 "*** %s ***: terminated\n") at ../sysdeps/posix/libc_fatal.c:132 #5 0x0000ffff934e81a8 in __GI___fortify_fail (msg=msg@entry=0xffff93552a28 "buffer overflow detected") at ./debug/fortify_fail.c:24 #6 0x0000ffff934e79e4 in __GI___chk_fail () at ./debug/chk_fail.c:28 #7 0x0000ffff934e9070 in ___snprintf_chk (s=s@entry=0xffffc6ed04a3 "testfile", maxlen=maxlen@entry=4056, flag=flag@entry=2, slen=slen@entry=4053, format=format@entry=0xaaaacffe3888 "link_remap.%d") at ./debug/snprintf_chk.c:29 #8 0x0000aaaacff4b8b8 in snprintf (__fmt=0xaaaacffe3888 "link_remap.%d", __n=4056, __s=0xffffc6ed04a3 "testfile") at /usr/include/aarch64-linux-gnu/bits/stdio2.h:54 #9 create_link_remap (path=path@entry=0xffffc6ed2901 "/zdtm/static/unlink_regular00.test/subdir/testfile", len=len@entry=60, lfd=lfd@entry=20, idp=idp@entry=0xffffc6ed14ec, nsid=nsid@entry=0xaaaada2bac00, parms=parms@entry=0xffffc6ed2808, fallback=0xaaaacff4c6c0 , fallback@entry=0xffffc6ed2797) at criu/files-reg.c:1164 #10 0x0000aaaacff4c6c0 in dump_linked_remap (path=path@entry=0xffffc6ed2901 "/zdtm/static/unlink_regular00.test/subdir/testfile", len=len@entry=60, parms=parms@entry=0xffffc6ed2808, lfd=lfd@entry=20, id=id@entry=12, nsid=nsid@entry=0xaaaada2bac00, fallback=fallback@entry=0xffffc6ed2797) at criu/files-reg.c:1198 #11 0x0000aaaacff4d8b0 in check_path_remap (nsid=0xaaaada2bac00, id=12, 
lfd=20, parms=0xffffc6ed2808, link=) at criu/files-reg.c:1426 #12 dump_one_reg_file (lfd=20, id=12, p=0xffffc6ed2808) at criu/files-reg.c:1827 #13 0x0000aaaacff51078 in dump_one_file (pid=, fd=4, lfd=20, opts=opts@entry=0xaaaada2ba2c0, ctl=ctl@entry=0xaaaada2c4d50, e=e@entry=0xffffc6ed39c8, dfds=dfds@entry=0xaaaada2c3d40) at criu/files.c:581 #14 0x0000aaaacff5176c in dump_task_files_seized (ctl=ctl@entry=0xaaaada2c4d50, item=item@entry=0xaaaada2b8f80, dfds=dfds@entry=0xaaaada2c3d40) at criu/files.c:657 #15 0x0000aaaacff3d3c0 in dump_one_task (parent_ie=0x0, item=0xaaaada2b8f80) at criu/cr-dump.c:1679 #16 cr_dump_tasks (pid=) at criu/cr-dump.c:2224 #17 0x0000aaaacff163a0 in main (argc=, argv=0xffffc6ed40e8, envp=) at criu/crtools.c:293 This line is the problem: snprintf(tmp + 1, sizeof(link_name) - (size_t)(tmp - link_name - 1), "link_remap.%d", rfe.id); The problem was that the `-1` was on the inside of the parentheses and not on the outside. This way the destination size was increased by 1 instead of being decreased by 1 which triggered the buffer overflow detection. Signed-off-by: Adrian Reber --- criu/files-reg.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/criu/files-reg.c b/criu/files-reg.c index fc61493501..66c0e6cda7 100644 --- a/criu/files-reg.c +++ b/criu/files-reg.c @@ -1150,7 +1150,7 @@ static int create_link_remap(char *path, int len, int lfd, u32 *idp, struct ns_i rfe.name = link_name + 1; /* Any 'unique' name works here actually. Remap works by reg-file ids. 
*/ - snprintf(tmp + 1, sizeof(link_name) - (size_t)(tmp - link_name - 1), "link_remap.%d", rfe.id); + snprintf(tmp + 1, sizeof(link_name) - (size_t)(tmp - link_name) - 1, "link_remap.%d", rfe.id); mntns_root = mntns_get_root_fd(nsid); From 637682d8aa6f4d13423b4f635569f7f242db3646 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Mon, 20 Jan 2025 19:00:16 +0100 Subject: [PATCH 050/198] test: fix cmdlinenv00 on aarch64 On aarch64 the test cmdlinenv00 was failing with: FAIL: cmdlinenv00.c:120: auxv corrupted on restore (errno = 11 (Resource temporarily unavailable)) Starting with Linux kernel version 6.3 the size of AUXV was changed: commit 28c8e088427ad30b4260953f3b6f908972b77c2d Author: Mathieu Desnoyers Date: Wed Jan 4 14:20:54 2023 -0500 rseq: Increase AT_VECTOR_SIZE_BASE to match rseq auxvec entries Two new auxiliary vector entries are introduced for rseq without matching increment of the AT_VECTOR_SIZE_BASE, which causes failures with CONFIG_HARDENED_USERCOPY=y. Fixes: 317c8194e6ae ("rseq: Introduce feature size and alignment ELF auxiliary vector entries") With this change AT_VECTOR_SIZE increases from 40 to 50 on aarch64. CRIU uses AT_VECTOR_SIZE to read the content of /proc/PID/auxv auxv_t mm_saved_auxv[AT_VECTOR_SIZE]; ret = read(fd, mm_saved_auxv, sizeof(mm_saved_auxv)); Now the test works again on aarch64. 
Signed-off-by: Adrian Reber --- criu/arch/aarch64/include/asm/types.h | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/criu/arch/aarch64/include/asm/types.h b/criu/arch/aarch64/include/asm/types.h index 363c1cae28..db118cafd2 100644 --- a/criu/arch/aarch64/include/asm/types.h +++ b/criu/arch/aarch64/include/asm/types.h @@ -33,7 +33,16 @@ static inline uint64_t encode_pointer(void *p) return (uint64_t)p; } -#define AT_VECTOR_SIZE 40 +/** + * See also: + * * arch/arm64/include/uapi/asm/auxvec.h + * * include/linux/auxvec.h + * * include/linux/mm_types.h + */ +#define AT_VECTOR_SIZE_BASE 22 +#define AT_VECTOR_SIZE_ARCH 2 +#define AT_VECTOR_SIZE (2*(AT_VECTOR_SIZE_ARCH + AT_VECTOR_SIZE_BASE + 1)) + typedef uint64_t auxv_t; typedef uint64_t tls_t; From 5eee7a6ee27f272880b48f16a3c92f345615f454 Mon Sep 17 00:00:00 2001 From: Austin Kuo <104871462+hckuo@users.noreply.github.com> Date: Tue, 7 Jan 2025 04:31:05 +0000 Subject: [PATCH 051/198] timer: Refine itimer_armed logic and improve timer value handling Right now, CRIU skips non-periodic timers. This change addresses this issue. Signed-off-by: Austin Kuo --- criu/pie/restorer.c | 2 +- criu/timer.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c index 0a6a7977c9..6d048c3f1d 100644 --- a/criu/pie/restorer.c +++ b/criu/pie/restorer.c @@ -2226,7 +2226,7 @@ __visible long __export_restore_task(struct task_restore_args *args) * code below doesn't fail due to bad timing values. 
*/ -#define itimer_armed(args, i) (args->itimers[i].it_interval.tv_sec || args->itimers[i].it_interval.tv_usec) +#define itimer_armed(args, i) (args->itimers[i].it_value.tv_sec || args->itimers[i].it_value.tv_usec) if (itimer_armed(args, 0)) sys_setitimer(ITIMER_REAL, &args->itimers[0], NULL); diff --git a/criu/timer.c b/criu/timer.c index e94cf0280d..0413e2a720 100644 --- a/criu/timer.c +++ b/criu/timer.c @@ -16,7 +16,7 @@ static inline int timeval_valid(struct timeval *tv) static inline int decode_itimer(char *n, ItimerEntry *ie, struct itimerval *val) { - if (ie->isec == 0 && ie->iusec == 0) { + if (ie->isec == 0 && ie->iusec == 0 && ie->vsec == 0 && ie->vusec == 0) { memzero_p(val); return 0; } From b7cbd2ca92fa88eb3b63164dd09a032fe4d9f0f0 Mon Sep 17 00:00:00 2001 From: Austin Kuo Date: Tue, 21 Jan 2025 12:04:33 -0800 Subject: [PATCH 052/198] test/zdtm: add a new test to check non-periodic timers It creates a few timers with long expiration intervals, waits for C/R and checks that timers are armed and their intervals have been restored. 
Signed-off-by: Austin Kuo --- test/zdtm/static/Makefile | 1 + test/zdtm/static/timers01.c | 74 +++++++++++++++++++++++++++++++++++++ 2 files changed, 75 insertions(+) create mode 100644 test/zdtm/static/timers01.c diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index 78f96430e8..f72fb2a77f 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -24,6 +24,7 @@ TST_NOFILE := \ sse20 \ mprotect00 \ timers \ + timers01 \ timerfd \ unbound_sock \ sched_prio00 \ diff --git a/test/zdtm/static/timers01.c b/test/zdtm/static/timers01.c new file mode 100644 index 0000000000..10ecc34815 --- /dev/null +++ b/test/zdtm/static/timers01.c @@ -0,0 +1,74 @@ +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Checks non-periodic timers\n"; +const char *test_author = "Andrei Vagin "; + +static struct { + const int timer_type; + const int signal; + volatile sig_atomic_t count; +} timer_tests[] = { + /* from slowest to fastest */ + { ITIMER_VIRTUAL, SIGVTALRM }, + { ITIMER_PROF, SIGPROF }, + { ITIMER_REAL, SIGALRM }, +}; + +#define NUM_TIMERS (sizeof(timer_tests) / sizeof(timer_tests[0])) +#define TIMER_TIMEOUT 3600 +#define TIMER_ALLOWED_DELTA 300 + +static void setup_timers(void) +{ + int i; + struct itimerval tv = { + .it_interval = { .tv_sec = 0, .tv_usec = 0 }, + .it_value = { .tv_sec = TIMER_TIMEOUT, .tv_usec = 0 }, + }; + + for (i = 0; i < NUM_TIMERS; i++) { + if (setitimer(timer_tests[i].timer_type, &tv, NULL) < 0) { + pr_perror("can't set timer %d", i); + exit(1); + } + } +} + +static void check_timers(void) +{ + int i; + + for (i = 0; i < NUM_TIMERS; i++) { + struct itimerval tv = {}; + + if (getitimer(timer_tests[i].timer_type, &tv)) { + pr_perror("gettimer"); + exit(1); + } + if (tv.it_value.tv_sec > TIMER_TIMEOUT || + tv.it_value.tv_sec < TIMER_TIMEOUT - TIMER_ALLOWED_DELTA) { + fail("%ld isn't in [%d, %d]", (long)tv.it_value.tv_sec, + TIMER_TIMEOUT, + TIMER_TIMEOUT - TIMER_ALLOWED_DELTA); + exit(1); + } + } + 
pass(); +} + +int main(int argc, char **argv) +{ + test_init(argc, argv); + + setup_timers(); + + test_daemon(); + test_waitsig(); + + check_timers(); + return 0; +} From b3869c91729ba630e552c2380e1e8ec56cf7f8c1 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Fri, 24 Jan 2025 09:27:16 +0100 Subject: [PATCH 053/198] ci: two check-commits.yml changes * Switch to v4 actions/checkout (from v3) * Use our apt wrapper to gracefully handle temporary repository errors Signed-off-by: Adrian Reber --- .github/workflows/check-commits.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/check-commits.yml b/.github/workflows/check-commits.yml index be2fbd2856..94861ab528 100644 --- a/.github/workflows/check-commits.yml +++ b/.github/workflows/check-commits.yml @@ -12,14 +12,14 @@ jobs: # Check if pull request does not have label "not-selfcontained-ok" if: "!contains(github.event.pull_request.labels.*.name, 'not-selfcontained-ok')" steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: # Needed to rebase against the base branch fetch-depth: 0 # Checkout pull request HEAD commit instead of merge commit ref: ${{ github.event.pull_request.head.sha }} - name: Install dependencies - run: sudo apt-get install -y libprotobuf-dev libprotobuf-c-dev protobuf-c-compiler protobuf-compiler python3-protobuf libnl-3-dev libnet-dev libcap-dev + run: sudo scripts/ci/apt-install libprotobuf-dev libprotobuf-c-dev protobuf-c-compiler protobuf-compiler python3-protobuf libnl-3-dev libnet-dev libcap-dev - name: Configure git user details run: | git config --global user.email "checkpoint-restore@users.noreply.github.com" From d165b94bb51a1ee34c7101898a8f99409366272f Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Thu, 23 Jan 2025 09:26:15 +0000 Subject: [PATCH 054/198] criu: use libuuid for criu_run_id generation criu_run_id will be used in upcoming changes to create and remove network rules for network locking. 
Instead of trying to come up with a way to create unique IDs, just use an existing library. libuuid should be installed on most systems as it is indirectly required by systemd (via libmount). Signed-off-by: Adrian Reber --- .cirrus.yml | 2 +- .github/workflows/check-commits.yml | 2 +- compel/include/uapi/infect-util.h | 11 ++++++++++- compel/src/lib/infect-util.c | 2 +- compel/src/lib/infect.c | 2 +- criu/Makefile.packages | 4 +++- criu/fdstore.c | 2 +- criu/files.c | 2 +- criu/include/util.h | 4 +++- criu/pidfd-store.c | 2 +- criu/unittest/mock.c | 4 +++- criu/util.c | 17 +++++++---------- scripts/build/Dockerfile.alpine | 3 ++- scripts/build/Dockerfile.amd-rocm | 1 + scripts/build/Dockerfile.archlinux | 1 + scripts/build/Dockerfile.hotspot-alpine | 1 + scripts/build/Dockerfile.hotspot-ubuntu | 1 + scripts/build/Dockerfile.linux32.tmpl | 1 + scripts/build/Dockerfile.openj9-ubuntu | 1 + .../build/Dockerfile.riscv64-stable-cross.tmpl | 1 + scripts/build/Dockerfile.stable-cross.tmpl | 1 + scripts/build/Dockerfile.tmpl | 1 + scripts/build/Dockerfile.unstable-cross.tmpl | 1 + scripts/ci/prepare-for-fedora-rawhide.sh | 1 + scripts/ci/run-ci-tests.sh | 2 +- scripts/ci/vagrant.sh | 2 +- 26 files changed, 48 insertions(+), 24 deletions(-) diff --git a/.cirrus.yml b/.cirrus.yml index 5e30ca2c2b..a4b53a54b0 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -36,7 +36,7 @@ task: ln -sf /usr/include/google/protobuf/descriptor.proto images/google/protobuf/descriptor.proto dnf config-manager --set-enabled crb # Same as CentOS 8 powertools dnf -y install epel-release epel-next-release - dnf -y install --allowerasing asciidoc gcc git gnutls-devel libaio-devel libasan libcap-devel libnet-devel libnl3-devel libbsd-devel libselinux-devel make protobuf-c-devel protobuf-devel python-devel python-PyYAML python-protobuf python-junit_xml python3-importlib-metadata xmlto libdrm-devel + dnf -y install --allowerasing asciidoc gcc git gnutls-devel libaio-devel libasan libcap-devel libnet-devel 
libnl3-devel libbsd-devel libselinux-devel make protobuf-c-devel protobuf-devel python-devel python-PyYAML python-protobuf python-junit_xml python3-importlib-metadata xmlto libdrm-devel libuuid-devel # The image has a too old version of nettle which does not work with gnutls. # Just upgrade to the latest to make the error go away. dnf -y upgrade nettle nettle-devel diff --git a/.github/workflows/check-commits.yml b/.github/workflows/check-commits.yml index 94861ab528..354873909e 100644 --- a/.github/workflows/check-commits.yml +++ b/.github/workflows/check-commits.yml @@ -19,7 +19,7 @@ jobs: # Checkout pull request HEAD commit instead of merge commit ref: ${{ github.event.pull_request.head.sha }} - name: Install dependencies - run: sudo scripts/ci/apt-install libprotobuf-dev libprotobuf-c-dev protobuf-c-compiler protobuf-compiler python3-protobuf libnl-3-dev libnet-dev libcap-dev + run: sudo scripts/ci/apt-install libprotobuf-dev libprotobuf-c-dev protobuf-c-compiler protobuf-compiler python3-protobuf libnl-3-dev libnet-dev libcap-dev uuid-dev - name: Configure git user details run: | git config --global user.email "checkpoint-restore@users.noreply.github.com" diff --git a/compel/include/uapi/infect-util.h b/compel/include/uapi/infect-util.h index ace6f6b6b1..658df9393d 100644 --- a/compel/include/uapi/infect-util.h +++ b/compel/include/uapi/infect-util.h @@ -3,11 +3,20 @@ #include "common/compiler.h" +/** + * The length of the hash is based on what libuuid provides. + * According to the manpage this is: + * + * The uuid_unparse() function converts the supplied UUID uu from the binary + * representation into a 36-byte string (plus trailing '\0') + */ +#define RUN_ID_HASH_LENGTH 37 + /* * compel_run_id is a unique value of the current run. It can be used to * generate resource ID-s to avoid conflicts with other processes. 
*/ -extern uint64_t compel_run_id; +extern char compel_run_id[RUN_ID_HASH_LENGTH]; struct parasite_ctl; extern int __must_check compel_util_send_fd(struct parasite_ctl *ctl, int fd); diff --git a/compel/src/lib/infect-util.c b/compel/src/lib/infect-util.c index 00a7c83f7d..dc57e28f7c 100644 --- a/compel/src/lib/infect-util.c +++ b/compel/src/lib/infect-util.c @@ -7,7 +7,7 @@ #include "infect-rpc.h" #include "infect-util.h" -uint64_t compel_run_id; +char compel_run_id[RUN_ID_HASH_LENGTH]; int compel_util_send_fd(struct parasite_ctl *ctl, int fd) { diff --git a/compel/src/lib/infect.c b/compel/src/lib/infect.c index 1e3ffb9670..caf54e03fd 100644 --- a/compel/src/lib/infect.c +++ b/compel/src/lib/infect.c @@ -427,7 +427,7 @@ static int gen_parasite_saddr(struct sockaddr_un *saddr, int key) int sun_len; saddr->sun_family = AF_UNIX; - snprintf(saddr->sun_path, UNIX_PATH_MAX, "X/crtools-pr-%d-%" PRIx64, key, compel_run_id); + snprintf(saddr->sun_path, UNIX_PATH_MAX, "X/crtools-pr-%d-%s", key, compel_run_id); sun_len = SUN_LEN(saddr); *saddr->sun_path = '\0'; diff --git a/criu/Makefile.packages b/criu/Makefile.packages index 7f6113c8f1..3e2e6efd18 100644 --- a/criu/Makefile.packages +++ b/criu/Makefile.packages @@ -6,6 +6,7 @@ REQ-RPM-PKG-NAMES += protobuf-devel REQ-RPM-PKG-NAMES += protobuf-python REQ-RPM-PKG-NAMES += libnl3-devel REQ-RPM-PKG-NAMES += libcap-devel +REQ-RPM-PKG-NAMES += libuuid-devel REQ-RPM-PKG-TEST-NAMES += libaio-devel @@ -16,6 +17,7 @@ REQ-DEB-PKG-NAMES += protobuf-compiler REQ-DEB-PKG-NAMES += $(PYTHON)-protobuf REQ-DEB-PKG-NAMES += libnl-3-dev REQ-DEB-PKG-NAMES += libcap-dev +REQ-DEB-PKG-NAMES += uuid-dev REQ-DEB-PKG-TEST-NAMES += $(PYTHON)-yaml REQ-DEB-PKG-TEST-NAMES += libaio-dev @@ -25,7 +27,7 @@ REQ-DEB-PKG-TEST-NAMES += libaio-dev REQ-RPM-PKG-TEST-NAMES += $(PYTHON)-PyYAML -export LIBS += -lprotobuf-c -ldl -lnl-3 -lsoccr -Lsoccr/ -lnet +export LIBS += -lprotobuf-c -ldl -lnl-3 -lsoccr -Lsoccr/ -lnet -luuid check-packages-failed: $(warning Can 
not find some of the required libraries) diff --git a/criu/fdstore.c b/criu/fdstore.c index d615ad15d0..6ac639c553 100644 --- a/criu/fdstore.c +++ b/criu/fdstore.c @@ -58,7 +58,7 @@ int fdstore_init(void) } addr.sun_family = AF_UNIX; - addrlen = snprintf(addr.sun_path, sizeof(addr.sun_path), "X/criu-fdstore-%" PRIx64 "-%" PRIx64, st.st_ino, + addrlen = snprintf(addr.sun_path, sizeof(addr.sun_path), "X/criu-fdstore-%" PRIx64 "-%s", st.st_ino, criu_run_id); addrlen += sizeof(addr.sun_family); diff --git a/criu/files.c b/criu/files.c index 31e705bcc5..f16ec32a23 100644 --- a/criu/files.c +++ b/criu/files.c @@ -978,7 +978,7 @@ static int receive_fd(struct fdinfo_list_entry *fle); static void transport_name_gen(struct sockaddr_un *addr, int *len, int pid) { addr->sun_family = AF_UNIX; - snprintf(addr->sun_path, UNIX_PATH_MAX, "x/crtools-fd-%d-%" PRIx64, pid, criu_run_id); + snprintf(addr->sun_path, UNIX_PATH_MAX, "x/crtools-fd-%d-%s", pid, criu_run_id); *len = SUN_LEN(addr); *addr->sun_path = '\0'; } diff --git a/criu/include/util.h b/criu/include/util.h index 4793f7f20e..194e94deeb 100644 --- a/criu/include/util.h +++ b/criu/include/util.h @@ -21,6 +21,8 @@ #include "log.h" #include "common/err.h" +#include "compel/infect-util.h" + #define PREF_SHIFT_OP(pref, op, size) ((size)op(pref##BYTES_SHIFT)) #define KBYTES_SHIFT 10 #define MBYTES_SHIFT 20 @@ -420,7 +422,7 @@ extern int run_command(char *buf, size_t buf_size, int (*child_fn)(void *), void * criu_run_id is a unique value of the current run. It can be used to * generate resource ID-s to avoid conflicts with other CRIU processes. 
*/ -extern uint64_t criu_run_id; +extern char criu_run_id[RUN_ID_HASH_LENGTH]; extern void util_init(void); extern char *resolve_mountpoint(char *path); diff --git a/criu/pidfd-store.c b/criu/pidfd-store.c index 9fdc74cb74..110f7802a2 100644 --- a/criu/pidfd-store.c +++ b/criu/pidfd-store.c @@ -99,7 +99,7 @@ int init_pidfd_store_sk(pid_t pid, int sk) goto err; } - addrlen = snprintf(addr.sun_path, sizeof(addr.sun_path), "X/criu-pidfd-store-%d-%d-%" PRIx64, pid, sk, + addrlen = snprintf(addr.sun_path, sizeof(addr.sun_path), "X/criu-pidfd-store-%d-%d-%s", pid, sk, criu_run_id); addrlen += sizeof(addr.sun_family); diff --git a/criu/unittest/mock.c b/criu/unittest/mock.c index e517720e42..b2d5072787 100644 --- a/criu/unittest/mock.c +++ b/criu/unittest/mock.c @@ -5,6 +5,8 @@ #include #include +#include "compel/infect-util.h" + int add_external(char *key) { return 0; @@ -141,4 +143,4 @@ int check_mount_v2(void) return 0; } -uint64_t compel_run_id; +char compel_run_id[RUN_ID_HASH_LENGTH]; diff --git a/criu/util.c b/criu/util.c index d2bc9a8657..58c18e20be 100644 --- a/criu/util.c +++ b/criu/util.c @@ -28,6 +28,7 @@ #include #include #include +#include #include "linux/mount.h" @@ -2026,20 +2027,16 @@ int run_command(char *buf, size_t buf_size, int (*child_fn)(void *), void *args) return fret; } -uint64_t criu_run_id; +char criu_run_id[RUN_ID_HASH_LENGTH]; void util_init(void) { - struct stat statbuf; + uuid_t uuid; - criu_run_id = getpid(); - if (!stat("/proc/self/ns/pid", &statbuf)) - criu_run_id |= (uint64_t)statbuf.st_ino << 32; - else if (errno != ENOENT) - pr_perror("Can't stat /proc/self/ns/pid - CRIU run id might not be unique"); - - compel_run_id = criu_run_id; - pr_info("CRIU run id = %#" PRIx64 "\n", criu_run_id); + uuid_generate(uuid); + uuid_unparse(uuid, criu_run_id); + pr_info("CRIU run id = %s\n", criu_run_id); + memcpy(compel_run_id, criu_run_id, sizeof(criu_run_id)); } /* diff --git a/scripts/build/Dockerfile.alpine b/scripts/build/Dockerfile.alpine index 
329d7791de..d843793ea2 100644 --- a/scripts/build/Dockerfile.alpine +++ b/scripts/build/Dockerfile.alpine @@ -24,7 +24,8 @@ RUN apk update && apk add \ sudo \ libcap-utils \ libdrm-dev \ - util-linux + util-linux \ + util-linux-dev COPY . /criu WORKDIR /criu diff --git a/scripts/build/Dockerfile.amd-rocm b/scripts/build/Dockerfile.amd-rocm index c466a73d2d..ed66ae4fec 100644 --- a/scripts/build/Dockerfile.amd-rocm +++ b/scripts/build/Dockerfile.amd-rocm @@ -56,6 +56,7 @@ RUN apt-get clean -qqy && apt-get update -qqy && apt-get install -qqy --no-insta python-protobuf \ python3-minimal \ python-ipaddress \ + uuid-dev \ curl \ wget \ vim \ diff --git a/scripts/build/Dockerfile.archlinux b/scripts/build/Dockerfile.archlinux index 4056514891..9d11194bb0 100644 --- a/scripts/build/Dockerfile.archlinux +++ b/scripts/build/Dockerfile.archlinux @@ -35,6 +35,7 @@ RUN pacman -Syu --noconfirm \ python-junit-xml \ python-importlib-metadata \ libdrm \ + util-linux-libs \ diffutils COPY . /criu diff --git a/scripts/build/Dockerfile.hotspot-alpine b/scripts/build/Dockerfile.hotspot-alpine index cb9332fd0c..6caf9d0b1b 100644 --- a/scripts/build/Dockerfile.hotspot-alpine +++ b/scripts/build/Dockerfile.hotspot-alpine @@ -19,6 +19,7 @@ RUN apk update && apk add \ maven \ ip6tables \ iptables \ + util-linux-dev \ bash COPY . /criu diff --git a/scripts/build/Dockerfile.hotspot-ubuntu b/scripts/build/Dockerfile.hotspot-ubuntu index 0318f650f3..67de916acb 100644 --- a/scripts/build/Dockerfile.hotspot-ubuntu +++ b/scripts/build/Dockerfile.hotspot-ubuntu @@ -22,6 +22,7 @@ RUN apt-install protobuf-c-compiler \ pkg-config \ iptables \ gcc \ + uuid-dev \ maven COPY . 
/criu diff --git a/scripts/build/Dockerfile.linux32.tmpl b/scripts/build/Dockerfile.linux32.tmpl index 13e9926424..d218e06414 100644 --- a/scripts/build/Dockerfile.linux32.tmpl +++ b/scripts/build/Dockerfile.linux32.tmpl @@ -21,6 +21,7 @@ RUN apt-install \ pkg-config \ protobuf-c-compiler \ protobuf-compiler \ + uuid-dev \ python3-minimal COPY . /criu diff --git a/scripts/build/Dockerfile.openj9-ubuntu b/scripts/build/Dockerfile.openj9-ubuntu index e190c27929..0ae4727d2c 100644 --- a/scripts/build/Dockerfile.openj9-ubuntu +++ b/scripts/build/Dockerfile.openj9-ubuntu @@ -22,6 +22,7 @@ RUN apt-install protobuf-c-compiler \ pkg-config \ iptables \ gcc \ + uuid-dev \ maven RUN mkdir -p /etc/criu && echo 'ghost-limit 16777216' > /etc/criu/default.conf diff --git a/scripts/build/Dockerfile.riscv64-stable-cross.tmpl b/scripts/build/Dockerfile.riscv64-stable-cross.tmpl index 39a0c33c6c..e95a433067 100644 --- a/scripts/build/Dockerfile.riscv64-stable-cross.tmpl +++ b/scripts/build/Dockerfile.riscv64-stable-cross.tmpl @@ -33,6 +33,7 @@ RUN apt-get install -y --no-install-recommends \ libprotobuf-c-dev:${DEBIAN_ARCH} \ libcap-dev:${DEBIAN_ARCH} \ libaio-dev:${DEBIAN_ARCH} \ + uuid-dev:${DEBIAN_ARCH} \ libnl-route-3-dev:${DEBIAN_ARCH} \ libnftables-dev:${DEBIAN_ARCH} \ libgnutls28-dev:${DEBIAN_ARCH} \ diff --git a/scripts/build/Dockerfile.stable-cross.tmpl b/scripts/build/Dockerfile.stable-cross.tmpl index 078372c38c..65ae558334 100644 --- a/scripts/build/Dockerfile.stable-cross.tmpl +++ b/scripts/build/Dockerfile.stable-cross.tmpl @@ -18,6 +18,7 @@ RUN apt-install \ libnl-3-dev:${DEBIAN_ARCH} \ libprotobuf-dev:${DEBIAN_ARCH} \ libnet-dev:${DEBIAN_ARCH} \ + uuid-dev:${DEBIAN_ARCH} \ libprotobuf-c-dev:${DEBIAN_ARCH} \ libcap-dev:${DEBIAN_ARCH} \ libaio-dev:${DEBIAN_ARCH} \ diff --git a/scripts/build/Dockerfile.tmpl b/scripts/build/Dockerfile.tmpl index 9b53a76aab..3d6de10441 100644 --- a/scripts/build/Dockerfile.tmpl +++ b/scripts/build/Dockerfile.tmpl @@ -29,6 +29,7 @@ RUN 
apt-install \ protobuf-compiler \ python3-minimal \ python3-protobuf \ + uuid-dev \ python3-yaml COPY . /criu diff --git a/scripts/build/Dockerfile.unstable-cross.tmpl b/scripts/build/Dockerfile.unstable-cross.tmpl index dacfd96ef0..3504b0433c 100644 --- a/scripts/build/Dockerfile.unstable-cross.tmpl +++ b/scripts/build/Dockerfile.unstable-cross.tmpl @@ -17,6 +17,7 @@ RUN apt-install \ python3-protobuf \ libnl-3-dev:${DEBIAN_ARCH} \ libprotobuf-dev:${DEBIAN_ARCH} \ + uuid-dev:${DEBIAN_ARCH} \ libnet-dev:${DEBIAN_ARCH} \ libprotobuf-c-dev:${DEBIAN_ARCH} \ libcap-dev:${DEBIAN_ARCH} \ diff --git a/scripts/ci/prepare-for-fedora-rawhide.sh b/scripts/ci/prepare-for-fedora-rawhide.sh index 09085c403b..42252c93c9 100755 --- a/scripts/ci/prepare-for-fedora-rawhide.sh +++ b/scripts/ci/prepare-for-fedora-rawhide.sh @@ -36,6 +36,7 @@ dnf install -y \ e2fsprogs \ rubygem-asciidoctor \ libdrm-devel \ + libuuid-devel \ kmod # /tmp is no longer 755 in the rawhide container image and breaks CI - fix it diff --git a/scripts/ci/run-ci-tests.sh b/scripts/ci/run-ci-tests.sh index b472e954c2..611ff78037 100755 --- a/scripts/ci/run-ci-tests.sh +++ b/scripts/ci/run-ci-tests.sh @@ -4,7 +4,7 @@ set -x -e CI_PKGS=(protobuf-c-compiler libprotobuf-c-dev libaio-dev libgnutls28-dev libgnutls30 libprotobuf-dev protobuf-compiler libcap-dev libnl-3-dev gdb bash libnet-dev util-linux asciidoctor - libnl-route-3-dev time libbsd-dev python3-yaml + libnl-route-3-dev time libbsd-dev python3-yaml uuid-dev libperl-dev pkg-config python3-protobuf python3-pip python3-importlib-metadata python3-junit.xml libdrm-dev) diff --git a/scripts/ci/vagrant.sh b/scripts/ci/vagrant.sh index 3904c51d22..ed5a011787 100755 --- a/scripts/ci/vagrant.sh +++ b/scripts/ci/vagrant.sh @@ -39,7 +39,7 @@ setup() { ssh default sudo dnf install -y gcc git gnutls-devel nftables-devel libaio-devel \ libasan libcap-devel libnet-devel libnl3-devel libbsd-devel make protobuf-c-devel \ protobuf-devel python3-protobuf 
python3-importlib-metadata python3-junit_xml \ - rubygem-asciidoctor iptables libselinux-devel libbpf-devel python3-yaml + rubygem-asciidoctor iptables libselinux-devel libbpf-devel python3-yaml libuuid-devel # Disable sssd to avoid zdtm test failures in pty04 due to sssd socket ssh default sudo systemctl mask sssd ssh default cat /proc/cmdline From 5513a33300385071abb9bd1e91eaa78954894dc4 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Thu, 23 Jan 2025 17:42:45 +0000 Subject: [PATCH 055/198] net: remember the name of the lock chain (nftables) Using libnftables the chain to lock the network is composed of ("CRIU-%d", real_pid). This leads to around 40 zdtm tests failing with errors like this: Error: No such file or directory; did you mean table 'CRIU-62' in family inet? delete table inet CRIU-86 The reason is that as soon as a process is running in a namespace the real PID can be anything and only the PID in the namespace is restored correctly. Relying on the real PID does not work for the chain name. Using the PID of the innermost namespace would lead to the chain being called 'CRIU-1' most of the time which is also not really unique. With this commit the chain is now named using the already existing CRIU run ID. To be able to correctly restore the process and delete the locking table, the CRIU run id during checkpointing is now stored in the inventory as dump_criu_run_id. 
Signed-off-by: Adrian Reber --- criu/image.c | 30 ++++++++++++++++++++++++++++++ criu/include/util.h | 2 ++ criu/netfilter.c | 20 +++++++++++++++++++- images/inventory.proto | 4 ++++ 4 files changed, 55 insertions(+), 1 deletion(-) diff --git a/criu/image.c b/criu/image.c index 9589167fb1..f3747d6ff5 100644 --- a/criu/image.c +++ b/criu/image.c @@ -25,6 +25,7 @@ bool img_common_magic = true; TaskKobjIdsEntry *root_ids; u32 root_cg_set; Lsmtype image_lsm; +char dump_criu_run_id[RUN_ID_HASH_LENGTH]; struct inventory_plugin { struct list_head node; @@ -120,6 +121,24 @@ int check_img_inventory(bool restore) goto out_err; } } + + /** + * This contains the criu_run_id during dumping of the process. + * For things like removing network locking (nftables) this + * information is needed to identify the name of the network + * locking table. + */ + if (he->dump_criu_run_id) { + strncpy(dump_criu_run_id, he->dump_criu_run_id, sizeof(dump_criu_run_id) - 1); + pr_info("Dump CRIU run id = %s\n", dump_criu_run_id); + } else { + /** + * If restoring from an old image this is a marker + * that no dump_criu_run_id exists. + */ + dump_criu_run_id[0] = NO_DUMP_CRIU_RUN_ID; + } + } ret = 0; @@ -367,6 +386,17 @@ int prepare_inventory(InventoryEntry *he) he->has_network_lock_method = true; he->network_lock_method = opts.network_lock_method; + /** + * This contains the criu_run_id during dumping of the process. + * For things like removing network locking (nftables) this + * information is needed to identify the name of the network + * locking table. 
+ */ + he->dump_criu_run_id = xstrdup(criu_run_id); + + if (!he->dump_criu_run_id) + return -1; + return 0; } diff --git a/criu/include/util.h b/criu/include/util.h index 194e94deeb..55ad5b63cf 100644 --- a/criu/include/util.h +++ b/criu/include/util.h @@ -424,6 +424,8 @@ extern int run_command(char *buf, size_t buf_size, int (*child_fn)(void *), void */ extern char criu_run_id[RUN_ID_HASH_LENGTH]; extern void util_init(void); +#define NO_DUMP_CRIU_RUN_ID 0x7f +extern char dump_criu_run_id[RUN_ID_HASH_LENGTH]; extern char *resolve_mountpoint(char *path); diff --git a/criu/netfilter.c b/criu/netfilter.c index 9e78dc4b03..e2c82764f2 100644 --- a/criu/netfilter.c +++ b/criu/netfilter.c @@ -299,7 +299,25 @@ int nftables_lock_connection(struct inet_sk_desc *sk) int nftables_get_table(char *table, int n) { - if (snprintf(table, n, "inet CRIU-%d", root_item->pid->real) < 0) { + int ret; + + switch(dump_criu_run_id[0]) { + case 0: + /* This is not a restore.*/ + ret = snprintf(table, n, "inet CRIU-%s", criu_run_id); + break; + case NO_DUMP_CRIU_RUN_ID: + /** + * This is a restore from an older image with no + * dump_criu_run_id available. Let's use the old ID. + */ + ret = snprintf(table, n, "inet CRIU-%d", root_item->pid->real); + break; + default: + ret = snprintf(table, n, "inet CRIU-%s", dump_criu_run_id); + } + + if (ret < 0) { pr_err("Cannot generate CRIU's nftables table name\n"); return -1; } diff --git a/images/inventory.proto b/images/inventory.proto index 7f655031bc..1e18815bb9 100644 --- a/images/inventory.proto +++ b/images/inventory.proto @@ -29,4 +29,8 @@ message inventory_entry { optional bool tcp_close = 10; optional uint32 network_lock_method = 11; optional plugins_entry plugins_entry = 12; + // Remember the criu_run_id when CRIU dumped the process. + // This is currently used to delete the correct nftables + // network locking rule. 
+ optional string dump_criu_run_id = 13; } From 59b022db356d206e712e60b782712f8b93ff398a Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Sat, 14 Dec 2024 21:14:58 +0000 Subject: [PATCH 056/198] cuda: prevent task lockup on timeout error When creating a checkpoint of large models, the `checkpoint` action of `cuda-checkpoint` can exceed the CRIU timeout. This causes CRIU to fail with the following error, leaving the CUDA task in a locked state: cuda_plugin: Checkpointing CUDA devices on pid 84145 restore_tid 84202 Error (criu/cr-dump.c:1791): Timeout reached. Try to interrupt: 0 Error (cuda_plugin.c:139): cuda_plugin: Unable to read output of cuda-checkpoint: Interrupted system call Error (cuda_plugin.c:396): cuda_plugin: CHECKPOINT_DEVICES failed with net: Unlock network cuda_plugin: finished cuda_plugin stage 0 err -1 cuda_plugin: resuming devices on pid 84145 cuda_plugin: Restore thread pid 84202 found for real pid 84145 Unfreezing tasks into 1 Unseizing 84145 into 1 Error (criu/cr-dump.c:2111): Dumping FAILED. To fix this, we set `task_info->checkpointed` before invoking the `checkpoint` action to ensure that the CUDA task is resumed even if CRIU times out. 
Signed-off-by: Radostin Stoyanov --- plugins/cuda/cuda_plugin.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/plugins/cuda/cuda_plugin.c b/plugins/cuda/cuda_plugin.c index e78828b189..976ce824ca 100644 --- a/plugins/cuda/cuda_plugin.c +++ b/plugins/cuda/cuda_plugin.c @@ -391,14 +391,14 @@ int cuda_plugin_checkpoint_devices(int pid) if (resume_restore_thread(restore_tid, &save_sigset)) { return -1; } + + task_info->checkpointed = 1; status = cuda_process_checkpoint_action(pid, ACTION_CHECKPOINT, 0, msg_buf, sizeof(msg_buf)); if (status) { pr_err("CHECKPOINT_DEVICES failed with %s\n", msg_buf); goto interrupt; } - task_info->checkpointed = 1; - interrupt: int_ret = interrupt_restore_thread(restore_tid, &save_sigset); From dcd8808db0852e9e264aaa0640d165daf79dc836 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Sat, 21 Dec 2024 14:17:35 +0000 Subject: [PATCH 057/198] seize: use separate checkpoint_devices function Move `run_plugins(CHECKPOINT_DEVICES)` out of `collect_pstree()` to ensure that the function's sole responsibility is to use the cgroup freezer for the process tree. This allows us to avoid a time-out error when checkpointing applications with large GPU state. v2: This patch calls `checkpoint_devices()` only for `criu dump`. Support for GPU checkpointing with `pre-dump` will be introduced in a separate patch. 
Suggested-by: Andrei Vagin Suggested-by: Jesus Ramos Signed-off-by: Radostin Stoyanov --- criu/cr-dump.c | 3 +++ criu/include/seize.h | 1 + criu/seize.c | 23 ++++++++++++++++------- 3 files changed, 20 insertions(+), 7 deletions(-) diff --git a/criu/cr-dump.c b/criu/cr-dump.c index 1bc5d934f5..302078caa0 100644 --- a/criu/cr-dump.c +++ b/criu/cr-dump.c @@ -2192,6 +2192,9 @@ int cr_dump_tasks(pid_t pid) if (collect_pstree()) goto err; + if (checkpoint_devices()) + goto err; + if (collect_pstree_ids()) goto err; diff --git a/criu/include/seize.h b/criu/include/seize.h index 64e8d2d12f..fc7facad37 100644 --- a/criu/include/seize.h +++ b/criu/include/seize.h @@ -2,6 +2,7 @@ #define __CR_SEIZE_H__ extern int collect_pstree(void); +extern int checkpoint_devices(void); struct pstree_item; extern void pstree_switch_state(struct pstree_item *root_item, int st); extern const char *get_real_freezer_state(void); diff --git a/criu/seize.c b/criu/seize.c index 007e8e580d..f56357ac7b 100644 --- a/criu/seize.c +++ b/criu/seize.c @@ -1050,7 +1050,6 @@ int collect_pstree(void) pid_t pid = root_item->pid->real; int ret, exit_code = -1; struct proc_status_creds creds; - struct pstree_item *iter; timing_start(TIME_FREEZING); @@ -1111,6 +1110,21 @@ int collect_pstree(void) goto err; } + exit_code = 0; + timing_stop(TIME_FREEZING); + timing_start(TIME_FROZEN); + +err: + /* Freezing stage finished in time - disable timer. */ + alarm(0); + return exit_code; +} + +int checkpoint_devices(void) +{ + struct pstree_item *iter; + int ret, exit_code = -1; + for_each_pstree_item(iter) { if (!task_alive(iter)) continue; @@ -1120,11 +1134,6 @@ int collect_pstree(void) } exit_code = 0; - timing_stop(TIME_FREEZING); - timing_start(TIME_FROZEN); - err: - /* Freezing stage finished in time - disable timer. 
*/ - alarm(0); return exit_code; -} +} \ No newline at end of file From fc1dbc4915ddafc1f53b66f62d4df29e209976f5 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Wed, 15 Jan 2025 20:54:10 +0000 Subject: [PATCH 058/198] cuda: disable CUDA plugin for pre-dump Temporarily disable CUDA plugin for `criu pre-dump`. pre-dump currently fails with the following error: Handling VMA with the following smaps entry: 1822c000-18da5000 rw-p 00000000 00:00 0 [heap] Handling VMA with the following smaps entry: 200000000-200200000 ---p 00000000 00:00 0 Handling VMA with the following smaps entry: 200200000-200400000 rw-s 00000000 00:06 895 /dev/nvidia0 Error (criu/proc_parse.c:116): handle_device_vma plugin failed: No such file or directory Error (criu/proc_parse.c:632): Can't handle non-regular mapping on 705693's map 200200000 Error (criu/cr-dump.c:1486): Collect mappings (pid: 705693) failed with -1 We plan to enable support for pre-dump by skipping nvidia mappings in a separate patch. Signed-off-by: Radostin Stoyanov --- plugins/cuda/cuda_plugin.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/plugins/cuda/cuda_plugin.c b/plugins/cuda/cuda_plugin.c index 976ce824ca..99e4caf743 100644 --- a/plugins/cuda/cuda_plugin.c +++ b/plugins/cuda/cuda_plugin.c @@ -565,6 +565,12 @@ int cuda_plugin_init(int stage) { int ret; + /* Disable CUDA checkpointing with pre-dump */ + if (stage == CR_PLUGIN_STAGE__PRE_DUMP) { + plugin_disabled = true; + return 0; + } + if (stage == CR_PLUGIN_STAGE__RESTORE) { if (!check_and_remove_inventory_plugin(CR_PLUGIN_DESC.name, strlen(CR_PLUGIN_DESC.name))) { plugin_disabled = true; From 343e7319b9dee516c18d7c24d02b24c349c46a31 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Thu, 19 Dec 2024 10:33:54 +0000 Subject: [PATCH 059/198] lib: do not set protobuf has_* field too early For two cases libcriu was setting the RPC protobuf field `has_*` before checking if the given parameter is valid. 
This can lead to situations, if the caller doesn't check the return value, where we pass an RPC struct to CRIU which has the `has_*` protobuf field set to true, but does not have a verified value (or none at all) set for the actual RPC entry. Signed-off-by: Adrian Reber --- lib/c/criu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/c/criu.c b/lib/c/criu.c index 7f766db857..c16fe5dcd7 100644 --- a/lib/c/criu.c +++ b/lib/c/criu.c @@ -352,8 +352,8 @@ int criu_set_parent_images(const char *path) int criu_local_set_pre_dump_mode(criu_opts *opts, enum criu_pre_dump_mode mode) { - opts->rpc->has_pre_dump_mode = true; if (mode == CRIU_PRE_DUMP_SPLICE || mode == CRIU_PRE_DUMP_READ) { + opts->rpc->has_pre_dump_mode = true; opts->rpc->pre_dump_mode = (CriuPreDumpMode)mode; return 0; } @@ -1867,8 +1867,8 @@ void criu_set_pidfd_store_sk(int sk) int criu_local_set_network_lock(criu_opts *opts, enum criu_network_lock_method method) { - opts->rpc->has_network_lock = true; if (method == CRIU_NETWORK_LOCK_IPTABLES || method == CRIU_NETWORK_LOCK_NFTABLES || method == CRIU_NETWORK_LOCK_SKIP) { + opts->rpc->has_network_lock = true; opts->rpc->network_lock = (CriuNetworkLockMethod)method; return 0; } From 7eaf43368d67b7c7641ec2f8126db99fb7232d7b Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Wed, 29 Jan 2025 15:19:16 +0000 Subject: [PATCH 060/198] ci: handle results from latest codespell CI pulls in a newer version of codespell. This fixes complaints from that codespell version.
Signed-off-by: Adrian Reber --- .codespellrc | 2 +- criu/include/rbtree.h | 2 +- criu/include/rst_info.h | 2 +- criu/page-xfer.c | 4 ++-- test/zdtm/static/packet_sock.c | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.codespellrc b/.codespellrc index dd31dd851c..15e6fc7bcc 100644 --- a/.codespellrc +++ b/.codespellrc @@ -1,3 +1,3 @@ [codespell] skip = ./.git,./test/pki -ignore-words-list = creat,fpr,fle,ue,bord,parms,nd,te,testng,inh,wronly,renderd,bui,clen +ignore-words-list = creat,fpr,fle,ue,bord,parms,nd,te,testng,inh,wronly,renderd,bui,clen,sems diff --git a/criu/include/rbtree.h b/criu/include/rbtree.h index ba0a8100e7..6981aa8f9c 100644 --- a/criu/include/rbtree.h +++ b/criu/include/rbtree.h @@ -14,7 +14,7 @@ #define RB_MASK 3 struct rb_node { - unsigned long rb_parent_color; /* Keeps both parent anc color */ + unsigned long rb_parent_color; /* Keeps both parent and color */ struct rb_node *rb_right; struct rb_node *rb_left; } __aligned(sizeof(long)); diff --git a/criu/include/rst_info.h b/criu/include/rst_info.h index 59b891fa26..df9f9de012 100644 --- a/criu/include/rst_info.h +++ b/criu/include/rst_info.h @@ -22,7 +22,7 @@ struct fdt { pid_t pid; /* Who should restore this fd table */ /* * The fd table is ready for restoing, if fdt_lock is equal to nr - * The fdt table was restrored, if fdt_lock is equal to nr + 1 + * The fdt table was restored, if fdt_lock is equal to nr + 1 */ futex_t fdt_lock; }; diff --git a/criu/page-xfer.c b/criu/page-xfer.c index 94f4774148..0314963e6d 100644 --- a/criu/page-xfer.c +++ b/criu/page-xfer.c @@ -1421,7 +1421,7 @@ int cr_page_server(bool daemon_mode, bool lazy_dump, int cfd) if (opts.ps_socket != -1) { ask = opts.ps_socket; - pr_info("Re-using ps socket %d\n", ask); + pr_info("Reusing ps socket %d\n", ask); goto no_server; } @@ -1467,7 +1467,7 @@ static int connect_to_page_server(void) if (opts.ps_socket != -1) { page_server_sk = opts.ps_socket; - pr_info("Re-using ps socket %d\n", page_server_sk); + 
pr_info("Reusing ps socket %d\n", page_server_sk); goto out; } diff --git a/test/zdtm/static/packet_sock.c b/test/zdtm/static/packet_sock.c index 4a9078f815..c1c94ac219 100644 --- a/test/zdtm/static/packet_sock.c +++ b/test/zdtm/static/packet_sock.c @@ -5,7 +5,7 @@ const char *test_author = "Pavel Emelyanov "; /* * Description: - * Create and bind several packet sockets, check thet getname + * Create and bind several packet sockets, check that getname * reports same result before and after c/r cycle. This is enough * for _basic_ packet functionality only, but still. */ From 7f35e46e9d6110ca8a15048e48706f0ff8c48edc Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Thu, 16 Jan 2025 10:25:24 +0800 Subject: [PATCH 061/198] net/sysctl: put common multiplier outside the brackets Also add an explanation of the logic behind this calculation. Signed-off-by: Pavel Tikhomirov --- criu/net.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/criu/net.c b/criu/net.c index efd52db327..97c53f84f9 100644 --- a/criu/net.c +++ b/criu/net.c @@ -2149,10 +2149,16 @@ static int dump_netns_conf(struct ns_id *ns, struct cr_imgset *fds) list_for_each_entry(p, &ns->net.ids, node) i++; + /* + * Here we allocate one single big buffer for storing multiple arrays + * of protobuf entries and pointers to entries in it and we later use + * xptr_pull_s to claim a part of this buffer of proper size for each + * particular array. Next we read data from sysctl files to those + * arrays and then finally save them into images. 
+ */ o_buf = buf = xmalloc(i * (sizeof(NetnsId *) + sizeof(NetnsId)) + - size4 * (sizeof(SysctlEntry *) + sizeof(SysctlEntry)) * 2 + - size6 * (sizeof(SysctlEntry *) + sizeof(SysctlEntry)) * 2 + - sizex * (sizeof(SysctlEntry *) + sizeof(SysctlEntry))); + (size4 * 2 + size6 * 2 + sizex) * + (sizeof(SysctlEntry *) + sizeof(SysctlEntry))); if (!buf) goto out; From f38e58836ac0e0fd842424076e19667fb49f975b Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Thu, 16 Jan 2025 11:00:28 +0800 Subject: [PATCH 062/198] net/sysctl: c/r ipv4/ping_group_range value It is per net namespace, we need it to allow creation of unprivileged ICMP sockets. Note: in case this sysctl was disabled after unprivileged ICMP socket was created we still need to somehow handle it on restore. Signed-off-by: Pavel Tikhomirov --- criu/net.c | 103 +++++++++++++++++++++++++++++++++++++++++++- images/netdev.proto | 1 + 2 files changed, 103 insertions(+), 1 deletion(-) diff --git a/criu/net.c b/criu/net.c index 97c53f84f9..ee46f1c495 100644 --- a/criu/net.c +++ b/criu/net.c @@ -2128,6 +2128,79 @@ static inline int dump_nftables(struct cr_imgset *fds) } #endif +static const char *ipv4_sysctl_entries[] = { + "ping_group_range", +}; + +#define IPV4_SYSCTL_BASE "net/ipv4" +#define IPV4_SYSCTL_FMT IPV4_SYSCTL_BASE"/%s" +#define MAX_IPV4_SYSCTL_OPT 32 +#define MAX_IPV4_SYSCTL_PATH (sizeof(IPV4_SYSCTL_FMT) + MAX_IPV4_SYSCTL_OPT - 2) +#define MAX_STR_IPV4_SYSCTL_LEN 200 + +static int ipv4_sysctls_op(SysctlEntry ***rsysctl, size_t *pn, int op) +{ + int i, ret = -1, flags = 0; + char path[ARRAY_SIZE(ipv4_sysctl_entries)][MAX_IPV4_SYSCTL_PATH] = {}; + struct sysctl_req req[ARRAY_SIZE(ipv4_sysctl_entries)] = {}; + SysctlEntry **sysctl = *rsysctl; + size_t n = *pn; + + if (n != ARRAY_SIZE(ipv4_sysctl_entries)) { + pr_err("unix: Unexpected entries in sysctlig (%zu %zu)\n", n, ARRAY_SIZE(ipv4_sysctl_entries)); + return -EINVAL; + } + + if (opts.weak_sysctls || op == CTL_READ) + flags = CTL_FLAGS_OPTIONAL; + + for (i 
= 0; i < n; i++) { + snprintf(path[i], MAX_IPV4_SYSCTL_PATH, IPV4_SYSCTL_FMT, ipv4_sysctl_entries[i]); + req[i].name = path[i]; + req[i].flags = flags; + + switch (sysctl[i]->type) { + case SYSCTL_TYPE__CTL_STR: + req[i].type = CTL_STR(MAX_STR_IPV4_SYSCTL_LEN); + + /* skip write if have no value */ + if (op == CTL_WRITE && !sysctl[i]->sarg) + continue; + + req[i].arg = sysctl[i]->sarg; + break; + default: + pr_err("ipv4: Unknown sysctl type %d\n", sysctl[i]->type); + return -1; + } + } + + ret = sysctl_op(req, n, op, CLONE_NEWNET); + if (ret < 0) { + pr_err("unix: Failed to %s %s/\n", (op == CTL_READ) ? "read" : "write", IPV4_SYSCTL_BASE); + return -1; + } + + if (op == CTL_READ) { + bool has_entries = false; + + for (i = 0; i < n; i++) { + if (req[i].flags & CTL_FLAGS_HAS) { + sysctl[i]->has_iarg = true; + if (!has_entries) + has_entries = true; + } + } + + if (!has_entries) { + *pn = 0; + *rsysctl = NULL; + } + } + + return 0; +} + static int dump_netns_conf(struct ns_id *ns, struct cr_imgset *fds) { void *buf, *o_buf; @@ -2142,6 +2215,9 @@ static int dump_netns_conf(struct ns_id *ns, struct cr_imgset *fds) int size6 = ARRAY_SIZE(devconfs6); char def_stable_secret[MAX_STR_CONF_LEN + 1] = {}; char all_stable_secret[MAX_STR_CONF_LEN + 1] = {}; + SysctlEntry *ipv4_sysctls = NULL; + size_t ipv4_sysctl_size = ARRAY_SIZE(ipv4_sysctl_entries); + char ping_group_range[MAX_STR_IPV4_SYSCTL_LEN + 1] = {}; NetnsId *ids; struct netns_id *p; @@ -2157,7 +2233,7 @@ static int dump_netns_conf(struct ns_id *ns, struct cr_imgset *fds) * arrays and then finally save them into images. 
*/ o_buf = buf = xmalloc(i * (sizeof(NetnsId *) + sizeof(NetnsId)) + - (size4 * 2 + size6 * 2 + sizex) * + (2 * size4 + 2 * size6 + sizex + ipv4_sysctl_size) * (sizeof(SysctlEntry *) + sizeof(SysctlEntry))); if (!buf) goto out; @@ -2223,6 +2299,21 @@ static int dump_netns_conf(struct ns_id *ns, struct cr_imgset *fds) netns.unix_conf[i]->type = SYSCTL_TYPE__CTL_32; } + netns.n_ipv4_sysctl = ipv4_sysctl_size; + netns.ipv4_sysctl = xptr_pull_s(&buf, ipv4_sysctl_size * sizeof(SysctlEntry *)); + ipv4_sysctls = xptr_pull_s(&buf, ipv4_sysctl_size * sizeof(SysctlEntry)); + for (i = 0; i < ipv4_sysctl_size; i++) { + sysctl_entry__init(&ipv4_sysctls[i]); + netns.ipv4_sysctl[i] = &ipv4_sysctls[i]; + if (!strcmp(ipv4_sysctl_entries[i], "ping_group_range")) { + netns.ipv4_sysctl[i]->type = SYSCTL_TYPE__CTL_STR; + netns.ipv4_sysctl[i]->sarg = ping_group_range; + } else { + /* Need to handle this case when we have more sysctls */ + BUG(); + } + } + ret = ipv4_conf_op("default", netns.def_conf4, size4, CTL_READ, NULL); if (ret < 0) goto err_free; @@ -2241,6 +2332,10 @@ static int dump_netns_conf(struct ns_id *ns, struct cr_imgset *fds) if (ret < 0) goto err_free; + ret = ipv4_sysctls_op(&netns.ipv4_sysctl, &netns.n_ipv4_sysctl, CTL_READ); + if (ret < 0) + goto err_free; + ret = pb_write_one(img_from_set(fds, CR_FD_NETNS), &netns, PB_NETNS); err_free: xfree(o_buf); @@ -2593,6 +2688,12 @@ static int restore_netns_conf(struct ns_id *ns) goto out; } + if ((netns)->ipv4_sysctl) { + ret = ipv4_sysctls_op(&(netns)->ipv4_sysctl, &(netns)->n_ipv4_sysctl, CTL_WRITE); + if (ret) + goto out; + } + ns->net.netns = netns; out: return ret; diff --git a/images/netdev.proto b/images/netdev.proto index 748fd02004..42e2bc7d7e 100644 --- a/images/netdev.proto +++ b/images/netdev.proto @@ -74,4 +74,5 @@ message netns_entry { repeated netns_id nsids = 7; optional string ext_key = 8; repeated sysctl_entry unix_conf = 9; + repeated sysctl_entry ipv4_sysctl = 10; } From 
1c9fd58ff052e8b1f2373076fdb4db8662bc5cb5 Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Thu, 16 Jan 2025 11:56:00 +0800 Subject: [PATCH 063/198] zdtm/netns_sub_sysctl: add ipv4/ping_group_range sysctl check Signed-off-by: Pavel Tikhomirov --- test/zdtm/lib/sysctl.c | 43 +++++++++++++++++++++ test/zdtm/lib/sysctl.h | 2 + test/zdtm/static/netns_sub_sysctl.c | 58 +++++++++++++++++++++++------ 3 files changed, 91 insertions(+), 12 deletions(-) diff --git a/test/zdtm/lib/sysctl.c b/test/zdtm/lib/sysctl.c index 9583ec3df5..3b1ebc1687 100644 --- a/test/zdtm/lib/sysctl.c +++ b/test/zdtm/lib/sysctl.c @@ -3,6 +3,49 @@ #include "zdtmtst.h" #include "sysctl.h" +int sysctl_read_str(const char *name, char *data, size_t size) +{ + int fd, ret; + + fd = open(name, O_RDONLY); + if (fd < 0) { + pr_perror("Can't open %s", name); + return -1; + } + + ret = read(fd, data, size - 1); + if (ret < 0) { + pr_perror("Can't read %s", name); + close(fd); + return -1; + } + data[ret] = '\0'; + close(fd); + + return 0; +} + +int sysctl_write_str(const char *name, char *data) +{ + int fd, ret; + + fd = open(name, O_WRONLY); + if (fd < 0) { + pr_perror("Can't open %s", name); + return -1; + } + + ret = write(fd, data, strlen(data)); + if (ret < 0) { + pr_perror("Can't write %s into %s", data, name); + close(fd); + return -1; + } + close(fd); + + return 0; +} + int sysctl_read_int(const char *name, int *data) { int fd; diff --git a/test/zdtm/lib/sysctl.h b/test/zdtm/lib/sysctl.h index 67129102fe..d435bd7e98 100644 --- a/test/zdtm/lib/sysctl.h +++ b/test/zdtm/lib/sysctl.h @@ -3,5 +3,7 @@ extern int sysctl_read_int(const char *name, int *data); extern int sysctl_write_int(const char *name, int val); +extern int sysctl_read_str(const char *name, char *data, size_t size); +extern int sysctl_write_str(const char *name, char *data); #endif diff --git a/test/zdtm/static/netns_sub_sysctl.c b/test/zdtm/static/netns_sub_sysctl.c index 545a17308a..0f94c40a79 100644 --- 
a/test/zdtm/static/netns_sub_sysctl.c +++ b/test/zdtm/static/netns_sub_sysctl.c @@ -3,18 +3,33 @@ #include "zdtmtst.h" #include "sysctl.h" -const char *test_doc = "Check dump and restore a net.unix.max_dgram_qlen sysctl parameter in subns"; +const char *test_doc = "Check dump and restore of sysctls in subns"; const char *test_author = "Alexander Mikhalitsyn "; +#define MAX_STR_SYSCTL_LEN 200 + +enum { + SYSCTL_INT, + SYSCTL_STR, +}; + typedef struct { const char *path; + int type; int old; int new; + char s_old[MAX_STR_SYSCTL_LEN]; + char s_new[MAX_STR_SYSCTL_LEN]; } sysctl_opt_t; #define CONF_UNIX_BASE "/proc/sys/net/unix" +#define IPV4_SYSCTL_BASE "/proc/sys/net/ipv4" -static sysctl_opt_t net_unix_params[] = { { CONF_UNIX_BASE "/max_dgram_qlen", 0, 0 }, { NULL, 0, 0 } }; +static sysctl_opt_t net_unix_params[] = { + {CONF_UNIX_BASE "/max_dgram_qlen", SYSCTL_INT}, + {IPV4_SYSCTL_BASE "/ping_group_range", SYSCTL_STR, 0, 0, "40000\t50000\n"}, + {NULL, 0, 0} +}; int main(int argc, char **argv) { @@ -23,10 +38,17 @@ int main(int argc, char **argv) test_init(argc, argv); for (p = net_unix_params; p->path != NULL; p++) { - p->old = (((unsigned)lrand48()) % 1023) + 1; - if (sysctl_write_int(p->path, p->old)) { - pr_perror("Can't change %s", p->path); - return -1; + if (p->type == SYSCTL_INT) { + p->old = (((unsigned)lrand48()) % 1023) + 1; + if (sysctl_write_int(p->path, p->old)) { + pr_perror("Can't change %s", p->path); + return -1; + } + } else if (p->type == SYSCTL_STR) { + if (sysctl_write_str(p->path, p->s_old)) { + pr_perror("Can't change %s", p->path); + return -1; + } } } @@ -34,13 +56,25 @@ int main(int argc, char **argv) test_waitsig(); for (p = net_unix_params; p->path != NULL; p++) { - if (sysctl_read_int(p->path, &p->new)) - ret = 1; + if (p->type == SYSCTL_INT) { + if (sysctl_read_int(p->path, &p->new)) + ret = 1; - if (p->old != p->new) { - errno = EINVAL; - pr_perror("%s changed: %d ---> %d", p->path, p->old, p->new); - ret = 1; + if (p->old != p->new) { 
+ errno = EINVAL; + pr_perror("%s changed: %d ---> %d", p->path, p->old, p->new); + ret = 1; + } + } else if (p->type == SYSCTL_STR) { + if (sysctl_read_str(p->path, p->s_new, MAX_STR_SYSCTL_LEN)) { + ret = 1; + } else { + if (strcmp(p->s_old, p->s_new)) { + errno = EINVAL; + pr_perror("%s changed: %s ---> %s", p->path, p->s_old, p->s_new); + ret = 1; + } + } } } From 237ac72c32b95949dd134a6946abeac15e86fb5e Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Wed, 22 Jan 2025 14:35:26 +0100 Subject: [PATCH 064/198] vdso: switch from DT_HASH to DT_GNU_HASH (aarch64) Trying to run latest CRIU on CentOS Stream 10 or Ubuntu 24.04 (aarch64) fails like this: # criu/criu check -v4 [...] (00.096460) vdso: Parsing at ffffb2e2a000 ffffb2e2c000 (00.096539) vdso: PT_LOAD p_vaddr: 0 (00.096567) vdso: DT_STRTAB: 1d0 (00.096592) vdso: DT_SYMTAB: 128 (00.096616) vdso: DT_STRSZ: 8a (00.096640) vdso: DT_SYMENT: 18 (00.096663) Error (criu/pie-util-vdso.c:193): vdso: Not all dynamic entries are present (00.096688) Error (criu/vdso.c:627): vdso: Failed to fill self vdso symtable (00.096713) Error (criu/kerndat.c:1906): kerndat_vdso_fill_symtable failed when initializing kerndat. (00.096812) Found mmap_min_addr 0x10000 (00.096881) files stat: fs/nr_open 1073741816 (00.096908) Error (criu/crtools.c:267): Could not initialize kernel features detection. This seems to be related to the kernel (6.12.0-41.el10.aarch64). The Ubuntu user-space is running in a container on the same kernel. Looking at the kernel this seems to be related to: commit 48f6430505c0b0498ee9020ce3cf9558b1caaaeb Author: Fangrui Song Date: Thu Jul 18 10:34:23 2024 -0700 arm64/vdso: Remove --hash-style=sysv glibc added support for .gnu.hash in 2006 and .hash has been obsoleted for more than one decade in many Linux distributions. Using --hash-style=sysv might imply unaddressed issues and confuse readers. 
Just drop the option and rely on the linker default, which is likely "both", or "gnu" when the distribution really wants to eliminate sysv hash overhead. Similar to commit 6b7e26547fad ("x86/vdso: Emit a GNU hash"). The commit basically does: -ldflags-y := -shared -soname=linux-vdso.so.1 --hash-style=sysv \ +ldflags-y := -shared -soname=linux-vdso.so.1 \ Which results in only a GNU hash being added to the ELF header. This change has been merged with 6.11. Looking at the referenced x86 commit: commit 6b7e26547fad7ace3dcb27a5babd2317fb9d1e12 Author: Andy Lutomirski Date: Thu Aug 6 14:45:45 2015 -0700 x86/vdso: Emit a GNU hash Some dynamic loaders may be slightly faster if a GNU hash is available. Strangely, this seems to have no effect at all on the vdso size. This is unlikely to have any measurable effect on the time it takes to resolve vdso symbols (since there are so few of them). In some contexts, it can be a win for a different reason: if every DSO has a GNU hash section, then libc can avoid calculating SysV hashes at all. Both musl and glibc appear to have this optimization. It's plausible that this breaks some ancient glibc version. If so, then, depending on what glibc versions break, we could either require COMPAT_VDSO for them or consider reverting. Which is also a really simple change: -VDSO_LDFLAGS = -fPIC -shared $(call cc-ldoption, -Wl$(comma)--hash-style=sysv) \ +VDSO_LDFLAGS = -fPIC -shared $(call cc-ldoption, -Wl$(comma)--hash-style=both) \ The big difference here is that for x86 both hash sections are generated. For aarch64 only the newer GNU hash is generated. That is why we only see this error on kernel >= 6.11 and aarch64. Changing from DT_HASH to DT_GNU_HASH seems to work on aarch64. The test suite runs without any errors. Unfortunately I am not aware of all implications of this change, or whether a successful test suite run means that it still works.
Looking at the kernel I see following hash styles for the VDSO: aarch64: not specified (only GNU hash style) arm: --hash-style=sysv loongarch: --hash-style=sysv mips: --hash-style=sysv powerpc: --hash-style=both riscv: --hash-style=both s390: --hash-style=both x86: --hash-style=both Only aarch64 on kernels >= 6.11 is a problem right now, because all other platforms provide the old style hashing. Signed-off-by: Adrian Reber Co-developed-by: Dmitry Safonov Co-authored-by: Dmitry Safonov Signed-off-by: Dmitry Safonov --- criu/pie/util-vdso.c | 245 ++++++++++++++++++++++++++++++++++--------- 1 file changed, 198 insertions(+), 47 deletions(-) diff --git a/criu/pie/util-vdso.c b/criu/pie/util-vdso.c index f1e3239ff5..9819335d81 100644 --- a/criu/pie/util-vdso.c +++ b/criu/pie/util-vdso.c @@ -5,6 +5,7 @@ #include #include #include +#include #include #include @@ -48,10 +49,25 @@ static bool __ptr_struct_oob(uintptr_t ptr, size_t struct_size, uintptr_t start, return __ptr_oob(ptr, start, size) || __ptr_struct_end_oob(ptr, struct_size, start, size); } +/* Local strlen implementation */ +static size_t __strlen(const char *str) +{ + const char *ptr; + + if (!str) + return 0; + + ptr = str; + while (*ptr != '\0') + ptr++; + + return ptr - str; +} + /* * Elf hash, see format specification. */ -static unsigned long elf_hash(const unsigned char *name) +static unsigned long elf_sysv_hash(const unsigned char *name) { unsigned long h = 0, g; @@ -65,6 +81,15 @@ static unsigned long elf_hash(const unsigned char *name) return h; } +/* * The GNU hash format. Taken from glibc. 
*/ +static unsigned long elf_gnu_hash(const unsigned char *name) +{ + unsigned long h = 5381; + for (unsigned char c = *name; c != '\0'; c = *++name) + h = h * 33 + c; + return h; +} + #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ #define BORD ELFDATA2MSB /* 0x02 */ #else @@ -149,11 +174,14 @@ static int parse_elf_phdr(uintptr_t mem, size_t size, Phdr_t **dynamic, Phdr_t * * Output parameters are: * @dyn_strtab - address of the symbol table * @dyn_symtab - address of the string table section - * @dyn_hash - address of the symbol hash table + * @dyn_hash - address of the symbol hash table + * @use_gnu_hash - the format of hash DT_HASH or DT_GNU_HASH */ -static int parse_elf_dynamic(uintptr_t mem, size_t size, Phdr_t *dynamic, Dyn_t **dyn_strtab, Dyn_t **dyn_symtab, - Dyn_t **dyn_hash) +static int parse_elf_dynamic(uintptr_t mem, size_t size, Phdr_t *dynamic, + Dyn_t **dyn_strtab, Dyn_t **dyn_symtab, + Dyn_t **dyn_hash, bool *use_gnu_hash) { + Dyn_t *dyn_gnu_hash = NULL, *dyn_sysv_hash = NULL; Dyn_t *dyn_syment = NULL; Dyn_t *dyn_strsz = NULL; uintptr_t addr; @@ -184,16 +212,52 @@ static int parse_elf_dynamic(uintptr_t mem, size_t size, Phdr_t *dynamic, Dyn_t dyn_syment = d; pr_debug("DT_SYMENT: %lx\n", (unsigned long)d->d_un.d_val); } else if (d->d_tag == DT_HASH) { - *dyn_hash = d; + dyn_sysv_hash = d; pr_debug("DT_HASH: %lx\n", (unsigned long)d->d_un.d_ptr); + } else if (d->d_tag == DT_GNU_HASH) { + /* + * This is complicated. + * + * Looking at the Linux kernel source, the following can be seen + * regarding which hashing style the VDSO uses on each arch: + * + * aarch64: not specified (depends on linker, can be + * only GNU hash style) + * arm: --hash-style=sysv + * loongarch: --hash-style=sysv + * mips: --hash-style=sysv + * powerpc: --hash-style=both + * riscv: --hash-style=both + * s390: --hash-style=both + * x86: --hash-style=both + * + * Some architectures are using both hash-styles, that + * is the easiest for CRIU. 
Some architectures are only + * using the old style (sysv), that is what CRIU supports. + * + * Starting with Linux 6.11, aarch64 unfortunately decided + * to switch from '--hash-style=sysv' to ''. Specifying + * nothing unfortunately may mean GNU hash style only and not + * 'both' (depending on the linker). + */ + dyn_gnu_hash = d; + pr_debug("DT_GNU_HASH: %lx\n", (unsigned long)d->d_un.d_ptr); } } - if (!*dyn_strtab || !*dyn_symtab || !dyn_strsz || !dyn_syment || !*dyn_hash) { + if (!*dyn_strtab || !*dyn_symtab || !dyn_strsz || !dyn_syment || + (!dyn_gnu_hash && !dyn_sysv_hash)) { pr_err("Not all dynamic entries are present\n"); return -EINVAL; } + /* + * Prefer DT_HASH over DT_GNU_HASH as it's been more tested and + * as a result more stable. + */ + *use_gnu_hash = !dyn_sysv_hash; + *dyn_hash = dyn_sysv_hash ?: dyn_gnu_hash; + return 0; err_oob: @@ -208,60 +272,141 @@ typedef unsigned long Hash_t; typedef Word_t Hash_t; #endif -static void parse_elf_symbols(uintptr_t mem, size_t size, Phdr_t *load, struct vdso_symtable *t, - uintptr_t dynsymbol_names, Hash_t *hash, Dyn_t *dyn_symtab) +static bool elf_symbol_match(uintptr_t mem, size_t size, + uintptr_t dynsymbol_names, Sym_t *sym, + const char *symbol, const size_t vdso_symbol_length) { - ARCH_VDSO_SYMBOLS_LIST - - const char *vdso_symbols[VDSO_SYMBOL_MAX] = { ARCH_VDSO_SYMBOLS }; - const size_t vdso_symbol_length = sizeof(t->symbols[0].name) - 1; + uintptr_t addr = (uintptr_t)sym; + char *name; - Hash_t nbucket, nchain; - Hash_t *bucket, *chain; + if (__ptr_struct_oob(addr, sizeof(Sym_t), mem, size)) + return false; - unsigned int i, j, k; - uintptr_t addr; + if (ELF_ST_TYPE(sym->st_info) != STT_FUNC && ELF_ST_BIND(sym->st_info) != STB_GLOBAL) + return false; - nbucket = hash[0]; - nchain = hash[1]; - bucket = &hash[2]; - chain = &hash[nbucket + 2]; + addr = dynsymbol_names + sym->st_name; + if (__ptr_struct_oob(addr, vdso_symbol_length, mem, size)) + return false; + name = (void *)addr; - pr_debug("nbucket %lx 
nchain %lx bucket %lx chain %lx\n", (long)nbucket, (long)nchain, (unsigned long)bucket, - (unsigned long)chain); + return !std_strncmp(name, symbol, vdso_symbol_length); +} - for (i = 0; i < VDSO_SYMBOL_MAX; i++) { - const char *symbol = vdso_symbols[i]; - k = elf_hash((const unsigned char *)symbol); - for (j = bucket[k % nbucket]; j < nchain && j != STN_UNDEF; j = chain[j]) { - Sym_t *sym; - char *name; +static unsigned long elf_symbol_lookup(uintptr_t mem, size_t size, + const char *symbol, uint32_t symbol_hash, unsigned int sym_off, + uintptr_t dynsymbol_names, Dyn_t *dyn_symtab, Phdr_t *load, + Hash_t nbucket, Hash_t nchain, Hash_t *bucket, Hash_t *chain, + const size_t vdso_symbol_length, bool use_gnu_hash) +{ + unsigned int j; + uintptr_t addr; - addr = mem + dyn_symtab->d_un.d_ptr - load->p_vaddr; + j = bucket[symbol_hash % nbucket]; + if (j == STN_UNDEF) + return 0; + + addr = mem + dyn_symtab->d_un.d_ptr - load->p_vaddr; + + if (use_gnu_hash) { + uint32_t *h = bucket + nbucket + (j - sym_off); + uint32_t hash_val; + + symbol_hash |= 1; + do { + Sym_t *sym = (void *)addr + sizeof(Sym_t) * j; + + hash_val = *h++; + if ((hash_val | 1) == symbol_hash && + elf_symbol_match(mem, size, dynsymbol_names, sym, + symbol, vdso_symbol_length)) + return sym->st_value; + j++; + } while (!(hash_val & 1)); + } else { + for (; j < nchain && j != STN_UNDEF; j = chain[j]) { + Sym_t *sym = (void *)addr + sizeof(Sym_t) * j; + + if (elf_symbol_match(mem, size, dynsymbol_names, sym, + symbol, vdso_symbol_length)) + return sym->st_value; + } + } + return 0; +} - addr += sizeof(Sym_t) * j; - if (__ptr_struct_oob(addr, sizeof(Sym_t), mem, size)) - continue; - sym = (void *)addr; +static int parse_elf_symbols(uintptr_t mem, size_t size, Phdr_t *load, + struct vdso_symtable *t, uintptr_t dynsymbol_names, + Hash_t *hash, Dyn_t *dyn_symtab, bool use_gnu_hash) +{ + ARCH_VDSO_SYMBOLS_LIST - if (ELF_ST_TYPE(sym->st_info) != STT_FUNC && ELF_ST_BIND(sym->st_info) != STB_GLOBAL) - continue; + 
const char *vdso_symbols[VDSO_SYMBOL_MAX] = { ARCH_VDSO_SYMBOLS }; + const size_t vdso_symbol_length = sizeof(t->symbols[0].name) - 1; - addr = dynsymbol_names + sym->st_name; - if (__ptr_struct_oob(addr, vdso_symbol_length, mem, size)) - continue; - name = (void *)addr; + Hash_t *bucket = NULL; + Hash_t *chain = NULL; + Hash_t nbucket = 0; + Hash_t nchain = 0; + + unsigned int sym_off = 0; + unsigned int i = 0; + + unsigned long (*elf_hash)(const unsigned char *); + + if (use_gnu_hash) { + uint32_t *gnu_hash = (uint32_t *)hash; + uint32_t bloom_sz; + size_t *bloom; + + nbucket = gnu_hash[0]; + sym_off = gnu_hash[1]; + bloom_sz = gnu_hash[2]; + bloom = (size_t *)&gnu_hash[4]; + bucket = (Hash_t *)(&bloom[bloom_sz]); + elf_hash = &elf_gnu_hash; + pr_debug("nbucket %lx sym_off %lx bloom_sz %lx bloom %lx bucket %lx\n", + (unsigned long)nbucket, (unsigned long)sym_off, + (unsigned long)bloom_sz, (unsigned long)bloom, + (unsigned long)bucket); + } else { + nbucket = hash[0]; + nchain = hash[1]; + bucket = &hash[2]; + chain = &hash[nbucket + 2]; + elf_hash = &elf_sysv_hash; + pr_debug("nbucket %lx nchain %lx bucket %lx chain %lx\n", + (unsigned long)nbucket, (unsigned long)nchain, + (unsigned long)bucket, (unsigned long)chain); + } - if (std_strncmp(name, symbol, vdso_symbol_length)) - continue; - /* XXX: provide strncpy() implementation for PIE */ - memcpy(t->symbols[i].name, name, vdso_symbol_length); - t->symbols[i].offset = (unsigned long)sym->st_value - load->p_vaddr; - break; + for (i = 0; i < VDSO_SYMBOL_MAX; i++) { + const char *symbol = vdso_symbols[i]; + unsigned long addr, symbol_hash; + const size_t symbol_length = __strlen(symbol); + + symbol_hash = elf_hash((const unsigned char *)symbol); + addr = elf_symbol_lookup(mem, size, symbol, symbol_hash, + sym_off, dynsymbol_names, dyn_symtab, load, + nbucket, nchain, bucket, chain, + vdso_symbol_length, use_gnu_hash); + pr_debug("symbol %s at address %lx\n", symbol, addr); + if (!addr) + continue; + + /* XXX: 
provide strncpy() implementation for PIE */ + if (symbol_length > vdso_symbol_length) { + pr_err("strlen(%s) %zd, only %zd bytes available\n", + symbol, symbol_length, vdso_symbol_length); + return -EINVAL; } + memcpy(t->symbols[i].name, symbol, symbol_length); + t->symbols[i].offset = addr - load->p_vaddr; } + + return 0; } int vdso_fill_symtable(uintptr_t mem, size_t size, struct vdso_symtable *t) @@ -271,6 +416,7 @@ int vdso_fill_symtable(uintptr_t mem, size_t size, struct vdso_symtable *t) Dyn_t *dyn_symtab = NULL; Dyn_t *dyn_hash = NULL; Hash_t *hash = NULL; + bool use_gnu_hash; uintptr_t dynsymbol_names; uintptr_t addr; @@ -296,7 +442,8 @@ int vdso_fill_symtable(uintptr_t mem, size_t size, struct vdso_symtable *t) * needed. Note that we're interested in a small set of tags. */ - ret = parse_elf_dynamic(mem, size, dynamic, &dyn_strtab, &dyn_symtab, &dyn_hash); + ret = parse_elf_dynamic(mem, size, dynamic, &dyn_strtab, &dyn_symtab, + &dyn_hash, &use_gnu_hash); if (ret < 0) return ret; @@ -310,7 +457,11 @@ int vdso_fill_symtable(uintptr_t mem, size_t size, struct vdso_symtable *t) goto err_oob; hash = (void *)addr; - parse_elf_symbols(mem, size, load, t, dynsymbol_names, hash, dyn_symtab); + ret = parse_elf_symbols(mem, size, load, t, dynsymbol_names, hash, dyn_symtab, + use_gnu_hash); + + if (ret <0) + return ret; return 0; From d4585a024d2c7becf085cb9ef77417b2714fedaa Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Fri, 31 Jan 2025 14:45:03 -0800 Subject: [PATCH 065/198] Makefile: move codespell options to .codespellrc This way, - Makefile is less cluttered; - one can run codespell from the command line. 
Fixes: fd7e97fcf ("lint: exclude tags file from codespell") Signed-off-by: Kir Kolyshkin --- .codespellrc | 2 +- Makefile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.codespellrc b/.codespellrc index 15e6fc7bcc..e91a6d2eb5 100644 --- a/.codespellrc +++ b/.codespellrc @@ -1,3 +1,3 @@ [codespell] -skip = ./.git,./test/pki +skip = ./.git,./test/pki,./tags ignore-words-list = creat,fpr,fle,ue,bord,parms,nd,te,testng,inh,wronly,renderd,bui,clen,sems diff --git a/Makefile b/Makefile index 60b78a0749..90908de837 100644 --- a/Makefile +++ b/Makefile @@ -466,7 +466,7 @@ shellcheck: shellcheck -x test/others/action-script/*.sh codespell: - codespell -S tags + codespell lint: ruff shellcheck codespell # Do not append \n to pr_perror, pr_pwarn or fail From 48d0910a459ff369d160076014e8c390bb0b6f6f Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Thu, 6 Feb 2025 17:32:48 +0100 Subject: [PATCH 066/198] ci: install gawk for Fedora based tests Currently Fedora rawhide based CI runs fail with: /bin/sh: line 1: awk: command not found Let's install it. Signed-off-by: Adrian Reber --- scripts/ci/prepare-for-fedora-rawhide.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/ci/prepare-for-fedora-rawhide.sh b/scripts/ci/prepare-for-fedora-rawhide.sh index 42252c93c9..f8ad9cf978 100755 --- a/scripts/ci/prepare-for-fedora-rawhide.sh +++ b/scripts/ci/prepare-for-fedora-rawhide.sh @@ -4,6 +4,7 @@ set -e -x dnf install -y \ diffutils \ findutils \ + gawk \ gcc \ git \ gnutls-devel \ From b0f0d0fa0eac154df1de428eb4b115094c43cb70 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Thu, 20 Feb 2025 04:31:12 +0000 Subject: [PATCH 067/198] kerndat: run iptables with -n to not resolve service names Resolving service names can be slow and it isn't needed here. 
Fixes #2032 Signed-off-by: Andrei Vagin --- criu/kerndat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/criu/kerndat.c b/criu/kerndat.c index fa1ed21fad..5939005a41 100644 --- a/criu/kerndat.c +++ b/criu/kerndat.c @@ -646,7 +646,7 @@ static int kerndat_loginuid(void) static int kerndat_iptables_has_xtlocks(void) { int fd; - char *argv[4] = { "sh", "-c", "iptables -w -L", NULL }; + char *argv[4] = { "sh", "-c", "iptables -n -w -L", NULL }; fd = open("/dev/null", O_RDWR); if (fd < 0) { From e71d53ce18f5b2c3d4dc50c965c1f3cee5c0682f Mon Sep 17 00:00:00 2001 From: dschervov Date: Wed, 5 Feb 2025 20:04:37 +0300 Subject: [PATCH 068/198] criu: fix internal representation of cgroups hierarchical structure The strstartswith() function is an incorrect choice for finding the parent directory, so I change it to the issubpath() function. Signed-off-by: Dmitrii Chervov --- criu/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/criu/cgroup.c b/criu/cgroup.c index fcaed07080..9246be6390 100644 --- a/criu/cgroup.c +++ b/criu/cgroup.c @@ -248,7 +248,7 @@ static int find_dir(const char *path, struct list_head *dirs, struct cgroup_dir return EXACT_MATCH; } - if (strstartswith(path, d->path)) { + if (issubpath(path, d->path)) { int ret = find_dir(path, &d->children, rdir); if (ret == NO_MATCH) { *rdir = d; From 6f94888cb3b2050fd6e5ba0cf504daedd7ea6755 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E0=A4=B8=E0=A4=AE=E0=A5=80=E0=A4=B0=20=E0=A4=B8=E0=A4=BF?= =?UTF-8?q?=E0=A4=82=E0=A4=B9=20Sameer=20Singh?= Date: Thu, 23 Jan 2025 04:07:42 +0530 Subject: [PATCH 069/198] coredump: enable coredump generation on aarch64 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add relevant elf header constants and notes for the aarch64 platform to enable coredump generation.
Signed-off-by: समीर सिंह Sameer Singh --- coredump/coredump | 6 +- coredump/criu_coredump/coredump.py | 164 +++++++++++++++++++++-------- coredump/criu_coredump/elf.py | 55 +++++++++- test/others/criu-coredump/test.sh | 5 +- 4 files changed, 178 insertions(+), 52 deletions(-) diff --git a/coredump/coredump b/coredump/coredump index 3fbdafe81c..f1027773d2 100755 --- a/coredump/coredump +++ b/coredump/coredump @@ -6,6 +6,8 @@ import sys import criu_coredump +PLATFORMS = ["aarch64", "x86_64"] + def coredump(opts): generator = criu_coredump.coredump_generator() @@ -37,8 +39,8 @@ def main(): opts = vars(parser.parse_args()) - if platform.machine() != 'x86_64': - print('ERROR: %s only supported on x86_64' % sys.argv[0]) + if platform.machine() not in PLATFORMS: + print("ERROR: %s is only supported on: %s" % (sys.argv[0], ', '.join(PLATFORMS))) sys.exit(1) try: diff --git a/coredump/criu_coredump/coredump.py b/coredump/criu_coredump/coredump.py index 20ec8e5dc8..6bfc462f26 100644 --- a/coredump/criu_coredump/coredump.py +++ b/coredump/criu_coredump/coredump.py @@ -31,6 +31,7 @@ import io import sys import ctypes +import platform from pycriu import images from . import elf @@ -130,6 +131,11 @@ class coredump_generator: reg_files = None # reg-files; pagemaps = {} # pagemap by pid; + # thread info key based on the current arch + thread_info_key = {"aarch64": "ti_aarch64", "x86_64": "thread_info"} + + machine = platform.machine() # current arch + def _img_open_and_strip(self, name, single=False, pid=None): """ Load criu image and strip it from magic and redundant list. 
@@ -213,7 +219,7 @@ def _gen_ehdr(self, pid, phdrs): ehdr.e_ident[elf.EI_VERSION] = elf.EV_CURRENT ehdr.e_type = elf.ET_CORE - ehdr.e_machine = elf.EM_X86_64 + ehdr.e_machine = self._get_e_machine() ehdr.e_version = elf.EV_CURRENT ehdr.e_phoff = ctypes.sizeof(elf.Elf64_Ehdr()) ehdr.e_ehsize = ctypes.sizeof(elf.Elf64_Ehdr()) @@ -224,6 +230,13 @@ def _gen_ehdr(self, pid, phdrs): return ehdr + def _get_e_machine(self): + """ + Get the e_machine field based on the current architecture. + """ + e_machine_dict = {"aarch64": elf.EM_AARCH64, "x86_64": elf.EM_X86_64} + return e_machine_dict[self.machine] + def _gen_phdrs(self, pid, notes, vmas): """ Generate program headers for process pid. @@ -332,7 +345,7 @@ def _gen_prstatus(self, pid, tid): Generate NT_PRSTATUS note for thread tid of process pid. """ core = self.cores[tid] - regs = core["thread_info"]["gpregs"] + regs = self._get_gpregs(core) pstree = self.pstree[pid] prstatus = elf.elf_prstatus() @@ -345,33 +358,7 @@ def _gen_prstatus(self, pid, tid): prstatus.pr_pgrp = pstree["pgid"] prstatus.pr_sid = pstree["sid"] - prstatus.pr_reg.r15 = regs["r15"] - prstatus.pr_reg.r14 = regs["r14"] - prstatus.pr_reg.r13 = regs["r13"] - prstatus.pr_reg.r12 = regs["r12"] - prstatus.pr_reg.rbp = regs["bp"] - prstatus.pr_reg.rbx = regs["bx"] - prstatus.pr_reg.r11 = regs["r11"] - prstatus.pr_reg.r10 = regs["r10"] - prstatus.pr_reg.r9 = regs["r9"] - prstatus.pr_reg.r8 = regs["r8"] - prstatus.pr_reg.rax = regs["ax"] - prstatus.pr_reg.rcx = regs["cx"] - prstatus.pr_reg.rdx = regs["dx"] - prstatus.pr_reg.rsi = regs["si"] - prstatus.pr_reg.rdi = regs["di"] - prstatus.pr_reg.orig_rax = regs["orig_ax"] - prstatus.pr_reg.rip = regs["ip"] - prstatus.pr_reg.cs = regs["cs"] - prstatus.pr_reg.eflags = regs["flags"] - prstatus.pr_reg.rsp = regs["sp"] - prstatus.pr_reg.ss = regs["ss"] - prstatus.pr_reg.fs_base = regs["fs_base"] - prstatus.pr_reg.gs_base = regs["gs_base"] - prstatus.pr_reg.ds = regs["ds"] - prstatus.pr_reg.es = regs["es"] - 
prstatus.pr_reg.fs = regs["fs"] - prstatus.pr_reg.gs = regs["gs"] + self._set_pr_regset(prstatus.pr_reg, regs) nhdr = elf.Elf64_Nhdr() nhdr.n_namesz = 5 @@ -385,28 +372,64 @@ def _gen_prstatus(self, pid, tid): return note + def _get_gpregs(self, core): + """ + Get the general purpose registers based on the current architecture. + """ + thread_info_key = self.thread_info_key[self.machine] + thread_info = core[thread_info_key] + + return thread_info["gpregs"] + + def _set_pr_regset(self, pr_reg, regs): + """ + Set the pr_reg struct based on the current architecture. + """ + if self.machine == "aarch64": + pr_reg.regs = (ctypes.c_ulonglong * len(regs["regs"]))(*regs["regs"]) + pr_reg.sp = regs["sp"] + pr_reg.pc = regs["pc"] + pr_reg.pstate = regs["pstate"] + elif self.machine == "x86_64": + pr_reg.r15 = regs["r15"] + pr_reg.r14 = regs["r14"] + pr_reg.r13 = regs["r13"] + pr_reg.r12 = regs["r12"] + pr_reg.rbp = regs["bp"] + pr_reg.rbx = regs["bx"] + pr_reg.r11 = regs["r11"] + pr_reg.r10 = regs["r10"] + pr_reg.r9 = regs["r9"] + pr_reg.r8 = regs["r8"] + pr_reg.rax = regs["ax"] + pr_reg.rcx = regs["cx"] + pr_reg.rdx = regs["dx"] + pr_reg.rsi = regs["si"] + pr_reg.rdi = regs["di"] + pr_reg.orig_rax = regs["orig_ax"] + pr_reg.rip = regs["ip"] + pr_reg.cs = regs["cs"] + pr_reg.eflags = regs["flags"] + pr_reg.rsp = regs["sp"] + pr_reg.ss = regs["ss"] + pr_reg.fs_base = regs["fs_base"] + pr_reg.gs_base = regs["gs_base"] + pr_reg.ds = regs["ds"] + pr_reg.es = regs["es"] + pr_reg.fs = regs["fs"] + pr_reg.gs = regs["gs"] + def _gen_fpregset(self, pid, tid): """ Generate NT_FPREGSET note for thread tid of process pid. 
""" core = self.cores[tid] - regs = core["thread_info"]["fpregs"] + regs = self._get_fpregs(core) fpregset = elf.elf_fpregset_t() ctypes.memset(ctypes.addressof(fpregset), 0, ctypes.sizeof(fpregset)) - fpregset.cwd = regs["cwd"] - fpregset.swd = regs["swd"] - fpregset.ftw = regs["twd"] - fpregset.fop = regs["fop"] - fpregset.rip = regs["rip"] - fpregset.rdp = regs["rdp"] - fpregset.mxcsr = regs["mxcsr"] - fpregset.mxcr_mask = regs["mxcsr_mask"] - fpregset.st_space = (ctypes.c_uint * len(regs["st_space"]))( - *regs["st_space"]) - fpregset.xmm_space = (ctypes.c_uint * len(regs["xmm_space"]))( - *regs["xmm_space"]) + self._set_fpregset(fpregset, regs) nhdr = elf.Elf64_Nhdr() nhdr.n_namesz = 5 @@ -420,6 +443,58 @@ def _gen_fpregset(self, pid, tid): return note + def _get_fpregs(self, core): + """ + Get the floating point register dictionary based on the current architecture. + """ + fpregs_key_dict = {"aarch64": "fpsimd", "x86_64": "fpregs"} + fpregs_key = fpregs_key_dict[self.machine] + + thread_info_key = self.thread_info_key[self.machine] + + return core[thread_info_key][fpregs_key] + + def _set_fpregset(self, fpregset, regs): + """ + Set the fpregset struct based on the current architecture. + """ + if self.machine == "aarch64": + fpregset.vregs = (ctypes.c_ulonglong * len(regs["vregs"]))(*regs["vregs"]) + fpregset.fpsr = regs["fpsr"] + fpregset.fpcr = regs["fpcr"] + elif self.machine == "x86_64": + fpregset.cwd = regs["cwd"] + fpregset.swd = regs["swd"] + fpregset.ftw = regs["twd"] + fpregset.fop = regs["fop"] + fpregset.rip = regs["rip"] + fpregset.rdp = regs["rdp"] + fpregset.mxcsr = regs["mxcsr"] + fpregset.mxcr_mask = regs["mxcsr_mask"] + fpregset.st_space = (ctypes.c_uint * len(regs["st_space"]))( + *regs["st_space"]) + fpregset.xmm_space = (ctypes.c_uint * len(regs["xmm_space"]))( + *regs["xmm_space"]) + + def _gen_arm_tls(self, tid): + """ + Generate NT_ARM_TLS note for thread tid of process pid. 
+ """ + core = self.cores[tid] + tls = ctypes.c_ulonglong(core["ti_aarch64"]["tls"]) + + nhdr = elf.Elf64_Nhdr() + nhdr.n_namesz = 6 + nhdr.n_descsz = ctypes.sizeof(ctypes.c_ulonglong) + nhdr.n_type = elf.NT_ARM_TLS + + note = elf_note() + note.data = tls + note.owner = b"LINUX" + note.nhdr = nhdr + + return note + def _gen_x86_xstate(self, pid, tid): """ Generate NT_X86_XSTATE note for thread tid of process pid. @@ -593,8 +668,11 @@ def _gen_thread_notes(self, pid, tid): notes.append(self._gen_prstatus(pid, tid)) notes.append(self._gen_fpregset(pid, tid)) - notes.append(self._gen_x86_xstate(pid, tid)) notes.append(self._gen_siginfo(pid, tid)) + if self.machine == "aarch64": + notes.append(self._gen_arm_tls(tid)) + elif self.machine == "x86_64": + notes.append(self._gen_x86_xstate(pid, tid)) return notes diff --git a/coredump/criu_coredump/elf.py b/coredump/criu_coredump/elf.py index 092b478575..2697fad075 100644 --- a/coredump/criu_coredump/elf.py +++ b/coredump/criu_coredump/elf.py @@ -1,5 +1,8 @@ # Define structures and constants for generating elf file. import ctypes +import platform + +MACHINE = platform.machine() Elf64_Half = ctypes.c_uint16 # typedef uint16_t Elf64_Half; Elf64_Word = ctypes.c_uint32 # typedef uint32_t Elf64_Word; @@ -39,6 +42,7 @@ # Legal values for e_machine (architecture). EM_X86_64 = 62 # #define EM_X86_64 62 /* AMD x86-64 architecture */ +EM_AARCH64 = 183 # #define EM_AARCH64 183 /* ARM AARCH64 */ # Legal values for e_version (version). 
EV_CURRENT = 1 # #define EV_CURRENT 1 /* Current version */ @@ -119,6 +123,7 @@ class Elf64_auxv_t(ctypes.Structure): # typedef struct NT_SIGINFO = 0x53494749 # #define NT_SIGINFO 0x53494749 /* Contains copy of siginfo_t, size might increase */ NT_FILE = 0x46494c45 # #define NT_FILE 0x46494c45 /* Contains information about mapped files */ NT_X86_XSTATE = 0x202 # #define NT_X86_XSTATE 0x202 /* x86 extended state using xsave */ +NT_ARM_TLS = 0x401 # #define NT_ARM_TLS 0x401 /* ARM TLS register */ class Elf64_Nhdr(ctypes.Structure): # typedef struct @@ -218,7 +223,7 @@ class timeval(ctypes.Structure): # struct timeval ] -class user_regs_struct(ctypes.Structure): # struct user_regs_struct +class x86_64_user_regs_struct(ctypes.Structure): # struct x86_64_user_regs_struct _fields_ = [ ("r15", ctypes.c_ulonglong), # __extension__ unsigned long long int r15; @@ -277,10 +282,31 @@ class user_regs_struct(ctypes.Structure): # struct user_regs_struct ] +class aarch64_user_regs_struct(ctypes.Structure): # struct aarch64_user_regs_struct + _fields_ = [ + ("regs", + ctypes.c_ulonglong * 31), # unsigned long long int regs[31]; + ("sp", + ctypes.c_ulonglong), # unsigned long long int sp; + ("pc", + ctypes.c_ulonglong), # unsigned long long int pc; + ("pstate", + ctypes.c_ulonglong), # unsigned long long int pstate; + ] + + # elf_greg_t = ctypes.c_ulonglong # ELF_NGREG = ctypes.sizeof(user_regs_struct)/ctypes.sizeof(elf_greg_t) # elf_gregset_t = elf_greg_t*ELF_NGREG -elf_gregset_t = user_regs_struct +user_regs_dict = { + "aarch64": aarch64_user_regs_struct, + "x86_64": x86_64_user_regs_struct, +} + +try: + elf_gregset_t = user_regs_dict[MACHINE] +except KeyError: + raise ValueError("Current architecture %s is not supported." 
% MACHINE) class elf_prstatus(ctypes.Structure): # struct elf_prstatus @@ -420,7 +446,7 @@ class elf_prpsinfo(ctypes.Structure): # struct elf_prpsinfo ] -class user_fpregs_struct(ctypes.Structure): # struct user_fpregs_struct +class x86_64_user_fpregs_struct(ctypes.Structure): # struct x86_64_user_fpregs_struct _fields_ = [ # unsigned short int cwd; ("cwd", ctypes.c_ushort), @@ -447,7 +473,28 @@ class user_fpregs_struct(ctypes.Structure): # struct user_fpregs_struct ] -elf_fpregset_t = user_fpregs_struct +class aarch64_user_fpregs_struct(ctypes.Structure): # struct aarch64_user_fpregs_struct + _fields_ = [ + # unsigned long long int vregs[64]; + ("vregs", ctypes.c_ulonglong * 64), + # unsigned int fpsr; + ("fpsr", ctypes.c_uint), + # unsigned int fpcr; + ("fpcr", ctypes.c_uint), + # unsigned int padding[2]; + ("padding", ctypes.c_uint * 2), + ] + + +user_fpregs_dict = { + "aarch64": aarch64_user_fpregs_struct, + "x86_64": x86_64_user_fpregs_struct, +} + +try: + elf_fpregset_t = user_fpregs_dict[MACHINE] +except KeyError: + raise ValueError("Current architecture %s is not supported." % MACHINE) # siginfo_t related constants. diff --git a/test/others/criu-coredump/test.sh b/test/others/criu-coredump/test.sh index 4399044d71..e0ddce58da 100755 --- a/test/others/criu-coredump/test.sh +++ b/test/others/criu-coredump/test.sh @@ -45,9 +45,8 @@ function run_test { UNAME_M=$(uname -m) -if [ "$UNAME_M" != "x86_64" ]; then - # the criu-coredump script is only x86_64 aware - echo "criu-coredump only support x86_64. skipping." +if [[ "$UNAME_M" != "aarch64" && "$UNAME_M" != "x86_64" ]]; then + echo "criu-coredump only supports aarch64 and x86_64. skipping." 
exit 0 fi From da7f5b75f4c485f210f1d024367b4059910c9b93 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E0=A4=B8=E0=A4=AE=E0=A5=80=E0=A4=B0=20=E0=A4=B8=E0=A4=BF?= =?UTF-8?q?=E0=A4=82=E0=A4=B9=20Sameer=20Singh?= Date: Mon, 17 Feb 2025 18:06:10 +0530 Subject: [PATCH 070/198] coredump: enable coredump generation on arm MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add relevant elf header constants and notes for the arm platform to enable coredump generation. Signed-off-by: समीर सिंह Sameer Singh --- coredump/coredump | 2 +- coredump/criu_coredump/coredump.py | 124 +++++++++++++++---- coredump/criu_coredump/elf.py | 188 ++++++++++++++++++++++++++++- test/others/criu-coredump/test.sh | 4 +- 4 files changed, 288 insertions(+), 30 deletions(-) diff --git a/coredump/coredump b/coredump/coredump index f1027773d2..5b3e6f366b 100755 --- a/coredump/coredump +++ b/coredump/coredump @@ -6,7 +6,7 @@ import sys import criu_coredump -PLATFORMS = ["aarch64", "x86_64"] +PLATFORMS = ["aarch64", "armv7l", "x86_64"] def coredump(opts): diff --git a/coredump/criu_coredump/coredump.py b/coredump/criu_coredump/coredump.py index 6bfc462f26..c6a758c8ad 100644 --- a/coredump/criu_coredump/coredump.py +++ b/coredump/criu_coredump/coredump.py @@ -95,8 +95,13 @@ def write(self, f): buf.write(b"\0" * (8 - len(note.owner))) buf.write(note.data) - offset = ctypes.sizeof(elf.Elf64_Ehdr()) - offset += (len(self.vmas) + 1) * ctypes.sizeof(elf.Elf64_Phdr()) + bits = platform.architecture()[0] # 32 or 64 bits + + ehdr = {"32bit": elf.Elf32_Ehdr, "64bit": elf.Elf64_Ehdr} + phdr = {"32bit": elf.Elf32_Phdr, "64bit": elf.Elf64_Phdr} + + offset = ctypes.sizeof(ehdr[bits]()) + offset += (len(self.vmas) + 1) * ctypes.sizeof(phdr[bits]()) filesz = 0 for note in self.notes: @@ -132,9 +137,18 @@ class coredump_generator: pagemaps = {} # pagemap by pid; # thread info key based on the current arch - thread_info_key = {"aarch64": "ti_aarch64", "x86_64": "thread_info"} + 
thread_info_key = { + "aarch64": "ti_aarch64", + "armv7l": "ti_arm", + "x86_64": "thread_info", + } machine = platform.machine() # current arch + bits = platform.architecture()[0] # 32 or 64 bits + + ehdr = {"32bit": elf.Elf32_Ehdr, "64bit": elf.Elf64_Ehdr} # 32 or 64 bits Ehdr + nhdr = {"32bit": elf.Elf32_Nhdr, "64bit": elf.Elf64_Nhdr} # 32 or 64 bits Nhdr + phdr = {"32bit": elf.Elf32_Phdr, "64bit": elf.Elf64_Phdr} # 32 or 64 bits Phdr def _img_open_and_strip(self, name, single=False, pid=None): """ @@ -207,23 +221,30 @@ def _gen_ehdr(self, pid, phdrs): """ Generate elf header for process pid with program headers phdrs. """ - ehdr = elf.Elf64_Ehdr() + ei_class = {"32bit": elf.ELFCLASS32, "64bit": elf.ELFCLASS64} + + ehdr = self.ehdr[self.bits]() ctypes.memset(ctypes.addressof(ehdr), 0, ctypes.sizeof(ehdr)) ehdr.e_ident[elf.EI_MAG0] = elf.ELFMAG0 ehdr.e_ident[elf.EI_MAG1] = elf.ELFMAG1 ehdr.e_ident[elf.EI_MAG2] = elf.ELFMAG2 ehdr.e_ident[elf.EI_MAG3] = elf.ELFMAG3 - ehdr.e_ident[elf.EI_CLASS] = elf.ELFCLASS64 + ehdr.e_ident[elf.EI_CLASS] = ei_class[self.bits] ehdr.e_ident[elf.EI_DATA] = elf.ELFDATA2LSB ehdr.e_ident[elf.EI_VERSION] = elf.EV_CURRENT + if self.machine == "armv7l": + ehdr.e_ident[elf.EI_OSABI] = elf.ELFOSABI_ARM + else: + ehdr.e_ident[elf.EI_OSABI] = elf.ELFOSABI_NONE + ehdr.e_type = elf.ET_CORE ehdr.e_machine = self._get_e_machine() ehdr.e_version = elf.EV_CURRENT - ehdr.e_phoff = ctypes.sizeof(elf.Elf64_Ehdr()) - ehdr.e_ehsize = ctypes.sizeof(elf.Elf64_Ehdr()) - ehdr.e_phentsize = ctypes.sizeof(elf.Elf64_Phdr()) + ehdr.e_phoff = ctypes.sizeof(self.ehdr[self.bits]()) + ehdr.e_ehsize = ctypes.sizeof(self.ehdr[self.bits]()) + ehdr.e_phentsize = ctypes.sizeof(self.phdr[self.bits]()) # FIXME Case len(phdrs) > PN_XNUM should be handled properly. # See fs/binfmt_elf.c from linux kernel. ehdr.e_phnum = len(phdrs) @@ -234,7 +255,11 @@ def _get_e_machine(self): """ Get the e_machine field based on the current architecture. 
""" - e_machine_dict = {"aarch64": elf.EM_AARCH64, "x86_64": elf.EM_X86_64} + e_machine_dict = { + "aarch64": elf.EM_AARCH64, + "armv7l": elf.EM_ARM, + "x86_64": elf.EM_X86_64, + } return e_machine_dict[self.machine] def _gen_phdrs(self, pid, notes, vmas): @@ -243,15 +268,15 @@ def _gen_phdrs(self, pid, notes, vmas): """ phdrs = [] - offset = ctypes.sizeof(elf.Elf64_Ehdr()) - offset += (len(vmas) + 1) * ctypes.sizeof(elf.Elf64_Phdr()) + offset = ctypes.sizeof(self.ehdr[self.bits]()) + offset += (len(vmas) + 1) * ctypes.sizeof(self.phdr[self.bits]()) filesz = 0 for note in notes: filesz += ctypes.sizeof(note.nhdr) + ctypes.sizeof(note.data) + 8 # PT_NOTE - phdr = elf.Elf64_Phdr() + phdr = self.phdr[self.bits]() ctypes.memset(ctypes.addressof(phdr), 0, ctypes.sizeof(phdr)) phdr.p_type = elf.PT_NOTE phdr.p_offset = offset @@ -271,7 +296,7 @@ def _gen_phdrs(self, pid, notes, vmas): for vma in vmas: offset += filesz filesz = vma.filesz - phdr = elf.Elf64_Phdr() + phdr = self.phdr[self.bits]() ctypes.memset(ctypes.addressof(phdr), 0, ctypes.sizeof(phdr)) phdr.p_type = elf.PT_LOAD phdr.p_align = PAGESIZE @@ -328,7 +353,7 @@ def _gen_prpsinfo(self, pid): prpsinfo.pr_psargs = self._gen_cmdline(pid)[:80] prpsinfo.pr_fname = core["tc"]["comm"].encode() - nhdr = elf.Elf64_Nhdr() + nhdr = self.nhdr[self.bits]() nhdr.n_namesz = 5 nhdr.n_descsz = ctypes.sizeof(elf.elf_prpsinfo()) nhdr.n_type = elf.NT_PRPSINFO @@ -360,7 +385,7 @@ def _gen_prstatus(self, pid, tid): self._set_pr_regset(prstatus.pr_reg, regs) - nhdr = elf.Elf64_Nhdr() + nhdr = self.nhdr[self.bits]() nhdr.n_namesz = 5 nhdr.n_descsz = ctypes.sizeof(elf.elf_prstatus()) nhdr.n_type = elf.NT_PRSTATUS @@ -390,6 +415,25 @@ def _set_pr_regset(self, pr_reg, regs): pr_reg.sp = regs["sp"] pr_reg.pc = regs["pc"] pr_reg.pstate = regs["pstate"] + elif self.machine == "armv7l": + pr_reg.r0 = regs["r0"] + pr_reg.r1 = regs["r1"] + pr_reg.r2 = regs["r2"] + pr_reg.r3 = regs["r3"] + pr_reg.r4 = regs["r4"] + pr_reg.r5 = regs["r5"] + 
pr_reg.r6 = regs["r6"] + pr_reg.r7 = regs["r7"] + pr_reg.r8 = regs["r8"] + pr_reg.r9 = regs["r9"] + pr_reg.r10 = regs["r10"] + pr_reg.fp = regs["fp"] + pr_reg.ip = regs["ip"] + pr_reg.sp = regs["sp"] + pr_reg.lr = regs["lr"] + pr_reg.pc = regs["pc"] + pr_reg.cpsr = regs["cpsr"] + pr_reg.orig_r0 = regs["orig_r0"] elif self.machine == "x86_64": pr_reg.r15 = regs["r15"] pr_reg.r14 = regs["r14"] @@ -495,6 +539,34 @@ def _gen_arm_tls(self, tid): return note + def _gen_arm_vfp(self, tid): + """ + Generate NT_ARM_VFP note for thread tid of process pid. + """ + core = self.cores[tid] + fpstate = core["ti_arm"]["fpstate"] + + data = elf.vfp_hard_struct() + ctypes.memset(ctypes.addressof(data), 0, ctypes.sizeof(data)) + + data.vfp_regs = (ctypes.c_uint64 * len(fpstate["vfp_regs"]))(*fpstate["vfp_regs"]) + data.fpexc = fpstate["fpexc"] + data.fpscr = fpstate["fpscr"] + data.fpinst = fpstate["fpinst"] + data.fpinst2 = fpstate["fpinst2"] + + nhdr = elf.Elf32_Nhdr() + nhdr.n_namesz = 6 + nhdr.n_descsz = ctypes.sizeof(data) + nhdr.n_type = elf.NT_ARM_VFP + + note = elf_note() + note.data = data + note.owner = b"LINUX" + note.nhdr = nhdr + + return note + def _gen_x86_xstate(self, pid, tid): """ Generate NT_X86_XSTATE note for thread tid of process pid. 
@@ -544,7 +616,7 @@ def _gen_siginfo(self, pid, tid): # FIXME zeroify everything for now ctypes.memset(ctypes.addressof(siginfo), 0, ctypes.sizeof(siginfo)) - nhdr = elf.Elf64_Nhdr() + nhdr = self.nhdr[self.bits]() nhdr.n_namesz = 5 nhdr.n_descsz = ctypes.sizeof(elf.siginfo_t()) nhdr.n_type = elf.NT_SIGINFO @@ -563,17 +635,22 @@ def _gen_auxv(self, pid): mm = self.mms[pid] num_auxv = len(mm["mm_saved_auxv"]) // 2 - class elf_auxv(ctypes.Structure): + class elf32_auxv(ctypes.Structure): + _fields_ = [("auxv", elf.Elf32_auxv_t * num_auxv)] + + class elf64_auxv(ctypes.Structure): _fields_ = [("auxv", elf.Elf64_auxv_t * num_auxv)] - auxv = elf_auxv() + elf_auxv = {"32bit": elf32_auxv(), "64bit": elf64_auxv()} + + auxv = elf_auxv[self.bits] for i in range(num_auxv): auxv.auxv[i].a_type = mm["mm_saved_auxv"][i] auxv.auxv[i].a_val = mm["mm_saved_auxv"][i + 1] - nhdr = elf.Elf64_Nhdr() + nhdr = self.nhdr[self.bits]() nhdr.n_namesz = 5 - nhdr.n_descsz = ctypes.sizeof(elf_auxv()) + nhdr.n_descsz = ctypes.sizeof(elf_auxv[self.bits]) nhdr.n_type = elf.NT_AUXV note = elf_note() @@ -650,7 +727,7 @@ class elf_files(ctypes.Structure): setattr(data, "file_ofs" + str(i), info.file_ofs) setattr(data, "name" + str(i), info.name.encode()) - nhdr = elf.Elf64_Nhdr() + nhdr = self.nhdr[self.bits]() nhdr.n_namesz = 5 # strlen + 1 nhdr.n_descsz = ctypes.sizeof(elf_files()) @@ -667,10 +744,13 @@ def _gen_thread_notes(self, pid, tid): notes = [] notes.append(self._gen_prstatus(pid, tid)) - notes.append(self._gen_fpregset(pid, tid)) + if self.machine != "armv7l": + notes.append(self._gen_fpregset(pid, tid)) notes.append(self._gen_siginfo(pid, tid)) if self.machine == "aarch64": notes.append(self._gen_arm_tls(tid)) + elif self.machine == "armv7l": + notes.append(self._gen_arm_vfp(tid)) elif self.machine == "x86_64": notes.append(self._gen_x86_xstate(pid, tid)) diff --git a/coredump/criu_coredump/elf.py b/coredump/criu_coredump/elf.py index 2697fad075..2911f491e3 100644 --- 
a/coredump/criu_coredump/elf.py +++ b/coredump/criu_coredump/elf.py @@ -4,13 +4,19 @@ MACHINE = platform.machine() +Elf32_Half = ctypes.c_uint16 # typedef uint16_t Elf32_Half; +Elf32_Word = ctypes.c_uint32 # typedef uint32_t Elf32_Word; +Elf32_Addr = ctypes.c_uint32 # typedef uint32_t Elf32_Addr; +Elf32_Off = ctypes.c_uint32 # typedef uint32_t Elf32_Off; +Elf32_Xword = ctypes.c_uint64 # typedef uint64_t Elf32_Xword; + Elf64_Half = ctypes.c_uint16 # typedef uint16_t Elf64_Half; Elf64_Word = ctypes.c_uint32 # typedef uint32_t Elf64_Word; Elf64_Addr = ctypes.c_uint64 # typedef uint64_t Elf64_Addr; Elf64_Off = ctypes.c_uint64 # typedef uint64_t Elf64_Off; Elf64_Xword = ctypes.c_uint64 # typedef uint64_t Elf64_Xword; -# Elf64_Ehdr related constants. +# Elf_Ehdr related constants. # e_ident size. EI_NIDENT = 16 # #define EI_NIDENT (16) @@ -31,22 +37,50 @@ EI_DATA = 5 # #define EI_DATA 5 /* Data encoding byte index */ +EI_OSABI = 7 # #define EI_OSABI 7 /* OS ABI identification */ + EI_VERSION = 6 # #define EI_VERSION 6 /* File version byte index */ ELFDATA2LSB = 1 # #define ELFDATA2LSB 1 /* 2's complement, little endian */ +ELFCLASS32 = 1 # #define ELFCLASS32 1 /* 32-bit objects */ ELFCLASS64 = 2 # #define ELFCLASS64 2 /* 64-bit objects */ # Legal values for e_type (object file type). ET_CORE = 4 # #define ET_CORE 4 /* Core file */ # Legal values for e_machine (architecture). +EM_ARM = 40 # #define EM_ARM 40 /* ARM */ EM_X86_64 = 62 # #define EM_X86_64 62 /* AMD x86-64 architecture */ EM_AARCH64 = 183 # #define EM_AARCH64 183 /* ARM AARCH64 */ # Legal values for e_version (version). 
EV_CURRENT = 1 # #define EV_CURRENT 1 /* Current version */ +# Legal values for e_osabi +ELFOSABI_NONE = 0 # #define ELFOSABI_NONE 0 /* UNIX System V ABI */ +ELFOSABI_ARM = 97 # #define ELFOSABI_ARM 97 /* ARM */ + + +class Elf32_Ehdr(ctypes.Structure): # typedef struct + _fields_ = [ + ("e_ident", + ctypes.c_ubyte * EI_NIDENT), # unsigned char e_ident[EI_NIDENT]; + ("e_type", Elf32_Half), # Elf32_Half e_type; + ("e_machine", Elf32_Half), # Elf32_Half e_machine; + ("e_version", Elf32_Word), # Elf32_Word e_version; + ("e_entry", Elf32_Addr), # Elf32_Addr e_entry; + ("e_phoff", Elf32_Off), # Elf32_Off e_phoff; + ("e_shoff", Elf32_Off), # Elf32_Off e_shoff; + ("e_flags", Elf32_Word), # Elf32_Word e_flags; + ("e_ehsize", Elf32_Half), # Elf32_Half e_ehsize; + ("e_phentsize", Elf32_Half), # Elf32_Half e_phentsize; + ("e_phnum", Elf32_Half), # Elf32_Half e_phnum; + ("e_shentsize", Elf32_Half), # Elf32_Half e_shentsize; + ("e_shnum", Elf32_Half), # Elf32_Half e_shnum; + ("e_shstrndx", Elf32_Half) # Elf32_Half e_shstrndx; + ] # } Elf32_Ehdr; + class Elf64_Ehdr(ctypes.Structure): # typedef struct _fields_ = [ @@ -68,7 +102,7 @@ class Elf64_Ehdr(ctypes.Structure): # typedef struct ] # } Elf64_Ehdr; -# Elf64_Phdr related constants. +# Elf_Phdr related constants. # Legal values for p_type (segment type). 
PT_LOAD = 1 # #define PT_LOAD 1 /* Loadable program segment */ @@ -80,6 +114,19 @@ class Elf64_Ehdr(ctypes.Structure): # typedef struct PF_R = 1 << 2 # #define PF_R (1 << 2) /* Segment is readable */ +class Elf32_Phdr(ctypes.Structure): # typedef struct + _fields_ = [ + ("p_type", Elf32_Word), # Elf32_Word p_type; + ("p_offset", Elf32_Off), # Elf32_Off p_offset; + ("p_vaddr", Elf32_Addr), # Elf32_Addr p_vaddr; + ("p_paddr", Elf32_Addr), # Elf32_Addr p_paddr; + ("p_filesz", Elf32_Word), # Elf32_Word p_filesz; + ("p_memsz", Elf32_Word), # Elf32_Word p_memsz; + ("p_flags", Elf32_Word), # Elf32_Word p_flags; + ("p_align", Elf32_Word), # Elf32_Word p_align; + ] # } Elf32_Phdr; + + class Elf64_Phdr(ctypes.Structure): # typedef struct _fields_ = [ ("p_type", Elf64_Word), # Elf64_Word p_type; @@ -93,7 +140,25 @@ class Elf64_Phdr(ctypes.Structure): # typedef struct ] # } Elf64_Phdr; -# Elf64_auxv_t related constants. +# Elf_auxv_t related constants. + + +class _Elf32_auxv_t_U(ctypes.Union): + _fields_ = [("a_val", ctypes.c_uint32)] + + +class Elf32_auxv_t(ctypes.Structure): # typedef struct + _fields_ = [ + ("a_type", + ctypes.c_uint32), # uint32_t a_type; /* Entry type */ + ("a_un", _Elf32_auxv_t_U) # union + + # uint32_t a_val; /* Integer value */ + # /* We use to have pointer elements added here. We cannot do that, + # though, since it does not work when using 32-bit definitions + # on 64-bit platforms and vice versa. */ + # } a_un; + ] # } Elf32_auxv_t; class _Elf64_auxv_t_U(ctypes.Union): @@ -114,7 +179,7 @@ class Elf64_auxv_t(ctypes.Structure): # typedef struct ] # } Elf64_auxv_t; -# Elf64_Nhdr related constants. +# Elf_Nhdr related constants. 
NT_PRSTATUS = 1 # #define NT_PRSTATUS 1 /* Contains copy of prstatus struct */ NT_FPREGSET = 2 # #define NT_FPREGSET 2 /* Contains copy of fpregset struct */ @@ -123,9 +188,24 @@ class Elf64_auxv_t(ctypes.Structure): # typedef struct NT_SIGINFO = 0x53494749 # #define NT_SIGINFO 0x53494749 /* Contains copy of siginfo_t, size might increase */ NT_FILE = 0x46494c45 # #define NT_FILE 0x46494c45 /* Contains information about mapped files */ NT_X86_XSTATE = 0x202 # #define NT_X86_XSTATE 0x202 /* x86 extended state using xsave */ +NT_ARM_VFP = 0x400 # #define NT_ARM_VFP 0x400 /* ARM VFP/NEON registers */ NT_ARM_TLS = 0x401 # #define NT_ARM_TLS 0x401 /* ARM TLS register */ +class Elf32_Nhdr(ctypes.Structure): # typedef struct + _fields_ = [ + ( + "n_namesz", Elf32_Word + ), # Elf32_Word n_namesz; /* Length of the note's name. */ + ( + "n_descsz", Elf32_Word + ), # Elf32_Word n_descsz; /* Length of the note's descriptor. */ + ( + "n_type", Elf32_Word + ), # Elf32_Word n_type; /* Type of the note. */ + ] # } Elf32_Nhdr; + + class Elf64_Nhdr(ctypes.Structure): # typedef struct _fields_ = [ ( @@ -139,7 +219,52 @@ class Elf64_Nhdr(ctypes.Structure): # typedef struct ] # } Elf64_Nhdr; -# Elf64_Shdr related constants. +# Elf_Shdr related constants. 
+ + +class Elf32_Shdr(ctypes.Structure): + _fields_ = [ + ( + # Section name (string tbl index) + "sh_name", Elf32_Word + ), + ( + # Section type + "sh_type", Elf32_Word + ), + ( + # Section flags + "sh_flags", Elf32_Word + ), + ( + # Section virtual addr at execution + "sh_addr", Elf32_Addr + ), + ( + # Section file offset + "sh_offset", Elf32_Off + ), + ( + # Section size in bytes + "sh_size", Elf32_Word + ), + ( + # Link to another section + "sh_link", Elf32_Word + ), + ( + # Additional section information + "sh_info", Elf32_Word + ), + ( + # Section alignment + "sh_addralign", Elf32_Word + ), + ( + # Entry size if section holds table + "sh_entsize", Elf32_Word + ) + ] class Elf64_Shdr(ctypes.Structure): @@ -295,11 +420,53 @@ class aarch64_user_regs_struct(ctypes.Structure): # struct aarch64_user_regs_st ] +class arm_user_regs_struct(ctypes.Structure): # struct arm_user_regs_struct + _fields_ = [ + ("r0", + ctypes.c_ulong), # unsigned ulong int r0; + ("r1", + ctypes.c_ulong), # unsigned ulong int r1; + ("r2", + ctypes.c_ulong), # unsigned ulong int r2; + ("r3", + ctypes.c_ulong), # unsigned ulong int r3; + ("r4", + ctypes.c_ulong), # unsigned ulong int r4; + ("r5", + ctypes.c_ulong), # unsigned ulong int r5; + ("r6", + ctypes.c_ulong), # unsigned ulong int r6; + ("r7", + ctypes.c_ulong), # unsigned ulong int r7; + ("r8", + ctypes.c_ulong), # unsigned ulong int r8; + ("r9", + ctypes.c_ulong), # unsigned ulong int r9; + ("r10", + ctypes.c_ulong), # unsigned ulong int r10; + ("fp", + ctypes.c_ulong), # unsigned ulong int fp; + ("ip", + ctypes.c_ulong), # unsigned ulong int ip; + ("sp", + ctypes.c_ulong), # unsigned ulong int sp; + ("lr", + ctypes.c_ulong), # unsigned ulong int lr; + ("pc", + ctypes.c_ulong), # unsigned ulong int pc; + ("cpsr", + ctypes.c_ulong), # unsigned ulong int cpsr; + ("orig_r0", + ctypes.c_ulong), # unsigned ulong int orig_r0; + ] + + # elf_greg_t = ctypes.c_ulonglong # ELF_NGREG = ctypes.sizeof(user_regs_struct)/ctypes.sizeof(elf_greg_t) # 
elf_gregset_t = elf_greg_t*ELF_NGREG user_regs_dict = { "aarch64": aarch64_user_regs_struct, + "armv7l": arm_user_regs_struct, "x86_64": x86_64_user_regs_struct, } @@ -488,6 +655,7 @@ class aarch64_user_fpregs_struct(ctypes.Structure): # struct aarch64_user_fpreg user_fpregs_dict = { "aarch64": aarch64_user_fpregs_struct, + "armv7l": None, "x86_64": x86_64_user_fpregs_struct, } @@ -889,3 +1057,13 @@ class elf_xsave_struct(ctypes.Structure): # struct xsave_struct { # struct ymmh_struct ymmh; ("ymmh", ymmh_struct) ] # } __aligned(FP_MIN_ALIGN_BYTES) __packed; + + +class vfp_hard_struct(ctypes.Structure): # struct vfp_hard_struct { + _fields_ = [ + ("vfp_regs", ctypes.c_ulonglong * 32), # __u64 fpregs[32]; + ("fpexc", ctypes.c_ulong), # __u32 fpexc; + ("fpscr", ctypes.c_ulong), # __u32 fpscr; + ("fpinst", ctypes.c_ulong), # __u32 fpinst; + ("fpinst2", ctypes.c_ulong), # __u32 fpinst2; + ] # }; diff --git a/test/others/criu-coredump/test.sh b/test/others/criu-coredump/test.sh index e0ddce58da..2be82e64cf 100755 --- a/test/others/criu-coredump/test.sh +++ b/test/others/criu-coredump/test.sh @@ -45,8 +45,8 @@ function run_test { UNAME_M=$(uname -m) -if [[ "$UNAME_M" != "aarch64" && "$UNAME_M" != "x86_64" ]]; then - echo "criu-coredump only supports aarch64 and x86_64. skipping." +if [[ "$UNAME_M" != "aarch64" && "$UNAME_M" != "armv7l" &&"$UNAME_M" != "x86_64" ]]; then + echo "criu-coredump only supports aarch64 armv7l, and x86_64. skipping." exit 0 fi From 72ab27051da2bb499b19d05ffff98b414aa75f17 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Mon, 3 Mar 2025 15:03:51 +0000 Subject: [PATCH 071/198] scripts/uninstall_module: import signal module With Python 3.13, the `subprocess` module now uses the `posix_spawn()` function [1], which requires the `signal` module to be imported. 
Fixes: #2607 [1] https://docs.python.org/3/whatsnew/3.13.html#subprocess Signed-off-by: Radostin Stoyanov --- scripts/uninstall_module.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/scripts/uninstall_module.py b/scripts/uninstall_module.py index 8a9b70892b..2da63c8003 100755 --- a/scripts/uninstall_module.py +++ b/scripts/uninstall_module.py @@ -10,6 +10,16 @@ import subprocess import sys +# With Python 3.13 the subprocess module now uses the `posix_spawn()` +# function which requires loading the `signal` module: +# https://docs.python.org/3/whatsnew/3.13.html#subprocess +# +# We need to load this module here, before PYTHONPATH and sys.path +# have been modified to use the path specified with `--prefix`. +# +# flake8: noqa: F401 +import signal + import importlib_metadata From 3025ee1f7cea6d4c3987c1944ab5fe9eb6441b50 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Thu, 6 Feb 2025 16:38:33 +0100 Subject: [PATCH 072/198] ci: update to latest actions for codeql CI job Signed-off-by: Adrian Reber --- .github/workflows/codeql.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 518d9b8ae0..88e21d3d17 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -29,22 +29,22 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Install Packages (cpp) if: ${{ matrix.language == 'cpp' }} run: | sudo scripts/ci/apt-install protobuf-c-compiler libprotobuf-c-dev libprotobuf-dev build-essential libprotobuf-dev libprotobuf-c-dev protobuf-c-compiler protobuf-compiler python3-protobuf libnet-dev pkg-config libnl-3-dev libbsd0 libbsd-dev iproute2 libcap-dev libaio-dev libbsd-dev python3-yaml libnl-route-3-dev gnutls-dev - name: Initialize CodeQL - uses: github/codeql-action/init@v2 + uses: github/codeql-action/init@v3 with: languages: ${{ matrix.language }} queries: +security-and-quality - name: Autobuild - 
uses: github/codeql-action/autobuild@v2 + uses: github/codeql-action/autobuild@v3 - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@v2 + uses: github/codeql-action/analyze@v3 with: category: "/language:${{ matrix.language }}" From 4b099510b35f98a1f1d6589b1660470402fc1fef Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Thu, 6 Feb 2025 11:07:17 +0100 Subject: [PATCH 073/198] lsm: use the user provided lsm label Currently CRIU has the possibility to specify a LSM label during restore. Unfortunately the information is completely ignored in the case of SELinux. This change selects the lsm label from the user if it is provided and else the label from the checkpoint image is used. Signed-off-by: Adrian Reber --- criu/lsm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/criu/lsm.c b/criu/lsm.c index d1b73cc79e..70b66d42ee 100644 --- a/criu/lsm.c +++ b/criu/lsm.c @@ -370,7 +370,7 @@ int render_lsm_profile(char *profile, char **val) case LSMTYPE__APPARMOR: return render_aa_profile(val, profile); case LSMTYPE__SELINUX: - if (asprintf(val, "%s", profile) < 0) { + if (asprintf(val, "%s", opts.lsm_supplied ? opts.lsm_profile : profile) < 0) { *val = NULL; return -1; } From e5fe6cc16dc7ac098f8b5cecf05699bc00694ad9 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Fri, 7 Feb 2025 09:24:19 +0100 Subject: [PATCH 074/198] vdso: Fixes in DT_GNU_HASH handling * Hash buckets is an array of 32-bit words. While DT_HASH is 32-bit on most platforms except s390 (where it's 64-bit). * The bloom filter word size differs between 32-bit and 64-bit ELF files. This commit adjusts the code to handle both cases. 
Signed-off-by: Andrei Vagin --- criu/pie/util-vdso.c | 57 +++++++++++++++++++++++++++++--------------- 1 file changed, 38 insertions(+), 19 deletions(-) diff --git a/criu/pie/util-vdso.c b/criu/pie/util-vdso.c index 9819335d81..af3c089858 100644 --- a/criu/pie/util-vdso.c +++ b/criu/pie/util-vdso.c @@ -121,7 +121,8 @@ static int has_elf_identity(Ehdr_t *ehdr) return true; } -static int parse_elf_phdr(uintptr_t mem, size_t size, Phdr_t **dynamic, Phdr_t **load) +static int parse_elf_phdr(uintptr_t mem, size_t size, + Phdr_t **dynamic, Phdr_t **load, bool *is_32bit) { Ehdr_t *ehdr = (void *)mem; uintptr_t addr; @@ -136,6 +137,8 @@ static int parse_elf_phdr(uintptr_t mem, size_t size, Phdr_t **dynamic, Phdr_t * if (!has_elf_identity(ehdr)) return -EINVAL; + *is_32bit = ehdr->e_ident[EI_CLASS] != ELFCLASS64; + addr = mem + ehdr->e_phoff; if (__ptr_oob(addr, mem, size)) goto err_oob; @@ -272,6 +275,8 @@ typedef unsigned long Hash_t; typedef Word_t Hash_t; #endif +typedef uint32_t Hash32_t; + static bool elf_symbol_match(uintptr_t mem, size_t size, uintptr_t dynsymbol_names, Sym_t *sym, const char *symbol, const size_t vdso_symbol_length) @@ -297,21 +302,22 @@ static bool elf_symbol_match(uintptr_t mem, size_t size, static unsigned long elf_symbol_lookup(uintptr_t mem, size_t size, const char *symbol, uint32_t symbol_hash, unsigned int sym_off, uintptr_t dynsymbol_names, Dyn_t *dyn_symtab, Phdr_t *load, - Hash_t nbucket, Hash_t nchain, Hash_t *bucket, Hash_t *chain, + uint64_t nbucket, uint64_t nchain, void *_bucket, Hash_t *chain, const size_t vdso_symbol_length, bool use_gnu_hash) { unsigned int j; uintptr_t addr; - j = bucket[symbol_hash % nbucket]; - if (j == STN_UNDEF) - return 0; - addr = mem + dyn_symtab->d_un.d_ptr - load->p_vaddr; if (use_gnu_hash) { - uint32_t *h = bucket + nbucket + (j - sym_off); - uint32_t hash_val; + Hash32_t *h, hash_val, *bucket = _bucket; + + j = bucket[symbol_hash % nbucket]; + if (j == STN_UNDEF) + return 0; + + h = bucket + nbucket + 
(j - sym_off); symbol_hash |= 1; do { @@ -325,6 +331,12 @@ static unsigned long elf_symbol_lookup(uintptr_t mem, size_t size, j++; } while (!(hash_val & 1)); } else { + Hash_t *bucket = _bucket; + + j = bucket[symbol_hash % nbucket]; + if (j == STN_UNDEF) + return 0; + for (; j < nchain && j != STN_UNDEF; j = chain[j]) { Sym_t *sym = (void *)addr + sizeof(Sym_t) * j; @@ -338,17 +350,17 @@ static unsigned long elf_symbol_lookup(uintptr_t mem, size_t size, static int parse_elf_symbols(uintptr_t mem, size_t size, Phdr_t *load, struct vdso_symtable *t, uintptr_t dynsymbol_names, - Hash_t *hash, Dyn_t *dyn_symtab, bool use_gnu_hash) + Hash_t *hash, Dyn_t *dyn_symtab, bool use_gnu_hash, + bool is_32bit) { ARCH_VDSO_SYMBOLS_LIST const char *vdso_symbols[VDSO_SYMBOL_MAX] = { ARCH_VDSO_SYMBOLS }; const size_t vdso_symbol_length = sizeof(t->symbols[0].name) - 1; - Hash_t *bucket = NULL; + void *bucket = NULL; Hash_t *chain = NULL; - Hash_t nbucket = 0; - Hash_t nchain = 0; + uint64_t nbucket, nchain = 0; unsigned int sym_off = 0; unsigned int i = 0; @@ -358,17 +370,23 @@ static int parse_elf_symbols(uintptr_t mem, size_t size, Phdr_t *load, if (use_gnu_hash) { uint32_t *gnu_hash = (uint32_t *)hash; uint32_t bloom_sz; - size_t *bloom; nbucket = gnu_hash[0]; sym_off = gnu_hash[1]; bloom_sz = gnu_hash[2]; - bloom = (size_t *)&gnu_hash[4]; - bucket = (Hash_t *)(&bloom[bloom_sz]); + if (is_32bit) { + uint32_t *bloom; + bloom = (uint32_t *)&gnu_hash[4]; + bucket = (Hash_t *)(&bloom[bloom_sz]); + } else { + uint64_t *bloom; + bloom = (uint64_t *)&gnu_hash[4]; + bucket = (Hash_t *)(&bloom[bloom_sz]); + } elf_hash = &elf_gnu_hash; - pr_debug("nbucket %lx sym_off %lx bloom_sz %lx bloom %lx bucket %lx\n", + pr_debug("nbucket %lx sym_off %lx bloom_sz %lx bucket %lx\n", (unsigned long)nbucket, (unsigned long)sym_off, - (unsigned long)bloom_sz, (unsigned long)bloom, + (unsigned long)bloom_sz, (unsigned long)bucket); } else { nbucket = hash[0]; @@ -417,6 +435,7 @@ int 
vdso_fill_symtable(uintptr_t mem, size_t size, struct vdso_symtable *t) Dyn_t *dyn_hash = NULL; Hash_t *hash = NULL; bool use_gnu_hash; + bool is_32bit; uintptr_t dynsymbol_names; uintptr_t addr; @@ -427,7 +446,7 @@ int vdso_fill_symtable(uintptr_t mem, size_t size, struct vdso_symtable *t) /* * We need PT_LOAD and PT_DYNAMIC here. Each once. */ - ret = parse_elf_phdr(mem, size, &dynamic, &load); + ret = parse_elf_phdr(mem, size, &dynamic, &load, &is_32bit); if (ret < 0) return ret; if (!load || !dynamic) { @@ -458,7 +477,7 @@ int vdso_fill_symtable(uintptr_t mem, size_t size, struct vdso_symtable *t) hash = (void *)addr; ret = parse_elf_symbols(mem, size, load, t, dynsymbol_names, hash, dyn_symtab, - use_gnu_hash); + use_gnu_hash, is_32bit); if (ret <0) return ret; From 5b4c819d54f953197e721b4c6af67af42860e5f3 Mon Sep 17 00:00:00 2001 From: Han-Wen Nienhuys Date: Mon, 10 Mar 2025 14:43:24 +0100 Subject: [PATCH 075/198] pstree: print clone flags in error message Signed-off-by: Han-Wen Nienhuys --- criu/pstree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/criu/pstree.c b/criu/pstree.c index 41df846eda..660f1b9d99 100644 --- a/criu/pstree.c +++ b/criu/pstree.c @@ -962,7 +962,7 @@ static int prepare_pstree_kobj_ids(void) * this namespace is either inherited from the * criu or is created for the init task (only) */ - pr_err("Can't restore sub-task in NS\n"); + pr_err("Can't restore sub-task in NS (cflags %lx)\n", cflags); return -1; } } From 8d5cef546a035c4dda3a1be28ff1202c3b1b4c72 Mon Sep 17 00:00:00 2001 From: Han-Wen Nienhuys Date: Thu, 13 Mar 2025 08:46:16 +0100 Subject: [PATCH 076/198] restorer: Add a lock around cgroupd communication. Threads are put into cgroups through the cgroupd thread, which communicates with other threads using a socketpair. Previously, each thread received a dup'd copy of the socket, and did the following sendmsg(socket_dup_fd, my_cgroup_set); // wait for ack. 
while (1) { recvmsg(socket_dup_fd, &h, MSG_PEEK); if (h.pid != my_pid) continue; recvmsg(socket_dup_fd, &h, 0); } close(socket_dup_fd); When restoring many threads, many threads would be spinning in the above loop waiting for their PID to appear. In my test-case, restoring a process with a 11.5G heap and 491 threads could take anywhere between 10 seconds and 60 seconds to complete. To avoid the spinning, we drop the loop and MSG_PEEK, and add a lock around the above code. This does not decrease parallelism, as the cgroupd daemon uses a single thread anyway. With the lock in place, the same restore consistently takes around 10 seconds on my machine (Thinkpad P14s, AMD Ryzen 8840HS). There is a similar "daemon" thread for user namespaces. That already is protected with a similar userns_sync_lock in __userns_call(). Fixes #2614 Signed-off-by: Han-Wen Nienhuys --- criu/cr-restore.c | 1 + criu/include/rst_info.h | 1 + criu/pie/restorer.c | 61 ++++++++++++++++++++--------------------- 3 files changed, 31 insertions(+), 32 deletions(-) diff --git a/criu/cr-restore.c b/criu/cr-restore.c index ddca6b8ece..e906da0cea 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -2329,6 +2329,7 @@ int prepare_task_entries(void) task_entries->nr_helpers = 0; futex_set(&task_entries->start, CR_STATE_FAIL); mutex_init(&task_entries->userns_sync_lock); + mutex_init(&task_entries->cgroupd_sync_lock); mutex_init(&task_entries->last_pid_mutex); return 0; diff --git a/criu/include/rst_info.h b/criu/include/rst_info.h index df9f9de012..4c9335a738 100644 --- a/criu/include/rst_info.h +++ b/criu/include/rst_info.h @@ -14,6 +14,7 @@ struct task_entries { futex_t start; atomic_t cr_err; mutex_t userns_sync_lock; + mutex_t cgroupd_sync_lock; mutex_t last_pid_mutex; }; diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c index 6d048c3f1d..348ce6659b 100644 --- a/criu/pie/restorer.c +++ b/criu/pie/restorer.c @@ -704,9 +704,8 @@ static int send_cg_set(int sk, int cg_set) } /* - * As this socket 
is shared among threads, recvmsg(MSG_PEEK) - * from the socket until getting its own thread id as an - * acknowledge of successful threaded cgroup fixup + * As the cgroupd socket is shared among threads and processes, this + * should be called with task_entries->cgroupd_sync_lock held. */ static int recv_cg_set_restore_ack(int sk) { @@ -719,33 +718,22 @@ static int recv_cg_set_restore_ack(int sk) h.msg_control = cmsg; h.msg_controllen = sizeof(cmsg); - while (1) { - ret = sys_recvmsg(sk, &h, MSG_PEEK); - if (ret < 0) { - pr_err("Unable to peek from cgroupd %d\n", ret); - return -1; - } - - if (h.msg_controllen != sizeof(cmsg)) { - pr_err("The message from cgroupd is truncated\n"); - return -1; - } - - ch = CMSG_FIRSTHDR(&h); - cred = (struct ucred *)CMSG_DATA(ch); - if (cred->pid != sys_gettid()) - continue; + ret = sys_recvmsg(sk, &h, 0); + if (ret < 0) { + pr_err("Unable to receive from cgroupd %d\n", ret); + return -1; + } - /* - * Actual remove message from recv queue of socket - */ - ret = sys_recvmsg(sk, &h, 0); - if (ret < 0) { - pr_err("Unable to receive from cgroupd %d\n", ret); - return -1; - } + if (h.msg_controllen != sizeof(cmsg)) { + pr_err("The message from cgroupd is truncated\n"); + return -1; + } - break; + ch = CMSG_FIRSTHDR(&h); + cred = (struct ucred *)CMSG_DATA(ch); + if (cred->pid != sys_gettid()) { + pr_err("cred pid %d != gettid\n", cred->pid); + return -1; } return 0; } @@ -782,12 +770,21 @@ __visible long __export_restore_thread(struct thread_restore_args *args) rt_sigframe = (void *)&args->mz->rt_sigframe; if (args->cg_set != -1) { + int err = 0; + + mutex_lock(&task_entries_local->cgroupd_sync_lock); + pr_info("Restore cg_set in thread cg_set: %d\n", args->cg_set); - if (send_cg_set(args->cgroupd_sk, args->cg_set)) - goto core_restore_end; - if (recv_cg_set_restore_ack(args->cgroupd_sk)) - goto core_restore_end; + + err = send_cg_set(args->cgroupd_sk, args->cg_set); + if (!err) + err = recv_cg_set_restore_ack(args->cgroupd_sk); + + 
mutex_unlock(&task_entries_local->cgroupd_sync_lock); sys_close(args->cgroupd_sk); + + if (err) + goto core_restore_end; } if (restore_thread_common(args)) From 1cf8040173d5b6cd258350800cbf7918cb47543d Mon Sep 17 00:00:00 2001 From: AV Date: Mon, 3 Mar 2025 19:14:54 +0000 Subject: [PATCH 077/198] arm64: C/R PAC keys PAC stands for Pointer Authentication Code. Each process has 5 PAC keys and a mask of enabled keys. All these properties have to be C/R-ed. As they are per-process properties, we can save/restore them just for one thread. Signed-off-by: Andrei Vagin --- compel/arch/aarch64/src/lib/infect.c | 2 +- compel/arch/arm/src/lib/infect.c | 2 +- compel/arch/loongarch64/src/lib/infect.c | 2 +- compel/arch/mips/src/lib/infect.c | 2 +- compel/arch/ppc64/src/lib/infect.c | 2 +- compel/arch/riscv64/src/lib/infect.c | 2 +- compel/arch/s390/src/lib/infect.c | 2 +- compel/arch/x86/src/lib/infect.c | 2 +- compel/include/uapi/infect.h | 2 +- compel/src/lib/infect.c | 2 +- criu/arch/aarch64/crtools.c | 167 ++++++++++++++++++++++- criu/arch/aarch64/include/asm/dump.h | 2 +- criu/arch/aarch64/include/asm/restore.h | 10 ++ criu/arch/arm/crtools.c | 2 +- criu/arch/arm/include/asm/dump.h | 2 +- criu/arch/loongarch64/crtools.c | 2 +- criu/arch/loongarch64/include/asm/dump.h | 2 +- criu/arch/mips/crtools.c | 2 +- criu/arch/mips/include/asm/dump.h | 2 +- criu/arch/ppc64/crtools.c | 2 +- criu/arch/ppc64/include/asm/dump.h | 2 +- criu/arch/riscv64/crtools.c | 2 +- criu/arch/riscv64/include/asm/dump.h | 2 +- criu/arch/s390/crtools.c | 2 +- criu/arch/s390/include/asm/dump.h | 2 +- criu/arch/x86/crtools.c | 2 +- criu/arch/x86/include/asm/compat.h | 2 + criu/arch/x86/include/asm/dump.h | 2 +- criu/cr-restore.c | 10 ++ criu/include/rst_info.h | 8 ++ images/core-aarch64.proto | 23 ++++ 31 files changed, 244 insertions(+), 26 deletions(-) diff --git a/compel/arch/aarch64/src/lib/infect.c b/compel/arch/aarch64/src/lib/infect.c index 812ba34a37..ec1d0d59ea 100644 ---
a/compel/arch/aarch64/src/lib/infect.c +++ b/compel/arch/aarch64/src/lib/infect.c @@ -81,7 +81,7 @@ int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct goto err; } - ret = save(arg, regs, fpsimd); + ret = save(pid, arg, regs, fpsimd); err: return ret; } diff --git a/compel/arch/arm/src/lib/infect.c b/compel/arch/arm/src/lib/infect.c index 8b810a88f5..a9fb639e28 100644 --- a/compel/arch/arm/src/lib/infect.c +++ b/compel/arch/arm/src/lib/infect.c @@ -94,7 +94,7 @@ int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct } } - ret = save(arg, regs, vfp); + ret = save(pid, arg, regs, vfp); err: return ret; } diff --git a/compel/arch/loongarch64/src/lib/infect.c b/compel/arch/loongarch64/src/lib/infect.c index 8e3c19aff2..190c39227a 100644 --- a/compel/arch/loongarch64/src/lib/infect.c +++ b/compel/arch/loongarch64/src/lib/infect.c @@ -91,7 +91,7 @@ int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct goto err; } - ret = save(arg, regs, fpregs); + ret = save(pid, arg, regs, fpregs); err: return 0; } diff --git a/compel/arch/mips/src/lib/infect.c b/compel/arch/mips/src/lib/infect.c index 0e98aaee3f..a1d4865ccd 100644 --- a/compel/arch/mips/src/lib/infect.c +++ b/compel/arch/mips/src/lib/infect.c @@ -149,7 +149,7 @@ int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct regs->regs[0] = 0; } - ret = save(arg, regs, xs); + ret = save(pid, arg, regs, xs); return ret; } diff --git a/compel/arch/ppc64/src/lib/infect.c b/compel/arch/ppc64/src/lib/infect.c index 84c2b1d7c3..54abd48a4b 100644 --- a/compel/arch/ppc64/src/lib/infect.c +++ b/compel/arch/ppc64/src/lib/infect.c @@ -400,7 +400,7 @@ int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct if (ret) return ret; - return save(arg, regs, fpregs); + return save(pid, arg, regs, fpregs); } int compel_set_task_ext_regs(pid_t pid, user_fpregs_struct_t *ext_regs) diff --git 
a/compel/arch/riscv64/src/lib/infect.c b/compel/arch/riscv64/src/lib/infect.c index 861fe3b2f2..3f3a4b7ecc 100644 --- a/compel/arch/riscv64/src/lib/infect.c +++ b/compel/arch/riscv64/src/lib/infect.c @@ -92,7 +92,7 @@ int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct return -1; } - ret = save(arg, regs, fpsimd); + ret = save(pid, arg, regs, fpsimd); return ret; } diff --git a/compel/arch/s390/src/lib/infect.c b/compel/arch/s390/src/lib/infect.c index 85dfc3a4d4..a77b38917e 100644 --- a/compel/arch/s390/src/lib/infect.c +++ b/compel/arch/s390/src/lib/infect.c @@ -348,7 +348,7 @@ int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct } } /* Call save_task_regs() */ - return save(arg, regs, fpregs); + return save(pid, arg, regs, fpregs); } int compel_set_task_ext_regs(pid_t pid, user_fpregs_struct_t *ext_regs) diff --git a/compel/arch/x86/src/lib/infect.c b/compel/arch/x86/src/lib/infect.c index a07b1c9f37..644c483b40 100644 --- a/compel/arch/x86/src/lib/infect.c +++ b/compel/arch/x86/src/lib/infect.c @@ -453,7 +453,7 @@ int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct goto err; out: - ret = save(arg, regs, xs); + ret = save(pid, arg, regs, xs); err: return ret; } diff --git a/compel/include/uapi/infect.h b/compel/include/uapi/infect.h index 7e6134f4bc..ed97d64dd6 100644 --- a/compel/include/uapi/infect.h +++ b/compel/include/uapi/infect.h @@ -97,7 +97,7 @@ extern k_rtsigset_t *compel_thread_sigmask(struct parasite_thread_ctl *tctl); struct rt_sigframe; typedef int (*open_proc_fn)(int pid, int mode, const char *fmt, ...) 
__attribute__((__format__(__printf__, 3, 4))); -typedef int (*save_regs_t)(void *, user_regs_struct_t *, user_fpregs_struct_t *); +typedef int (*save_regs_t)(pid_t pid, void *, user_regs_struct_t *, user_fpregs_struct_t *); typedef int (*make_sigframe_t)(void *, struct rt_sigframe *, struct rt_sigframe *, k_rtsigset_t *); struct infect_ctx { diff --git a/compel/src/lib/infect.c b/compel/src/lib/infect.c index caf54e03fd..a9bbd64004 100644 --- a/compel/src/lib/infect.c +++ b/compel/src/lib/infect.c @@ -1300,7 +1300,7 @@ struct plain_regs_struct { user_fpregs_struct_t fpregs; }; -static int save_regs_plain(void *to, user_regs_struct_t *r, user_fpregs_struct_t *f) +static int save_regs_plain(pid_t pid, void *to, user_regs_struct_t *r, user_fpregs_struct_t *f) { struct plain_regs_struct *prs = to; diff --git a/criu/arch/aarch64/crtools.c b/criu/arch/aarch64/crtools.c index e87b8629a0..6cde03ee38 100644 --- a/criu/arch/aarch64/crtools.c +++ b/criu/arch/aarch64/crtools.c @@ -1,5 +1,6 @@ #include #include +#include #include @@ -20,10 +21,86 @@ #include "cpu.h" #include "restorer.h" #include "compel/infect.h" +#include "pstree.h" + +extern unsigned long getauxval(unsigned long type); #define assign_reg(dst, src, e) dst->e = (__typeof__(dst->e))(src)->e -int save_task_regs(void *x, user_regs_struct_t *regs, user_fpregs_struct_t *fpsimd) +static int save_pac_keys(int pid, CoreEntry *core) +{ + struct user_pac_address_keys paca; + struct user_pac_generic_keys pacg; + PacKeys *pac_entry; + long pac_enabled_key; + struct iovec iov; + int ret; + + unsigned long hwcaps = getauxval(AT_HWCAP); + + pac_entry = xmalloc(sizeof(PacKeys)); + if (!pac_entry) + return -1; + core->ti_aarch64->pac_keys = pac_entry; + pac_keys__init(pac_entry); + + if (hwcaps & HWCAP_PACA) { + PacAddressKeys *pac_address_keys; + + pr_debug("%d: Dumping address authentication keys\n", pid); + iov.iov_base = &paca; + iov.iov_len = sizeof(paca); + if ((ret = ptrace(PTRACE_GETREGSET, pid, NT_ARM_PACA_KEYS, 
&iov))) { + pr_perror("Failed to get address authentication key for %d", pid); + return -1; + } + pac_address_keys = xmalloc(sizeof(PacAddressKeys)); + if (!pac_address_keys) + return -1; + pac_address_keys__init(pac_address_keys); + pac_entry->pac_address_keys = pac_address_keys; + pac_address_keys->apiakey_lo = paca.apiakey; + pac_address_keys->apiakey_hi = paca.apiakey >> 64; + pac_address_keys->apibkey_lo = paca.apibkey; + pac_address_keys->apibkey_hi = paca.apibkey >> 64; + pac_address_keys->apdakey_lo = paca.apdakey; + pac_address_keys->apdakey_hi = paca.apdakey >> 64; + pac_address_keys->apdbkey_lo = paca.apdbkey; + pac_address_keys->apdbkey_hi = paca.apdbkey >> 64; + + iov.iov_base = &pac_enabled_key; + iov.iov_len = sizeof(pac_enabled_key); + ret = ptrace(PTRACE_GETREGSET, pid, NT_ARM_PAC_ENABLED_KEYS, &iov); + if (ret) { + pr_perror("Failed to get authentication key mask for %d", pid); + return -1; + } + + pac_address_keys->pac_enabled_key = pac_enabled_key; + + } + if (hwcaps & HWCAP_PACG) { + PacGenericKeys *pac_generic_keys; + + pr_debug("%d: Dumping generic authentication keys\n", pid); + iov.iov_base = &pacg; + iov.iov_len = sizeof(pacg); + if ((ret = ptrace(PTRACE_GETREGSET, pid, NT_ARM_PACG_KEYS, &iov))) { + pr_perror("Failed to get a generic authantication key for %d", pid); + return -1; + } + pac_generic_keys = xmalloc(sizeof(PacGenericKeys)); + if (!pac_generic_keys) + return -1; + pac_generic_keys__init(pac_generic_keys); + pac_entry->pac_generic_keys = pac_generic_keys; + pac_generic_keys->apgakey_lo = pacg.apgakey; + pac_generic_keys->apgakey_hi = pacg.apgakey >> 64; + } + return 0; +} + +int save_task_regs(pid_t pid, void *x, user_regs_struct_t *regs, user_fpregs_struct_t *fpsimd) { int i; CoreEntry *core = x; @@ -43,6 +120,8 @@ int save_task_regs(void *x, user_regs_struct_t *regs, user_fpregs_struct_t *fpsi assign_reg(core->ti_aarch64->fpsimd, fpsimd, fpsr); assign_reg(core->ti_aarch64->fpsimd, fpsimd, fpcr); + if (save_pac_keys(pid, core)) 
+ return -1; return 0; } @@ -92,6 +171,12 @@ void arch_free_thread_info(CoreEntry *core) xfree(CORE_THREAD_ARCH_INFO(core)->fpsimd->vregs); xfree(CORE_THREAD_ARCH_INFO(core)->fpsimd); } + if (CORE_THREAD_ARCH_INFO(core)->pac_keys) { + PacKeys *pac_entry = CORE_THREAD_ARCH_INFO(core)->pac_keys; + xfree(pac_entry->pac_address_keys); + xfree(pac_entry->pac_generic_keys); + xfree(pac_entry); + } xfree(CORE_THREAD_ARCH_INFO(core)->gpregs->regs); xfree(CORE_THREAD_ARCH_INFO(core)->gpregs); xfree(CORE_THREAD_ARCH_INFO(core)); @@ -135,3 +220,83 @@ int restore_gpregs(struct rt_sigframe *f, UserRegsEntry *r) return 0; } + +int arch_ptrace_restore(int pid, struct pstree_item *item) +{ + unsigned long hwcaps = getauxval(AT_HWCAP); + struct user_pac_address_keys upaca; + struct user_pac_generic_keys upacg; + PacAddressKeys *paca; + PacGenericKeys *pacg; + long pac_enabled_keys; + struct iovec iov; + int ret; + + + pr_debug("%d: Restoring PAC keys\n", pid); + + paca = &rsti(item)->arch_info.pac_address_keys; + pacg = &rsti(item)->arch_info.pac_generic_keys; + if (rsti(item)->arch_info.has_paca) { + if (!(hwcaps & HWCAP_PACA)) { + pr_err("PACG support is required from the source system.\n"); + return 1; + } + pac_enabled_keys = rsti(item)->arch_info.pac_address_keys.pac_enabled_key; + + upaca.apiakey = paca->apiakey_lo + ((__uint128_t)paca->apiakey_hi << 64); + upaca.apibkey = paca->apibkey_lo + ((__uint128_t)paca->apibkey_hi << 64); + upaca.apdakey = paca->apdakey_lo + ((__uint128_t)paca->apdakey_hi << 64); + upaca.apdbkey = paca->apdbkey_lo + ((__uint128_t)paca->apdbkey_hi << 64); + + iov.iov_base = &upaca; + iov.iov_len = sizeof(upaca); + + if ((ret = ptrace(PTRACE_SETREGSET, pid, NT_ARM_PACA_KEYS, &iov))) { + pr_perror("Failed to set address authentication keys for %d", pid); + return 1; + } + iov.iov_base = &pac_enabled_keys; + iov.iov_len = sizeof(pac_enabled_keys); + if ((ret = ptrace(PTRACE_SETREGSET, pid, NT_ARM_PAC_ENABLED_KEYS, &iov))) { + pr_perror("Failed to set 
enabled key mask for %d", pid); + return 1; + } + } + + if (rsti(item)->arch_info.has_pacg) { + if (!(hwcaps & HWCAP_PACG)) { + pr_err("PACG support is required from the source system.\n"); + return 1; + } + upacg.apgakey = pacg->apgakey_lo + ((__uint128_t)pacg->apgakey_hi << 64); + iov.iov_base = &upacg; + iov.iov_len = sizeof(upacg); + if ((ret = ptrace(PTRACE_SETREGSET, pid, NT_ARM_PACG_KEYS, &iov))) { + pr_perror("Failed to set the generic authentication key for %d", pid); + return 1; + } + } + + return 0; +} + +void arch_rsti_init(struct pstree_item *p) +{ + PacKeys *pac_keys = p->core[0]->ti_aarch64->pac_keys; + + rsti(p)->arch_info.has_paca = false; + rsti(p)->arch_info.has_pacg = false; + + if (!pac_keys) + return; + + if (pac_keys->pac_address_keys) { + rsti(p)->arch_info.has_paca = true; + rsti(p)->arch_info.pac_address_keys = *pac_keys->pac_address_keys; + } + if (pac_keys->pac_generic_keys) { + rsti(p)->arch_info.has_pacg = true; + rsti(p)->arch_info.pac_generic_keys = *pac_keys->pac_generic_keys; + } +} diff --git a/criu/arch/aarch64/include/asm/dump.h b/criu/arch/aarch64/include/asm/dump.h index 90cd8bca8c..ecab061c30 100644 --- a/criu/arch/aarch64/include/asm/dump.h +++ b/criu/arch/aarch64/include/asm/dump.h @@ -1,7 +1,7 @@ #ifndef __CR_ASM_DUMP_H__ #define __CR_ASM_DUMP_H__ -extern int save_task_regs(void *, user_regs_struct_t *, user_fpregs_struct_t *); +extern int save_task_regs(pid_t pid, void *, user_regs_struct_t *, user_fpregs_struct_t *); extern int arch_alloc_thread_info(CoreEntry *core); extern void arch_free_thread_info(CoreEntry *core); diff --git a/criu/arch/aarch64/include/asm/restore.h b/criu/arch/aarch64/include/asm/restore.h index 75e87996a5..c79605c40d 100644 --- a/criu/arch/aarch64/include/asm/restore.h +++ b/criu/arch/aarch64/include/asm/restore.h @@ -26,4 +26,14 @@ static inline void core_get_tls(CoreEntry *pcore, tls_t *ptls) int restore_fpu(struct rt_sigframe *sigframe, CoreEntry *core); +#define ARCH_RST_INFO y +struct 
rst_arch_info { + bool has_paca, has_pacg; + PacAddressKeys pac_address_keys; + PacGenericKeys pac_generic_keys; +}; + +int arch_ptrace_restore(int pid, struct pstree_item *item); +void arch_rsti_init(struct pstree_item *current); + #endif diff --git a/criu/arch/arm/crtools.c b/criu/arch/arm/crtools.c index 26b94e1574..6a5e4c89a1 100644 --- a/criu/arch/arm/crtools.c +++ b/criu/arch/arm/crtools.c @@ -22,7 +22,7 @@ #define assign_reg(dst, src, e) dst->e = (__typeof__(dst->e))((src)->ARM_##e) -int save_task_regs(void *x, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs) +int save_task_regs(pid_t pid, void *x, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs) { CoreEntry *core = x; diff --git a/criu/arch/arm/include/asm/dump.h b/criu/arch/arm/include/asm/dump.h index 485986065a..b0ac5715dd 100644 --- a/criu/arch/arm/include/asm/dump.h +++ b/criu/arch/arm/include/asm/dump.h @@ -1,7 +1,7 @@ #ifndef __CR_ASM_DUMP_H__ #define __CR_ASM_DUMP_H__ -extern int save_task_regs(void *, user_regs_struct_t *, user_fpregs_struct_t *); +extern int save_task_regs(pid_t pid, void *, user_regs_struct_t *, user_fpregs_struct_t *); extern int arch_alloc_thread_info(CoreEntry *core); extern void arch_free_thread_info(CoreEntry *core); diff --git a/criu/arch/loongarch64/crtools.c b/criu/arch/loongarch64/crtools.c index eeb0731ca6..783951b5b9 100644 --- a/criu/arch/loongarch64/crtools.c +++ b/criu/arch/loongarch64/crtools.c @@ -29,7 +29,7 @@ #define assign_reg(dst, src, e) (dst)->e = (__typeof__(dst->e))(src)->e -int save_task_regs(void *x, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs) +int save_task_regs(pid_t pid, void *x, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs) { int i; CoreEntry *core = x; diff --git a/criu/arch/loongarch64/include/asm/dump.h b/criu/arch/loongarch64/include/asm/dump.h index 04347155c3..a1c0c4c588 100644 --- a/criu/arch/loongarch64/include/asm/dump.h +++ b/criu/arch/loongarch64/include/asm/dump.h @@ -1,7 +1,7 @@ #ifndef 
__CR_ASM_DUMP_H__ #define __CR_ASM_DUMP_H__ -extern int save_task_regs(void *, user_regs_struct_t *, user_fpregs_struct_t *); +extern int save_task_regs(pid_t pid, void *, user_regs_struct_t *, user_fpregs_struct_t *); extern int arch_alloc_thread_info(CoreEntry *core); extern void arch_free_thread_info(CoreEntry *core); diff --git a/criu/arch/mips/crtools.c b/criu/arch/mips/crtools.c index ed4da9b7ef..eabbd85f43 100644 --- a/criu/arch/mips/crtools.c +++ b/criu/arch/mips/crtools.c @@ -27,7 +27,7 @@ #include "images/core.pb-c.h" #include "images/creds.pb-c.h" -int save_task_regs(void *x, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs) +int save_task_regs(pid_t pid, void *x, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs) { CoreEntry *core = x; diff --git a/criu/arch/mips/include/asm/dump.h b/criu/arch/mips/include/asm/dump.h index 58015833d2..ec59b051bd 100644 --- a/criu/arch/mips/include/asm/dump.h +++ b/criu/arch/mips/include/asm/dump.h @@ -1,7 +1,7 @@ #ifndef __CR_ASM_DUMP_H__ #define __CR_ASM_DUMP_H__ -extern int save_task_regs(void *, user_regs_struct_t *, user_fpregs_struct_t *); +extern int save_task_regs(pid_t pid, void *, user_regs_struct_t *, user_fpregs_struct_t *); extern int arch_alloc_thread_info(CoreEntry *core); extern void arch_free_thread_info(CoreEntry *core); extern int get_task_futex_robust_list_compat(pid_t pid, ThreadCoreEntry *info); diff --git a/criu/arch/ppc64/crtools.c b/criu/arch/ppc64/crtools.c index a08a2ca5bf..d570400087 100644 --- a/criu/arch/ppc64/crtools.c +++ b/criu/arch/ppc64/crtools.c @@ -404,7 +404,7 @@ static int __copy_task_regs(user_regs_struct_t *regs, user_fpregs_struct_t *fpre return 0; } -int save_task_regs(void *arg, user_regs_struct_t *u, user_fpregs_struct_t *f) +int save_task_regs(pid_t pid, void *arg, user_regs_struct_t *u, user_fpregs_struct_t *f) { return __copy_task_regs(u, f, (CoreEntry *)arg); } diff --git a/criu/arch/ppc64/include/asm/dump.h b/criu/arch/ppc64/include/asm/dump.h index 
eb488900a8..7393654fa1 100644 --- a/criu/arch/ppc64/include/asm/dump.h +++ b/criu/arch/ppc64/include/asm/dump.h @@ -1,7 +1,7 @@ #ifndef __CR_ASM_DUMP_H__ #define __CR_ASM_DUMP_H__ -extern int save_task_regs(void *, user_regs_struct_t *, user_fpregs_struct_t *); +extern int save_task_regs(pid_t pid, void *, user_regs_struct_t *, user_fpregs_struct_t *); extern int arch_alloc_thread_info(CoreEntry *core); extern void arch_free_thread_info(CoreEntry *core); diff --git a/criu/arch/riscv64/crtools.c b/criu/arch/riscv64/crtools.c index b2d6d29512..eea98d6de7 100644 --- a/criu/arch/riscv64/crtools.c +++ b/criu/arch/riscv64/crtools.c @@ -23,7 +23,7 @@ #define assign_reg(dst, src, e) dst->e = (__typeof__(dst->e))(src)->e -int save_task_regs(void *x, user_regs_struct_t *regs, user_fpregs_struct_t *fpsimd) +int save_task_regs(pid_t pid, void *x, user_regs_struct_t *regs, user_fpregs_struct_t *fpsimd) { int i; CoreEntry *core = x; diff --git a/criu/arch/riscv64/include/asm/dump.h b/criu/arch/riscv64/include/asm/dump.h index c2988f9bf6..4f0a2d209d 100644 --- a/criu/arch/riscv64/include/asm/dump.h +++ b/criu/arch/riscv64/include/asm/dump.h @@ -1,7 +1,7 @@ #ifndef __CR_ASM_DUMP_H__ #define __CR_ASM_DUMP_H__ -extern int save_task_regs(void *, user_regs_struct_t *, user_fpregs_struct_t *); +extern int save_task_regs(pid_t pid, void *, user_regs_struct_t *, user_fpregs_struct_t *); extern int arch_alloc_thread_info(CoreEntry *core); extern void arch_free_thread_info(CoreEntry *core); diff --git a/criu/arch/s390/crtools.c b/criu/arch/s390/crtools.c index 5cf160d823..96cef819e3 100644 --- a/criu/arch/s390/crtools.c +++ b/criu/arch/s390/crtools.c @@ -282,7 +282,7 @@ static void free_ri_cb(UserS390RiEntry *ri_cb) /* * Copy internal structures into Google Protocol Buffers */ -int save_task_regs(void *arg, user_regs_struct_t *u, user_fpregs_struct_t *f) +int save_task_regs(pid_t pid, void *arg, user_regs_struct_t *u, user_fpregs_struct_t *f) { UserS390VxrsHighEntry *vxrs_high = NULL; 
UserS390VxrsLowEntry *vxrs_low = NULL; diff --git a/criu/arch/s390/include/asm/dump.h b/criu/arch/s390/include/asm/dump.h index c200724d77..5a24c5b3dc 100644 --- a/criu/arch/s390/include/asm/dump.h +++ b/criu/arch/s390/include/asm/dump.h @@ -1,7 +1,7 @@ #ifndef __CR_ASM_DUMP_H__ #define __CR_ASM_DUMP_H__ -int save_task_regs(void *arg, user_regs_struct_t *u, user_fpregs_struct_t *f); +int save_task_regs(pid_t pid, void *arg, user_regs_struct_t *u, user_fpregs_struct_t *f); int arch_alloc_thread_info(CoreEntry *core); void arch_free_thread_info(CoreEntry *core); diff --git a/criu/arch/x86/crtools.c b/criu/arch/x86/crtools.c index e068a9a020..1f4d0736b2 100644 --- a/criu/arch/x86/crtools.c +++ b/criu/arch/x86/crtools.c @@ -15,7 +15,7 @@ #define XSAVE_PB_NELEMS(__s, __obj, __member) (sizeof(__s) / sizeof(*(__obj)->__member)) -int save_task_regs(void *x, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs) +int save_task_regs(pid_t pid, void *x, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs) { CoreEntry *core = x; UserX86RegsEntry *gpregs = core->thread_info->gpregs; diff --git a/criu/arch/x86/include/asm/compat.h b/criu/arch/x86/include/asm/compat.h index 867357fa28..4ca704fd7c 100644 --- a/criu/arch/x86/include/asm/compat.h +++ b/criu/arch/x86/include/asm/compat.h @@ -11,6 +11,8 @@ #include +#include "log.h" + static inline void *alloc_compat_syscall_stack(void) { void *mem = (void *)sys_mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_32BIT | MAP_ANONYMOUS | MAP_PRIVATE, diff --git a/criu/arch/x86/include/asm/dump.h b/criu/arch/x86/include/asm/dump.h index 192f6bd029..925ea91ff1 100644 --- a/criu/arch/x86/include/asm/dump.h +++ b/criu/arch/x86/include/asm/dump.h @@ -1,7 +1,7 @@ #ifndef __CR_ASM_DUMP_H__ #define __CR_ASM_DUMP_H__ -extern int save_task_regs(void *, user_regs_struct_t *, user_fpregs_struct_t *); +extern int save_task_regs(pid_t pid, void *, user_regs_struct_t *, user_fpregs_struct_t *); extern int arch_alloc_thread_info(CoreEntry *core); 
extern void arch_free_thread_info(CoreEntry *core); extern int get_task_futex_robust_list_compat(pid_t pid, ThreadCoreEntry *info); diff --git a/criu/cr-restore.c b/criu/cr-restore.c index e906da0cea..1f4881dab0 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -17,6 +17,7 @@ #include #include #include +#include #include "types.h" #include @@ -1707,6 +1708,9 @@ static int restore_task_with_children(void *_arg) arg); } +int __attribute((weak)) arch_ptrace_restore(int pid, struct pstree_item *item); +int arch_ptrace_restore(int pid, struct pstree_item *item) { return 0; } + static int attach_to_tasks(bool root_seized) { struct pstree_item *item; @@ -1747,6 +1751,8 @@ static int attach_to_tasks(bool root_seized) pr_perror("Unable to set PTRACE_O_TRACESYSGOOD for %d", pid); return -1; } + if (arch_ptrace_restore(pid, item)) + return -1; /* * Suspend seccomp if necessary. We need to do this because * although seccomp is restored at the very end of the @@ -3104,6 +3110,9 @@ static void *restorer_munmap_addr(CoreEntry *core, void *restorer_blob) return restorer_sym(restorer_blob, arch_export_unmap); } +void arch_rsti_init(struct pstree_item *p) __attribute__((weak)); +void arch_rsti_init(struct pstree_item *p) {} + static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, unsigned long alen, CoreEntry *core) { void *mem = MAP_FAILED; @@ -3323,6 +3332,7 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns */ creds_pos_next = creds_pos; siginfo_n = task_args->siginfo_n; + arch_rsti_init(current); for (i = 0; i < current->nr_threads; i++) { CoreEntry *tcore; struct rt_sigframe *sigframe; diff --git a/criu/include/rst_info.h b/criu/include/rst_info.h index 4c9335a738..deb297e5f5 100644 --- a/criu/include/rst_info.h +++ b/criu/include/rst_info.h @@ -1,6 +1,7 @@ #ifndef __CR_RST_INFO_H__ #define __CR_RST_INFO_H__ +#include "asm/restore.h" #include "common/lock.h" #include "common/list.h" #include "vma.h" @@ -33,6 +34,11 
@@ struct rst_rseq { uint64_t rseq_cs_pointer; }; +#ifndef ARCH_RST_INFO +struct rst_arch_info { +}; +#endif + struct rst_info { struct list_head fds; @@ -80,6 +86,8 @@ struct rst_info { futex_t shstk_unlock; void *breakpoint; + + struct rst_arch_info arch_info; }; extern struct task_entries *task_entries; diff --git a/images/core-aarch64.proto b/images/core-aarch64.proto index 3356e6b757..64b0ee9fb7 100644 --- a/images/core-aarch64.proto +++ b/images/core-aarch64.proto @@ -17,9 +17,32 @@ message user_aarch64_fpsimd_context_entry { required uint32 fpcr = 3; } +message pac_address_keys { + required uint64 apiakey_lo = 1; + required uint64 apiakey_hi = 2; + required uint64 apibkey_lo = 3; + required uint64 apibkey_hi = 4; + required uint64 apdakey_lo = 5; + required uint64 apdakey_hi = 6; + required uint64 apdbkey_lo = 7; + required uint64 apdbkey_hi = 8; + required uint64 pac_enabled_key = 9; +} + +message pac_generic_keys { + required uint64 apgakey_lo = 1; + required uint64 apgakey_hi = 2; +} + +message pac_keys { + optional pac_address_keys pac_address_keys = 6; + optional pac_generic_keys pac_generic_keys = 7; +} + message thread_info_aarch64 { required uint64 clear_tid_addr = 1[(criu).hex = true]; required uint64 tls = 2; required user_aarch64_regs_entry gpregs = 3[(criu).hex = true]; required user_aarch64_fpsimd_context_entry fpsimd = 4; + optional pac_keys pac_keys = 5; } From ce6bb4fd9e80de362f62a9e234b9315ea3503da6 Mon Sep 17 00:00:00 2001 From: AV Date: Mon, 3 Mar 2025 20:09:05 +0000 Subject: [PATCH 078/198] test/zdtm: check that PAC keys are C/R-ed Add another variation of pthread00 compiled with enabled branch-protection. 
Signed-off-by: Andrei Vagin --- test/zdtm/static/Makefile | 9 +++++++++ test/zdtm/static/pthread00-pac.c | 1 + 2 files changed, 10 insertions(+) create mode 120000 test/zdtm/static/pthread00-pac.c diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index f72fb2a77f..6a19cad3c2 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -66,6 +66,7 @@ TST_NOFILE := \ pipe01 \ pipe02 \ pthread00 \ + pthread00-pac \ pthread01 \ pthread02 \ pthread_timers \ @@ -497,6 +498,12 @@ STATE_OUT = $(TST_STATE:%=%.out) include ../Makefile.inc +ifeq ($(ARCH),aarch64) + PAC_CFLAGS := -mbranch-protection=standard +else + PAC_CFLAGS := +endif + all: $(TST) criu-rtc.so install: all .PHONY: all install @@ -588,6 +595,8 @@ uptime_grow: LDLIBS += -lrt -pthread unlink_largefile: CFLAGS += -D_FILE_OFFSET_BITS=64 -D_LARGEFILE64_SOURCE inotify_system_nodel: CFLAGS += -DNO_DEL pthread00: LDLIBS += -pthread +pthread00-pac: CFLAGS += ${PAC_CFLAGS} +pthread00-pac: LDLIBS += -pthread pthread01: LDLIBS += -pthread pthread02: LDLIBS += -pthread pthread_timers: LDLIBS += -lrt -pthread diff --git a/test/zdtm/static/pthread00-pac.c b/test/zdtm/static/pthread00-pac.c new file mode 120000 index 0000000000..3ee8dc1f17 --- /dev/null +++ b/test/zdtm/static/pthread00-pac.c @@ -0,0 +1 @@ +pthread00.c \ No newline at end of file From 12bb0de6f9b6ee6504410c9370634a79d2ccb2ba Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Wed, 12 Mar 2025 23:46:05 +0000 Subject: [PATCH 079/198] vdso: correct data types for ELF hash table sizes Let's change the data types of `nbucket` and `nchain` to uint32. 
This should fix the following compile-time error on arm32: /criu/criu/pie/util-vdso.c:336: undefined reference to `__aeabi_uldivmod' Signed-off-by: Andrei Vagin --- criu/pie/util-vdso.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/criu/pie/util-vdso.c b/criu/pie/util-vdso.c index af3c089858..8daf5c71f3 100644 --- a/criu/pie/util-vdso.c +++ b/criu/pie/util-vdso.c @@ -302,7 +302,7 @@ static bool elf_symbol_match(uintptr_t mem, size_t size, static unsigned long elf_symbol_lookup(uintptr_t mem, size_t size, const char *symbol, uint32_t symbol_hash, unsigned int sym_off, uintptr_t dynsymbol_names, Dyn_t *dyn_symtab, Phdr_t *load, - uint64_t nbucket, uint64_t nchain, void *_bucket, Hash_t *chain, + uint32_t nbucket, uint32_t nchain, void *_bucket, Hash_t *chain, const size_t vdso_symbol_length, bool use_gnu_hash) { unsigned int j; @@ -360,7 +360,7 @@ static int parse_elf_symbols(uintptr_t mem, size_t size, Phdr_t *load, void *bucket = NULL; Hash_t *chain = NULL; - uint64_t nbucket, nchain = 0; + uint32_t nbucket, nchain = 0; unsigned int sym_off = 0; unsigned int i = 0; From 327685c2a5647be6f5e47f0e9837c03cba27a49f Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Sun, 16 Mar 2025 22:23:14 +0000 Subject: [PATCH 080/198] zdtm/vdso02: unmap vvar_vclock mappings It is a part of vvar and this test intends to unmap vdso and all vvar mappings. 
Fixes #2622 Signed-off-by: Andrei Vagin --- test/zdtm/static/vdso02.c | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/test/zdtm/static/vdso02.c b/test/zdtm/static/vdso02.c index 2050bca718..5779b7fd6d 100644 --- a/test/zdtm/static/vdso02.c +++ b/test/zdtm/static/vdso02.c @@ -29,7 +29,8 @@ static int parse_vm_area(char *buf, struct vm_area *vma) return -1; } -static int find_blobs(pid_t pid, struct vm_area *vdso, struct vm_area *vvar) +static int find_blobs(pid_t pid, struct vm_area *vdso, + struct vm_area *vvar, struct vm_area *vvar_vclock) { char buf[BUF_SZ]; int ret = -1; @@ -39,6 +40,8 @@ static int find_blobs(pid_t pid, struct vm_area *vdso, struct vm_area *vvar) vdso->end = VDSO_BAD_ADDR; vvar->start = VVAR_BAD_ADDR; vvar->end = VVAR_BAD_ADDR; + vvar_vclock->start = VVAR_BAD_ADDR; + vvar_vclock->end = VVAR_BAD_ADDR; if (snprintf(buf, BUF_SZ, "/proc/%d/maps", pid) < 0) { pr_perror("snprintf() failure for path"); @@ -57,12 +60,18 @@ static int find_blobs(pid_t pid, struct vm_area *vdso, struct vm_area *vvar) if (strstr(buf, "[vvar]") && parse_vm_area(buf, vvar)) goto err; + if (strstr(buf, "[vvar_vclock]") && + parse_vm_area(buf, vvar_vclock)) + goto err; } if (vdso->start != VDSO_BAD_ADDR) test_msg("[vdso] %lx-%lx\n", vdso->start, vdso->end); if (vvar->start != VVAR_BAD_ADDR) test_msg("[vvar] %lx-%lx\n", vvar->start, vvar->end); + if (vvar_vclock->start != VVAR_BAD_ADDR) + test_msg("[vvar_vclock] %lx-%lx\n", + vvar_vclock->start, vvar_vclock->end); ret = 0; err: fclose(maps); @@ -143,10 +152,10 @@ void sys_exit(int status) static int unmap_blobs(void) { - struct vm_area vdso, vvar; + struct vm_area vdso, vvar, vvar_vclock; int ret; - if (find_blobs(getpid(), &vdso, &vvar)) + if (find_blobs(getpid(), &vdso, &vvar, &vvar_vclock)) return -1; if (vdso.start != VDSO_BAD_ADDR) { @@ -159,13 +168,19 @@ static int unmap_blobs(void) if (ret) return ret; } + if (vvar_vclock.start != VVAR_BAD_ADDR) { + ret = 
sys_munmap((void *)vvar_vclock.start, + vvar_vclock.end - vvar_vclock.start); + if (ret) + return ret; + } return 0; } int main(int argc, char *argv[]) { - struct vm_area vdso, vvar; + struct vm_area vdso, vvar, vvar_vclock; pid_t child; int status, ret = -1; @@ -201,9 +216,11 @@ int main(int argc, char *argv[]) goto out_kill; } - if (find_blobs(child, &vdso, &vvar)) + if (find_blobs(child, &vdso, &vvar, &vvar_vclock)) goto out_kill; - if (vdso.start != VDSO_BAD_ADDR || vvar.start != VVAR_BAD_ADDR) { + if (vdso.start != VDSO_BAD_ADDR || + vvar.start != VVAR_BAD_ADDR || + vvar_vclock.start != VVAR_BAD_ADDR) { pr_err("Found vvar or vdso blob(s) in child, which should have unmapped them\n"); goto out_kill; } @@ -211,7 +228,7 @@ int main(int argc, char *argv[]) test_daemon(); test_waitsig(); - if (find_blobs(child, &vdso, &vvar)) + if (find_blobs(child, &vdso, &vvar, &vvar_vclock)) goto out_kill; if (vdso.start != VDSO_BAD_ADDR || vvar.start != VVAR_BAD_ADDR) { pr_err("Child without vdso got it after C/R\n"); From 3eccf09be6ed793b52e2ceac96074d052c6a3c4a Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Tue, 4 Feb 2025 15:50:02 +0000 Subject: [PATCH 081/198] make: allow setting the default network locking backend As different Linux distributions are switching away from iptables to nftables, this makes it easier to compile CRIU with a different default network locking backend. 
Instead of changing the source code it is now possible to select the nft backend like this: make NETWORK_LOCK_DEFAULT=NETWORK_LOCK_NFTABLES Signed-off-by: Adrian Reber --- Makefile | 4 ++++ criu/include/cr_options.h | 8 ++++++++ 2 files changed, 12 insertions(+) diff --git a/Makefile b/Makefile index 90908de837..5d8e89ac1b 100644 --- a/Makefile +++ b/Makefile @@ -140,6 +140,10 @@ ifneq ($(GCOV),) CFLAGS += $(CFLAGS-GCOV) endif +ifneq ($(NETWORK_LOCK_DEFAULT),) + CFLAGS += -DNETWORK_LOCK_DEFAULT=$(NETWORK_LOCK_DEFAULT) +endif + ifeq ($(ASAN),1) CFLAGS-ASAN := -fsanitize=address export CFLAGS-ASAN diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h index 60cf9437e6..ab0bd8fa36 100644 --- a/criu/include/cr_options.h +++ b/criu/include/cr_options.h @@ -70,7 +70,15 @@ enum NETWORK_LOCK_METHOD { NETWORK_LOCK_SKIP, }; +/** + * CRIU currently defaults to the iptables locking backend. + * + * It is, however, possible to change this by defining + * NETWORK_LOCK_DEFAULT to a different value on the command-line. + */ +#ifndef NETWORK_LOCK_DEFAULT #define NETWORK_LOCK_DEFAULT NETWORK_LOCK_IPTABLES +#endif /* * Ghost file size we allow to carry by default. From c9d3bfe3e81a208cd5a20ebc35739b642d62bb64 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Tue, 4 Feb 2025 15:56:27 +0000 Subject: [PATCH 082/198] docs: update INSTALL.md with a section about building CRIU The building section also contains the information how to change the network locking backend without source code changes. Signed-off-by: Adrian Reber --- INSTALL.md | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/INSTALL.md b/INSTALL.md index d786d06eb6..76ace5b023 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -1,3 +1,23 @@ +## Building CRIU from source code + +First, you need to install compile-time dependencies. Check [Installation dependencies](https://criu.org/Installation#Dependencies) for more info. 
+ +To compile CRIU, run: +``` +make +``` +This should create the `./criu/criu` executable. + +To change the default behaviour of CRIU, the following variables can be passed +to the make command: + + * **NETWORK_LOCK_DEFAULT**, can be set to one of the following + values: `NETWORK_LOCK_IPTABLES`, `NETWORK_LOCK_NFTABLES`, + `NETWORK_LOCK_SKIP`. CRIU defaults to `NETWORK_LOCK_IPTABLES` + if nothing is specified. If another network locking backend is + needed, `make` can be called like this: + `make NETWORK_LOCK_DEFAULT=NETWORK_LOCK_NFTABLES` + ## Installing CRIU from source code Once CRIU is built one can easily setup the complete CRIU package From 1f326df401dd13f3203008967cfca05048a35442 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Tue, 4 Feb 2025 15:57:47 +0000 Subject: [PATCH 083/198] docs: mark make commands with same format as elsewhere This uses the same formatting for the make command examples as seen in README.md. Signed-off-by: Adrian Reber --- INSTALL.md | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/INSTALL.md b/INSTALL.md index 76ace5b023..af07025186 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -23,9 +23,9 @@ to the make command: Once CRIU is built one can easily setup the complete CRIU package (which includes executable itself, CRIT tool, libraries, manual and etc) simply typing - - make install - +``` +make install +``` this command accepts the following variables: * **DESTDIR**, to specify global root where all components will be placed under (empty by default); @@ -36,17 +36,17 @@ this command accepts the following variables: * **LIBDIR**, to specify directory where to put libraries (guess the correct path by default). Thus one can type - - make DESTDIR=/some/new/place install - +``` +make DESTDIR=/some/new/place install +``` and get everything installed under `/some/new/place`. 
## Uninstalling CRIU To clean up previously installed CRIU instance one can type - - make uninstall - +``` +make uninstall +``` and everything should be removed. Note though that if some variable (**DESTDIR**, **BINDIR** and such) has been used during installation procedure, the same *must* be passed with uninstall action. From 26068cfee9519ad4a2e4c0aee5a28befa1ed2f36 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Thu, 6 Feb 2025 07:34:26 +0000 Subject: [PATCH 084/198] test: others/rpc do not use nftables locking backend The tests in others/rpc are running as non-root and fail silently if the nftables network locking backend is used. This switches those tests to skip the network locking. Signed-off-by: Adrian Reber --- test/others/rpc/errno.py | 2 ++ test/others/rpc/ps_test.py | 1 + test/others/rpc/run.sh | 2 +- test/others/rpc/test-c.c | 2 ++ test/others/rpc/test.py | 1 + 5 files changed, 7 insertions(+), 1 deletion(-) diff --git a/test/others/rpc/errno.py b/test/others/rpc/errno.py index b600b6d1c4..4ea6c9d441 100755 --- a/test/others/rpc/errno.py +++ b/test/others/rpc/errno.py @@ -67,6 +67,7 @@ def no_process(self): req = self.get_base_req() req.type = rpc.DUMP req.opts.pid = pid + req.opts.network_lock = rpc.SKIP self.send_req(req) resp = self.recv_resp() @@ -84,6 +85,7 @@ def process_exists(self): req = self.get_base_req() req.type = rpc.DUMP req.opts.leave_running = True + req.opts.network_lock = rpc.SKIP self.send_req(req) resp = self.recv_resp() diff --git a/test/others/rpc/ps_test.py b/test/others/rpc/ps_test.py index daeda49bce..259f22e775 100755 --- a/test/others/rpc/ps_test.py +++ b/test/others/rpc/ps_test.py @@ -23,6 +23,7 @@ req.opts.log_file = 'page-server.log' req.opts.log_level = 4 req.opts.images_dir_fd = os.open(args['dir'], os.O_DIRECTORY) +req.opts.network_lock = rpc.SKIP s.send(req.SerializeToString()) diff --git a/test/others/rpc/run.sh b/test/others/rpc/run.sh index afd4fb5e33..3d5a53ae66 100755 --- a/test/others/rpc/run.sh +++ 
b/test/others/rpc/run.sh @@ -51,7 +51,7 @@ function test_restore_loop { title_print "Dump loop process" # So theoretically '-j' (--shell-job) should not be necessary, but on alpine # this test fails without it. - ${CRIU} dump -j -v4 -o dump-loop.log -D build/imgs_loop -t ${P} + ${CRIU} dump -j -v4 -o dump-loop.log --network-lock skip -D build/imgs_loop -t ${P} title_print "Run restore-loop" ./restore-loop.py build/criu_service.socket build/imgs_loop diff --git a/test/others/rpc/test-c.c b/test/others/rpc/test-c.c index 792dbbf9c9..b3507975f8 100644 --- a/test/others/rpc/test-c.c +++ b/test/others/rpc/test-c.c @@ -99,6 +99,8 @@ int main(int argc, char *argv[]) req.opts->images_dir_fd = dir_fd; req.opts->has_log_level = true; req.opts->log_level = 4; + req.opts->has_network_lock = true; + req.opts->network_lock = CRIU_NETWORK_LOCK_METHOD__SKIP; /* * Connect to service socket diff --git a/test/others/rpc/test.py b/test/others/rpc/test.py index ce8411bc60..6f692f7557 100755 --- a/test/others/rpc/test.py +++ b/test/others/rpc/test.py @@ -24,6 +24,7 @@ req.opts.leave_running = True req.opts.log_level = 4 req.opts.images_dir_fd = os.open(args['dir'], os.O_DIRECTORY) +req.opts.network_lock = rpc.SKIP # Send request s.send(req.SerializeToString()) From 6d8898c4e76b9c71009762d038a6a09cc54e2c4c Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Thu, 6 Feb 2025 09:57:52 +0000 Subject: [PATCH 085/198] test: print out logs if tests fail If the tests in others/rpc are failing no information about that error can be seen in a CI run. This change displays the log files if the test fails. Signed-off-by: Adrian Reber --- test/others/rpc/Makefile | 10 +++++++++- test/others/rpc/run.sh | 10 ++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/test/others/rpc/Makefile b/test/others/rpc/Makefile index 69537bb0d3..b2f907abee 100644 --- a/test/others/rpc/Makefile +++ b/test/others/rpc/Makefile @@ -8,9 +8,17 @@ PYTHON ?= python3 run: all @make -C .. 
loop - mkdir -p build + mkdir -p build/{imgs_errno,imgs_ps,imgs_c,imgs_loop,imgs_py} chmod a+rwx build + chmod a+rwx build/{imgs_errno,imgs_ps,imgs_c,imgs_loop,imgs_py} rm -f build/status + @# Create all log files to be accessible for anybody + @# so that they can be displayed by any user. + for i in imgs_errno/criu.log imgs_ps/page-server.log imgs_ps/dump.log \ + imgs_c/restore-c.log imgs_loop/criu.log imgs_loop/dump-loop.log \ + imgs_py/criu.log imgs_py/restore-py.log imgs_c/criu.log service.log; do \ + touch build/$$i; chmod 666 build/$$i; \ + done sudo -g '#1000' -u '#1000' mkfifo build/status @# Need to start the criu daemon here to access the pidfile. @# The script read.py is used to wait until 'criu service' diff --git a/test/others/rpc/run.sh b/test/others/rpc/run.sh index 3d5a53ae66..b6158dfea5 100755 --- a/test/others/rpc/run.sh +++ b/test/others/rpc/run.sh @@ -3,6 +3,7 @@ set -e CRIU=./criu +FAIL=1 export PROTODIR=`readlink -f "${PWD}/../../protobuf"` @@ -19,6 +20,13 @@ function stop_server { title_print "Shutdown service server" kill -SIGTERM $(cat build/pidfile) unlink build/pidfile + if [ "${FAIL}" == "1" ]; then + for i in build/output*; do + echo "File: $i" + cat $i + done + find . -name "*.log" -print -exec cat {} \; || true + fi } function test_c { @@ -80,6 +88,8 @@ test_restore_loop test_ps test_errno +FAIL=0 + stop_server trap 'echo "Success"' EXIT From 5d87fac0e0022e0bad2a594005e8813f6a0fdcb0 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Wed, 5 Feb 2025 07:51:38 +0000 Subject: [PATCH 086/198] ci: do not run tests requiring iptables if it is missing There are a couple of tests that require the iptables binary. Instead of adding a checkskip script, which could also handle this, this change now uses CRIU's feature detection to see if the CRIU feature 'has_ipt_legacy' exists. 
Signed-off-by: Adrian Reber --- test/zdtm/static/net_lock_socket_iptables.desc | 1 + test/zdtm/static/net_lock_socket_iptables6.desc | 1 + test/zdtm/static/netns-nf.desc | 1 + test/zdtm/static/netns_lock_iptables.desc | 1 + test/zdtm/static/socket-tcp-closed-last-ack.desc | 2 +- test/zdtm/static/socket-tcp-reseted.desc | 2 +- test/zdtm/static/socket-tcp-syn-sent.desc | 2 +- 7 files changed, 7 insertions(+), 3 deletions(-) diff --git a/test/zdtm/static/net_lock_socket_iptables.desc b/test/zdtm/static/net_lock_socket_iptables.desc index 936ff87027..cb622536f9 100644 --- a/test/zdtm/static/net_lock_socket_iptables.desc +++ b/test/zdtm/static/net_lock_socket_iptables.desc @@ -1,5 +1,6 @@ { 'flavor': 'h', + 'feature': 'has_ipt_legacy', 'flags': 'suid excl reqrst', 'dopts': '--tcp-established --network-lock iptables', 'ropts': '--tcp-established', diff --git a/test/zdtm/static/net_lock_socket_iptables6.desc b/test/zdtm/static/net_lock_socket_iptables6.desc index 936ff87027..cb622536f9 100644 --- a/test/zdtm/static/net_lock_socket_iptables6.desc +++ b/test/zdtm/static/net_lock_socket_iptables6.desc @@ -1,5 +1,6 @@ { 'flavor': 'h', + 'feature': 'has_ipt_legacy', 'flags': 'suid excl reqrst', 'dopts': '--tcp-established --network-lock iptables', 'ropts': '--tcp-established', diff --git a/test/zdtm/static/netns-nf.desc b/test/zdtm/static/netns-nf.desc index c99696d1cf..58c23e8ba8 100644 --- a/test/zdtm/static/netns-nf.desc +++ b/test/zdtm/static/netns-nf.desc @@ -3,4 +3,5 @@ '/lib/xtables/libxt_standard.so|/usr/lib64/xtables/libxt_standard.so|/usr/lib/iptables/libxt_standard.so|/usr/lib/powerpc64le-linux-gnu/xtables/libxt_standard.so|/usr/lib/x86_64-linux-gnu/xtables/libxt_standard.so|/usr/lib/s390x-linux-gnu/xtables/libxt_standard.so|/usr/lib/xtables/libxt_standard.so|/usr/lib/aarch64-linux-gnu/xtables/libxt_standard.so|/usr/lib/riscv64-linux-gnu/xtables/libxt_standard.so', '/usr/bin/diff'], 'flags': 'suid', + 'feature': 'has_ipt_legacy', 'flavor': 'ns uns'} diff --git 
a/test/zdtm/static/netns_lock_iptables.desc b/test/zdtm/static/netns_lock_iptables.desc index 69020f34ed..b465706b82 100644 --- a/test/zdtm/static/netns_lock_iptables.desc +++ b/test/zdtm/static/netns_lock_iptables.desc @@ -1,6 +1,7 @@ { 'flavor': 'h', 'flags': 'suid excl reqrst', + 'feature': 'has_ipt_legacy', 'opts': '--tcp-established', 'dopts': '--network-lock iptables', 'ropts': '--join-ns net:/var/run/netns/criu-net-lock-test' diff --git a/test/zdtm/static/socket-tcp-closed-last-ack.desc b/test/zdtm/static/socket-tcp-closed-last-ack.desc index 309854fa53..c77d58477d 100644 --- a/test/zdtm/static/socket-tcp-closed-last-ack.desc +++ b/test/zdtm/static/socket-tcp-closed-last-ack.desc @@ -5,6 +5,6 @@ ], 'opts': '--tcp-established', 'flags': 'suid nouser samens', - 'feature' : 'tcp_half_closed', + 'feature' : 'tcp_half_closed has_ipt_legacy', 'flavor': 'ns uns', } diff --git a/test/zdtm/static/socket-tcp-reseted.desc b/test/zdtm/static/socket-tcp-reseted.desc index 4aa48ad874..ff92e9f9f8 100644 --- a/test/zdtm/static/socket-tcp-reseted.desc +++ b/test/zdtm/static/socket-tcp-reseted.desc @@ -6,5 +6,5 @@ ], 'opts': '--tcp-established', 'flags': 'suid nouser samens', - 'feature' : 'tcp_half_closed' + 'feature' : 'tcp_half_closed has_ipt_legacy' } diff --git a/test/zdtm/static/socket-tcp-syn-sent.desc b/test/zdtm/static/socket-tcp-syn-sent.desc index 71cd26d727..52382414bd 100644 --- a/test/zdtm/static/socket-tcp-syn-sent.desc +++ b/test/zdtm/static/socket-tcp-syn-sent.desc @@ -5,5 +5,5 @@ ], 'opts': '--tcp-established', 'flags': 'suid nouser samens', - 'feature' : 'tcp_half_closed' + 'feature' : 'tcp_half_closed has_ipt_legacy' } From d18912fc88f3dc7bde5fdfa3575691977eb21753 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Tue, 4 Feb 2025 16:10:02 +0000 Subject: [PATCH 087/198] ci: run tests on a nftables only system Signed-off-by: Adrian Reber --- .github/workflows/nftables-test.yml | 24 ++++++++++++++++++++++++ scripts/ci/run-ci-tests.sh | 13 ++++++++++++- 2 files 
changed, 36 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/nftables-test.yml diff --git a/.github/workflows/nftables-test.yml b/.github/workflows/nftables-test.yml new file mode 100644 index 0000000000..eb3d8e8141 --- /dev/null +++ b/.github/workflows/nftables-test.yml @@ -0,0 +1,24 @@ +name: Nftables bases testing + +on: [push, pull_request] + +# Cancel any preceding run on the pull request. +concurrency: + group: nftables-test-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: ${{ github.ref != 'refs/heads/criu-dev' }} + +jobs: + build: + runs-on: ubuntu-24.04 + steps: + - uses: actions/checkout@v4 + - name: Remove iptables + run: sudo apt remove -y iptables + - name: Install libnftables-dev + run: sudo scripts/ci/apt-install libnftables-dev + - name: chmod 755 /home/runner + # CRIU's tests are sometimes running as some random user and need + # to be able to access the test files. + run: sudo chmod 755 /home/runner + - name: Build with nftables network locking backend + run: sudo make -C scripts/ci local COMPILE_FLAGS="NETWORK_LOCK_DEFAULT=NETWORK_LOCK_NFTABLES" diff --git a/scripts/ci/run-ci-tests.sh b/scripts/ci/run-ci-tests.sh index 611ff78037..0c4a089757 100755 --- a/scripts/ci/run-ci-tests.sh +++ b/scripts/ci/run-ci-tests.sh @@ -39,6 +39,10 @@ ci_prep () { # This can fail on aarch64 travis service apport stop || : + # Ubuntu has set up AppArmor in 24.04 so that it blocks use of user + # namespaces by unprivileged users. We need this for some of our tests. 
+ sysctl kernel.apparmor_restrict_unprivileged_userns=0 || : + if [ "$CLANG" = "1" ]; then # clang support CC=clang @@ -121,8 +125,14 @@ if [ "${CD_TO_TOP}" = "1" ]; then fi export GCOV CC +if [ -z "$COMPILE_FLAGS" ]; then + LOCAL_COMPILE_FLAGS=("V=1") +else + IFS=" " read -r -a LOCAL_COMPILE_FLAGS <<< "$COMPILE_FLAGS" + LOCAL_COMPILE_FLAGS=("V=1" "${LOCAL_COMPILE_FLAGS[@]}") +fi $CC --version -time make CC="$CC" -j4 V=1 +time make CC="$CC" -j4 "${LOCAL_COMPILE_FLAGS[@]}" ./criu/criu -v4 cpuinfo dump || : ./criu/criu -v4 cpuinfo check || : @@ -150,6 +160,7 @@ ulimit -c unlimited cgid=$$ cleanup_cgroup() { ./test/zdtm_umount_cgroups $cgid + dmesg } trap cleanup_cgroup EXIT ./test/zdtm_mount_cgroups $cgid From 25f7185202f394377bd68d498530fcd73ce35111 Mon Sep 17 00:00:00 2001 From: Bui Quang Minh Date: Wed, 19 Mar 2025 23:19:31 +0700 Subject: [PATCH 088/198] namespace: skip cleaning up the uid/gid map in error cases free_userns_maps is called to clean up uid/gid map when the dump finishes. If we try to clean up these maps in error cases, it can lead to double free panic. So just skip cleaning up these maps and let free_userns_maps do its job. Signed-off-by: Bui Quang Minh --- criu/namespaces.c | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/criu/namespaces.c b/criu/namespaces.c index b7c0ab4008..0c9b16a87d 100644 --- a/criu/namespaces.c +++ b/criu/namespaces.c @@ -1009,36 +1009,31 @@ int dump_user_ns(pid_t pid, int ns_id) ret = parse_id_map(pid, "uid_map", &e->uid_map); if (ret < 0) - goto err; + /* + * The uid_map and gid_map is clean up in free_userns_maps + * later, so we don't need to clean these up in error cases. 
+ */ + return -1; + e->n_uid_map = ret; ret = parse_id_map(pid, "gid_map", &e->gid_map); if (ret < 0) - goto err; + return -1; e->n_gid_map = ret; if (check_user_ns(pid)) - goto err; + return -1; img = open_image(CR_FD_USERNS, O_DUMP, ns_id); if (!img) - goto err; + return -1; ret = pb_write_one(img, e, PB_USERNS); close_image(img); if (ret < 0) - goto err; + return -1; return 0; -err: - if (e->uid_map) { - xfree(e->uid_map[0]); - xfree(e->uid_map); - } - if (e->gid_map) { - xfree(e->gid_map[0]); - xfree(e->gid_map); - } - return -1; } void free_userns_maps(void) From 2a428d20ce17da5c84f1edb3461ca86ad36f3023 Mon Sep 17 00:00:00 2001 From: Ivan Pravdin Date: Sat, 22 Mar 2025 19:31:02 -0400 Subject: [PATCH 089/198] criu: fix log_keep_err signal deadlock When using pr_err in signal handler, locking is used in an unsafe manner. If another signal happens while holding the lock, deadlock can happen. To fix this, we can introduce mutex_trylock similar to pthread_mutex_trylock that returns immediately. Due to the fact that lock is used only for writing first_err, this change guarantees that deadlock cannot happen. Fixes: #358 Signed-off-by: Ivan Pravdin --- criu/log.c | 9 +++++---- include/common/lock.h | 6 ++++++ 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/criu/log.c b/criu/log.c index 89ae8f8205..70e267fd65 100644 --- a/criu/log.c +++ b/criu/log.c @@ -132,10 +132,11 @@ static void log_note_err(char *msg) * anyway, so it doesn't make much sense to try hard * and optimize this out. 
*/ - mutex_lock(&first_err->l); - if (first_err->s[0] == '\0') - __strlcpy(first_err->s, msg, sizeof(first_err->s)); - mutex_unlock(&first_err->l); + if (mutex_trylock(&first_err->l)) { + if (first_err->s[0] == '\0') + __strlcpy(first_err->s, msg, sizeof(first_err->s)); + mutex_unlock(&first_err->l); + } } } diff --git a/include/common/lock.h b/include/common/lock.h index ccfa468b83..4733d72870 100644 --- a/include/common/lock.h +++ b/include/common/lock.h @@ -2,6 +2,7 @@ #define __CR_COMMON_LOCK_H__ #include +#include #include #include #include @@ -162,6 +163,11 @@ static inline void mutex_lock(mutex_t *m) } } +static inline bool mutex_trylock(mutex_t *m) +{ + return atomic_inc_return(&m->raw) == 1; +} + static inline void mutex_unlock(mutex_t *m) { uint32_t c = 0; From 1cab647b37d1b4318127bd3baa0c2bd19cea7019 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Sat, 18 Jan 2025 13:43:15 +0000 Subject: [PATCH 090/198] ci: try GitHub arm runners Signed-off-by: Adrian Reber --- .github/workflows/actuated-aarch64-test.yaml | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/.github/workflows/actuated-aarch64-test.yaml b/.github/workflows/actuated-aarch64-test.yaml index 8b0a63fc7b..567746a5f4 100644 --- a/.github/workflows/actuated-aarch64-test.yaml +++ b/.github/workflows/actuated-aarch64-test.yaml @@ -1,4 +1,4 @@ -name: Actuated aarch64 test +name: aarch64 test on: [push, pull_request] @@ -11,32 +11,38 @@ jobs: build: # Actuated runners are not available in all repositories. if: ${{ github.repository == 'checkpoint-restore/criu' }} - # The memory size and the number of CPUs can be freely selected. - # 3GB and 4 CPUs seems to be enough according to the result from 'vmmeter'. - runs-on: actuated-arm64-4cpu-3gb + # The memory size and the number of CPUs can be freely selected for + # the actuated runners. 3GB and 4 CPUs seems to be enough according to the + # result from 'vmmeter'. 
+ runs-on: ${{ matrix.os }} strategy: + fail-fast: false matrix: + os: [actuated-arm64-4cpu-3gb, ubuntu-24.04-arm] target: [GCC=1, CLANG=1] steps: # https://gist.github.com/alexellis/1f33e581c75e11e161fe613c46180771#file-metering-gha-md # vmmeter start - name: Prepare arkade + if: ${{ matrix.os == 'actuated-arm64-4cpu-3gb' }} uses: alexellis/arkade-get@master with: crane: latest print-summary: false - name: Install vmmeter + if: ${{ matrix.os == 'actuated-arm64-4cpu-3gb' }} run: | crane export --platform linux/arm64 ghcr.io/openfaasltd/vmmeter:latest | sudo tar -xvf - -C /usr/local/bin - name: Run vmmeter + if: ${{ matrix.os == 'actuated-arm64-4cpu-3gb' }} uses: self-actuated/vmmeter-action@master # vmmeter end - uses: actions/checkout@v4 - - name: Run Tests ${{ matrix.target }} + - name: Run Tests ${{ matrix.target }}/${{ matrix.os }} # Following tests are failing on the actuated VMs: # ./change_mnt_context --pidfile=change_mnt_context.pid --outfile=change_mnt_context.out # 45: ERR: change_mnt_context.c:23: mount (errno = 22 (Invalid argument)) From 4249c11213c61c94b80ce202edae695550356166 Mon Sep 17 00:00:00 2001 From: Yuanhong Peng Date: Wed, 2 Apr 2025 18:48:12 +0800 Subject: [PATCH 091/198] criu: Do not print failed message when there is no late stage hook This is highly confusing, and it seems that the ret variable is not handled in the subsequent process. Signed-off-by: Yuanhong Peng --- criu/cr-restore.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/criu/cr-restore.c b/criu/cr-restore.c index 1f4881dab0..583b446e0b 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -2258,7 +2258,7 @@ static int restore_root_task(struct pstree_item *init) * might actually be a true error code but that would be also * captured in the plugin so no need to print the error here. 
*/ - if (ret < 0) + if (ret < 0 && ret != -ENOTSUP) pr_debug("restore late stage hook for external plugin failed\n"); } From 24ea8befcc67fe091898cf6c458dba66b80c5db7 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Wed, 2 Apr 2025 21:13:16 +0000 Subject: [PATCH 092/198] compel: fix the stack test The stack test incorrectly assumed the page immediately following the stack pointer could never be changed. This doesn't work, because this page can be a part of another mapping. This commit introduces a dedicated "stack redzone," a small guard region directly after the stack. The stack test is modified to specifically check for corruption within this redzone. Signed-off-by: Andrei Vagin --- compel/include/uapi/infect.h | 9 +++ compel/src/lib/infect.c | 6 +- compel/test/stack/spy.c | 113 +---------------------------------- 3 files changed, 12 insertions(+), 116 deletions(-) diff --git a/compel/include/uapi/infect.h b/compel/include/uapi/infect.h index ed97d64dd6..1f61876ffb 100644 --- a/compel/include/uapi/infect.h +++ b/compel/include/uapi/infect.h @@ -13,6 +13,15 @@ #define PARASITE_START_AREA_MIN (4096) +#define PARASITE_STACK_SIZE (16 << 10) +/* + * A stack redzone is a small, protected region of memory located immediately + * after a parasite stack. It is intended to remain unchanged. While it can be + * implemented as a guard page, we want to avoid the overhead of additional + * remote system calls. 
+ */ +#define PARASITE_STACK_REDZONE 128 + extern int __must_check compel_interrupt_task(int pid); struct seize_task_status { diff --git a/compel/src/lib/infect.c b/compel/src/lib/infect.c index a9bbd64004..4ea27bc633 100644 --- a/compel/src/lib/infect.c +++ b/compel/src/lib/infect.c @@ -38,8 +38,6 @@ #define UNIX_PATH_MAX (sizeof(struct sockaddr_un) - (size_t)((struct sockaddr_un *)0)->sun_path) #endif -#define PARASITE_STACK_SIZE (16 << 10) - #ifndef SECCOMP_MODE_DISABLED #define SECCOMP_MODE_DISABLED 0 #endif @@ -1064,7 +1062,7 @@ int compel_infect_no_daemon(struct parasite_ctl *ctl, unsigned long nr_threads, p += RESTORE_STACK_SIGFRAME; p += PARASITE_STACK_SIZE; - ctl->rstack = ctl->remote_map + p; + ctl->rstack = ctl->remote_map + p - PARASITE_STACK_REDZONE; /* * x86-64 ABI requires a 16 bytes aligned stack. @@ -1078,7 +1076,7 @@ int compel_infect_no_daemon(struct parasite_ctl *ctl, unsigned long nr_threads, if (nr_threads > 1) { p += PARASITE_STACK_SIZE; - ctl->r_thread_stack = ctl->remote_map + p; + ctl->r_thread_stack = ctl->remote_map + p - PARASITE_STACK_REDZONE; } ret = arch_fetch_sas(ctl, ctl->rsigframe); diff --git a/compel/test/stack/spy.c b/compel/test/stack/spy.c index 9b7c9a7f09..184c8ab318 100644 --- a/compel/test/stack/spy.c +++ b/compel/test/stack/spy.c @@ -50,70 +50,6 @@ static void *get_parasite_rstack_start(struct parasite_ctl *ctl) return rstack_start; } -static int page_writable(struct parasite_ctl *ctl, int pid, void *page) -{ - FILE *maps; - size_t maps_line_len = 0; - char *maps_line = NULL; - char victim_maps_path[6 + 11 + 5 + 1]; - int written; - int ret = 0; - - if (((uintptr_t)page & (page_size() - 1)) != 0) { - fprintf(stderr, "Page address not aligned\n"); - ret = -1; - goto done; - } - - written = snprintf(victim_maps_path, sizeof(victim_maps_path), "/proc/%d/maps", pid); - if (written < 0 || written >= sizeof(victim_maps_path)) { - fprintf(stderr, "Failed to create path string to victim's /proc/%d/maps file\n", pid); - ret = -1; 
- goto done; - } - - maps = fopen(victim_maps_path, "r"); - if (maps == NULL) { - perror("Can't open victim's /proc/$pid/maps"); - ret = -1; - goto done; - } - - while (getline(&maps_line, &maps_line_len, maps) != -1) { - unsigned long vmstart, vmend; - char r, w; - - if (sscanf(maps_line, "%lx-%lx %c%c", &vmstart, &vmend, &r, &w) < 4) { - fprintf(stderr, "Can't parse victim's /proc/%d/maps; line: %s\n", pid, maps_line); - ret = -1; - goto free_linebuf; - } - - if (page >= (void *)vmstart && page < (void *)vmend) { - if (w == 'w') { - if (r != 'r') { - fprintf(stderr, "Expecting writable memory to also be readable"); - ret = -1; - goto free_linebuf; - } - ret = 1; - } - break; - } - } - - if (errno) { - perror("Can't read victim's /proc/$pid/maps"); - ret = -1; - } - -free_linebuf: - free(maps_line); - fclose(maps); -done: - return ret; -} - static void *read_proc_mem(int pid, void *offset, size_t len) { char victim_mem_path[6 + 11 + 4 + 1]; @@ -153,51 +89,6 @@ static void *read_proc_mem(int pid, void *offset, size_t len) return NULL; } -static int save_data_near_stack(struct parasite_ctl *ctl, int pid, void *stack, void **saved_data, - size_t *saved_data_size) -{ - size_t page_mask = page_size() - 1; - size_t saved_size = 0; - size_t stack_size_last_page = (uintptr_t)stack & page_mask; - void *next_page = stack; - - if (stack_size_last_page != 0) { - size_t empty_space_last_page = page_size() - stack_size_last_page; - saved_size = min(empty_space_last_page, (size_t)SAVED_DATA_MAX); - next_page += page_size() - stack_size_last_page; - } - - while (saved_size < SAVED_DATA_MAX && next_page != NULL) { - switch (page_writable(ctl, pid, next_page)) { - case 1: - saved_size = min((size_t)(saved_size + page_size()), (size_t)SAVED_DATA_MAX); - next_page += page_size(); - break; - case 0: - next_page = NULL; - break; - default: - return -1; - } - } - - if (saved_size > 0) { - void *sd; - - sd = read_proc_mem(pid, stack, saved_size); - if (sd == NULL) - return -1; - - 
*saved_data = sd; - } else { - *saved_data = NULL; - } - - *saved_data_size = saved_size; - - return 0; -} - static int check_saved_data(struct parasite_ctl *ctl, int pid, void *stack, void *saved_data, size_t saved_data_size) { if (saved_data != NULL) { @@ -221,7 +112,7 @@ static int do_infection(int pid) struct infect_ctx *ictx; int *arg; void *stack; - size_t saved_data_size; + size_t saved_data_size = PARASITE_STACK_REDZONE; int saved_data_check; compel_log_init(print_vmsg, COMPEL_LOG_DEBUG); @@ -257,8 +148,6 @@ static int do_infection(int pid) err_and_ret("Can't register cleanup function with atexit\n"); stack = get_parasite_rstack_start(ctl); - if (save_data_near_stack(ctl, pid, stack, &saved_data, &saved_data_size)) - err_and_ret("Can't save data above stack\n"); if (compel_start_daemon(ctl)) err_and_ret("Can't start daemon in victim\n"); From 5cea5b6d3f1f3a0e2b4b33f1caf573c4c626b3b7 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Thu, 27 Mar 2025 14:21:03 +0000 Subject: [PATCH 093/198] restore: use the new kernel interface to restore timers Thomas Gleixner introduced the new interface to create posix timers with specified timer IDs: https://github.com/torvalds/linux/commit/ec2d0c04624b3c8a7eb1682e006717fa20cfbe24 Previously, CRIU recreated timers by repeatedly creating and deleting them until the desired ID was reached. This approach isn't fast, especially for timers with large IDs. For example, restoring two timers with IDs 1000000 and 2000000 took approximately 1.5 seconds. The new `prctl()` based interface allows direct creation of timers with specified IDs, reducing the restoration time to around 3 microseconds for the same example.
Signed-off-by: Andrei Vagin --- criu/cr-check.c | 10 ++++++++ criu/include/kerndat.h | 1 + criu/include/prctl.h | 7 ++++++ criu/include/restorer.h | 1 + criu/kerndat.c | 20 +++++++++++++++ criu/pie/restorer.c | 54 +++++++++++++++++++++++++++++++++++++---- criu/timer.c | 2 ++ 7 files changed, 90 insertions(+), 5 deletions(-) diff --git a/criu/cr-check.c b/criu/cr-check.c index 0388cbe7fe..7b4a6415a5 100644 --- a/criu/cr-check.c +++ b/criu/cr-check.c @@ -1392,6 +1392,14 @@ static int check_pagemap_scan(void) return 0; } +static int check_timer_cr_ids(void) +{ + if (!kdat.has_timer_cr_ids) + return -1; + + return 0; +} + /* musl doesn't have a statx wrapper... */ struct staty { __u32 stx_dev_major; @@ -1703,6 +1711,7 @@ int cr_check(void) ret |= check_ipv6_freebind(); ret |= check_pagemap_scan(); ret |= check_overlayfs_maps(); + ret |= check_timer_cr_ids(); if (kdat.lsm == LSMTYPE__APPARMOR) ret |= check_apparmor_stacking(); @@ -1825,6 +1834,7 @@ static struct feature_list feature_list[] = { { "get_rseq_conf", check_ptrace_get_rseq_conf }, { "ipv6_freebind", check_ipv6_freebind }, { "pagemap_scan", check_pagemap_scan }, + { "timer_cr_ids", check_timer_cr_ids }, { "overlayfs_maps", check_overlayfs_maps }, { NULL, NULL }, }; diff --git a/criu/include/kerndat.h b/criu/include/kerndat.h index e03a573419..bd8744d62b 100644 --- a/criu/include/kerndat.h +++ b/criu/include/kerndat.h @@ -89,6 +89,7 @@ struct kerndat_s { bool has_pagemap_scan; bool has_shstk; bool has_close_range; + bool has_timer_cr_ids; }; extern struct kerndat_s kdat; diff --git a/criu/include/prctl.h b/criu/include/prctl.h index f5f23c9692..2966659dad 100644 --- a/criu/include/prctl.h +++ b/criu/include/prctl.h @@ -97,4 +97,11 @@ struct prctl_mm_map { #define PR_GET_THP_DISABLE 42 #endif +#ifndef PR_TIMER_CREATE_RESTORE_IDS +#define PR_TIMER_CREATE_RESTORE_IDS 77 +# define PR_TIMER_CREATE_RESTORE_IDS_OFF 0 +# define PR_TIMER_CREATE_RESTORE_IDS_ON 1 +# define PR_TIMER_CREATE_RESTORE_IDS_GET 2 +#endif + 
#endif /* __CR_PRCTL_H__ */ diff --git a/criu/include/restorer.h b/criu/include/restorer.h index a4fb7ea794..56bea0fcc0 100644 --- a/criu/include/restorer.h +++ b/criu/include/restorer.h @@ -170,6 +170,7 @@ struct task_restore_args { struct restore_posix_timer *posix_timers; unsigned int posix_timers_n; + bool posix_timer_cr_ids; struct restore_timerfd *timerfd; unsigned int timerfd_n; diff --git a/criu/kerndat.c b/criu/kerndat.c index 5939005a41..930117b0a4 100644 --- a/criu/kerndat.c +++ b/criu/kerndat.c @@ -1720,6 +1720,22 @@ static int kerndat_has_close_range(void) return 0; } +static int kerndat_has_timer_cr_ids(void) +{ + if (prctl(PR_TIMER_CREATE_RESTORE_IDS, + PR_TIMER_CREATE_RESTORE_IDS_GET, 0, 0, 0) == -1) { + if (errno == EINVAL) { + pr_debug("PR_TIMER_CREATE_RESTORE_IDS isn't supported\n"); + return 0; + } + pr_perror("prctl returned unexpected error code"); + return -1; + } + + kdat.has_timer_cr_ids = true; + return 0; +} + /* * Some features depend on resource that can be dynamically changed * at the OS runtime. There are cases that we cannot determine the @@ -1981,6 +1997,10 @@ int kerndat_init(void) pr_err("kerndat_has_close_range has failed when initializing kerndat.\n"); ret = -1; } + if (!ret && kerndat_has_timer_cr_ids()) { + pr_err("kerndat_has_timer_cr_ids has failed when initializing kerndat.\n"); + ret = -1; + } kerndat_lsm(); kerndat_mmap_min_addr(); diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c index 348ce6659b..9867a3ddd5 100644 --- a/criu/pie/restorer.c +++ b/criu/pie/restorer.c @@ -1235,9 +1235,23 @@ static int timerfd_arm(struct task_restore_args *args) static int create_posix_timers(struct task_restore_args *args) { - int ret, i; + int ret, i, exit_code = -1; kernel_timer_t next_id = 0, timer_id; struct sigevent sev; + bool create_restore_ids = false; + + if (!args->posix_timers_n) + return 0; + + /* prctl returns EINVAL if PR_TIMER_CREATE_RESTORE_IDS isn't supported. 
*/ + ret = sys_prctl(PR_TIMER_CREATE_RESTORE_IDS, + PR_TIMER_CREATE_RESTORE_IDS_ON, 0, 0, 0); + if (ret == 0) { + create_restore_ids = true; + } else if (ret != -EINVAL) { + pr_err("Can't enabled PR_TIMER_CREATE_RESTORE_IDS: %d\n", ret); + return -1; + } for (i = 0; i < args->posix_timers_n; i++) { sev.sigev_notify = args->posix_timers[i].spt.it_sigev_notify; @@ -1249,16 +1263,36 @@ static int create_posix_timers(struct task_restore_args *args) #endif sev.sigev_value.sival_ptr = args->posix_timers[i].spt.sival_ptr; + if (create_restore_ids) { + /* + * With enabled PR_TIMER_CREATE_RESTORE_IDS, the + * timer_create syscall creates a new timer with the + * specified ID. + */ + timer_id = args->posix_timers[i].spt.it_id; + ret = sys_timer_create(args->posix_timers[i].spt.clock_id, &sev, &timer_id); + if (ret < 0) { + pr_err("Can't create posix timer - %d: %d\n", i, ret); + goto out; + } + if (timer_id != args->posix_timers[i].spt.it_id) { + pr_err("Unexpected timer id %u (expected %lu)\n", + timer_id, args->posix_timers[i].spt.it_id); + goto out; + } + continue; + } + while (1) { ret = sys_timer_create(args->posix_timers[i].spt.clock_id, &sev, &timer_id); if (ret < 0) { pr_err("Can't create posix timer - %d\n", i); - return ret; + goto out; } if (timer_id != next_id) { pr_err("Can't create timers, kernel don't give them consequently\n"); - return -1; + goto out; } next_id++; @@ -1268,12 +1302,22 @@ static int create_posix_timers(struct task_restore_args *args) ret = sys_timer_delete(timer_id); if (ret < 0) { pr_err("Can't remove temporaty posix timer 0x%x\n", timer_id); - return ret; + goto out; } } } - return 0; + exit_code = 0; +out: + if (create_restore_ids) { + ret = sys_prctl(PR_TIMER_CREATE_RESTORE_IDS, + PR_TIMER_CREATE_RESTORE_IDS_OFF, 0, 0, 0); + if (ret != 0) { + pr_err("Can't disable PR_TIMER_CREATE_RESTORE_IDS: %d\n", ret); + exit_code = -1; + } + } + return exit_code; } static void restore_posix_timers(struct task_restore_args *args) diff --git 
a/criu/timer.c b/criu/timer.c index 0413e2a720..856501be6b 100644 --- a/criu/timer.c +++ b/criu/timer.c @@ -195,6 +195,7 @@ int prepare_posix_timers_from_fd(int pid, struct task_restore_args *ta) if (!img) return -1; + ta->posix_timer_cr_ids = kdat.has_timer_cr_ids; ta->posix_timers_n = 0; while (1) { PosixTimerEntry *pte; @@ -234,6 +235,7 @@ int prepare_posix_timers(int pid, struct task_restore_args *ta, CoreEntry *core) return prepare_posix_timers_from_fd(pid, ta); ta->posix_timers_n = tte->n_posix; + ta->posix_timer_cr_ids = kdat.has_timer_cr_ids; for (i = 0; i < ta->posix_timers_n; i++) { t = rst_mem_alloc(sizeof(struct restore_posix_timer), RM_PRIVATE); if (!t) From 3284a835299b1458a64720f20a7baf50887269a2 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Thu, 10 Apr 2025 20:56:23 +0100 Subject: [PATCH 094/198] cuda: use pr_perror for libc function errors When handling errors for functions such as `ptrace()`, `pipe()`, and `fork()` it would be better to use `pr_perror` instead of `pr_err` as it would include a message describing the encountered error.
Signed-off-by: Radostin Stoyanov --- plugins/cuda/cuda_plugin.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/plugins/cuda/cuda_plugin.c b/plugins/cuda/cuda_plugin.c index 99e4caf743..1aaad6842b 100644 --- a/plugins/cuda/cuda_plugin.c +++ b/plugins/cuda/cuda_plugin.c @@ -93,7 +93,7 @@ static int launch_cuda_checkpoint(const char **args, char *buf, int buf_size) int fd[2], buf_off; if (pipe(fd) != 0) { - pr_err("Couldn't create pipes for reading cuda-checkpoint output\n"); + pr_perror("Couldn't create pipes for reading cuda-checkpoint output"); return -1; } @@ -101,7 +101,7 @@ static int launch_cuda_checkpoint(const char **args, char *buf, int buf_size) int child_pid = fork(); if (child_pid == -1) { - pr_err("Failed to fork to exec cuda-checkpoint\n"); + pr_perror("Failed to fork to exec cuda-checkpoint"); close(fd[READ]); close(fd[WRITE]); return -1; @@ -166,7 +166,6 @@ static int launch_cuda_checkpoint(const char **args, char *buf, int buf_size) } if (WIFSIGNALED(status)) { int sig = WTERMSIG(status); - pr_err("cuda-checkpoint unexpectedly signaled with %d: %s\n", sig, strsignal(sig)); } else if (WIFEXITED(status)) { exit_code = WEXITSTATUS(status); @@ -283,8 +282,8 @@ static int interrupt_restore_thread(int restore_tid, k_rtsigset_t *restore_sigse * a compel_interrupt_task() */ if (ptrace(PTRACE_INTERRUPT, restore_tid, NULL, 0)) { - pr_err("Could not interrupt cuda restore tid %d after checkpoint, process may be in strange state\n", - restore_tid); + pr_perror("Could not interrupt cuda restore tid %d after checkpoint, process may be in strange state", + restore_tid); return -1; } @@ -295,12 +294,12 @@ static int interrupt_restore_thread(int restore_tid, k_rtsigset_t *restore_sigse } if (ptrace(PTRACE_SETOPTIONS, restore_tid, NULL, PTRACE_O_SUSPEND_SECCOMP | PTRACE_O_TRACESYSGOOD)) { - pr_err("Failed to set ptrace options on interrupt for restore tid %d\n", restore_tid); + pr_perror("Failed to set ptrace options on 
interrupt for restore tid %d", restore_tid); return -1; } if (ptrace(PTRACE_SETSIGMASK, restore_tid, sizeof(*restore_sigset), restore_sigset)) { - pr_err("Unable to restore original sigmask to restore tid %d\n", restore_tid); + pr_perror("Unable to restore original sigmask to restore tid %d", restore_tid); return -1; } @@ -312,7 +311,7 @@ static int resume_restore_thread(int restore_tid, k_rtsigset_t *save_sigset) k_rtsigset_t block; if (ptrace(PTRACE_GETSIGMASK, restore_tid, sizeof(*save_sigset), save_sigset)) { - pr_err("Failed to get current sigmask for restore tid %d\n", restore_tid); + pr_perror("Failed to get current sigmask for restore tid %d", restore_tid); return -1; } @@ -320,18 +319,18 @@ static int resume_restore_thread(int restore_tid, k_rtsigset_t *save_sigset) ksigdelset(&block, SIGTRAP); if (ptrace(PTRACE_SETSIGMASK, restore_tid, sizeof(block), &block)) { - pr_err("Failed to block signals on restore tid %d\n", restore_tid); + pr_perror("Failed to block signals on restore tid %d", restore_tid); return -1; } // Clear out PTRACE_O_SUSPEND_SECCOMP when we resume the restore thread if (ptrace(PTRACE_SETOPTIONS, restore_tid, NULL, 0)) { - pr_err("Could not clear ptrace options on restore tid %d\n", restore_tid); + pr_perror("Could not clear ptrace options on restore tid %d", restore_tid); return -1; } if (ptrace(PTRACE_CONT, restore_tid, NULL, 0)) { - pr_err("Could not resume cuda restore tid %d\n", restore_tid); + pr_perror("Could not resume cuda restore tid %d", restore_tid); return -1; } From bc3e4e880ad2423b7c8dff7d3a1d465ef2ea6624 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Thu, 10 Apr 2025 21:14:05 +0100 Subject: [PATCH 095/198] cuda: remove redundant goto label The `goto interrupt` label is unnecessary as the code directly returns after `cuda_process_checkpoint_action()`. 
Signed-off-by: Radostin Stoyanov --- plugins/cuda/cuda_plugin.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/plugins/cuda/cuda_plugin.c b/plugins/cuda/cuda_plugin.c index 1aaad6842b..9ccb042249 100644 --- a/plugins/cuda/cuda_plugin.c +++ b/plugins/cuda/cuda_plugin.c @@ -395,12 +395,9 @@ int cuda_plugin_checkpoint_devices(int pid) status = cuda_process_checkpoint_action(pid, ACTION_CHECKPOINT, 0, msg_buf, sizeof(msg_buf)); if (status) { pr_err("CHECKPOINT_DEVICES failed with %s\n", msg_buf); - goto interrupt; } -interrupt: int_ret = interrupt_restore_thread(restore_tid, &save_sigset); - return status != 0 ? -1 : int_ret; } CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__CHECKPOINT_DEVICES, cuda_plugin_checkpoint_devices); From 5d5a1e15f3aaffa64262366162888e9275a38dfd Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Wed, 9 Apr 2025 13:25:44 +0000 Subject: [PATCH 096/198] aarch64: fix build with missing NT_ARM_PAC_ENABLED_KEYS On a RHEL 8 based system building CRIU fails with: criu/arch/aarch64/crtools.c: In function 'save_pac_keys': criu/arch/aarch64/crtools.c:73:39: error: 'NT_ARM_PAC_ENABLED_KEYS' undeclared (first use in this function); did you mean 'NT_ARM_PACA_KEYS'? ret = ptrace(PTRACE_GETREGSET, pid, NT_ARM_PAC_ENABLED_KEYS, &iov); ^~~~~~~~~~~~~~~~~~~~~~~ NT_ARM_PACA_KEYS criu/arch/aarch64/crtools.c:73:39: note: each undeclared identifier is reported only once for each function it appears in criu/arch/aarch64/crtools.c: In function 'arch_ptrace_restore': criu/arch/aarch64/crtools.c:261:44: error: 'NT_ARM_PAC_ENABLED_KEYS' undeclared (first use in this function); did you mean 'NT_ARM_PACA_KEYS'? if ((ret = ptrace(PTRACE_SETREGSET, pid, NT_ARM_PAC_ENABLED_KEYS, &iov))) { ^~~~~~~~~~~~~~~~~~~~~~~ NT_ARM_PACA_KEYS This adds the missing define if it is undefined. 
Signed-off-by: Adrian Reber --- criu/arch/aarch64/crtools.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/criu/arch/aarch64/crtools.c b/criu/arch/aarch64/crtools.c index 6cde03ee38..c077dd06bc 100644 --- a/criu/arch/aarch64/crtools.c +++ b/criu/arch/aarch64/crtools.c @@ -23,6 +23,10 @@ #include "compel/infect.h" #include "pstree.h" +#ifndef NT_ARM_PAC_ENABLED_KEYS +#define NT_ARM_PAC_ENABLED_KEYS 0x40a /* AArch64 pointer authentication enabled keys. */ +#endif + extern unsigned long getauxval(unsigned long type); #define assign_reg(dst, src, e) dst->e = (__typeof__(dst->e))(src)->e From 622b4ed448a07707593a04d4cfa7e005596188c7 Mon Sep 17 00:00:00 2001 From: Younes Manton Date: Tue, 23 Jan 2024 08:22:07 -0800 Subject: [PATCH 097/198] s390: Fix FP reg restore after parasite code runs Currently we save FP regs before parasite code runs, and restore after for --leave-running, --check-only, and in case of errors. In case of errors the error may have happened before FP regs were saved, so we should only restore them if they were actually saved. 
Signed-off-by: Younes Manton --- criu/arch/s390/crtools.c | 90 +++++++++++++++++++++++----------------- 1 file changed, 52 insertions(+), 38 deletions(-) diff --git a/criu/arch/s390/crtools.c b/criu/arch/s390/crtools.c index 96cef819e3..e08c838783 100644 --- a/criu/arch/s390/crtools.c +++ b/criu/arch/s390/crtools.c @@ -142,6 +142,29 @@ static void print_core_fp_regs(const char *msg, CoreEntry *core) print_core_ri_cb(core); } +/* + * Allocate floating point registers + */ +static UserS390FpregsEntry *allocate_fp_regs(void) +{ + UserS390FpregsEntry *fpregs; + + fpregs = xmalloc(sizeof(*fpregs)); + if (!fpregs) + return NULL; + user_s390_fpregs_entry__init(fpregs); + + fpregs->n_fprs = 16; + fpregs->fprs = xzalloc(16 * sizeof(uint64_t)); + if (!fpregs->fprs) + goto fail_free_fpregs; + return fpregs; + +fail_free_fpregs: + xfree(fpregs); + return NULL; +} + /* * Allocate VxrsLow registers */ @@ -294,7 +317,13 @@ int save_task_regs(pid_t pid, void *arg, user_regs_struct_t *u, user_fpregs_stru CoreEntry *core = arg; gpregs = CORE_THREAD_ARCH_INFO(core)->gpregs; - fpregs = CORE_THREAD_ARCH_INFO(core)->fpregs; + /* + * We delay allocating this until now because checkpointing can fail earlier. + * When it fails we need to know if we reached here or not so that the cleanup + * code doesn't restore FPRs that were never saved in the first place. 
+ */ + fpregs = allocate_fp_regs(); + CORE_THREAD_ARCH_INFO(core)->fpregs = fpregs; /* Vector registers */ if (f->flags & USER_FPREGS_VXRS) { @@ -399,36 +428,15 @@ int restore_fpu(struct rt_sigframe *f, CoreEntry *core) return 0; } -/* - * Allocate floating point registers - */ -static UserS390FpregsEntry *allocate_fp_regs(void) -{ - UserS390FpregsEntry *fpregs; - - fpregs = xmalloc(sizeof(*fpregs)); - if (!fpregs) - return NULL; - user_s390_fpregs_entry__init(fpregs); - - fpregs->n_fprs = 16; - fpregs->fprs = xzalloc(16 * sizeof(uint64_t)); - if (!fpregs->fprs) - goto fail_free_fpregs; - return fpregs; - -fail_free_fpregs: - xfree(fpregs); - return NULL; -} - /* * Free floating point registers */ static void free_fp_regs(UserS390FpregsEntry *fpregs) { - xfree(fpregs->fprs); - xfree(fpregs); + if (fpregs) { + xfree(fpregs->fprs); + xfree(fpregs); + } } /* @@ -487,15 +495,17 @@ int arch_alloc_thread_info(CoreEntry *core) ti_s390->gpregs = allocate_gp_regs(); if (!ti_s390->gpregs) goto fail_free_ti_s390; - ti_s390->fpregs = allocate_fp_regs(); - if (!ti_s390->fpregs) - goto fail_free_gp_regs; + + /* + * Delay allocating space until needed. Checkpointing can fail before that + * and the cleanup code needs to be able to tell if FPRs were saved or not + * before trying to restore the register state. + */ + ti_s390->fpregs = NULL; CORE_THREAD_ARCH_INFO(core) = ti_s390; return 0; -fail_free_gp_regs: - free_gp_regs(ti_s390->gpregs); fail_free_ti_s390: xfree(ti_s390); return -1; @@ -678,14 +688,18 @@ static int set_task_regs(pid_t pid, CoreEntry *core) user_fpregs_struct_t fpregs; memset(&fpregs, 0, sizeof(fpregs)); - /* Floating point registers */ + /* + * Floating point registers + * Optional on checkpoint; checkpoint may have failed and we may reach here as part of cleanup + * so there's no guarantee that we saved FPRs for this thread. 
+ */ cfpregs = CORE_THREAD_ARCH_INFO(core)->fpregs; - if (!cfpregs) - return -1; - fpregs.prfpreg.fpc = cfpregs->fpc; - memcpy(fpregs.prfpreg.fprs, cfpregs->fprs, sizeof(fpregs.prfpreg.fprs)); - if (set_fp_regs(pid, &fpregs) < 0) - return -1; + if (cfpregs) { + fpregs.prfpreg.fpc = cfpregs->fpc; + memcpy(fpregs.prfpreg.fprs, cfpregs->fprs, sizeof(fpregs.prfpreg.fprs)); + if (set_fp_regs(pid, &fpregs) < 0) + return -1; + } /* Vector registers (optional) */ cvxrs_low = CORE_THREAD_ARCH_INFO(core)->vxrs_low; if (cvxrs_low != NULL) { From 75eaa65f81fae28d35a11abd53afd434fd08fa4b Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Mon, 21 Apr 2025 06:33:41 +0000 Subject: [PATCH 098/198] net: nftables: avoid restore failure if the CRIU nft table already exist CRIU locks the network during restore in an "empty" network namespace. However, "empty" in this context means CRIU isn't restoring the namespace. This network namespace can be the same namespace where processes have been dumped and so the network is already locked in it. Fixes #2650 Signed-off-by: Andrei Vagin --- criu/cr-restore.c | 2 +- criu/include/net.h | 2 +- criu/net.c | 30 +++++++++++++++++------------- 3 files changed, 19 insertions(+), 15 deletions(-) diff --git a/criu/cr-restore.c b/criu/cr-restore.c index 583b446e0b..30932f60a2 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -2119,7 +2119,7 @@ static int restore_root_task(struct pstree_item *init) * the '--empty-ns net' mode no iptables C/R is done and we * need to return these rules by hands. 
*/ - ret = network_lock_internal(); + ret = network_lock_internal(/* restore = */ true); if (ret) goto out_kill; } diff --git a/criu/include/net.h b/criu/include/net.h index 5e8a848620..7c5ede21e1 100644 --- a/criu/include/net.h +++ b/criu/include/net.h @@ -31,7 +31,7 @@ extern int collect_net_namespaces(bool for_dump); extern int network_lock(void); extern void network_unlock(void); -extern int network_lock_internal(void); +extern int network_lock_internal(bool restore); extern struct ns_desc net_ns_desc; diff --git a/criu/net.c b/criu/net.c index ee46f1c495..300df480b0 100644 --- a/criu/net.c +++ b/criu/net.c @@ -3206,12 +3206,12 @@ static inline FILE *redirect_nftables_output(struct nft_ctx *nft) } #endif -static inline int nftables_lock_network_internal(void) +static inline int nftables_lock_network_internal(bool restore) { #if defined(CONFIG_HAS_NFTABLES_LIB_API_0) || defined(CONFIG_HAS_NFTABLES_LIB_API_1) cleanup_file FILE *fp = NULL; struct nft_ctx *nft; - int ret = 0; + int ret = 0, exit_code = -1; char table[32]; char buf[128]; @@ -3224,11 +3224,16 @@ static inline int nftables_lock_network_internal(void) fp = redirect_nftables_output(nft); if (!fp) - goto out; + goto err2; snprintf(buf, sizeof(buf), "create table %s", table); - if (NFT_RUN_CMD(nft, buf)) + ret = NFT_RUN_CMD(nft, buf); + if (ret) { + /* The network has been locked on dump. 
*/ + if (restore && errno == EEXIST) + return 0; goto err2; + } snprintf(buf, sizeof(buf), "add chain %s output { type filter hook output priority 0; policy drop; }", table); if (NFT_RUN_CMD(nft, buf)) @@ -3246,17 +3251,16 @@ static inline int nftables_lock_network_internal(void) if (NFT_RUN_CMD(nft, buf)) goto err1; - goto out; - + exit_code = 0; +out: + nft_ctx_free(nft); + return exit_code; err1: snprintf(buf, sizeof(buf), "delete table %s", table); NFT_RUN_CMD(nft, buf); err2: - ret = -1; pr_err("Locking network failed using nftables\n"); -out: - nft_ctx_free(nft); - return ret; + goto out; #else pr_err("CRIU was built without libnftables support\n"); return -1; @@ -3288,7 +3292,7 @@ static int iptables_network_lock_internal(void) return ret; } -int network_lock_internal(void) +int network_lock_internal(bool restore) { int ret = 0, nsret; @@ -3301,7 +3305,7 @@ int network_lock_internal(void) if (opts.network_lock_method == NETWORK_LOCK_IPTABLES) ret = iptables_network_lock_internal(); else if (opts.network_lock_method == NETWORK_LOCK_NFTABLES) - ret = nftables_lock_network_internal(); + ret = nftables_lock_network_internal(restore); if (restore_ns(nsret, &net_ns_desc)) ret = -1; @@ -3427,7 +3431,7 @@ int network_lock(void) if (run_scripts(ACT_NET_LOCK)) return -1; - return network_lock_internal(); + return network_lock_internal(false); } void network_unlock(void) From 799504d9ffd52abe8ef2d604112670b29f0339f3 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Mon, 14 Apr 2025 14:12:31 +0100 Subject: [PATCH 099/198] aarch64/crtools: fix define for missing constants Building CRIU package on Debian 11 aarch64 fails with criu/arch/aarch64/crtools.c: In function 'save_pac_keys': criu/arch/aarch64/crtools.c:32:31: error: storage size of 'paca' isn't known struct user_pac_address_keys paca; ^~~~ criu/arch/aarch64/crtools.c:33:31: error: storage size of 'pacg' isn't known struct user_pac_generic_keys pacg; ^~~~ criu/arch/aarch64/crtools.c:47:15: error: 'HWCAP_PACA' 
undeclared (first use in this function); did you mean 'HWCAP_FCMA'? if (hwcaps & HWCAP_PACA) { ^~~~~~~~~~ HWCAP_FCMA criu/arch/aarch64/crtools.c:47:15: note: each undeclared identifier is reported only once for each function it appears in criu/arch/aarch64/crtools.c:53:44: error: 'NT_ARM_PACA_KEYS' undeclared (first use in this function); did you mean 'NT_ARM_SVE'? if ((ret = ptrace(PTRACE_GETREGSET, pid, NT_ARM_PACA_KEYS, &iov))) { ^~~~~~~~~~~~~~~~ NT_ARM_SVE criu/arch/aarch64/crtools.c:73:39: error: 'NT_ARM_PAC_ENABLED_KEYS' undeclared (first use in this function) ret = ptrace(PTRACE_GETREGSET, pid, NT_ARM_PAC_ENABLED_KEYS, &iov); ^~~~~~~~~~~~~~~~~~~~~~~ criu/arch/aarch64/crtools.c:82:15: error: 'HWCAP_PACG' undeclared (first use in this function); did you mean 'HWCAP_AES'? if (hwcaps & HWCAP_PACG) { ^~~~~~~~~~ HWCAP_AES criu/arch/aarch64/crtools.c:88:44: error: 'NT_ARM_PACG_KEYS' undeclared (first use in this function); did you mean 'NT_ARM_SVE'? if ((ret = ptrace(PTRACE_GETREGSET, pid, NT_ARM_PACG_KEYS, &iov))) { ^~~~~~~~~~~~~~~~ NT_ARM_SVE criu/arch/aarch64/crtools.c:33:31: error: unused variable 'pacg' [-Werror=unused-variable] struct user_pac_generic_keys pacg; ^~~~ criu/arch/aarch64/crtools.c:32:31: error: unused variable 'paca' [-Werror=unused-variable] struct user_pac_address_keys paca; ^~~~ criu/arch/aarch64/crtools.c: In function 'arch_ptrace_restore': criu/arch/aarch64/crtools.c:227:31: error: storage size of 'upaca' isn't known struct user_pac_address_keys upaca; ^~~~~ criu/arch/aarch64/crtools.c:228:31: error: storage size of 'upacg' isn't known struct user_pac_generic_keys upacg; ^~~~~ criu/arch/aarch64/crtools.c:241:18: error: 'HWCAP_PACA' undeclared (first use in this function); did you mean 'HWCAP_FCMA'? if (!(hwcaps & HWCAP_PACA)) { ^~~~~~~~~~ HWCAP_FCMA criu/arch/aarch64/crtools.c:255:44: error: 'NT_ARM_PACA_KEYS' undeclared (first use in this function); did you mean 'NT_ARM_SVE'? 
if ((ret = ptrace(PTRACE_SETREGSET, pid, NT_ARM_PACA_KEYS, &iov))) { ^~~~~~~~~~~~~~~~ NT_ARM_SVE criu/arch/aarch64/crtools.c:261:44: error: 'NT_ARM_PAC_ENABLED_KEYS' undeclared (first use in this function) if ((ret = ptrace(PTRACE_SETREGSET, pid, NT_ARM_PAC_ENABLED_KEYS, &iov))) { ^~~~~~~~~~~~~~~~~~~~~~~ criu/arch/aarch64/crtools.c:268:18: error: 'HWCAP_PACG' undeclared (first use in this function); did you mean 'HWCAP_AES'? if (!(hwcaps & HWCAP_PACG)) { ^~~~~~~~~~ HWCAP_AES criu/arch/aarch64/crtools.c:275:44: error: 'NT_ARM_PACG_KEYS' undeclared (first use in this function); did you mean 'NT_ARM_SVE'? if ((ret = ptrace(PTRACE_SETREGSET, pid, NT_ARM_PACG_KEYS, &iov))) { ^~~~~~~~~~~~~~~~ NT_ARM_SVE criu/arch/aarch64/crtools.c:233:6: error: variable 'ret' set but not used [-Werror=unused-but-set-variable] int ret; ^~~ criu/arch/aarch64/crtools.c:228:31: error: unused variable 'upacg' [-Werror=unused-variable] struct user_pac_generic_keys upacg; ^~~~~ criu/arch/aarch64/crtools.c:227:31: error: unused variable 'upaca' [-Werror=unused-variable] struct user_pac_address_keys upaca; ^~~~~ This patch adds the missing constants and structs if undefined. 
Signed-off-by: Radostin Stoyanov --- criu/arch/aarch64/crtools.c | 47 +++++++++++++++++++++++++++++++++---- 1 file changed, 43 insertions(+), 4 deletions(-) diff --git a/criu/arch/aarch64/crtools.c b/criu/arch/aarch64/crtools.c index c077dd06bc..3ed5c9d635 100644 --- a/criu/arch/aarch64/crtools.c +++ b/criu/arch/aarch64/crtools.c @@ -23,6 +23,45 @@ #include "compel/infect.h" #include "pstree.h" +/* + * cr_user_pac_* are a copy of the corresponding uapi structs + * in arch/arm64/include/uapi/asm/ptrace.h + */ +struct cr_user_pac_address_keys { + __uint128_t apiakey; + __uint128_t apibkey; + __uint128_t apdakey; + __uint128_t apdbkey; +}; + +struct cr_user_pac_generic_keys { + __uint128_t apgakey; +}; + +/* + * The following HWCAP constants are copied from + * arch/arm64/include/uapi/asm/hwcap.h + */ +#ifndef HWCAP_PACA +#define HWCAP_PACA (1 << 30) +#endif + +#ifndef HWCAP_PACG +#define HWCAP_PACG (1UL << 31) +#endif + +/* + * The following NT_ARM_PAC constants are copied from + * include/uapi/linux/elf.h + */ +#ifndef NT_ARM_PACA_KEYS +#define NT_ARM_PACA_KEYS 0x407 /* ARM pointer authentication address keys */ +#endif + +#ifndef NT_ARM_PACG_KEYS +#define NT_ARM_PACG_KEYS 0x408 +#endif + #ifndef NT_ARM_PAC_ENABLED_KEYS #define NT_ARM_PAC_ENABLED_KEYS 0x40a /* AArch64 pointer authentication enabled keys. 
*/ #endif @@ -33,8 +72,8 @@ extern unsigned long getauxval(unsigned long type); static int save_pac_keys(int pid, CoreEntry *core) { - struct user_pac_address_keys paca; - struct user_pac_generic_keys pacg; + struct cr_user_pac_address_keys paca; + struct cr_user_pac_generic_keys pacg; PacKeys *pac_entry; long pac_enabled_key; struct iovec iov; @@ -228,8 +267,8 @@ int restore_gpregs(struct rt_sigframe *f, UserRegsEntry *r) int arch_ptrace_restore(int pid, struct pstree_item *item) { unsigned long hwcaps = getauxval(AT_HWCAP); - struct user_pac_address_keys upaca; - struct user_pac_generic_keys upacg; + struct cr_user_pac_address_keys upaca; + struct cr_user_pac_generic_keys upacg; PacAddressKeys *paca; PacGenericKeys *pacg; long pac_enabled_keys; From 6a91ad8f71aa03b8a054aa6e96f7a2d65009b745 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Wed, 2 Apr 2025 12:02:46 +0800 Subject: [PATCH 100/198] mount: restore root mount flags Mount flags belong to mount and mount namespace of the Container, so we should preserve them, as Container user will not expect mounts switching between ro and rw over c/r. Fixes: #2632 v5: fix both mount-v1 and mount-v2 Signed-off-by: Pavel Tikhomirov --- criu/mount-v2.c | 6 ++++++ criu/mount.c | 7 +++++++ 2 files changed, 13 insertions(+) diff --git a/criu/mount-v2.c b/criu/mount-v2.c index 5d53e9a226..eb4dd8119a 100644 --- a/criu/mount-v2.c +++ b/criu/mount-v2.c @@ -443,6 +443,7 @@ static int do_bind_mount_v2(struct mount_info *mi) /* Mounts root container mount. 
*/ static int do_mount_root_v2(struct mount_info *mi) { + unsigned long mflags = mi->flags & (~MS_PROPAGATE); unsigned long flags = MS_BIND; int fd; @@ -477,6 +478,11 @@ static int do_mount_root_v2(struct mount_info *mi) return -1; } + if (mflags && mount(NULL, mi->plain_mountpoint, NULL, MS_REMOUNT | MS_BIND | mflags, NULL)) { + pr_perror("Unable to apply root mount options"); + return -1; + } + mi->mounted = true; return 0; diff --git a/criu/mount.c b/criu/mount.c index 82bbd52d6c..06b9595427 100644 --- a/criu/mount.c +++ b/criu/mount.c @@ -2690,9 +2690,16 @@ static bool can_mount_now(struct mount_info *mi) static int do_mount_root(struct mount_info *mi) { + unsigned long mflags = mi->flags & (~MS_PROPAGATE); + if (restore_shared_options(mi, !mi->shared_id && !mi->master_id, mi->shared_id, mi->master_id)) return -1; + if (mflags && mount(NULL, service_mountpoint(mi), NULL, MS_REMOUNT | MS_BIND | mflags, NULL)) { + pr_perror("Unable to apply root mount options"); + return -1; + } + return fetch_rt_stat(mi, service_mountpoint(mi)); } From 7362ad2f02e93d6a9d9c6b504ff6464617176fe3 Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Wed, 2 Apr 2025 12:47:46 +0800 Subject: [PATCH 101/198] zdtm/lib: add "bind" desc option Add {'bind': 'path/to/bindmount'} zdtm descriptor option, so that in test mount namespace a directory bindmount can be created before running the test. This is useful to leave test directory writable (e.g. for logs) while the test makes root mount readonly. note: We create this bindmount early so that all test files are opened on it initially and not on the below mount. Will be used in mnt_ro_root test. 
Signed-off-by: Pavel Tikhomirov --- test/zdtm.py | 3 +++ test/zdtm/lib/ns.c | 15 ++++++++++++++- 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/test/zdtm.py b/test/zdtm.py index 37ebe63b7b..e3ddc762a3 100755 --- a/test/zdtm.py +++ b/test/zdtm.py @@ -443,6 +443,7 @@ def __init__(self, name, desc, flavor, freezer, rootless): self._bins = [name] self._env = {'TMPDIR': os.environ.get('TMPDIR', '/tmp')} self._deps = desc.get('deps', []) + self._bind = desc.get('bind') self.auto_reap = True def __make_action(self, act, env=None, root=None): @@ -513,6 +514,8 @@ def start(self): if self.__flavor.ns: env['ZDTM_NEWNS'] = "1" env['ZDTM_ROOT'] = self.__flavor.root + if self._bind: + env['ZDTM_BIND'] = self._bind env['ZDTM_DEV'] = self.__flavor.devpath env['PATH'] = "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin" diff --git a/test/zdtm/lib/ns.c b/test/zdtm/lib/ns.c index 3c0dbdeb80..5fe81561fe 100644 --- a/test/zdtm/lib/ns.c +++ b/test/zdtm/lib/ns.c @@ -28,8 +28,9 @@ extern int pivot_root(const char *new_root, const char *put_old); static int prepare_mntns(void) { int dfd, ret; - char *root, *criu_path, *dev_path; + char *root, *criu_path, *dev_path, *zdtm_bind; char path[PATH_MAX]; + char bind_path[PATH_MAX]; root = getenv("ZDTM_ROOT"); if (!root) { @@ -52,6 +53,18 @@ static int prepare_mntns(void) return -1; } + zdtm_bind = getenv("ZDTM_BIND"); + if (zdtm_bind) { + /* + * Bindmount the directory to itself. 
+ */ + snprintf(bind_path, sizeof(bind_path), "%s/%s", root, zdtm_bind); + if (mount(bind_path, bind_path, NULL, MS_BIND, NULL)) { + fprintf(stderr, "Can't bind-mount ZDTM_BIND: %m\n"); + return -1; + } + } + dev_path = getenv("ZDTM_DEV"); if (dev_path) { snprintf(path, sizeof(path), "%s/dev", root); From 69f990d88b21c48bf9944dd49901f706ba8327b3 Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Wed, 2 Apr 2025 12:47:46 +0800 Subject: [PATCH 102/198] zdtm: add mnt_ro_root test It makes root mount readonly and checks that it is still readonly after migration. Make zdtm/static writable for logs via "bind" desc option. v2: explain why we don't have explicit rw/ro flag check v3: use new zdtm "bind" desc option Signed-off-by: Pavel Tikhomirov --- test/zdtm/lib/ns.c | 3 +++ test/zdtm/static/Makefile | 1 + test/zdtm/static/mnt_ro_root.c | 32 +++++++++++++++++++++++++++++++ test/zdtm/static/mnt_ro_root.desc | 6 ++++++ 4 files changed, 42 insertions(+) create mode 100644 test/zdtm/static/mnt_ro_root.c create mode 100644 test/zdtm/static/mnt_ro_root.desc diff --git a/test/zdtm/lib/ns.c b/test/zdtm/lib/ns.c index 5fe81561fe..822e09c928 100644 --- a/test/zdtm/lib/ns.c +++ b/test/zdtm/lib/ns.c @@ -57,6 +57,9 @@ static int prepare_mntns(void) if (zdtm_bind) { /* * Bindmount the directory to itself. + * e.g.: The mnt_ro_root test makes "/" mount readonly, but we + * still want to write logs to /zdtm/static/ so let's make it + * separate writable bind mount. 
*/ snprintf(bind_path, sizeof(bind_path), "%s/%s", root, zdtm_bind); if (mount(bind_path, bind_path, NULL, MS_BIND, NULL)) { diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index 6a19cad3c2..81e44de221 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -423,6 +423,7 @@ TST_DIR = \ mntns_ghost \ mntns_ghost01 \ mntns_ro_root \ + mnt_ro_root \ mntns_link_ghost \ mntns_shared_bind \ mntns_shared_bind02 \ diff --git a/test/zdtm/static/mnt_ro_root.c b/test/zdtm/static/mnt_ro_root.c new file mode 100644 index 0000000000..2d8370150b --- /dev/null +++ b/test/zdtm/static/mnt_ro_root.c @@ -0,0 +1,32 @@ +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check if root mount remains read-only after c/r"; +const char *test_author = "Pavel Tikhomirov "; + +char *dirname; +TEST_OPTION(dirname, string, "directory name", 1); + +int main(int argc, char **argv) +{ + test_init(argc, argv); + + if (mount(NULL, "/", NULL, MS_REMOUNT | MS_RDONLY | MS_BIND, NULL)) { + pr_perror("mount"); + return 1; + } + + test_daemon(); + test_waitsig(); + + /* + * Note: In zdtm.py:check_visible_state() we already check for all + * tests, that all mounts in the test's mount namespace remain the + * same, by comparing mountinfo before and after c/r. So rw/ro mount + * option inconsistency will be detected there and we don't need to + * check it in the test itself. + */ + pass(); + return 0; +} diff --git a/test/zdtm/static/mnt_ro_root.desc b/test/zdtm/static/mnt_ro_root.desc new file mode 100644 index 0000000000..c9a8e4f186 --- /dev/null +++ b/test/zdtm/static/mnt_ro_root.desc @@ -0,0 +1,6 @@ +{ + 'flavor': 'ns uns', + 'flags': 'suid', + 'feature': 'mnt_id', + 'bind': 'zdtm/static', +} From 0e9d0767fb3fd8db35e8ebaada27a0871bbe36de Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Wed, 7 May 2025 14:06:55 +0100 Subject: [PATCH 103/198] sk-inet: add message how to disable MPTCP in Go With Go version 1.24, ListenConfig now uses MPTCP by default [1]. 
Checkpoint/restore for this protocol is not currently supported and adding support requires kernel changes that are not trivial to implement. As a result, checkpointing of many containers that run Go programs is likely to fail with the following error [2]: (00.026522) Error (criu/sk-inet.c:130): inet: Unsupported proto 262 for socket 2f9bc5 This patch adds a message with suggested workaround for this problem. [1] https://go.dev/doc/go1.24#netpkgnet [2] https://github.com/checkpoint-restore/criu/issues/2655 Signed-off-by: Radostin Stoyanov --- criu/sk-inet.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/criu/sk-inet.c b/criu/sk-inet.c index 92f53e5697..a191e78c48 100644 --- a/criu/sk-inet.c +++ b/criu/sk-inet.c @@ -128,6 +128,8 @@ static int can_dump_ipproto(unsigned int ino, int proto, int type) break; default: pr_err("Unsupported proto %d for socket %x\n", proto, ino); + if (proto == IPPROTO_MPTCP) + pr_err("For Go programs, consider using \"GODEBUG=multipathtcp=0\" to disable MPTCP\n"); return 0; } From f6c14eece3ea5d51d18f76d070b6c57767bf938b Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 6 May 2025 15:38:26 +0000 Subject: [PATCH 104/198] kerndat: check that hardware breakpoints work In some cases, they might not work in virtual machines if the hypervisor doesn't virtualize them. For example, they don't work in AMD SEV virtual machines if the Debug Virtualization extension isn't supported or isn't enabled in SEV_FEATURES. Fixes #2658 Signed-off-by: Andrei Vagin --- criu/cr-check.c | 17 +++++++++ criu/cr-restore.c | 3 +- criu/include/kerndat.h | 1 + criu/kerndat.c | 80 +++++++++++++++++++++++++++++++++++++++++ criu/parasite-syscall.c | 2 +- 5 files changed, 101 insertions(+), 2 deletions(-) diff --git a/criu/cr-check.c b/criu/cr-check.c index 7b4a6415a5..9c4778490e 100644 --- a/criu/cr-check.c +++ b/criu/cr-check.c @@ -1589,6 +1589,17 @@ static int check_overlayfs_maps(void) return status == 0 ? 
0 : -1; } +static int check_breakpoints(void) +{ + if (!kdat.has_breakpoints) { + pr_warn("Hardware breakpoints don't seem to work\n"); + return -1; + } + + return 0; +} + + static int (*chk_feature)(void); /* @@ -1616,6 +1627,7 @@ static int (*chk_feature)(void); return ret; \ } \ } while (0) + int cr_check(void) { struct ns_id *ns; @@ -1724,6 +1736,10 @@ int cr_check(void) ret |= check_autofs(); ret |= check_compat_cr(); } + /* + * Category 4 - optional. + */ + check_breakpoints(); pr_msg("%s\n", ret ? CHECK_MAYBE : CHECK_GOOD); return ret; @@ -1836,6 +1852,7 @@ static struct feature_list feature_list[] = { { "pagemap_scan", check_pagemap_scan }, { "timer_cr_ids", check_timer_cr_ids }, { "overlayfs_maps", check_overlayfs_maps }, + { "breakpoints", check_breakpoints }, { NULL, NULL }, }; diff --git a/criu/cr-restore.c b/criu/cr-restore.c index 30932f60a2..cabe2f464d 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -1820,6 +1820,7 @@ static int restore_rseq_cs(void) static int catch_tasks(bool root_seized) { struct pstree_item *item; + bool nobp = fault_injected(FI_NO_BREAKPOINTS) || !kdat.has_breakpoints; for_each_pstree_item(item) { int status, i, ret; @@ -1847,7 +1848,7 @@ static int catch_tasks(bool root_seized) return -1; } - ret = compel_stop_pie(pid, rsti(item)->breakpoint, fault_injected(FI_NO_BREAKPOINTS)); + ret = compel_stop_pie(pid, rsti(item)->breakpoint, nobp); if (ret < 0) return -1; } diff --git a/criu/include/kerndat.h b/criu/include/kerndat.h index bd8744d62b..c5deb32832 100644 --- a/criu/include/kerndat.h +++ b/criu/include/kerndat.h @@ -90,6 +90,7 @@ struct kerndat_s { bool has_shstk; bool has_close_range; bool has_timer_cr_ids; + bool has_breakpoints; }; extern struct kerndat_s kdat; diff --git a/criu/kerndat.c b/criu/kerndat.c index 930117b0a4..fa43f7d3f2 100644 --- a/criu/kerndat.c +++ b/criu/kerndat.c @@ -1736,6 +1736,83 @@ static int kerndat_has_timer_cr_ids(void) return 0; } +static void breakpoint_func(void) +{ + if 
(raise(SIGSTOP)) + pr_perror("Unable to kill itself with SIGSTOP"); + exit(1); +} + +/* + * kerndat_breakpoints checks that hardware breakpoints work as they should. + * In some cases, they might not work in virtual machines if the hypervisor + * doesn't virtualize them. For example, they don't work in AMD SEV virtual + * machines if the Debug Virtualization extension isn't supported or isn't + * enabled in SEV_FEATURES. + */ +static int kerndat_breakpoints(void) +{ + int status, ret, exit_code = -1; + pid_t pid; + + pid = fork(); + if (pid == -1) { + pr_perror("fork"); + return -1; + } + if (pid == 0) { + if (ptrace(PTRACE_TRACEME, 0, 0, 0)) { + pr_perror("ptrace(PTRACE_TRACEME)"); + exit(1); + } + raise(SIGSTOP); + breakpoint_func(); + exit(1); + } + if (waitpid(pid, &status, 0) == -1) { + pr_perror("waitpid for initial stop"); + goto err; + } + if (!WIFSTOPPED(status) || WSTOPSIG(status) != SIGSTOP) { + pr_err("Child didn't stop as expected: status=%x\n", status); + goto err; + } + ret = ptrace_set_breakpoint(pid, &breakpoint_func); + if (ret < 0) { + pr_err("Failed to set breakpoint\n"); + goto err; + } + if (ret == 0) { + pr_debug("Hardware breakpoints appear to be disabled\n"); + goto out; + } + if (waitpid(pid, &status, 0) == -1) { + pr_perror("waitpid for breakpoint trigger"); + goto err; + } + if (!WIFSTOPPED(status) || WSTOPSIG(status) != SIGTRAP) { + pr_warn("Hardware breakpoints don't seem to work (status=%x)\n", status); + goto out; + } + kdat.has_breakpoints = true; +out: + exit_code = 0; +err: + if (kill(pid, SIGKILL)) { + pr_perror("Failed to kill the child process"); + exit_code = -1; + } + if (waitpid(pid, &status, 0) == -1) { + pr_perror("Failed to wait for the child process"); + exit_code = -1; + } + if (!WIFSIGNALED(status) || WTERMSIG(status) != SIGKILL) { + pr_err("The child exited with unexpected code: %x\n", status); + exit_code = -1; + } + return exit_code; +} + /* * Some features depend on resource that can be dynamically changed * at the 
OS runtime. There are cases that we cannot determine the @@ -1999,6 +2076,9 @@ int kerndat_init(void) } if (!ret && kerndat_has_timer_cr_ids()) { pr_err("kerndat_has_timer_cr_ids has failed when initializing kerndat.\n"); + } + if (!ret && kerndat_breakpoints()) { + pr_err("kerndat_breakpoints has failed when initializing kerndat.\n"); ret = -1; } diff --git a/criu/parasite-syscall.c b/criu/parasite-syscall.c index 6db9d21fee..e19847b377 100644 --- a/criu/parasite-syscall.c +++ b/criu/parasite-syscall.c @@ -421,7 +421,7 @@ struct parasite_ctl *parasite_infect_seized(pid_t pid, struct pstree_item *item, ictx->flags |= INFECT_NO_MEMFD; if (fault_injected(FI_PARASITE_CONNECT)) ictx->flags |= INFECT_FAIL_CONNECT; - if (fault_injected(FI_NO_BREAKPOINTS)) + if (fault_injected(FI_NO_BREAKPOINTS) || !kdat.has_breakpoints) ictx->flags |= INFECT_NO_BREAKPOINTS; if (kdat.compat_cr) ictx->flags |= INFECT_COMPATIBLE; From 69500a9e1334f73bee9f54ef3c020ed984ddefcf Mon Sep 17 00:00:00 2001 From: Lorenzo Fontana Date: Wed, 14 May 2025 19:02:06 +0200 Subject: [PATCH 105/198] make: remove checks and warnings for bsd strlcat and strlcpy In 0a7c5fd1bd8d1e49e273b51ff39af473d6c68cbc we swapped the BSD implementation of strlcat and strlcpy in favor of our own replacement. The checks and the predefined macros are not needed anymore. Signed-off-by: Lorenzo Fontana --- Makefile.config | 4 ++-- scripts/feature-tests.mak | 28 ---------------------------- 2 files changed, 2 insertions(+), 30 deletions(-) diff --git a/Makefile.config b/Makefile.config index 5ab689d411..5cf4b8216d 100644 --- a/Makefile.config +++ b/Makefile.config @@ -9,7 +9,7 @@ ifeq ($(call try-cc,$(FEATURE_TEST_LIBBSD_DEV),-lbsd),true) LIBS_FEATURES += -lbsd FEATURE_DEFINES += -DCONFIG_HAS_LIBBSD else - $(info Note: Building without setproctitle() and strlcpy() support.) + $(info Note: Building without setproctitle() support.) $(info $S Install libbsd-devel (RPM) / libbsd-dev (DEB) to fix.) 
endif @@ -84,7 +84,7 @@ endif export DEFINES += $(FEATURE_DEFINES) export CFLAGS += $(FEATURE_DEFINES) -FEATURES_LIST := TCP_REPAIR STRLCPY STRLCAT PTRACE_PEEKSIGINFO \ +FEATURES_LIST := TCP_REPAIR PTRACE_PEEKSIGINFO \ SETPROCTITLE_INIT TCP_REPAIR_WINDOW MEMFD_CREATE \ OPENAT2 NO_LIBC_RSEQ_DEFS diff --git a/scripts/feature-tests.mak b/scripts/feature-tests.mak index fb5d2ef7ad..727e9689ea 100644 --- a/scripts/feature-tests.mak +++ b/scripts/feature-tests.mak @@ -35,34 +35,6 @@ int main(void) } endef -define FEATURE_TEST_STRLCPY - -#include - -#ifdef CONFIG_HAS_LIBBSD -# include -#endif - -int main(void) -{ - return strlcpy(NULL, NULL, 0); -} -endef - -define FEATURE_TEST_STRLCAT - -#include - -#ifdef CONFIG_HAS_LIBBSD -# include -#endif - -int main(void) -{ - return strlcat(NULL, NULL, 0); -} -endef - define FEATURE_TEST_PTRACE_PEEKSIGINFO #include From c61329b30387aa50634e794a4781dde64cb2a6c3 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Sun, 11 May 2025 11:33:29 +0100 Subject: [PATCH 106/198] seize: fix pause devices for frozen containers The container checkpointing procedure in Kubernetes freezes running containers to create a consistent snapshot of both the runtime state and the rootfs of the container. However, when checkpointing a GPU container, the container must be unfrozen before invoking the cuda-checkpoint tool. This is achieved in prepare_freezer_for_interrupt_only_mode(), which needs to be called before the PAUSE_DEVICES hook. The patch introducing this functionality fixes this problem for containers with multiple processes. However, if the container has a single process, prepare_freezer_for_interrupt_only_mode() must be invoked immediately before the PAUSE_DEVICES hook. 
Fixes: #2514 Signed-off-by: Radostin Stoyanov --- criu/seize.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/criu/seize.c b/criu/seize.c index f56357ac7b..23f192d46d 100644 --- a/criu/seize.c +++ b/criu/seize.c @@ -1060,22 +1060,32 @@ int collect_pstree(void) */ alarm(opts.timeout); - ret = run_plugins(PAUSE_DEVICES, pid); - if (ret < 0 && ret != -ENOTSUP) { - goto err; - } - if (opts.freeze_cgroup && cgroup_version()) goto err; pr_debug("Detected cgroup V%d freezer\n", cgroup_v2 ? 2 : 1); if (opts.freeze_cgroup && !compel_interrupt_only_mode) { + ret = run_plugins(PAUSE_DEVICES, pid); + if (ret < 0 && ret != -ENOTSUP) { + goto err; + } + if (freeze_processes()) goto err; } else { if (opts.freeze_cgroup && prepare_freezer_for_interrupt_only_mode()) goto err; + + /* + * Call PAUSE_DEVICES after prepare_freezer_for_interrupt_only_mode() + * to be able to checkpoint containers in a frozen state. + */ + ret = run_plugins(PAUSE_DEVICES, pid); + if (ret < 0 && ret != -ENOTSUP) { + goto err; + } + if (compel_interrupt_task(pid)) { set_cr_errno(ESRCH); goto err; @@ -1136,4 +1146,4 @@ int checkpoint_devices(void) exit_code = 0; err: return exit_code; -} \ No newline at end of file +} From 3d9ef354abb78f7e3544fab87f8d721071cdf4e1 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Fri, 16 May 2025 12:43:14 +0100 Subject: [PATCH 107/198] sk-inet: add MPTCP definition Building CRIU on Ubuntu 20.04 fails with the following error: criu/sk-inet.c: In function 'can_dump_ipproto': criu/sk-inet.c:131:16: error: 'IPPROTO_MPTCP' undeclared (first use in this function); did you mean 'IPPROTO_MTP'? 131 | if (proto == IPPROTO_MPTCP) | ^~~~~~~~~~~~~ | IPPROTO_MTP Add definition for MPTCP to fix this error. 
Signed-off-by: Radostin Stoyanov --- criu/sk-inet.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/criu/sk-inet.c b/criu/sk-inet.c index a191e78c48..1238b03dc5 100644 --- a/criu/sk-inet.c +++ b/criu/sk-inet.c @@ -44,6 +44,11 @@ #define PB_ALEN_INET 1 #define PB_ALEN_INET6 4 +/* Definition for older kernels without MPTCP support (e.g. Ubuntu 20.04) */ +#ifndef IPPROTO_MPTCP +#define IPPROTO_MPTCP 262 +#endif + static LIST_HEAD(inet_ports); struct inet_port { From 8902353057962184fa07bfa308793d4f86daab18 Mon Sep 17 00:00:00 2001 From: Yanning Yang Date: Fri, 29 Nov 2024 02:07:38 +0000 Subject: [PATCH 108/198] criu: Introduce a new device plugin hook for restore Currently, in the target process, device-related restore operations and other restore operations almost run sequentially. When the target process executes the corresponding CRIU hook functions, it can't perform other restore operations. However, for GPU applications, some device restore operations have no logical dependencies on other common restore operations and can be parallelized with other operations to speed up the process. Instead of launching a thread in child processes for parallelization, this patch chooses to add a new hook, `POST_FORKING`, in the main CRIU process to handle these restore operations. This is because the restoration of memory state in the restore blob is one of the most time-consuming parts of all restore logic. The main CRIU process can easily parallelize these operations, whereas parallelizing in threads within child processes is challenging. - POST_FORKING *POST_FORKING: Hook to enable the main CRIU process to perform some restore operations of plugins. 
Signed-off-by: Yanning Yang --- criu/cr-restore.c | 3 +++ criu/include/criu-plugin.h | 4 ++++ criu/plugin.c | 1 + 3 files changed, 8 insertions(+) diff --git a/criu/cr-restore.c b/criu/cr-restore.c index cabe2f464d..9cc77b21ff 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -2132,6 +2132,9 @@ static int restore_root_task(struct pstree_item *init) __restore_switch_stage(CR_STATE_FORKING); skip_ns_bouncing: + ret = run_plugins(POST_FORKING); + if (ret < 0 && ret != -ENOTSUP) + goto out_kill; ret = restore_wait_inprogress_tasks(); if (ret < 0) diff --git a/criu/include/criu-plugin.h b/criu/include/criu-plugin.h index 392ea9f534..9fb21a4497 100644 --- a/criu/include/criu-plugin.h +++ b/criu/include/criu-plugin.h @@ -60,6 +60,8 @@ enum { CR_PLUGIN_HOOK__CHECKPOINT_DEVICES = 11, + CR_PLUGIN_HOOK__POST_FORKING = 12, + CR_PLUGIN_HOOK__MAX }; @@ -78,6 +80,7 @@ DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__UPDATE_VMA_MAP, const char *path, const DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__RESUME_DEVICES_LATE, int pid); DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__PAUSE_DEVICES, int pid); DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__CHECKPOINT_DEVICES, int pid); +DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__POST_FORKING, void); enum { CR_PLUGIN_STAGE__DUMP, @@ -152,5 +155,6 @@ typedef int(cr_plugin_handle_device_vma_t)(int fd, const struct stat *stat); typedef int(cr_plugin_update_vma_map_t)(const char *path, const uint64_t addr, const uint64_t old_pgoff, uint64_t *new_pgoff, int *plugin_fd); typedef int(cr_plugin_resume_devices_late_t)(int pid); +typedef int(cr_plugin_post_forking_t)(void); #endif /* __CRIU_PLUGIN_H__ */ diff --git a/criu/plugin.c b/criu/plugin.c index 65e79a0692..18da0499d7 100644 --- a/criu/plugin.c +++ b/criu/plugin.c @@ -59,6 +59,7 @@ static cr_plugin_desc_t *cr_gen_plugin_desc(void *h, char *path) __assign_hook(RESUME_DEVICES_LATE, "cr_plugin_resume_devices_late"); __assign_hook(PAUSE_DEVICES, "cr_plugin_pause_devices"); __assign_hook(CHECKPOINT_DEVICES, 
"cr_plugin_checkpoint_devices"); + __assign_hook(POST_FORKING, "cr_plugin_post_forking"); #undef __assign_hook From 4ba058060ce10e040bc70464f9aff1d407407e9f Mon Sep 17 00:00:00 2001 From: Yanning Yang Date: Fri, 29 Nov 2024 02:13:28 +0000 Subject: [PATCH 109/198] cr-restore: Move `cr_plugin_init` after `fdstore_init` Currently, when CRIU calls `cr_plugin_init`, `fdstore` is not initialized. However, during the plugin restore procedure, there may be some common file operations used in multiple hooks. This patch moves `cr_plugin_init` after `fdstore_init`, allowing `cr_plugin_init` to use `fdstore` to place these file operations. Signed-off-by: Yanning Yang --- criu/cr-restore.c | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/criu/cr-restore.c b/criu/cr-restore.c index 9cc77b21ff..c1d1f4b9d5 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -2366,41 +2366,47 @@ int cr_restore_tasks(void) return 1; if (check_img_inventory(/* restore = */ true) < 0) - goto err; - - if (cr_plugin_init(CR_PLUGIN_STAGE__RESTORE)) return -1; if (init_stats(RESTORE_STATS)) - goto err; + return -1; if (lsm_check_opts()) - goto err; + return -1; timing_start(TIME_RESTORE); if (cpu_init() < 0) - goto err; + return -1; if (vdso_init_restore()) - goto err; + return -1; if (tty_init_restore()) - goto err; + return -1; if (opts.cpu_cap & CPU_CAP_IMAGE) { if (cpu_validate_cpuinfo()) - goto err; + return -1; } if (prepare_task_entries() < 0) - goto err; + return -1; if (prepare_pstree() < 0) - goto err; + return -1; if (fdstore_init()) - goto err; + return -1; + + /* + * For the AMDGPU plugin, its parallel restore feature needs to use fdstore to store + * its socket file descriptor. This allows the main process and the target process to + * communicate with each other through this file descriptor. Therefore, cr_plugin_init + * must be initialized after fdstore_init. 
+ */ + if (cr_plugin_init(CR_PLUGIN_STAGE__RESTORE)) + return -1; if (inherit_fd_move_to_fdstore()) goto err; From 0a274b6afa1b2f51fe617beb006faf15687ed3c1 Mon Sep 17 00:00:00 2001 From: Yanning Yang Date: Tue, 10 Dec 2024 12:34:14 +0000 Subject: [PATCH 110/198] pstree: Add `has_children` function Currently, parallel restore only focuses on the single-process situation. Therefore, it needs an interface to know if there is only one process to restore. This patch adds a `has_children` function in `pstree.h` and replaces some existing implementations with this function. Signed-off-by: Yanning Yang --- criu/cr-dump.c | 2 +- criu/include/pstree.h | 1 + criu/pstree.c | 9 +++++++-- criu/seize.c | 2 +- 4 files changed, 10 insertions(+), 4 deletions(-) diff --git a/criu/cr-dump.c b/criu/cr-dump.c index 302078caa0..b8cf7d64d9 100644 --- a/criu/cr-dump.c +++ b/criu/cr-dump.c @@ -1396,7 +1396,7 @@ static int dump_zombies(void) item->sid = pps_buf.sid; item->pgid = pps_buf.pgid; - BUG_ON(!list_empty(&item->children)); + BUG_ON(has_children(item)); if (!item->sid) { pr_err("A session leader of zombie process %d(%d) is outside of its pid namespace\n", diff --git a/criu/include/pstree.h b/criu/include/pstree.h index 1137046d43..b750a919e6 100644 --- a/criu/include/pstree.h +++ b/criu/include/pstree.h @@ -104,6 +104,7 @@ extern void pstree_insert_pid(struct pid *pid_node); extern struct pid *pstree_pid_by_virt(pid_t pid); extern struct pstree_item *root_item; +extern bool has_children(struct pstree_item *item); extern struct pstree_item *pstree_item_next(struct pstree_item *item); #define for_each_pstree_item(pi) for (pi = root_item; pi != NULL; pi = pstree_item_next(pi)) diff --git a/criu/pstree.c b/criu/pstree.c index 660f1b9d99..75c2fc8d0a 100644 --- a/criu/pstree.c +++ b/criu/pstree.c @@ -182,7 +182,7 @@ void free_pstree(struct pstree_item *root_item) struct pstree_item *item = root_item, *parent; while (item) { - if (!list_empty(&item->children)) { + if (has_children(item)) { 
item = list_first_entry(&item->children, struct pstree_item, sibling); continue; } @@ -244,10 +244,15 @@ int init_pstree_helper(struct pstree_item *ret) return 0; } +bool has_children(struct pstree_item *item) +{ + return !list_empty(&item->children); +} + /* Deep first search on children */ struct pstree_item *pstree_item_next(struct pstree_item *item) { - if (!list_empty(&item->children)) + if (has_children(item)) return list_first_entry(&item->children, struct pstree_item, sibling); while (item->parent) { diff --git a/criu/seize.c b/criu/seize.c index 23f192d46d..d0cf7b36c8 100644 --- a/criu/seize.c +++ b/criu/seize.c @@ -1008,7 +1008,7 @@ static int collect_task(struct pstree_item *item) if (ret < 0) goto err_close; - if ((item->pid->state == TASK_DEAD) && !list_empty(&item->children)) { + if ((item->pid->state == TASK_DEAD) && has_children(item)) { pr_err("Zombie with children?! O_o Run, run, run!\n"); goto err_close; } From e4c151eab3bf17817f372a88b2fb633005db3336 Mon Sep 17 00:00:00 2001 From: Yanning Yang Date: Tue, 10 Dec 2024 12:36:33 +0000 Subject: [PATCH 111/198] plugins/amdgpu: Add socket operations When enabling parallel restore, the target process and the main CRIU process need an IPC interface to communicate and transfer restore commands. This patch adds a Unix domain TCP socket and stores this socket in `fdstore`. 
Signed-off-by: Yanning Yang --- plugins/amdgpu/amdgpu_socket_utils.c | 59 ++++++++++++++++++++++++++++ plugins/amdgpu/amdgpu_socket_utils.h | 6 +++ 2 files changed, 65 insertions(+) create mode 100644 plugins/amdgpu/amdgpu_socket_utils.c create mode 100644 plugins/amdgpu/amdgpu_socket_utils.h diff --git a/plugins/amdgpu/amdgpu_socket_utils.c b/plugins/amdgpu/amdgpu_socket_utils.c new file mode 100644 index 0000000000..9e957ae54b --- /dev/null +++ b/plugins/amdgpu/amdgpu_socket_utils.c @@ -0,0 +1,59 @@ +#include +#include +#include +#include +#include +#include + +#include "amdgpu_socket_utils.h" +#include "criu-log.h" +#include "common/scm.h" +#include "fdstore.h" +#include "util-pie.h" +#include "util.h" + +int parallel_socket_addr_len; +struct sockaddr_un parallel_socket_addr; +int parallel_socket_id = 0; + +static void amdgpu_socket_name_gen(struct sockaddr_un *addr, int *len) +{ + addr->sun_family = AF_UNIX; + snprintf(addr->sun_path, UNIX_PATH_MAX, "x/criu-amdgpu-parallel-%s", criu_run_id); + *len = SUN_LEN(addr); + *addr->sun_path = '\0'; +} + +int install_parallel_sock(void) +{ + int ret = 0; + int sock_fd; + + sock_fd = socket(PF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0); + if (sock_fd < 0) { + pr_perror("socket creation failed"); + return -1; + } + + amdgpu_socket_name_gen(&parallel_socket_addr, &parallel_socket_addr_len); + ret = bind(sock_fd, (struct sockaddr *)&parallel_socket_addr, parallel_socket_addr_len); + if (ret < 0) { + pr_perror("bind failed"); + goto err; + } + + ret = listen(sock_fd, SOMAXCONN); + if (ret < 0) { + pr_perror("listen failed"); + goto err; + } + + parallel_socket_id = fdstore_add(sock_fd); + if (parallel_socket_id < 0) { + ret = -1; + goto err; + } +err: + close(sock_fd); + return ret; +} \ No newline at end of file diff --git a/plugins/amdgpu/amdgpu_socket_utils.h b/plugins/amdgpu/amdgpu_socket_utils.h new file mode 100644 index 0000000000..4e7aa2aa41 --- /dev/null +++ b/plugins/amdgpu/amdgpu_socket_utils.h @@ -0,0 +1,6 @@ +#ifndef
__KFD_PLUGIN_AMDGPU_SOCKET_UTILS_H__ +#define __KFD_PLUGIN_AMDGPU_SOCKET_UTILS_H__ + +int install_parallel_sock(void); + +#endif \ No newline at end of file From bfd9aa269b3d80dda93bcc4d2516340e0f9c0946 Mon Sep 17 00:00:00 2001 From: Yanning Yang Date: Tue, 10 Dec 2024 12:38:48 +0000 Subject: [PATCH 112/198] plugins/amdgpu: Add parallel restore command Currently the restore of buffer objects consumes a significant amount of time. However, this part has no logical dependencies with other restore operations. This patch introduces some structures and some helper functions for the target process to offload this task to the main CRIU process. Signed-off-by: Yanning Yang --- plugins/amdgpu/amdgpu_socket_utils.c | 261 +++++++++++++++++++++++++++ plugins/amdgpu/amdgpu_socket_utils.h | 48 +++++ 2 files changed, 309 insertions(+) diff --git a/plugins/amdgpu/amdgpu_socket_utils.c b/plugins/amdgpu/amdgpu_socket_utils.c index 9e957ae54b..c8bf6d1ba3 100644 --- a/plugins/amdgpu/amdgpu_socket_utils.c +++ b/plugins/amdgpu/amdgpu_socket_utils.c @@ -4,6 +4,7 @@ #include #include #include +#include #include "amdgpu_socket_utils.h" #include "criu-log.h" @@ -53,6 +54,266 @@ int install_parallel_sock(void) ret = -1; goto err; } +err: + close(sock_fd); + return ret; +} + +void parallel_restore_bo_add(int dmabuf_fd, int gpu_id, uint64_t size, uint64_t offset, + parallel_restore_cmd *restore_cmd) +{ + parallel_restore_entry *restore_entry = &restore_cmd->entries[restore_cmd->cmd_head.entry_num]; + restore_entry->gpu_id = gpu_id; + restore_entry->write_id = restore_cmd->cmd_head.fd_write_num; + restore_entry->write_offset = 0; + restore_entry->read_offset = offset; + restore_entry->size = size; + + restore_cmd->fds_write[restore_cmd->cmd_head.fd_write_num] = dmabuf_fd; + + restore_cmd->cmd_head.entry_num += 1; + restore_cmd->cmd_head.fd_write_num += 1; +} + +void parallel_restore_gpu_id_add(int gpu_id, int minor, parallel_restore_cmd *restore_cmd) +{ +
restore_cmd->gpu_ids[restore_cmd->cmd_head.gpu_num] = (parallel_gpu_info){ gpu_id, minor }; + restore_cmd->cmd_head.gpu_num += 1; +} + +static int send_metadata(int sock_fd, parallel_restore_cmd *restore_cmd) +{ + if (send(sock_fd, &restore_cmd->cmd_head, sizeof(parallel_restore_cmd_head), 0) < 0) { + pr_perror("Send parallel restore command head fail"); + return -1; + } + return 0; +} + +static int send_gpu_ids(int sock_fd, parallel_restore_cmd *restore_cmd) +{ + if (send(sock_fd, restore_cmd->gpu_ids, restore_cmd->cmd_head.gpu_num * sizeof(parallel_gpu_info), 0) < 0) { + pr_perror("Send GPU ids of parallel restore command fail"); + return -1; + } + return 0; +} + +static int send_cmds(int sock_fd, parallel_restore_cmd *restore_cmd) +{ + if (send(sock_fd, restore_cmd->entries, restore_cmd->cmd_head.entry_num * sizeof(parallel_restore_entry), 0) < 0) { + pr_perror("Send parallel restore command fail"); + return -1; + } + return 0; +} + +static int send_dmabuf_fds(int sock_fd, parallel_restore_cmd *restore_cmd) +{ + if (send_fds(sock_fd, NULL, 0, restore_cmd->fds_write, restore_cmd->cmd_head.fd_write_num, 0, 0) < 0) { + pr_perror("Send dmabuf fds fail"); + return -1; + } + return 0; +} + +int send_parallel_restore_cmd(parallel_restore_cmd *restore_cmd) +{ + int sock_fd; + int ret = 0; + + sock_fd = socket(PF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0); + if (sock_fd < 0) { + pr_perror("Socket creation failed"); + return -1; + } + + ret = connect(sock_fd, (struct sockaddr *)&parallel_socket_addr, parallel_socket_addr_len); + if (ret < 0) { + pr_perror("Connect failed"); + goto err; + } + + ret = send_metadata(sock_fd, restore_cmd); + if (ret) { + goto err; + } + + ret = send_gpu_ids(sock_fd, restore_cmd); + if (ret) { + goto err; + } + + ret = send_cmds(sock_fd, restore_cmd); + if (ret) { + goto err; + } + + ret = send_dmabuf_fds(sock_fd, restore_cmd); + +err: + close(sock_fd); + return ret; +} + +int init_parallel_restore_cmd(int num, int id, int gpu_num, parallel_restore_cmd
*restore_cmd) +{ + restore_cmd->cmd_head.id = id; + restore_cmd->cmd_head.fd_write_num = 0; + restore_cmd->cmd_head.entry_num = 0; + restore_cmd->cmd_head.gpu_num = 0; + + restore_cmd->gpu_ids = xzalloc(gpu_num * sizeof(parallel_gpu_info)); + if (!restore_cmd->gpu_ids) + return -ENOMEM; + restore_cmd->fds_write = xzalloc(num * sizeof(int)); + if (!restore_cmd->fds_write) + return -ENOMEM; + restore_cmd->entries = xzalloc(num * sizeof(parallel_restore_entry)); + if (!restore_cmd->entries) + return -ENOMEM; + return 0; +} + +void free_parallel_restore_cmd(parallel_restore_cmd *restore_cmd) +{ + if (restore_cmd->gpu_ids) + xfree(restore_cmd->gpu_ids); + if (restore_cmd->fds_write) + xfree(restore_cmd->fds_write); + if (restore_cmd->entries) + xfree(restore_cmd->entries); +} + +static int init_parallel_restore_cmd_by_head(parallel_restore_cmd *restore_cmd) +{ + restore_cmd->gpu_ids = xzalloc(restore_cmd->cmd_head.gpu_num * sizeof(parallel_gpu_info)); + if (!restore_cmd->gpu_ids) + return -ENOMEM; + restore_cmd->fds_write = xzalloc(restore_cmd->cmd_head.fd_write_num * sizeof(int)); + if (!restore_cmd->fds_write) + return -ENOMEM; + restore_cmd->entries = xzalloc(restore_cmd->cmd_head.entry_num * sizeof(parallel_restore_entry)); + if (!restore_cmd->entries) + return -ENOMEM; + return 0; +} + +static int check_quit_cmd(parallel_restore_cmd *restore_cmd) +{ + return restore_cmd->cmd_head.fd_write_num == 0; +} + +static int recv_metadata(int client_fd, parallel_restore_cmd *restore_cmd) +{ + if (recv(client_fd, &restore_cmd->cmd_head, sizeof(parallel_restore_cmd_head), 0) < 0) { + pr_perror("Recv parallel restore command head fail"); + return -1; + } + return 0; +} + +static int recv_cmds(int client_fd, parallel_restore_cmd *restore_cmd) +{ + if (recv(client_fd, restore_cmd->entries, restore_cmd->cmd_head.entry_num * sizeof(parallel_restore_entry), 0) < 0) { + pr_perror("Recv parallel restore command fail"); + return -1; + } + return 0; +} + +static int recv_gpu_ids(int 
sock_fd, parallel_restore_cmd *restore_cmd) +{ + if (recv(sock_fd, restore_cmd->gpu_ids, restore_cmd->cmd_head.gpu_num * sizeof(parallel_gpu_info), 0) < 0) { + pr_perror("Send GPU ids of parallel restore command fail"); + return -1; + } + return 0; +} + +static int recv_dmabuf_fds(int client_fd, parallel_restore_cmd *restore_cmd) +{ + if (recv_fds(client_fd, restore_cmd->fds_write, restore_cmd->cmd_head.fd_write_num, 0, 0) < 0) { + pr_perror("Recv dmabuf fds fail"); + return -1; + } + return 0; +} + +int recv_parallel_restore_cmd(parallel_restore_cmd *restore_cmd) +{ + int sock_fd, client_fd; + int ret = 0; + + sock_fd = fdstore_get(parallel_socket_id); + if (sock_fd < 0) + return -1; + + client_fd = accept(sock_fd, NULL, NULL); + if (client_fd < 0) { + ret = client_fd; + goto err_accept; + } + + ret = recv_metadata(client_fd, restore_cmd); + if (ret) { + goto err; + } + + // Return 1 to quit + if (check_quit_cmd(restore_cmd)) { + ret = 1; + goto err; + } + + ret = init_parallel_restore_cmd_by_head(restore_cmd); + if (ret) { + goto err; + } + + ret = recv_gpu_ids(client_fd, restore_cmd); + if (ret) { + goto err; + } + + ret = recv_cmds(client_fd, restore_cmd); + if (ret) { + goto err; + } + + ret = recv_dmabuf_fds(client_fd, restore_cmd); + +err: + close(client_fd); +err_accept: + close(sock_fd); + return ret; +} + +int close_parallel_restore_server(void) +{ + int sock_fd; + int ret = 0; + parallel_restore_cmd_head cmd_head; + + sock_fd = socket(PF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0); + if (sock_fd < 0) { + pr_perror("Socket creation failed"); + return -1; + } + + ret = connect(sock_fd, (struct sockaddr *)&parallel_socket_addr, parallel_socket_addr_len); + if (ret < 0) { + pr_perror("Connect failed"); + goto err; + } + + memset(&cmd_head, 0, sizeof(parallel_restore_cmd_head)); + if (send(sock_fd, &cmd_head, sizeof(parallel_restore_cmd_head), 0) < 0) { + pr_perror("Send parallel restore command head fail"); + return -1; + } + err: close(sock_fd); return ret; diff --git
a/plugins/amdgpu/amdgpu_socket_utils.h b/plugins/amdgpu/amdgpu_socket_utils.h index 4e7aa2aa41..d7200c6bd5 100644 --- a/plugins/amdgpu/amdgpu_socket_utils.h +++ b/plugins/amdgpu/amdgpu_socket_utils.h @@ -1,6 +1,54 @@ #ifndef __KFD_PLUGIN_AMDGPU_SOCKET_UTILS_H__ #define __KFD_PLUGIN_AMDGPU_SOCKET_UTILS_H__ +typedef struct { + int id; + int fd_write_num; /* The number of buffer objects to be restored. */ + int entry_num; /* The number of restore commands.*/ + int gpu_num; +} parallel_restore_cmd_head; + +typedef struct { + int gpu_id; + int minor; +} parallel_gpu_info; + +typedef struct { + int gpu_id; + int write_id; + uint64_t read_offset; + uint64_t write_offset; + uint64_t size; +} parallel_restore_entry; + +typedef struct { + parallel_restore_cmd_head cmd_head; + int *fds_write; + parallel_gpu_info *gpu_ids; + parallel_restore_entry *entries; +} parallel_restore_cmd; + +/* + * For parallel_restore, a background thread in the main CRIU process is used to restore the GPU + * buffer object. However, initially, the ownership of these buffer objects and the metadata for + * restoration are all with the target process. Therefore, we introduce a series of functions to + * help the target process send these tasks to the main CRIU process. 
+ */ +int init_parallel_restore_cmd(int num, int id, int gpu_num, parallel_restore_cmd *restore_cmd); + +void free_parallel_restore_cmd(parallel_restore_cmd *restore_cmd); + int install_parallel_sock(void); +int send_parallel_restore_cmd(parallel_restore_cmd *restore_cmd); + +int recv_parallel_restore_cmd(parallel_restore_cmd *restore_cmd); + +void parallel_restore_bo_add(int dmabuf_fd, int gpu_id, uint64_t size, uint64_t offset, + parallel_restore_cmd *restore_cmd); + +void parallel_restore_gpu_id_add(int gpu_id, int minor, parallel_restore_cmd *restore_cmd); + +int close_parallel_restore_server(void); + #endif \ No newline at end of file From bfb4a3d8422923c90867c8ec4a2e697d0a45049c Mon Sep 17 00:00:00 2001 From: Yanning Yang Date: Wed, 15 Jan 2025 06:38:27 +0000 Subject: [PATCH 113/198] plugins/amdgpu: Implement parallel restore This patch implements the entire logic to enable the offloading of buffer object content restoration. The goal of this patch is to offload the buffer object content restoration to the main CRIU process so that this restoration can occur in parallel with other restoration logic (mainly the restoration of memory state in the restore blob, which is time-consuming) to speed up the restore phase. The restoration of buffer object content usually takes a significant amount of time for GPU applications, so parallelizing it with other operations can reduce the overall restore time. It has three parts: the first replaces the restoration of buffer objects in the target process by sending a parallel restore command to the main CRIU process; the second implements the POST_FORKING hook in the amdgpu plugin to enable buffer object content restoration in the main CRIU process; the third stops the parallel thread in the RESUME_DEVICES_LATE hook. This optimization only focuses on the single-process situation (common case). In other scenarios, it will turn to the original method. This is achieved with the new `parallel_disabled` flag. 
Signed-off-by: Yanning Yang --- plugins/amdgpu/Makefile | 2 +- plugins/amdgpu/amdgpu_plugin.c | 418 +++++++++++++++++++++--- plugins/amdgpu/amdgpu_plugin_topology.c | 2 +- plugins/amdgpu/amdgpu_plugin_topology.h | 1 + 4 files changed, 373 insertions(+), 50 deletions(-) diff --git a/plugins/amdgpu/Makefile b/plugins/amdgpu/Makefile index a20d1d1639..4bf5e499fb 100644 --- a/plugins/amdgpu/Makefile +++ b/plugins/amdgpu/Makefile @@ -27,7 +27,7 @@ endif criu-amdgpu.pb-c.c: criu-amdgpu.proto protoc-c --proto_path=. --c_out=. criu-amdgpu.proto -amdgpu_plugin.so: amdgpu_plugin.c amdgpu_plugin_drm.c amdgpu_plugin_topology.c amdgpu_plugin_util.c criu-amdgpu.pb-c.c +amdgpu_plugin.so: amdgpu_plugin.c amdgpu_plugin_drm.c amdgpu_plugin_topology.c amdgpu_plugin_util.c criu-amdgpu.pb-c.c amdgpu_socket_utils.c $(CC) $(PLUGIN_CFLAGS) $(shell $(COMPEL) includes) $^ -o $@ $(PLUGIN_INCLUDE) $(PLUGIN_LDFLAGS) $(LIBDRM_INC) amdgpu_plugin_clean: diff --git a/plugins/amdgpu/amdgpu_plugin.c b/plugins/amdgpu/amdgpu_plugin.c index 96c0861628..69194fbc79 100644 --- a/plugins/amdgpu/amdgpu_plugin.c +++ b/plugins/amdgpu/amdgpu_plugin.c @@ -28,11 +28,13 @@ #include "xmalloc.h" #include "criu-log.h" #include "files.h" +#include "pstree.h" #include "common/list.h" #include "amdgpu_plugin_drm.h" #include "amdgpu_plugin_util.h" #include "amdgpu_plugin_topology.h" +#include "amdgpu_socket_utils.h" #include "img-streamer.h" #include "image.h" @@ -64,6 +66,18 @@ bool plugin_added_to_inventory = false; bool plugin_disabled = false; +/* + * In the case of a single process (common case), this optimization can effectively + * reduce the restore latency with parallel restore. In the case of multiple processes, + * states are already restored in parallel within different processes. Therefore, this + * optimization does not introduce further improvement and will be disabled by default + * in this case. The flag, parallel_disabled, is used to control whether the + * optimization is enabled or disabled. 
+ */ +bool parallel_disabled = false; + +pthread_t parallel_thread = 0; +int parallel_thread_result = 0; /**************************************************************************************************/ /* Call ioctl, restarting if it is interrupted */ @@ -351,6 +365,15 @@ int amdgpu_plugin_init(int stage) maps_init(&restore_maps); if (stage == CR_PLUGIN_STAGE__RESTORE) { + if (has_children(root_item)) { + pr_info("Parallel restore disabled\n"); + parallel_disabled = true; + } else { + if (install_parallel_sock() < 0) { + pr_err("Failed to install parallel socket\n"); + return -1; + } + } /* Default Values */ kfd_fw_version_check = true; kfd_sdma_fw_version_check = true; @@ -1439,14 +1462,9 @@ static int restore_bos(struct kfd_ioctl_criu_args *args, CriuKfd *e) static int restore_bo_data(int id, struct kfd_criu_bo_bucket *bo_buckets, CriuKfd *e) { - struct thread_data *thread_datas; + struct thread_data *thread_datas = NULL; int thread_i, ret = 0; - - thread_datas = xzalloc(sizeof(*thread_datas) * e->num_of_gpus); - if (!thread_datas) { - ret = -ENOMEM; - goto exit; - } + int offset = 0; for (int i = 0; i < e->num_of_bos; i++) { struct kfd_criu_bo_bucket *bo_bucket = &bo_buckets[i]; @@ -1489,56 +1507,101 @@ static int restore_bo_data(int id, struct kfd_criu_bo_bucket *bo_buckets, CriuKf } } - thread_i = 0; - for (int i = 0; i < e->num_of_gpus + e->num_of_cpus; i++) { - struct tp_node *dev; - int ret_thread = 0; - uint32_t target_gpu_id; + if (!parallel_disabled) { + parallel_restore_cmd restore_cmd; + pr_info("Begin to send parallel restore cmd\n"); + ret = init_parallel_restore_cmd(e->num_of_bos, id, e->num_of_gpus, &restore_cmd); + if (ret) + goto exit_parallel; - if (!e->device_entries[i]->gpu_id) - continue; + for (int i = 0; i < e->num_of_gpus + e->num_of_cpus; i++) { + uint32_t target_gpu_id; + struct tp_node *dev; - /* e->device_entries[i]->gpu_id is user_gpu_id, target_gpu_id is actual_gpu_id */ - target_gpu_id = maps_get_dest_gpu(&restore_maps, 
e->device_entries[i]->gpu_id); + if (!e->device_entries[i]->gpu_id) + continue; - /* We need the fd for actual_gpu_id */ - dev = sys_get_node_by_gpu_id(&dest_topology, target_gpu_id); - if (!dev) { - pr_err("Failed to find node with gpu_id:0x%04x\n", target_gpu_id); - ret = -ENODEV; + target_gpu_id = maps_get_dest_gpu(&restore_maps, e->device_entries[i]->gpu_id); + dev = sys_get_node_by_gpu_id(&dest_topology, target_gpu_id); + if (!dev) { + pr_err("Failed to find node with gpu_id:0x%04x\n", target_gpu_id); + ret = -ENODEV; + goto exit_parallel; + } + parallel_restore_gpu_id_add(e->device_entries[i]->gpu_id, dev->drm_render_minor, &restore_cmd); + + for (int j = 0; j < e->num_of_bos; j++) { + if (bo_buckets[j].gpu_id != e->device_entries[i]->gpu_id) + continue; + if (bo_buckets[j].alloc_flags & + (KFD_IOC_ALLOC_MEM_FLAGS_VRAM | KFD_IOC_ALLOC_MEM_FLAGS_GTT)) { + parallel_restore_bo_add(bo_buckets[j].dmabuf_fd, bo_buckets[j].gpu_id, + bo_buckets[j].size, offset, &restore_cmd); + offset += bo_buckets[j].size; + } + } + } + ret = send_parallel_restore_cmd(&restore_cmd); +exit_parallel: + free_parallel_restore_cmd(&restore_cmd); + } else { + thread_datas = xzalloc(sizeof(*thread_datas) * e->num_of_gpus); + if (!thread_datas) { + ret = -ENOMEM; goto exit; } - thread_datas[thread_i].id = id; - thread_datas[thread_i].gpu_id = e->device_entries[i]->gpu_id; - thread_datas[thread_i].bo_buckets = bo_buckets; - thread_datas[thread_i].bo_entries = e->bo_entries; - thread_datas[thread_i].pid = e->pid; - thread_datas[thread_i].num_of_bos = e->num_of_bos; + thread_i = 0; + for (int i = 0; i < e->num_of_gpus + e->num_of_cpus; i++) { + struct tp_node *dev; + int ret_thread = 0; + uint32_t target_gpu_id; - thread_datas[thread_i].drm_fd = node_get_drm_render_device(dev); - if (thread_datas[thread_i].drm_fd < 0) { - ret = -thread_datas[thread_i].drm_fd; - goto exit; - } + if (!e->device_entries[i]->gpu_id) + continue; - ret_thread = pthread_create(&thread_datas[thread_i].thread, NULL, 
restore_bo_contents, - (void *)&thread_datas[thread_i]); - if (ret_thread) { - pr_err("Failed to create thread[%i] ret:%d\n", thread_i, ret_thread); - ret = -ret_thread; - goto exit; + /* e->device_entries[i]->gpu_id is user_gpu_id, target_gpu_id is actual_gpu_id */ + target_gpu_id = maps_get_dest_gpu(&restore_maps, e->device_entries[i]->gpu_id); + + /* We need the fd for actual_gpu_id */ + dev = sys_get_node_by_gpu_id(&dest_topology, target_gpu_id); + if (!dev) { + pr_err("Failed to find node with gpu_id:0x%04x\n", target_gpu_id); + ret = -ENODEV; + goto exit; + } + + thread_datas[thread_i].id = id; + thread_datas[thread_i].gpu_id = e->device_entries[i]->gpu_id; + thread_datas[thread_i].bo_buckets = bo_buckets; + thread_datas[thread_i].bo_entries = e->bo_entries; + thread_datas[thread_i].pid = e->pid; + thread_datas[thread_i].num_of_bos = e->num_of_bos; + + thread_datas[thread_i].drm_fd = node_get_drm_render_device(dev); + if (thread_datas[thread_i].drm_fd < 0) { + ret = -thread_datas[thread_i].drm_fd; + goto exit; + } + + ret_thread = pthread_create(&thread_datas[thread_i].thread, NULL, restore_bo_contents, + (void *)&thread_datas[thread_i]); + if (ret_thread) { + pr_err("Failed to create thread[%i] ret:%d\n", thread_i, ret_thread); + ret = -ret_thread; + goto exit; + } + thread_i++; } - thread_i++; - } - for (int i = 0; i < e->num_of_gpus; i++) { - pthread_join(thread_datas[i].thread, NULL); - pr_info("Thread[0x%x] finished ret:%d\n", thread_datas[i].gpu_id, thread_datas[i].ret); + for (int i = 0; i < e->num_of_gpus; i++) { + pthread_join(thread_datas[i].thread, NULL); + pr_info("Thread[0x%x] finished ret:%d\n", thread_datas[i].gpu_id, thread_datas[i].ret); - if (thread_datas[i].ret) { - ret = thread_datas[i].ret; - goto exit; + if (thread_datas[i].ret) { + ret = thread_datas[i].ret; + goto exit; + } } } exit: @@ -1546,8 +1609,8 @@ static int restore_bo_data(int id, struct kfd_criu_bo_bucket *bo_buckets, CriuKf if (bo_buckets[i].dmabuf_fd != KFD_INVALID_FD) 
close(bo_buckets[i].dmabuf_fd); } - - xfree(thread_datas); + if (thread_datas) + xfree(thread_datas); return ret; } @@ -1836,6 +1899,24 @@ int amdgpu_plugin_resume_devices_late(int target_pid) if (plugin_disabled) return -ENOTSUP; + if (!parallel_disabled) { + pr_info("Close parallel restore server\n"); + if (close_parallel_restore_server()) { + pr_err("Close parallel restore server fail\n"); + return -1; + } + + exit_code = pthread_join(parallel_thread, NULL); + if (exit_code) { + pr_err("Failed to join parallel thread ret:%d\n", exit_code); + return -1; + } + if (parallel_thread_result) { + pr_err("Parallel restore fail\n"); + return parallel_thread_result; + } + } + pr_info("Inside %s for target pid = %d\n", __func__, target_pid); fd = open(AMDGPU_KFD_DEVICE, O_RDWR | O_CLOEXEC); @@ -1862,3 +1943,244 @@ int amdgpu_plugin_resume_devices_late(int target_pid) } CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__RESUME_DEVICES_LATE, amdgpu_plugin_resume_devices_late) + +int sdma_copy_bo_helper(uint64_t size, int fd, FILE *storage_fp, void *buffer, size_t buffer_size, + amdgpu_device_handle h_dev, uint64_t max_copy_size, enum sdma_op_type type) +{ + return sdma_copy_bo((struct kfd_criu_bo_bucket){ 0, size, 0, 0, 0, 0, fd, 0 }, storage_fp, buffer, + buffer_size, h_dev, max_copy_size, SDMA_OP_VRAM_WRITE); +} + +int init_dev(int dev_minor, amdgpu_device_handle *h_dev, uint64_t *max_copy_size) +{ + int ret = 0; + int drm_fd = -1; + uint32_t major, minor; + + struct amdgpu_gpu_info gpu_info = { 0 }; + + drm_fd = open_drm_render_device(dev_minor); + if (drm_fd < 0) { + return drm_fd; + } + + ret = amdgpu_device_initialize(drm_fd, &major, &minor, h_dev); + if (ret) { + pr_perror("Failed to initialize device"); + goto err; + } + + ret = amdgpu_query_gpu_info(*h_dev, &gpu_info); + if (ret) { + pr_perror("failed to query gpuinfo via libdrm"); + goto err; + } + *max_copy_size = (gpu_info.family_id >= AMDGPU_FAMILY_AI) ? 
SDMA_LINEAR_COPY_MAX_SIZE : + SDMA_LINEAR_COPY_MAX_SIZE - 1; + return 0; +err: + amdgpu_device_deinitialize(*h_dev); + return ret; +} + +FILE *get_bo_contents_fp(int id, int gpu_id, size_t tot_size) +{ + char img_path[PATH_MAX]; + size_t image_size = 0; + FILE *bo_contents_fp = NULL; + + snprintf(img_path, sizeof(img_path), IMG_KFD_PAGES_FILE, id, gpu_id); + bo_contents_fp = open_img_file(img_path, false, &image_size); + if (!bo_contents_fp) { + pr_perror("Cannot fopen %s", img_path); + return NULL; + } + + if (tot_size != image_size) { + pr_err("%s size mismatch (current:%ld:expected:%ld)\n", img_path, image_size, tot_size); + fclose(bo_contents_fp); + return NULL; + } + return bo_contents_fp; +} + +struct parallel_thread_data { + pthread_t thread; + uint32_t gpu_id; + int minor; + parallel_restore_cmd *restore_cmd; + int ret; +}; + +void *parallel_restore_bo_contents(void *_thread_data) +{ + struct parallel_thread_data *thread_data = (struct parallel_thread_data *)_thread_data; + amdgpu_device_handle h_dev; + uint64_t max_copy_size; + size_t total_bo_size = 0, max_bo_size = 0, buffer_size = 0; + FILE *bo_contents_fp = NULL; + parallel_restore_entry *entry; + parallel_restore_cmd *restore_cmd = thread_data->restore_cmd; + int ret = 0; + int offset = 0; + void *buffer = NULL; + + ret = init_dev(thread_data->minor, &h_dev, &max_copy_size); + if (ret) { + goto err; + } + + for (int i = 0; i < restore_cmd->cmd_head.entry_num; i++) { + if (restore_cmd->entries[i].gpu_id == thread_data->gpu_id) { + total_bo_size += restore_cmd->entries[i].size; + max_bo_size = max(restore_cmd->entries[i].size, max_bo_size); + } + } + + buffer_size = kfd_max_buffer_size > 0 ? 
min(kfd_max_buffer_size, max_bo_size) : max_bo_size; + + bo_contents_fp = get_bo_contents_fp(restore_cmd->cmd_head.id, thread_data->gpu_id, total_bo_size); + if (bo_contents_fp == NULL) { + ret = -1; + goto err_sdma; + } + offset = ftell(bo_contents_fp); + + posix_memalign(&buffer, sysconf(_SC_PAGE_SIZE), buffer_size); + if (!buffer) { + pr_perror("Failed to alloc aligned memory. Consider setting KFD_MAX_BUFFER_SIZE."); + ret = -ENOMEM; + goto err_sdma; + } + + for (int i = 0; i < restore_cmd->cmd_head.entry_num; i++) { + if (restore_cmd->entries[i].gpu_id != thread_data->gpu_id) + continue; + + entry = &restore_cmd->entries[i]; + fseek(bo_contents_fp, entry->read_offset + offset, SEEK_SET); + ret = sdma_copy_bo_helper(entry->size, restore_cmd->fds_write[entry->write_id], bo_contents_fp, buffer, + buffer_size, h_dev, max_copy_size, SDMA_OP_VRAM_WRITE); + if (ret) { + pr_err("Failed to fill the BO using sDMA: bo_buckets[%d]\n", i); + goto err_sdma; + } + } + +err_sdma: + if (bo_contents_fp) + fclose(bo_contents_fp); + if (buffer) + xfree(buffer); + amdgpu_device_deinitialize(h_dev); +err: + thread_data->ret = ret; + return NULL; +} + +void *restore_device_parallel_worker(void *arg) +{ + while (1) { + parallel_restore_cmd restore_cmd = { 0 }; + struct parallel_thread_data *thread_datas = NULL; + int ret; + int error_occurred = 0, join_ret = 0, created_threads = 0; + + ret = recv_parallel_restore_cmd(&restore_cmd); + if (ret) { + if (ret == 1) { + *(int *)arg = 0; + goto exit; + } + goto err; + } + + thread_datas = xzalloc(sizeof(*thread_datas) * restore_cmd.cmd_head.gpu_num); + if (!thread_datas) { + ret = -ENOMEM; + goto err; + } + + for (; created_threads < restore_cmd.cmd_head.gpu_num; created_threads++) { + thread_datas[created_threads].gpu_id = restore_cmd.gpu_ids[created_threads].gpu_id; + thread_datas[created_threads].minor = restore_cmd.gpu_ids[created_threads].minor; + thread_datas[created_threads].restore_cmd = &restore_cmd; + + ret = 
pthread_create(&thread_datas[created_threads].thread, NULL, parallel_restore_bo_contents, + (void *)&thread_datas[created_threads]); + if (ret) { + pr_err("Failed to create thread[0x%x] ret:%d\n", thread_datas[created_threads].gpu_id, ret); + error_occurred = 1; + break; + } + } + + for (int i = 0; i < created_threads; i++) { + join_ret = pthread_join(thread_datas[i].thread, NULL); + if (join_ret != 0) { + pr_err("pthread_join failed for Thread[0x%x] ret:%d\n", + thread_datas[i].gpu_id, join_ret); + if (!error_occurred) { + ret = join_ret; + error_occurred = 1; + } + } + + pr_info("Thread[0x%x] finished ret:%d\n", thread_datas[i].gpu_id, thread_datas[i].ret); + + /* Check thread return value */ + if (thread_datas[i].ret && !error_occurred) { + ret = thread_datas[i].ret; + error_occurred = 1; + } + } + + if (thread_datas) + xfree(thread_datas); +err: + free_parallel_restore_cmd(&restore_cmd); + + if (ret) { + *(int *)arg = ret; + return NULL; + } + } +exit: + return NULL; +} + +/* + * While the background thread is running, some processing functions (e.g., stop_cgroupd) + * in the main thread need to block SIGCHLD. To prevent interference from this background + * thread, SIGCHLD is blocked in this thread. 
+ */ +static int back_thread_create(pthread_t *newthread, void *(*f)(void *), void *arg) +{ + int ret = 0; + sigset_t blockmask, oldmask; + + sigemptyset(&blockmask); + sigaddset(&blockmask, SIGCHLD); + sigprocmask(SIG_BLOCK, &blockmask, &oldmask); + + ret = pthread_create(newthread, NULL, f, arg); + if (ret) { + pr_err("Create worker thread fail: %d\n", ret); + return -1; + } + + sigprocmask(SIG_SETMASK, &oldmask, NULL); + return 0; +} + +int amdgpu_plugin_post_forking(void) +{ + if (plugin_disabled) + return -ENOTSUP; + + if (parallel_disabled) + return 0; + + return back_thread_create(&parallel_thread, restore_device_parallel_worker, &parallel_thread_result); +} +CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__POST_FORKING, amdgpu_plugin_post_forking) \ No newline at end of file diff --git a/plugins/amdgpu/amdgpu_plugin_topology.c b/plugins/amdgpu/amdgpu_plugin_topology.c index 5b4396a0cc..730f2e0284 100644 --- a/plugins/amdgpu/amdgpu_plugin_topology.c +++ b/plugins/amdgpu/amdgpu_plugin_topology.c @@ -45,7 +45,7 @@ bool kfd_capability_check = true; */ int fd_next = -1; -static int open_drm_render_device(int minor) +int open_drm_render_device(int minor) { char path[128]; int fd, ret_fd; diff --git a/plugins/amdgpu/amdgpu_plugin_topology.h b/plugins/amdgpu/amdgpu_plugin_topology.h index c890e3ddae..e19f8e7ce9 100644 --- a/plugins/amdgpu/amdgpu_plugin_topology.h +++ b/plugins/amdgpu/amdgpu_plugin_topology.h @@ -118,6 +118,7 @@ struct tp_node *sys_get_node_by_gpu_id(const struct tp_system *sys, const uint32 struct tp_node *sys_get_node_by_render_minor(const struct tp_system *sys, const int drm_render_minor); struct tp_node *sys_get_node_by_index(const struct tp_system *sys, uint32_t index); +int open_drm_render_device(int minor); int node_get_drm_render_device(struct tp_node *node); void sys_close_drm_render_devices(struct tp_system *sys); From 7c4bcdb2d4308047da34ce786885d45a08b86c80 Mon Sep 17 00:00:00 2001 From: Yanning Yang Date: Tue, 10 Dec 2024 12:44:35 +0000 
Subject: [PATCH
114/198] plugins/amdgpu: Update `README.md` and `criu-amdgpu-plugin.txt` Signed-off-by: Yanning Yang --- Documentation/criu-amdgpu-plugin.txt | 1 + plugins/amdgpu/README.md | 23 ++++++++++++++++++++++- 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/Documentation/criu-amdgpu-plugin.txt b/Documentation/criu-amdgpu-plugin.txt index 68803f3dbc..fe76fc3bc6 100644 --- a/Documentation/criu-amdgpu-plugin.txt +++ b/Documentation/criu-amdgpu-plugin.txt @@ -15,6 +15,7 @@ Checkpoint / Restore inside a docker container Pytorch Tensorflow Using CRIU Image Streamer +Parallel Restore DESCRIPTION ----------- diff --git a/plugins/amdgpu/README.md b/plugins/amdgpu/README.md index 1078eafe6f..b808fbc4f0 100644 --- a/plugins/amdgpu/README.md +++ b/plugins/amdgpu/README.md @@ -3,7 +3,8 @@ Supporting ROCm with CRIU _Felix Kuehling _
_Rajneesh Bardwaj _
-_David Yat Sin _ +_David Yat Sin _
+_Yanning Yang _ # Introduction @@ -224,6 +225,26 @@ to resume execution on the GPUs. *This new plugin is enabled by the new hook `__RESUME_DEVICES_LATE` in our RFC patch series.* +## Restoring BO content in parallel + +Restoring the BO content is an important part in the restore of GPU state and +usually takes a significant amount of time. A possible location for this +procedure is the `cr_plugin_restore_file` hook. However, restoring in this hook +blocks the target process from performing other restore operations, which +hinders further optimization of the restore process. + +Therefore, a new plugin hook that runs in the master restore process is +introduced, and it interacts with the `cr_plugin_restore_file` hook to complete +the restore of BO content. Specifically, the target process only needs to send +the relevant BOs to the master restore process, while this new hook handles all +the restore of buffer objects. Through this method, during the restore of the BO +content, the target process can perform other restore operations, thus +accelerating the restore procedure. This is an implementation of the gCROP +method proposed in the ACM SoCC'24 paper: [On-demand and Parallel +Checkpoint/Restore for GPU Applications](https://dl.acm.org/doi/10.1145/3698038.3698510). + +*This optimization technique is enabled by the `__POST_FORKING` hook.* + ## Other CRIU changes In addition to the new plugins, we need to make some changes to CRIU itself to From 2da6a6faffe643f859033c74ed71124760e32157 Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Wed, 30 Apr 2025 11:39:18 +0800 Subject: [PATCH 115/198] zdtm.py: add an option to change pycriu import path By default zdtm expects that criu is built from source first and only then you can run zdtm tests against it. But what if you really want to run tests against a criu version installed on the system? 
Yes there is already a nice option for zdtm to change the criu binary it uses "--criu-bin", but it would still end up using the pycriu module from source and you would still have to build everything beforehand. Let's add an option to change the path where zdtm searches for pycriu module "--pycriu-search-path". This way we can run zdtm tests on the criu installed on the system directly without building criu from source, e.g. on Fedora it works like: test/zdtm.py run --criu-bin /usr/sbin/criu \ --pycriu-search-path /usr/lib/python3.13/site-packages \ -t zdtm/static/env00 Signed-off-by: Pavel Tikhomirov --- test/zdtm.py | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/test/zdtm.py b/test/zdtm.py index e3ddc762a3..d5514af712 100755 --- a/test/zdtm.py +++ b/test/zdtm.py @@ -22,11 +22,11 @@ import tempfile import time import uuid +import site from builtins import input, int, open, range, str, zip import yaml -import pycriu as crpc from zdtm.criu_config import criu_config # File to store content of streamed images @@ -1142,6 +1142,24 @@ def __init__(self, opts): self.__img_streamer_process = None self.__tls = self.__tls_options() if opts['tls'] else [] self.__criu_bin = opts['criu_bin'] + + global crpc + pycriu_search_path = opts.get('pycriu_search_path') + if pycriu_search_path: + sys.path.insert(0, pycriu_search_path) + + try: + import pycriu as crpc + if pycriu_search_path: + print(f"pycriu loaded from: {crpc.__file__}") + except ImportError: + if not pycriu_search_path: + print("Consider building CRIU or using '--pycriu-search-path' option.") + raise + finally: + if pycriu_search_path: + sys.path.pop(0) + self.__crit_bin = opts['crit_bin'] self.__pre_dump_mode = opts['pre_dump_mode'] self.__preload_libfault = bool(opts['preload_libfault']) @@ -2169,7 +2187,8 @@ def run_test(self, name, desc, flavor): 'dedup', 'sbs', 'freezecg', 'user', 'dry_run', 'noauto_dedup', 'remote_lazy_pages', 'show_stats', 'lazy_migrate', 'stream', 
'tls', 'criu_bin', 'crit_bin', 'pre_dump_mode', 'mntns_compat_mode', - 'rootless', 'preload_libfault', 'mocked_cuda_checkpoint') + 'rootless', 'preload_libfault', 'mocked_cuda_checkpoint', + 'pycriu_search_path') arg = repr((name, desc, flavor, {d: self.__opts[d] for d in nd})) if self.__use_log: @@ -2860,6 +2879,9 @@ def get_cli_args(): rp.add_argument("--criu-bin", help="Path to criu binary", default='../criu/criu') + rp.add_argument("--pycriu-search-path", + help=f"Path to search for pycriu module first (e.g., {site.getsitepackages()[0]})", + default=None) rp.add_argument("--crit-bin", help="Path to crit binary", default='../crit/crit') From 0d15e2f4d996658ea77630be596404f6fb4417d1 Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Mon, 19 May 2025 11:53:18 +0800 Subject: [PATCH 116/198] zdtm: fix check for criu binary The opts['action'] contains actor function and not the action name, so we should compare it with a function. While on it let's also add a comment about --criu-bin option if CRIU binary is missing. Signed-off-by: Pavel Tikhomirov --- test/zdtm.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/zdtm.py b/test/zdtm.py index d5514af712..3339dd8167 100755 --- a/test/zdtm.py +++ b/test/zdtm.py @@ -1611,6 +1611,7 @@ def check(feature): def available(): if not os.access(opts['criu_bin'], os.X_OK): print("CRIU binary not found at %s" % opts['criu_bin']) + print("Consider building CRIU or using '--criu-bin' option.") sys.exit(1) def kill(self): @@ -2972,7 +2973,7 @@ def fork_zdtm(): if opts['debug']: sys.settrace(traceit) - if opts['action'] == 'run': + if opts['action'] == run_tests: criu.available() for tst in test_classes.values(): tst.available() From 3a3a3f0f27143763f499b04eca65601ff913d6f5 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Fri, 16 May 2025 19:26:01 +0000 Subject: [PATCH 117/198] image: use `protoc` instead of `protoc-c` The new protoc 1.5.2 reports warnings: `protoc-c` is deprecated. 
Please use `protoc` instead! Signed-off-by: Andrei Vagin --- images/Makefile | 4 ++-- plugins/amdgpu/Makefile | 2 +- test/others/rpc/Makefile | 2 +- test/others/unix-callback/Makefile | 2 +- test/zdtm/static/Makefile | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/images/Makefile b/images/Makefile index 1e40b8a8f0..d966fbfca0 100644 --- a/images/Makefile +++ b/images/Makefile @@ -58,7 +58,7 @@ proto-obj-y += ext-file.o proto-obj-y += cgroup.o proto-obj-y += userns.o proto-obj-y += pidns.o -proto-obj-y += google/protobuf/descriptor.o # To make protoc-c happy and compile opts.proto +proto-obj-y += google/protobuf/descriptor.o # To make protoc happy and compile opts.proto proto-obj-y += opts.o proto-obj-y += seccomp.o proto-obj-y += binfmt-misc.o @@ -96,7 +96,7 @@ makefile-deps := Makefile $(obj)/Makefile define gen-proto-rules $(obj)/$(1).pb-c.c $(obj)/$(1).pb-c.h: $(obj)/$(1).proto $(addsuffix .pb-c.c,$(addprefix $(obj)/,$(2))) $(makefile-deps) $$(E) " PBCC " $$@ - $$(Q) protoc-c --proto_path=$(obj)/ --c_out=$(obj)/ $$< + $$(Q) protoc --proto_path=$(obj)/ --c_out=$(obj)/ $$< ifeq ($(PROTOUFIX),y) $$(Q) sed -i -e 's/4294967295/0xFFFFFFFF/g' $$@ $$(Q) sed -i -e 's/4294967295/0xFFFFFFFF/g' $$(patsubst %.c,%.h,$$@) diff --git a/plugins/amdgpu/Makefile b/plugins/amdgpu/Makefile index 4bf5e499fb..870a039cdb 100644 --- a/plugins/amdgpu/Makefile +++ b/plugins/amdgpu/Makefile @@ -25,7 +25,7 @@ else endif criu-amdgpu.pb-c.c: criu-amdgpu.proto - protoc-c --proto_path=. --c_out=. criu-amdgpu.proto + protoc --proto_path=. --c_out=. 
criu-amdgpu.proto amdgpu_plugin.so: amdgpu_plugin.c amdgpu_plugin_drm.c amdgpu_plugin_topology.c amdgpu_plugin_util.c criu-amdgpu.pb-c.c amdgpu_socket_utils.c $(CC) $(PLUGIN_CFLAGS) $(shell $(COMPEL) includes) $^ -o $@ $(PLUGIN_INCLUDE) $(PLUGIN_LDFLAGS) $(LIBDRM_INC) diff --git a/test/others/rpc/Makefile b/test/others/rpc/Makefile index b2f907abee..384eb05397 100644 --- a/test/others/rpc/Makefile +++ b/test/others/rpc/Makefile @@ -47,7 +47,7 @@ rpc_pb2.py: rpc.proto protoc --proto_path=. --python_out=. rpc.proto rpc.pb-c.c: rpc.proto - protoc-c --proto_path=. --c_out=. rpc.proto + protoc --proto_path=. --c_out=. rpc.proto clean: rm -rf build rpc.pb-c.o test-c.o test-c rpc.pb-c.c rpc.pb-c.h rpc_pb2.py rpc_pb2.pyc criu diff --git a/test/others/unix-callback/Makefile b/test/others/unix-callback/Makefile index 25bcf228b3..9840440773 100644 --- a/test/others/unix-callback/Makefile +++ b/test/others/unix-callback/Makefile @@ -4,7 +4,7 @@ run: all ./run.sh unix.pb-c.c: unix.proto - protoc-c --proto_path=. --c_out=. unix.proto + protoc --proto_path=. --c_out=. unix.proto unix-lib.so: unix-lib.c unix.pb-c.c gcc -g -Werror -Wall -shared -nostartfiles unix-lib.c unix.pb-c.c -o unix-lib.so -iquote ../../../criu/include -fPIC diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index 81e44de221..61cacbb4eb 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -734,7 +734,7 @@ criu-rtc.pb-c.c: criu-rtc.proto $(Q)echo $@ >> .gitignore $(Q)echo $(@:%.c=%.h) >> .gitignore $(E) " PBCC " $@ - $(Q)protoc-c --proto_path=. --c_out=. criu-rtc.proto + $(Q)protoc --proto_path=. --c_out=. 
criu-rtc.proto criu-rtc.so: criu-rtc.c criu-rtc.pb-c.c $(E) " LD " $@ From 777c0a7632e7678cd1a7b68c3083a2b752462e5a Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Sun, 4 May 2025 16:27:32 +0200 Subject: [PATCH 118/198] criu/proc_parse: support MADV_WIPEONFORK/VM_WIPEONFORK Support VM_WIPEONFORK [1] by detecting it from /proc/<pid>/smaps and setting a corresponding MADV_WIPEONFORK flag on vma. [1] https://github.com/torvalds/linux/commit/d2cd9ede6e193dd7d88b6d27399e96229a551b19 Signed-off-by: Alexander Mikhalitsyn --- criu/include/mman.h | 3 +++ criu/proc_parse.c | 2 ++ 2 files changed, 5 insertions(+) diff --git a/criu/include/mman.h b/criu/include/mman.h index 8ca71fadf9..a553564909 100644 --- a/criu/include/mman.h +++ b/criu/include/mman.h @@ -13,5 +13,8 @@ #ifndef MADV_DONTDUMP #define MADV_DONTDUMP 16 #endif +#ifndef MADV_WIPEONFORK +#define MADV_WIPEONFORK 18 +#endif #endif /* __CR_MMAN_H__ */ diff --git a/criu/proc_parse.c b/criu/proc_parse.c index 0fa9b7ba56..bc13398580 100644 --- a/criu/proc_parse.c +++ b/criu/proc_parse.c @@ -160,6 +160,8 @@ static void __parse_vmflags(char *buf, u32 *flags, u64 *madv, int *io_pf, *madv |= (1ul << MADV_HUGEPAGE); else if (_vmflag_match(tok, "nh")) *madv |= (1ul << MADV_NOHUGEPAGE); + else if (_vmflag_match(tok, "wf")) + *madv |= (1ul << MADV_WIPEONFORK); /* vmsplice doesn't work for VM_IO and VM_PFNMAP mappings. */ if (_vmflag_match(tok, "io") || _vmflag_match(tok, "pf")) From 237f099b292c3d193af3d8e47963b38153cee297 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Sun, 4 May 2025 16:32:01 +0200 Subject: [PATCH 119/198] test/zdtm/static/maps02: add MADV_WIPEONFORK testcase In addition to that I did small non-functional corrections.
Signed-off-by: Alexander Mikhalitsyn --- test/zdtm/static/get_smaps_bits.c | 6 ++++++ test/zdtm/static/maps02.c | 16 ++++++++++++---- 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/test/zdtm/static/get_smaps_bits.c b/test/zdtm/static/get_smaps_bits.c index 31d0d92b2f..d9ac8b1ce7 100644 --- a/test/zdtm/static/get_smaps_bits.c +++ b/test/zdtm/static/get_smaps_bits.c @@ -18,6 +18,10 @@ #define MADV_DONTDUMP 16 #endif +#ifndef MADV_WIPEONFORK +#define MADV_WIPEONFORK 18 +#endif + static void parse_vmflags(char *buf, unsigned long *flags, unsigned long *madv) { char *tok; @@ -57,6 +61,8 @@ static void parse_vmflags(char *buf, unsigned long *flags, unsigned long *madv) *madv |= (1ul << MADV_HUGEPAGE); else if (_vmflag_match(tok, "nh")) *madv |= (1ul << MADV_NOHUGEPAGE); + else if (_vmflag_match(tok, "wf")) + *madv |= (1ul << MADV_WIPEONFORK); /* * Anything else is just ignored. diff --git a/test/zdtm/static/maps02.c b/test/zdtm/static/maps02.c index 29f1372c9a..37c09dc71c 100644 --- a/test/zdtm/static/maps02.c +++ b/test/zdtm/static/maps02.c @@ -6,7 +6,11 @@ #define MADV_DONTDUMP 16 #endif -const char *test_doc = "Test shared memory with advises"; +#ifndef MADV_WIPEONFORK +#define MADV_WIPEONFORK 18 +#endif + +const char *test_doc = "Test private memory with advises"; const char *test_author = "Cyrill Gorcunov "; struct mmap_data { @@ -43,12 +47,12 @@ static int alloc_anon_mmap(struct mmap_data *m, int flags, int adv) int main(int argc, char **argv) { - struct mmap_data m[5] = {}; + struct mmap_data m[6] = {}; size_t i; test_init(argc, argv); - test_msg("Alloc growsdown\n"); + test_msg("Alloc dontfork\n"); if (alloc_anon_mmap(&m[0], MAP_PRIVATE | MAP_ANONYMOUS, MADV_DONTFORK)) return -1; @@ -64,10 +68,14 @@ int main(int argc, char **argv) if (alloc_anon_mmap(&m[3], MAP_PRIVATE | MAP_ANONYMOUS, MADV_HUGEPAGE)) return -1; - test_msg("Alloc dontfork/random|mergeable\n"); + test_msg("Alloc mergeable\n"); if (alloc_anon_mmap(&m[4], MAP_PRIVATE | MAP_ANONYMOUS, 
MADV_MERGEABLE)) return -1; + test_msg("Alloc wipeonfork\n"); + if (alloc_anon_mmap(&m[5], MAP_PRIVATE | MAP_ANONYMOUS, MADV_WIPEONFORK)) + return -1; + test_msg("Fetch existing flags/adv\n"); for (i = 0; i < sizeof(m) / sizeof(m[0]); i++) { if (get_smaps_bits((unsigned long)m[i].start, &m[i].orig_flags, &m[i].orig_madv)) From 708228f5b84b4225a8bdb8becc6157362678a82f Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Sun, 4 May 2025 17:11:28 +0200 Subject: [PATCH 120/198] criu/proc_parse: support MAP_DROPPABLE mappings Support MAP_DROPPABLE [1] by detecting it from /proc/<pid>/smaps and restoring it as a normal private mapping flag on vma with the only difference that instead of MAP_PRIVATE we should use MAP_DROPPABLE. [1] https://github.com/torvalds/linux/commit/9651fcedf7b92d3f7f1ab179e8ab55b85ee10fc1 Signed-off-by: Alexander Mikhalitsyn --- criu/include/mman.h | 3 +++ criu/mem.c | 12 ++++++++++++ criu/proc_parse.c | 16 ++++++++++++++++ 3 files changed, 31 insertions(+) diff --git a/criu/include/mman.h b/criu/include/mman.h index a553564909..086753bcf5 100644 --- a/criu/include/mman.h +++ b/criu/include/mman.h @@ -4,6 +4,9 @@ #ifndef MAP_HUGETLB #define MAP_HUGETLB 0x40000 #endif +#ifndef MAP_DROPPABLE +#define MAP_DROPPABLE 0x08 +#endif #ifndef MADV_HUGEPAGE #define MADV_HUGEPAGE 14 #endif diff --git a/criu/mem.c b/criu/mem.c index c9578ef441..803cb545b5 100644 --- a/criu/mem.c +++ b/criu/mem.c @@ -10,6 +10,7 @@ #include "cr_options.h" #include "servicefd.h" #include "mem.h" +#include "mman.h" #include "parasite-syscall.h" #include "parasite.h" #include "page-pipe.h" @@ -398,6 +399,17 @@ static int generate_vma_iovs(struct pstree_item *item, struct vma_area *vma, str if (vma_entry_is(vma->e, VMA_AREA_VVAR)) return 0; + /* + * 9651fcedf7b9 ("mm: add MAP_DROPPABLE for designating always lazily freeable mappings") + * tells us that: + * Under memory pressure, mm can just drop the pages (so that they're + * zero when read back again).
+ * + * Let's just skip MAP_DROPPABLE mappings pages dump logic. + */ + if (vma->e->flags & MAP_DROPPABLE) + return 0; + /* * To facilitate any combination of pre-dump modes to run after * one another, we need to take extra care as discussed below. diff --git a/criu/proc_parse.c b/criu/proc_parse.c index bc13398580..bb642648e9 100644 --- a/criu/proc_parse.c +++ b/criu/proc_parse.c @@ -144,6 +144,8 @@ static void __parse_vmflags(char *buf, u32 *flags, u64 *madv, int *io_pf, *flags |= MAP_NORESERVE; else if (_vmflag_match(tok, "ht")) *flags |= MAP_HUGETLB; + else if (_vmflag_match(tok, "dp")) + *flags |= MAP_DROPPABLE; /* madvise() block */ if (_vmflag_match(tok, "sr")) @@ -206,6 +208,20 @@ static void parse_vma_vmflags(char *buf, struct vma_area *vma_area) if (vma_area->e->madv) vma_area->e->has_madv = true; + + /* + * We set MAP_PRIVATE flag on vma_area->e->flags right after parsing + * a first line of VMA entry in /proc/<pid>/smaps file: + * 7fa84fa70000-7fa84fa95000 rw-p 00000000 00:00 0 + * but it's too early and we can't distinguish between MAP_DROPPABLE + * and MAP_PRIVATE mappings yet, as they are both private mappings in nature + * and at this point we haven't yet read "VmFlags:" line in smaps. + * + * Let's detect this situation and drop MAP_PRIVATE flag while keeping + * MAP_DROPPABLE, otherwise restorer's restore_mapping() helper will fail.
+ */ + if ((vma_area->e->flags & MAP_PRIVATE) && (vma_area->e->flags & MAP_DROPPABLE)) + vma_area->e->flags &= ~MAP_PRIVATE; } static inline int is_anon_shmem_map(dev_t dev) From ae9d0d0557e1f8a67c3925a454ed13039b38b1c9 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Sun, 4 May 2025 18:55:46 +0200 Subject: [PATCH 121/198] pycriu/images/pb2dict: add MAP_DROPPABLE flag Signed-off-by: Alexander Mikhalitsyn --- lib/pycriu/images/pb2dict.py | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/pycriu/images/pb2dict.py b/lib/pycriu/images/pb2dict.py index e3dd95ac0a..6c4f688896 100644 --- a/lib/pycriu/images/pb2dict.py +++ b/lib/pycriu/images/pb2dict.py @@ -83,6 +83,7 @@ def _custom_conv(field): mmap_flags_map = [ ('MAP_SHARED', 0x1), ('MAP_PRIVATE', 0x2), + ('MAP_DROPPABLE', 0x08), ('MAP_ANON', 0x20), ('MAP_GROWSDOWN', 0x0100), ] From 27653caa4c598e71fc09a7e0a3e71a94d013e45e Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Sun, 4 May 2025 17:21:23 +0200 Subject: [PATCH 122/198] test/zdtm/static/maps02: add MAP_DROPPABLE testcase Signed-off-by: Alexander Mikhalitsyn --- test/zdtm/static/get_smaps_bits.c | 6 ++++++ test/zdtm/static/maps02.c | 20 +++++++++++++++++--- 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/test/zdtm/static/get_smaps_bits.c b/test/zdtm/static/get_smaps_bits.c index d9ac8b1ce7..3d952ac95d 100644 --- a/test/zdtm/static/get_smaps_bits.c +++ b/test/zdtm/static/get_smaps_bits.c @@ -6,6 +6,10 @@ #define MAP_HUGETLB 0x40000 #endif +#ifndef MAP_DROPPABLE +#define MAP_DROPPABLE 0x08 +#endif + #ifndef MADV_HUGEPAGE #define MADV_HUGEPAGE 14 #endif @@ -45,6 +49,8 @@ static void parse_vmflags(char *buf, unsigned long *flags, unsigned long *madv) *flags |= MAP_NORESERVE; else if (_vmflag_match(tok, "ht")) *flags |= MAP_HUGETLB; + else if (_vmflag_match(tok, "dp")) + *flags |= MAP_DROPPABLE; /* madvise() block */ if (_vmflag_match(tok, "sr")) diff --git a/test/zdtm/static/maps02.c b/test/zdtm/static/maps02.c index 
37c09dc71c..38244f0205 100644 --- a/test/zdtm/static/maps02.c +++ b/test/zdtm/static/maps02.c @@ -2,6 +2,10 @@ #include "zdtmtst.h" #include "get_smaps_bits.h" +#ifndef MAP_DROPPABLE +#define MAP_DROPPABLE 0x08 +#endif + #ifndef MADV_DONTDUMP #define MADV_DONTDUMP 16 #endif @@ -27,8 +31,14 @@ static int alloc_anon_mmap(struct mmap_data *m, int flags, int adv) { m->start = mmap(NULL, MEM_SIZE, PROT_READ | PROT_WRITE, flags, -1, 0); if (m->start == MAP_FAILED) { - pr_perror("mmap failed"); - return -1; + if (errno == EINVAL) { + test_msg("mmap failed, no kernel support\n"); + *m = (struct mmap_data){}; + return 0; + } else { + pr_perror("mmap failed"); + return -1; + } } if (madvise(m->start, MEM_SIZE, adv)) { @@ -47,7 +57,7 @@ static int alloc_anon_mmap(struct mmap_data *m, int flags, int adv) int main(int argc, char **argv) { - struct mmap_data m[6] = {}; + struct mmap_data m[7] = {}; size_t i; test_init(argc, argv); @@ -76,6 +86,10 @@ int main(int argc, char **argv) if (alloc_anon_mmap(&m[5], MAP_PRIVATE | MAP_ANONYMOUS, MADV_WIPEONFORK)) return -1; + test_msg("Alloc droppable\n"); + if (alloc_anon_mmap(&m[6], MAP_DROPPABLE | MAP_ANONYMOUS, MADV_NORMAL)) + return -1; + test_msg("Fetch existing flags/adv\n"); for (i = 0; i < sizeof(m) / sizeof(m[0]); i++) { if (get_smaps_bits((unsigned long)m[i].start, &m[i].orig_flags, &m[i].orig_madv)) From 0dd7c5a87a6eb033bdbe3f4fcff20d545f39311f Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Sun, 4 May 2025 21:11:29 +0200 Subject: [PATCH 123/198] test/zdtm/static: add maps11 test for MAP_DROPPABLE/MADV_WIPEONFORK In this test we want to ensure that contents of droppable mappings and mappings with MADV_WIPEONFORK is properly restored in parent/child processes. 
Signed-off-by: Alexander Mikhalitsyn --- test/zdtm/static/Makefile | 1 + test/zdtm/static/maps11.c | 205 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 206 insertions(+) create mode 100644 test/zdtm/static/maps11.c diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index 61cacbb4eb..34fc90513a 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -150,6 +150,7 @@ TST_NOFILE := \ maps05 \ maps09 \ maps10 \ + maps11 \ mlock_setuid \ xids00 \ groups \ diff --git a/test/zdtm/static/maps11.c b/test/zdtm/static/maps11.c new file mode 100644 index 0000000000..df309714b0 --- /dev/null +++ b/test/zdtm/static/maps11.c @@ -0,0 +1,205 @@ +#include +#include +#include +#include +#include +#include +#include "zdtmtst.h" + +#ifndef MAP_DROPPABLE +#define MAP_DROPPABLE 0x08 +#endif + +#ifndef MADV_WIPEONFORK +#define MADV_WIPEONFORK 18 +#endif + +const char *test_doc = "Test MAP_DROPPABLE/MADV_WIPEONFORK mappings with 2 processes"; +const char *test_author = "Alexander Mikhalitsyn "; + +bool mem_is_zero(const uint8_t *buffer, size_t length) +{ + size_t i; + + for (i = 0; i < length; i++) + if (buffer[i] != 0) + return false; + + return true; +} + +int main(int argc, char **argv) +{ + uint8_t *p1, *p2; + pid_t pid; + int status; + const char data[] = "MADV_WIPEONFORK vma data"; + bool criu_was_there = false; + struct stat st1, st2; + + test_init(argc, argv); + + p1 = mmap(NULL, sizeof(data), PROT_READ | PROT_WRITE, + MAP_DROPPABLE | MAP_ANONYMOUS, 0, 0); + if (p1 == MAP_FAILED) { + if (errno == EINVAL) { + skip("mmap failed, no kernel support for MAP_DROPPABLE\n"); + goto skip; + } else { + pr_perror("mmap failed"); + return -1; + } + } + + p2 = mmap(NULL, sizeof(data), PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, 0, 0); + if (p2 == MAP_FAILED) { + pr_perror("mmap failed"); + return 1; + } + + if (madvise(p2, sizeof(data), MADV_WIPEONFORK)) { + pr_perror("madvise failed"); + return -1; + } + + /* contents of this mapping is 
supposed to be dropped after C/R */ + memcpy(p1, data, sizeof(data)); + + /* contents of this mapping is supposed to be dropped after fork() */ + memcpy(p2, data, sizeof(data)); + + /* + * Let's spawn a process before C/R so our mappings get inherited + * then, after C/R we need to ensure that CRIU memory premapping + * machinery works properly. + * + * It is important, because we restore MADV_WIPEONFORK on a later + * stages (after vma premapping happens) and we need to ensure that + * CRIU handles everything in a right way. + */ + pid = test_fork(); + if (pid < 0) { + pr_perror("fork failed"); + return 1; + } + + if (pid == 0) { + test_waitsig(); + + /* + * Both mappings have VM_WIPEONFORK flag set, + * so we expect to have it null-ified after fork(). + */ + if (!mem_is_zero(p1, sizeof(data)) || + !mem_is_zero(p2, sizeof(data))) { + pr_err("1st child: memory check failed\n"); + return 1; + } + + return 0; + } + + /* + * A simple way to detect if C/R happened is to compare st_ino + * fields of stat() on the procfs files of the current task. + * + * Hopefully, this terrible hack is never used in real-world + * applications ;-) Here, we only need this to make test + * to pass with/without --nocr option. + */ + if (stat("/proc/self/status", &st1)) { + pr_perror("stat"); + return 1; + } + + test_daemon(); + test_waitsig(); + + /* signal a child process to continue */ + if (kill(pid, SIGTERM)) { + pr_perror("kill"); + goto err; + } + + if (waitpid(pid, &status, 0) != pid) { + pr_perror("1st waitpid"); + goto err; + } + + if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) { + fail("1st process didn't exit cleanly: status=%d", status); + goto err; + } + + if (stat("/proc/self/status", &st2)) { + pr_perror("stat"); + return 1; + } + + /* detect CRIU */ + criu_was_there = st1.st_ino != st2.st_ino; + + /* + * We should mark failure if one of the following happens: + * 1. MAP_DROPPABLE memory is not zero after C/R + * 2. 
MAP_DROPPABLE memory somehow changed without C/R + * (kernel issue? memory pressure?) + * 3. MADV_WIPEONFORK memory is not preserved + * + * We care about 2nd case only because we would like test + * to pass even with --nocr zdtm.py option. + */ + if ((criu_was_there && !mem_is_zero(p1, sizeof(data))) || + (!criu_was_there && memcmp(p1, data, sizeof(data))) || + memcmp(p2, data, sizeof(data))) { + fail("Data mismatch"); + return 1; + } + + /* contents of these mappings is supposed to be dropped after fork() */ + memcpy(p1, data, sizeof(data)); + memcpy(p2, data, sizeof(data)); + + pid = test_fork(); + if (pid < 0) { + pr_perror("fork failed"); + return 1; + } + + if (pid == 0) { + if (!mem_is_zero(p1, sizeof(data)) || + !mem_is_zero(p2, sizeof(data))) { + pr_err("2nd child: memory check failed\n"); + return 1; + } + + return 0; + } + + if (waitpid(pid, &status, 0) != pid) { + pr_perror("2nd waitpid"); + goto err; + } + + if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) { + fail("2nd process didn't exit cleanly: status=%d", status); + goto err; + } + + pass(); + + return 0; +err: + if (waitpid(-1, NULL, WNOHANG) == 0) { + kill(pid, SIGTERM); + wait(NULL); + } + return 1; + +skip: + test_daemon(); + test_waitsig(); + pass(); + return 0; +} From fd353fdd8eba6f5e094a3bd474edf9b39a58cca6 Mon Sep 17 00:00:00 2001 From: Prajwal S N Date: Mon, 14 Apr 2025 14:06:40 +0530 Subject: [PATCH 124/198] feat: introduce Nix flake CRIU currently requires a number of dependencies in order to build from source. The package names vary across distributions and package managers. A Nix flake allows developers to spin up a dev environment with `nix develop`, eliminating the hassle of manual dependency management. It also prevents polluting the global package set on the machine. 
Signed-off-by: Prajwal S N --- CONTRIBUTING.md | 2 +- flake.lock | 61 +++++++++++++++++++++++++++++++++++++++ flake.nix | 77 +++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 139 insertions(+), 1 deletion(-) create mode 100644 flake.lock create mode 100644 flake.nix diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 37965e5fba..712e7b8132 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -34,7 +34,7 @@ To clone CRIU repo and switch to the proper branch, run: ### Compile -First, you need to install compile-time dependencies. Check [Installation dependencies](https://criu.org/Installation#Dependencies) for more info. +First, you need to install compile-time dependencies. Check [Installation dependencies](https://criu.org/Installation#Dependencies) for more info. Alternatively, you can use the Nix flake to set up a development environment by running `nix develop`. To compile CRIU, run: diff --git a/flake.lock b/flake.lock new file mode 100644 index 0000000000..90c914452b --- /dev/null +++ b/flake.lock @@ -0,0 +1,61 @@ +{ + "nodes": { + "flake-utils": { + "inputs": { + "systems": "systems" + }, + "locked": { + "lastModified": 1731533236, + "narHash": "sha256-l0KFg5HjrsfsO/JpG+r7fRrqm12kzFHyUHqHCVpMMbI=", + "owner": "numtide", + "repo": "flake-utils", + "rev": "11707dc2f618dd54ca8739b309ec4fc024de578b", + "type": "github" + }, + "original": { + "owner": "numtide", + "repo": "flake-utils", + "type": "github" + } + }, + "nixpkgs": { + "locked": { + "lastModified": 1744463964, + "narHash": "sha256-LWqduOgLHCFxiTNYi3Uj5Lgz0SR+Xhw3kr/3Xd0GPTM=", + "owner": "NixOS", + "repo": "nixpkgs", + "rev": "2631b0b7abcea6e640ce31cd78ea58910d31e650", + "type": "github" + }, + "original": { + "owner": "NixOS", + "ref": "nixos-unstable", + "repo": "nixpkgs", + "type": "github" + } + }, + "root": { + "inputs": { + "flake-utils": "flake-utils", + "nixpkgs": "nixpkgs" + } + }, + "systems": { + "locked": { + "lastModified": 1681028828, + "narHash": 
"sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=", + "owner": "nix-systems", + "repo": "default", + "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e", + "type": "github" + }, + "original": { + "owner": "nix-systems", + "repo": "default", + "type": "github" + } + } + }, + "root": "root", + "version": 7 +} diff --git a/flake.nix b/flake.nix new file mode 100644 index 0000000000..dc2429ffc9 --- /dev/null +++ b/flake.nix @@ -0,0 +1,77 @@ +{ + description = "CRIU development environment"; + + inputs = { + nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable"; + flake-utils.url = "github:numtide/flake-utils"; + }; + + outputs = { self, nixpkgs, flake-utils }: + flake-utils.lib.eachDefaultSystem (system: + let + pkgs = nixpkgs.legacyPackages.${system}; + + # Dependencies for CRIU + criuDeps = with pkgs; [ + # Compiler and build essentials + gcc + gnumake + pkg-config + + # Protocol Buffers + protobuf + protobufc + python3Packages.protobuf + + # Other required libraries + libuuid + libbsd + iproute2 + nftables + libcap + libnet + libnl + libaio + gnutls + libdrm + + # ZDTM + python3Packages.pyyaml + ]; + + # Multilib support for 32-bit compatibility + # criuDeps32bit = with pkgs; [ + # glibc.dev + # glibc + # gcc-unwrapped + # ]; + + devShell = pkgs.mkShell { + buildInputs = criuDeps; # ++ (if pkgs.stdenv.isx86_64 then criuDeps32bit else []); + + shellHook = '' + echo "CRIU development environment" + echo "==============================" + echo "" + echo "Useful commands:" + echo " make - Build CRIU" + echo " make test - Run tests (requires ZDTM dependencies)" + echo "" + ''; + + # Add proper flags for multilib support + # NIX_CFLAGS_COMPILE = pkgs.lib.optional pkgs.stdenv.isx86_64 "-m32"; + + # Make sure the shell can find headers for multilib + # PKG_CONFIG_PATH = pkgs.lib.makeSearchPath "lib/pkgconfig" criuDeps; + }; + in + { + # Export the development shell + devShells.default = devShell; + + # Build CRIU package as well + packages.default = pkgs.criu; + } + ); +} 
From 3cbb864ae062e847b0df406f9bb3ba929917d1fb Mon Sep 17 00:00:00 2001 From: Liana Koleva <43767763+lianakoleva@users.noreply.github.com> Date: Wed, 26 Mar 2025 17:41:51 +0000 Subject: [PATCH 125/198] crtools: simplify check for cpuinfo subcommands The cpuinfo command requires a "dump" or "check" subcommand. Thus, we replace `CR_CPUINFO` with `CR_CPUINFO_DUMP` and `CR_CPUINFO_CHECK`. This allows us to remove unnecessary subcommand check in `image_dir_mode()` and perform all parsing in `parse_criu_mode()`. With this change the check for validating the cpuinfo subcommand is now done only once with `CR_CPUINFO_DUMP` or `CR_CPUINFO_CHECK` enum. Signed-off-by: Liana Koleva <43767763+lianakoleva@users.noreply.github.com> Signed-off-by: Radostin Stoyanov --- criu/cr-service.c | 2 +- criu/crtools.c | 57 ++++++++++++++++++++------------------- criu/include/cr_options.h | 3 ++- 3 files changed, 32 insertions(+), 30 deletions(-) diff --git a/criu/cr-service.c b/criu/cr-service.c index b9d11ced22..d8c5967bc9 100644 --- a/criu/cr-service.c +++ b/criu/cr-service.c @@ -1261,7 +1261,7 @@ static int handle_cpuinfo(int sk, CriuReq *msg) if (pid == 0) { int ret = 1; - opts.mode = CR_CPUINFO; + opts.mode = (msg->type == CRIU_REQ_TYPE__CPUINFO_DUMP) ? 
CR_CPUINFO_DUMP : CR_CPUINFO_CHECK; if (setup_opts_from_req(sk, msg->opts)) goto cout; diff --git a/criu/crtools.c b/criu/crtools.c index 6f493850b9..4734c90f2f 100644 --- a/criu/crtools.c +++ b/criu/crtools.c @@ -54,19 +54,17 @@ void flush_early_log_to_stderr(void) flush_early_log_buffer(STDERR_FILENO); } -static int image_dir_mode(char *argv[], int optind) +static int image_dir_mode(void) { switch (opts.mode) { case CR_DUMP: /* fallthrough */ + case CR_CPUINFO_DUMP: + /* fallthrough */ case CR_PRE_DUMP: return O_DUMP; case CR_RESTORE: return O_RSTR; - case CR_CPUINFO: - if (!strcmp(argv[optind + 1], "dump")) - return O_DUMP; - /* fallthrough */ default: return -1; } @@ -76,7 +74,7 @@ static int image_dir_mode(char *argv[], int optind) return -1; } -static int parse_criu_mode(char *mode) +static int parse_criu_mode(char *mode, char *subcommand) { if (!strcmp(mode, "dump")) opts.mode = CR_DUMP; @@ -96,8 +94,12 @@ static int parse_criu_mode(char *mode) opts.mode = CR_SWRK; else if (!strcmp(mode, "dedup")) opts.mode = CR_DEDUP; - else if (!strcmp(mode, "cpuinfo")) - opts.mode = CR_CPUINFO; + else if (!strcmp(mode, "cpuinfo") && subcommand == NULL) + return -2; + else if (!strcmp(mode, "cpuinfo") && !strcmp(subcommand, "dump")) + opts.mode = CR_CPUINFO_DUMP; + else if (!strcmp(mode, "cpuinfo") && !strcmp(subcommand, "check")) + opts.mode = CR_CPUINFO_CHECK; else if (!strcmp(mode, "exec")) opts.mode = CR_EXEC_DEPRECATED; else if (!strcmp(mode, "show")) @@ -115,6 +117,7 @@ int main(int argc, char *argv[], char *envp[]) bool has_exec_cmd = false; bool has_sub_command; int state = PARSING_GLOBAL_CONF; + char *subcommand; BUILD_BUG_ON(CTL_32 != SYSCTL_TYPE__CTL_32); BUILD_BUG_ON(__CTL_STR != SYSCTL_TYPE__CTL_STR); @@ -165,9 +168,15 @@ int main(int argc, char *argv[], char *envp[]) return 1; } - if (parse_criu_mode(argv[optind])) { + has_sub_command = (argc - optind) > 1; + subcommand = has_sub_command ? 
argv[optind + 1] : NULL; + ret = parse_criu_mode(argv[optind], subcommand); + if (ret == -1) { pr_err("unknown command: %s\n", argv[optind]); goto usage; + } else if (ret == -2) { + pr_err("cpuinfo requires an action: dump or check\n"); + goto usage; } /* * util_init initializes criu_run_id and compel_run_id so that sockets @@ -223,25 +232,20 @@ int main(int argc, char *argv[], char *envp[]) return 1; memcpy(opts.exec_cmd, &argv[optind + 1], (argc - optind - 1) * sizeof(char *)); opts.exec_cmd[argc - optind - 1] = NULL; - } else { + } else if (opts.mode != CR_CPUINFO_DUMP && opts.mode != CR_CPUINFO_CHECK && has_sub_command) { /* No subcommands except for cpuinfo and restore --exec-cmd */ - if (opts.mode != CR_CPUINFO && has_sub_command) { - pr_err("excessive parameter%s for command %s\n", (argc - optind) > 2 ? "s" : "", argv[optind]); - goto usage; - } else if (opts.mode == CR_CPUINFO && !has_sub_command) { - pr_err("cpuinfo requires an action: dump or check\n"); - goto usage; - } + pr_err("excessive parameter%s for command %s\n", (argc - optind) > 2 ? 
"s" : "", argv[optind]); + goto usage; } - if (opts.stream && image_dir_mode(argv, optind) == -1) { + if (opts.stream && image_dir_mode() == -1) { pr_err("--stream cannot be used with the %s command\n", argv[optind]); goto usage; } /* We must not open imgs dir, if service is called */ if (opts.mode != CR_SERVICE) { - ret = open_image_dir(opts.imgs_dir, image_dir_mode(argv, optind)); + ret = open_image_dir(opts.imgs_dir, image_dir_mode()); if (ret < 0) { pr_err("Couldn't open image dir %s\n", opts.imgs_dir); return 1; @@ -335,15 +339,12 @@ int main(int argc, char *argv[], char *envp[]) if (opts.mode == CR_DEDUP) return cr_dedup() != 0; - if (opts.mode == CR_CPUINFO) { - if (!argv[optind + 1]) { - pr_err("cpuinfo requires an action: dump or check\n"); - goto usage; - } - if (!strcmp(argv[optind + 1], "dump")) - return cpuinfo_dump(); - else if (!strcmp(argv[optind + 1], "check")) - return cpuinfo_check(); + if (opts.mode == CR_CPUINFO_DUMP) { + return cpuinfo_dump(); + } + + if (opts.mode == CR_CPUINFO_CHECK) { + return cpuinfo_check(); } if (opts.mode == CR_EXEC_DEPRECATED) { diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h index ab0bd8fa36..4df8056b7b 100644 --- a/criu/include/cr_options.h +++ b/criu/include/cr_options.h @@ -125,7 +125,8 @@ enum criu_mode { CR_SERVICE, CR_SWRK, CR_DEDUP, - CR_CPUINFO, + CR_CPUINFO_DUMP, + CR_CPUINFO_CHECK, CR_EXEC_DEPRECATED, CR_SHOW_DEPRECATED, }; From 6eb5bb06bc1f647f2453bacee1531acd070a66ad Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 20 May 2025 14:47:55 +0000 Subject: [PATCH 126/198] crtools: do a few minor cleanups Signed-off-by: Andrei Vagin --- criu/crtools.c | 140 +++++++++++++++++++++++++------------------------ 1 file changed, 72 insertions(+), 68 deletions(-) diff --git a/criu/crtools.c b/criu/crtools.c index 4734c90f2f..509e73d741 100644 --- a/criu/crtools.c +++ b/criu/crtools.c @@ -74,40 +74,55 @@ static int image_dir_mode(void) return -1; } -static int parse_criu_mode(char *mode, char 
*subcommand) +struct { + char *cmd; + int mode; +} commands[] = { + { "dump", CR_DUMP }, + { "pre-dump", CR_PRE_DUMP }, + { "restore", CR_RESTORE }, + { "lazy-pages", CR_LAZY_PAGES }, + { "check", CR_CHECK }, + { "page-server", CR_PAGE_SERVER }, + { "service", CR_SERVICE }, + { "swrk", CR_SWRK }, + { "dedup", CR_DEDUP }, + { "exec", CR_EXEC_DEPRECATED }, + { "show", CR_SHOW_DEPRECATED }, +}; + +static int parse_criu_mode(int argc, char **argv, int *optind) { - if (!strcmp(mode, "dump")) - opts.mode = CR_DUMP; - else if (!strcmp(mode, "pre-dump")) - opts.mode = CR_PRE_DUMP; - else if (!strcmp(mode, "restore")) - opts.mode = CR_RESTORE; - else if (!strcmp(mode, "lazy-pages")) - opts.mode = CR_LAZY_PAGES; - else if (!strcmp(mode, "check")) - opts.mode = CR_CHECK; - else if (!strcmp(mode, "page-server")) - opts.mode = CR_PAGE_SERVER; - else if (!strcmp(mode, "service")) - opts.mode = CR_SERVICE; - else if (!strcmp(mode, "swrk")) - opts.mode = CR_SWRK; - else if (!strcmp(mode, "dedup")) - opts.mode = CR_DEDUP; - else if (!strcmp(mode, "cpuinfo") && subcommand == NULL) - return -2; - else if (!strcmp(mode, "cpuinfo") && !strcmp(subcommand, "dump")) - opts.mode = CR_CPUINFO_DUMP; - else if (!strcmp(mode, "cpuinfo") && !strcmp(subcommand, "check")) - opts.mode = CR_CPUINFO_CHECK; - else if (!strcmp(mode, "exec")) - opts.mode = CR_EXEC_DEPRECATED; - else if (!strcmp(mode, "show")) - opts.mode = CR_SHOW_DEPRECATED; - else - return -1; + char *cmd = argv[*optind]; + bool has_sub_command = (argc - *optind) > 1; + char *subcommand = has_sub_command ? 
argv[*optind + 1] : NULL; + int i; + + for (i = 0; i < ARRAY_SIZE(commands); i++) { + if (strcmp(cmd, commands[i].cmd)) + continue; + opts.mode = commands[i].mode; + return 0; + } - return 0; + if (!strcmp(cmd, "cpuinfo")) { + if (subcommand == NULL) { + pr_err("cpuinfo requires an action: dump or check\n"); + return -1; + } + if (!strcmp(subcommand, "dump")) + opts.mode = CR_CPUINFO_DUMP; + else if (!strcmp(subcommand, "check")) + opts.mode = CR_CPUINFO_CHECK; + else { + pr_err("unknown cpuinfo sub-command: %s\n", subcommand); + return -1; + } + (*optind)++; + return 0; + } + pr_err("unknown command: %s\n", argv[*optind]); + return -1; } int main(int argc, char *argv[], char *envp[]) @@ -117,7 +132,7 @@ int main(int argc, char *argv[], char *envp[]) bool has_exec_cmd = false; bool has_sub_command; int state = PARSING_GLOBAL_CONF; - char *subcommand; + char *cmd; BUILD_BUG_ON(CTL_32 != SYSCTL_TYPE__CTL_32); BUILD_BUG_ON(__CTL_STR != SYSCTL_TYPE__CTL_STR); @@ -168,16 +183,11 @@ int main(int argc, char *argv[], char *envp[]) return 1; } - has_sub_command = (argc - optind) > 1; - subcommand = has_sub_command ? 
argv[optind + 1] : NULL; - ret = parse_criu_mode(argv[optind], subcommand); - if (ret == -1) { - pr_err("unknown command: %s\n", argv[optind]); - goto usage; - } else if (ret == -2) { - pr_err("cpuinfo requires an action: dump or check\n"); + cmd = argv[optind]; + ret = parse_criu_mode(argc, argv, &optind); + if (ret) goto usage; - } + /* * util_init initializes criu_run_id and compel_run_id so that sockets * are generated with an unique name identifying the specific process @@ -232,14 +242,13 @@ int main(int argc, char *argv[], char *envp[]) return 1; memcpy(opts.exec_cmd, &argv[optind + 1], (argc - optind - 1) * sizeof(char *)); opts.exec_cmd[argc - optind - 1] = NULL; - } else if (opts.mode != CR_CPUINFO_DUMP && opts.mode != CR_CPUINFO_CHECK && has_sub_command) { - /* No subcommands except for cpuinfo and restore --exec-cmd */ - pr_err("excessive parameter%s for command %s\n", (argc - optind) > 2 ? "s" : "", argv[optind]); + } else if (has_sub_command) { + pr_err("excessive parameter%s for command %s\n", (argc - optind) > 2 ? 
"s" : "", cmd); goto usage; } if (opts.stream && image_dir_mode() == -1) { - pr_err("--stream cannot be used with the %s command\n", argv[optind]); + pr_err("--stream cannot be used with the %s command\n", cmd); goto usage; } @@ -290,14 +299,13 @@ int main(int argc, char *argv[], char *envp[]) if (opts.img_parent) pr_info("Will do snapshot from %s\n", opts.img_parent); - if (opts.mode == CR_DUMP) { + switch (opts.mode) { + case CR_DUMP: if (!opts.tree_id) goto opt_pid_missing; return cr_dump_tasks(opts.tree_id); - } - - if (opts.mode == CR_PRE_DUMP) { + case CR_PRE_DUMP: if (!opts.tree_id) goto opt_pid_missing; @@ -307,9 +315,7 @@ int main(int argc, char *argv[], char *envp[]) } return cr_pre_dump_tasks(opts.tree_id) != 0; - } - - if (opts.mode == CR_RESTORE) { + case CR_RESTORE: if (opts.tree_id) pr_warn("Using -t with criu restore is obsoleted\n"); @@ -322,43 +328,41 @@ int main(int argc, char *argv[], char *envp[]) } return ret != 0; - } - if (opts.mode == CR_LAZY_PAGES) + case CR_LAZY_PAGES: return cr_lazy_pages(opts.daemon_mode) != 0; - if (opts.mode == CR_CHECK) + case CR_CHECK: return cr_check() != 0; - if (opts.mode == CR_PAGE_SERVER) + case CR_PAGE_SERVER: return cr_page_server(opts.daemon_mode, false, -1) != 0; - if (opts.mode == CR_SERVICE) + case CR_SERVICE: return cr_service(opts.daemon_mode); - if (opts.mode == CR_DEDUP) + case CR_DEDUP: return cr_dedup() != 0; - if (opts.mode == CR_CPUINFO_DUMP) { + case CR_CPUINFO_DUMP: return cpuinfo_dump(); - } - if (opts.mode == CR_CPUINFO_CHECK) { + case CR_CPUINFO_CHECK: return cpuinfo_check(); - } - if (opts.mode == CR_EXEC_DEPRECATED) { + case CR_EXEC_DEPRECATED: pr_err("The \"exec\" action is deprecated by the Compel library.\n"); return -1; - } - if (opts.mode == CR_SHOW_DEPRECATED) { + case CR_SHOW_DEPRECATED: pr_err("The \"show\" action is deprecated by the CRIT utility.\n"); pr_err("To view an image use the \"crit decode -i $name --pretty\" command.\n"); return -1; - } - pr_err("unknown command: %s\n", 
argv[optind]); + case CR_UNSET: + default: + pr_err("unknown command: %s\n", cmd); + } usage: pr_msg("\n" "Usage:\n" From 9622b9045c5c77aa8a6c770c7d7601961fb54acf Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Fri, 23 May 2025 08:33:20 +0100 Subject: [PATCH 127/198] cpuinfo: show error when image is missing The `criu cpuinfo check` command calls cpu_validate_cpuinfo(), which attempts to open the cpuinfo.img file using `open_image()`. If the image file is not found, `open_image()` returns an "empty image" object. As a result, `cpu_validate_cpuinfo()` tries to read from it and fails with the following error: (00.002473) Error (criu/protobuf.c:72): Unexpected EOF on (empty-image) This patch adds a check for an empty image and appropriate error message. Signed-off-by: Radostin Stoyanov --- criu/arch/ppc64/cpu.c | 6 ++++++ criu/arch/s390/cpu.c | 6 ++++++ criu/arch/x86/cpu.c | 6 ++++++ 3 files changed, 18 insertions(+) diff --git a/criu/arch/ppc64/cpu.c b/criu/arch/ppc64/cpu.c index bb5b7256e2..b87230f40a 100644 --- a/criu/arch/ppc64/cpu.c +++ b/criu/arch/ppc64/cpu.c @@ -64,6 +64,12 @@ int cpu_validate_cpuinfo(void) if (!img) return -1; + if (empty_image(img)) { + pr_err("No cpuinfo image\n"); + close_image(img); + return -1; + } + if (pb_read_one(img, &cpu_info, PB_CPUINFO) < 0) goto error; diff --git a/criu/arch/s390/cpu.c b/criu/arch/s390/cpu.c index 3f430f4550..e227fad5e1 100644 --- a/criu/arch/s390/cpu.c +++ b/criu/arch/s390/cpu.c @@ -87,6 +87,12 @@ int cpu_validate_cpuinfo(void) if (!img) return -1; + if (empty_image(img)) { + pr_err("No cpuinfo image\n"); + close_image(img); + return -1; + } + ret = 0; if (pb_read_one(img, &cpu_info, PB_CPUINFO) < 0) goto error; diff --git a/criu/arch/x86/cpu.c b/criu/arch/x86/cpu.c index dfa31569fa..2e1f2de9ad 100644 --- a/criu/arch/x86/cpu.c +++ b/criu/arch/x86/cpu.c @@ -407,6 +407,12 @@ int cpu_validate_cpuinfo(void) if (!img) return -1; + if (empty_image(img)) { + pr_err("No cpuinfo image\n"); + close_image(img); + 
return -1; + } + if (pb_read_one(img, &img_cpu_info, PB_CPUINFO) < 0) goto err; From ca32bfb611c4584cfb298d6412f29fd512a68f4b Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Sun, 8 Jun 2025 17:19:52 -0700 Subject: [PATCH 128/198] test/zdtm: remove unused compiler argument Fixes a clang compile-time error: "argument unused during compilation: '-c'". Signed-off-by: Andrei Vagin --- test/zdtm/Makefile.inc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/zdtm/Makefile.inc b/test/zdtm/Makefile.inc index 24f32c6068..c19888da31 100644 --- a/test/zdtm/Makefile.inc +++ b/test/zdtm/Makefile.inc @@ -76,7 +76,7 @@ endef %.d: %.c $(E) " DEP " $@ - $(Q)$(CC) $(CFLAGS) $(CPPFLAGS) -MM -MP -c $< -o $@ + $(Q)$(CC) $(CFLAGS) $(CPPFLAGS) -MM -MP $< -o $@ %.o: %.c | %.d $(E) " CC " $@ From 84991f1717cb4619aef2831cfa7ea4f92092e111 Mon Sep 17 00:00:00 2001 From: Ivan Pravdin Date: Tue, 6 May 2025 22:40:25 -0400 Subject: [PATCH 129/198] rpc/log: return first error always Use shared first error buffer to return correct first error in rpc. 
Fixes: #338 Signed-off-by: Ivan Pravdin --- criu/cr-service.c | 24 +++++++++++++++++++++++- criu/log.c | 4 ++++ test/others/rpc/errno.py | 22 +++++++++++++++++++++- 3 files changed, 48 insertions(+), 2 deletions(-) diff --git a/criu/cr-service.c b/criu/cr-service.c index d8c5967bc9..a1089ad5c7 100644 --- a/criu/cr-service.c +++ b/criu/cr-service.c @@ -895,6 +895,11 @@ static int check(int sk, CriuOpts *req) resp.type = CRIU_REQ_TYPE__CHECK; + if (log_keep_err()) { + pr_perror("Can't tune log"); + goto out; + } + pid = fork(); if (pid < 0) { pr_perror("Can't fork"); @@ -919,6 +924,7 @@ static int check(int sk, CriuOpts *req) resp.success = true; out: + set_resp_err(&resp); return send_criu_msg(sk, &resp); } @@ -927,6 +933,11 @@ static int pre_dump_using_req(int sk, CriuOpts *req, bool single) int pid, status; bool success = false; + if (log_keep_err()) { + pr_perror("Can't tune log"); + goto out; + } + pid = fork(); if (pid < 0) { pr_perror("Can't fork"); @@ -1005,6 +1016,11 @@ static int start_page_server_req(int sk, CriuOpts *req, bool daemon_mode) CriuPageServerInfo ps = CRIU_PAGE_SERVER_INFO__INIT; struct ps_info info; + if (log_keep_err()) { + pr_perror("Can't tune log"); + goto out; + } + if (pipe(start_pipe)) { pr_perror("No start pipe"); goto out; @@ -1078,6 +1094,7 @@ static int start_page_server_req(int sk, CriuOpts *req, bool daemon_mode) out: resp.type = CRIU_REQ_TYPE__PAGE_SERVER; resp.success = success; + set_resp_err(&resp); return send_criu_msg(sk, &resp); } @@ -1252,6 +1269,11 @@ static int handle_cpuinfo(int sk, CriuReq *msg) bool success = false; int pid, status; + if (log_keep_err()) { + pr_perror("Can't tune log"); + goto out; + } + pid = fork(); if (pid < 0) { pr_perror("Can't fork"); @@ -1301,7 +1323,7 @@ static int handle_cpuinfo(int sk, CriuReq *msg) out: resp.type = msg->type; resp.success = success; - + set_resp_err(&resp); return send_criu_msg(sk, &resp); } diff --git a/criu/log.c b/criu/log.c index 70e267fd65..a02a8df204 100644 --- 
a/criu/log.c +++ b/criu/log.c @@ -10,6 +10,7 @@ #include #include #include +#include #include @@ -114,6 +115,9 @@ static struct str_and_lock *first_err; int log_keep_err(void) { + if (first_err) + return 0; + first_err = shmalloc(sizeof(struct str_and_lock)); if (first_err == NULL) return -1; diff --git a/test/others/rpc/errno.py b/test/others/rpc/errno.py index 4ea6c9d441..a5a3eb54dc 100755 --- a/test/others/rpc/errno.py +++ b/test/others/rpc/errno.py @@ -40,7 +40,7 @@ def recv_resp(self): resp.ParseFromString(self.s.recv(self._MAX_MSG_SIZE)) return resp - def check_resp(self, resp, typ, err): + def check_resp(self, resp, typ, err, errmsg = None): if resp.type != typ: raise Exception('Unexpected response type ' + str(resp.type)) @@ -49,6 +49,9 @@ def check_resp(self, resp, typ, err): if err and resp.cr_errno != err: raise Exception('Unexpected cr_errno ' + str(resp.cr_errno)) + + if errmsg and errmsg not in resp.cr_errmsg: + raise Exception('Unexpected cr_msg \'' + str(resp.cr_errmsg) + '\'') def no_process(self): print('Try to dump unexisting process') @@ -131,12 +134,29 @@ def bad_request(self): self.check_resp(resp, rpc.EMPTY, None) print('Success') + + def child_first_err(self): + print('Receive correct first error message') + + req = self.get_base_req() + req.type = rpc.CHECK + + # mntns_compat_mode options is only allowed on restore + req.opts.mntns_compat_mode = True + + self.send_req(req) + resp = self.recv_resp() + + self.check_resp(resp, rpc.CHECK, None, "Option --mntns-compat-mode is only valid on restore\n") + + print('Success') def run(self): self.no_process() self.process_exists() self.bad_options() self.bad_request() + self.child_first_err() t = test() From 765f8221785615887b2ad94eb87bdc6cfc3df03a Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Sun, 8 Jun 2025 13:24:11 +0800 Subject: [PATCH 130/198] ipc/sysctl: fix CTL_FLAGS_IPC_EACCES_SKIP by making it a flag Having CTL_FLAGS_IPC_EACCES_SKIP == (CTL_FLAGS_OPTIONAL | CTL_FLAGS_READ_EIO_SKIP) 
is probably not what we want. So let's make it a real distinct flag. Fixes: 840735aa0 ("ipc_sysctl: Prioritize restoring IPC variables using non usernsd approach") Signed-off-by: Pavel Tikhomirov --- criu/include/sysctl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/criu/include/sysctl.h b/criu/include/sysctl.h index cb3eba8174..2d689a9a04 100644 --- a/criu/include/sysctl.h +++ b/criu/include/sysctl.h @@ -37,6 +37,6 @@ enum { #define CTL_FLAGS_OPTIONAL 1 #define CTL_FLAGS_HAS 2 #define CTL_FLAGS_READ_EIO_SKIP 4 -#define CTL_FLAGS_IPC_EACCES_SKIP 5 +#define CTL_FLAGS_IPC_EACCES_SKIP 8 #endif /* __CR_SYSCTL_H__ */ From d83f5b458a0207408471aae8532042c93e8608ea Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Sun, 8 Jun 2025 13:34:19 +0800 Subject: [PATCH 131/198] net/sysctl: fix misprint in an error message Fixes: f38e58836 ("net/sysctl: c/r ipv4/ping_group_range value") Signed-off-by: Pavel Tikhomirov --- criu/net.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/criu/net.c b/criu/net.c index 300df480b0..e5d2f1c4d1 100644 --- a/criu/net.c +++ b/criu/net.c @@ -2147,7 +2147,7 @@ static int ipv4_sysctls_op(SysctlEntry ***rsysctl, size_t *pn, int op) size_t n = *pn; if (n != ARRAY_SIZE(ipv4_sysctl_entries)) { - pr_err("unix: Unexpected entries in sysctlig (%zu %zu)\n", n, ARRAY_SIZE(ipv4_sysctl_entries)); + pr_err("unix: Unexpected entries in sysctl (%zu %zu)\n", n, ARRAY_SIZE(ipv4_sysctl_entries)); return -EINVAL; } From 0f94256bf9d28d57c8fca213d5d20394b93339ee Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Tue, 10 Jun 2025 11:33:59 +0800 Subject: [PATCH 132/198] net/sysctl: fix broken ipv4_sysctls_op We have ability to skip sysctl if there is no value, but we still give n requests to sysctl_op, that is not correct and probably can segfault on nullptr access. Fix it by adding ri to count non skipped requests. To be on the safe side, let's add a check that ri == n on read, as we should not do any skips there. 
While on it lets fix bad error message prefix: s/unix/ipv4/. Remove excess has_iarg set, and add sarg reset to NULL for the case sysctl_op skipped it. Signed-off-by: Andrei Vagin Signed-off-by: Pavel Tikhomirov --- criu/net.c | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/criu/net.c b/criu/net.c index e5d2f1c4d1..2c018ef7bb 100644 --- a/criu/net.c +++ b/criu/net.c @@ -2144,51 +2144,53 @@ static int ipv4_sysctls_op(SysctlEntry ***rsysctl, size_t *pn, int op) char path[ARRAY_SIZE(ipv4_sysctl_entries)][MAX_IPV4_SYSCTL_PATH] = {}; struct sysctl_req req[ARRAY_SIZE(ipv4_sysctl_entries)] = {}; SysctlEntry **sysctl = *rsysctl; - size_t n = *pn; + size_t n = *pn, ri; if (n != ARRAY_SIZE(ipv4_sysctl_entries)) { - pr_err("unix: Unexpected entries in sysctl (%zu %zu)\n", n, ARRAY_SIZE(ipv4_sysctl_entries)); + pr_err("ipv4: Unexpected entries in sysctl (%zu %zu)\n", n, ARRAY_SIZE(ipv4_sysctl_entries)); return -EINVAL; } if (opts.weak_sysctls || op == CTL_READ) flags = CTL_FLAGS_OPTIONAL; - for (i = 0; i < n; i++) { - snprintf(path[i], MAX_IPV4_SYSCTL_PATH, IPV4_SYSCTL_FMT, ipv4_sysctl_entries[i]); - req[i].name = path[i]; - req[i].flags = flags; + for (i = 0, ri = 0; i < n; i++) { + snprintf(path[ri], MAX_IPV4_SYSCTL_PATH, IPV4_SYSCTL_FMT, ipv4_sysctl_entries[i]); + req[ri].name = path[ri]; + req[ri].flags = flags; switch (sysctl[i]->type) { case SYSCTL_TYPE__CTL_STR: - req[i].type = CTL_STR(MAX_STR_IPV4_SYSCTL_LEN); + req[ri].type = CTL_STR(MAX_STR_IPV4_SYSCTL_LEN); /* skip write if have no value */ if (op == CTL_WRITE && !sysctl[i]->sarg) continue; - req[i].arg = sysctl[i]->sarg; + req[ri].arg = sysctl[i]->sarg; break; default: pr_err("ipv4: Unknown sysctl type %d\n", sysctl[i]->type); return -1; } + ri++; } - ret = sysctl_op(req, n, op, CLONE_NEWNET); + ret = sysctl_op(req, ri, op, CLONE_NEWNET); if (ret < 0) { - pr_err("unix: Failed to %s %s/\n", (op == CTL_READ) ? 
"read" : "write", IPV4_SYSCTL_BASE); + pr_err("ipv4: Failed to %s %s/\n", (op == CTL_READ) ? "read" : "write", IPV4_SYSCTL_BASE); return -1; } if (op == CTL_READ) { bool has_entries = false; + BUG_ON(ri != n); for (i = 0; i < n; i++) { if (req[i].flags & CTL_FLAGS_HAS) { - sysctl[i]->has_iarg = true; - if (!has_entries) - has_entries = true; + has_entries = true; + } else { + sysctl[i]->sarg = NULL; } } From c3bc8cc5b2016881375ffb3b13dc839a5d635307 Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Sun, 8 Jun 2025 14:07:13 +0800 Subject: [PATCH 133/198] net/sysctl: make ipv4/ping_group_range work in user namespaces We dump sysctls from criu user namespace, but restore from restored user namespace. So group id values should be mapped to the restored user namespace gid space to restore correctly. Signed-off-by: Andrei Vagin Signed-off-by: Pavel Tikhomirov --- criu/net.c | 44 ++++++++++++++++++++++++++ test/zdtm/static/netns_sub_sysctl.desc | 2 +- 2 files changed, 45 insertions(+), 1 deletion(-) diff --git a/criu/net.c b/criu/net.c index 2c018ef7bb..e5775a3287 100644 --- a/criu/net.c +++ b/criu/net.c @@ -2203,6 +2203,42 @@ static int ipv4_sysctls_op(SysctlEntry ***rsysctl, size_t *pn, int op) return 0; } +static int ipv4_sysctls_ping_group_range_map_gid(SysctlEntry *ent, size_t size) +{ + int start, end, ustart, uend, ret; + + if (sscanf(ent->sarg, "%d %d", &start, &end) != 2) { + pr_err("Failed to parse ping_group_range: %s\n", ent->sarg); + return -1; + } + + /* + * The default is "1 0", which means no group + * is allowed to create ICMP Echo sockets. 
+ */ + if (start == 1 && end == 0) { + pr_debug("The ping_group_range is set to default, skipping it.\n"); + ent->sarg = NULL; + return 0; + } + + if (!(root_ns_mask & CLONE_NEWUSER)) + return 0; + + ustart = userns_gid(start); + uend = userns_gid(end); + pr_debug("Mapping ping_group_range %d %d to userns -> %d %d\n", + start, end, ustart, uend); + + ret = snprintf(ent->sarg, size, "%d\t%d\n", ustart, uend); + if (ret < 0 || ret >= size) { + pr_err("Failed to map ping_group_range: %d\t%d\n", ustart, uend); + return -1; + } + + return 0; +} + static int dump_netns_conf(struct ns_id *ns, struct cr_imgset *fds) { void *buf, *o_buf; @@ -2220,6 +2256,7 @@ static int dump_netns_conf(struct ns_id *ns, struct cr_imgset *fds) SysctlEntry *ipv4_sysctls = NULL; size_t ipv4_sysctl_size = ARRAY_SIZE(ipv4_sysctl_entries); char ping_group_range[MAX_STR_IPV4_SYSCTL_LEN + 1] = {}; + int ping_group_range_id = -1; NetnsId *ids; struct netns_id *p; @@ -2310,6 +2347,7 @@ static int dump_netns_conf(struct ns_id *ns, struct cr_imgset *fds) if (!strcmp(ipv4_sysctl_entries[i], "ping_group_range")) { netns.ipv4_sysctl[i]->type = SYSCTL_TYPE__CTL_STR; netns.ipv4_sysctl[i]->sarg = ping_group_range; + ping_group_range_id = i; } else { /* Need to handle this case when we have more sysctls */ BUG(); @@ -2338,6 +2376,12 @@ static int dump_netns_conf(struct ns_id *ns, struct cr_imgset *fds) if (ret < 0) goto err_free; + BUG_ON(ping_group_range_id == -1); + ret = ipv4_sysctls_ping_group_range_map_gid(netns.ipv4_sysctl[ping_group_range_id], + MAX_STR_IPV4_SYSCTL_LEN + 1); + if (ret < 0) + goto err_free; + ret = pb_write_one(img_from_set(fds, CR_FD_NETNS), &netns, PB_NETNS); err_free: xfree(o_buf); diff --git a/test/zdtm/static/netns_sub_sysctl.desc b/test/zdtm/static/netns_sub_sysctl.desc index 5358426683..0c357aefe4 100644 --- a/test/zdtm/static/netns_sub_sysctl.desc +++ b/test/zdtm/static/netns_sub_sysctl.desc @@ -1,4 +1,4 @@ { - 'flavor': 'ns', + 'flavor': 'ns uns', 'flags': 'suid' } From 
80dcaf1e5c3691f2cad02a8200aa8ba048857cb4 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Mon, 9 Jun 2025 21:17:57 -0700 Subject: [PATCH 134/198] zdtm/netns_sub_sysctl: skip unsupported sysctls net/unix/max_dgram_qlen can't be tuned from non-root userns before: v5.17-rc1~170^2~215 ("net: Enable max_dgram_qlen unix sysctl to be configurable by non-init user namespaces") Signed-off-by: Andrei Vagin --- test/zdtm/static/netns_sub_sysctl.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/test/zdtm/static/netns_sub_sysctl.c b/test/zdtm/static/netns_sub_sysctl.c index 0f94c40a79..03b478b7d7 100644 --- a/test/zdtm/static/netns_sub_sysctl.c +++ b/test/zdtm/static/netns_sub_sysctl.c @@ -1,4 +1,6 @@ #include +#include +#include #include "zdtmtst.h" #include "sysctl.h" @@ -20,6 +22,7 @@ typedef struct { int new; char s_old[MAX_STR_SYSCTL_LEN]; char s_new[MAX_STR_SYSCTL_LEN]; + bool set; } sysctl_opt_t; #define CONF_UNIX_BASE "/proc/sys/net/unix" @@ -38,6 +41,11 @@ int main(int argc, char **argv) test_init(argc, argv); for (p = net_unix_params; p->path != NULL; p++) { + if (access(p->path, W_OK) != 0) { + test_msg("%s doesn't exist\n", p->path); + continue; + } + p->set = true; if (p->type == SYSCTL_INT) { p->old = (((unsigned)lrand48()) % 1023) + 1; if (sysctl_write_int(p->path, p->old)) { @@ -56,6 +64,8 @@ int main(int argc, char **argv) test_waitsig(); for (p = net_unix_params; p->path != NULL; p++) { + if (!p->set) + continue; if (p->type == SYSCTL_INT) { if (sysctl_read_int(p->path, &p->new)) ret = 1; From 52fd3d6eeabd04ea10872c52b517e0dae0158da1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E0=A4=B8=E0=A4=AE=E0=A5=80=E0=A4=B0=20=E0=A4=B8=E0=A4=BF?= =?UTF-8?q?=E0=A4=82=E0=A4=B9=20Sameer=20Singh?= Date: Fri, 27 Dec 2024 03:47:35 +0530 Subject: [PATCH 135/198] sk-inet: Add support for checkpoint/restore of ICMP sockets MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently there is no option to checkpoint/restore 
programs that use ICMP sockets, such as `ping`. This patch adds support for the same. Fixes #2557 Signed-off-by: समीर सिंह Sameer Singh --- criu/sk-inet.c | 7 +++++-- criu/sockets.c | 34 ++++++++++++++++++++++++++++++++-- 2 files changed, 37 insertions(+), 4 deletions(-) diff --git a/criu/sk-inet.c b/criu/sk-inet.c index 1238b03dc5..6e0acf2ce3 100644 --- a/criu/sk-inet.c +++ b/criu/sk-inet.c @@ -130,6 +130,8 @@ static int can_dump_ipproto(unsigned int ino, int proto, int type) case IPPROTO_TCP: case IPPROTO_UDP: case IPPROTO_UDPLITE: + case IPPROTO_ICMP: + case IPPROTO_ICMPV6: break; default: pr_err("Unsupported proto %d for socket %x\n", proto, ino); @@ -922,8 +924,9 @@ static int open_inet_sk(struct file_desc *d, int *new_fd) } if (ie->src_port) { - if (inet_bind(sk, ii)) - goto err; + if (ie->proto != IPPROTO_ICMP && ie->proto != IPPROTO_ICMPV6) + if (inet_bind(sk, ii)) + goto err; } /* diff --git a/criu/sockets.c b/criu/sockets.c index f9ce999bed..0affccad02 100644 --- a/criu/sockets.c +++ b/criu/sockets.c @@ -65,7 +65,7 @@ const char *socket_proto_name(unsigned int proto, char *nm, size_t size) [IPPROTO_IPV6] = __stringify_1(IPPROTO_IPV6), [IPPROTO_RSVP] = __stringify_1(IPPROTO_RSVP), [IPPROTO_GRE] = __stringify_1(IPPROTO_GRE), [IPPROTO_ESP] = __stringify_1(IPPROTO_ESP), [IPPROTO_AH] = __stringify_1(IPPROTO_AH), [IPPROTO_UDPLITE] = __stringify_1(IPPROTO_UDPLITE), - [IPPROTO_RAW] = __stringify_1(IPPROTO_RAW), + [IPPROTO_RAW] = __stringify_1(IPPROTO_RAW), [IPPROTO_ICMPV6] = __stringify_1(IPPROTO_ICMPV6), }; return __socket_const_name(nm, size, protos, ARRAY_SIZE(protos), proto); } @@ -131,10 +131,12 @@ enum socket_cl_bits { INET_UDP_CL_BIT, INET_UDPLITE_CL_BIT, INET_RAW_CL_BIT, + INET_ICMP_CL_BIT, INET6_TCP_CL_BIT, INET6_UDP_CL_BIT, INET6_UDPLITE_CL_BIT, INET6_RAW_CL_BIT, + INET6_ICMP_CL_BIT, UNIX_CL_BIT, PACKET_CL_BIT, _MAX_CL_BIT, @@ -161,6 +163,8 @@ static inline enum socket_cl_bits get_collect_bit_nr(unsigned int family, unsign return INET_UDPLITE_CL_BIT; if 
(proto == IPPROTO_RAW) return INET_RAW_CL_BIT; + if (proto == IPPROTO_ICMP) + return INET_ICMP_CL_BIT; } if (family == AF_INET6) { if (proto == IPPROTO_TCP) @@ -171,6 +175,8 @@ static inline enum socket_cl_bits get_collect_bit_nr(unsigned int family, unsign return INET6_UDPLITE_CL_BIT; if (proto == IPPROTO_RAW) return INET6_RAW_CL_BIT; + if (proto == IPPROTO_ICMPV6) + return INET6_ICMP_CL_BIT; } pr_err("Unknown pair family %d proto %d\n", family, proto); @@ -282,6 +288,12 @@ void preload_socket_modules(void) req.r.i.sdiag_protocol = IPPROTO_RAW; probe_diag(nl, &req, -ENOENT); + req.r.i.sdiag_protocol = IPPROTO_ICMP; + probe_diag(nl, &req, -ENOENT); + + req.r.i.sdiag_protocol = IPPROTO_ICMPV6; + probe_diag(nl, &req, -ENOENT); + close(nl); pr_info("Done probing\n"); } @@ -773,6 +785,10 @@ static int inet_receive_one(struct nlmsghdr *h, struct ns_id *ns, void *arg) case IPPROTO_RAW: type = SOCK_RAW; break; + case IPPROTO_ICMP: + case IPPROTO_ICMPV6: + type = SOCK_DGRAM; + break; default: BUG_ON(1); return -1; @@ -797,7 +813,7 @@ static int collect_err(int err, struct ns_id *ns, void *arg) char family[32], proto[32]; char msg[256]; - snprintf(msg, sizeof(msg), "Sockects collect procedure family %s proto %s", + snprintf(msg, sizeof(msg), "Sockets collect procedure family %s proto %s", socket_family_name(gr->family, family, sizeof(family)), socket_proto_name(gr->protocol, proto, sizeof(proto))); @@ -905,6 +921,13 @@ int collect_sockets(struct ns_id *ns) if (tmp) err = tmp; + /* Collect IPv4 ICMP sockets */ + req.r.i.sdiag_family = AF_INET; + req.r.i.sdiag_protocol = IPPROTO_ICMP; + req.r.i.idiag_ext = 0; + req.r.i.idiag_states = -1; /* All */ + set_collect_bit(req.r.n.sdiag_family, req.r.n.sdiag_protocol); + /* Collect IPv6 TCP sockets */ req.r.i.sdiag_family = AF_INET6; req.r.i.sdiag_protocol = IPPROTO_TCP; @@ -944,6 +967,13 @@ int collect_sockets(struct ns_id *ns) if (tmp) err = tmp; + /* Collect IPv6 ICMP sockets */ + req.r.i.sdiag_family = AF_INET6; + 
req.r.i.sdiag_protocol = IPPROTO_ICMPV6; + req.r.i.idiag_ext = 0; + req.r.i.idiag_states = -1; /* All */ + set_collect_bit(req.r.n.sdiag_family, req.r.n.sdiag_protocol); + req.r.p.sdiag_family = AF_PACKET; req.r.p.sdiag_protocol = 0; req.r.p.pdiag_show = PACKET_SHOW_INFO | PACKET_SHOW_MCLIST | PACKET_SHOW_FANOUT | PACKET_SHOW_RING_CFG; From b24f6e25bb99211ddfe971bb7f47778928a2f737 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E0=A4=B8=E0=A4=AE=E0=A5=80=E0=A4=B0=20=E0=A4=B8=E0=A4=BF?= =?UTF-8?q?=E0=A4=82=E0=A4=B9=20Sameer=20Singh?= Date: Sat, 28 Dec 2024 09:35:11 +0530 Subject: [PATCH 136/198] test: add static tests for ICMP socket MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add ZDTM static tests for IP4/ICMP and IP6/ICMP socket feature. Signed-off-by: समीर सिंह Sameer Singh Signed-off-by: Andrei Vagin --- test/zdtm/static/Makefile | 3 + test/zdtm/static/socket6_icmp.c | 1 + test/zdtm/static/socket_icmp.c | 128 ++++++++++++++++++++++++++++++++ 3 files changed, 132 insertions(+) create mode 120000 test/zdtm/static/socket6_icmp.c create mode 100644 test/zdtm/static/socket_icmp.c diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index 34fc90513a..d427659e0e 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -37,6 +37,8 @@ TST_NOFILE := \ socket_udp-corked \ socket6_udp \ socket_udp_shutdown \ + socket_icmp \ + socket6_icmp \ sk-freebind \ sk-freebind-false \ socket_udplite \ @@ -630,6 +632,7 @@ socket-tcp6-closed: CFLAGS += -D ZDTM_IPV6 socket-tcp6-closed: CFLAGS += -D ZDTM_IPV4V6 socket-tcp-closed-last-ack: CFLAGS += -D ZDTM_TCP_LAST_ACK socket-tcp-skip-in-flight: CFLAGS += -D ZDTM_IPV4V6 +socket6-icmp: CFLAGS += -DZDTM_IPV6 sock_ip_opts01: CFLAGS += -DZDTM_VAL_ZERO sock_tcp_opts01: CFLAGS += -DZDTM_VAL_ZERO tun_ns: CFLAGS += -DTUN_NS diff --git a/test/zdtm/static/socket6_icmp.c b/test/zdtm/static/socket6_icmp.c new file mode 120000 index 0000000000..24d8fd8067 --- /dev/null +++ 
b/test/zdtm/static/socket6_icmp.c @@ -0,0 +1 @@ +socket_icmp.c \ No newline at end of file diff --git a/test/zdtm/static/socket_icmp.c b/test/zdtm/static/socket_icmp.c new file mode 100644 index 0000000000..f72e348bf4 --- /dev/null +++ b/test/zdtm/static/socket_icmp.c @@ -0,0 +1,128 @@ +#include "zdtmtst.h" + +const char *test_doc = "static test for ICMP socket\n"; +const char *test_author = "समीर सिंह Sameer Singh \n"; + +/* Description: + * Send a ping to localhost using ICMP socket + */ + +#include +#include +#include +#include +#if defined(ZDTM_IPV6) +#include +#else +#include +#endif +#include +#include +#include + +#include "sysctl.h" + +#define PACKET_SIZE 64 +#define RECV_TIMEOUT 1 + +static int echo_id = 1234; + +#if defined(ZDTM_IPV6) +#define TEST_ICMP_ECHOREPLY ICMP6_ECHOREPLY +#else +#define TEST_ICMP_ECHOREPLY ICMP_ECHOREPLY +#endif +int main(int argc, char **argv) +{ + int ret, sock, seq = 0; + char packet[PACKET_SIZE], recv_packet[PACKET_SIZE]; + + struct timeval tv; +#if defined(ZDTM_IPV6) + struct sockaddr_in6 addr, recv_addr; +#else + struct icmphdr icmp_header, *icmp_reply; +#endif + struct sockaddr_in addr, recv_addr; + socklen_t addr_len; + + // Allow GIDs 0-58468 to open an unprivileged ICMP socket + if (sysctl_write_str("/proc/sys/net/ipv4/ping_group_range", "0 58468")) + return -1; + + test_init(argc, argv); + +#if defined(ZDTM_IPV6) + sock = socket(PF_INET6, SOCK_DGRAM, IPPROTO_ICMPV6); +#else + sock = socket(PF_INET, SOCK_DGRAM, IPPROTO_ICMP); +#endif + if (sock < 0) { + pr_perror("Can't create socket"); + return 1; + } + + tv.tv_sec = RECV_TIMEOUT; + tv.tv_usec = 0; + if (setsockopt(sock, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)) < 0) { + pr_perror("Can't set socket option"); + return 1; + } + + memset(&addr, 0, sizeof(addr)); + memset(&icmp_header, 0, sizeof(icmp_header)); +#if defined(ZDTM_IPV6) + addr.sin6_family = AF_INET6; + inet_pton(AF_INET6, "::1", &addr.sin6_addr); + + icmp_header.icmp6_type = ICMP6_ECHO_REQUEST; + 
icmp_header.icmp6_code = 0; + icmp_header.icmp6_id = echo_id; + icmp_header.icmp6_seq = seq; +#else + addr.sin_family = AF_INET; + addr.sin_addr.s_addr = inet_addr("127.0.0.1"); + + icmp_header.type = ICMP_ECHO; + icmp_header.code = 0; + icmp_header.un.echo.id = echo_id; + icmp_header.un.echo.sequence = seq; +#endif + + memcpy(packet, &icmp_header, sizeof(icmp_header)); + memset(packet + sizeof(icmp_header), 0xa5, + PACKET_SIZE - sizeof(icmp_header)); + + test_daemon(); + test_waitsig(); + + ret = sendto(sock, packet, PACKET_SIZE, 0, + (struct sockaddr *)&addr, sizeof(addr)); + + if (ret < 0) { + fail("Can't send"); + return 1; + } + + addr_len = sizeof(recv_addr); + + ret = recvfrom(sock, recv_packet, sizeof(recv_packet), 0, + (struct sockaddr *)&recv_addr, &addr_len); + + if (ret < 0) { + fail("Can't recv"); + return 1; + } + + icmp_reply = (struct icmphdr *)recv_packet; + + if (icmp_reply->type != ICMP_ECHOREPLY) { + fail("Got no ICMP_ECHO_REPLY"); + return 1; + } + + close(sock); + + pass(); + return 0; +} From f8708ee4a02e4669a0494344202eaa3f43ec7005 Mon Sep 17 00:00:00 2001 From: Chuan Qiu Date: Thu, 12 Jun 2025 22:49:26 -0700 Subject: [PATCH 137/198] mount: Fix trailing / when a file is bind-mounted E.g. I have a /etc/hosts in workspace mounted from the host, and get the following message. (00.141008) 1: mnt-v2: Create plain mountpoint /tmp/.criu.mntns.K1biY1/mnt-0000000938 for 938 (00.141546) 1: mnt-v2: Mounting unsupported @938 (0) (00.141887) 1: mnt-v2: Bind /tmp/agent/1-d8c746c6fda3a8b2/workspace/etc/hosts/ to /tmp/.criu.mntns.K1biY1/mnt-0000000938 (00.142179) 1: Error (criu/mount-v2.c:319): mnt-v2: Failed to open_tree /tmp/agent/1-d8c746c6fda3a8b2/workspace/etc/hosts/: Not a directory (00.143774) Error (criu/cr-restore.c:2320): Restoring FAILED. 
Signed-off-by: Chuan Qiu --- criu/mount.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/criu/mount.c b/criu/mount.c index 06b9595427..b643a7f26e 100644 --- a/criu/mount.c +++ b/criu/mount.c @@ -888,7 +888,11 @@ static int resolve_external_mounts(struct mount_info *info) cut_root = cut_root_for_bind(m->root, match->root); - p = xsprintf("%s/%s", match->ns_mountpoint + 1, cut_root); + if (cut_root[0] == '\0') { + p = xstrdup(match->ns_mountpoint + 1); + } else { + p = xsprintf("%s/%s", match->ns_mountpoint + 1, cut_root); + } if (!p) return -1; From 1bfa74d904a2f144130bb961f5c359ad09d1044b Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Fri, 20 Jun 2025 13:44:32 +0800 Subject: [PATCH 138/198] zdtm: Add static/mnt_ext_file_bind_auto test The test creates a file bindmount in criu mntns and binds it into test mntns, this external file bindmount is autodetected and restored via "--external mnt[]" criu option. Note: In the previous patch we fixed the problem on this code path where file bindmount restore fails because there is an excess "/" in the source path. 
Signed-off-by: Pavel Tikhomirov --- test/zdtm/static/Makefile | 1 + test/zdtm/static/mnt_ext_file_bind_auto.c | 104 +++++++++++++++++++ test/zdtm/static/mnt_ext_file_bind_auto.desc | 4 + 3 files changed, 109 insertions(+) create mode 100644 test/zdtm/static/mnt_ext_file_bind_auto.c create mode 100644 test/zdtm/static/mnt_ext_file_bind_auto.desc diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index d427659e0e..ab69f389ed 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -381,6 +381,7 @@ TST_FILE = \ sk-unix-listen02 \ sk-unix-listen03 \ sk-unix-listen04 \ + mnt_ext_file_bind_auto \ TST_DIR = \ cwd00 \ diff --git a/test/zdtm/static/mnt_ext_file_bind_auto.c b/test/zdtm/static/mnt_ext_file_bind_auto.c new file mode 100644 index 0000000000..0c3b9f5fbd --- /dev/null +++ b/test/zdtm/static/mnt_ext_file_bind_auto.c @@ -0,0 +1,104 @@ +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check if external file mount works"; +const char *test_author = "Pavel Tikhomirov "; + +char *filename = "mnt_ext_file_bind_auto_bind_auto.file"; +TEST_OPTION(filename, string, "file name", 1); + +char *source = "mnt_ext_file_bind_auto_bind_auto.source"; + +int create_file(const char *path) +{ + int fd; + + fd = open(path, O_CREAT | O_RDWR, 0644); + if (fd < 0) { + pr_perror("open"); + return -1; + } + + close(fd); + return 0; +} + +int main(int argc, char **argv) +{ + char *zdtm_newns = getenv("ZDTM_NEWNS"); + char *tmp = "/tmp/zdtm_ext_file_bind_auto.tmp"; + char *sourcefile = "/tmp/zdtm_ext_file_bind_auto.file"; + char *root, tmpfile[PATH_MAX], testfile[PATH_MAX]; + + root = getenv("ZDTM_ROOT"); + if (root == NULL) { + pr_perror("root"); + return 1; + } + + if (!zdtm_newns) { + pr_perror("ZDTM_NEWNS is not set"); + return 1; + } else if (strcmp(zdtm_newns, "1")) { + goto test; + } + + /* Prepare file bindmount in criu root (source for external file bindmount) */ + mkdir(tmp, 
0755); + if (mount(source, tmp, "tmpfs", 0, NULL)) { + pr_perror("mount tmpfs"); + return 1; + } + if (mount(NULL, tmp, NULL, MS_PRIVATE, NULL)) { + pr_perror("make private"); + return 1; + } + + sprintf(tmpfile, "%s/%s", tmp, filename); + if (create_file(tmpfile)) + return 1; + + if (create_file(sourcefile)) + return 1; + + if (mount(tmpfile, sourcefile, NULL, MS_BIND, NULL)) { + pr_perror("bind"); + return 1; + } + + umount2(tmp, MNT_DETACH); + + /* Prepare file in test root (mount point for external file bindmount) */ + sprintf(testfile, "%s/%s", root, filename); + if (create_file(testfile)) + return 1; + + /* + * Create temporary mntns, next mounts will not show up in criu mntns + * and will be inherited into test mntns + */ + if (unshare(CLONE_NEWNS)) { + pr_perror("unshare"); + return 1; + } + + if (mount(sourcefile, testfile, NULL, MS_BIND, NULL)) { + pr_perror("bind"); + return 1; + } +test: + test_init(argc, argv); + + test_daemon(); + test_waitsig(); + + pass(); + return 0; +} diff --git a/test/zdtm/static/mnt_ext_file_bind_auto.desc b/test/zdtm/static/mnt_ext_file_bind_auto.desc new file mode 100644 index 0000000000..825b081274 --- /dev/null +++ b/test/zdtm/static/mnt_ext_file_bind_auto.desc @@ -0,0 +1,4 @@ +{ 'opts': '--external mnt[]', + 'feature': 'mnt_id', + 'flavor': 'ns uns', + 'flags': 'suid'} From 68f92b551c2a8f4a3e7945a95704ac32f6df9083 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Fri, 11 Jul 2025 22:16:49 +0100 Subject: [PATCH 139/198] images: remove symlink for descriptor.proto Currently the build scripts create the following symlink: criu-4.1/images/google/protobuf/descriptor.proto -> /usr/include/google/protobuf/descriptor.proto This symlink points to a system-wide absolute-path target. Also, this symlink ends up in the release tarball. The tarball may later be downloaded and unpacked by e.g. OS distributions. If unpacking is done using Python 3.14+, it will fail. 
This happens because Python 3.14 will switch the default behavior of extractall() from "fully trusting the content of archive" to "disallow common attack vectors while extracting the archive". With this new behavior, extractall() raises an exception when at least one file in the archive extracts or points to outside of the extraction directory (these are called path traversal attacks and zip slip attacks). Reported-by: Dmitrii Kuvaiskii Signed-off-by: Radostin Stoyanov --- .cirrus.yml | 7 ------- .lgtm.yml | 5 ----- images/Makefile | 17 ++++++++++++++++- images/google/protobuf/descriptor.proto | 1 - 4 files changed, 16 insertions(+), 14 deletions(-) delete mode 120000 images/google/protobuf/descriptor.proto diff --git a/.cirrus.yml b/.cirrus.yml index a4b53a54b0..bddd5a3f1c 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -15,7 +15,6 @@ task: setup_script: | scripts/ci/apt-install make gcc pkg-config git perl-modules iproute2 kmod wget cpu-checker sudo kvm-ok - ln -sf /usr/include/google/protobuf/descriptor.proto images/google/protobuf/descriptor.proto build_script: | make -C scripts/ci vagrant-fedora-no-vdso @@ -33,7 +32,6 @@ task: memory: 8G setup_script: | - ln -sf /usr/include/google/protobuf/descriptor.proto images/google/protobuf/descriptor.proto dnf config-manager --set-enabled crb # Same as CentOS 8 powertools dnf -y install epel-release epel-next-release dnf -y install --allowerasing asciidoc gcc git gnutls-devel libaio-devel libasan libcap-devel libnet-devel libnl3-devel libbsd-devel libselinux-devel make protobuf-c-devel protobuf-devel python-devel python-PyYAML python-protobuf python-junit_xml python3-importlib-metadata xmlto libdrm-devel libuuid-devel @@ -67,7 +65,6 @@ task: setup_script: | scripts/ci/apt-install make gcc pkg-config git perl-modules iproute2 kmod wget cpu-checker sudo kvm-ok - ln -sf /usr/include/google/protobuf/descriptor.proto images/google/protobuf/descriptor.proto build_script: | make -C scripts/ci vagrant-fedora-rawhide @@ -88,7 
+85,6 @@ task: setup_script: | scripts/ci/apt-install make gcc pkg-config git perl-modules iproute2 kmod wget cpu-checker sudo kvm-ok - ln -sf /usr/include/google/protobuf/descriptor.proto images/google/protobuf/descriptor.proto build_script: | make -C scripts/ci vagrant-fedora-non-root @@ -101,7 +97,6 @@ task: script: uname -a build_script: | scripts/ci/apt-install make - ln -sf /usr/include/google/protobuf/descriptor.proto images/google/protobuf/descriptor.proto make -C scripts/ci local task: @@ -113,7 +108,6 @@ task: script: uname -a build_script: | scripts/ci/apt-install make - ln -sf /usr/include/google/protobuf/descriptor.proto images/google/protobuf/descriptor.proto make -C scripts/ci local CLANG=1 task: @@ -125,6 +119,5 @@ task: script: uname -a build_script: | scripts/ci/prepare-for-fedora-rawhide.sh - ln -sf /usr/include/google/protobuf/descriptor.proto images/google/protobuf/descriptor.proto make -C scripts/ci/ local CC=gcc SKIP_CI_PREP=1 SKIP_CI_TEST=1 CD_TO_TOP=1 make -C test/zdtm -j 4 diff --git a/.lgtm.yml b/.lgtm.yml index 0dd49cda41..4beadcc637 100644 --- a/.lgtm.yml +++ b/.lgtm.yml @@ -23,8 +23,3 @@ extraction: - "python3-yaml" - "libnl-route-3-dev" - "gnutls-dev" - configure: - command: - - "ls -laR images/google" - - "ln -s /usr/include/google/protobuf/descriptor.proto images/google/protobuf/descriptor.proto" - - "ls -laR images/google" diff --git a/images/Makefile b/images/Makefile index d966fbfca0..e94346eee9 100644 --- a/images/Makefile +++ b/images/Makefile @@ -58,7 +58,6 @@ proto-obj-y += ext-file.o proto-obj-y += cgroup.o proto-obj-y += userns.o proto-obj-y += pidns.o -proto-obj-y += google/protobuf/descriptor.o # To make protoc happy and compile opts.proto proto-obj-y += opts.o proto-obj-y += seccomp.o proto-obj-y += binfmt-misc.o @@ -91,6 +90,22 @@ endef makefile-deps := Makefile $(obj)/Makefile +# +# Generate descriptor.pb-c.c and descriptor.pb-c.h to compile opts.proto. 
+PROTOBUF_DIR := images/google +DESCRIPTOR_DIR := $(PROTOBUF_DIR)/protobuf +GOOGLE_INCLUDE=$(shell pkg-config protobuf --variable=includedir)/google/protobuf +$(DESCRIPTOR_DIR)/descriptor.pb-c.c: $(GOOGLE_INCLUDE)/descriptor.proto + $$(Q) echo "Generating descriptor.pb-c.c" + $$(Q) protoc --proto_path=/usr/include --proto_path=$(obj)/ --c_out=$(obj)/ $< + +cleanup-y += $(DESCRIPTOR_DIR)/descriptor.pb-c.d + +submrproper: + $$(Q) rm -rf $(PROTOBUF_DIR) +.PHONY: submrproper +mrproper: submrproper + # # Generates rules needed to compile protobuf files. define gen-proto-rules diff --git a/images/google/protobuf/descriptor.proto b/images/google/protobuf/descriptor.proto deleted file mode 120000 index 07a4c9add6..0000000000 --- a/images/google/protobuf/descriptor.proto +++ /dev/null @@ -1 +0,0 @@ -/usr/include/google/protobuf/descriptor.proto \ No newline at end of file From 3b242c9ac44138684d27463ba603fe6f44c30fe7 Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Tue, 22 Jul 2025 20:14:45 -0700 Subject: [PATCH 140/198] images/Makefile: fix using $(Q) Commit 68f92b551 used `$$(Q)` instead of `$(Q)` in the Makefile target, which resulted in the following error: $(Q) echo "Generating descriptor.pb-c.c" /bin/sh: 1: Q: not found Generating descriptor.pb-c.c $(Q) protoc --proto_path=/usr/include --proto_path=images/ --c_out=images/ /usr/include/google/protobuf/descriptor.proto /bin/sh: 1: Q: not found as well as: $(Q) rm -rf images/google /bin/sh: line 1: Q: command not found Fix it. 
Signed-off-by: Kir Kolyshkin --- images/Makefile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/images/Makefile b/images/Makefile index e94346eee9..cb30a51268 100644 --- a/images/Makefile +++ b/images/Makefile @@ -96,13 +96,13 @@ PROTOBUF_DIR := images/google DESCRIPTOR_DIR := $(PROTOBUF_DIR)/protobuf GOOGLE_INCLUDE=$(shell pkg-config protobuf --variable=includedir)/google/protobuf $(DESCRIPTOR_DIR)/descriptor.pb-c.c: $(GOOGLE_INCLUDE)/descriptor.proto - $$(Q) echo "Generating descriptor.pb-c.c" - $$(Q) protoc --proto_path=/usr/include --proto_path=$(obj)/ --c_out=$(obj)/ $< + $(Q) echo "Generating descriptor.pb-c.c" + $(Q) protoc --proto_path=/usr/include --proto_path=$(obj)/ --c_out=$(obj)/ $< cleanup-y += $(DESCRIPTOR_DIR)/descriptor.pb-c.d submrproper: - $$(Q) rm -rf $(PROTOBUF_DIR) + $(Q) rm -rf $(PROTOBUF_DIR) .PHONY: submrproper mrproper: submrproper From e9521d8e416598ce5fc4c3b28918fb5b58d8de46 Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Tue, 22 Jul 2025 22:44:50 -0700 Subject: [PATCH 141/198] Keep images/google/protobuf directory Commit 68f92b551 removed images/google/protobuf directory, so it is re-created each time during the build process. This resulted in a weird behavior change. Previously, one could do something like this: git clone $CRURL criu (cd criu && sudo make install-criu) rm -rf criu This worked fine, including running rm -rf as a non-root user, since no new directories were created under criu -- all directories were still owned by the original user. Since commit 68f92b551 the same sequence fails: rm: cannot remove '/home/runner/criu/images/google/protobuf/descriptor.pb-c.c': Permission denied rm: cannot remove '/home/runner/criu/images/google/protobuf/descriptor.pb-c.d': Permission denied rm: cannot remove '/home/runner/criu/images/google/protobuf/descriptor.pb-c.h': Permission denied A workaround is to keep empty images/google/protobuf directory, which is what this commit does. 
Signed-off-by: Kir Kolyshkin --- .gitignore | 2 -- images/Makefile | 5 ++--- images/google/protobuf/.gitignore | 2 ++ 3 files changed, 4 insertions(+), 5 deletions(-) create mode 100644 images/google/protobuf/.gitignore diff --git a/.gitignore b/.gitignore index 854657d1c1..94daa13ea0 100644 --- a/.gitignore +++ b/.gitignore @@ -20,8 +20,6 @@ compel/compel compel/compel-host-bin images/*.c images/*.h -images/google/protobuf/*.c -images/google/protobuf/*.h .gitid criu/criu criu/unittest/unittest diff --git a/images/Makefile b/images/Makefile index cb30a51268..6f310e553c 100644 --- a/images/Makefile +++ b/images/Makefile @@ -92,8 +92,7 @@ makefile-deps := Makefile $(obj)/Makefile # # Generate descriptor.pb-c.c and descriptor.pb-c.h to compile opts.proto. -PROTOBUF_DIR := images/google -DESCRIPTOR_DIR := $(PROTOBUF_DIR)/protobuf +DESCRIPTOR_DIR := images/google/protobuf GOOGLE_INCLUDE=$(shell pkg-config protobuf --variable=includedir)/google/protobuf $(DESCRIPTOR_DIR)/descriptor.pb-c.c: $(GOOGLE_INCLUDE)/descriptor.proto $(Q) echo "Generating descriptor.pb-c.c" @@ -102,7 +101,7 @@ $(DESCRIPTOR_DIR)/descriptor.pb-c.c: $(GOOGLE_INCLUDE)/descriptor.proto cleanup-y += $(DESCRIPTOR_DIR)/descriptor.pb-c.d submrproper: - $(Q) rm -rf $(PROTOBUF_DIR) + $(Q) rm -f $(DESCRIPTOR_DIR)/* .PHONY: submrproper mrproper: submrproper diff --git a/images/google/protobuf/.gitignore b/images/google/protobuf/.gitignore new file mode 100644 index 0000000000..68359a7869 --- /dev/null +++ b/images/google/protobuf/.gitignore @@ -0,0 +1,2 @@ +*.c +*.h From 59970a606db314fd6b24cc6e00de054f7aebfb92 Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Tue, 22 Jul 2025 23:07:37 -0700 Subject: [PATCH 142/198] images/Makefile: use msg-gen In general, we use "$(E)" instead of "$(Q) echo", but we also have a msg-gen macro which can be used here. 
Signed-off-by: Kir Kolyshkin --- images/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/images/Makefile b/images/Makefile index 6f310e553c..2c33152e91 100644 --- a/images/Makefile +++ b/images/Makefile @@ -95,7 +95,7 @@ makefile-deps := Makefile $(obj)/Makefile DESCRIPTOR_DIR := images/google/protobuf GOOGLE_INCLUDE=$(shell pkg-config protobuf --variable=includedir)/google/protobuf $(DESCRIPTOR_DIR)/descriptor.pb-c.c: $(GOOGLE_INCLUDE)/descriptor.proto - $(Q) echo "Generating descriptor.pb-c.c" + $(call msg-gen, $@) $(Q) protoc --proto_path=/usr/include --proto_path=$(obj)/ --c_out=$(obj)/ $< cleanup-y += $(DESCRIPTOR_DIR)/descriptor.pb-c.d From 04012eac7f4dfe6bbe1a08e7ce7acf411e7a6229 Mon Sep 17 00:00:00 2001 From: Ignacio Moreno Gonzalez Date: Wed, 16 Jul 2025 16:32:25 +0200 Subject: [PATCH 143/198] compel: flush caches after parasite injection After the CRIU process saves the parasite code for the target thread in the shared mmap, it is necessary to call __clear_cache before the target thread executes the code. Without this step, the target thread may not see the correct code to execute, which can result in a SIGILL signal. For the specific arm64 case, this is important so that the newly copied code is flushed from d-cache to RAM, so that the target thread sees the new code. The change is based on commit 6be10a2 by @fu.lin and on input received from @adrianreber.
[ avagin: tweak code comment ] Signed-off-by: Ignacio Moreno Gonzalez Signed-off-by: Andrei Vagin --- compel/src/lib/infect.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/compel/src/lib/infect.c b/compel/src/lib/infect.c index 4ea27bc633..22fcf24fad 100644 --- a/compel/src/lib/infect.c +++ b/compel/src/lib/infect.c @@ -1054,6 +1054,16 @@ int compel_infect_no_daemon(struct parasite_ctl *ctl, unsigned long nr_threads, memcpy(ctl->local_map, ctl->pblob.hdr.mem, ctl->pblob.hdr.bsize); compel_relocs_apply(ctl->local_map, ctl->remote_map, &ctl->pblob); + /* + * Ensure the infected thread sees the updated code. + * + * On architectures like ARM64, the Data Cache (D-cache) and + * Instruction Cache (I-cache) are not automatically coherent. + * Modifications land in the D-cache, so we must flush (clean) the + * D-cache to push changes to RAM to ensure the CPU fetches the updated + * instructions. + */ + __builtin___clear_cache(ctl->local_map, ctl->local_map + ctl->pblob.hdr.bsize); p = parasite_size; From 6f0e4e848bcff71efa95a049c7b804ce1dcf2627 Mon Sep 17 00:00:00 2001 From: Ignacio Moreno Gonzalez Date: Wed, 16 Jul 2025 16:38:13 +0200 Subject: [PATCH 144/198] restore: flush caches during restore See the previous commit for rationale and architecture-specific details. [ avagin: tweak code comment ] Signed-off-by: Ignacio Moreno Gonzalez Signed-off-by: Andrei Vagin --- criu/cr-restore.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/criu/cr-restore.c b/criu/cr-restore.c index c1d1f4b9d5..b376035631 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -2569,6 +2569,17 @@ static int remap_restorer_blob(void *addr) restorer_setup_c_header_desc(&pbd, true); compel_relocs_apply(addr, addr, &pbd); + /* + * Ensure the infected thread sees the updated code. + * + * On architectures like ARM64, the Data Cache (D-cache) and + * Instruction Cache (I-cache) are not automatically coherent. 
+ * Modifications land in the D-cache, so we must flush (clean) the + * D-cache to push changes to RAM to ensure the CPU fetches the updated + * instructions. + */ + __builtin___clear_cache(addr, addr + pbd.hdr.bsize); + return 0; } From 27eb6c57c57d8c925839000f23f4ca31a069b111 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Fri, 25 Jul 2025 00:05:06 +0000 Subject: [PATCH 145/198] mount-v2: enter the mount namespace to restore propagation properties A kernel change (commit 12f147ddd6de, "do_change_type(): refuse to operate on unmounted/not ours mounts") modified how mount propagation properties can be changed. Previously, these properties could be changed from any mount namespace. Now, they can only be modified from the specific mount namespace where the target mount is actually mounted. This commit addresses this new restriction by ensuring that CRIU enters the correct mount namespace before attempting to restore mount propagation properties (MS_SLAVE or MS_SHARED) for a mount. Signed-off-by: Andrei Vagin --- criu/mount-v2.c | 37 +++++++++++++++++++++++++------------ 1 file changed, 25 insertions(+), 12 deletions(-) diff --git a/criu/mount-v2.c b/criu/mount-v2.c index eb4dd8119a..1e33ac12a2 100644 --- a/criu/mount-v2.c +++ b/criu/mount-v2.c @@ -933,8 +933,12 @@ static int move_mount_set_group(int src_id, char *source, int dst_id) static int restore_one_sharing(struct sharing_group *sg, struct mount_info *target) { + int nsfd = -1, orig_nsfd = -1, exit_code = -1; char target_path[PATH_MAX]; - int target_fd; + int target_fd = -1; + + if (!sg->master_id && !sg->shared_id) + return 0; target_fd = fdstore_get(target->mnt_fd_id); BUG_ON(target_fd < 0); @@ -949,8 +953,7 @@ static int restore_one_sharing(struct sharing_group *sg, struct mount_info *targ first = get_first_mount(sg->parent); if (move_mount_set_group(first->mnt_fd_id, NULL, target->mnt_fd_id)) { pr_err("Failed to copy sharing from %d to %d\n", first->mnt_id, target->mnt_id); - close(target_fd); - return -1; + goto
err; } } else { /* @@ -962,16 +965,23 @@ static int restore_one_sharing(struct sharing_group *sg, struct mount_info *targ */ if (move_mount_set_group(-1, sg->source, target->mnt_fd_id)) { pr_err("Failed to copy sharing from source %s to %d\n", sg->source, target->mnt_id); - close(target_fd); - return -1; + goto err; } } + } + + nsfd = fdstore_get(target->nsid->mnt.nsfd_id); + if (nsfd < 0) + goto err; + + if (switch_ns_by_fd(nsfd, &mnt_ns_desc, &orig_nsfd)) + goto err; + if (sg->master_id) { /* Convert shared_id to master_id */ if (mount(NULL, target_path, NULL, MS_SLAVE, NULL)) { pr_perror("Failed to make mount %d slave", target->mnt_id); - close(target_fd); - return -1; + goto err; } } @@ -979,13 +989,16 @@ static int restore_one_sharing(struct sharing_group *sg, struct mount_info *targ if (sg->shared_id) { if (mount(NULL, target_path, NULL, MS_SHARED, NULL)) { pr_perror("Failed to make mount %d shared", target->mnt_id); - close(target_fd); - return -1; + goto err; } } - close(target_fd); - - return 0; + exit_code = 0; +err: + close_safe(&target_fd); + close_safe(&nsfd); + if (orig_nsfd >= 0 && restore_ns(orig_nsfd, &mnt_ns_desc)) + exit_code = -1; + return exit_code; } static int restore_one_sharing_group(struct sharing_group *sg) From e852a760b61f5f75645de22bebdaf28bb1f664d5 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Fri, 25 Jul 2025 07:53:55 +0100 Subject: [PATCH 146/198] vagrant: fix 'qemu' install Installing this package currently fails with the following message: Package qemu is not available, but is referred to by another package. 
This may mean that the package is missing, has been obsoleted, or is only available from another source E: Package 'qemu' has no installation candidate Signed-off-by: Radostin Stoyanov --- scripts/ci/vagrant.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/ci/vagrant.sh b/scripts/ci/vagrant.sh index ed5a011787..c3e15007c2 100755 --- a/scripts/ci/vagrant.sh +++ b/scripts/ci/vagrant.sh @@ -22,7 +22,7 @@ setup() { wget --no-check-certificate https://releases.hashicorp.com/vagrant/${VAGRANT_VERSION}/vagrant_${VAGRANT_VERSION}-1_"$(dpkg --print-architecture)".deb -O /tmp/vagrant.deb && \ dpkg -i /tmp/vagrant.deb - ./apt-install libvirt-clients libvirt-daemon-system libvirt-dev qemu-utils qemu \ + ./apt-install libvirt-clients libvirt-daemon-system libvirt-dev qemu-utils qemu-system \ ruby build-essential libxml2-dev qemu-kvm rsync ebtables dnsmasq-base \ openssh-client systemctl restart libvirtd From f7d70b15e5df2644d520003faca5897cea14895f Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Fri, 25 Jul 2025 08:34:31 +0100 Subject: [PATCH 147/198] vagrant: update image to fedora 42 Signed-off-by: Radostin Stoyanov --- scripts/ci/vagrant.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/ci/vagrant.sh b/scripts/ci/vagrant.sh index c3e15007c2..81af5d2e5f 100755 --- a/scripts/ci/vagrant.sh +++ b/scripts/ci/vagrant.sh @@ -6,9 +6,9 @@ set -e set -x -VAGRANT_VERSION=2.4.1 -FEDORA_VERSION=40 -FEDORA_BOX_VERSION=40.20240414.0 +VAGRANT_VERSION=2.4.7 +FEDORA_VERSION=42 +FEDORA_BOX_VERSION=1.1.0 setup() { if [ -n "$TRAVIS" ]; then @@ -27,7 +27,7 @@ setup() { openssh-client systemctl restart libvirtd vagrant plugin install vagrant-libvirt - vagrant init fedora/${FEDORA_VERSION}-cloud-base --box-version ${FEDORA_BOX_VERSION} + vagrant init cloud-image/fedora-${FEDORA_VERSION} --box-version ${FEDORA_BOX_VERSION} # The default libvirt Vagrant VM uses 512MB. # Travis VMs should have around 7.5GB. 
# Increasing it to 4GB should work. From 116e56ba46382c05066d33a8bbadcc495dbdb644 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Fri, 25 Jul 2025 08:50:29 +0100 Subject: [PATCH 148/198] vagrant: fix tar including archive in itself The tar command was failing with the following message: $ tar cf criu.tar ../../../criu tar: Removing leading `../../../' from member names tar: ../../../criu/scripts/ci/criu.tar: archive cannot contain itself; not dumped In addition, the /vagrant directory no longer exists in the new Fedora images. bash: line 1: cd: /vagrant: No such file or directory Signed-off-by: Radostin Stoyanov --- scripts/ci/vagrant.sh | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/scripts/ci/vagrant.sh b/scripts/ci/vagrant.sh index 81af5d2e5f..008a01fb35 100755 --- a/scripts/ci/vagrant.sh +++ b/scripts/ci/vagrant.sh @@ -17,7 +17,7 @@ setup() { fi # Tar up the git checkout to have vagrant rsync it to the VM - tar cf criu.tar ../../../criu + tar cf /tmp/criu.tar -C ../../../ criu # Cirrus has problems with the following certificate. wget --no-check-certificate https://releases.hashicorp.com/vagrant/${VAGRANT_VERSION}/vagrant_${VAGRANT_VERSION}-1_"$(dpkg --print-architecture)".deb -O /tmp/vagrant.deb && \ dpkg -i /tmp/vagrant.deb @@ -28,10 +28,16 @@ setup() { systemctl restart libvirtd vagrant plugin install vagrant-libvirt vagrant init cloud-image/fedora-${FEDORA_VERSION} --box-version ${FEDORA_BOX_VERSION} + # The default libvirt Vagrant VM uses 512MB. # Travis VMs should have around 7.5GB.
sed -i Vagrantfile -e 's,^end$, config.vm.provider :libvirt do |libvirt|'"\n"' libvirt.memory = 4096;end'"\n"'end,g' + # Sync /tmp/criu.tar into the VM + # We want to use $HOME without expansion + # shellcheck disable=SC2016 + sed -i Vagrantfile -e 's|^end$| config.vm.provision "file", source: "/tmp/criu.tar", destination: "$HOME/criu.tar"'"\n"'end|g' + vagrant up --provider=libvirt --no-tty mkdir -p /root/.ssh vagrant ssh-config >> /root/.ssh/config @@ -40,8 +46,11 @@ setup() { libasan libcap-devel libnet-devel libnl3-devel libbsd-devel make protobuf-c-devel \ protobuf-devel python3-protobuf python3-importlib-metadata python3-junit_xml \ rubygem-asciidoctor iptables libselinux-devel libbpf-devel python3-yaml libuuid-devel + # Disable sssd to avoid zdtm test failures in pty04 due to sssd socket ssh default sudo systemctl mask sssd + + ssh default 'sudo mkdir -p --mode=777 /vagrant && mv $HOME/criu.tar /vagrant && cd /vagrant && tar xf criu.tar' ssh default cat /proc/cmdline } @@ -49,7 +58,7 @@ fedora-no-vdso() { ssh default sudo grubby --update-kernel ALL --args="vdso=0" vagrant reload ssh default cat /proc/cmdline - ssh default 'cd /vagrant; tar xf criu.tar; cd criu; make -j 4' + ssh default 'cd /vagrant/criu; make -j' ssh default 'cd /vagrant/criu/test; sudo ./zdtm.py run -a --keep-going' # This test (pidfd_store_sk) requires pidfd_getfd syscall which is guaranteed in Fedora 33. # It is also skipped from -a because it runs in RPC mode only @@ -74,12 +83,12 @@ fedora-rawhide() { # In the container it is not possible to change the state of selinux. # Let's just disable it for this test run completely. 
ssh default 'sudo setenforce Permissive' - ssh default 'cd /vagrant; tar xf criu.tar; cd criu; sudo -E make -C scripts/ci fedora-rawhide CONTAINER_RUNTIME=podman BUILD_OPTIONS="--security-opt seccomp=unconfined"' + ssh default 'cd /vagrant/criu; sudo -E make -C scripts/ci fedora-rawhide CONTAINER_RUNTIME=podman BUILD_OPTIONS="--security-opt seccomp=unconfined"' } fedora-non-root() { ssh default uname -a - ssh default 'cd /vagrant; tar xf criu.tar; cd criu; make -j 4' + ssh default 'cd /vagrant/criu; make -j' # Setting the capability should be the only line needed to run as non-root on Fedora # In other environments either set /proc/sys/kernel/yama/ptrace_scope to 0 or grant cap_sys_ptrace to criu ssh default 'sudo setcap cap_checkpoint_restore+eip /vagrant/criu/criu/criu' From 9d116b68785edc0105cd8319769e13d0d325cc24 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Sun, 10 Aug 2025 21:46:39 +0000 Subject: [PATCH 149/198] zdtm/socket-tcp-closing: fill socket buffers effectively Send large chunks to fill socket buffers.
Signed-off-by: Andrei Vagin --- test/zdtm/static/socket-tcp-closing.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/test/zdtm/static/socket-tcp-closing.c b/test/zdtm/static/socket-tcp-closing.c index 87e1d75337..df291d4464 100644 --- a/test/zdtm/static/socket-tcp-closing.c +++ b/test/zdtm/static/socket-tcp-closing.c @@ -31,10 +31,13 @@ static int port = 8880; int fill_sock_buf(int fd) { + char zdtm[512]; int flags; int size; int ret; + memset(zdtm, 5, sizeof(zdtm)); + flags = fcntl(fd, F_GETFL, 0); if (flags == -1) { pr_perror("Can't get flags"); @@ -47,7 +50,6 @@ int fill_sock_buf(int fd) size = 0; while (1) { - char zdtm[] = "zdtm test packet"; ret = write(fd, zdtm, sizeof(zdtm)); if (ret == -1) { if (errno == EAGAIN) From 90cbfdf9365620df81a74c6ef983a021d94b1442 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Sun, 10 Aug 2025 21:50:41 +0000 Subject: [PATCH 150/198] CI: Consolidate arm64 tests on GitHub runners The arm64 tests are currently being executed on both actuated and GitHub runners. This change removes the actuated runner to avoid redundancy and streamline our CI process. Signed-off-by: Andrei Vagin --- .github/workflows/aarch64-test.yaml | 32 +++++++++++ .github/workflows/actuated-aarch64-test.yaml | 58 -------------------- 2 files changed, 32 insertions(+), 58 deletions(-) create mode 100644 .github/workflows/aarch64-test.yaml delete mode 100644 .github/workflows/actuated-aarch64-test.yaml diff --git a/.github/workflows/aarch64-test.yaml b/.github/workflows/aarch64-test.yaml new file mode 100644 index 0000000000..32b19e1766 --- /dev/null +++ b/.github/workflows/aarch64-test.yaml @@ -0,0 +1,32 @@ +name: aarch64 test + +on: [push, pull_request] + +# Cancel any preceding run on the pull request. 
+concurrency: + group: aarch64-test-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: ${{ github.ref != 'refs/heads/criu-dev' }} + +jobs: + build: + runs-on: ubuntu-24.04-arm + strategy: + matrix: + target: [GCC=1, CLANG=1] + + steps: + - uses: actions/checkout@v4 + - name: Run Tests ${{ matrix.target }} + # Following tests are failing on the VMs: + # ./change_mnt_context --pidfile=change_mnt_context.pid --outfile=change_mnt_context.out + # 45: ERR: change_mnt_context.c:23: mount (errno = 22 (Invalid argument)) + # + # In combination with '--remote-lazy-pages' following error occurs: + # 138: FAIL: maps05.c:84: Data corrupted at page 1639 (errno = 11 (Resource temporarily unavailable)) + run: | + # The 'sched_policy00' needs the following: + sudo sysctl -w kernel.sched_rt_runtime_us=-1 + # etc/hosts entry is needed for netns_lock_iptables + echo "127.0.0.1 localhost" | sudo tee -a /etc/hosts + sudo -E make -C scripts/ci local ${{ matrix.target }} RUN_TESTS=1 \ + ZDTM_OPTS="-x zdtm/static/change_mnt_context -x zdtm/static/maps05" diff --git a/.github/workflows/actuated-aarch64-test.yaml b/.github/workflows/actuated-aarch64-test.yaml deleted file mode 100644 index 567746a5f4..0000000000 --- a/.github/workflows/actuated-aarch64-test.yaml +++ /dev/null @@ -1,58 +0,0 @@ -name: aarch64 test - -on: [push, pull_request] - -# Cancel any preceding run on the pull request. -concurrency: - group: actuated-test-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: ${{ github.ref != 'refs/heads/criu-dev' }} - -jobs: - build: - # Actuated runners are not available in all repositories. - if: ${{ github.repository == 'checkpoint-restore/criu' }} - # The memory size and the number of CPUs can be freely selected for - # the actuated runners. 3GB and 4 CPUs seems to be enough according to the - # result from 'vmmeter'. 
- runs-on: ${{ matrix.os }} - strategy: - fail-fast: false - matrix: - os: [actuated-arm64-4cpu-3gb, ubuntu-24.04-arm] - target: [GCC=1, CLANG=1] - - steps: - # https://gist.github.com/alexellis/1f33e581c75e11e161fe613c46180771#file-metering-gha-md - # vmmeter start - - name: Prepare arkade - if: ${{ matrix.os == 'actuated-arm64-4cpu-3gb' }} - uses: alexellis/arkade-get@master - with: - crane: latest - print-summary: false - - - name: Install vmmeter - if: ${{ matrix.os == 'actuated-arm64-4cpu-3gb' }} - run: | - crane export --platform linux/arm64 ghcr.io/openfaasltd/vmmeter:latest | sudo tar -xvf - -C /usr/local/bin - - - name: Run vmmeter - if: ${{ matrix.os == 'actuated-arm64-4cpu-3gb' }} - uses: self-actuated/vmmeter-action@master - # vmmeter end - - - uses: actions/checkout@v4 - - name: Run Tests ${{ matrix.target }}/${{ matrix.os }} - # Following tests are failing on the actuated VMs: - # ./change_mnt_context --pidfile=change_mnt_context.pid --outfile=change_mnt_context.out - # 45: ERR: change_mnt_context.c:23: mount (errno = 22 (Invalid argument)) - # - # In combination with '--remote-lazy-pages' following error occurs: - # 138: FAIL: maps05.c:84: Data corrupted at page 1639 (errno = 11 (Resource temporarily unavailable)) - run: | - # The 'sched_policy00' needs the following: - sudo sysctl -w kernel.sched_rt_runtime_us=-1 - # etc/hosts entry is needed for netns_lock_iptables - echo "127.0.0.1 localhost" | sudo tee -a /etc/hosts - sudo -E make -C scripts/ci local ${{ matrix.target }} RUN_TESTS=1 \ - ZDTM_OPTS="-x zdtm/static/change_mnt_context -x zdtm/static/maps05" From 91847a6181b2dabfbd32166da347a3bdb1a9494d Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Sun, 6 Apr 2025 20:22:57 +0200 Subject: [PATCH 151/198] criu/include/mman: define MADV_GUARD_INSTALL Signed-off-by: Alexander Mikhalitsyn --- criu/include/mman.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/criu/include/mman.h b/criu/include/mman.h index 086753bcf5..43e0b6cc7a 100644 
--- a/criu/include/mman.h +++ b/criu/include/mman.h @@ -19,5 +19,8 @@ #ifndef MADV_WIPEONFORK #define MADV_WIPEONFORK 18 #endif +#ifndef MADV_GUARD_INSTALL +#define MADV_GUARD_INSTALL 102 +#endif #endif /* __CR_MMAN_H__ */ From 54c078e6c2d2aaf218eca790b9d7065bc7ca3e11 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Sat, 19 Apr 2025 20:37:48 +0200 Subject: [PATCH 152/198] kerndat: add madvise(MADV_GUARD_INSTALL) feature-detection Signed-off-by: Alexander Mikhalitsyn --- criu/include/kerndat.h | 1 + criu/kerndat.c | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 33 insertions(+) diff --git a/criu/include/kerndat.h b/criu/include/kerndat.h index c5deb32832..66db756497 100644 --- a/criu/include/kerndat.h +++ b/criu/include/kerndat.h @@ -91,6 +91,7 @@ struct kerndat_s { bool has_close_range; bool has_timer_cr_ids; bool has_breakpoints; + bool has_madv_guard; }; extern struct kerndat_s kdat; diff --git a/criu/kerndat.c b/criu/kerndat.c index fa43f7d3f2..7e2edb72d0 100644 --- a/criu/kerndat.c +++ b/criu/kerndat.c @@ -31,6 +31,7 @@ #include "kerndat.h" #include "fs-magic.h" #include "mem.h" +#include "mman.h" #include "common/compiler.h" #include "sysctl.h" #include "cr_options.h" @@ -1813,6 +1814,33 @@ static int kerndat_breakpoints(void) return exit_code; } +static int kerndat_has_madv_guard(void) +{ + void *map; + + map = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, 0, 0); + if (map == MAP_FAILED) { + pr_perror("Can't mmap a page for has_madv_guard feature test"); + return -1; + } + + if (madvise(map, PAGE_SIZE, MADV_GUARD_INSTALL)) { + if (errno != EINVAL) { + pr_perror("madvise failed (has_madv_guard check)"); + goto mmap_cleanup; + } + } else { + kdat.has_madv_guard = true; + } + + munmap(map, PAGE_SIZE); + return 0; + +mmap_cleanup: + munmap(map, PAGE_SIZE); + return -1; +} + /* * Some features depend on resource that can be dynamically changed * at the OS runtime. 
There are cases that we cannot determine the @@ -2081,6 +2109,10 @@ int kerndat_init(void) pr_err("kerndat_breakpoints has failed when initializing kerndat.\n"); ret = -1; } + if (!ret && kerndat_has_madv_guard()) { + pr_err("kerndat_has_madv_guard has failed when initializing kerndat.\n"); + ret = -1; + } kerndat_lsm(); kerndat_mmap_min_addr(); From 66b46e59884f773cc45e992a4d16ef226baedc1c Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Sat, 19 Apr 2025 20:36:45 +0200 Subject: [PATCH 153/198] kerndat: add pagemap_scan_guard_pages feature check logic Signed-off-by: Alexander Mikhalitsyn --- criu/cr-check.c | 8 ++++++++ criu/include/kerndat.h | 3 +++ criu/include/pagemap_scan.h | 1 + criu/kerndat.c | 12 ++++++++++++ 4 files changed, 24 insertions(+) diff --git a/criu/cr-check.c b/criu/cr-check.c index 9c4778490e..7c3dc76dd8 100644 --- a/criu/cr-check.c +++ b/criu/cr-check.c @@ -1599,6 +1599,12 @@ static int check_breakpoints(void) return 0; } +static int check_pagemap_scan_guard_pages(void) +{ + kerndat_warn_about_madv_guards(); + + return kdat.has_pagemap_scan_guard_pages ? 
0 : -1; +} static int (*chk_feature)(void); @@ -1724,6 +1730,7 @@ int cr_check(void) ret |= check_pagemap_scan(); ret |= check_overlayfs_maps(); ret |= check_timer_cr_ids(); + ret |= check_pagemap_scan_guard_pages(); if (kdat.lsm == LSMTYPE__APPARMOR) ret |= check_apparmor_stacking(); @@ -1853,6 +1860,7 @@ static struct feature_list feature_list[] = { { "timer_cr_ids", check_timer_cr_ids }, { "overlayfs_maps", check_overlayfs_maps }, { "breakpoints", check_breakpoints }, + { "pagemap_scan_guard_pages", check_pagemap_scan_guard_pages }, { NULL, NULL }, }; diff --git a/criu/include/kerndat.h b/criu/include/kerndat.h index 66db756497..e4922f401d 100644 --- a/criu/include/kerndat.h +++ b/criu/include/kerndat.h @@ -92,6 +92,7 @@ struct kerndat_s { bool has_timer_cr_ids; bool has_breakpoints; bool has_madv_guard; + bool has_pagemap_scan_guard_pages; }; extern struct kerndat_s kdat; @@ -114,4 +115,6 @@ extern int kerndat_fs_virtualized(unsigned int which, u32 kdev); extern int kerndat_has_nspid(void); +extern void kerndat_warn_about_madv_guards(void); + #endif /* __CR_KERNDAT_H__ */ diff --git a/criu/include/pagemap_scan.h b/criu/include/pagemap_scan.h index 0ad4c9bc0b..9046e01edf 100644 --- a/criu/include/pagemap_scan.h +++ b/criu/include/pagemap_scan.h @@ -14,6 +14,7 @@ #define PAGE_IS_PFNZERO (1 << 5) #define PAGE_IS_HUGE (1 << 6) #define PAGE_IS_SOFT_DIRTY (1 << 7) +#define PAGE_IS_GUARD (1 << 8) /* * struct page_region - Page region with flags diff --git a/criu/kerndat.c b/criu/kerndat.c index 7e2edb72d0..997181ce75 100644 --- a/criu/kerndat.c +++ b/criu/kerndat.c @@ -87,6 +87,10 @@ static int check_pagemap(void) if (ioctl(fd, PAGEMAP_SCAN, &args) == 0) { pr_debug("PAGEMAP_SCAN is supported\n"); kdat.has_pagemap_scan = true; + + args.return_mask |= PAGE_IS_GUARD; + if (ioctl(fd, PAGEMAP_SCAN, &args) == 0) + kdat.has_pagemap_scan_guard_pages = true; } else { switch (errno) { case EINVAL: @@ -1841,6 +1845,14 @@ static int kerndat_has_madv_guard(void) return -1; } +void 
kerndat_warn_about_madv_guards(void) +{ + if (kdat.has_madv_guard && !kdat.has_pagemap_scan_guard_pages) + pr_warn("ioctl(PAGEMAP_SCAN) doesn't support PAGE_IS_GUARD flag. " + "CRIU dump will fail if dumped processes use madvise(MADV_GUARD_INSTALL). " + "Please, consider updating your kernel.\n"); +} + /* * Some features depend on resource that can be dynamically changed * at the OS runtime. There are cases that we cannot determine the From 66251ba1d9253e0864f471077a4fb4abb6ae5200 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Sat, 19 Apr 2025 20:42:43 +0200 Subject: [PATCH 154/198] cr-dump: warn if MADV_GUARD is supported but isn't shown in pagemap Signed-off-by: Alexander Mikhalitsyn --- criu/cr-dump.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/criu/cr-dump.c b/criu/cr-dump.c index b8cf7d64d9..f02db1a57f 100644 --- a/criu/cr-dump.c +++ b/criu/cr-dump.c @@ -2124,6 +2124,8 @@ int cr_dump_tasks(pid_t pid) int pre_dump_ret = 0; int ret = -1; + kerndat_warn_about_madv_guards(); + pr_info("========================================\n"); pr_info("Dumping processes (pid: %d comm: %s)\n", pid, __task_comm_info(pid)); pr_info("========================================\n"); From 0852cf97ec49326089669f7b2f6078fe5d7eb634 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Sun, 6 Apr 2025 20:51:24 +0200 Subject: [PATCH 155/198] criu/pagemap-cache: pagescan: look for PAGE_IS_GUARD pages Signed-off-by: Alexander Mikhalitsyn --- criu/pagemap-cache.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/criu/pagemap-cache.c b/criu/pagemap-cache.c index f04a517de3..457c0d6497 100644 --- a/criu/pagemap-cache.c +++ b/criu/pagemap-cache.c @@ -194,6 +194,9 @@ int pmc_fill(pmc_t *pmc, u64 start, u64 end) }; long ret; + if (kdat.has_pagemap_scan_guard_pages) + args.return_mask |= PAGE_IS_GUARD; + ret = ioctl(pmc->fd, PAGEMAP_SCAN, &args); if (ret == -1) { pr_perror("PAGEMAP_SCAN"); From 53d3e0c7d3ecf19dcc28a16e6aefeb2793bd6a38 Mon Sep 17 00:00:00 2001 From: 
Alexander Mikhalitsyn Date: Thu, 1 May 2025 20:02:37 +0200 Subject: [PATCH 156/198] criu/mem: refactor should_dump_page helper Make should_dump_page() return int to indicate failure, and return the useful data through a struct page_info passed by pointer. Convert all call sites correspondingly. No functional changes intended, except for fixing a bug: should_dump_page() could return (-1) when pmc_fill() fails, which callers did not expect. Signed-off-by: Alexander Mikhalitsyn --- criu/include/mem.h | 8 +++++- criu/mem.c | 67 ++++++++++++++++++++++++++++++---------------- criu/shmem.c | 27 ++++++++++++------- 3 files changed, 68 insertions(+), 34 deletions(-) diff --git a/criu/include/mem.h b/criu/include/mem.h index 3618c9cc3b..0ce97822b2 100644 --- a/criu/include/mem.h +++ b/criu/include/mem.h @@ -49,5 +49,11 @@ int prepare_vmas(struct pstree_item *t, struct task_restore_args *ta); int unmap_guard_pages(struct pstree_item *t); int prepare_mappings(struct pstree_item *t); -u64 should_dump_page(pmc_t *pmc, VmaEntry *vmae, u64 vaddr, bool *softdirty); +struct page_info { + u64 next; + bool softdirty; +}; + +int should_dump_page(pmc_t *pmc, VmaEntry *vmae, u64 vaddr, struct page_info *page_info); + #endif /* __CR_MEM_H__ */ diff --git a/criu/mem.c b/criu/mem.c index 803cb545b5..9fcf7a44c6 100644 --- a/criu/mem.c +++ b/criu/mem.c @@ -115,27 +115,37 @@ static bool should_dump_entire_vma(VmaEntry *vmae) } /* - * should_dump_page returns vaddr if an addressed page has to be dumped. - * Otherwise, it returns an address that has to be inspected next. + * should_dump_page writes vaddr in page_info->next if an addressed page has to be dumped. + * Otherwise, it writes an address that has to be inspected next.
*/ -u64 should_dump_page(pmc_t *pmc, VmaEntry *vmae, u64 vaddr, bool *softdirty) +int should_dump_page(pmc_t *pmc, VmaEntry *vmae, u64 vaddr, struct page_info *page_info) { + if (!page_info) + goto err; + if (vaddr >= pmc->end && pmc_fill(pmc, vaddr, vmae->end)) - return -1; + goto err; if (pmc->regs) { while (1) { - if (pmc->regs_idx == pmc->regs_len) - return pmc->end; + if (pmc->regs_idx == pmc->regs_len) { + page_info->next = pmc->end; + return 0; + } + if (vaddr < pmc->regs[pmc->regs_idx].end) break; pmc->regs_idx++; } - if (vaddr < pmc->regs[pmc->regs_idx].start) - return pmc->regs[pmc->regs_idx].start; - if (softdirty) - *softdirty = pmc->regs[pmc->regs_idx].categories & PAGE_IS_SOFT_DIRTY; - return vaddr; + + if (vaddr < pmc->regs[pmc->regs_idx].start) { + page_info->next = pmc->regs[pmc->regs_idx].start; + return 0; + } + + page_info->softdirty = pmc->regs[pmc->regs_idx].categories & PAGE_IS_SOFT_DIRTY; + page_info->next = vaddr; + return 0; } else { u64 pme = pmc->map[PAGE_PFN(vaddr - pmc->start)]; @@ -143,16 +153,26 @@ u64 should_dump_page(pmc_t *pmc, VmaEntry *vmae, u64 vaddr, bool *softdirty) * Optimisation for private mapping pages, that haven't * yet being COW-ed */ - if (vma_entry_is(vmae, VMA_FILE_PRIVATE) && (pme & PME_FILE)) - return vaddr + PAGE_SIZE; + if (vma_entry_is(vmae, VMA_FILE_PRIVATE) && (pme & PME_FILE)) { + page_info->next = vaddr + PAGE_SIZE; + return 0; + } + if ((pme & (PME_PRESENT | PME_SWAP)) && !__page_is_zero(pme)) { - if (softdirty) - *softdirty = pme & PME_SOFT_DIRTY; - return vaddr; + page_info->softdirty = pme & PME_SOFT_DIRTY; + page_info->next = vaddr; + return 0; } - return vaddr + PAGE_SIZE; + page_info->next = vaddr + PAGE_SIZE; + return 0; } + +err: + pr_err("should_dump_page failed on vma " + "%#016" PRIx64 "-%#016" PRIx64 " vaddr=%#016" PRIx64 "\n", + vmae->start, vmae->end, vaddr); + return -1; } bool page_is_zero(u64 pme) @@ -202,14 +222,15 @@ static int generate_iovs(struct pstree_item *item, struct vma_area 
*vma, struct nr_scanned = 0; for (vaddr = *pvaddr; vaddr < vma->e->end; vaddr += PAGE_SIZE, nr_scanned++) { unsigned int ppb_flags = 0; - bool softdirty = false; - u64 next; + struct page_info page_info = {}; int st; /* If dump_all_pages is true, should_dump_page is called to get pme. */ - next = should_dump_page(pmc, vma->e, vaddr, &softdirty); - if (!dump_all_pages && next != vaddr) { - vaddr = next - PAGE_SIZE; + if (should_dump_page(pmc, vma->e, vaddr, &page_info)) + return -1; + + if (!dump_all_pages && page_info.next != vaddr) { + vaddr = page_info.next - PAGE_SIZE; continue; } @@ -223,7 +244,7 @@ static int generate_iovs(struct pstree_item *item, struct vma_area *vma, struct * page. The latter would be checked in page-xfer. */ - if (has_parent && page_in_parent(softdirty)) { + if (has_parent && page_in_parent(page_info.softdirty)) { ret = page_pipe_add_hole(pp, vaddr, PP_HOLE_PARENT); st = 0; } else { diff --git a/criu/shmem.c b/criu/shmem.c index 9e3178352d..bc7aa36695 100644 --- a/criu/shmem.c +++ b/criu/shmem.c @@ -206,31 +206,34 @@ static int expand_shmem(struct shmem_info *si, unsigned long new_size) return 0; } -static void update_shmem_pmaps(struct shmem_info *si, pmc_t *pmc, VmaEntry *vma) +static int update_shmem_pmaps(struct shmem_info *si, pmc_t *pmc, VmaEntry *vma) { unsigned long shmem_pfn, vma_pfn, vma_pgcnt; u64 vaddr; if (!is_shmem_tracking_en()) - return; + return 0; vma_pgcnt = DIV_ROUND_UP(si->size - vma->pgoff, PAGE_SIZE); for (vma_pfn = 0, vaddr = vma->start; vma_pfn < vma_pgcnt; ++vma_pfn, vaddr += PAGE_SIZE) { - bool softdirty = false; - u64 next; + struct page_info page_info = {}; + + if (should_dump_page(pmc, vma, vaddr, &page_info)) + return -1; - next = should_dump_page(pmc, vma, vaddr, &softdirty); - if (next != vaddr) { - vaddr = next - PAGE_SIZE; + if (page_info.next != vaddr) { + vaddr = page_info.next - PAGE_SIZE; continue; } shmem_pfn = vma_pfn + DIV_ROUND_UP(vma->pgoff, PAGE_SIZE); - if (softdirty) + if (page_info.softdirty) 
set_pstate(si->pstate_map, shmem_pfn, PST_DIRTY); else set_pstate(si->pstate_map, shmem_pfn, PST_DUMP); } + + return 0; } int collect_sysv_shmem(unsigned long shmid, unsigned long size) @@ -667,7 +670,9 @@ int add_shmem_area(pid_t pid, VmaEntry *vma, pmc_t *pmc) if (expand_shmem(si, size)) return -1; } - update_shmem_pmaps(si, pmc, vma); + + if (update_shmem_pmaps(si, pmc, vma)) + return -1; return 0; } @@ -684,7 +689,9 @@ int add_shmem_area(pid_t pid, VmaEntry *vma, pmc_t *pmc) if (expand_shmem(si, size)) return -1; - update_shmem_pmaps(si, pmc, vma); + + if (update_shmem_pmaps(si, pmc, vma)) + return -1; return 0; } From 6ca4d6fa98fa40c110b82a5dc917e861c0494ce3 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Sun, 6 Apr 2025 20:42:26 +0200 Subject: [PATCH 157/198] criu/mem: dump: skip MADV_GUARD pages content dump 1. get info about MADV_GUARD_INSTALL-protected pages with the help of pagemap by looking for PME_GUARD_REGION flag if /proc/[pid]/pagemap is used or by looking for PAGE_IS_GUARD flag if ioctl(PAGEMAP_SCAN) is used 2.
skip those pages Signed-off-by: Alexander Mikhalitsyn --- criu/include/mem.h | 1 + criu/mem.c | 10 ++++++++++ 2 files changed, 11 insertions(+) diff --git a/criu/include/mem.h b/criu/include/mem.h index 0ce97822b2..b2cbd4b640 100644 --- a/criu/include/mem.h +++ b/criu/include/mem.h @@ -35,6 +35,7 @@ extern int parasite_dump_pages_seized(struct pstree_item *item, struct vm_area_l #define PME_PRESENT (1ULL << 63) #define PME_SWAP (1ULL << 62) #define PME_FILE (1ULL << 61) +#define PME_GUARD_REGION (1ULL << 58) #define PME_SOFT_DIRTY (1ULL << 55) #define PME_PSHIFT_BITS (6) #define PME_STATUS_BITS (3) diff --git a/criu/mem.c b/criu/mem.c index 9fcf7a44c6..58c4130c67 100644 --- a/criu/mem.c +++ b/criu/mem.c @@ -143,12 +143,18 @@ int should_dump_page(pmc_t *pmc, VmaEntry *vmae, u64 vaddr, struct page_info *pa return 0; } + if (pmc->regs[pmc->regs_idx].categories & PAGE_IS_GUARD) + goto skip_guard_page; + page_info->softdirty = pmc->regs[pmc->regs_idx].categories & PAGE_IS_SOFT_DIRTY; page_info->next = vaddr; return 0; } else { u64 pme = pmc->map[PAGE_PFN(vaddr - pmc->start)]; + if (pme & PME_GUARD_REGION) + goto skip_guard_page; + /* * Optimisation for private mapping pages, that haven't * yet being COW-ed @@ -173,6 +179,10 @@ int should_dump_page(pmc_t *pmc, VmaEntry *vmae, u64 vaddr, struct page_info *pa "%#016" PRIx64 "-%#016" PRIx64 " vaddr=%#016" PRIx64 "\n", vmae->start, vmae->end, vaddr); return -1; + +skip_guard_page: + page_info->next = vaddr + PAGE_SIZE; + return 0; } bool page_is_zero(u64 pme) From 556d1c44060cb17402a835fad69919774fdba4d3 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Mon, 4 Aug 2025 09:42:10 +0200 Subject: [PATCH 158/198] criu/{mem, vdso, cr-restore}: introduce VMA_AREA_GUARD fake VMAs Introduce a new kind of VMA - VMA_AREA_GUARD. In fact, it is not a real VMA as it is not represented as struct vm_area_struct in the kernel. 
We want to reuse the existing VMA infrastructure in CRIU to dump information about MADV_GUARD_INSTALL-covered address space ranges as VMAs. Then, on restore, we need to carefully skip those fake VMAs everywhere we expect normal VMAs to be processed. Only in the restorer do we use these VMAs to get information about where to call MADV_GUARD_INSTALL. Suggested-by: Andrei Vagin Signed-off-by: Alexander Mikhalitsyn --- criu/cr-restore.c | 6 ++++-- criu/include/image.h | 7 +++++++ criu/mem.c | 13 +++++++++++-- criu/vdso.c | 6 ++++++ 4 files changed, 28 insertions(+), 4 deletions(-) diff --git a/criu/cr-restore.c b/criu/cr-restore.c index b376035631..1c3b364518 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -2447,7 +2447,8 @@ static long restorer_get_vma_hint(struct list_head *tgt_vma_list, struct list_he while (1) { if (prev_vma_end + vma_len > s_vma->e->start) { - if (s_vma->list.next == self_vma_list) { + if ((s_vma->list.next == self_vma_list) || + vma_area_is(vma_next(s_vma), VMA_AREA_GUARD)) { s_vma = &end_vma; continue; } @@ -2460,7 +2461,8 @@ static long restorer_get_vma_hint(struct list_head *tgt_vma_list, struct list_he } if (prev_vma_end + vma_len > t_vma->e->start) { - if (t_vma->list.next == tgt_vma_list) { + if ((t_vma->list.next == tgt_vma_list) || + vma_area_is(vma_next(t_vma), VMA_AREA_GUARD)) { t_vma = &end_vma; continue; } diff --git a/criu/include/image.h b/criu/include/image.h index afa7d5e12f..934f7d4e97 100644 --- a/criu/include/image.h +++ b/criu/include/image.h @@ -68,6 +68,12 @@ * processing exiting with error; while the rest of bits * are part of image ABI, this particular one must never * be used in image. + * - guard + * stands for a fake VMA (not represented in the kernel + * by a struct vm_area_struct). Used to keep an information + * about virtual address space ranges covered by + * MADV_GUARD_INSTALL guards. These ones must be always at + * the end of the vma_area_list and properly skipped a.e.
*/ #define VMA_AREA_NONE (0 << 0) #define VMA_AREA_REGULAR (1 << 0) @@ -87,6 +93,7 @@ #define VMA_AREA_AIORING (1 << 13) #define VMA_AREA_MEMFD (1 << 14) #define VMA_AREA_SHSTK (1 << 15) +#define VMA_AREA_GUARD (1 << 16) #define VMA_EXT_PLUGIN (1 << 27) #define VMA_CLOSE (1 << 28) diff --git a/criu/mem.c b/criu/mem.c index 58c4130c67..ee841aca2e 100644 --- a/criu/mem.c +++ b/criu/mem.c @@ -599,6 +599,9 @@ static int __parasite_dump_pages_seized(struct pstree_item *item, struct parasit parent_predump_mode = mdc->parent_ie->pre_dump_mode; list_for_each_entry(vma_area, &vma_area_list->h, list) { + if (vma_area_is(vma_area, VMA_AREA_GUARD)) + continue; + ret = generate_vma_iovs(item, vma_area, pp, &xfer, args, ctl, &pmc, has_parent, mdc->pre_dump, parent_predump_mode); if (ret < 0) @@ -861,14 +864,14 @@ static void prepare_cow_vmas_for(struct vm_area_list *vmas, struct vm_area_list /* <= here to shift from matching VMAs and ... */ while (vma->e->start <= pvma->e->start) { vma = vma_next(vma); - if (&vma->list == &vmas->h) + if ((&vma->list == &vmas->h) || vma_area_is(vma, VMA_AREA_GUARD)) return; } /* ... 
no == here since we must stop on matching pair */ while (pvma->e->start < vma->e->start) { pvma = vma_next(pvma); - if (&pvma->list == &pvmas->h) + if ((&pvma->list == &pvmas->h) || vma_area_is(pvma, VMA_AREA_GUARD)) return; } } @@ -1069,6 +1072,9 @@ static int premap_priv_vmas(struct pstree_item *t, struct vm_area_list *vmas, vo filemap_ctx_init(true); list_for_each_entry(vma, &vmas->h, list) { + if (vma_area_is(vma, VMA_AREA_GUARD)) + continue; + if (task_size_check(vpid(t), vma->e)) { ret = -1; break; @@ -1276,6 +1282,9 @@ static int restore_priv_vma_content(struct pstree_item *t, struct page_read *pr) unsigned long size, i = 0; void *addr = decode_pointer(vma->premmaped_addr); + if (vma_area_is(vma, VMA_AREA_GUARD)) + continue; + if (!vma_inherited(vma)) continue; diff --git a/criu/vdso.c b/criu/vdso.c index d4d3511314..2d9e57c4da 100644 --- a/criu/vdso.c +++ b/criu/vdso.c @@ -145,6 +145,9 @@ static void drop_rt_vdso(struct vm_area_list *vma_area_list, struct vdso_quarter * Also BTW search for rt-vvar to remove it later. 
*/ list_for_each_entry(vma, &vma_area_list->h, list) { + if (vma_area_is(vma, VMA_AREA_GUARD)) + continue; + if (vma->e->start == addr->orig_vdso) { vma->e->status |= VMA_AREA_REGULAR | VMA_AREA_VDSO; pr_debug("vdso: Restore orig vDSO status at %lx\n", (long)vma->e->start); @@ -276,6 +279,9 @@ int parasite_fixup_vdso(struct parasite_ctl *ctl, pid_t pid, struct vm_area_list } list_for_each_entry(vma, &vma_area_list->h, list) { + if (vma_area_is(vma, VMA_AREA_GUARD)) + continue; + /* * Defer handling marked vdso until we walked over * all vmas and restore potentially remapped vDSO From f091422ad6b936f554fc86dadfc04767d436c27a Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Sun, 6 Apr 2025 20:10:10 +0200 Subject: [PATCH 159/198] criu/pie/restorer: add madvise(MADV_GUARD_INSTALL) restore logic Signed-off-by: Alexander Mikhalitsyn --- criu/pie/restorer.c | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c index 9867a3ddd5..394d3dea08 100644 --- a/criu/pie/restorer.c +++ b/criu/pie/restorer.c @@ -28,6 +28,7 @@ #include #include #include +#include "mman.h" #include "signal.h" #include "prctl.h" #include "criu-log.h" @@ -1665,6 +1666,30 @@ static int restore_membarrier_registrations(int mask) return ret; } +static int restore_madv_guard_regions(struct task_restore_args *args) +{ + int i, ret; + + for (i = 0; i < args->vmas_n; i++) { + VmaEntry *vma_entry = args->vmas + i; + size_t len; + + if (!vma_entry_is(vma_entry, VMA_AREA_GUARD)) + continue; + + len = vma_entry->end - vma_entry->start; + ret = sys_madvise(vma_entry->start, len, MADV_GUARD_INSTALL); + if (ret) { + pr_err("madvise(%" PRIx64 ", %zu, MADV_GUARD_INSTALL) " + "failed with %d\n", + vma_entry->start, len, ret); + return -1; + } + } + + return 0; +} + /* * The main routine to restore task via sigreturn. 
* This one is very special, we never return there @@ -1972,6 +1997,13 @@ __visible long __export_restore_task(struct task_restore_args *args) } } + /* + * Restore madvise(MADV_GUARD_INSTALL) + */ + ret = restore_madv_guard_regions(args); + if (ret) + goto core_restore_end; + /* * Tune up the task fields. */ From 8619ea2711ccbce1b4406be4ecfe7f81609e86c8 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Mon, 4 Aug 2025 10:48:47 +0200 Subject: [PATCH 160/198] criu/mem: dump: note MADV_GUARD pages as VMA_AREA_GUARD VMAs Signed-off-by: Alexander Mikhalitsyn --- criu/cr-dump.c | 17 ++++++++++++ criu/include/mem.h | 1 + criu/mem.c | 69 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 87 insertions(+) diff --git a/criu/cr-dump.c b/criu/cr-dump.c index f02db1a57f..10c485cbe9 100644 --- a/criu/cr-dump.c +++ b/criu/cr-dump.c @@ -130,6 +130,23 @@ int collect_mappings(pid_t pid, struct vm_area_list *vma_area_list, dump_filemap if (ret < 0) goto err; + /* + * In addition to real process VMAs we should keep an info about + * madvise(MADV_GUARD_INSTALL) pages. While these are not represented + * as a struct vm_area_struct in the kernel, it is convenient to treat + * them as mappings in CRIU and reuse the same VMA images but with only + * VMA_AREA_GUARD flag set. + * + * Also, we don't need to dump them during pre-dump. 
+ */ + if (dump_file) { + ret = collect_madv_guards(pid, vma_area_list); + if (ret < 0) { + pr_err("Collect MADV_GUARD_INSTALL pages (pid: %d) failed with %d\n", pid, ret); + goto err; + } + } + pr_info("Collected, longest area occupies %lu pages\n", vma_area_list->nr_priv_pages_longest); pr_info_vma_list(&vma_area_list->h); diff --git a/criu/include/mem.h b/criu/include/mem.h index b2cbd4b640..e9ce3518ae 100644 --- a/criu/include/mem.h +++ b/criu/include/mem.h @@ -31,6 +31,7 @@ extern int do_task_reset_dirty_track(int pid); extern unsigned long dump_pages_args_size(struct vm_area_list *vmas); extern int parasite_dump_pages_seized(struct pstree_item *item, struct vm_area_list *vma_area_list, struct mem_dump_ctl *mdc, struct parasite_ctl *ctl); +extern int collect_madv_guards(pid_t pid, struct vm_area_list *vma_area_list); #define PME_PRESENT (1ULL << 63) #define PME_SWAP (1ULL << 62) diff --git a/criu/mem.c b/criu/mem.c index ee841aca2e..0636273cbe 100644 --- a/criu/mem.c +++ b/criu/mem.c @@ -1548,3 +1548,72 @@ int prepare_vmas(struct pstree_item *t, struct task_restore_args *ta) return prepare_vma_ios(t, ta); } + +int collect_madv_guards(pid_t pid, struct vm_area_list *vma_area_list) +{ + int pagemap_fd = -1; + struct page_region *regs = NULL; + long regs_len = 0; + int i, ret = -1; + + struct pm_scan_arg args = { + .size = sizeof(struct pm_scan_arg), + .flags = 0, + .start = 0, + .end = kdat.task_size, + .walk_end = 0, + .vec_len = 1000, /* this should be enough for most cases */ + .max_pages = 0, + .category_mask = PAGE_IS_GUARD, + .return_mask = PAGE_IS_GUARD, + }; + + if (!kdat.has_pagemap_scan_guard_pages) { + ret = 0; + goto out; + } + + pagemap_fd = open_proc(pid, "pagemap"); + if (pagemap_fd < 0) + goto out; + + regs = xmalloc(args.vec_len * sizeof(struct page_region)); + if (!regs) + goto out; + args.vec = (long)regs; + + do { + /* start from where we finished the last time */ + args.start = args.walk_end; + regs_len = ioctl(pagemap_fd, PAGEMAP_SCAN, 
&args); + if (regs_len == -1) { + pr_perror("PAGEMAP_SCAN"); + goto out; + } + + for (i = 0; i < regs_len; i++) { + struct vma_area *vma; + + BUG_ON(!(regs[i].categories & PAGE_IS_GUARD)); + + vma = alloc_vma_area(); + if (!vma) + goto out; + + vma->e->start = regs[i].start; + vma->e->end = regs[i].end; + vma->e->status = VMA_AREA_GUARD; + + list_add_tail(&vma->list, &vma_area_list->h); + vma_area_list->nr++; + } + } while (args.walk_end != kdat.task_size); + + ret = 0; + +out: + xfree(regs); + if (pagemap_fd >= 0) + close(pagemap_fd); + return ret; +} From d0f5da9dd7d6d49af8fab1e3063c82bf74b811a9 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Sun, 20 Apr 2025 20:20:20 +0200 Subject: [PATCH 161/198] test/zdtm/static/maps12: add madv guards test Test for madvise(MADV_GUARD_INSTALL). Signed-off-by: Alexander Mikhalitsyn --- test/zdtm/static/Makefile | 1 + test/zdtm/static/maps12.c | 350 +++++++++++++++++++++++++++++++++++ test/zdtm/static/maps12.desc | 1 + 3 files changed, 352 insertions(+) create mode 100644 test/zdtm/static/maps12.c create mode 100644 test/zdtm/static/maps12.desc diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index ab69f389ed..e73f964be5 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -315,6 +315,7 @@ TST_FILE = \ write_read02 \ write_read10 \ maps00 \ + maps12 \ link10 \ file_attr \ deleted_unix_sock \ diff --git a/test/zdtm/static/maps12.c b/test/zdtm/static/maps12.c new file mode 100644 index 0000000000..b645595bec --- /dev/null +++ b/test/zdtm/static/maps12.c @@ -0,0 +1,350 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "zdtmtst.h" + +const char *test_doc = "Test madvise(MADV_GUARD_INSTALL)"; +const char *test_author = "Alexander Mikhalitsyn "; +/* some parts of code were taken from Linux kernel's kselftest guard-pages.c + written by Lorenzo Stoakes */ + +char *filename; +int fd; +TEST_OPTION(filename, string, "file name", 
1); + +#ifndef MADV_GUARD_INSTALL +#define MADV_GUARD_INSTALL 102 +#endif + +uint8_t *map_base; + +struct { + unsigned int pages_num; + bool filemap; +} vmas[] = { + { 2, false }, + { 2, false }, + { 2, false }, + { 2, true }, + { 2, true }, + { 2, true }, +}; + +struct { + bool guarded; + bool wipeonfork; +} pages[] = { + { false, false }, /* vmas[0] */ + { true, false }, + { true, false }, /* vmas[1] */ + { false, false }, + { false, false }, /* vmas[2] */ + { true, true }, + { true, false }, /* vmas[3] */ + { false, false }, + { true, false }, /* vmas[4] */ + { true, false }, + { false, false }, /* vmas[5] */ + { true, false }, +}; + +static volatile sig_atomic_t signal_jump_set; +static sigjmp_buf signal_jmp_buf; + +static void handle_sigsegv(int signo) +{ + if (!signal_jump_set) + return; + + siglongjmp(signal_jmp_buf, 1); +} + +static bool try_write_to_addr(uint8_t *ptr) +{ + bool failed; + + /* Tell signal handler to jump back here on fatal signal. */ + signal_jump_set = true; + /* If a fatal signal arose, we will jump back here and failed is set. */ + failed = sigsetjmp(signal_jmp_buf, 1) != 0; + + if (!failed) + *ptr = 'x'; + + signal_jump_set = false; + return !failed; +} + +static int setup_sigsegv_handler(void) +{ + uint8_t write_me; + + if (signal(SIGSEGV, handle_sigsegv) == SIG_ERR) { + pr_perror("setting SIGSEGV handler failed"); + return 1; + } + + /* ensure that try_write_to_addr() works properly */ + if (!try_write_to_addr(&write_me)) { + pr_err("Failed to write at valid addr. Buggy try_write_to_addr()?\n"); + return 1; + } + + if (try_write_to_addr(NULL)) { + pr_err("Failed to detect an invalid write. Buggy try_write_to_addr()?\n"); + return 1; + } + + return 0; +} + +static inline void *mmap_pages(void *addr_hint, unsigned int count, bool filemap) +{ + char *map; + + map = mmap(addr_hint, count * PAGE_SIZE, PROT_WRITE | PROT_READ, + MAP_PRIVATE | (filemap ? 0 : MAP_ANONYMOUS) | (addr_hint ? MAP_FIXED : 0), + filemap ? fd : -1, filemap ? 
((off_t)addr_hint - (off_t)map_base) : 0); + if (map == MAP_FAILED || (addr_hint && (map != addr_hint))) + return MAP_FAILED; + + return map; +} + +static int __check_guards(const char *when, bool in_child) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(pages); i++) { + /* + * Skip pages that were never guarded, and also those + * that were, but have MADV_WIPEONFORK which means that + * guards were removed on fork. + */ + if (!pages[i].guarded || (in_child && pages[i].wipeonfork)) + continue; + + if (try_write_to_addr(&map_base[i * PAGE_SIZE])) { + pr_err("successful write to a guarded area %d %s C/R\n", + i, when); + return 1; + } + } + + return 0; +} + +static int check_guards(const char *when) +{ + int status; + pid_t pid; + + /* + * First of all, check that guards are on their places + * in a main test process. + */ + if (__check_guards(when, false)) { + return 1; + } + + /* + * Now, check that guards are on their places + * after fork(). This allows to ensure that + * combo MADV_WIPEONFORK + MADV_GUARD_INSTALL + * is restored properly too. 
+ */ + + pid = test_fork(); + if (pid < 0) { + pr_perror("check_guards: fork failed"); + return 1; + } + + if (pid == 0) { + if (__check_guards(when, true)) { + pr_err("check_guards(\"%s\") failed in child\n", when); + exit(1); + } + + exit(0); + } + + if (waitpid(pid, &status, 0) != pid) { + pr_perror("check_guards: waitpid"); + return 1; + } + + if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) { + pr_err("check_guards: process didn't exit cleanly: status=%d\n", status); + return 1; + } + + return 0; +} + +static void gen_pages_data(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(pages); i++) { + uint32_t crc; + + if (pages[i].guarded) + continue; + + crc = ~0; + datagen(&map_base[i * PAGE_SIZE], PAGE_SIZE, &crc); + } +} + +static int set_pages_madvs(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(pages); i++) { + if (pages[i].guarded) { + if (madvise(&map_base[i * PAGE_SIZE], PAGE_SIZE, + MADV_GUARD_INSTALL)) { + pr_perror("MADV_GUARD_INSTALL failed on page %d", i); + return 1; + } + } + + if (pages[i].wipeonfork) { + if (madvise(&map_base[i * PAGE_SIZE], PAGE_SIZE, + MADV_WIPEONFORK)) { + pr_perror("MADV_WIPEONFORK failed on page %d", i); + return 1; + } + } + } + + return 0; +} + +static int check_pages_data(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(pages); i++) { + uint32_t crc; + + if (pages[i].guarded) + continue; + + crc = ~0; + if (datachk(&map_base[i * PAGE_SIZE], PAGE_SIZE, &crc)) { + pr_err("Page %d is corrupted\n", i); + return 1; + } + } + + return 0; +} + +static int prepare_vmas(void) +{ + char *map; + int i, shift; + + shift = 0; + for (i = 0; i < ARRAY_SIZE(vmas); i++) { + map = mmap_pages(&map_base[shift * PAGE_SIZE], + vmas[i].pages_num, vmas[i].filemap); + if (map == MAP_FAILED) { + pr_err("mmap of [%d,%d] pages failed\n", + shift, shift + vmas[i].pages_num); + return 1; + } + + shift += vmas[i].pages_num; + } + + if (shift != ARRAY_SIZE(pages)) { + pr_err("Different number of pages in vmas and pages arrays.\n"); + return 1; + } + + 
return 0; +} + +int main(int argc, char **argv) +{ + unsigned int pages_num = ARRAY_SIZE(pages); + + test_init(argc, argv); + + fd = open(filename, O_TRUNC | O_CREAT | O_RDWR, 0600); + if (fd < 0) { + pr_perror("Unable to create a test file"); + return -1; + } + + if (ftruncate(fd, pages_num * PAGE_SIZE)) { + pr_perror("Unable to ftruncate a test file"); + return -1; + } + + if (setup_sigsegv_handler()) { + pr_err("setup_sigsegv_handler() failed\n"); + return 1; + } + + /* let's find a large enough area in address space */ + map_base = mmap_pages(NULL, pages_num, false); + if (map_base == MAP_FAILED) { + pr_err("mmap of %d pages failed\n", pages_num); + return 1; + } + + /* + * Now we know that we have a free vm address space area + * [map_base, map_base + pages_num * PAGE_SIZE). + * We can use (map_base) as a hint for our further mmaps. + */ + if (prepare_vmas()) { + pr_err("prepare_vmas() failed\n"); + return 1; + } + + /* fill non-guarded pages with data and preserve checksums */ + gen_pages_data(); + + if (set_pages_madvs()) { + pr_err("set_pages_madvs() failed\n"); + return 1; + } + + /* ensure that madvise(MADV_GUARD_INSTALL) works like expected */ + if (check_guards("before")) { + pr_err("check_guards(\"before\") failed\n"); + return 1; + } + + test_daemon(); + test_waitsig(); + + /* ensure that guards are at their places */ + if (check_guards("after")) { + fail("check_guards(\"after\") failed"); + return 1; + } + + /* check that non-guarded pages still contain original data */ + if (check_pages_data()) { + fail("check_pages_data() failed"); + return 1; + } + + pass(); + munmap(map_base, pages_num * PAGE_SIZE); + close(fd); + return 0; +} diff --git a/test/zdtm/static/maps12.desc b/test/zdtm/static/maps12.desc new file mode 100644 index 0000000000..3f7627ff3c --- /dev/null +++ b/test/zdtm/static/maps12.desc @@ -0,0 +1 @@ +{'flavor': 'h', 'feature': 'pagemap_scan_guard_pages'} From 17a5c6e1446c6fa712e752435b25c50b602077d9 Mon Sep 17 00:00:00 2001 From: 
Alexander Mikhalitsyn Date: Sun, 10 Aug 2025 18:22:23 +0200 Subject: [PATCH 162/198] ci/vagrant: install vanilla kernel for Fedora Rawhide test We need at least 6.16 to test MADV_GUARD_INSTALL support, but our current Fedora Rawhide test uses only Rawhide's user space, while using Fedora 42 kernel. Let's start using a vanilla kernel. Suggested-by: Adrian Reber Signed-off-by: Alexander Mikhalitsyn --- scripts/ci/vagrant.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/scripts/ci/vagrant.sh b/scripts/ci/vagrant.sh index 008a01fb35..98942e7565 100755 --- a/scripts/ci/vagrant.sh +++ b/scripts/ci/vagrant.sh @@ -66,6 +66,10 @@ fedora-no-vdso() { } fedora-rawhide() { + # Upgrade the kernel to the latest vanilla one + ssh default sudo dnf -y copr enable @kernel-vanilla/stable + ssh default sudo dnf upgrade -y + # The 6.2 kernel of Fedora 38 in combination with rawhide userspace breaks # zdtm/static/socket-tcp-nfconntrack. To activate the new kernel previously # installed this reboots the VM. From d8c349270cc786eb7ffeb4a379dc07e7eb6f3586 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Fri, 15 Aug 2025 01:44:01 +0000 Subject: [PATCH 163/198] make: Disable branch-protection for PIE code on ARM64 Branch protection uses PAC. It cryptographically "signs" a function's return address before it is stored on the stack. Upon return, the address is authenticated using a secret key. If the signature is invalid, the program will fault. The PIE code is used for the parasite and the restorer. In both cases, it runs in a foreign process. The case of the restorer is even trickier because it needs to restore the original PAC keys, which invalidates all previously "signed" pointers within the restorer itself. 
Fixes #2709 Signed-off-by: Andrei Vagin --- Makefile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Makefile b/Makefile index 5d8e89ac1b..7272cfce19 100644 --- a/Makefile +++ b/Makefile @@ -64,6 +64,8 @@ endif ifeq ($(ARCH),aarch64) DEFINES := -DCONFIG_AARCH64 + CC_MBRANCH_PROT := $(shell $(CC) -c -x c /dev/null -mbranch-protection=none -o /dev/null >/dev/null 2>&1 && echo "-mbranch-protection=none") + CFLAGS_PIE := $(CC_MBRANCH_PROT) endif ifeq ($(ARCH),ppc64) From 5c2f02b2ea5e131deec105a48a3b6a16d4e46633 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Sat, 16 Aug 2025 15:45:05 +0100 Subject: [PATCH 164/198] test/zdtm/static/maps12: fix pointer-to-int cast The `offset` argument to `mmap()` was computed with a direct cast from pointer to `off_t`: `(off_t)addr_hint - (off_t)map_base` This causes a build failure when compiling since pointers and `off_t` may differ in size on some platforms. maps12.c: In function 'mmap_pages': maps12.c:114:50: error: cast from pointer to integer of different size [-Werror=pointer-to-int-cast] 114 | filemap ? fd : -1, filemap ? ((off_t)addr_hint - (off_t)map_base) : 0); | ^ maps12.c:114:69: error: cast from pointer to integer of different size [-Werror=pointer-to-int-cast] 114 | filemap ? fd : -1, filemap ? ((off_t)addr_hint - (off_t)map_base) : 0); The fix in this patch is to cast both pointers to `intptr_t`, perform the subtraction in that type, and then cast the result back to `off_t`. Signed-off-by: Radostin Stoyanov --- test/zdtm/static/maps12.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/zdtm/static/maps12.c b/test/zdtm/static/maps12.c index b645595bec..f0d6c23819 100644 --- a/test/zdtm/static/maps12.c +++ b/test/zdtm/static/maps12.c @@ -111,7 +111,8 @@ static inline void *mmap_pages(void *addr_hint, unsigned int count, bool filemap map = mmap(addr_hint, count * PAGE_SIZE, PROT_WRITE | PROT_READ, MAP_PRIVATE | (filemap ? 0 : MAP_ANONYMOUS) | (addr_hint ? MAP_FIXED : 0), - filemap ? 
fd : -1, filemap ? ((off_t)addr_hint - (off_t)map_base) : 0); + filemap ? fd : -1, + filemap ? (off_t)((intptr_t)addr_hint - (intptr_t)map_base) : 0); if (map == MAP_FAILED || (addr_hint && (map != addr_hint))) return MAP_FAILED; From 2ea697ba2d7959a92a03cf894b7111485af22c4f Mon Sep 17 00:00:00 2001 From: Dong Sunchao Date: Wed, 20 Aug 2025 12:38:18 +0000 Subject: [PATCH 165/198] zdtm/static/sock_opts00: use unix socket to test SO_PASSCRED and SO_PASSSEC SO_PASSCRED and SO_PASSSEC are only valid for AF_UNIX and AF_NETLINK. This patch updates the test logic to use a unix socket for these options, while preserving the original value consistency check. Fixes: #2705 Signed-off-by: Dong Sunchao --- test/zdtm/static/sock_opts00.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/test/zdtm/static/sock_opts00.c b/test/zdtm/static/sock_opts00.c index fcf00ffed8..854aaa5911 100644 --- a/test/zdtm/static/sock_opts00.c +++ b/test/zdtm/static/sock_opts00.c @@ -31,7 +31,7 @@ int main(int argc, char **argv) static const int NOPTS = sizeof(vname) / sizeof(*vname); #undef OPT - int sock, ret = 0, val[NOPTS], rval, i; + int sock, usock, sk, ret = 0, val[NOPTS], rval, i; socklen_t len = sizeof(int); test_init(argc, argv); @@ -42,8 +42,15 @@ int main(int argc, char **argv) return 1; } + usock = socket(AF_UNIX, SOCK_STREAM, 0); + if (usock < 0) { + pr_perror("can't create unix socket"); + return 1; + } + for (i = 0; i < NOPTS; i++) { - ret = getsockopt(sock, SOL_SOCKET, vname[i].opt, &val[i], &len); + sk = vname[i].opt == SO_PASSCRED || vname[i].opt == SO_PASSSEC ? 
usock : sock; + ret = getsockopt(sk, SOL_SOCKET, vname[i].opt, &val[i], &len); if (ret) { pr_perror("can't get %s", vname[i].name); return 1; @@ -51,13 +58,13 @@ int main(int argc, char **argv) val[i]++; - ret = setsockopt(sock, SOL_SOCKET, vname[i].opt, &val[i], len); + ret = setsockopt(sk, SOL_SOCKET, vname[i].opt, &val[i], len); if (ret) { pr_perror("can't set %s = %d", vname[i].name, val[i]); return 1; } - ret = getsockopt(sock, SOL_SOCKET, vname[i].opt, &rval, &len); + ret = getsockopt(sk, SOL_SOCKET, vname[i].opt, &rval, &len); if (ret) { pr_perror("can't re-get %s", vname[i].name); return 1; @@ -78,7 +85,8 @@ int main(int argc, char **argv) test_waitsig(); for (i = 0; i < NOPTS; i++) { - ret = getsockopt(sock, SOL_SOCKET, vname[i].opt, &rval, &len); + sk = vname[i].opt == SO_PASSCRED || vname[i].opt == SO_PASSSEC ? usock : sock; + ret = getsockopt(sk, SOL_SOCKET, vname[i].opt, &rval, &len); if (ret) { pr_perror("can't verify %s", vname[i].name); return 1; @@ -93,6 +101,7 @@ int main(int argc, char **argv) pass(); close(sock); + close(usock); return 0; } From 0c679d9b2c417c35b20e32ce55d701363d52cd2e Mon Sep 17 00:00:00 2001 From: Dong Sunchao Date: Wed, 20 Aug 2025 12:38:37 +0000 Subject: [PATCH 166/198] criu/sockets: Restrict SO_PASSCRED and SO_PASSSEC to supported families Linux 6.16+ restricts SO_PASSCRED and SO_PASSSEC to AF_UNIX, AF_NETLINK, and AF_BLUETOOTH. This patch updates CRIU to check the socket family before dumping these options. Fixes: #2705 Signed-off-by: Dong Sunchao --- criu/include/sockets.h | 2 +- criu/sk-inet.c | 2 +- criu/sk-netlink.c | 2 +- criu/sk-packet.c | 2 +- criu/sk-unix.c | 2 +- criu/sockets.c | 16 +++++++++------- 6 files changed, 14 insertions(+), 12 deletions(-) diff --git a/criu/include/sockets.h b/criu/include/sockets.h index c3e7c879a7..6c81d3edd7 100644 --- a/criu/include/sockets.h +++ b/criu/include/sockets.h @@ -25,7 +25,7 @@ struct socket_desc { }; extern int dump_socket(struct fd_parms *p, int lfd, FdinfoEntry *); 
-extern int dump_socket_opts(int sk, SkOptsEntry *soe); +extern int dump_socket_opts(int sk, int family, SkOptsEntry *soe); extern int restore_socket_opts(int sk, SkOptsEntry *soe); extern int sk_setbufs(int sk, uint32_t *bufs); extern void release_skopts(SkOptsEntry *); diff --git a/criu/sk-inet.c b/criu/sk-inet.c index 6e0acf2ce3..422edc6567 100644 --- a/criu/sk-inet.c +++ b/criu/sk-inet.c @@ -581,7 +581,7 @@ static int do_dump_one_inet_fd(int lfd, u32 id, const struct fd_parms *p, int fa if (dump_ip_opts(lfd, family, type, proto, &ipopts)) goto err; - if (dump_socket_opts(lfd, &skopts)) + if (dump_socket_opts(lfd, family, &skopts)) goto err; pr_info("Dumping inet socket at %d\n", p->fd); diff --git a/criu/sk-netlink.c b/criu/sk-netlink.c index a219b69be1..dc2baa1b80 100644 --- a/criu/sk-netlink.c +++ b/criu/sk-netlink.c @@ -165,7 +165,7 @@ static int dump_one_netlink_fd(int lfd, u32 id, const struct fd_parms *p) ne.fown = (FownEntry *)&p->fown; ne.opts = &skopts; - if (dump_socket_opts(lfd, &skopts)) + if (dump_socket_opts(lfd, AF_NETLINK, &skopts)) goto err; fe.type = FD_TYPES__NETLINKSK; diff --git a/criu/sk-packet.c b/criu/sk-packet.c index 1d2e23522a..6530bff580 100644 --- a/criu/sk-packet.c +++ b/criu/sk-packet.c @@ -173,7 +173,7 @@ static int dump_one_packet_fd(int lfd, u32 id, const struct fd_parms *p) psk.fown = (FownEntry *)&p->fown; psk.opts = &skopts; - if (dump_socket_opts(lfd, &skopts)) + if (dump_socket_opts(lfd, AF_PACKET, &skopts)) return -1; psk.protocol = sd->proto; diff --git a/criu/sk-unix.c b/criu/sk-unix.c index 70ca16be4a..6145fe7347 100644 --- a/criu/sk-unix.c +++ b/criu/sk-unix.c @@ -527,7 +527,7 @@ static int dump_one_unix_fd(int lfd, uint32_t id, const struct fd_parms *p) } } dump: - if (dump_socket_opts(lfd, skopts)) + if (dump_socket_opts(lfd, AF_UNIX, skopts)) goto err; pr_info("Dumping unix socket at %d\n", p->fd); diff --git a/criu/sockets.c b/criu/sockets.c index 0affccad02..e4adae03cd 100644 --- a/criu/sockets.c +++ 
b/criu/sockets.c @@ -649,7 +649,7 @@ int do_dump_opt(int sk, int level, int name, void *val, int len) return 0; } -int dump_socket_opts(int sk, SkOptsEntry *soe) +int dump_socket_opts(int sk, int family, SkOptsEntry *soe) { int ret = 0, val; struct timeval tv; @@ -688,13 +688,15 @@ int dump_socket_opts(int sk, SkOptsEntry *soe) soe->so_reuseport = val ? true : false; soe->has_so_reuseport = true; - ret |= dump_opt(sk, SOL_SOCKET, SO_PASSCRED, &val); - soe->has_so_passcred = true; - soe->so_passcred = val ? true : false; + if (family == AF_UNIX || family == AF_NETLINK) { + ret |= dump_opt(sk, SOL_SOCKET, SO_PASSCRED, &val); + soe->has_so_passcred = true; + soe->so_passcred = val ? true : false; - ret |= dump_opt(sk, SOL_SOCKET, SO_PASSSEC, &val); - soe->has_so_passsec = true; - soe->so_passsec = val ? true : false; + ret |= dump_opt(sk, SOL_SOCKET, SO_PASSSEC, &val); + soe->has_so_passsec = true; + soe->so_passsec = val ? true : false; + } ret |= dump_opt(sk, SOL_SOCKET, SO_DONTROUTE, &val); soe->has_so_dontroute = true; From 2c0b716a8c1f8e6eccf67554798a3b2b25ff088b Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Mon, 8 Sep 2025 12:48:34 -0700 Subject: [PATCH 167/198] ci: avoid Docker 28 due to regression This change modifies the CI script to avoid Docker version 28, which has a known regression that breaks Checkpoint/Restore (C/R) functionality. The issue is tracked in the moby/moby project as https://github.com/moby/moby/issues/50750. Signed-off-by: Andrei Vagin --- scripts/ci/docker-test.sh | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/scripts/ci/docker-test.sh b/scripts/ci/docker-test.sh index aaf443afdc..ae7f52454d 100755 --- a/scripts/ci/docker-test.sh +++ b/scripts/ci/docker-test.sh @@ -2,6 +2,24 @@ set -x -e -o pipefail +# Workaround: Docker 28.x has a known regression that breaks the checkpoint and +# restore (C/R) feature. Let's install previous, or next major version. 
See +# https://github.com/moby/moby/issues/50750 for details on the bug. +export DEBIAN_FRONTEND=noninteractive +apt remove -y docker-ce docker-ce-cli +./apt-install -y ca-certificates curl +install -m 0755 -d /etc/apt/keyrings +curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc +chmod a+r /etc/apt/keyrings/docker.asc +# shellcheck disable=SC1091 +echo \ + "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \ + $(. /etc/os-release && echo "${UBUNTU_CODENAME:-$VERSION_CODENAME}") stable" > /etc/apt/sources.list.d/docker.list +apt update -y +apt-cache madison docker-ce | awk '{ print $3 }' +verstr="$(apt-cache madison docker-ce | awk '{ print $3 }' | sort | grep -v ':28\.'| tail -n 1)" +./apt-install -y "docker-ce=$verstr" "docker-ce-cli=$verstr" + # docker checkpoint and restore is an experimental feature echo '{ "experimental": true }' > /etc/docker/daemon.json service docker restart From 4cd4a6b1ac15a92cb9324141d4a3cc37ba1818c2 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Sun, 14 Sep 2025 19:29:16 -0700 Subject: [PATCH 168/198] zdtm: stop importing junit_xml We are dropping support for generating JUnit XML reports in zdtm.py as we've migrated testing infrastructure entirely to `GitHub Actions` and other third-party test runners. This package has been removed from some distribution repositories (e.g., Fedora), making it simpler to remove the dependency than to force installation via pip. 
Signed-off-by: Andrei Vagin --- .cirrus.yml | 2 +- scripts/build/Dockerfile.alpine | 2 -- scripts/build/Dockerfile.archlinux | 1 - scripts/build/Dockerfile.centos8 | 2 -- scripts/ci/prepare-for-fedora-rawhide.sh | 1 - scripts/ci/run-ci-tests.sh | 2 +- scripts/ci/vagrant.sh | 2 +- test/jenkins/criu-lazy-migration.pipeline | 1 - test/zdtm.py | 24 +---------------------- 9 files changed, 4 insertions(+), 33 deletions(-) diff --git a/.cirrus.yml b/.cirrus.yml index bddd5a3f1c..848e141329 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -34,7 +34,7 @@ task: setup_script: | dnf config-manager --set-enabled crb # Same as CentOS 8 powertools dnf -y install epel-release epel-next-release - dnf -y install --allowerasing asciidoc gcc git gnutls-devel libaio-devel libasan libcap-devel libnet-devel libnl3-devel libbsd-devel libselinux-devel make protobuf-c-devel protobuf-devel python-devel python-PyYAML python-protobuf python-junit_xml python3-importlib-metadata xmlto libdrm-devel libuuid-devel + dnf -y install --allowerasing asciidoc gcc git gnutls-devel libaio-devel libasan libcap-devel libnet-devel libnl3-devel libbsd-devel libselinux-devel make protobuf-c-devel protobuf-devel python-devel python-PyYAML python-protobuf python3-importlib-metadata xmlto libdrm-devel libuuid-devel # The image has a too old version of nettle which does not work with gnutls. # Just upgrade to the latest to make the error go away. 
dnf -y upgrade nettle nettle-devel diff --git a/scripts/build/Dockerfile.alpine b/scripts/build/Dockerfile.alpine index d843793ea2..819fda0c38 100644 --- a/scripts/build/Dockerfile.alpine +++ b/scripts/build/Dockerfile.alpine @@ -48,6 +48,4 @@ RUN apk add \ # The rpc test cases are running as user #1000, let's add the user RUN adduser -u 1000 -D test -RUN pip3 install junit_xml --break-system-packages - RUN make -C test/zdtm diff --git a/scripts/build/Dockerfile.archlinux b/scripts/build/Dockerfile.archlinux index 9d11194bb0..d4b432f8d6 100644 --- a/scripts/build/Dockerfile.archlinux +++ b/scripts/build/Dockerfile.archlinux @@ -32,7 +32,6 @@ RUN pacman -Syu --noconfirm \ go \ python-yaml \ asciidoctor \ - python-junit-xml \ python-importlib-metadata \ libdrm \ util-linux-libs \ diff --git a/scripts/build/Dockerfile.centos8 b/scripts/build/Dockerfile.centos8 index a672123441..5ab6c9cfa4 100644 --- a/scripts/build/Dockerfile.centos8 +++ b/scripts/build/Dockerfile.centos8 @@ -45,6 +45,4 @@ RUN make mrproper && date && make -j $(nproc) CC="$CC" && date # The rpc test cases are running as user #1000, let's add the user RUN adduser -u 1000 test -RUN pip3 install junit_xml - RUN make -C test/zdtm -j $(nproc) diff --git a/scripts/ci/prepare-for-fedora-rawhide.sh b/scripts/ci/prepare-for-fedora-rawhide.sh index f8ad9cf978..f8f797c1e5 100755 --- a/scripts/ci/prepare-for-fedora-rawhide.sh +++ b/scripts/ci/prepare-for-fedora-rawhide.sh @@ -26,7 +26,6 @@ dnf install -y \ protobuf-devel \ python3-PyYAML \ python3-protobuf \ - python3-junit_xml \ python3-pip \ python3-importlib-metadata \ python-unversioned-command \ diff --git a/scripts/ci/run-ci-tests.sh b/scripts/ci/run-ci-tests.sh index 0c4a089757..617f54fc6e 100755 --- a/scripts/ci/run-ci-tests.sh +++ b/scripts/ci/run-ci-tests.sh @@ -6,7 +6,7 @@ CI_PKGS=(protobuf-c-compiler libprotobuf-c-dev libaio-dev libgnutls28-dev libnl-3-dev gdb bash libnet-dev util-linux asciidoctor libnl-route-3-dev time libbsd-dev python3-yaml 
uuid-dev libperl-dev pkg-config python3-protobuf python3-pip - python3-importlib-metadata python3-junit.xml libdrm-dev) + python3-importlib-metadata libdrm-dev) X86_64_PKGS=(gcc-multilib) diff --git a/scripts/ci/vagrant.sh b/scripts/ci/vagrant.sh index 98942e7565..c222e30e05 100755 --- a/scripts/ci/vagrant.sh +++ b/scripts/ci/vagrant.sh @@ -44,7 +44,7 @@ setup() { ssh default sudo dnf upgrade -y ssh default sudo dnf install -y gcc git gnutls-devel nftables-devel libaio-devel \ libasan libcap-devel libnet-devel libnl3-devel libbsd-devel make protobuf-c-devel \ - protobuf-devel python3-protobuf python3-importlib-metadata python3-junit_xml \ + protobuf-devel python3-protobuf python3-importlib-metadata \ rubygem-asciidoctor iptables libselinux-devel libbpf-devel python3-yaml libuuid-devel # Disable sssd to avoid zdtm test failures in pty04 due to sssd socket diff --git a/test/jenkins/criu-lazy-migration.pipeline b/test/jenkins/criu-lazy-migration.pipeline index 2c863f170d..45dc2c7766 100644 --- a/test/jenkins/criu-lazy-migration.pipeline +++ b/test/jenkins/criu-lazy-migration.pipeline @@ -21,7 +21,6 @@ pipeline { stage('Test'){ steps { sh './test/jenkins/run_ct sh -c "mount --make-rprivate / && mount --rbind . 
/mnt && cd /mnt && ./test/jenkins/criu-lazy-migration.sh"' - junit 'test/report/criu-testreport*.xml' } } } diff --git a/test/zdtm.py b/test/zdtm.py index 3339dd8167..7e83aa4df9 100755 --- a/test/zdtm.py +++ b/test/zdtm.py @@ -2078,8 +2078,6 @@ def __init__(self, opts, nr_tests): self.__subs = {} self.__fail = False self.__file_report = None - self.__junit_file = None - self.__junit_test_cases = None self.__failed = [] self.__nr_skip = 0 if self.__max > 1 and self.__total > 1: @@ -2091,22 +2089,14 @@ def __init__(self, opts, nr_tests): if opts['report'] and (opts['keep_going'] or self.__total == 1): global TestSuite, TestCase - from junit_xml import TestCase, TestSuite now = datetime.datetime.now() att = 0 reportname = os.path.join(report_dir, "criu-testreport.tap") - junitreport = os.path.join(report_dir, "criu-testreport.xml") - while os.access(reportname, os.F_OK) or os.access( - junitreport, os.F_OK): + while os.access(reportname, os.F_OK): reportname = os.path.join(report_dir, "criu-testreport" + ".%d.tap" % att) - junitreport = os.path.join(report_dir, - "criu-testreport" + ".%d.xml" % att) att += 1 - self.__junit_file = open(junitreport, 'a') - self.__junit_test_cases = [] - self.__file_report = open(reportname, 'a') print(u"TAP version 13", file=self.__file_report) print(u"# Hardware architecture: " + arch, file=self.__file_report) @@ -2141,10 +2131,6 @@ def skip(self, name, reason): self.__runtest += 1 self.__nr_skip += 1 - if self.__junit_test_cases is not None: - tc = TestCase(name) - tc.add_skipped_info(reason) - self.__junit_test_cases.append(tc) if self.__file_report: testline = u"ok %d - %s # SKIP %s" % (self.__runtest, name, reason) print(testline, file=self.__file_report) @@ -2247,10 +2233,6 @@ def __wait_one(self, flags): # It's useful for taming warnings in subprocess.Popen.__del__() sub['sub'].wait() tc = None - if self.__junit_test_cases is not None: - tc = TestCase(sub['name'], - elapsed_sec=time.time() - sub['start']) - 
self.__junit_test_cases.append(tc) if status != 0: self.__fail = True failed_flavor = decode_flav(os.WEXITSTATUS(status)) @@ -2307,10 +2289,6 @@ def finish(self): if not opts['fault'] and check_core_files(): self.__fail = True if self.__file_report: - ts = TestSuite(opts['title'], self.__junit_test_cases, - os.getenv("NODE_NAME")) - self.__junit_file.write(TestSuite.to_xml_string([ts])) - self.__junit_file.close() self.__file_report.close() if opts['keep_going']: From 116991736c7641258a5b7f53f5079b90fc80b99e Mon Sep 17 00:00:00 2001 From: Lorenzo Fontana Date: Thu, 18 Sep 2025 10:01:48 +0200 Subject: [PATCH 169/198] pagemap: prevent integer overflow in pagemap_len Fixes #2738 Original-patch-by: Andrey Vagin Signed-off-by: Lorenzo Fontana --- criu/include/pagemap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/criu/include/pagemap.h b/criu/include/pagemap.h index 3ae15deb9c..fae110108c 100644 --- a/criu/include/pagemap.h +++ b/criu/include/pagemap.h @@ -121,7 +121,7 @@ extern int dedup_one_iovec(struct page_read *pr, unsigned long base, unsigned lo static inline unsigned long pagemap_len(PagemapEntry *pe) { - return pe->nr_pages * PAGE_SIZE; + return (unsigned long)pe->nr_pages * PAGE_SIZE; } static inline bool page_read_has_parent(struct page_read *pr) From dcd9df7389f7cebb2e50f70b87d478a17b889589 Mon Sep 17 00:00:00 2001 From: dong sunchao Date: Thu, 18 Sep 2025 03:09:30 +1000 Subject: [PATCH 170/198] compel/mips: Relax ELF magic check to support MIPS libraries On MIPS platforms, shared libraries may use EI_ABIVERSION = 5 to indicate support for .MIPS.xhash sections. The previous ELF header check in handle_binary() strictly compared e_ident against a hardcoded value, causing legitimate shared objects to be rejected. This patch replaces the memcmp-based check with a structured validation of ELF magic and class, and allows EI_ABIVERSION values beside 0. 
fixes: #2745 Signed-off-by: dong sunchao --- compel/arch/mips/src/lib/handle-elf.c | 31 +++++++++++++++++++-------- 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/compel/arch/mips/src/lib/handle-elf.c b/compel/arch/mips/src/lib/handle-elf.c index a605a5a452..e086761c21 100644 --- a/compel/arch/mips/src/lib/handle-elf.c +++ b/compel/arch/mips/src/lib/handle-elf.c @@ -5,18 +5,31 @@ #include "piegen.h" #include "log.h" -static const unsigned char __maybe_unused elf_ident_64_le[EI_NIDENT] = { - 0x7f, 0x45, 0x4c, 0x46, 0x02, 0x01, 0x01, 0x00, /* clang-format */ - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, -}; - extern int __handle_elf(void *mem, size_t size); int handle_binary(void *mem, size_t size) { - if (memcmp(mem, elf_ident_64_le, sizeof(elf_ident_64_le)) == 0) - return __handle_elf(mem, size); + Elf64_Ehdr *ehdr = (Elf64_Ehdr *)mem; + + /* check ELF magic */ + if (ehdr->e_ident[EI_MAG0] != ELFMAG0 || + ehdr->e_ident[EI_MAG1] != ELFMAG1 || + ehdr->e_ident[EI_MAG2] != ELFMAG2 || + ehdr->e_ident[EI_MAG3] != ELFMAG3) { + pr_err("Invalid ELF magic\n"); + return -EINVAL; + } + + /* check ELF class and data encoding */ + if (ehdr->e_ident[EI_CLASS] != ELFCLASS64 || + ehdr->e_ident[EI_DATA] != ELFDATA2LSB) { + pr_err("Unsupported ELF class or data encoding\n"); + return -EINVAL; + } + + if (ehdr->e_ident[EI_ABIVERSION] != 0) { + pr_warn("Unusual ABI version: %d\n", ehdr->e_ident[EI_ABIVERSION]); + } - pr_err("Unsupported Elf format detected\n"); - return -EINVAL; + return __handle_elf(mem, size); } From cbf9063c4235c4f60191b306d5d53f35f7ba1bdc Mon Sep 17 00:00:00 2001 From: Filip Hejsek Date: Sat, 13 Sep 2025 19:49:24 +0200 Subject: [PATCH 171/198] lsm: use attr/apparmor/current to get apparmor label On some kernels, attr/current can be intercepted by BPF LSM, causing errors (#2033). Using attr/apparmor/current is preferable, because it is guaranteed to return the apparmor label. attr/current will still be used as a fallback for older kernels. 
Fixes: #2033 Signed-off-by: Filip Hejsek --- criu/lsm.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/criu/lsm.c b/criu/lsm.c index 70b66d42ee..5faf3e5b2f 100644 --- a/criu/lsm.c +++ b/criu/lsm.c @@ -29,7 +29,9 @@ static int apparmor_get_label(pid_t pid, char **profile_name) FILE *f; char *space; - f = fopen_proc(pid, "attr/current"); + f = fopen_proc(pid, "attr/apparmor/current"); + if (!f) + f = fopen_proc(pid, "attr/current"); if (!f) return -1; From 0ac0a70c0c192d28b5cf63c5a0ea198b4f95e7a9 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Sun, 14 Sep 2025 18:44:51 -0700 Subject: [PATCH 172/198] files: fork helpers without CLONE_FILES | CLONE_FS On restore, CRIU needs to change mount namespaces to properly restore files and unix sockets. However, the kernel prevents this if a process is sharing its file system information (fs) with other processes. Fixes #2687 Signed-off-by: Andrei Vagin --- criu/files.c | 1 - criu/pstree.c | 5 ++--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/criu/files.c b/criu/files.c index f16ec32a23..af4b8aeac8 100644 --- a/criu/files.c +++ b/criu/files.c @@ -1329,7 +1329,6 @@ int prepare_fds(struct pstree_item *me) } } - BUG_ON(current->pid->state == TASK_HELPER); ret = open_fdinfos(me); if (rsti(me)->fdt) diff --git a/criu/pstree.c b/criu/pstree.c index 75c2fc8d0a..cee8b5741a 100644 --- a/criu/pstree.c +++ b/criu/pstree.c @@ -237,9 +237,8 @@ int init_pstree_helper(struct pstree_item *ret) { BUG_ON(!ret->parent); ret->pid->state = TASK_HELPER; - rsti(ret)->clone_flags = CLONE_FILES | CLONE_FS; - if (shared_fdt_prepare(ret) < 0) - return -1; + rsti(ret)->clone_flags = 0; + INIT_LIST_HEAD(&rsti(ret)->fds); task_entries->nr_helpers++; return 0; } From f6552d596e3d36e016416d2526a10ad57cb938da Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Thu, 18 Sep 2025 14:48:42 +0000 Subject: [PATCH 173/198] pagemap: change PagemapEntry.nr_pages to uint64 to support huge mappings Update the nr_pages field in 
PagemapEntry to uint64 to prepare for checkpointing and restoring huge memory mappings. Backward compatibility with older pagemap images is preserved. Signed-off-by: Andrei Vagin --- criu/include/pagemap.h | 2 +- criu/page-xfer.c | 1 + criu/pagemap.c | 5 ++++- images/pagemap.proto | 3 ++- 4 files changed, 8 insertions(+), 3 deletions(-) diff --git a/criu/include/pagemap.h b/criu/include/pagemap.h index fae110108c..3ae15deb9c 100644 --- a/criu/include/pagemap.h +++ b/criu/include/pagemap.h @@ -121,7 +121,7 @@ extern int dedup_one_iovec(struct page_read *pr, unsigned long base, unsigned lo static inline unsigned long pagemap_len(PagemapEntry *pe) { - return (unsigned long)pe->nr_pages * PAGE_SIZE; + return pe->nr_pages * PAGE_SIZE; } static inline bool page_read_has_parent(struct page_read *pr) diff --git a/criu/page-xfer.c b/criu/page-xfer.c index 0314963e6d..b0e04d82c5 100644 --- a/criu/page-xfer.c +++ b/criu/page-xfer.c @@ -326,6 +326,7 @@ static int write_pagemap_loc(struct page_xfer *xfer, struct iovec *iov, u32 flag pe.nr_pages = iov->iov_len / PAGE_SIZE; pe.has_flags = true; pe.flags = flags; + pe.has_nr_pages = true; if (flags & PE_PRESENT) { if (opts.auto_dedup && xfer->parent != NULL) { diff --git a/criu/pagemap.c b/criu/pagemap.c index 85bb922596..d9ccc03eb6 100644 --- a/criu/pagemap.c +++ b/criu/pagemap.c @@ -171,7 +171,7 @@ static int seek_pagemap(struct page_read *pr, unsigned long vaddr) static inline void pagemap_bound_check(PagemapEntry *pe, unsigned long vaddr, int nr) { if (vaddr < pe->vaddr || (vaddr - pe->vaddr) / PAGE_SIZE + nr > pe->nr_pages) { - pr_err("Page read err %" PRIx64 ":%u vs %lx:%u\n", pe->vaddr, pe->nr_pages, vaddr, nr); + pr_err("Page read err %" PRIx64 ":%lu vs %lx:%u\n", pe->vaddr, pe->nr_pages, vaddr, nr); BUG(); } } @@ -682,6 +682,9 @@ static void init_compat_pagemap_entry(PagemapEntry *pe) pe->flags |= PE_PARENT; else if (!pe->has_flags) pe->flags = PE_PRESENT; + + if (!pe->has_nr_pages) + pe->nr_pages = pe->compat_nr_pages; } 
/* diff --git a/images/pagemap.proto b/images/pagemap.proto index e6d341b0f6..f2436a51ac 100644 --- a/images/pagemap.proto +++ b/images/pagemap.proto @@ -10,7 +10,8 @@ message pagemap_head { message pagemap_entry { required uint64 vaddr = 1 [(criu).hex = true]; - required uint32 nr_pages = 2; + required uint32 compat_nr_pages = 2; optional bool in_parent = 3; optional uint32 flags = 4 [(criu).flags = "pmap.flags" ]; + optional uint64 nr_pages = 5; } From 8b804bca8ab4600fcfe136e0aaa2f4db3987408e Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Thu, 18 Sep 2025 15:20:32 +0000 Subject: [PATCH 174/198] pagemap: use unsigned long for page counts Variables storing page counts were previously `unsigned int`, limiting them to a maximum of 2^32 pages. With a 4k page size, this corresponds to a 16TB memory mapping, which is insufficient for larger mappings. This commit changes the type for these variables to `unsigned long` to support larger memory mappings. Signed-off-by: Andrei Vagin --- criu/include/page-pipe.h | 6 +++--- criu/include/page-xfer.h | 6 +++--- criu/include/pagemap.h | 6 +++--- criu/include/parasite.h | 2 +- criu/mem.c | 2 +- criu/page-pipe.c | 6 +++--- criu/page-xfer.c | 20 ++++++++++---------- criu/pagemap.c | 22 +++++++++++----------- criu/pie/parasite.c | 2 +- criu/uffd.c | 25 ++++++++++++------------- 10 files changed, 48 insertions(+), 49 deletions(-) diff --git a/criu/include/page-pipe.h b/criu/include/page-pipe.h index 15178c0150..65292b7ab1 100644 --- a/criu/include/page-pipe.h +++ b/criu/include/page-pipe.h @@ -92,9 +92,9 @@ struct kernel_pipe_buffer { struct page_pipe_buf { int p[2]; /* pipe with pages */ unsigned int pipe_size; /* how many pages can be fit into pipe */ - unsigned int pipe_off; /* where this buf is started in a pipe */ - unsigned int pages_in; /* how many pages are there */ unsigned int nr_segs; /* how many iov-s are busy */ + unsigned long pipe_off; /* where this buf is started in a pipe */ + unsigned long pages_in; /* how many 
pages are there */ #define PPB_LAZY (1 << 0) unsigned int flags; struct iovec *iov; /* vaddr:len map */ @@ -149,7 +149,7 @@ struct pipe_read_dest { }; extern int pipe_read_dest_init(struct pipe_read_dest *prd); -extern int page_pipe_read(struct page_pipe *pp, struct pipe_read_dest *prd, unsigned long addr, unsigned int *nr_pages, +extern int page_pipe_read(struct page_pipe *pp, struct pipe_read_dest *prd, unsigned long addr, unsigned long *nr_pages, unsigned int ppb_flags); #endif /* __CR_PAGE_PIPE_H__ */ diff --git a/criu/include/page-xfer.h b/criu/include/page-xfer.h index 36fe670928..0d9b350194 100644 --- a/criu/include/page-xfer.h +++ b/criu/include/page-xfer.h @@ -69,9 +69,9 @@ extern int check_parent_page_xfer(int fd_type, unsigned long id); */ /* async request/receive of remote pages */ -extern int request_remote_pages(unsigned long img_id, unsigned long addr, int nr_pages); +extern int request_remote_pages(unsigned long img_id, unsigned long addr, unsigned long nr_pages); -typedef int (*ps_async_read_complete)(unsigned long img_id, unsigned long vaddr, int nr_pages, void *); -extern int page_server_start_read(void *buf, int nr_pages, ps_async_read_complete complete, void *priv, unsigned flags); +typedef int (*ps_async_read_complete)(unsigned long img_id, unsigned long vaddr, unsigned long nr_pages, void *); +extern int page_server_start_read(void *buf, unsigned long nr_pages, ps_async_read_complete complete, void *priv, unsigned flags); #endif /* __CR_PAGE_XFER__H__ */ diff --git a/criu/include/pagemap.h b/criu/include/pagemap.h index 3ae15deb9c..4cbc87cc6d 100644 --- a/criu/include/pagemap.h +++ b/criu/include/pagemap.h @@ -44,7 +44,7 @@ struct page_read { /* reads page from current pagemap */ - int (*read_pages)(struct page_read *, unsigned long vaddr, int nr, void *, unsigned flags); + int (*read_pages)(struct page_read *, unsigned long vaddr, unsigned long nr, void *, unsigned flags); /* Advance page_read to the next entry */ int (*advance)(struct 
page_read *pr); void (*close)(struct page_read *); @@ -52,8 +52,8 @@ struct page_read { int (*sync)(struct page_read *pr); int (*seek_pagemap)(struct page_read *pr, unsigned long vaddr); void (*reset)(struct page_read *pr); - int (*io_complete)(struct page_read *, unsigned long vaddr, int nr); - int (*maybe_read_page)(struct page_read *pr, unsigned long vaddr, int nr, void *buf, unsigned flags); + int (*io_complete)(struct page_read *, unsigned long vaddr, unsigned long nr); + int (*maybe_read_page)(struct page_read *pr, unsigned long vaddr, unsigned long nr, void *buf, unsigned flags); /* Whether or not pages can be read in PIE code */ bool pieok; diff --git a/criu/include/parasite.h b/criu/include/parasite.h index b33d6710f8..1763577111 100644 --- a/criu/include/parasite.h +++ b/criu/include/parasite.h @@ -63,7 +63,7 @@ struct parasite_dump_pages_args { unsigned int add_prot; unsigned int off; unsigned int nr_segs; - unsigned int nr_pages; + unsigned long nr_pages; }; static inline struct parasite_vma_entry *pargs_vmas(struct parasite_dump_pages_args *a) diff --git a/criu/mem.c b/criu/mem.c index 0636273cbe..f8c5508428 100644 --- a/criu/mem.c +++ b/criu/mem.c @@ -336,7 +336,7 @@ static int drain_pages(struct page_pipe *pp, struct parasite_ctl *ctl, struct pa list_for_each_entry(ppb, &pp->bufs, l) { args->nr_segs = ppb->nr_segs; args->nr_pages = ppb->pages_in; - pr_debug("PPB: %d pages %d segs %u pipe %d off\n", args->nr_pages, args->nr_segs, ppb->pipe_size, + pr_debug("PPB: %ld pages %d segs %u pipe %d off\n", args->nr_pages, args->nr_segs, ppb->pipe_size, args->off); ret = compel_rpc_call(PARASITE_CMD_DUMPPAGES, ctl); diff --git a/criu/page-pipe.c b/criu/page-pipe.c index aab6742be7..f8e3520f79 100644 --- a/criu/page-pipe.c +++ b/criu/page-pipe.c @@ -381,7 +381,7 @@ int pipe_read_dest_init(struct pipe_read_dest *prd) return 0; } -int page_pipe_read(struct page_pipe *pp, struct pipe_read_dest *prd, unsigned long addr, unsigned int *nr_pages, +int 
page_pipe_read(struct page_pipe *pp, struct pipe_read_dest *prd, unsigned long addr, unsigned long int *nr_pages, unsigned int ppb_flags) { struct page_pipe_buf *ppb; @@ -406,7 +406,7 @@ int page_pipe_read(struct page_pipe *pp, struct pipe_read_dest *prd, unsigned lo } /* clamp the request if it passes the end of iovec */ - len = min((unsigned long)iov->iov_base + iov->iov_len - addr, (unsigned long)(*nr_pages) * PAGE_SIZE); + len = min((unsigned long)iov->iov_base + iov->iov_len - addr, *nr_pages * PAGE_SIZE); *nr_pages = len / PAGE_SIZE; skip += ppb->pipe_off * PAGE_SIZE; @@ -446,7 +446,7 @@ void debug_show_page_pipe(struct page_pipe *pp) pr_debug("Page pipe:\n"); pr_debug("* %u pipes %u/%u iovs:\n", pp->nr_pipes, pp->free_iov, pp->nr_iovs); list_for_each_entry(ppb, &pp->bufs, l) { - pr_debug("\tbuf %u pages, %u iovs, flags: %x pipe_off: %x :\n", ppb->pages_in, ppb->nr_segs, ppb->flags, + pr_debug("\tbuf %lu pages, %u iovs, flags: %x pipe_off: %lx :\n", ppb->pages_in, ppb->nr_segs, ppb->flags, ppb->pipe_off); for (i = 0; i < ppb->nr_segs; i++) { iov = &ppb->iov[i]; diff --git a/criu/page-xfer.c b/criu/page-xfer.c index b0e04d82c5..4d057163d9 100644 --- a/criu/page-xfer.c +++ b/criu/page-xfer.c @@ -32,7 +32,7 @@ static int page_server_sk = -1; struct page_server_iov { u32 cmd; - u32 nr_pages; + u64 nr_pages; u64 vaddr; u64 dst_id; }; @@ -886,7 +886,7 @@ int page_xfer_dump_pages(struct page_xfer *xfer, struct page_pipe *pp) list_for_each_entry(ppb, &pp->bufs, l) { unsigned int i; - pr_debug("\tbuf %d/%d\n", ppb->pages_in, ppb->nr_segs); + pr_debug("\tbuf %ld/%d\n", ppb->pages_in, ppb->nr_segs); for (i = 0; i < ppb->nr_segs; i++) { struct iovec iov = ppb->iov[i]; @@ -1071,7 +1071,7 @@ static int page_server_add(int sk, struct page_server_iov *pi, u32 flags) struct page_xfer *lxfer = &cxfer.loc_xfer; struct iovec iov; - pr_debug("Adding %" PRIx64 "/%u\n", pi->vaddr, pi->nr_pages); + pr_debug("Adding %" PRIx64 "/%lu\n", pi->vaddr, pi->nr_pages); if (prep_loc_xfer(pi)) 
return -1; @@ -1348,7 +1348,7 @@ static int fill_page_pipe(struct page_read *pr, struct page_pipe *pp) static int page_pipe_from_pagemap(struct page_pipe **pp, int pid) { struct page_read pr; - int nr_pages = 0; + unsigned long nr_pages = 0; if (open_page_read(pid, &pr, PR_TASK) <= 0) { pr_err("Failed to open page read for %d\n", pid); @@ -1551,13 +1551,13 @@ struct ps_async_read { static LIST_HEAD(async_reads); -static inline void async_read_set_goal(struct ps_async_read *ar, int nr_pages) +static inline void async_read_set_goal(struct ps_async_read *ar, unsigned long nr_pages) { ar->goal = sizeof(ar->pi) + nr_pages * PAGE_SIZE; ar->nr_pages = nr_pages; } -static void init_ps_async_read(struct ps_async_read *ar, void *buf, int nr_pages, ps_async_read_complete complete, +static void init_ps_async_read(struct ps_async_read *ar, void *buf, unsigned long nr_pages, ps_async_read_complete complete, void *priv) { ar->pages = buf; @@ -1567,7 +1567,7 @@ static void init_ps_async_read(struct ps_async_read *ar, void *buf, int nr_pages async_read_set_goal(ar, nr_pages); } -static int page_server_start_async_read(void *buf, int nr_pages, ps_async_read_complete complete, void *priv) +static int page_server_start_async_read(void *buf, unsigned long nr_pages, ps_async_read_complete complete, void *priv) { struct ps_async_read *ar; @@ -1667,7 +1667,7 @@ int connect_to_page_server_to_recv(int epfd) return epoll_add_rfd(epfd, &ps_rfd); } -int request_remote_pages(unsigned long img_id, unsigned long addr, int nr_pages) +int request_remote_pages(unsigned long img_id, unsigned long addr, unsigned long nr_pages) { struct page_server_iov pi = { .cmd = PS_IOV_GET, @@ -1684,7 +1684,7 @@ int request_remote_pages(unsigned long img_id, unsigned long addr, int nr_pages) return 0; } -static int page_server_start_sync_read(void *buf, int nr, ps_async_read_complete complete, void *priv) +static int page_server_start_sync_read(void *buf, unsigned long nr, ps_async_read_complete complete, void 
*priv) { struct ps_async_read ar; int ret = 1; @@ -1695,7 +1695,7 @@ static int page_server_start_sync_read(void *buf, int nr, ps_async_read_complete return ret; } -int page_server_start_read(void *buf, int nr, ps_async_read_complete complete, void *priv, unsigned flags) +int page_server_start_read(void *buf, unsigned long nr, ps_async_read_complete complete, void *priv, unsigned flags) { if (flags & PR_ASYNC) return page_server_start_async_read(buf, nr, complete, priv); diff --git a/criu/pagemap.c b/criu/pagemap.c index d9ccc03eb6..16d680fdbb 100644 --- a/criu/pagemap.c +++ b/criu/pagemap.c @@ -168,15 +168,15 @@ static int seek_pagemap(struct page_read *pr, unsigned long vaddr) return 0; } -static inline void pagemap_bound_check(PagemapEntry *pe, unsigned long vaddr, int nr) +static inline void pagemap_bound_check(PagemapEntry *pe, unsigned long vaddr, unsigned long int nr) { if (vaddr < pe->vaddr || (vaddr - pe->vaddr) / PAGE_SIZE + nr > pe->nr_pages) { - pr_err("Page read err %" PRIx64 ":%lu vs %lx:%u\n", pe->vaddr, pe->nr_pages, vaddr, nr); + pr_err("Page read err %" PRIx64 ":%lu vs %lx:%lu\n", pe->vaddr, pe->nr_pages, vaddr, nr); BUG(); } } -static int read_parent_page(struct page_read *pr, unsigned long vaddr, int nr, void *buf, unsigned flags) +static int read_parent_page(struct page_read *pr, unsigned long vaddr, unsigned long int nr, void *buf, unsigned flags) { struct page_read *ppr = pr->parent; int ret; @@ -195,7 +195,7 @@ static int read_parent_page(struct page_read *pr, unsigned long vaddr, int nr, v */ do { - int p_nr; + unsigned long int p_nr; pr_debug("\tpr%lu-%u Read from parent\n", pr->img_id, pr->id); ret = ppr->seek_pagemap(ppr, vaddr); @@ -210,7 +210,7 @@ static int read_parent_page(struct page_read *pr, unsigned long vaddr, int nr, v * read as much as we can. 
*/ p_nr = ppr->pe->nr_pages - (vaddr - ppr->pe->vaddr) / PAGE_SIZE; - pr_info("\tparent has %u pages in\n", p_nr); + pr_info("\tparent has %lu pages in\n", p_nr); if (p_nr > nr) p_nr = nr; @@ -374,7 +374,7 @@ int pagemap_enqueue_iovec(struct page_read *pr, void *buf, unsigned long len, st return 0; } -static int maybe_read_page_local(struct page_read *pr, unsigned long vaddr, int nr, void *buf, unsigned flags) +static int maybe_read_page_local(struct page_read *pr, unsigned long vaddr, unsigned long nr, void *buf, unsigned flags) { int ret; unsigned long len = nr * PAGE_SIZE; @@ -402,7 +402,7 @@ static int maybe_read_page_local(struct page_read *pr, unsigned long vaddr, int * We cannot use maybe_read_page_local() for streaming images as it uses * pread(), seeking in the file. Instead, we use this custom page reader. */ -static int maybe_read_page_img_streamer(struct page_read *pr, unsigned long vaddr, int nr, void *buf, unsigned flags) +static int maybe_read_page_img_streamer(struct page_read *pr, unsigned long vaddr, unsigned long nr, void *buf, unsigned flags) { unsigned long len = nr * PAGE_SIZE; int fd; @@ -445,7 +445,7 @@ static int maybe_read_page_img_streamer(struct page_read *pr, unsigned long vadd return ret; } -static int read_page_complete(unsigned long img_id, unsigned long vaddr, int nr_pages, void *priv) +static int read_page_complete(unsigned long img_id, unsigned long vaddr, unsigned long int nr_pages, void *priv) { int ret = 0; struct page_read *pr = priv; @@ -463,7 +463,7 @@ static int read_page_complete(unsigned long img_id, unsigned long vaddr, int nr_ return ret; } -static int maybe_read_page_remote(struct page_read *pr, unsigned long vaddr, int nr, void *buf, unsigned flags) +static int maybe_read_page_remote(struct page_read *pr, unsigned long vaddr, unsigned long nr, void *buf, unsigned flags) { int ret; @@ -474,9 +474,9 @@ static int maybe_read_page_remote(struct page_read *pr, unsigned long vaddr, int return ret; } -static int 
read_pagemap_page(struct page_read *pr, unsigned long vaddr, int nr, void *buf, unsigned flags) +static int read_pagemap_page(struct page_read *pr, unsigned long vaddr, unsigned long nr, void *buf, unsigned flags) { - pr_info("pr%lu-%u Read %lx %u pages\n", pr->img_id, pr->id, vaddr, nr); + pr_info("pr%lu-%u Read %lx %lu pages\n", pr->img_id, pr->id, vaddr, nr); pagemap_bound_check(pr->pe, vaddr, nr); if (pagemap_in_parent(pr->pe)) { diff --git a/criu/pie/parasite.c b/criu/pie/parasite.c index 1bc03dc2a0..c966e9e62c 100644 --- a/criu/pie/parasite.c +++ b/criu/pie/parasite.c @@ -101,7 +101,7 @@ static int dump_pages(struct parasite_dump_pages_args *args) } if (spliced_bytes != args->nr_pages * PAGE_SIZE) { sys_close(p); - pr_err("Can't splice all pages to pipe (%ld/%d)\n", spliced_bytes, args->nr_pages); + pr_err("Can't splice all pages to pipe (%ld/%ld)\n", spliced_bytes, args->nr_pages); return -1; } diff --git a/criu/uffd.c b/criu/uffd.c index 98c2b7e075..8e12dcd636 100644 --- a/criu/uffd.c +++ b/criu/uffd.c @@ -668,12 +668,11 @@ static int remap_iovs(struct lazy_pages_info *lpi, unsigned long from, unsigned */ static int collect_iovs(struct lazy_pages_info *lpi) { + unsigned long start, end, len, nr_pages = 0; + int n_vma = 0, max_iov_len = 0, ret = -1; struct page_read *pr = &lpi->pr; struct lazy_iov *iov; MmEntry *mm; - int nr_pages = 0, n_vma = 0, max_iov_len = 0; - int ret = -1; - unsigned long start, end, len; mm = init_mm_entry(lpi); if (!mm) @@ -728,7 +727,7 @@ static int collect_iovs(struct lazy_pages_info *lpi) return ret; } -static int uffd_io_complete(struct page_read *pr, unsigned long vaddr, int nr); +static int uffd_io_complete(struct page_read *pr, unsigned long vaddr, unsigned long nr); static int ud_open(int client, struct lazy_pages_info **_lpi) { @@ -822,7 +821,7 @@ static bool uffd_recoverable_error(int mcopy_rc) return false; } -static int uffd_check_op_error(struct lazy_pages_info *lpi, const char *op, int *nr_pages, long mcopy_rc) +static 
int uffd_check_op_error(struct lazy_pages_info *lpi, const char *op, unsigned long *nr_pages, long mcopy_rc) { if (errno == ENOSPC || errno == ESRCH) { handle_exit(lpi); @@ -844,7 +843,7 @@ static int uffd_check_op_error(struct lazy_pages_info *lpi, const char *op, int return 0; } -static int uffd_copy(struct lazy_pages_info *lpi, __u64 address, int *nr_pages) +static int uffd_copy(struct lazy_pages_info *lpi, __u64 address, unsigned long *nr_pages) { struct uffdio_copy uffdio_copy; unsigned long len = *nr_pages * page_size(); @@ -865,12 +864,12 @@ static int uffd_copy(struct lazy_pages_info *lpi, __u64 address, int *nr_pages) return 0; } -static int uffd_io_complete(struct page_read *pr, unsigned long img_addr, int nr) +static int uffd_io_complete(struct page_read *pr, unsigned long img_addr, unsigned long nr) { struct lazy_pages_info *lpi; - unsigned long addr = 0; - int req_pages, ret; + unsigned long addr = 0, req_pages; struct lazy_iov *req; + int ret; lpi = container_of(pr, struct lazy_pages_info, pr); @@ -920,7 +919,7 @@ static int uffd_io_complete(struct page_read *pr, unsigned long img_addr, int nr return drop_iovs(lpi, addr, nr * PAGE_SIZE); } -static int uffd_zero(struct lazy_pages_info *lpi, __u64 address, int nr_pages) +static int uffd_zero(struct lazy_pages_info *lpi, __u64 address, unsigned long nr_pages) { struct uffdio_zeropage uffdio_zeropage; unsigned long len = page_size() * nr_pages; @@ -946,7 +945,7 @@ static int uffd_zero(struct lazy_pages_info *lpi, __u64 address, int nr_pages) * Returns 0 for zero pages, 1 for "real" pages and negative value on * error */ -static int uffd_seek_pages(struct lazy_pages_info *lpi, __u64 address, int nr) +static int uffd_seek_pages(struct lazy_pages_info *lpi, __u64 address, unsigned long nr) { int ret; @@ -961,7 +960,7 @@ static int uffd_seek_pages(struct lazy_pages_info *lpi, __u64 address, int nr) return 0; } -static int uffd_handle_pages(struct lazy_pages_info *lpi, __u64 address, int nr, unsigned flags) 
+static int uffd_handle_pages(struct lazy_pages_info *lpi, __u64 address, unsigned long nr, unsigned flags) { int ret; @@ -1003,7 +1002,7 @@ static void update_xfer_len(struct lazy_pages_info *lpi, bool pf) static int xfer_pages(struct lazy_pages_info *lpi) { struct lazy_iov *iov; - unsigned int nr_pages; + unsigned long nr_pages; unsigned long len; int err; From 6df6beb510a7e0767319c5292367e07c320da1b5 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Fri, 19 Sep 2025 15:10:25 +0000 Subject: [PATCH 175/198] pagemap: print page regions in the format `start - end` MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit During investigations, it’s much easier to read logs when regions are printed in the start - end format rather than `start/size`. In addition, all page counters and memory sizes are now printed in hexadecimal, as they are hard to read in decimal form. Signed-off-by: Andrei Vagin --- criu/cr-dedup.c | 3 ++- criu/page-pipe.c | 6 +++--- criu/page-xfer.c | 23 +++++++++++++---------- criu/pagemap.c | 2 +- 4 files changed, 19 insertions(+), 15 deletions(-) diff --git a/criu/cr-dedup.c b/criu/cr-dedup.c index c0c21f53ee..feeb9ebb03 100644 --- a/criu/cr-dedup.c +++ b/criu/cr-dedup.c @@ -87,7 +87,8 @@ static int cr_dedup_one_pagemap(unsigned long img_id, int flags) if (ret <= 0) goto exit; - pr_debug("dedup iovec base=%" PRIx64 ", len=%lu\n", pr.pe->vaddr, pagemap_len(pr.pe)); + pr_debug("dedup iovec %" PRIx64 " - %" PRIx64 "\n", + pr.pe->vaddr, pr.pe->vaddr + pagemap_len(pr.pe)); if (!pagemap_in_parent(pr.pe)) { ret = dedup_one_iovec(prp, pr.pe->vaddr, pagemap_len(pr.pe)); if (ret) diff --git a/criu/page-pipe.c b/criu/page-pipe.c index f8e3520f79..4601d8f9cd 100644 --- a/criu/page-pipe.c +++ b/criu/page-pipe.c @@ -446,17 +446,17 @@ void debug_show_page_pipe(struct page_pipe *pp) pr_debug("Page pipe:\n"); pr_debug("* %u pipes %u/%u iovs:\n", pp->nr_pipes, pp->free_iov, pp->nr_iovs); list_for_each_entry(ppb, &pp->bufs, l) { - 
pr_debug("\tbuf %lu pages, %u iovs, flags: %x pipe_off: %lx :\n", ppb->pages_in, ppb->nr_segs, ppb->flags, + pr_debug("\tbuf %lx pages, %u iovs, flags: %x pipe_off: %lx :\n", ppb->pages_in, ppb->nr_segs, ppb->flags, ppb->pipe_off); for (i = 0; i < ppb->nr_segs; i++) { iov = &ppb->iov[i]; - pr_debug("\t\t%p %lu\n", iov->iov_base, iov->iov_len / PAGE_SIZE); + pr_debug("\t\t%p - %p\n", iov->iov_base, iov->iov_base + iov->iov_len); } } pr_debug("* %u holes:\n", pp->free_hole); for (i = 0; i < pp->free_hole; i++) { iov = &pp->holes[i]; - pr_debug("\t%p %lu\n", iov->iov_base, iov->iov_len / PAGE_SIZE); + pr_debug("\t%p - %p\n", iov->iov_base, iov->iov_base + iov->iov_len); } } diff --git a/criu/page-xfer.c b/criu/page-xfer.c index 4d057163d9..e2913b9244 100644 --- a/criu/page-xfer.c +++ b/criu/page-xfer.c @@ -178,12 +178,12 @@ static int write_pages_to_server(struct page_xfer *xfer, int p, unsigned long le ssize_t ret, left = len; if (opts.tls) { - pr_debug("Sending %lu bytes / %lu pages\n", len, len / PAGE_SIZE); + pr_debug("Sending %lx bytes\n", len); if (tls_send_data_from_fd(p, len)) return -1; } else { - pr_debug("Splicing %lu bytes / %lu pages into socket\n", len, len / PAGE_SIZE); + pr_debug("Splicing %lx bytes into socket\n", len); while (left > 0) { ret = splice(p, NULL, xfer->sk, NULL, left, SPLICE_F_MOVE); @@ -192,7 +192,7 @@ static int write_pages_to_server(struct page_xfer *xfer, int p, unsigned long le return -1; } - pr_debug("\tSpliced: %lu bytes sent\n", (unsigned long)ret); + pr_debug("\tSpliced: %lx bytes sent\n", (unsigned long)ret); left -= ret; } } @@ -288,7 +288,7 @@ static int check_pagehole_in_parent(struct page_read *p, struct iovec *iov) * read_pagemap_page routine. 
*/ - pr_debug("Checking %p/%zu hole\n", iov->iov_base, iov->iov_len); + pr_debug("Checking %p - %p hole\n", iov->iov_base, iov->iov_base + iov->iov_len); off = (unsigned long)iov->iov_base; end = off + iov->iov_len; while (1) { @@ -300,7 +300,8 @@ static int check_pagehole_in_parent(struct page_read *p, struct iovec *iov) return -1; } - pr_debug("\tFound %" PRIx64 "/%lu\n", p->pe->vaddr, pagemap_len(p->pe)); + pr_debug("\tFound %" PRIx64 " - %" PRIx64 "\n", + p->pe->vaddr, p->pe->vaddr + pagemap_len(p->pe)); /* * The pagemap entry in parent may happen to be @@ -340,7 +341,8 @@ static int write_pagemap_loc(struct page_xfer *xfer, struct iovec *iov, u32 flag if (xfer->parent != NULL) { ret = check_pagehole_in_parent(xfer->parent, iov); if (ret) { - pr_err("Hole %p/%zu not found in parent\n", iov->iov_base, iov->iov_len); + pr_err("Hole %p - %p not found in parent\n", + iov->iov_base, iov->iov_base + iov->iov_len); return -1; } } @@ -850,7 +852,7 @@ int page_xfer_predump_pages(int pid, struct page_xfer *xfer, struct page_pipe *p BUG_ON(iov.iov_base < (void *)xfer->offset); iov.iov_base -= xfer->offset; - pr_debug("\t p %p [%u]\n", iov.iov_base, (unsigned int)(iov.iov_len / PAGE_SIZE)); + pr_debug("\t p %p - %p\n", iov.iov_base, iov.iov_base + iov.iov_len); flags = ppb_xfer_flags(xfer, ppb); @@ -886,7 +888,7 @@ int page_xfer_dump_pages(struct page_xfer *xfer, struct page_pipe *pp) list_for_each_entry(ppb, &pp->bufs, l) { unsigned int i; - pr_debug("\tbuf %ld/%d\n", ppb->pages_in, ppb->nr_segs); + pr_debug("\tbuf %lx/%d\n", ppb->pages_in, ppb->nr_segs); for (i = 0; i < ppb->nr_segs; i++) { struct iovec iov = ppb->iov[i]; @@ -898,7 +900,7 @@ int page_xfer_dump_pages(struct page_xfer *xfer, struct page_pipe *pp) BUG_ON(iov.iov_base < (void *)xfer->offset); iov.iov_base -= xfer->offset; - pr_debug("\tp %p [%u]\n", iov.iov_base, (unsigned int)(iov.iov_len / PAGE_SIZE)); + pr_debug("\tp %p - %p\n", iov.iov_base, iov.iov_base + iov.iov_len); flags = ppb_xfer_flags(xfer, ppb); 
@@ -1071,7 +1073,8 @@ static int page_server_add(int sk, struct page_server_iov *pi, u32 flags) struct page_xfer *lxfer = &cxfer.loc_xfer; struct iovec iov; - pr_debug("Adding %" PRIx64 "/%lu\n", pi->vaddr, pi->nr_pages); + pr_debug("Adding %" PRIx64 " - %" PRIx64 "\n", + pi->vaddr, pi->vaddr + pi->nr_pages * PAGE_SIZE); if (prep_loc_xfer(pi)) return -1; diff --git a/criu/pagemap.c b/criu/pagemap.c index 16d680fdbb..b6ec3e3332 100644 --- a/criu/pagemap.c +++ b/criu/pagemap.c @@ -171,7 +171,7 @@ static int seek_pagemap(struct page_read *pr, unsigned long vaddr) static inline void pagemap_bound_check(PagemapEntry *pe, unsigned long vaddr, unsigned long int nr) { if (vaddr < pe->vaddr || (vaddr - pe->vaddr) / PAGE_SIZE + nr > pe->nr_pages) { - pr_err("Page read err %" PRIx64 ":%lu vs %lx:%lu\n", pe->vaddr, pe->nr_pages, vaddr, nr); + pr_err("Page read err %" PRIx64 ":%lx vs %lx:%lx\n", pe->vaddr, pe->nr_pages, vaddr, nr); BUG(); } } From 42e7ca4f662b32dee16e887e2dc79a92464f8e53 Mon Sep 17 00:00:00 2001 From: dong sunchao Date: Tue, 23 Sep 2025 01:00:12 +1000 Subject: [PATCH 176/198] vdso: relax EI_OSABI check to support linux in ELF header On some ARM/aarch64 systems, the VDSO ELF header sets EI_OSABI to 3 (Linux), while CRIU expects 0 (System V). This strict check causes restore to fail with "ELF header magic mismatch" This patch relaxes the check to accept both values, improving compatibility with modern toolchains and kernels (e.g. Linux 6.12+) Fixes: #2751 Signed-off-by: dong sunchao --- criu/pie/util-vdso.c | 44 ++++++++++++++++++++++++++++++++------------ 1 file changed, 32 insertions(+), 12 deletions(-) diff --git a/criu/pie/util-vdso.c b/criu/pie/util-vdso.c index 8daf5c71f3..45fb6a648b 100644 --- a/criu/pie/util-vdso.c +++ b/criu/pie/util-vdso.c @@ -98,25 +98,45 @@ static unsigned long elf_gnu_hash(const unsigned char *name) static int has_elf_identity(Ehdr_t *ehdr) { - /* - * See Elf specification for this magic values. 
- */ + /* check ELF magic */ + + if (ehdr->e_ident[EI_MAG0] != ELFMAG0 || + ehdr->e_ident[EI_MAG1] != ELFMAG1 || + ehdr->e_ident[EI_MAG2] != ELFMAG2 || + ehdr->e_ident[EI_MAG3] != ELFMAG3) { + pr_err("Invalid ELF magic\n"); + return false; + }; + + /* check ELF class */ #if defined(CONFIG_VDSO_32) - static const char elf_ident[] = { - 0x7f, 0x45, 0x4c, 0x46, 0x01, BORD, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + if (ehdr->e_ident[EI_CLASS] != ELFCLASS32) { + pr_err("Unsupported ELF class: %d\n", ehdr->e_ident[EI_CLASS]); + return false; }; #else - static const char elf_ident[] = { - 0x7f, 0x45, 0x4c, 0x46, 0x02, BORD, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + if (ehdr->e_ident[EI_CLASS] != ELFCLASS64) { + pr_err("Unsupported ELF class: %d\n", ehdr->e_ident[EI_CLASS]); + return false; }; #endif - BUILD_BUG_ON(sizeof(elf_ident) != sizeof(ehdr->e_ident)); - - if (memcmp(ehdr->e_ident, elf_ident, sizeof(elf_ident))) { - pr_err("ELF header magic mismatch\n"); + /* check ELF data encoding */ + if (ehdr->e_ident[EI_DATA] != ELFDATA2LSB) { + pr_err("Unsupported ELF data encoding: %d\n", ehdr->e_ident[EI_DATA]); return false; - } + }; + /* check ELF version */ + if (ehdr->e_ident[EI_VERSION] != EV_CURRENT) { + pr_err("Unsupported ELF version: %d\n", ehdr->e_ident[EI_VERSION]); + return false; + }; + /* check ELF OSABI */ + if (ehdr->e_ident[EI_OSABI] != ELFOSABI_NONE && + ehdr->e_ident[EI_OSABI] != ELFOSABI_LINUX) { + pr_err("Unsupported OSABI version: %d\n", ehdr->e_ident[EI_OSABI]); + return false; + }; return true; } From 31a58f6ababf6e987bdecd61020f7a57864a9876 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Mon, 22 Sep 2025 17:59:29 +0000 Subject: [PATCH 177/198] zdtm: Remove junit_xml leftovers The previous commit 4cd4a6b1ac15 ("zdtm: stop importing junit_xml") removed the junit_xml library, but some variables related to it were left in the code. 
This commit removes the unused `tc` variable and a call to its `add_error_info` method. Fixes: 4cd4a6b1ac15 ("zdtm: stop importing junit_xml") Signed-off-by: Andrei Vagin --- test/zdtm.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/test/zdtm.py b/test/zdtm.py index 7e83aa4df9..e21356c30a 100755 --- a/test/zdtm.py +++ b/test/zdtm.py @@ -2232,7 +2232,6 @@ def __wait_one(self, flags): # The following wait() is not useful for our domain logic. # It's useful for taming warnings in subprocess.Popen.__del__() sub['sub'].wait() - tc = None if status != 0: self.__fail = True failed_flavor = decode_flav(os.WEXITSTATUS(status)) @@ -2243,7 +2242,6 @@ def __wait_one(self, flags): with open(sub['log']) as sublog: output = sublog.read() details = {'output': output} - tc.add_error_info(output=output) print(testline, file=self.__file_report) print("%s" % yaml.safe_dump(details, explicit_start=True, From 1a3be784fe533f1a7f6612501e4f1fa455141c1e Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Fri, 19 Sep 2025 00:34:56 +0000 Subject: [PATCH 178/198] docs: add developer overviews for AI assistants This commit adds the document to provide high-level overviews of the CRIU project for AI assistants like Claude and Gemini. These documents are intended to be used as context for AI-powered developer assistants to help them understand the project's goals, architecture, and development process. This will allow them to provide more accurate and helpful responses to developer questions. 
The documents include: - A brief introduction to CRIU - A quick start guide for checkpointing and restoring a simple process - An overview of the dump and restore process - A description of the Compel subproject - Information about the project's coding style, code layout, and tests Signed-off-by: Andrei Vagin --- CLAUDE.md | 1 + GEMINI.md | 136 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 137 insertions(+) create mode 120000 CLAUDE.md create mode 100644 GEMINI.md diff --git a/CLAUDE.md b/CLAUDE.md new file mode 120000 index 0000000000..e3c5a92d9f --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1 @@ +GEMINI.md \ No newline at end of file diff --git a/GEMINI.md b/GEMINI.md new file mode 100644 index 0000000000..e56c1de12d --- /dev/null +++ b/GEMINI.md @@ -0,0 +1,136 @@ +# CRIU (Checkpoint/Restore In User-space) + +CRIU is a tool for saving the state of a running application to a set of files +(checkpointing) and restoring it back to a live state. It is primarily used for +live migration of containers, in-place updates, and fast application startup. + +It is implemented as a command-line tool called `criu`. The two primary commands +are `dump` and `restore`. + +- `dump`: Saves a process tree and all its related resources (file + descriptors, IPC, sockets, namespaces, etc.) into a collection of image + files. +- `restore`: Restores processes from image files to the same state they were + in before the dump. + +## Quick Start + +To get a feel for `criu`, you can try checkpointing and restoring a simple +process. + +1. **Run a simple process:** + Open a terminal and run a command that will run for a while. Find its PID. + ```bash + sleep 1000 & + [1] 12345 + ``` + +2. **Dump the process:** + As root, use `criu dump` with the process ID (`-t`) and a directory for the + image files (`-D`). + ```bash + sudo criu dump -t 12345 -D /tmp/sleep_images -v4 --shell-job + ``` + The `sleep` process will no longer be running. + +3. 
**Restore the process:** + Use `criu restore` to bring the process back to life from the images. + ```bash + sudo criu restore -D /tmp/sleep_images -v4 --shell-job + ``` + The `sleep` process will be running again as if nothing happened. + +# For Developers and Contributors + +This section contains more technical details about CRIU's internals and +development process. + +## Dump Process + +On dump, CRIU uses available kernel interfaces to collect information about +processes. For properties that can only be retrieved from within the process +itself, CRIU injects a binary blob (called a "parasite") into the process's +address space and executes it in the context of one of the process's threads. +This injection is handled by a subproject called **Compel**. + +## Restore Process + +On restore, CRIU reads the image files to reconstruct the processes. The goal is +to restore them to the exact state they were in before the dump. The restore +process is divided into several stages (defined as `CR_STATE_*` in +`./criu/include/restorer.h`). + +The main `criu` process acts as a coordinator. It first restores resources with +inter-process dependencies (file descriptors, sockets, shared memory, +namespaces, etc.). It then forks the process tree and sets up namespaces. +Finally, it restores process-specific resources like file descriptors and memory +mappings. + +A key step involves a small, self-contained binary called the "restorer". All +restored processes switch to executing this code, which unmaps the CRIU-specific +memory and restores the application's original memory mappings. On the final +step, the restorer calls `sigreturn` on a prepared signal frame to resume the +process with the state it had at the moment of the dump. + +## Compel + +Compel is a subproject responsible for generating the binary blobs used for the +parasite code (for dumping) and the restorer code (for restoring). 
It provides a +library for injecting and executing this code within the target process's +address space. It is a separate project because the logic for generating and +injecting Position-Independent Executable (PIE) code is complex and +self-contained. + +## Coding Style + +The C code in the CRIU project follows the +[Linux Kernel Coding Style](https://www.kernel.org/doc/html/latest/process/coding-style.html). +Here are some of the main points: + +- **Indentation**: Use tabs, which are set to 8 characters. +- **Line Length**: The preferred line limit is 80 characters, but it can be + extended to 120 if it improves code readability. +- **Braces**: + - The opening brace for a function goes on a new line. + - The opening brace for a block (like `if`, `for`, `while`, `switch`) goes + on the same line. +- **Spaces**: Use spaces around operators (`+`, `-`, `*`, `/`, `%`, `<`, `>`, + `=`, etc.). +- **Naming**: Use descriptive names for functions and variables. +- **Comments**: Use C-style comments (`/* ... */`). For multi-line comments, + the preferred format is: + ```c + /* + * This is a multi-line + * comment. + */ + ``` + +## Code Layout + +The code is organized into the following directories: + +- `./compel`: The Compel sub-project. +- `./criu`: The main `criu` tool source code. +- `./images`: Protobuf descriptions for the image files. +- `./test`: All tests. +- `./test/zdtm`: The Zero-Downtime Migration (ZDTM) test suite. +- `./test/zdtm.py`: The executor script for ZDTM tests. +- `./scripts`: Helper scripts. +- `./scripts/build`: Docker image files used for CI and cross-compilation + checks. +- `./crit`: A tool to inspect and manipulate CRIU image files. +- `./soccr`: A library for TCP socket checkpoint/restore. + +## Tests + +The main test suite is ZDTM. Here is an example of how to run a single test: + +```bash +sudo ./test/zdtm.py run -t zdtm/static/env00 +``` + +Each ZDTM test has three stages: preparation, C/R, and results checks. 
During +the test, a process calls `test_daemon()` to signal it is ready for C/R, then +calls `test_waitsig()` to wait for the C/R stage to complete. After being +restored, the test checks that all its resources are still in a valid state. From 4c250c1b74c14f32979044f43a22cca5a22b49c5 Mon Sep 17 00:00:00 2001 From: Shashank Balaji Date: Wed, 17 Sep 2025 19:14:36 +0900 Subject: [PATCH 179/198] ci: use package-manager dependency install scripts Currently, adding a package which is required either for development or testing requires it to be added in multiple places due to many duplicated Dockerfiles and installation scripts. This makes it difficult to ensure that all scripts are updated appropriately and can lead to some places being missed. This patch consolidates the list of dependencies and adds installation scripts for each package-manager used in our CI (apk, apt, dnf, pacman). This change also replaces the `debian/dev-packages.lst` as this subfolder conflicts with the Ubuntu/Debian packaging scripts used for CRIU: https://github.com/rst0git/criu-deb-packages This patch also removes the CentOS 8 build scripts as it is EOL and the container registry is no longer available. 
Signed-off-by: Shashank Balaji Signed-off-by: Radostin Stoyanov --- .cirrus.yml | 12 +-- .github/workflows/check-commits.yml | 2 +- .github/workflows/codeql.yml | 2 +- .github/workflows/nftables-test.yml | 2 +- CONTRIBUTING.md | 84 ++++++++++++------- Makefile | 3 +- {scripts/ci => contrib}/apt-install | 0 contrib/debian/dev-packages.lst | 19 ----- contrib/dependencies/apk-packages.sh | 38 +++++++++ contrib/dependencies/apt-cross-packages.sh | 34 ++++++++ contrib/dependencies/apt-packages.sh | 40 +++++++++ contrib/dependencies/dnf-packages.sh | 35 ++++++++ contrib/dependencies/pacman-packages.sh | 31 +++++++ scripts/build/Dockerfile.alpine | 43 +--------- scripts/build/Dockerfile.archlinux | 35 +------- scripts/build/Dockerfile.centos8 | 48 ----------- scripts/build/Dockerfile.fedora.tmpl | 5 +- scripts/build/Dockerfile.hotspot-alpine | 25 +----- scripts/build/Dockerfile.hotspot-ubuntu | 28 +------ scripts/build/Dockerfile.linux32.tmpl | 26 +----- scripts/build/Dockerfile.openj9-ubuntu | 28 +------ .../Dockerfile.riscv64-stable-cross.tmpl | 33 +------- scripts/build/Dockerfile.stable-cross.tmpl | 25 +----- scripts/build/Dockerfile.tmpl | 34 +------- scripts/build/Dockerfile.unstable-cross.tmpl | 26 +----- scripts/build/Dockerfile.x86_64.hdr | 2 +- scripts/build/Makefile | 2 +- scripts/ci/Makefile | 2 +- scripts/ci/docker-test.sh | 4 +- scripts/ci/java-test.sh | 2 + scripts/ci/loongarch64-qemu-test.sh | 4 +- scripts/ci/prepare-for-fedora-rawhide.sh | 29 +------ scripts/ci/run-ci-tests.sh | 12 +-- scripts/ci/vagrant.sh | 12 +-- scripts/install-debian-pkgs.sh | 25 ------ 35 files changed, 294 insertions(+), 458 deletions(-) rename {scripts/ci => contrib}/apt-install (100%) delete mode 100644 contrib/debian/dev-packages.lst create mode 100755 contrib/dependencies/apk-packages.sh create mode 100755 contrib/dependencies/apt-cross-packages.sh create mode 100755 contrib/dependencies/apt-packages.sh create mode 100755 contrib/dependencies/dnf-packages.sh create mode 100755 
contrib/dependencies/pacman-packages.sh delete mode 100644 scripts/build/Dockerfile.centos8 delete mode 100755 scripts/install-debian-pkgs.sh diff --git a/.cirrus.yml b/.cirrus.yml index 848e141329..99dd70d63f 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -13,7 +13,7 @@ task: nested_virtualization: true setup_script: | - scripts/ci/apt-install make gcc pkg-config git perl-modules iproute2 kmod wget cpu-checker + contrib/apt-install make gcc pkg-config git perl-modules iproute2 kmod wget cpu-checker sudo kvm-ok build_script: | make -C scripts/ci vagrant-fedora-no-vdso @@ -34,7 +34,7 @@ task: setup_script: | dnf config-manager --set-enabled crb # Same as CentOS 8 powertools dnf -y install epel-release epel-next-release - dnf -y install --allowerasing asciidoc gcc git gnutls-devel libaio-devel libasan libcap-devel libnet-devel libnl3-devel libbsd-devel libselinux-devel make protobuf-c-devel protobuf-devel python-devel python-PyYAML python-protobuf python3-importlib-metadata xmlto libdrm-devel libuuid-devel + contrib/dependencies/dnf-packages.sh # The image has a too old version of nettle which does not work with gnutls. # Just upgrade to the latest to make the error go away. 
dnf -y upgrade nettle nettle-devel @@ -63,7 +63,7 @@ task: nested_virtualization: true setup_script: | - scripts/ci/apt-install make gcc pkg-config git perl-modules iproute2 kmod wget cpu-checker + contrib/apt-install make gcc pkg-config git perl-modules iproute2 kmod wget cpu-checker sudo kvm-ok build_script: | make -C scripts/ci vagrant-fedora-rawhide @@ -83,7 +83,7 @@ task: nested_virtualization: true setup_script: | - scripts/ci/apt-install make gcc pkg-config git perl-modules iproute2 kmod wget cpu-checker + contrib/apt-install make gcc pkg-config git perl-modules iproute2 kmod wget cpu-checker sudo kvm-ok build_script: | make -C scripts/ci vagrant-fedora-non-root @@ -96,7 +96,7 @@ task: memory: 4G script: uname -a build_script: | - scripts/ci/apt-install make + contrib/apt-install make make -C scripts/ci local task: @@ -107,7 +107,7 @@ task: memory: 4G script: uname -a build_script: | - scripts/ci/apt-install make + contrib/apt-install make make -C scripts/ci local CLANG=1 task: diff --git a/.github/workflows/check-commits.yml b/.github/workflows/check-commits.yml index 354873909e..bf7d06697c 100644 --- a/.github/workflows/check-commits.yml +++ b/.github/workflows/check-commits.yml @@ -19,7 +19,7 @@ jobs: # Checkout pull request HEAD commit instead of merge commit ref: ${{ github.event.pull_request.head.sha }} - name: Install dependencies - run: sudo scripts/ci/apt-install libprotobuf-dev libprotobuf-c-dev protobuf-c-compiler protobuf-compiler python3-protobuf libnl-3-dev libnet-dev libcap-dev uuid-dev + run: sudo contrib/apt-install libprotobuf-dev libprotobuf-c-dev protobuf-c-compiler protobuf-compiler python3-protobuf libnl-3-dev libnet-dev libcap-dev uuid-dev - name: Configure git user details run: | git config --global user.email "checkpoint-restore@users.noreply.github.com" diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 88e21d3d17..9c9e46c1b2 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ 
-34,7 +34,7 @@ jobs: - name: Install Packages (cpp) if: ${{ matrix.language == 'cpp' }} run: | - sudo scripts/ci/apt-install protobuf-c-compiler libprotobuf-c-dev libprotobuf-dev build-essential libprotobuf-dev libprotobuf-c-dev protobuf-c-compiler protobuf-compiler python3-protobuf libnet-dev pkg-config libnl-3-dev libbsd0 libbsd-dev iproute2 libcap-dev libaio-dev libbsd-dev python3-yaml libnl-route-3-dev gnutls-dev + sudo contrib/apt-install protobuf-c-compiler libprotobuf-c-dev libprotobuf-dev build-essential libprotobuf-dev libprotobuf-c-dev protobuf-c-compiler protobuf-compiler python3-protobuf libnet-dev pkg-config libnl-3-dev libbsd0 libbsd-dev iproute2 libcap-dev libaio-dev libbsd-dev python3-yaml libnl-route-3-dev gnutls-dev - name: Initialize CodeQL uses: github/codeql-action/init@v3 with: diff --git a/.github/workflows/nftables-test.yml b/.github/workflows/nftables-test.yml index eb3d8e8141..7a7d8bd309 100644 --- a/.github/workflows/nftables-test.yml +++ b/.github/workflows/nftables-test.yml @@ -15,7 +15,7 @@ jobs: - name: Remove iptables run: sudo apt remove -y iptables - name: Install libnftables-dev - run: sudo scripts/ci/apt-install libnftables-dev + run: sudo contrib/apt-install libnftables-dev - name: chmod 755 /home/runner # CRIU's tests are sometimes running as some random user and need # to be able to access the test files. diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 712e7b8132..3ad4aa1019 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -27,19 +27,43 @@ The repository may contain multiple branches. Development happens in the **criu- To clone CRIU repo and switch to the proper branch, run: ``` - git clone https://github.com/checkpoint-restore/criu criu - cd criu - git checkout criu-dev +git clone https://github.com/checkpoint-restore/criu criu +cd criu +git checkout criu-dev ``` -### Compile +### Building from source -First, you need to install compile-time dependencies. 
Check [Installation dependencies](https://criu.org/Installation#Dependencies) for more info. Alternatively, you can use the Nix flake to set up a development environment by running `nix develop`. +Follow these steps to compile CRIU from source code. -To compile CRIU, run: +#### Installing build dependencies + +First, you need to install the required build dependencies. We provide scripts to simplify this process for several Linux distributions in [contrib/dependencies](contrib/dependencies). For a complete list of dependencies, please refer to the [installation guide](https://criu.org/Installation). + +##### On Ubuntu/Debian-based systems: + +``` +./contrib/dependencies/apt-packages.sh +``` + +##### On Fedora/CentOS-based systems: + +``` +./contrib/dependencies/dnf-packages.sh +``` + +##### Using Nix: + +``` +nix develop +``` + +#### Compiling CRIU + +Once the dependencies are installed, you can compile CRIU by running the `make` command from the root of the source directory: ``` - make +make ``` This should create the `./criu/criu` executable. @@ -63,7 +87,7 @@ The following command can be used to automatically run a code linter for Python text spelling (codespell), and a number of CRIU-specific checks (usage of print macros and EOL whitespace for C files). ``` - make lint +make lint ``` In addition, we have adopted a [clang-format configuration file](https://www.kernel.org/doc/Documentation/process/clang-format.rst) @@ -73,7 +97,7 @@ results in decreased readability, we may choose to ignore these errors. Run the following command to check if your changes are compliant with the clang-format rules: ``` - make indent +make indent ``` This command is built upon the `git-clang-format` tool and supports two options `BASE` and `OPTS`. The `BASE` option allows you to @@ -83,7 +107,7 @@ can use `BASE=origin/criu-dev`. 
The `OPTS` option can be used to pass additional to check the last *N* commits for formatting errors, without applying the changes to the codebase you can use the following command. ``` - make indent OPTS=--diff BASE=HEAD~N +make indent OPTS=--diff BASE=HEAD~N ``` Note that for pull requests, the "Run code linter" workflow runs these checks for all commits. If a clang-format error is detected @@ -96,7 +120,7 @@ Here are some bad examples of clang-format-ing: ``` @@ -58,8 +59,7 @@ static int register_membarriers(void) } - + if (!all_ok) { - fail("can't register membarrier()s - tried %#x, kernel %#x", - barriers_registered, barriers_supported); @@ -129,7 +153,7 @@ Here are some bad examples of clang-format-ing: CRIU comes with an extensive test suite. To check whether your changes introduce any regressions, run ``` - make test +make test ``` The command runs [ZDTM Test Suite](https://criu.org/ZDTM_Test_Suite). Check for any error messages produced by it. @@ -166,21 +190,21 @@ If your change fixes a bug in a specific commit, e.g. you found an issue using the SHA-1 ID, and the one line summary. For example: ``` - Fixes: 9433b7b9db3e ("make: use cflags/ldflags for config.h detection mechanism") +Fixes: 9433b7b9db3e ("make: use cflags/ldflags for config.h detection mechanism") ``` The following `git config` settings can be used to add a pretty format for outputting the above style in the `git log` or `git show` commands: ``` - [pretty] - fixes = Fixes: %h (\"%s\") +[pretty] + fixes = Fixes: %h (\"%s\") ``` If your change address an issue listed in GitHub, please use `Fixes:` tag with the number of the issue. For instance: ``` - Fixes: #339 +Fixes: #339 ``` The `Fixes:` tags should be put at the end of the detailed description. 
@@ -263,7 +287,7 @@ can certify the below: then you just add a line saying ``` - Signed-off-by: Random J Developer +Signed-off-by: Random J Developer ``` using your real name (please, no pseudonyms or anonymous contributions if @@ -275,14 +299,14 @@ commit message. To append such line to a commit you already made, use ``` From: Random J Developer - Subject: [PATCH] component: Short patch description +Subject: [PATCH] component: Short patch description - Long patch description (could be skipped if patch - is trivial enough) +Long patch description (could be skipped if patch +is trivial enough) - Signed-off-by: Random J Developer - --- - Patch body here +Signed-off-by: Random J Developer +--- +Patch body here ``` ## Submit your work upstream @@ -316,8 +340,8 @@ contains the following: revisions should be listed. For example: ``` - v3: rebase on the current criu-dev - v2: add commit to foo() and update bar() coding style +v3: rebase on the current criu-dev +v2: add commit to foo() and update bar() coding style ``` If there are only minor updates to the commits in a pull request, it is @@ -335,7 +359,7 @@ Historically, CRIU worked with mailing lists and patches so if you still prefer To create a patch, run ``` - git format-patch --signoff origin/criu-dev +git format-patch --signoff origin/criu-dev ``` You might need to read GIT documentation on how to prepare patches @@ -346,8 +370,8 @@ at all. 
We recommend to post patches using `git send-email` ``` - git send-email --cover-letter --no-chain-reply-to --annotate \ - --confirm=always --to=criu@openvz.org criu-dev +git send-email --cover-letter --no-chain-reply-to --annotate \ + --confirm=always --to=criu@openvz.org criu-dev ``` Note that the `git send-email` subcommand may not be in @@ -359,14 +383,14 @@ If this is your first time using git send-email, you might need to configure it to point it to your SMTP server with something like: ``` - git config --global sendemail.smtpServer stmp.example.net +git config --global sendemail.smtpServer smtp.example.net ``` If you get tired of typing `--to=criu@openvz.org` all the time, you can configure that to be automatically handled as well: ``` - git config sendemail.to criu@openvz.org +git config sendemail.to criu@openvz.org ``` If a developer is sending another version of the patch (e.g. to address diff --git a/Makefile b/Makefile index 7272cfce19..3e5d62726c 100644 --- a/Makefile +++ b/Makefile @@ -464,7 +464,8 @@ ruff: shellcheck: shellcheck --version shellcheck scripts/*.sh - shellcheck scripts/ci/*.sh scripts/ci/apt-install + shellcheck scripts/ci/*.sh + shellcheck contrib/apt-install contrib/dependencies/*.sh shellcheck -x test/others/crit/*.sh shellcheck -x test/others/libcriu/*.sh shellcheck -x test/others/crit/*.sh test/others/criu-coredump/*.sh diff --git a/scripts/ci/apt-install b/contrib/apt-install similarity index 100% rename from scripts/ci/apt-install rename to contrib/apt-install diff --git a/contrib/debian/dev-packages.lst b/contrib/debian/dev-packages.lst deleted file mode 100644 index ce45f1b7cf..0000000000 --- a/contrib/debian/dev-packages.lst +++ /dev/null @@ -1,19 +0,0 @@ -# Required packages for development in Debian -build-essential -libprotobuf-dev -libprotobuf-c-dev -protobuf-c-compiler -protobuf-compiler -python3-protobuf -libnet-dev - -# Extra packages, required for testing and building other tools -pkg-config -libnl-3-dev -libbsd0
-libbsd-dev -iproute2 -libcap-dev -libaio-dev -python3-yaml -libnl-route-3-dev diff --git a/contrib/dependencies/apk-packages.sh b/contrib/dependencies/apk-packages.sh new file mode 100755 index 0000000000..0084dea3ab --- /dev/null +++ b/contrib/dependencies/apk-packages.sh @@ -0,0 +1,38 @@ +#!/usr/bin/env sh + +apk add --no-cache \ + asciidoctor \ + bash \ + build-base \ + coreutils \ + e2fsprogs \ + git \ + gnutls-dev \ + go \ + ip6tables \ + iproute2 \ + iptables \ + iptables-legacy \ + libaio-dev \ + libbsd-dev \ + libcap-dev \ + libcap-utils \ + libdrm-dev \ + libnet-dev \ + libnl3-dev \ + nftables \ + nftables-dev \ + pkgconfig \ + procps \ + protobuf-c-compiler \ + protobuf-c-dev \ + protobuf-dev \ + py3-importlib-metadata \ + py3-pip \ + py3-protobuf \ + py3-yaml \ + python3 \ + sudo \ + tar \ + util-linux \ + util-linux-dev diff --git a/contrib/dependencies/apt-cross-packages.sh b/contrib/dependencies/apt-cross-packages.sh new file mode 100755 index 0000000000..588be40d02 --- /dev/null +++ b/contrib/dependencies/apt-cross-packages.sh @@ -0,0 +1,34 @@ +#!/usr/bin/env sh + +APT_INSTALL="$(cd "$(dirname "$0")/.." >/dev/null 2>&1 && pwd)/apt-install" +if [ ! 
-x "$APT_INSTALL" ]; then + echo "Error: apt-install not found or not executable" + exit 1 +fi + +"$APT_INSTALL" \ + crossbuild-essential-"${DEBIAN_ARCH}" \ + iproute2:"${DEBIAN_ARCH}" \ + libaio-dev:"${DEBIAN_ARCH}" \ + libbz2-dev:"${DEBIAN_ARCH}" \ + libc6-"${DEBIAN_ARCH}"-cross \ + libc6-dev-"${DEBIAN_ARCH}"-cross \ + libcap-dev:"${DEBIAN_ARCH}" \ + libexpat1-dev:"${DEBIAN_ARCH}" \ + libgnutls28-dev:"${DEBIAN_ARCH}" \ + libnet-dev:"${DEBIAN_ARCH}" \ + libnftables-dev:"${DEBIAN_ARCH}" \ + libnl-3-dev:"${DEBIAN_ARCH}" \ + libnl-route-3-dev:"${DEBIAN_ARCH}" \ + libprotobuf-c-dev:"${DEBIAN_ARCH}" \ + libprotobuf-dev:"${DEBIAN_ARCH}" \ + libssl-dev:"${DEBIAN_ARCH}" \ + ncurses-dev:"${DEBIAN_ARCH}" \ + uuid-dev:"${DEBIAN_ARCH}" \ + libdrm-dev:"${DEBIAN_ARCH}" \ + build-essential \ + pkg-config \ + git \ + protobuf-c-compiler \ + protobuf-compiler \ + python3-protobuf diff --git a/contrib/dependencies/apt-packages.sh b/contrib/dependencies/apt-packages.sh new file mode 100755 index 0000000000..c60ba9041c --- /dev/null +++ b/contrib/dependencies/apt-packages.sh @@ -0,0 +1,40 @@ +#!/usr/bin/env sh + +APT_INSTALL="$(cd "$(dirname "$0")/.." >/dev/null 2>&1 && pwd)/apt-install" +if [ ! 
-x "$APT_INSTALL" ]; then + echo "Error: apt-install not found or not executable" + exit 1 +fi + +"$APT_INSTALL" \ + asciidoctor \ + bash \ + bsdmainutils \ + build-essential \ + gdb \ + git-core \ + iptables \ + kmod \ + libaio-dev \ + libbsd-dev \ + libcap-dev \ + libdrm-dev \ + libgnutls28-dev \ + libgnutls30 \ + libnet-dev \ + libnl-3-dev \ + libnl-route-3-dev \ + libperl-dev \ + libprotobuf-c-dev \ + libprotobuf-dev \ + libselinux-dev \ + pkg-config \ + protobuf-c-compiler \ + protobuf-compiler \ + python3-importlib-metadata \ + python3-pip \ + python3-protobuf \ + python3-yaml \ + time \ + util-linux \ + uuid-dev diff --git a/contrib/dependencies/dnf-packages.sh b/contrib/dependencies/dnf-packages.sh new file mode 100755 index 0000000000..efbb659c54 --- /dev/null +++ b/contrib/dependencies/dnf-packages.sh @@ -0,0 +1,35 @@ +#!/usr/bin/env sh + +dnf install -y \ + asciidoc \ + binutils \ + gcc \ + git \ + glibc-devel \ + gnutls-devel \ + iproute \ + iptables \ + libaio-devel \ + libasan \ + libbpf-devel \ + libbsd-devel \ + libcap-devel \ + libdrm-devel \ + libnet-devel \ + libnl3-devel \ + libselinux-devel \ + libuuid-devel \ + make \ + nftables \ + pkg-config \ + protobuf \ + protobuf-c \ + protobuf-c-devel \ + protobuf-compiler \ + protobuf-devel \ + python-devel \ + python3-importlib-metadata \ + python3-protobuf \ + python3-pyyaml \ + rubygem-asciidoctor \ + xmlto diff --git a/contrib/dependencies/pacman-packages.sh b/contrib/dependencies/pacman-packages.sh new file mode 100755 index 0000000000..5fe6995fb9 --- /dev/null +++ b/contrib/dependencies/pacman-packages.sh @@ -0,0 +1,31 @@ +#!/usr/bin/env sh + +pacman -Syu --noconfirm \ + asciidoctor \ + base-devel \ + bash \ + coreutils \ + diffutils \ + git \ + gnutls \ + go \ + iproute2 \ + iptables \ + libaio \ + libbsd \ + libcap \ + libdrm \ + libnet \ + libnl \ + nftables \ + pkg-config \ + protobuf \ + protobuf-c \ + python-importlib-metadata \ + python-pip \ + python-protobuf \ + python-yaml \ + sudo \ + 
tar \ + util-linux \ + util-linux-libs diff --git a/scripts/build/Dockerfile.alpine b/scripts/build/Dockerfile.alpine index 819fda0c38..ed883f3002 100644 --- a/scripts/build/Dockerfile.alpine +++ b/scripts/build/Dockerfile.alpine @@ -1,49 +1,12 @@ FROM alpine ARG CC=gcc -RUN apk update && apk add \ - $CC \ - bash \ - build-base \ - coreutils \ - procps \ - git \ - gnutls-dev \ - libaio-dev \ - libcap-dev \ - libnet-dev \ - libnl3-dev \ - nftables \ - nftables-dev \ - pkgconfig \ - protobuf-c-dev \ - protobuf-dev \ - py3-pip \ - py3-protobuf \ - python3 \ - sudo \ - libcap-utils \ - libdrm-dev \ - util-linux \ - util-linux-dev - COPY . /criu WORKDIR /criu -RUN make mrproper && date && make -j $(nproc) CC="$CC" && date -RUN apk add \ - ip6tables \ - iptables \ - iptables-legacy \ - nftables \ - iproute2 \ - tar \ - bash \ - go \ - e2fsprogs \ - py-yaml \ - py3-importlib-metadata \ - asciidoctor +RUN apk add --no-cache "$CC" && /criu/contrib/dependencies/apk-packages.sh + +RUN make mrproper && date && make -j $(nproc) CC="$CC" && date # The rpc test cases are running as user #1000, let's add the user RUN adduser -u 1000 -D test diff --git a/scripts/build/Dockerfile.archlinux b/scripts/build/Dockerfile.archlinux index d4b432f8d6..261bd2d799 100644 --- a/scripts/build/Dockerfile.archlinux +++ b/scripts/build/Dockerfile.archlinux @@ -5,40 +5,11 @@ ARG CC=gcc # Initialize machine ID RUN systemd-machine-id-setup -RUN pacman -Syu --noconfirm \ - $CC \ - bash \ - make \ - coreutils \ - git \ - gnutls \ - libaio \ - libcap \ - libnet \ - libnl \ - nftables \ - pkgconfig \ - protobuf-c \ - protobuf \ - python-pip \ - python-protobuf \ - which \ - sudo \ - iptables \ - nftables \ - iproute2 \ - tar \ - bash \ - go \ - python-yaml \ - asciidoctor \ - python-importlib-metadata \ - libdrm \ - util-linux-libs \ - diffutils - COPY . 
/criu WORKDIR /criu + +RUN pacman -Syu --noconfirm "$CC" && contrib/dependencies/pacman-packages.sh + RUN make mrproper && date && make -j $(nproc) CC="$CC" && date # The rpc test cases are running as user #1000, let's add the user diff --git a/scripts/build/Dockerfile.centos8 b/scripts/build/Dockerfile.centos8 deleted file mode 100644 index 5ab6c9cfa4..0000000000 --- a/scripts/build/Dockerfile.centos8 +++ /dev/null @@ -1,48 +0,0 @@ -FROM registry.centos.org/centos/centos:8 - -ARG CC=gcc - -RUN yum install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-8.noarch.rpm dnf-plugins-core -RUN yum config-manager --set-enabled powertools -RUN yum install -y --allowerasing \ - asciidoc \ - coreutils \ - chkconfig \ - diffutils \ - findutils \ - gcc \ - git \ - gnutls-devel \ - iproute \ - iptables \ - libaio-devel \ - libasan \ - libcap-devel \ - libnet-devel \ - libnl3-devel \ - libselinux-devel \ - make \ - procps-ng \ - protobuf-c-devel \ - protobuf-devel \ - python3-devel \ - python3-PyYAML \ - python3-protobuf \ - python3-pip \ - sudo \ - tar \ - which \ - xmlto - -RUN alternatives --set python /usr/bin/python3 -ENV PYTHON=python3 - -COPY . /criu -WORKDIR /criu - -RUN make mrproper && date && make -j $(nproc) CC="$CC" && date - -# The rpc test cases are running as user #1000, let's add the user -RUN adduser -u 1000 test - -RUN make -C test/zdtm -j $(nproc) diff --git a/scripts/build/Dockerfile.fedora.tmpl b/scripts/build/Dockerfile.fedora.tmpl index 9d3bb0f879..c26a5fd576 100644 --- a/scripts/build/Dockerfile.fedora.tmpl +++ b/scripts/build/Dockerfile.fedora.tmpl @@ -1,11 +1,10 @@ ARG CC=gcc -COPY scripts/ci/prepare-for-fedora-rawhide.sh /bin/prepare-for-fedora-rawhide.sh -RUN /bin/prepare-for-fedora-rawhide.sh - COPY . 
/criu WORKDIR /criu +RUN dnf install -y "$CC" && scripts/ci/prepare-for-fedora-rawhide.sh + RUN make mrproper && date && make -j $(nproc) CC="$CC" && date # The rpc test cases are running as user #1000, let's add the user diff --git a/scripts/build/Dockerfile.hotspot-alpine b/scripts/build/Dockerfile.hotspot-alpine index 6caf9d0b1b..cd632dddf5 100644 --- a/scripts/build/Dockerfile.hotspot-alpine +++ b/scripts/build/Dockerfile.hotspot-alpine @@ -1,30 +1,11 @@ FROM docker.io/library/eclipse-temurin:11-alpine ARG CC=gcc -RUN apk update && apk add \ - bash \ - build-base \ - coreutils \ - git \ - gnutls-dev \ - libaio-dev \ - libcap-dev \ - libnet-dev \ - libnl3-dev \ - pkgconfig \ - protobuf-c-dev \ - protobuf-dev \ - python3 \ - sudo \ - maven \ - ip6tables \ - iptables \ - util-linux-dev \ - bash - COPY . /criu WORKDIR /criu +RUN apk add --no-cache maven "$CC" && contrib/dependencies/apk-packages.sh + RUN make mrproper && make -j $(nproc) CC="$CC" -ENTRYPOINT mvn -q -f test/javaTests/pom.xml test +ENTRYPOINT ["mvn", "-q", "-f", "test/javaTests/pom.xml", "test"] diff --git a/scripts/build/Dockerfile.hotspot-ubuntu b/scripts/build/Dockerfile.hotspot-ubuntu index 67de916acb..76aa571fac 100644 --- a/scripts/build/Dockerfile.hotspot-ubuntu +++ b/scripts/build/Dockerfile.hotspot-ubuntu @@ -1,33 +1,11 @@ FROM docker.io/library/eclipse-temurin:11-focal ARG CC=gcc -COPY scripts/ci/apt-install /bin/apt-install - -RUN apt-install protobuf-c-compiler \ - libprotobuf-c-dev \ - libaio-dev \ - libprotobuf-dev \ - protobuf-compiler \ - libcap-dev \ - libnl-3-dev \ - gdb \ - bash \ - python3-protobuf \ - python3-yaml \ - libnet-dev \ - libnl-route-3-dev \ - libbsd-dev \ - make \ - git \ - pkg-config \ - iptables \ - gcc \ - uuid-dev \ - maven - COPY . 
/criu WORKDIR /criu +RUN contrib/apt-install maven "$CC" && contrib/dependencies/apt-packages.sh + RUN make mrproper && make -j $(nproc) CC="$CC" -ENTRYPOINT mvn -q -f test/javaTests/pom.xml test +ENTRYPOINT ["mvn", "-q", "-f", "test/javaTests/pom.xml", "test"] diff --git a/scripts/build/Dockerfile.linux32.tmpl b/scripts/build/Dockerfile.linux32.tmpl index d218e06414..a37f16e495 100644 --- a/scripts/build/Dockerfile.linux32.tmpl +++ b/scripts/build/Dockerfile.linux32.tmpl @@ -1,32 +1,10 @@ ARG CC=gcc -COPY scripts/ci/apt-install /bin/apt-install - -RUN apt-install \ - libnet-dev \ - libnl-route-3-dev \ - $CC \ - bsdmainutils \ - build-essential \ - git-core \ - iptables \ - libaio-dev \ - libcap-dev \ - libgnutls28-dev \ - libgnutls30 \ - libnl-3-dev \ - libprotobuf-c-dev \ - libprotobuf-dev \ - libselinux-dev \ - pkg-config \ - protobuf-c-compiler \ - protobuf-compiler \ - uuid-dev \ - python3-minimal - COPY . /criu WORKDIR /criu +RUN contrib/apt-install "$CC" && contrib/dependencies/apt-packages.sh + RUN uname -m && setarch linux32 uname -m && setarch --list RUN make mrproper && date && \ diff --git a/scripts/build/Dockerfile.openj9-ubuntu b/scripts/build/Dockerfile.openj9-ubuntu index 0ae4727d2c..8254956596 100644 --- a/scripts/build/Dockerfile.openj9-ubuntu +++ b/scripts/build/Dockerfile.openj9-ubuntu @@ -1,34 +1,12 @@ FROM docker.io/library/ibm-semeru-runtimes:open-11-jdk-focal ARG CC=gcc -COPY scripts/ci/apt-install /bin/apt-install - -RUN apt-install protobuf-c-compiler \ - libprotobuf-c-dev \ - libaio-dev \ - libprotobuf-dev \ - protobuf-compiler \ - libcap-dev \ - libnl-3-dev \ - gdb \ - bash \ - python3-protobuf \ - python3-yaml \ - libnet-dev \ - libnl-route-3-dev \ - libbsd-dev \ - make \ - git \ - pkg-config \ - iptables \ - gcc \ - uuid-dev \ - maven - RUN mkdir -p /etc/criu && echo 'ghost-limit 16777216' > /etc/criu/default.conf COPY . 
/criu WORKDIR /criu +RUN contrib/apt-install maven "$CC" && contrib/dependencies/apt-packages.sh + RUN make mrproper && make -j $(nproc) CC="$CC" -ENTRYPOINT mvn -f test/javaTests/pom.xml test +ENTRYPOINT ["mvn", "-f", "test/javaTests/pom.xml", "test"] diff --git a/scripts/build/Dockerfile.riscv64-stable-cross.tmpl b/scripts/build/Dockerfile.riscv64-stable-cross.tmpl index e95a433067..8933a6c828 100644 --- a/scripts/build/Dockerfile.riscv64-stable-cross.tmpl +++ b/scripts/build/Dockerfile.riscv64-stable-cross.tmpl @@ -1,5 +1,3 @@ -COPY scripts/ci/apt-install /bin/apt-install - # Add the cross compiler sources RUN apt-get clean -y && apt-get update -y && apt-get install -y --no-install-recommends gnupg2 @@ -12,33 +10,6 @@ COPY scripts/ci/riscv64-cross/riscv64-sources.list /etc/apt/sources.list.d/ RUN dpkg --add-architecture ${DEBIAN_ARCH} && \ apt-get update -y -# Install required packages -RUN apt-get install -y --no-install-recommends \ - build-essential \ - pkg-config \ - git \ - crossbuild-essential-${DEBIAN_ARCH} \ - libc6-dev-${DEBIAN_ARCH}-cross \ - libc6-${DEBIAN_ARCH}-cross \ - libbz2-dev:${DEBIAN_ARCH} \ - libexpat1-dev:${DEBIAN_ARCH} \ - ncurses-dev:${DEBIAN_ARCH} \ - libssl-dev:${DEBIAN_ARCH} \ - protobuf-c-compiler \ - protobuf-compiler \ - python3-protobuf \ - libnl-3-dev:${DEBIAN_ARCH} \ - libprotobuf-dev:${DEBIAN_ARCH} \ - libnet-dev:${DEBIAN_ARCH} \ - libprotobuf-c-dev:${DEBIAN_ARCH} \ - libcap-dev:${DEBIAN_ARCH} \ - libaio-dev:${DEBIAN_ARCH} \ - uuid-dev:${DEBIAN_ARCH} \ - libnl-route-3-dev:${DEBIAN_ARCH} \ - libnftables-dev:${DEBIAN_ARCH} \ - libgnutls28-dev:${DEBIAN_ARCH} \ - iproute2:${DEBIAN_ARCH} - ENV CROSS_COMPILE=${CROSS_TRIPLET}- \ CROSS_ROOT=/usr/${CROSS_TRIPLET} \ AS=/usr/bin/${CROSS_TRIPLET}-as \ @@ -55,4 +26,6 @@ ENV PATH="${PATH}:${CROSS_ROOT}/bin" \ COPY . 
/criu WORKDIR /criu -RUN make mrproper && date && make -j $(nproc) zdtm && date +RUN contrib/dependencies/apt-cross-packages.sh + +RUN make mrproper && date && make -j $(nproc) zdtm && date diff --git a/scripts/build/Dockerfile.stable-cross.tmpl b/scripts/build/Dockerfile.stable-cross.tmpl index 65ae558334..56104081f0 100644 --- a/scripts/build/Dockerfile.stable-cross.tmpl +++ b/scripts/build/Dockerfile.stable-cross.tmpl @@ -1,30 +1,7 @@ -COPY scripts/ci/apt-install /bin/apt-install - # Add the cross compiler sources RUN echo "deb http://deb.debian.org/debian/ stable main" >> /etc/apt/sources.list && \ dpkg --add-architecture ${DEBIAN_ARCH} -RUN apt-install \ - crossbuild-essential-${DEBIAN_ARCH} \ - libc6-dev-${DEBIAN_ARCH}-cross \ - libc6-${DEBIAN_ARCH}-cross \ - libbz2-dev:${DEBIAN_ARCH} \ - libexpat1-dev:${DEBIAN_ARCH} \ - ncurses-dev:${DEBIAN_ARCH} \ - libssl-dev:${DEBIAN_ARCH} \ - protobuf-c-compiler \ - protobuf-compiler \ - python3-protobuf \ - libnl-3-dev:${DEBIAN_ARCH} \ - libprotobuf-dev:${DEBIAN_ARCH} \ - libnet-dev:${DEBIAN_ARCH} \ - uuid-dev:${DEBIAN_ARCH} \ - libprotobuf-c-dev:${DEBIAN_ARCH} \ - libcap-dev:${DEBIAN_ARCH} \ - libaio-dev:${DEBIAN_ARCH} \ - libnl-route-3-dev:${DEBIAN_ARCH} \ - libdrm-dev:${DEBIAN_ARCH} - ENV CROSS_COMPILE=${CROSS_TRIPLET}- \ CROSS_ROOT=/usr/${CROSS_TRIPLET} \ AS=/usr/bin/${CROSS_TRIPLET}-as \ @@ -41,6 +18,8 @@ ENV PATH="${PATH}:${CROSS_ROOT}/bin" \ COPY . /criu WORKDIR /criu +RUN contrib/dependencies/apt-cross-packages.sh + # amdgpu_plugin with armv7 is not supported RUN make mrproper && date && \ make -j $(nproc) && \ diff --git a/scripts/build/Dockerfile.tmpl b/scripts/build/Dockerfile.tmpl index 3d6de10441..498b99be9f 100644 --- a/scripts/build/Dockerfile.tmpl +++ b/scripts/build/Dockerfile.tmpl @@ -1,39 +1,11 @@ ARG CC=gcc -COPY scripts/ci/apt-install /bin/apt-install +COPY . 
/criu +WORKDIR /criu # On Ubuntu, kernel modules such as ip_tables and xt_mark may not be loaded by default # We need to install kmod to enable iptables to load these modules for us. -RUN apt-install \ - libnet-dev \ - libnl-route-3-dev \ - $CC \ - bsdmainutils \ - build-essential \ - git-core \ - iptables \ - libaio-dev \ - libbsd-dev \ - libcap-dev \ - libgnutls28-dev \ - libgnutls30 \ - libnftables-dev \ - libnl-3-dev \ - libprotobuf-c-dev \ - libprotobuf-dev \ - libselinux-dev \ - iproute2 \ - kmod \ - pkg-config \ - protobuf-c-compiler \ - protobuf-compiler \ - python3-minimal \ - python3-protobuf \ - uuid-dev \ - python3-yaml - -COPY . /criu -WORKDIR /criu +RUN contrib/apt-install "$CC" && contrib/dependencies/apt-packages.sh RUN git clean -dfx && date && \ # Check single object build diff --git a/scripts/build/Dockerfile.unstable-cross.tmpl b/scripts/build/Dockerfile.unstable-cross.tmpl index 3504b0433c..7edb289b6f 100644 --- a/scripts/build/Dockerfile.unstable-cross.tmpl +++ b/scripts/build/Dockerfile.unstable-cross.tmpl @@ -1,29 +1,7 @@ -COPY scripts/ci/apt-install /bin/apt-install - # Add the cross compiler sources RUN echo "deb http://deb.debian.org/debian/ unstable main" >> /etc/apt/sources.list && \ dpkg --add-architecture ${DEBIAN_ARCH} -RUN apt-install \ - crossbuild-essential-${DEBIAN_ARCH} \ - libc6-dev-${DEBIAN_ARCH}-cross \ - libc6-${DEBIAN_ARCH}-cross \ - libbz2-dev:${DEBIAN_ARCH} \ - libexpat1-dev:${DEBIAN_ARCH} \ - ncurses-dev:${DEBIAN_ARCH} \ - libssl-dev:${DEBIAN_ARCH} \ - protobuf-c-compiler \ - protobuf-compiler \ - python3-protobuf \ - libnl-3-dev:${DEBIAN_ARCH} \ - libprotobuf-dev:${DEBIAN_ARCH} \ - uuid-dev:${DEBIAN_ARCH} \ - libnet-dev:${DEBIAN_ARCH} \ - libprotobuf-c-dev:${DEBIAN_ARCH} \ - libcap-dev:${DEBIAN_ARCH} \ - libaio-dev:${DEBIAN_ARCH} \ - libnl-route-3-dev:${DEBIAN_ARCH} - ENV CROSS_COMPILE=${CROSS_TRIPLET}- \ CROSS_ROOT=/usr/${CROSS_TRIPLET} \ AS=/usr/bin/${CROSS_TRIPLET}-as \ @@ -40,4 +18,6 @@ ENV 
PATH="${PATH}:${CROSS_ROOT}/bin" \ COPY . /criu WORKDIR /criu -RUN make mrproper && date && make -j $(nproc) zdtm && date +RUN contrib/dependencies/apt-cross-packages.sh + +RUN make mrproper && date && make -j $(nproc) zdtm && date diff --git a/scripts/build/Dockerfile.x86_64.hdr b/scripts/build/Dockerfile.x86_64.hdr index 566b4c9160..a666f6c262 100644 --- a/scripts/build/Dockerfile.x86_64.hdr +++ b/scripts/build/Dockerfile.x86_64.hdr @@ -1,5 +1,5 @@ FROM ubuntu:24.04 -COPY scripts/ci/apt-install /bin/apt-install +COPY contrib/apt-install /bin/apt-install RUN apt-install gcc-multilib diff --git a/scripts/build/Makefile b/scripts/build/Makefile index 3893152270..a420cea942 100644 --- a/scripts/build/Makefile +++ b/scripts/build/Makefile @@ -1,4 +1,4 @@ -ARCHES := x86_64 fedora-asan fedora-rawhide armv7hf centos8 +ARCHES := x86_64 fedora-asan fedora-rawhide armv7hf STABLE_CROSS_ARCHES := armv7-stable-cross aarch64-stable-cross ppc64-stable-cross mips64el-stable-cross riscv64-stable-cross UNSTABLE_CROSS_ARCHES := armv7-unstable-cross aarch64-unstable-cross ppc64-unstable-cross mips64el-unstable-cross NON_CLANG := $(UNSTABLE_CROSS_ARCHES) $(STABLE_CROSS_ARCHES) diff --git a/scripts/ci/Makefile b/scripts/ci/Makefile index 9dc0190b37..ed30e42686 100644 --- a/scripts/ci/Makefile +++ b/scripts/ci/Makefile @@ -11,7 +11,7 @@ ifdef CLANG target-suffix = -clang endif -TARGETS := alpine fedora-rawhide centos8 archlinux +TARGETS := alpine fedora-rawhide archlinux ZDTM_OPTS := UNAME := $(shell uname -m) export UNAME diff --git a/scripts/ci/docker-test.sh b/scripts/ci/docker-test.sh index ae7f52454d..bc5a746675 100755 --- a/scripts/ci/docker-test.sh +++ b/scripts/ci/docker-test.sh @@ -7,7 +7,7 @@ set -x -e -o pipefail # https://github.com/moby/moby/issues/50750 for details on the bug. 
export DEBIAN_FRONTEND=noninteractive apt remove -y docker-ce docker-ce-cli -./apt-install -y ca-certificates curl +../../contrib/apt-install -y ca-certificates curl install -m 0755 -d /etc/apt/keyrings curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc chmod a+r /etc/apt/keyrings/docker.asc @@ -18,7 +18,7 @@ echo \ apt update -y apt-cache madison docker-ce | awk '{ print $3 }' verstr="$(apt-cache madison docker-ce | awk '{ print $3 }' | sort | grep -v ':28\.'| tail -n 1)" -./apt-install -y "docker-ce=$verstr" "docker-ce-cli=$verstr" +../../contrib/apt-install -y "docker-ce=$verstr" "docker-ce-cli=$verstr" # docker checkpoint and restore is an experimental feature echo '{ "experimental": true }' > /etc/docker/daemon.json diff --git a/scripts/ci/java-test.sh b/scripts/ci/java-test.sh index 7cf704f074..a5b13a1071 100755 --- a/scripts/ci/java-test.sh +++ b/scripts/ci/java-test.sh @@ -2,6 +2,8 @@ cd ../.. || exit 1 +sudo modprobe iptable_filter + failures="" docker build -t criu-openj9-ubuntu-test:latest -f scripts/build/Dockerfile.openj9-ubuntu . diff --git a/scripts/ci/loongarch64-qemu-test.sh b/scripts/ci/loongarch64-qemu-test.sh index d5646468e8..7e00ab65a8 100755 --- a/scripts/ci/loongarch64-qemu-test.sh +++ b/scripts/ci/loongarch64-qemu-test.sh @@ -4,7 +4,7 @@ set -o nounset set -o errexit set -x -./apt-install \ +../../contrib/apt-install \ apt-transport-https \ ca-certificates \ curl \ @@ -19,7 +19,7 @@ add-apt-repository \ $(lsb_release -cs) \ stable test" -./apt-install docker-ce +../../contrib/apt-install docker-ce # shellcheck source=/dev/null . 
/etc/lsb-release diff --git a/scripts/ci/prepare-for-fedora-rawhide.sh b/scripts/ci/prepare-for-fedora-rawhide.sh index f8f797c1e5..ff75717c59 100755 --- a/scripts/ci/prepare-for-fedora-rawhide.sh +++ b/scripts/ci/prepare-for-fedora-rawhide.sh @@ -1,43 +1,22 @@ #!/bin/bash set -e -x +contrib/dependencies/dnf-packages.sh dnf install -y \ diffutils \ + e2fsprogs \ findutils \ gawk \ - gcc \ - git \ - gnutls-devel \ gzip \ - iproute \ - iptables \ - nftables \ - nftables-devel \ - libaio-devel \ - libasan \ - libcap-devel \ - libnet-devel \ - libnl3-devel \ - libbsd-devel \ + kmod \ libselinux-utils \ - make \ procps-ng \ - protobuf-c-devel \ - protobuf-devel \ - python3-PyYAML \ - python3-protobuf \ python3-pip \ - python3-importlib-metadata \ python-unversioned-command \ redhat-rpm-config \ sudo \ tar \ - which \ - e2fsprogs \ - rubygem-asciidoctor \ - libdrm-devel \ - libuuid-devel \ - kmod + which # /tmp is no longer 755 in the rawhide container image and breaks CI - fix it chmod 1777 /tmp diff --git a/scripts/ci/run-ci-tests.sh b/scripts/ci/run-ci-tests.sh index 617f54fc6e..9fbdd8e309 100755 --- a/scripts/ci/run-ci-tests.sh +++ b/scripts/ci/run-ci-tests.sh @@ -1,12 +1,7 @@ #!/bin/bash set -x -e -CI_PKGS=(protobuf-c-compiler libprotobuf-c-dev libaio-dev libgnutls28-dev - libgnutls30 libprotobuf-dev protobuf-compiler libcap-dev - libnl-3-dev gdb bash libnet-dev util-linux asciidoctor - libnl-route-3-dev time libbsd-dev python3-yaml uuid-dev - libperl-dev pkg-config python3-protobuf python3-pip - python3-importlib-metadata libdrm-dev) +CI_PKGS=() X86_64_PKGS=(gcc-multilib) @@ -60,7 +55,8 @@ ci_prep () { CI_PKGS+=("${X86_64_PKGS[@]}") fi - scripts/ci/apt-install "${CI_PKGS[@]}" + contrib/dependencies/apt-packages.sh + contrib/apt-install "${CI_PKGS[@]}" chmod a+x "$HOME" } @@ -187,7 +183,7 @@ if [ "${COMPAT_TEST}x" = "yx" ] ; then done apt-get remove "${INCOMPATIBLE_LIBS[@]}" dpkg --add-architecture i386 - scripts/ci/apt-install "${IA32_PKGS[@]}" + 
contrib/apt-install "${IA32_PKGS[@]}" mkdir -p /usr/lib/x86_64-linux-gnu/ mv "$REFUGE"/* /usr/lib/x86_64-linux-gnu/ fi diff --git a/scripts/ci/vagrant.sh b/scripts/ci/vagrant.sh index c222e30e05..f69b113523 100755 --- a/scripts/ci/vagrant.sh +++ b/scripts/ci/vagrant.sh @@ -22,9 +22,8 @@ setup() { wget --no-check-certificate https://releases.hashicorp.com/vagrant/${VAGRANT_VERSION}/vagrant_${VAGRANT_VERSION}-1_"$(dpkg --print-architecture)".deb -O /tmp/vagrant.deb && \ dpkg -i /tmp/vagrant.deb - ./apt-install libvirt-clients libvirt-daemon-system libvirt-dev qemu-utils qemu-system \ - ruby build-essential libxml2-dev qemu-kvm rsync ebtables dnsmasq-base \ - openssh-client + ../../contrib/apt-install libvirt-clients libvirt-daemon-system libvirt-dev qemu-utils qemu-system \ + ruby build-essential libxml2-dev qemu-kvm rsync ebtables dnsmasq-base openssh-client systemctl restart libvirtd vagrant plugin install vagrant-libvirt vagrant init cloud-image/fedora-${FEDORA_VERSION} --box-version ${FEDORA_BOX_VERSION} @@ -41,16 +40,13 @@ setup() { vagrant up --provider=libvirt --no-tty mkdir -p /root/.ssh vagrant ssh-config >> /root/.ssh/config - ssh default sudo dnf upgrade -y - ssh default sudo dnf install -y gcc git gnutls-devel nftables-devel libaio-devel \ - libasan libcap-devel libnet-devel libnl3-devel libbsd-devel make protobuf-c-devel \ - protobuf-devel python3-protobuf python3-importlib-metadata \ - rubygem-asciidoctor iptables libselinux-devel libbpf-devel python3-yaml libuuid-devel # Disable sssd to avoid zdtm test failures in pty04 due to sssd socket ssh default sudo systemctl mask sssd ssh default 'sudo mkdir -p --mode=777 /vagrant && mv $HOME/criu.tar /vagrant && cd /vagrant && tar xf criu.tar' + ssh default sudo dnf upgrade -y + ssh default sudo /vagrant/criu/contrib/dependencies/dnf-packages.sh ssh default cat /proc/cmdline } diff --git a/scripts/install-debian-pkgs.sh b/scripts/install-debian-pkgs.sh deleted file mode 100755 index 8be49c7871..0000000000 --- 
a/scripts/install-debian-pkgs.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/bin/bash -# Install required packages for development environment in Debian Distro - -REQ_PKGS=${REQ_PKGS:=contrib/debian/dev-packages.lst} - -help_msg="Install required packages for development environment in Debian Distro -Usage: - scripts/install-debian-pkgs.sh" - -function print_help() -{ - exec echo -e "$help_msg" -} - -function process() -{ - sudo apt-get update - sudo apt-get install -yq "$( sed 's/\#.*$//' "${REQ_PKGS}" )" -} - -if [ "$1" = "--help" ] || [ "$1" = "-h" ]; then - print_help -else - process -fi From 71e51b554b96dd623662920a64433e8b1f616462 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Fri, 26 Sep 2025 16:54:49 +0100 Subject: [PATCH 180/198] Remove travis-ci leftovers Travis CI stopped providing CI minutes for open-source projects some time ago and we have migrated to GitHub actions. Signed-off-by: Radostin Stoyanov --- .travis.yml | 35 ----------------------------------- CONTRIBUTING.md | 7 ------- Makefile | 2 +- Makefile.compel | 4 ++-- scripts/ci/Makefile | 4 ++-- scripts/ci/run-ci-tests.sh | 16 +++++++--------- scripts/ci/vagrant.sh | 7 +------ test/inhfd/memfd.py.checkskip | 2 +- test/zdtm/Makefile.inc | 2 +- 9 files changed, 15 insertions(+), 64 deletions(-) delete mode 100644 .travis.yml diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 94841b3f3c..0000000000 --- a/.travis.yml +++ /dev/null @@ -1,35 +0,0 @@ -language: c -os: linux -dist: bionic -services: - - docker -jobs: - include: - - os: linux - arch: ppc64le - env: TR_ARCH=local - dist: bionic - - os: linux - arch: ppc64le - env: TR_ARCH=local CLANG=1 - dist: bionic - - os: linux - arch: s390x - env: TR_ARCH=local - dist: bionic - - os: linux - arch: arm64-graviton2 - env: TR_ARCH=local RUN_TESTS=1 - dist: focal - group: edge - virt: vm - - os: linux - arch: arm64-graviton2 - env: TR_ARCH=local CLANG=1 RUN_TESTS=1 - group: edge - virt: vm - dist: bionic -script: - - sudo make -C 
scripts/ci $TR_ARCH -after_success: - - make -C scripts/ci after_success diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 3ad4aa1019..2d1dc8227e 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -158,11 +158,6 @@ make test The command runs [ZDTM Test Suite](https://criu.org/ZDTM_Test_Suite). Check for any error messages produced by it. -In case you'd rather have someone else run the tests, you can use travis-ci for your -own GitHub fork of CRIU. It will check the compilation for various supported platforms, -as well as run most of the tests from the suite. See https://travis-ci.org/checkpoint-restore/criu -for more details. - ## Describe your changes Describe your problem. Whether your change is a one-line bug fix or @@ -420,5 +415,3 @@ sometimes a patch may fly around a week before it gets reviewed. Wiki article: [Continuous integration](https://criu.org/Continuous_integration) CRIU tests are run for each series sent to the mailing list. If you get a message from our patchwork that patches failed to pass the tests, you have to investigate what is wrong. - -We also recommend you to [enable Travis CI for your repo](https://criu.org/Continuous_integration#Enable_Travis_CI_for_your_repo) to check patches in your git branch, before sending them to the mailing list. diff --git a/Makefile b/Makefile index 3e5d62726c..611bcdd5aa 100644 --- a/Makefile +++ b/Makefile @@ -43,7 +43,7 @@ ifeq ($(ARCH),arm) endif ifeq ($(ARMV),8) - # Running 'setarch linux32 uname -m' returns armv8l on travis aarch64. + # Running 'setarch linux32 uname -m' returns armv8l on aarch64. # This tells CRIU to handle armv8l just as armv7hf. Right now this is # only used for compile testing. No further verification of armv8l exists. 
ARCHCFLAGS += -march=armv7-a diff --git a/Makefile.compel b/Makefile.compel index 764afadc81..a4209edc5d 100644 --- a/Makefile.compel +++ b/Makefile.compel @@ -50,8 +50,8 @@ compel/plugins/%: $(compel-deps) .FORCE # # GNU make 4.x supports targets matching via wide -# match targeting, where GNU make 3.x series (used on -# Travis) is not, so we have to write them here explicitly. +# match targeting, where GNU make 3.x series is not, +# so we have to write them here explicitly. compel/plugins/std.lib.a: $(compel-deps) .FORCE $(Q) $(MAKE) $(build)=compel/plugins $@ diff --git a/scripts/ci/Makefile b/scripts/ci/Makefile index ed30e42686..bad8065f23 100644 --- a/scripts/ci/Makefile +++ b/scripts/ci/Makefile @@ -30,9 +30,9 @@ endif export CONTAINER_TERMINAL +# Here we assume that any CPU architecture besides x86_64 is running in containers +# that may not support running docker with '--privileged'. ifeq ($(UNAME),x86_64) - # On anything besides x86_64 Travis is running unprivileged LXD - # containers which do not support running docker with '--privileged'. CONTAINER_OPTS := --rm $(CONTAINER_TERMINAL) --privileged --userns=host --cgroupns=host -v /lib/modules:/lib/modules --tmpfs /run else CONTAINER_OPTS := --rm -v /lib/modules:/lib/modules --tmpfs /run diff --git a/scripts/ci/run-ci-tests.sh b/scripts/ci/run-ci-tests.sh index 9fbdd8e309..7a8345b7c0 100755 --- a/scripts/ci/run-ci-tests.sh +++ b/scripts/ci/run-ci-tests.sh @@ -11,13 +11,11 @@ IFS=" " read -r -a ZDTM_OPTS <<< "$ZDTM_OPTS" UNAME_M=$(uname -m) if [ "$UNAME_M" != "x86_64" ]; then - # For Travis only x86_64 seems to be baremetal. Other - # architectures are running in unprivileged LXD containers. - # That seems to block most of CRIU's interfaces. - - # But with the introduction of baremetal aarch64 systems in - # Travis (arch: arm64-graviton2) we can override this using - # an environment variable + # Some tests rely on kernel features that may not be availble + # when running in a container. 
Here we assume that x86_64 + # systems are baremetal, and skip the tests for all other + # CPU architectures. We can override this using the RUN_TESTS + # environment variable (e.g., for aarch64). [ -n "$RUN_TESTS" ] || SKIP_CI_TEST=1 fi @@ -31,7 +29,7 @@ ci_prep () { # not run anymore with 'sudo -u \#1000' if the UID does not exist. adduser -u 1000 --disabled-password --gecos "criutest" criutest || : - # This can fail on aarch64 travis + # This can fail on aarch64 service apport stop || : # Ubuntu has set up AppArmor in 24.04 so that it blocks use of user @@ -258,7 +256,7 @@ if [ -z "$SKIP_EXT_DEV_TEST" ]; then fi make -C test/others/make/ run CC="$CC" -if [ -n "$TRAVIS" ] || [ -n "$CIRCLECI" ]; then +if [ -n "$CIRCLECI" ]; then # GitHub Actions (and Cirrus CI) does not provide a real TTY and CRIU will fail with: # Error (criu/tty.c:1014): tty: Don't have tty to inherit session from, aborting make -C test/others/shell-job/ run diff --git a/scripts/ci/vagrant.sh b/scripts/ci/vagrant.sh index f69b113523..5f2de32b84 100755 --- a/scripts/ci/vagrant.sh +++ b/scripts/ci/vagrant.sh @@ -11,11 +11,6 @@ FEDORA_VERSION=42 FEDORA_BOX_VERSION=1.1.0 setup() { - if [ -n "$TRAVIS" ]; then - # Load the kvm modules for vagrant to use qemu - modprobe kvm kvm_intel - fi - # Tar up the git checkout to have vagrant rsync it to the VM tar cf /tmp/criu.tar -C ../../../ criu # Cirrus has problems with the following certificate. @@ -29,7 +24,7 @@ setup() { vagrant init cloud-image/fedora-${FEDORA_VERSION} --box-version ${FEDORA_BOX_VERSION} # The default libvirt Vagrant VM uses 512MB. - # Travis VMs should have around 7.5GB. + # VMs in our CI typically have around 16GB. # Increasing it to 4GB should work. 
sed -i Vagrantfile -e 's,^end$, config.vm.provider :libvirt do |libvirt|'"\n"' libvirt.memory = 4096;end'"\n"'end,g' # Sync /tmp/criu.tar into the VM diff --git a/test/inhfd/memfd.py.checkskip b/test/inhfd/memfd.py.checkskip index 27e2b7b155..32c57d929c 100755 --- a/test/inhfd/memfd.py.checkskip +++ b/test/inhfd/memfd.py.checkskip @@ -3,5 +3,5 @@ import ctypes libc = ctypes.CDLL(None) -# libc may not have memfd_create (e.g., centos on travis) +# libc may not have memfd_create (e.g., centos) libc.memfd_create("test".encode('utf8'), 0) diff --git a/test/zdtm/Makefile.inc b/test/zdtm/Makefile.inc index c19888da31..3b349ed4d7 100644 --- a/test/zdtm/Makefile.inc +++ b/test/zdtm/Makefile.inc @@ -27,7 +27,7 @@ ifeq ($(ARCH),arm) else ifeq ($(ARMV),7) ARCHCFLAGS += -march=armv7-a+fp else ifeq ($(ARMV),8) - # To build aarch32 on armv8 Travis-CI (see criu Makefile) + # To build aarch32 on armv8 (see criu Makefile) ARCHCFLAGS += -march=armv7-a ARMV := 7 endif From 0ef7a1e3c21196bbc77a1b33563144b25adda317 Mon Sep 17 00:00:00 2001 From: Shashank Balaji Date: Fri, 26 Sep 2025 23:38:08 +0900 Subject: [PATCH 181/198] ci/java: update base image from focal to jammy Ubuntu Focal Fossa (20.04) reached its end-of-life on 31 May 2025. So, move over to using Ubuntu Jammy (22.04) base images. Also, focal repos do not have libtracefs, which the uprobes zdtm test needs. Signed-off-by: Shashank Balaji --- scripts/build/Dockerfile.hotspot-ubuntu | 2 +- scripts/build/Dockerfile.openj9-ubuntu | 2 +- scripts/ci/run-ci-tests.sh | 9 ++++----- 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/scripts/build/Dockerfile.hotspot-ubuntu b/scripts/build/Dockerfile.hotspot-ubuntu index 76aa571fac..a459e1ec71 100644 --- a/scripts/build/Dockerfile.hotspot-ubuntu +++ b/scripts/build/Dockerfile.hotspot-ubuntu @@ -1,4 +1,4 @@ -FROM docker.io/library/eclipse-temurin:11-focal +FROM docker.io/library/eclipse-temurin:11-jammy ARG CC=gcc COPY . 
/criu diff --git a/scripts/build/Dockerfile.openj9-ubuntu b/scripts/build/Dockerfile.openj9-ubuntu index 8254956596..18664f100a 100644 --- a/scripts/build/Dockerfile.openj9-ubuntu +++ b/scripts/build/Dockerfile.openj9-ubuntu @@ -1,4 +1,4 @@ -FROM docker.io/library/ibm-semeru-runtimes:open-11-jdk-focal +FROM docker.io/library/ibm-semeru-runtimes:open-11-jdk-jammy ARG CC=gcc RUN mkdir -p /etc/criu && echo 'ghost-limit 16777216' > /etc/criu/default.conf diff --git a/scripts/ci/run-ci-tests.sh b/scripts/ci/run-ci-tests.sh index 7a8345b7c0..05a3b71e8d 100755 --- a/scripts/ci/run-ci-tests.sh +++ b/scripts/ci/run-ci-tests.sh @@ -11,11 +11,10 @@ IFS=" " read -r -a ZDTM_OPTS <<< "$ZDTM_OPTS" UNAME_M=$(uname -m) if [ "$UNAME_M" != "x86_64" ]; then - # Some tests rely on kernel features that may not be availble - # when running in a container. Here we assume that x86_64 - # systems are baremetal, and skip the tests for all other - # CPU architectures. We can override this using the RUN_TESTS - # environment variable (e.g., for aarch64). + # Some tests rely on kernel features that may not be available + # when running in a container. Here we assume that x86_64 systems + # are baremetal, and skip the tests for all other CPU architectures. + # The RUN_TESTS environment variable can override this, e.g., for aarch64. [ -n "$RUN_TESTS" ] || SKIP_CI_TEST=1 fi From de30e24356b19f303676c83dca6488af2d92f702 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Sat, 27 Sep 2025 09:21:26 +0100 Subject: [PATCH 182/198] ci: consolidate aarch64 tests on GitHub runners Currently we run aarch64 tests on both Cirrus CI and GitHub runners. However, Cirrus CI fails with "Monthly compute limit exceeded!". This change removes the redundant tests to streamline our CI process. 
Signed-off-by: Radostin Stoyanov --- .cirrus.yml | 22 ---------------------- .github/workflows/aarch64-test.yaml | 6 ++++-- 2 files changed, 4 insertions(+), 24 deletions(-) diff --git a/.cirrus.yml b/.cirrus.yml index 99dd70d63f..72dbb38981 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -88,28 +88,6 @@ task: build_script: | make -C scripts/ci vagrant-fedora-non-root -task: - name: aarch64 build GCC (native) - arm_container: - image: docker.io/library/ubuntu:jammy - cpu: 4 - memory: 4G - script: uname -a - build_script: | - contrib/apt-install make - make -C scripts/ci local - -task: - name: aarch64 build CLANG (native) - arm_container: - image: docker.io/library/ubuntu:jammy - cpu: 4 - memory: 4G - script: uname -a - build_script: | - contrib/apt-install make - make -C scripts/ci local CLANG=1 - task: name: aarch64 Fedora Rawhide arm_container: diff --git a/.github/workflows/aarch64-test.yaml b/.github/workflows/aarch64-test.yaml index 32b19e1766..ebbecadb33 100644 --- a/.github/workflows/aarch64-test.yaml +++ b/.github/workflows/aarch64-test.yaml @@ -9,14 +9,16 @@ concurrency: jobs: build: - runs-on: ubuntu-24.04-arm strategy: matrix: + os: [ubuntu-24.04-arm, ubuntu-22.04-arm] target: [GCC=1, CLANG=1] + runs-on: ${{ matrix.os }} + steps: - uses: actions/checkout@v4 - - name: Run Tests ${{ matrix.target }} + - name: Run Tests ${{ matrix.target }} on ${{ matrix.os }} # Following tests are failing on the VMs: # ./change_mnt_context --pidfile=change_mnt_context.pid --outfile=change_mnt_context.out # 45: ERR: change_mnt_context.c:23: mount (errno = 22 (Invalid argument)) From 6549697c0f245f212a2911f8d820a41baa0bfe21 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Wed, 1 Oct 2025 11:20:13 +0100 Subject: [PATCH 183/198] contributing: update links to mailing list Our previous mailing list had some technical issues and we created a new one that is hopefully more reliable. 
Signed-off-by: Radostin Stoyanov --- CONTRIBUTING.md | 12 ++++++------ crit/pyproject.toml | 2 +- crit/setup.cfg | 2 +- lib/pyproject.toml | 2 +- lib/setup.cfg | 2 +- 5 files changed, 10 insertions(+), 10 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 2d1dc8227e..03875639df 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -8,8 +8,8 @@ Here are some useful hints to get involved. * We have both -- [very simple](https://github.com/checkpoint-restore/criu/issues?q=is%3Aissue+is%3Aopen+label%3Aenhancement) and [more sophisticated](https://github.com/checkpoint-restore/criu/issues?q=is%3Aissue+is%3Aopen+label%3A%22new+feature%22) coding tasks; * CRIU does need [extensive testing](https://github.com/checkpoint-restore/criu/issues?q=is%3Aissue+is%3Aopen+label%3Atesting); * Documentation is always hard, we have [some information](https://criu.org/Category:Empty_articles) that is to be extracted from people's heads into wiki pages as well as [some texts](https://criu.org/Category:Editor_help_needed) that all need to be converted into useful articles; -* Feedback is expected on the GitHub issues page and on the [mailing list](https://lists.openvz.org/mailman/listinfo/criu); -* We accept GitHub pull requests and this is the preferred way to contribute to CRIU. If you prefer to send patches by email, you are welcome to send them to [CRIU development mailing list](https://lists.openvz.org/mailman/listinfo/criu). +* Feedback is expected on the GitHub issues page and on the [mailing list](https://lore.kernel.org/criu); +* We accept GitHub pull requests and this is the preferred way to contribute to CRIU. If you prefer to send patches by email, you are welcome to send them to [CRIU development mailing list](https://lore.kernel.org/criu). Below we describe in more detail recommend practices for CRIU development. 
* Spread the word about CRIU in [social networks](http://criu.org/Contacts); * If you're giving a talk about CRIU -- let us know, we'll mention it on the [wiki main page](https://criu.org/News/events); @@ -366,7 +366,7 @@ We recommend to post patches using `git send-email` ``` git send-email --cover-letter --no-chain-reply-to --annotate \ - --confirm=always --to=criu@openvz.org criu-dev + --confirm=always --to=criu@lists.linux.dev criu-dev ``` Note that the `git send-email` subcommand may not be in @@ -381,11 +381,11 @@ configure it to point it to your SMTP server with something like: git config --global sendemail.smtpServer stmp.example.net ``` -If you get tired of typing `--to=criu@openvz.org` all the time, +If you get tired of typing `--to=criu@lists.linux.dev` all the time, you can configure that to be automatically handled as well: ``` -git config sendemail.to criu@openvz.org +git config sendemail.to criu@lists.linux.dev ``` If a developer is sending another version of the patch (e.g. to address @@ -398,7 +398,7 @@ version if needed though). ### Mail patches -The patches should be sent to CRIU development mailing list, `criu AT openvz.org`. Note that you need to be subscribed first in order to post. The list web interface is available at https://openvz.org/mailman/listinfo/criu; you can also use standard mailman aliases to work with it. +The patches should be sent to CRIU development mailing list, `criu AT lists.linux.dev`. Note that you need to be subscribed first in order to post. The list web interface is available at https://lore.kernel.org/criu; you can also use standard mailman aliases to work with it. Please make sure the email client you're using doesn't screw your patch (line wrapping and so on). 
diff --git a/crit/pyproject.toml b/crit/pyproject.toml index 9089f0a394..f0b185eb7a 100644 --- a/crit/pyproject.toml +++ b/crit/pyproject.toml @@ -6,7 +6,7 @@ build-backend = "setuptools.build_meta" name = "crit" description = "CRiu Image Tool" authors = [ - {name = "CRIU team", email = "criu@openvz.org"}, + {name = "CRIU team", email = "criu@lists.linux.dev"}, ] license = {text = "GPLv2"} dynamic = ["version"] diff --git a/crit/setup.cfg b/crit/setup.cfg index fbc9a51439..37895923f3 100644 --- a/crit/setup.cfg +++ b/crit/setup.cfg @@ -7,7 +7,7 @@ name = crit description = CRiu Image Tool author = CRIU team -author_email = criu@openvz.org +author_email = criu@lists.linux.dev license = GPLv2 version = attr: crit.__version__ diff --git a/lib/pyproject.toml b/lib/pyproject.toml index 8eb4b7084d..c9e11551b0 100644 --- a/lib/pyproject.toml +++ b/lib/pyproject.toml @@ -6,7 +6,7 @@ build-backend = "setuptools.build_meta" name = "pycriu" description = "Python bindings for CRIU" authors = [ - {name = "CRIU team", email = "criu@openvz.org"}, + {name = "CRIU team", email = "criu@lists.linux.dev"}, ] license = {text = "GPLv2"} dynamic = ["version"] diff --git a/lib/setup.cfg b/lib/setup.cfg index 23ee48dd5b..5d75719ca9 100644 --- a/lib/setup.cfg +++ b/lib/setup.cfg @@ -7,7 +7,7 @@ name = pycriu description = Python bindings for CRIU author = CRIU team -author_email = criu@openvz.org +author_email = criu@lists.linux.dev license = GPLv2 version = attr: pycriu.__version__ From cc7f457b3cab7e851b2dcfdde105955233a20f41 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Thu, 2 Oct 2025 08:39:30 +0100 Subject: [PATCH 184/198] page-xfer: fix incompatible pointer type on armv7 page_pipe_read() expects an 'unsigned long *', but pi->nr_pages is u64. On 32-bit platforms (e.g., armv7), passing &pi->nr_pages directly causes a compiler error. To fix this we introduce a temporary variable and copy the result back to pi->nr_pages. 
Fixes: #2756 Suggested-by: Andrei Vagin Signed-off-by: Radostin Stoyanov --- criu/page-xfer.c | 9 +++++++-- criu/pagemap.c | 2 +- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/criu/page-xfer.c b/criu/page-xfer.c index e2913b9244..463d4c506f 100644 --- a/criu/page-xfer.c +++ b/criu/page-xfer.c @@ -1139,13 +1139,17 @@ static int page_server_get_pages(int sk, struct page_server_iov *pi) { struct pstree_item *item; struct page_pipe *pp; - unsigned long len; + unsigned long len, nr_pages; int ret; item = pstree_item_by_virt(pi->dst_id); pp = dmpi(item)->mem_pp; - ret = page_pipe_read(pp, &pipe_read_dest, pi->vaddr, &pi->nr_pages, PPB_LAZY); + /* page_pipe_read() uses 'unsigned long *' but pi->nr_pages is u64. + * Use a temporary variable to fix the incompatible pointer type + * on 32-bit platforms (e.g. armv7). */ + nr_pages = pi->nr_pages; + ret = page_pipe_read(pp, &pipe_read_dest, pi->vaddr, &nr_pages, PPB_LAZY); if (ret) return ret; @@ -1154,6 +1158,7 @@ static int page_server_get_pages(int sk, struct page_server_iov *pi) * .dst_id all remain intact. 
*/ + pi->nr_pages = nr_pages; if (pi->nr_pages == 0) { pr_debug("no iovs found, zero pages\n"); return -1; diff --git a/criu/pagemap.c b/criu/pagemap.c index b6ec3e3332..6c9c4f7feb 100644 --- a/criu/pagemap.c +++ b/criu/pagemap.c @@ -171,7 +171,7 @@ static int seek_pagemap(struct page_read *pr, unsigned long vaddr) static inline void pagemap_bound_check(PagemapEntry *pe, unsigned long vaddr, unsigned long int nr) { if (vaddr < pe->vaddr || (vaddr - pe->vaddr) / PAGE_SIZE + nr > pe->nr_pages) { - pr_err("Page read err %" PRIx64 ":%lx vs %lx:%lx\n", pe->vaddr, pe->nr_pages, vaddr, nr); + pr_err("Page read err %" PRIx64 ":%" PRIx64 " vs %lx:%lx\n", pe->vaddr, pe->nr_pages, vaddr, nr); BUG(); } } From b3c809075c5b8da4681193b71bf0647b57750623 Mon Sep 17 00:00:00 2001 From: Pepper Gray <111446242+peppergrayxyz@users.noreply.github.com> Date: Tue, 30 Sep 2025 22:58:29 +0200 Subject: [PATCH 185/198] make: prevent redefinition of 'struct sigcontext' Compilation on gentoo/arm64 (llvm+musl) fails with: In file included from compel/include/uapi/compel/asm/sigframe.h:4, from compel/plugins/std/infect.c:14: /usr/include/asm/sigcontext.h:28:8: error: redefinition of 'struct sigcontext' 28 | struct sigcontext { | ^~~~~~~~~~ In file included from criu/arch/aarch64/include/asm/restorer.h:4, from criu/arch/aarch64/crtools.c:11: /usr/include/asm/sigcontext.h:28:8: error: redefinition of 'struct sigcontext' 28 | struct sigcontext { | ^~~~~~~~~~ This is happening because <asm/sigcontext.h> and <signal.h> are mutually incompatible on Linux. To fix, use <signal.h> instead of <asm/sigcontext.h> for arm64 (like all other arches do).
Fixes: #2766 Signed-off-by: Pepper Gray --- compel/arch/aarch64/src/lib/include/uapi/asm/sigframe.h | 3 ++- criu/arch/aarch64/include/asm/restorer.h | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/compel/arch/aarch64/src/lib/include/uapi/asm/sigframe.h b/compel/arch/aarch64/src/lib/include/uapi/asm/sigframe.h index 9152024fd8..a3528500db 100644 --- a/compel/arch/aarch64/src/lib/include/uapi/asm/sigframe.h +++ b/compel/arch/aarch64/src/lib/include/uapi/asm/sigframe.h @@ -1,10 +1,11 @@ #ifndef UAPI_COMPEL_ASM_SIGFRAME_H__ #define UAPI_COMPEL_ASM_SIGFRAME_H__ -#include +#include #include #include +#include /* Copied from the kernel header arch/arm64/include/uapi/asm/sigcontext.h */ diff --git a/criu/arch/aarch64/include/asm/restorer.h b/criu/arch/aarch64/include/asm/restorer.h index 64a9c24eb9..2174df4fa1 100644 --- a/criu/arch/aarch64/include/asm/restorer.h +++ b/criu/arch/aarch64/include/asm/restorer.h @@ -1,7 +1,7 @@ #ifndef __CR_ASM_RESTORER_H__ #define __CR_ASM_RESTORER_H__ -#include +#include #include #include "asm/types.h" From df8c163ba2a1aca405123426a0c4945938f58667 Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Thu, 2 Oct 2025 12:03:57 -0700 Subject: [PATCH 186/198] ci: run alpine tests on arm64 These tests reveal the following build error: In file included from compel/include/uapi/compel/asm/sigframe.h:4, from compel/plugins/std/infect.c:14: /usr/include/asm/sigcontext.h:28:8: error: redefinition of 'struct sigcontext' 28 | struct sigcontext { | ^~~~~~~~~~ In file included from criu/arch/aarch64/include/asm/restorer.h:4, from criu/arch/aarch64/crtools.c:11: /usr/include/asm/sigcontext.h:28:8: error: redefinition of 'struct sigcontext' 28 | struct sigcontext { | ^~~~~~~~~~ Inspired by #2766 / #2767. 
Signed-off-by: Kir Kolyshkin Signed-off-by: Radostin Stoyanov --- .github/workflows/alpine-test.yml | 3 ++- contrib/dependencies/apk-packages.sh | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/alpine-test.yml b/.github/workflows/alpine-test.yml index 73530d79ae..0f5c20f48b 100644 --- a/.github/workflows/alpine-test.yml +++ b/.github/workflows/alpine-test.yml @@ -9,10 +9,11 @@ concurrency: jobs: build: - runs-on: ubuntu-22.04 strategy: matrix: + os: [ubuntu-22.04, ubuntu-22.04-arm] target: [GCC=1, CLANG=1] + runs-on: ${{ matrix.os }} steps: - uses: actions/checkout@v4 diff --git a/contrib/dependencies/apk-packages.sh b/contrib/dependencies/apk-packages.sh index 0084dea3ab..d02704b15c 100755 --- a/contrib/dependencies/apk-packages.sh +++ b/contrib/dependencies/apk-packages.sh @@ -22,6 +22,7 @@ apk add --no-cache \ libnl3-dev \ nftables \ nftables-dev \ + perl \ pkgconfig \ procps \ protobuf-c-compiler \ From 849ed721e0cafb9c018a64702d069e68ce5283c5 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Fri, 3 Oct 2025 17:02:25 +0100 Subject: [PATCH 187/198] zdtm: add sk-unix-restore-fs-share test Add a ZDTM test case where CRIU uses a helper process to restore a non-empty process group with a terminated leader and a Unix domain socket. 
This reproduces a corner case in which mount namespace switching can fail during restore: https://github.com/checkpoint-restore/criu/issues/2687 Signed-off-by: Qiao Ma Signed-off-by: Radostin Stoyanov --- test/zdtm/static/Makefile | 1 + test/zdtm/static/sk-unix-restore-fs-share.c | 196 ++++++++++++++++++ .../zdtm/static/sk-unix-restore-fs-share.desc | 1 + 3 files changed, 198 insertions(+) create mode 100644 test/zdtm/static/sk-unix-restore-fs-share.c create mode 100644 test/zdtm/static/sk-unix-restore-fs-share.desc diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index e73f964be5..6b262c4439 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -382,6 +382,7 @@ TST_FILE = \ sk-unix-listen02 \ sk-unix-listen03 \ sk-unix-listen04 \ + sk-unix-restore-fs-share \ mnt_ext_file_bind_auto \ TST_DIR = \ diff --git a/test/zdtm/static/sk-unix-restore-fs-share.c b/test/zdtm/static/sk-unix-restore-fs-share.c new file mode 100644 index 0000000000..d4f6dde75d --- /dev/null +++ b/test/zdtm/static/sk-unix-restore-fs-share.c @@ -0,0 +1,196 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Test non-empty process group with terminated parent and unix socket"; +const char *test_author = "Qiao Ma "; + +char *filename; +TEST_OPTION(filename, string, "socket file name", 1); + +static int create_and_connect(void) +{ + struct sockaddr_un addr; + int client_fd; + + client_fd = socket(AF_UNIX, SOCK_STREAM, 0); + if (client_fd == -1) { + pr_perror("socket"); + return -1; + } + + memset(&addr, 0, sizeof(addr)); + addr.sun_family = AF_UNIX; + if (snprintf(addr.sun_path, sizeof(addr.sun_path), "%s", filename) >= (int)sizeof(addr.sun_path)) { + pr_err("Socket path too long\n"); + close(client_fd); + return -1; + } + + if (connect(client_fd, (struct sockaddr *)&addr, sizeof(addr)) == -1) { + pr_perror("connect"); + close(client_fd); + return -1; + } + + return 0; +} + +static 
int child(int ready_fd) +{ + int listen_fd; + struct sockaddr_un addr; + int ret = EXIT_FAILURE; + + listen_fd = socket(AF_UNIX, SOCK_STREAM, 0); + if (listen_fd == -1) { + pr_perror("socket"); + return EXIT_FAILURE; + } + + memset(&addr, 0, sizeof(addr)); + addr.sun_family = AF_UNIX; + if (strlen(filename) >= sizeof(addr.sun_path)) { + pr_err("Socket path too long\n"); + goto cleanup; + } + strncpy(addr.sun_path, filename, sizeof(addr.sun_path)); + + unlink(filename); /* Ignore error if file doesn't exist */ + + if (bind(listen_fd, (struct sockaddr *)&addr, sizeof(addr)) == -1) { + pr_perror("bind"); + goto cleanup; + } + + if (listen(listen_fd, 5) == -1) { + pr_perror("listen"); + goto cleanup; + } + + if (create_and_connect() != 0) { + pr_err("Failed to create and connect\n"); + goto cleanup; + } + + /* Signal parent that socket is ready */ + if (write(ready_fd, "1", 1) != 1) { + pr_perror("write ready_fd"); + goto cleanup; + } + + /* Wait indefinitely */ + pause(); + + ret = EXIT_SUCCESS; +cleanup: + if (listen_fd != -1) + close(listen_fd); + unlink(filename); + + return ret; +} + +static int zombie_leader(int *cpid) +{ + char buf; + pid_t pid; + int pipefd[2]; + + if (pipe(pipefd) == -1) { + pr_perror("pipe"); + return EXIT_FAILURE; + } + + if (setpgid(0, 0) == -1) { + pr_perror("setpgid"); + return EXIT_FAILURE; + } + + pid = fork(); + if (pid < 0) { + pr_perror("Failed to fork child"); + return EXIT_FAILURE; + } + + if (pid == 0) { + /* Close read end */ + close(pipefd[0]); + exit(child(pipefd[1])); + } + + /* Close write end in parent */ + close(pipefd[1]); + + /* Wait for child to set up socket */ + if (read(pipefd[0], &buf, 1) != 1) { + pr_err("Failed to receive readiness signal from child\n"); + close(pipefd[0]); + return EXIT_FAILURE; + } + close(pipefd[0]); + + *cpid = pid; + return EXIT_SUCCESS; +} + +int main(int argc, char **argv) +{ + int ret = EXIT_FAILURE, status; + pid_t pid; + int *cpid; + + test_init(argc, argv); + + cpid = mmap(NULL, 
sizeof(int), PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_SHARED, -1, 0); + if (cpid == MAP_FAILED) { + pr_perror("mmap"); + return EXIT_FAILURE; + } + *cpid = 0; + + pid = fork(); + if (pid < 0) { + pr_perror("Failed to fork zombie"); + goto out; + } + + if (pid == 0) + exit(zombie_leader(cpid)); + + if (waitpid(pid, &status, 0) < 0) { + pr_perror("Failed to waitpid zombie"); + goto out; + } + + if (!WIFEXITED(status) || WEXITSTATUS(status) != EXIT_SUCCESS) { + pr_err("Unexpected exit code: %d\n", WEXITSTATUS(status)); + goto out; + } + + if (!*cpid) { + pr_err("Don't know grandchild's pid\n"); + goto out; + } + + test_daemon(); + test_waitsig(); + + ret = EXIT_SUCCESS; + pass(); +out: + /* Clean up */ + if (*cpid) + kill(*cpid, SIGKILL); + + munmap(cpid, sizeof(int)); + + return ret; +} diff --git a/test/zdtm/static/sk-unix-restore-fs-share.desc b/test/zdtm/static/sk-unix-restore-fs-share.desc new file mode 100644 index 0000000000..6c4afe5f03 --- /dev/null +++ b/test/zdtm/static/sk-unix-restore-fs-share.desc @@ -0,0 +1 @@ +{'flavor': 'ns uns'} From b2bed94d962718d78f0260b998bf52465bb50699 Mon Sep 17 00:00:00 2001 From: Shashank Balaji Date: Mon, 18 Aug 2025 11:00:07 +0900 Subject: [PATCH 188/198] vma: introduce VMA_AREA_UPROBES flag This flag will be used for a "[uprobes]" vma. Signed-off-by: Shashank Balaji --- criu/include/image.h | 7 +++++++ criu/util.c | 1 + 2 files changed, 8 insertions(+) diff --git a/criu/include/image.h b/criu/include/image.h index 934f7d4e97..b5951d3d49 100644 --- a/criu/include/image.h +++ b/criu/include/image.h @@ -74,6 +74,12 @@ * about virtual address space ranges covered by * MADV_GUARD_INSTALL guards. These ones must be always at * the end of the vma_area_list and properly skipped a.e. + * - uprobes + * stands for a "[uprobes]" vma that's automatically mapped by + * the kernel when an active uprobe is hit. 
Contents of this vma + * are not dumped and neither are its madvise bits restored, + * because the kernel is in complete control of this vma. This is + * just used to track the existence of the uprobes vma. */ #define VMA_AREA_NONE (0 << 0) #define VMA_AREA_REGULAR (1 << 0) @@ -94,6 +100,7 @@ #define VMA_AREA_MEMFD (1 << 14) #define VMA_AREA_SHSTK (1 << 15) #define VMA_AREA_GUARD (1 << 16) +#define VMA_AREA_UPROBES (1 << 17) #define VMA_EXT_PLUGIN (1 << 27) #define VMA_CLOSE (1 << 28) diff --git a/criu/util.c b/criu/util.c index 58c18e20be..e2f80e4c61 100644 --- a/criu/util.c +++ b/criu/util.c @@ -195,6 +195,7 @@ static void vma_opt_str(const struct vma_area *v, char *opt) opt2s(VMA_ANON_PRIVATE, "ap"); opt2s(VMA_AREA_SYSVIPC, "sysv"); opt2s(VMA_AREA_SOCKET, "sk"); + opt2s(VMA_AREA_UPROBES, "uprobes"); #undef opt2s } From fc8ed5bb059d66ff2af984da8098fd6b602e3be8 Mon Sep 17 00:00:00 2001 From: Shashank Balaji Date: Mon, 18 Aug 2025 10:53:18 +0900 Subject: [PATCH 189/198] criu-coredump: add VMA_AREA_UPROBES flag Signed-off-by: Shashank Balaji --- coredump/criu_coredump/coredump.py | 1 + 1 file changed, 1 insertion(+) diff --git a/coredump/criu_coredump/coredump.py b/coredump/criu_coredump/coredump.py index c6a758c8ad..9454d8f0bb 100644 --- a/coredump/criu_coredump/coredump.py +++ b/coredump/criu_coredump/coredump.py @@ -55,6 +55,7 @@ "VMA_AREA_VVAR": 1 << 12, "VMA_AREA_AIORING": 1 << 13, "VMA_AREA_MEMFD": 1 << 14, + "VMA_AREA_UPROBES": 1 << 17, "VMA_AREA_UNSUPP": 1 << 31 } From beba2696345c5e5090f3fec4ac55c3e0d26bffbc Mon Sep 17 00:00:00 2001 From: Shashank Balaji Date: Mon, 18 Aug 2025 10:54:28 +0900 Subject: [PATCH 190/198] crit: add VMA_AREA_UPROBES flag Signed-off-by: Shashank Balaji --- lib/pycriu/images/pb2dict.py | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/pycriu/images/pb2dict.py b/lib/pycriu/images/pb2dict.py index 6c4f688896..a35dd3c3fc 100644 --- a/lib/pycriu/images/pb2dict.py +++ b/lib/pycriu/images/pb2dict.py @@ -105,6 +105,7 @@ def 
_custom_conv(field): ('VMA_AREA_AIORING', 1 << 13), ('VMA_AREA_MEMFD', 1 << 14), ('VMA_AREA_SHSTK', 1 << 15), + ('VMA_AREA_UPROBES', 1 << 17), ('VMA_UNSUPP', 1 << 31), ] From 6c25e56a5710b8503614626e5132ce46eca1177d Mon Sep 17 00:00:00 2001 From: "Mahadasyam, Shashank (SGC)" Date: Mon, 18 Aug 2025 01:03:39 +0900 Subject: [PATCH 191/198] vma: introduce --allow-uprobes option This commit teaches criu to deal with processes which have a "[uprobes]" vma. This vma is mapped by the kernel when execution hits a uprobe location. This is done so as to execute the uprobe'd instruction out-of-line in the special vma. The uprobe'd location is replaced by a software breakpoint instruction, which is int3 on x86. When execution reaches that location, control is transferred over to the kernel, which then executes whatever handler code it has to, for the uprobe, and then executes the replaced instruction out-of-line in the special vma. For more details, refer to this commit: https://github.com/torvalds/linux/commit/d4b3b6384f98f8692ad0209891ccdbc7e78bbefe Reason for adding a new option ------------------------------ A new option is added instead of making the uprobes vma handling transparent to the user, so that when a dump is attempted on a process tree in which a process has the uprobes vma, criu will error, asking the user to use this option. This gives the user a chance to check what uprobes are attached to the processes being dumped, and try to ensure that those uprobes are active on restore as well. Again, the same reason for requiring this option on restore as well. Because if a process is dumped with an active uprobe, and on restore if the uprobe is not active, then if execution reaches the uprobe location, then the process will be sent a SIGTRAP, whose default behaviour will terminate and core dump the process. This is because the code pages are dumped with the software breakpoint instruction replacement at the uprobe'd locations.
On restore, if execution reaches these locations and the kernel sees no associated active uprobes, then it'll send a SIGTRAP. So, using this option on dump and restore is an implicit guarantee on the user's behalf that they'll take care of the active uprobes and that any future SIGTRAPs because of this are not on us! :) Handling uprobes vma on dump ---------------------------- We don't need to store any information about the uprobes vma because it's completely handled by the kernel, transparent to userspace. So, when a uprobes vma is detected, we check if the --allow-uprobes option was specified or not. If so, then the allow_uprobes boolean in the inventory image is set (this is used on restore). The uprobes vma is skipped from being added to the vma list. Handling uprobes vma on restore ------------------------------- If allow_uprobes is set in the inventory image, then check if --allow-uprobes is specified or not. Restoring the vma is not required. Fixes: checkpoint-restore#1961 Signed-off-by: Shashank Balaji --- criu/config.c | 2 ++ criu/cr-dump.c | 4 ++++ criu/crtools.c | 2 ++ criu/image.c | 5 +++++ criu/include/cr_options.h | 1 + criu/include/image.h | 2 ++ criu/include/proc_parse.h | 2 ++ criu/proc_parse.c | 24 +++++++++++++++++++++++- images/inventory.proto | 1 + 9 files changed, 42 insertions(+), 1 deletion(-) diff --git a/criu/config.c b/criu/config.c index 1322a490ab..d7ef3f8e8b 100644 --- a/criu/config.c +++ b/criu/config.c @@ -18,6 +18,7 @@ #include "cr_options.h" #include "filesystems.h" #include "file-lock.h" +#include "image.h" #include "irmap.h" #include "mount.h" #include "mount-v2.h" @@ -703,6 +704,7 @@ int parse_options(int argc, char **argv, bool *usage_error, bool *has_exec_cmd, BOOL_OPT("mntns-compat-mode", &opts.mntns_compat_mode), BOOL_OPT("unprivileged", &opts.unprivileged), BOOL_OPT("ghost-fiemap", &opts.ghost_fiemap), + BOOL_OPT(OPT_ALLOW_UPROBES, &opts.allow_uprobes), {}, }; diff --git a/criu/cr-dump.c b/criu/cr-dump.c index
10c485cbe9..60b8e793c9 100644 --- a/criu/cr-dump.c +++ b/criu/cr-dump.c @@ -2319,6 +2319,10 @@ int cr_dump_tasks(pid_t pid) goto err; he.has_pre_dump_mode = false; + if (found_uprobes_vma()) { + he.has_allow_uprobes = true; + he.allow_uprobes = true; + } ret = write_img_inventory(&he); if (ret) diff --git a/criu/crtools.c b/criu/crtools.c index 509e73d741..203bded811 100644 --- a/criu/crtools.c +++ b/criu/crtools.c @@ -427,6 +427,8 @@ int main(int argc, char *argv[], char *envp[]) " can be 'nftables' or 'iptables' (default).\n" " --unprivileged accept limitations when running as non-root\n" " consult documentation for further details\n" + " --allow-uprobes allow dump/restore with uprobes vma\n" + " consult documentation for further details\n" "\n" "* External resources support:\n" " --external RES dump objects from this list as external resources:\n" diff --git a/criu/image.c b/criu/image.c index f3747d6ff5..c4f05e1597 100644 --- a/criu/image.c +++ b/criu/image.c @@ -95,6 +95,11 @@ int check_img_inventory(bool restore) goto out_err; } + if (restore && he->allow_uprobes && !opts.allow_uprobes) { + pr_err("Dumped with --" OPT_ALLOW_UPROBES ". 
Need to set it on restore as well.\n"); + goto out_err; + } + if (restore) { if (!he->has_network_lock_method) { /* diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h index 4df8056b7b..8c5707b415 100644 --- a/criu/include/cr_options.h +++ b/criu/include/cr_options.h @@ -196,6 +196,7 @@ struct cr_options { char *work_dir; int network_lock_method; int skip_file_rwx_check; + int allow_uprobes; /* * When we scheduler for removal some functionality we first diff --git a/criu/include/image.h b/criu/include/image.h index b5951d3d49..b06dbf7062 100644 --- a/criu/include/image.h +++ b/criu/include/image.h @@ -114,6 +114,8 @@ #define CR_PARENT_LINK "parent" +#define OPT_ALLOW_UPROBES "allow-uprobes" + extern bool ns_per_id; extern bool img_common_magic; diff --git a/criu/include/proc_parse.h b/criu/include/proc_parse.h index 0bd79bf553..76d3242d2b 100644 --- a/criu/include/proc_parse.h +++ b/criu/include/proc_parse.h @@ -105,4 +105,6 @@ extern int parse_uptime(uint64_t *upt); extern int parse_timens_offsets(struct timespec *boff, struct timespec *moff); +extern bool found_uprobes_vma(void); + #endif /* __CR_PROC_PARSE_H__ */ diff --git a/criu/proc_parse.c b/criu/proc_parse.c index bb642648e9..f51f2e801e 100644 --- a/criu/proc_parse.c +++ b/criu/proc_parse.c @@ -74,6 +74,8 @@ struct buffer { static struct buffer __buf; static char *buf = __buf.buf; +/* only ever goes from false to true, if at all */ +static bool uprobes_vma_exists = false; /* * This is how AIO ring buffers look like in proc @@ -202,8 +204,11 @@ static void parse_vma_vmflags(char *buf, struct vma_area *vma_area) * vmsplice doesn't work for VM_IO and VM_PFNMAP mappings, the * only exception is VVAR area that mapped by the kernel as * VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP + * + * The uprobes vma is also mapped by the kernel with VM_IO, among other flags */ - if (io_pf && !vma_area_is(vma_area, VMA_AREA_VVAR) && !vma_entry_is(vma_area->e, VMA_FILE_SHARED)) + if (io_pf && 
!vma_area_is(vma_area, VMA_AREA_VVAR) && !vma_entry_is(vma_area->e, VMA_FILE_SHARED) + && !vma_area_is(vma_area, VMA_AREA_UPROBES)) vma_area->e->status |= VMA_UNSUPP; if (vma_area->e->madv) @@ -603,6 +608,14 @@ static int handle_vma(pid_t pid, struct vma_area *vma_area, const char *file_pat goto err; } else if (!strcmp(file_path, "[heap]")) { vma_area->e->status |= VMA_AREA_REGULAR | VMA_AREA_HEAP; + } else if (!strcmp(file_path, "[uprobes]")) { + uprobes_vma_exists = true; + if (!opts.allow_uprobes) { + pr_err("PID %d has uprobes vma. Consider using --" OPT_ALLOW_UPROBES ".\n", + pid); + goto err; + } + vma_area->e->status |= VMA_AREA_UPROBES; } else { vma_area->e->status = VMA_AREA_REGULAR; } @@ -739,6 +752,10 @@ static int vma_list_add(struct vma_area *vma_area, struct vm_area_list *vma_area */ pr_debug("Device file mapping %016" PRIx64 "-%016" PRIx64 " supported via device plugins\n", vma_area->e->start, vma_area->e->end); + } else if (vma_area->e->status & VMA_AREA_UPROBES) { + pr_debug("Skipping uprobes vma %016" PRIx64 "-%016" PRIx64 "\n", vma_area->e->start, + vma_area->e->end); + return 0; } else if (vma_area->e->status & VMA_UNSUPP) { pr_err("Unsupported mapping found %016" PRIx64 "-%016" PRIx64 "\n", vma_area->e->start, vma_area->e->end); @@ -2929,3 +2946,8 @@ int parse_uptime(uint64_t *upt) fclose(f); return 0; } + +bool found_uprobes_vma(void) +{ + return uprobes_vma_exists; +} diff --git a/images/inventory.proto b/images/inventory.proto index 1e18815bb9..feed5b8509 100644 --- a/images/inventory.proto +++ b/images/inventory.proto @@ -33,4 +33,5 @@ message inventory_entry { // This is currently used to delete the correct nftables // network locking rule. 
optional string dump_criu_run_id = 13; + optional bool allow_uprobes = 14; } From 61dddfe61cdb37f7d87ea5978a6251f62aa0e1d6 Mon Sep 17 00:00:00 2001 From: "Mahadasyam, Shashank (SGC)" Date: Mon, 18 Aug 2025 01:04:10 +0900 Subject: [PATCH 192/198] docs: add documentation for --allow-uprobes Signed-off-by: Shashank Balaji --- Documentation/criu.txt | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/Documentation/criu.txt b/Documentation/criu.txt index 606935790b..40ede84e25 100644 --- a/Documentation/criu.txt +++ b/Documentation/criu.txt @@ -465,6 +465,30 @@ The 'mode' may be one of the following: *skip*::: Don't lock the network. If *--tcp-close* is not used, the network must be locked externally to allow CRIU to dump TCP connections. +*--allow-uprobes*:: + Allow dumping when uprobes vma is present. When used on dump, this option is + required on restore as well. + + A uprobes vma is automatically created by the kernel once a uprobe is + triggered. This mapping is not removed even once the uprobe is deleted. So, + even if a process once had uprobes attached to it, and they're removed by + the time the process is dumped, this option is still required because criu + has no way of knowing whether there are active uprobes or not. + + When using this option on restore, make sure the uprobes (if any) active on + the dumped processes are still active. Otherwise, when execution reaches + a uprobe'd location in any of the restored processes, that process will be + sent a SIGTRAP. + + As an example, say a uprobe is set at function foo in the executable of the + process p_bar. Whenever execution in p_bar reaches function foo, the uprobe + is triggered. If the uprobe has been triggered at least once, then the kernel + will have created the uprobes vma. To dump p_bar, this option is + necessary. After dumping, say the uprobe is deleted. 
Now, on restoring with + this option, once execution reaches function foo, SIGTRAP will be sent to + the restored p_bar. Unless it has a signal handler installed for SIGTRAP, + it will be terminated and core dumped. + *restore* ~~~~~~~~~ Restores previously checkpointed processes. @@ -692,6 +716,10 @@ The 'mode' may be one of the following: *--skip-file-rwx-check*:: Skip checking file permissions (r/w/x for u/g/o) on restore. +*--allow-uprobes*:: + Required when dumped with this option. Refer to this option in the section + on dumping for more details. + *check* ~~~~~~~ Checks whether the kernel supports the features needed by *criu* to From f0e6b6dee311f5ce4a972fd9ec185e2accd09440 Mon Sep 17 00:00:00 2001 From: Shashank Balaji Date: Wed, 20 Aug 2025 22:05:03 +0900 Subject: [PATCH 193/198] crtools: remove "consult documentation" Most people know this, don't they? :) Suggested-by: Radostin Stoyanov Signed-off-by: Shashank Balaji --- criu/crtools.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/criu/crtools.c b/criu/crtools.c index 203bded811..e207133ac0 100644 --- a/criu/crtools.c +++ b/criu/crtools.c @@ -426,9 +426,7 @@ int main(int argc, char *argv[], char *envp[]) " --network-lock METHOD network locking/unlocking method; argument\n" " can be 'nftables' or 'iptables' (default).\n" " --unprivileged accept limitations when running as non-root\n" - " consult documentation for further details\n" " --allow-uprobes allow dump/restore with uprobes vma\n" - " consult documentation for further details\n" "\n" "* External resources support:\n" " --external RES dump objects from this list as external resources:\n" From 08fa6c369a82b9945867c9f06fb1a44ff7a023c0 Mon Sep 17 00:00:00 2001 From: Shashank Balaji Date: Fri, 22 Aug 2025 12:47:16 +0900 Subject: [PATCH 194/198] zdtm: add a test for --allow-uprobes option Program flow: - Parse the test's own executable to calculate the file offset of the uprobe target function symbol - Enable the uprobe at the target function - Call 
the target function to trigger the uprobe, and hence the uprobes vma creation - C/R - Call the target function again to check that no SIGTRAP is sent, since the uprobe is still active At least v1.7 of libtracefs is required because that's when tracefs_instance_reset was introduced. The uprobes API was introduced in v1.4, and the dynamic events API was introduced in v1.3. Ubuntu Focal doesn't have libtracefs. Jammy has v1.2.5, and Noble has v1.7. Signed-off-by: Shashank Balaji --- contrib/dependencies/apk-packages.sh | 3 + contrib/dependencies/apt-cross-packages.sh | 5 +- contrib/dependencies/apt-packages.sh | 3 + contrib/dependencies/dnf-packages.sh | 5 +- contrib/dependencies/pacman-packages.sh | 3 + test/zdtm/static/Makefile | 9 +- test/zdtm/static/uprobes.c | 295 +++++++++++++++++++++ test/zdtm/static/uprobes.desc | 6 + 8 files changed, 326 insertions(+), 3 deletions(-) create mode 100644 test/zdtm/static/uprobes.c create mode 100644 test/zdtm/static/uprobes.desc diff --git a/contrib/dependencies/apk-packages.sh b/contrib/dependencies/apk-packages.sh index d02704b15c..c47fb9fe07 100755 --- a/contrib/dependencies/apk-packages.sh +++ b/contrib/dependencies/apk-packages.sh @@ -6,6 +6,7 @@ apk add --no-cache \ build-base \ coreutils \ e2fsprogs \ + elfutils-dev \ git \ gnutls-dev \ go \ @@ -20,6 +21,8 @@ apk add --no-cache \ libdrm-dev \ libnet-dev \ libnl3-dev \ + libtraceevent-dev \ + libtracefs-dev \ nftables \ nftables-dev \ perl \ diff --git a/contrib/dependencies/apt-cross-packages.sh b/contrib/dependencies/apt-cross-packages.sh index 588be40d02..30ce6874c8 100755 --- a/contrib/dependencies/apt-cross-packages.sh +++ b/contrib/dependencies/apt-cross-packages.sh @@ -14,6 +14,8 @@ fi libc6-"${DEBIAN_ARCH}"-cross \ libc6-dev-"${DEBIAN_ARCH}"-cross \ libcap-dev:"${DEBIAN_ARCH}" \ + libdrm-dev:"${DEBIAN_ARCH}" \ + libelf-dev:"${DEBIAN_ARCH}" \ libexpat1-dev:"${DEBIAN_ARCH}" \ libgnutls28-dev:"${DEBIAN_ARCH}" \ libnet-dev:"${DEBIAN_ARCH}" \ @@ -23,9 +25,10 @@ fi 
libprotobuf-c-dev:"${DEBIAN_ARCH}" \ libprotobuf-dev:"${DEBIAN_ARCH}" \ libssl-dev:"${DEBIAN_ARCH}" \ + libtraceevent-dev:"${DEBIAN_ARCH}" \ + libtracefs-dev:"${DEBIAN_ARCH}" \ ncurses-dev:"${DEBIAN_ARCH}" \ uuid-dev:"${DEBIAN_ARCH}" \ - libdrm-dev:"${DEBIAN_ARCH}" \ build-essential \ pkg-config \ git \ diff --git a/contrib/dependencies/apt-packages.sh b/contrib/dependencies/apt-packages.sh index c60ba9041c..1fd42d4e68 100755 --- a/contrib/dependencies/apt-packages.sh +++ b/contrib/dependencies/apt-packages.sh @@ -19,6 +19,7 @@ fi libbsd-dev \ libcap-dev \ libdrm-dev \ + libelf-dev \ libgnutls28-dev \ libgnutls30 \ libnet-dev \ @@ -28,6 +29,8 @@ fi libprotobuf-c-dev \ libprotobuf-dev \ libselinux-dev \ + libtraceevent-dev \ + libtracefs-dev \ pkg-config \ protobuf-c-compiler \ protobuf-compiler \ diff --git a/contrib/dependencies/dnf-packages.sh b/contrib/dependencies/dnf-packages.sh index efbb659c54..00dc91a2e8 100755 --- a/contrib/dependencies/dnf-packages.sh +++ b/contrib/dependencies/dnf-packages.sh @@ -3,6 +3,7 @@ dnf install -y \ asciidoc \ binutils \ + elfutils-libelf-devel \ gcc \ git \ glibc-devel \ @@ -18,6 +19,8 @@ dnf install -y \ libnet-devel \ libnl3-devel \ libselinux-devel \ + libtraceevent-devel \ + libtracefs-devel \ libuuid-devel \ make \ nftables \ @@ -27,9 +30,9 @@ dnf install -y \ protobuf-c-devel \ protobuf-compiler \ protobuf-devel \ - python-devel \ python3-importlib-metadata \ python3-protobuf \ python3-pyyaml \ + python-devel \ rubygem-asciidoctor \ xmlto diff --git a/contrib/dependencies/pacman-packages.sh b/contrib/dependencies/pacman-packages.sh index 5fe6995fb9..260797606b 100755 --- a/contrib/dependencies/pacman-packages.sh +++ b/contrib/dependencies/pacman-packages.sh @@ -15,8 +15,11 @@ pacman -Syu --noconfirm \ libbsd \ libcap \ libdrm \ + libelf \ libnet \ libnl \ + libtraceevent \ + libtracefs \ nftables \ pkg-config \ protobuf \ diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index 6b262c4439..ea901a805d 
100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -290,6 +290,7 @@ TST_NOFILE := \ PKG_CONFIG ?= pkg-config pkg-config-check = $(shell sh -c '$(PKG_CONFIG) $(1) && echo y') +pkg-config-atleast-version = $(shell sh -c '$(PKG_CONFIG) --atleast-version=$(2) $(1) && echo y') ifeq ($(call pkg-config-check,libbpf),y) TST_NOFILE += \ bpf_hash \ @@ -298,7 +299,10 @@ endif ifneq ($(ARCH),arm) ifneq ($(COMPAT_TEST),y) - TST_NOFILE += maps03 + TST_NOFILE += maps03 +ifeq ($(call pkg-config-atleast-version,libtracefs,1.7),y) + TST_NOFILE += uprobes +endif endif endif @@ -727,6 +731,9 @@ sk-unix-listen04: CFLAGS += -DSK_UNIX_LISTEN02 -DSK_UNIX_LISTEN03 cgroupv2_01: LDLIBS += -pthread +uprobes: CFLAGS += $(call pkg-cflags, libtracefs libtraceevent) +uprobes: LDLIBS += $(call pkg-libs, libtracefs libelf) + $(LIB): force $(Q) $(MAKE) -C $(LIBDIR) diff --git a/test/zdtm/static/uprobes.c b/test/zdtm/static/uprobes.c new file mode 100644 index 0000000000..4164375b7a --- /dev/null +++ b/test/zdtm/static/uprobes.c @@ -0,0 +1,295 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Test the --allow-uprobes option"; +const char *test_author = "Shashank Balaji "; + +#define UPROBE_GROUP_NAME "zdtm" +#define UPROBE_EVENT_NAME "uprobes_test" +#define UPROBED_FUNCTION uprobe_target + +/* + * A uprobe can be set at the start of a function, but not all instructions + * will trigger the creation of a uprobes vma. 
+ * + * Examples: + * - aarch64: if the function is a single `ret`, then no vma creation + * - x64: if the function is `nop; ret`, then no vma creation + * + * So to guarantee vma creation, create a volatile dummy variable (to prevent + * compiler optimization) and use it (to prevent "unused variable" warning) + */ +void UPROBED_FUNCTION(void) { + volatile int dummy = 0; + dummy += 1; +} +/* Calling via volatile function pointer ensures noinline at callsite */ +typedef void (*func_ptr)(void); +volatile func_ptr uprobe_target_alias = UPROBED_FUNCTION; + +struct uprobe_context { + struct tracefs_instance *instance; + struct tracefs_dynevent *uprobe; +}; + +volatile bool got_sigtrap = false; + +/* + * Returns the file offset of a symbol in the executable of this program + * Returns 0 on failure +*/ +uint64_t calc_sym_offset(const char *sym_name) +{ + GElf_Shdr section_header; + Elf_Scn *section = NULL; + Elf_Data *symtab_data; + uint64_t offset = 0; + char buf[PATH_MAX]; + GElf_Sym symbol; + ssize_t n_bytes; + int n_entries; + Elf *elf; + int fd; + int i; + + if (elf_version(EV_CURRENT) == EV_NONE) { + pr_err("ELF version of libelf is lower than that of the program\n"); + return 0; + } + + n_bytes = readlink("/proc/self/exe", buf, sizeof(buf)); + if (n_bytes < 0) { + pr_perror("Failed to readlink /proc/self/exe"); + return 0; + } + buf[n_bytes] = '\0'; + + fd = open(buf, O_RDONLY); + if (fd < 0) { + pr_perror("Failed to open self-executable"); + return 0; + } + + elf = elf_begin(fd, ELF_C_READ, NULL); + if (!elf) { + pr_err("%s\n", elf_errmsg(elf_errno())); + goto out_fd; + } + + /* Look for the symbol table section and its header */ + while ((section = elf_nextscn(elf, section)) != NULL) { + gelf_getshdr(section, &section_header); + if (section_header.sh_type == SHT_SYMTAB) + break; + } + if (!section) { + pr_err("Failed to find symbol table\n"); + goto out_elf; + } + symtab_data = elf_getdata(section, NULL); + n_entries = section_header.sh_size /
section_header.sh_entsize; + + /* Look for a symbol with the required name */ + for (i = 0; i < n_entries; i++) { + gelf_getsym(symtab_data, i, &symbol); + /* Symbol table's sh_link is the index of the string table section header */ + if (!strcmp(sym_name, + elf_strptr(elf, section_header.sh_link, symbol.st_name))) + break; + } + if (i == n_entries) { + pr_err("Failed to find symbol \"%s\"\n", sym_name); + goto out_elf; + } + + /* Get the section the symbol belongs to (mostly .text) */ + section = elf_getscn(elf, symbol.st_shndx); + gelf_getshdr(section, &section_header); + offset = symbol.st_value - section_header.sh_addr + section_header.sh_offset; + +out_elf: + elf_end(elf); +out_fd: + close(fd); + return offset; +} + +/* + * Set and enable a uprobe on the file at the given offset + * Returns struct uprobe_context with members set to NULL on failure +*/ +struct uprobe_context enable_uprobe(const char *file, uint64_t offset) +{ + struct tracefs_instance *trace_instance; + struct tracefs_dynevent *uprobe; + struct uprobe_context context = {}; + + trace_instance = tracefs_instance_create("zdtm_uprobes_test"); + if (!trace_instance) { + pr_perror("Failed to create tracefs instance"); + return context; + } + tracefs_instance_reset(trace_instance); + + uprobe = tracefs_uprobe_alloc(UPROBE_GROUP_NAME, UPROBE_EVENT_NAME, file, offset, NULL); + if (!uprobe) { + pr_perror("Failed to allocate uprobe"); + goto instance_destroy; + } + + if (tracefs_dynevent_create(uprobe)) { + pr_perror("Failed to create uprobe"); + goto uprobe_free; + } + + if (tracefs_event_enable(trace_instance, UPROBE_GROUP_NAME, UPROBE_EVENT_NAME)) { + pr_perror("Failed to enable uprobe"); + goto uprobe_destroy; + } + + context.instance = trace_instance; + context.uprobe = uprobe; + return context; + +uprobe_destroy: + tracefs_dynevent_destroy(uprobe, false); +uprobe_free: + tracefs_dynevent_free(uprobe); +instance_destroy: + tracefs_instance_destroy(trace_instance); + tracefs_instance_free(trace_instance);
+ return context; +} + +void destroy_uprobe(struct uprobe_context context) +{ + tracefs_dynevent_destroy(context.uprobe, true); + tracefs_dynevent_free(context.uprobe); + tracefs_instance_destroy(context.instance); + tracefs_instance_free(context.instance); +} + +/* + * Check for the existence of the "[uprobes]" vma in /proc/self/maps + * Returns -1 on failure, 0 if not found, 1 if found +*/ +int uprobes_vma_exists(void) +{ + FILE *f; + char buf[LINE_MAX]; + int ret = 0; + + f = fopen("/proc/self/maps", "r"); + if (!f) { + pr_perror("Failed to open /proc/self/maps"); + return -1; + } + + while (fgets(buf, sizeof(buf), f)) { + if (strstr(buf, "[uprobes]")) { + ret = 1; + break; + } + } + if (ret == 0 && !feof(f)) { + pr_err("Failed to finish reading /proc/self/maps\n"); + ret = -1; + } + + fclose(f); + return ret; +} + +/* + * SIGTRAP is sent if execution reaches a previously set uprobed location, and + * the corresponding uprobe is not active. We don't want this to happen on restore +*/ +void sigtrap_handler(int signo, siginfo_t *info, void* context) +{ + if (info->si_code == SI_KERNEL) { + got_sigtrap = true; + fail("SIGTRAP on attempting to call uprobed function"); + } +} + +int main(int argc, char **argv) +{ + struct uprobe_context context; + struct sigaction sa; + char buf[PATH_MAX]; + uint64_t offset; + int n_bytes; + int ret = 1; + + test_init(argc, argv); + + offset = calc_sym_offset(__stringify(UPROBED_FUNCTION)); + if (!offset) + return 1; + + n_bytes = readlink("/proc/self/exe", buf, sizeof(buf)); + if (n_bytes < 0) { + pr_perror("Failed to readlink /proc/self/exe"); + return 1; + } + buf[n_bytes] = '\0'; + + sa.sa_flags = SA_SIGINFO; + sa.sa_sigaction = sigtrap_handler; + sigemptyset(&sa.sa_mask); + if (sigaction(SIGTRAP, &sa, NULL)) { + pr_perror("Failed to set SIGTRAP handler"); + return 1; + } + + context = enable_uprobe(buf, offset); + if (!context.instance) + return 1; + + /* + * Execution must reach the uprobed location at least once + * for the 
kernel to create the uprobes vma + */ + uprobe_target_alias(); + + switch (uprobes_vma_exists()) { + case -1: + goto out_uprobe; + break; + case 0: + pr_err("uprobes vma does not exist\n"); + goto out_uprobe; + break; + case 1: + test_msg("Found uprobes vma\n"); + break; + } + + test_daemon(); + test_waitsig(); + + /* + * Calling the uprobed function after restore should not cause + * a SIGTRAP, since the uprobe is still active + */ + uprobe_target_alias(); + if (!got_sigtrap) { + pass(); + ret = 0; + } + +out_uprobe: + destroy_uprobe(context); + return ret; +} diff --git a/test/zdtm/static/uprobes.desc b/test/zdtm/static/uprobes.desc new file mode 100644 index 0000000000..6eab1f4982 --- /dev/null +++ b/test/zdtm/static/uprobes.desc @@ -0,0 +1,6 @@ +{ + 'feature': 'cgroupns', + 'flags': 'suid nouser', + 'flavor': 'h', + 'opts': '--allow-uprobes' +} From 003fe3223a67df4bb9a1c785d4f4a02a6075b30f Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Wed, 10 Sep 2025 10:50:46 +0100 Subject: [PATCH 195/198] cr-service: refactor rpc config parsing When an additional configuration file is specified via RPC, this file is parsed twice: first at an early stage to load options such as --log-file, --work-dir, and --images-dir; and again after all RPC options and configuration files have been evaluated. This allows users to overwrite options specified via RPC by the container runtime (e.g., --tcp-established). However, processing the RPC config file twice leads to silently duplicating the values of repeatable options such as `--action-script`. To address this problem, we adjust the order of options parsing so that the RPC config file is evaluated only once. This change should not introduce any functional changes. Note that this change does not affect the logging functionality, as early log messages are temporarily buffered and only written to the log file once it has been initialized (see commit 1ff2333 "Printout early log messages"). 
Fixes #2727 Suggested-by: Andrei Vagin Signed-off-by: Radostin Stoyanov --- criu/cr-service.c | 299 +++++++++++++++++++++------------------------- 1 file changed, 138 insertions(+), 161 deletions(-) diff --git a/criu/cr-service.c b/criu/cr-service.c index a1089ad5c7..e6aac232e7 100644 --- a/criu/cr-service.c +++ b/criu/cr-service.c @@ -312,156 +312,6 @@ static int setup_opts_from_req(int sk, CriuOpts *req) BUG_ON(st.st_ino == -1); service_sk_ino = st.st_ino; - /* - * Evaluate an additional configuration file if specified. - * This needs to happen twice, because it is needed early to detect - * things like work_dir, imgs_dir and logfile. The second parsing - * of the optional RPC configuration file happens at the end and - * overwrites all options set via RPC. - */ - if (req->config_file) { - char *tmp_output = opts.output; - char *tmp_work = opts.work_dir; - char *tmp_imgs = opts.imgs_dir; - - opts.output = NULL; - opts.work_dir = NULL; - opts.imgs_dir = NULL; - - rpc_cfg_file = req->config_file; - i = parse_options(0, NULL, &dummy, &dummy, PARSING_RPC_CONF); - if (i) { - xfree(tmp_output); - xfree(tmp_work); - xfree(tmp_imgs); - goto err; - } - /* If this is non-NULL, the RPC configuration file had a value, use it.*/ - if (opts.output) - output_changed_by_rpc_conf = true; - /* If this is NULL, use the old value if it was set. */ - if (!opts.output && tmp_output) { - opts.output = tmp_output; - tmp_output = NULL; - } - - if (opts.work_dir) - work_changed_by_rpc_conf = true; - if (!opts.work_dir && tmp_work) { - opts.work_dir = tmp_work; - tmp_work = NULL; - } - - if (opts.imgs_dir) - imgs_changed_by_rpc_conf = true; - /* - * As the images directory is a required RPC setting, it is not - * necessary to use the value from other configuration files. - * Either it is set in the RPC configuration file or it is set - * via RPC. 
- */ - xfree(tmp_output); - xfree(tmp_work); - xfree(tmp_imgs); - } - - /* - * open images_dir - images_dir_fd is a required RPC parameter - * - * This assumes that if opts.imgs_dir is set we have a value - * from the configuration file parser. The test to see that - * imgs_changed_by_rpc_conf is true is used to make sure the value - * is from the RPC configuration file. - * The idea is that only the RPC configuration file is able to - * overwrite RPC settings: - * * apply_config(global_conf) - * * apply_config(user_conf) - * * apply_config(environment variable) - * * apply_rpc_options() - * * apply_config(rpc_conf) - */ - if (imgs_changed_by_rpc_conf) - strncpy(images_dir_path, opts.imgs_dir, PATH_MAX - 1); - else if (req->images_dir_fd != -1) - sprintf(images_dir_path, "/proc/%d/fd/%d", ids.pid, req->images_dir_fd); - else if (req->images_dir) - strncpy(images_dir_path, req->images_dir, PATH_MAX - 1); - else { - pr_err("Neither images_dir_fd nor images_dir was passed by RPC client.\n"); - goto err; - } - - if (req->parent_img) - SET_CHAR_OPTS(img_parent, req->parent_img); - - /* - * Image streaming is not supported with CRIU's service feature as - * the streamer must be started for each dump/restore operation. - * It is unclear how to do that with RPC, so we punt for now. - * This explains why we provide the argument mode=-1 instead of - * O_RSTR or O_DUMP. - */ - if (open_image_dir(images_dir_path, -1) < 0) { - pr_perror("Can't open images directory"); - goto err; - } - - /* get full path to images_dir to use in process title */ - if (readlink(images_dir_path, images_dir, PATH_MAX) == -1) { - pr_perror("Can't readlink %s", images_dir_path); - goto err; - } - - /* chdir to work dir */ - if (work_changed_by_rpc_conf) - /* Use the value from the RPC configuration file first. */ - strncpy(work_dir_path, opts.work_dir, PATH_MAX - 1); - else if (req->has_work_dir_fd) - /* Use the value set via RPC. 
*/ - sprintf(work_dir_path, "/proc/%d/fd/%d", ids.pid, req->work_dir_fd); - else if (opts.work_dir) - /* Use the value from one of the other configuration files. */ - strncpy(work_dir_path, opts.work_dir, PATH_MAX - 1); - else - /* Use the images directory a work directory. */ - strcpy(work_dir_path, images_dir_path); - - if (chdir(work_dir_path)) { - pr_perror("Can't chdir to work_dir"); - goto err; - } - - /* initiate log file in work dir */ - if (req->log_file && !output_changed_by_rpc_conf) { - /* - * If RPC sets a log file and if there nothing from the - * RPC configuration file, use the RPC value. - */ - if (strchr(req->log_file, '/')) { - pr_perror("No subdirs are allowed in log_file name"); - goto err; - } - - SET_CHAR_OPTS(output, req->log_file); - } else if (req->has_log_to_stderr && req->log_to_stderr && !output_changed_by_rpc_conf) { - xfree(opts.output); - opts.output = NULL; - } else if (!opts.output) { - SET_CHAR_OPTS(output, DEFAULT_LOG_FILENAME); - } - - /* This is needed later to correctly set the log_level */ - opts.log_level = req->log_level; - log_set_loglevel(req->log_level); - if (log_init(opts.output) == -1) { - pr_perror("Can't initiate log"); - goto err; - } - - if (req->config_file) { - pr_debug("Would overwrite RPC settings with values from %s\n", req->config_file); - } - if (req->has_unprivileged) opts.unprivileged = req->unprivileged; @@ -753,14 +603,6 @@ static int setup_opts_from_req(int sk, CriuOpts *req) if (req->empty_ns & ~(CLONE_NEWNET)) goto err; } - - if (req->n_irmap_scan_paths) { - for (i = 0; i < req->n_irmap_scan_paths; i++) { - if (irmap_scan_path_add(req->irmap_scan_paths[i])) - goto err; - } - } - if (req->has_status_fd) { pr_warn("status_fd is obsoleted; use status-ready notification instead\n"); @@ -781,13 +623,148 @@ static int setup_opts_from_req(int sk, CriuOpts *req) if (req->has_display_stats) opts.display_stats = req->display_stats; - /* Evaluate additional configuration file a second time to overwrite - * all 
RPC settings. */ + /* Evaluate additional configuration file (e.g., runc.conf) to overwrite all RPC settings. */ if (req->config_file) { + char *tmp_output = opts.output; + char *tmp_work = opts.work_dir; + + opts.output = NULL; + opts.work_dir = NULL; + + /* + * As the images directory is a required RPC setting, it is not + * necessary to use the value from other configuration files. + * Either it is set in the RPC configuration file or it is set + * via RPC. + */ + xfree(opts.imgs_dir); + opts.imgs_dir = NULL; + + pr_debug("Would overwrite RPC settings with values from %s\n", req->config_file); + rpc_cfg_file = req->config_file; i = parse_options(0, NULL, &dummy, &dummy, PARSING_RPC_CONF); - if (i) + if (i) { + xfree(tmp_output); + xfree(tmp_work); goto err; + } + + /* If opts.{output,work_dir} is non-NULL, the RPC configuration file had a value, use it.*/ + /* If opts.{output,work_dir} is NULL, use the old value if it was set. */ + if (opts.output) { + output_changed_by_rpc_conf = true; + } else { + opts.output = tmp_output; + tmp_output = NULL; + } + + if (opts.work_dir) { + work_changed_by_rpc_conf = true; + } else { + opts.work_dir = tmp_work; + tmp_work = NULL; + } + + if (opts.imgs_dir) + imgs_changed_by_rpc_conf = true; + + xfree(tmp_output); + xfree(tmp_work); + } + + /* + * open images_dir - images_dir_fd is a required RPC parameter + * + * This assumes that if opts.imgs_dir is set we have a value + * from the configuration file parser. The test to see that + * imgs_changed_by_rpc_conf is true is used to make sure the value + * is from the RPC configuration file. 
The idea is that only the + * RPC configuration file is able to overwrite RPC settings: + * * apply_config(global_conf) + * * apply_config(user_conf) + * * apply_config(environment variable) + * * apply_rpc_options() + * * apply_config(rpc_conf) + */ + if (imgs_changed_by_rpc_conf) { + strncpy(images_dir_path, opts.imgs_dir, PATH_MAX - 1); + } else if (req->images_dir_fd != -1) { + sprintf(images_dir_path, "/proc/%d/fd/%d", ids.pid, req->images_dir_fd); + } else if (req->images_dir) { + strncpy(images_dir_path, req->images_dir, PATH_MAX - 1); + } else { + pr_err("Neither images_dir_fd nor images_dir was passed by RPC client.\n"); + goto err; + } + + if (req->parent_img) + SET_CHAR_OPTS(img_parent, req->parent_img); + + /* + * Image streaming is not supported with CRIU's service feature as + * the streamer must be started for each dump/restore operation. + * It is unclear how to do that with RPC, so we punt for now. + * This explains why we provide the argument mode=-1 instead of + * O_RSTR or O_DUMP. 
+ */ + if (open_image_dir(images_dir_path, -1) < 0) { + pr_perror("Can't open images directory"); + goto err; + } + + /* get full path to images_dir to use in process title */ + if (readlink(images_dir_path, images_dir, PATH_MAX) == -1) { + pr_perror("Can't readlink %s", images_dir_path); + goto err; + } + + if (work_changed_by_rpc_conf) + strncpy(work_dir_path, opts.work_dir, PATH_MAX - 1); + else if (req->has_work_dir_fd) + sprintf(work_dir_path, "/proc/%d/fd/%d", ids.pid, req->work_dir_fd); + else if (opts.work_dir) + strncpy(work_dir_path, opts.work_dir, PATH_MAX - 1); + else + strcpy(work_dir_path, images_dir_path); + + if (chdir(work_dir_path)) { + pr_perror("Can't chdir to work_dir"); + goto err; + } + + if (req->n_irmap_scan_paths) { + for (i = 0; i < req->n_irmap_scan_paths; i++) { + if (irmap_scan_path_add(req->irmap_scan_paths[i])) + goto err; + } + } + + /* initiate log file in work dir */ + if (req->log_file && !output_changed_by_rpc_conf) { + /* + * If RPC sets a log file and if there nothing from the + * RPC configuration file, use the RPC value. 
+ */ + if (strchr(req->log_file, '/')) { + pr_perror("No subdirs are allowed in log_file name"); + goto err; + } + + SET_CHAR_OPTS(output, req->log_file); + } else if (req->has_log_to_stderr && req->log_to_stderr && !output_changed_by_rpc_conf) { + xfree(opts.output); + opts.output = NULL; + } else if (!opts.output) { + SET_CHAR_OPTS(output, DEFAULT_LOG_FILENAME); + } + + /* This is needed later to correctly set the log_level */ + opts.log_level = req->log_level; + log_set_loglevel(req->log_level); + if (log_init(opts.output) == -1) { + pr_perror("Can't initiate log"); + goto err; } if (req->mntns_compat_mode) From 3dde8338e9292d832f1d26e1718c609e45998df0 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Wed, 3 Sep 2025 18:29:34 +0100 Subject: [PATCH 196/198] test/others/rpc: parse action-script via config Extend the test for overwriting config options via RPC with repeatable option (--action-script) and verify that the value will not be silently duplicated. Signed-off-by: Radostin Stoyanov --- test/others/rpc/Makefile | 1 + test/others/rpc/action-script.sh | 17 +++++++++++++++++ test/others/rpc/config_file.py | 11 +++++++++++ 3 files changed, 29 insertions(+) create mode 100755 test/others/rpc/action-script.sh diff --git a/test/others/rpc/Makefile b/test/others/rpc/Makefile index 384eb05397..c0e56d5289 100644 --- a/test/others/rpc/Makefile +++ b/test/others/rpc/Makefile @@ -12,6 +12,7 @@ run: all chmod a+rwx build chmod a+rwx build/{imgs_errno,imgs_ps,imgs_c,imgs_loop,imgs_py} rm -f build/status + rm -f build/_marker_* @# Create all log files to be accessible for anybody @# so that they can be displayed by any user. 
for i in imgs_errno/criu.log imgs_ps/page-server.log imgs_ps/dump.log \ diff --git a/test/others/rpc/action-script.sh b/test/others/rpc/action-script.sh new file mode 100755 index 0000000000..991e315de4 --- /dev/null +++ b/test/others/rpc/action-script.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +MARKER_FILE="_marker_${CRTOOLS_SCRIPT_ACTION}" + +if [ -z "$CRTOOLS_SCRIPT_ACTION" ]; then + echo "Error: CRTOOLS_SCRIPT_ACTION is not set." + exit 2 +fi + +if [ ! -f "$MARKER_FILE" ]; then + touch "$MARKER_FILE" +else + echo "Error: Running the same action hook for the second time" + exit 1 +fi + +exit 0 diff --git a/test/others/rpc/config_file.py b/test/others/rpc/config_file.py index 6cffe270d0..f5ec408187 100755 --- a/test/others/rpc/config_file.py +++ b/test/others/rpc/config_file.py @@ -13,6 +13,9 @@ log_file = 'config_file_test.log' does_not_exist = 'does-not.exist' +script_path = os.path.dirname(os.path.abspath(__file__)) +action_script_file = os.path.join(script_path, 'action-script.sh') + def setup_config_file(content): # Creating a temporary file which will be used as configuration file. @@ -156,6 +159,7 @@ def test_rpc_with_configuration_file_overwriting_rpc(): # file settings in the default configuration. 
log = does_not_exist content = 'log-file ' + log + '\n' + content += 'action-script ' + action_script_file + '\n' content += 'no-tcp-established\nno-shell-job' path = setup_config_file(content) # Only set the configuration file via RPC; @@ -180,11 +184,18 @@ def test_rpc_with_configuration_file_overwriting_rpc(): cleanup_output(args['dir']) +print("*** Test broken config file ***") test_broken_configuration_file() cleanup_output(args['dir']) + +print("*** Test RPC without config file ***") test_rpc_without_configuration_file() cleanup_output(args['dir']) + +print("*** Test RPC with config file ***") test_rpc_with_configuration_file() cleanup_output(args['dir']) + +print("*** Test configuration file overwriting RPC ***") test_rpc_with_configuration_file_overwriting_rpc() cleanup_output(args['dir']) From c14c2ae842ebe6c15e635e5ed6ff2b65cdd229d8 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Wed, 3 Sep 2025 21:40:02 +0100 Subject: [PATCH 197/198] test/others/rpc: show logs on error Signed-off-by: Radostin Stoyanov --- test/others/rpc/config_file.py | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/test/others/rpc/config_file.py b/test/others/rpc/config_file.py index f5ec408187..c1a8276d86 100755 --- a/test/others/rpc/config_file.py +++ b/test/others/rpc/config_file.py @@ -92,29 +92,37 @@ def test_broken_configuration_file(): sys.exit(-1) -def search_in_log_file(log, message): - with open(os.path.join(args['dir'], log)) as f: +def search_in_log_file(log_path, message): + with open(log_path) as f: if message not in f.read(): - print( - 'FAIL: Missing the expected error message (%s) in the log file' - % message) + print('FAIL: Missing the expected error message (%s) in the log file' % message) sys.exit(-1) +def print_log_file(log_path): + print("\n--- Begin log file: %s ---" % log_path) + with open(log_path, 'r') as f: + print(f.read()) + print("--- End log file ---\n") + + def check_results(resp, log): # Check if the 
specified log file exists - if not os.path.isfile(os.path.join(args['dir'], log)): + log_path = os.path.join(args['dir'], log) + if not os.path.isfile(log_path): print('FAIL: Expected log file %s does not exist' % log) sys.exit(-1) # Dump should have failed with: 'The criu itself is within dumped tree' if resp.type != rpc.DUMP: print('FAIL: Unexpected msg type %r' % resp.type) + print_log_file(log_path) sys.exit(-1) if 'The criu itself is within dumped tree' not in resp.cr_errmsg: print('FAIL: Missing the expected error message in RPC response') + print_log_file(log_path) sys.exit(-1) # Look into the log file for the same message - search_in_log_file(log, 'The criu itself is within dumped tree') + search_in_log_file(log_path, 'The criu itself is within dumped tree') def test_rpc_without_configuration_file(): From 4faffd1198a52391c562d21d72ac82f1dfa9062f Mon Sep 17 00:00:00 2001 From: tohanov <5686071+tohanov@users.noreply.github.com> Date: Wed, 15 Oct 2025 15:02:50 +0300 Subject: [PATCH 198/198] cr-restore: Fix log message typo Changed 'do' to 'does' in a log message. Signed-off-by: :| --- criu/cr-restore.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/criu/cr-restore.c b/criu/cr-restore.c index 1c3b364518..172bba4f89 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -1536,7 +1536,7 @@ static int __restore_task_with_children(void *_arg) pid = getpid(); if (vpid(current) != pid) { - pr_err("Pid %d do not match expected %d\n", pid, vpid(current)); + pr_err("Pid %d does not match expected %d\n", pid, vpid(current)); set_task_cr_err(EEXIST); goto err; }