Description: Fix CVE-2023-41914
 Fix filesystem handling race conditions that could lead to an attacker
 taking control of an arbitrary file, or removing entire directories'
 contents.
Author: Tim Wickberg <tim@schedmd.com>
Last-Update: 2023-09-28

diff --git a/src/common/fd.c b/src/common/fd.c
index 421058e56f..bf94e6e52b 100644
--- a/src/common/fd.c
+++ b/src/common/fd.c
@@ -67,6 +67,7 @@ strong_alias(fd_set_nonblocking,slurm_fd_set_nonblocking);
 strong_alias(fd_get_socket_error, slurm_fd_get_socket_error);
 strong_alias(send_fd_over_pipe, slurm_send_fd_over_pipe);
 strong_alias(receive_fd_over_pipe, slurm_receive_fd_over_pipe);
+strong_alias(rmdir_recursive, slurm_rmdir_recursive);
 
 static int fd_get_lock(int fd, int cmd, int type);
 static pid_t fd_test_lock(int fd, int type);
@@ -454,3 +455,99 @@ extern int receive_fd_over_pipe(int socket)
 
 	return fd;
 }
+
+/*
+ * Remove every entry beneath the directory referred to by dirfd.
+ *
+ * Takes ownership of dirfd: on success it is handed to fdopendir() and
+ * released by closedir(); if fdopendir() fails it is closed here. The
+ * caller must not use or close dirfd after this call.
+ *
+ * Returns the number of entries that could not be removed.
+ */
+static int _rmdir_recursive(int dirfd)
+{
+	int rc = 0;
+	DIR *dp;
+	struct dirent *ent;
+
+	if (!(dp = fdopendir(dirfd))) {
+		error("%s: can't open directory: %m", __func__);
+		(void) close(dirfd);
+		return 1;
+	}
+
+	while ((ent = readdir(dp))) {
+		int childfd = -1;
+
+		/* skip special directories */
+		if (!strcmp(ent->d_name, ".") ||
+		    !strcmp(ent->d_name, "..")) {
+			continue;
+		}
+
+		/* try to remove entry, first as a file, then as a directory */
+		if (unlinkat(dirfd, ent->d_name, 0) != -1) {
+			debug("%s: removed file `%s`", __func__, ent->d_name);
+			continue;
+		} else if (unlinkat(dirfd, ent->d_name, AT_REMOVEDIR) != -1) {
+			debug("%s: removed empty directory `%s`",
+			      __func__, ent->d_name);
+			continue;
+		}
+
+		/* removal didn't work. assume it's a non-empty directory */
+		if ((childfd = openat(dirfd, ent->d_name,
+				      (O_DIRECTORY | O_NOFOLLOW))) < 0) {
+			debug("%s: openat() failed for `%s`: %m",
+			      __func__, ent->d_name);
+			rc++;
+			continue;
+		}
+
+		debug("%s: descending into directory `%s`",
+		      __func__, ent->d_name);
+		/* the recursive call consumes childfd; do not close it here */
+		rc += _rmdir_recursive(childfd);
+
+		if (unlinkat(dirfd, ent->d_name, AT_REMOVEDIR) != -1) {
+			debug("%s: removed now-empty directory `%s`",
+			      __func__, ent->d_name);
+		} else {
+			debug("%s: unlinkat() failed for `%s`: %m",
+			      __func__, ent->d_name);
+			rc++;
+		}
+	}
+	/* closedir() also closes the underlying dirfd */
+	closedir(dp);
+
+	return rc;
+}
+
+/*
+ * Recursively remove the contents of the directory at path and, when
+ * remove_top is set, the directory itself. Symlinks are not followed.
+ *
+ * Returns the number of entries that could not be removed, 0 on success.
+ */
+extern int rmdir_recursive(const char *path, bool remove_top)
+{
+	int rc = 0;
+	int dirfd;
+
+	if ((dirfd = open(path, O_DIRECTORY | O_NOFOLLOW)) < 0) {
+		error("%s: could not open %s: %m", __func__, path);
+		return 1;
+	}
+
+	/* _rmdir_recursive() takes ownership of dirfd and closes it */
+	if ((rc = _rmdir_recursive(dirfd)))
+		error("%s: could not completely remove `%s`, %d files left",
+		      __func__, path, rc);
+
+	if (remove_top && (rmdir(path) < 0))
+		rc++;
+
+	return rc;
+}
diff --git a/src/common/fd.h b/src/common/fd.h
index d729f81689..307749542b 100644
--- a/src/common/fd.h
+++ b/src/common/fd.h
@@ -156,4 +156,13 @@ extern char *poll_revents_to_str(const short revents);
 extern void send_fd_over_pipe(int socket, int fd);
 extern int receive_fd_over_pipe(int socket);
 
+/*
+ * Recursively remove everything beneath the directory at path and, if
+ * remove_top is true, the directory itself. Directories are opened with
+ * O_NOFOLLOW so symlinks are unlinked rather than traversed.
+ *
+ * Returns the number of entries it failed to remove, or 0 on success.
+ */
+extern int rmdir_recursive(const char *path, bool remove_top);
+
 #endif /* !_FD_H */
diff --git a/src/common/slurm_xlator.h b/src/common/slurm_xlator.h
index 60705f5796..a3db33bbcc 100644
--- a/src/common/slurm_xlator.h
+++ b/src/common/slurm_xlator.h
@@ -123,6 +123,7 @@
 #define fd_get_socket_error	slurm_fd_get_socket_error
 #define send_fd_over_pipe	slurm_send_fd_over_pipe
 #define receive_fd_over_pipe	slurm_receive_fd_over_pipe
+#define rmdir_recursive		slurm_rmdir_recursive
 
 /* hostlist.[ch] functions */
 #define	hostlist_create_dims	slurm_hostlist_create_dims
diff --git a/src/plugins/acct_gather_profile/hdf5/acct_gather_profile_hdf5.c b/src/plugins/acct_gather_profile/hdf5/acct_gather_profile_hdf5.c
index ea17246bd8..fca1244e1f 100644
--- a/src/plugins/acct_gather_profile/hdf5/acct_gather_profile_hdf5.c
+++ b/src/plugins/acct_gather_profile/hdf5/acct_gather_profile_hdf5.c
@@ -44,6 +44,7 @@
  *  Copyright (C) 2002 The Regents of the University of California.
 \*****************************************************************************/
 
+#include <grp.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <sys/types.h>
@@ -101,6 +102,14 @@ const char plugin_name[] = "AcctGatherProfile hdf5 plugin";
 const char plugin_type[] = "acct_gather_profile/hdf5";
 const uint32_t plugin_version = SLURM_VERSION_NUMBER;
 
+struct priv_state {
+	uid_t	saved_uid;	/* getuid() before privileges were dropped */
+	gid_t	saved_gid;	/* getgid() before privileges were dropped */
+	gid_t * gid_list;	/* getgroups() result when requested; xfree()d on reclaim */
+	int	ngids;		/* group count reported by getgroups(0, NULL) */
+	char	saved_cwd [4096]; /* getcwd() result, "/tmp" if that failed */
+};
+
 typedef struct {
 	char *dir;
 	uint32_t def;
@@ -133,6 +142,88 @@ static table_t *tables = NULL;
 static size_t   tables_max_len = 0;
 static size_t   tables_cur_len = 0;
 
+/* Save current ids/cwd in ps, then as root assume the job's egid/groups
+ * (and euid if do_setuid). If !get_list, pre-initialize ps->gid_list.
+ */
+static int
+_drop_privileges(stepd_step_rec_t *job, bool do_setuid,
+		 struct priv_state *ps, bool get_list)
+{
+	ps->saved_uid = getuid();
+	ps->saved_gid = getgid();
+
+	if (!getcwd (ps->saved_cwd, sizeof (ps->saved_cwd))) {
+		error ("Unable to get current working directory: %m");
+		strlcpy(ps->saved_cwd, "/tmp", sizeof(ps->saved_cwd));
+	}
+
+	ps->ngids = getgroups(0, NULL);	/* size 0: only query the count */
+	if (ps->ngids == -1) {
+		error("%s: getgroups(): %m", __func__);
+		return -1;
+	}
+	if (get_list) {
+		ps->gid_list = (gid_t *) xmalloc(ps->ngids * sizeof(gid_t));
+
+		if (getgroups(ps->ngids, ps->gid_list) == -1) {
+			error("%s: couldn't get %d groups: %m",
+			      __func__, ps->ngids);
+			xfree(ps->gid_list);
+			return -1;
+		}
+	}
+
+	/*
+	 * Not root: nothing to drop, report success with ps filled in
+	 */
+	if (getuid() != (uid_t) 0)
+		return SLURM_SUCCESS;
+
+	if (setegid(job->gid) < 0) {
+		error("setegid: %m");
+		return -1;
+	}
+
+	if (setgroups(job->ngids, job->gids) < 0) {
+		error("setgroups: %m");
+		return -1;
+	}
+
+	if (do_setuid && seteuid(job->uid) < 0) {
+		error("seteuid: %m");
+		return -1;
+	}
+
+	return SLURM_SUCCESS;
+}
+
+static int
+_reclaim_privileges(struct priv_state *ps)
+{
+	int rc = SLURM_SUCCESS;
+
+	/*
+	 * Skip if euid is already saved_uid; else restore euid, egid, groups
+	 */
+	if (geteuid() == ps->saved_uid)
+		goto done;
+	else if (seteuid(ps->saved_uid) < 0) {
+		error("seteuid: %m");
+		rc = -1;
+	} else if (setegid(ps->saved_gid) < 0) {
+		error("setegid: %m");
+		rc = -1;
+	} else if (setgroups(ps->ngids, ps->gid_list) < 0) {
+		error("setgroups: %m");
+		rc = -1;
+	}
+
+done:
+	xfree(ps->gid_list);	/* released on success and failure alike */
+
+	return rc;
+}
+
 static void _reset_slurm_profile_conf(void)
 {
 	xfree(hdf5_conf.dir);
@@ -156,30 +247,79 @@ static uint32_t _determine_profile(void)
 
+/*
+ * Create ProfileHDF5Dir and the per-user directory beneath it.
+ *
+ * Each directory is created relative to an open descriptor for its
+ * parent, and mode/ownership are only applied to a directory that this
+ * call created. Failure of any create/open/chmod/chown step is fatal.
+ */
 static void _create_directories(void)
 {
-	char *user_dir = NULL;
+	char *parent_dir = NULL, *user_dir = NULL, *hdf5_dir_rel = NULL;
+	char *slash = NULL;
+	int parent_dirfd, user_parent_dirfd;
 
 	xassert(g_job);
 	xassert(hdf5_conf.dir);
 
-	xstrfmtcat(user_dir, "%s/%s", hdf5_conf.dir, g_job->user_name);
+	parent_dir = xstrdup(hdf5_conf.dir);
+	/* split into base and new directory name */
+	while ((slash = strrchr(parent_dir, '/'))) {
+		/* fix a path with one or more trailing slashes */
+		if (slash[1] == '\0')
+			slash[0] = '\0';
+		else
+			break;
+	}
+
+	if (!slash)
+		fatal("Invalid ProfileHDF5Dir=\"%s\"", hdf5_conf.dir);
+
+	slash[0] = '\0';
+	hdf5_dir_rel = slash + 1;
+
+	/* ProfileHDF5Dir directly under "/" leaves an empty parent_dir */
+	parent_dirfd = open(parent_dir[0] ? parent_dir : "/",
+			    O_DIRECTORY | O_NOFOLLOW);
+	if (parent_dirfd < 0)
+		fatal("open(%s): %m", parent_dir[0] ? parent_dir : "/");
 
 	/*
-	 * To avoid race conditions (TOCTOU) with stat() calls, always
-	 * attempt to create the ProfileHDF5Dir and the user directory within.
+	 * Use *at family of syscalls to prevent TOCTOU abuse by working
+	 * on file descriptors instead of path names.
 	 */
-	if (((mkdir(hdf5_conf.dir, 0755)) < 0) && (errno != EEXIST))
-		fatal("mkdir(%s): %m", hdf5_conf.dir);
-	if (chmod(hdf5_conf.dir, 0755) < 0)
-		fatal("chmod(%s): %m", hdf5_conf.dir);
-
-	if (((mkdir(user_dir, 0700)) < 0) && (errno != EEXIST))
-		fatal("mkdir(%s): %m", user_dir);
-	if (chmod(user_dir, 0700) < 0)
-		fatal("chmod(%s): %m", user_dir);
-	if (chown(user_dir, g_job->uid, g_job->gid) < 0)
-		fatal("chown(%s): %m", user_dir);
+	if ((mkdirat(parent_dirfd, hdf5_dir_rel, 0755)) < 0) {
+		/* Never chmod on EEXIST */
+		if (errno != EEXIST)
+			fatal("mkdirat(%s): %m", hdf5_conf.dir);
+	} else if (fchmodat(parent_dirfd, hdf5_dir_rel, 0755,
+			    AT_SYMLINK_NOFOLLOW) < 0)
+		fatal("fchmodat(%s): %m", hdf5_conf.dir);
 
+	xstrfmtcat(user_dir, "%s/%s", hdf5_conf.dir, g_job->user_name);
+	user_parent_dirfd = openat(parent_dirfd, hdf5_dir_rel,
+				   O_DIRECTORY | O_NOFOLLOW);
+	if (user_parent_dirfd < 0)
+		fatal("openat(%s): %m", hdf5_conf.dir);
+	close(parent_dirfd);
+
+	if ((mkdirat(user_parent_dirfd, g_job->user_name, 0700)) < 0) {
+		/* Never chmod on EEXIST */
+		if (errno != EEXIST)
+			fatal("mkdirat(%s): %m", user_dir);
+	} else {
+		/* fchmodat(2) man says AT_SYMLINK_NOFOLLOW not implemented. */
+		if (fchmodat(user_parent_dirfd, g_job->user_name, 0700, 0) < 0)
+			fatal("fchmodat(%s): %m", user_dir);
+
+		if (fchownat(user_parent_dirfd, g_job->user_name, g_job->uid,
+			     g_job->gid, AT_SYMLINK_NOFOLLOW) < 0)
+			fatal("fchownat(%s): %m", user_dir);
+	}
+
+	close(user_parent_dirfd);
 	xfree(user_dir);
+	xfree(parent_dir);
+	/* Do not xfree() hdf5_dir_rel (interior pointer to freed data). */
 }
 
 /*
@@ -268,7 +395,7 @@ extern void acct_gather_profile_p_get(enum acct_gather_profile_info info_type,
 extern int acct_gather_profile_p_node_step_start(stepd_step_rec_t* job)
 {
 	int rc = SLURM_SUCCESS;
-
+	struct priv_state sprivs = { 0 };
 	char *profile_file_name;
 
 	xassert(running_in_slurmstepd());
@@ -311,16 +438,24 @@ extern int acct_gather_profile_p_node_step_start(stepd_step_rec_t* job)
 		 acct_gather_profile_to_string(g_profile_running),
 		 profile_file_name);
 
+	if (_drop_privileges(g_job, true, &sprivs, false) < 0) {
+		error("%s: Unable to drop privileges", __func__);
+		xfree(profile_file_name);
+		return SLURM_ERROR;
+	}
+
 	/*
 	 * Create a new file using the default properties
 	 */
 	file_id = H5Fcreate(profile_file_name, H5F_ACC_TRUNC, H5P_DEFAULT,
 			    H5P_DEFAULT);
-	if (chown(profile_file_name, (uid_t)g_job->uid,
-		  (gid_t)g_job->gid) < 0)
-		error("chown(%s): %m", profile_file_name);
-	if (chmod(profile_file_name, 0600) < 0)
-		error("chmod(%s): %m", profile_file_name);
+
+	if (_reclaim_privileges(&sprivs) < 0) {
+		error("%s: Unable to reclaim privileges", __func__);
+		xfree(profile_file_name);
+		return SLURM_ERROR;
+	}
+
 	xfree(profile_file_name);
 
 	if (file_id < 1) {
diff --git a/src/plugins/job_container/tmpfs/job_container_tmpfs.c b/src/plugins/job_container/tmpfs/job_container_tmpfs.c
index 0c2dad2adc..046fb68d03 100644
--- a/src/plugins/job_container/tmpfs/job_container_tmpfs.c
+++ b/src/plugins/job_container/tmpfs/job_container_tmpfs.c
@@ -39,7 +39,6 @@
 \*****************************************************************************/
 
 #define _GNU_SOURCE
-#define _XOPEN_SOURCE 500 /* For ftw.h */
 #include <errno.h>
 #include <sys/types.h>
 #include <sys/stat.h>
@@ -47,7 +46,6 @@
 #include <sys/mman.h>
 #include <sched.h>
 #include <fcntl.h>
-#include <ftw.h>
 #include <sys/mount.h>
 #include <linux/limits.h>
 #include <semaphore.h>
@@ -55,6 +53,7 @@
 #include "src/common/slurm_xlator.h"
 
 #include "src/common/env.h"
+#include "src/common/fd.h"
 #include "src/common/log.h"
 #include "src/common/read_config.h"
 #include "src/common/run_command.h"
@@ -79,7 +78,6 @@ const uint32_t plugin_version   = SLURM_VERSION_NUMBER;
 
 static slurm_jc_conf_t *jc_conf = NULL;
 static int step_ns_fd = -1;
-static bool force_rm = true;
 List legacy_jobs;
 
 typedef struct {
@@ -382,35 +380,6 @@ static int _mount_private_shm(void)
 	return rc;
 }
 
-static int _rm_data(const char *path, const struct stat *st_buf,
-		    int type, struct FTW *ftwbuf)
-{
-	int rc = SLURM_SUCCESS;
-
-	if (remove(path) < 0) {
-		log_level_t log_lvl;
-		if (force_rm) {
-			rc = SLURM_ERROR;
-			log_lvl = LOG_LEVEL_ERROR;
-		} else
-			log_lvl = LOG_LEVEL_DEBUG2;
-
-		if (type == FTW_NS)
-			log_var(log_lvl,
-					"%s: Unreachable file of FTW_NS type: %s",
-					__func__, path);
-		else if (type == FTW_DNR)
-			log_var(log_lvl,
-					"%s: Unreadable directory: %s",
-					__func__, path);
-
-		log_var(log_lvl,
-				"%s: could not remove path: %s: %s",
-				__func__, path, strerror(errno));
-	}
-
-	return rc;
-}
 
 static int _create_ns(uint32_t job_id, uid_t uid, bool remount)
 {
@@ -681,11 +650,11 @@ exit1:
 
 exit2:
 	if (rc) {
+		int failures;
 		/* cleanup the job mount */
-		force_rm = true;
-		if (nftw(job_mount, _rm_data, 64, FTW_DEPTH|FTW_PHYS) < 0) {
-			error("%s: Directory traversal failed: %s: %s",
-			      __func__, job_mount, strerror(errno));
+		if ((failures = rmdir_recursive(job_mount, false))) {
+			error("%s: failed to remove %d files from %s",
+			      __func__, failures, job_mount);
 			return SLURM_ERROR;
 		}
 		umount2(job_mount, MNT_DETACH);
@@ -774,7 +743,7 @@ static int _delete_ns(uint32_t job_id, bool is_slurmd)
 {
 	char job_mount[PATH_MAX];
 	char ns_holder[PATH_MAX];
-	int rc = 0;
+	int rc = 0, failures = 0;
 
 #ifdef HAVE_NATIVE_CRAY
 	return SLURM_SUCCESS;
@@ -827,23 +796,9 @@ static int _delete_ns(uint32_t job_id, bool is_slurmd)
 		}
 	}
 
-	/*
-	 * Traverses the job directory, and delete all files.
-	 * Doesn't -
-	 *	traverse filesystem boundaries,
-	 *	follow symbolic links
-	 * Does -
-	 *	a post order traversal and delete directory after processing
-	 *      contents
-	 * NOTE: Can happen EBUSY here so we need to ignore this.
-	 */
-	force_rm = false;
-	if (nftw(job_mount, _rm_data, 64, FTW_DEPTH|FTW_PHYS) < 0) {
-		error("%s: Directory traversal failed: %s: %s",
-		      __func__, job_mount, strerror(errno));
-		return SLURM_ERROR;
-	}
-
+	if ((failures = rmdir_recursive(job_mount, false)))
+		error("%s: failed to remove %d files from %s",
+		      __func__, failures, job_mount);
 	if (umount2(job_mount, MNT_DETACH))
 		debug2("umount2: %s failed: %s", job_mount, strerror(errno));
 	rmdir(job_mount);
diff --git a/src/plugins/mpi/cray_shasta/apinfo.c b/src/plugins/mpi/cray_shasta/apinfo.c
index 2e03ed0e57..7236c83383 100644
--- a/src/plugins/mpi/cray_shasta/apinfo.c
+++ b/src/plugins/mpi/cray_shasta/apinfo.c
@@ -381,7 +381,7 @@ static int _open_apinfo(const stepd_step_rec_t *job)
 	apinfo = xstrdup_printf("%s/apinfo", appdir);
 
 	// Create file
-	fd = creat(apinfo, 0600);
+	fd = open(apinfo, (O_CREAT | O_WRONLY | O_TRUNC | O_EXCL), 0600);
 	if (fd == -1) {
 		error("%s: Couldn't open apinfo file %s: %m",
 		      plugin_type, apinfo);
diff --git a/src/plugins/mpi/cray_shasta/mpi_cray_shasta.c b/src/plugins/mpi/cray_shasta/mpi_cray_shasta.c
index 2528e65aa7..4d4dec893c 100644
--- a/src/plugins/mpi/cray_shasta/mpi_cray_shasta.c
+++ b/src/plugins/mpi/cray_shasta/mpi_cray_shasta.c
@@ -45,6 +45,7 @@
 #include "src/common/slurm_xlator.h"
 
 #include "src/common/env.h"
+#include "src/common/fd.h"
 #include "src/common/parse_config.h"
 #include "src/common/read_config.h"
 #include "src/common/slurm_mpi.h"
@@ -175,63 +176,6 @@ static void _set_pmi_port(char ***env)
 	env_array_overwrite_fmt(env, "PMI_CONTROL_PORT", "%lu", pmi_port);
 }
 
-/*
- * Determine whether the given path is a directory
- */
-static int _is_dir(char *path)
-{
-	struct stat stat_buf;
-
-	if (stat(path, &stat_buf)) {
-		error("%s: Cannot stat %s: %m", plugin_type, path);
-		return 1;
-	} else if (!S_ISDIR(stat_buf.st_mode)) {
-		return 0;
-	}
-	return 1;
-}
-
-/*
- * Recursively remove a directory
- */
-static int _rmdir_recursive(char *path)
-{
-	char nested_path[PATH_MAX];
-	DIR *dp;
-	struct dirent *ent;
-
-	if (!(dp = opendir(path))) {
-		error("%s: Can't open directory %s: %m", plugin_type, path);
-		return SLURM_ERROR;
-	}
-
-	while ((ent = readdir(dp))) {
-		if (!xstrcmp(ent->d_name, ".") ||
-		    !xstrcmp(ent->d_name, "..")) {
-			/* skip special dir's */
-			continue;
-		}
-		snprintf(nested_path, sizeof(nested_path), "%s/%s", path,
-			 ent->d_name);
-		if (_is_dir(nested_path)) {
-			_rmdir_recursive(nested_path);
-		} else {
-			debug("%s: Removed file %s", plugin_type, nested_path);
-			unlink(nested_path);
-		}
-	}
-	closedir(dp);
-
-	if (rmdir(path) == -1) {
-		error("%s: Can't remove directory %s: %m",
-		      plugin_type, path);
-		return SLURM_ERROR;
-	}
-
-	debug("%s: Removed directory %s", plugin_type, path);
-	return SLURM_SUCCESS;
-}
-
 extern int mpi_p_slurmstepd_prefork(const stepd_step_rec_t *job, char ***env)
 {
 	/* do the node_name substitution once */
@@ -291,7 +235,7 @@ extern int fini(void)
 {
 	// Remove application spool directory
 	if (appdir)
-		_rmdir_recursive(appdir);
+		rmdir_recursive(appdir, true);
 
 	// Free allocated storage
 	xfree(appdir);
diff --git a/src/plugins/mpi/pmix/pmixp_client.c b/src/plugins/mpi/pmix/pmixp_client.c
index 9f4b74e4f1..e32d9b1fe3 100644
--- a/src/plugins/mpi/pmix/pmixp_client.c
+++ b/src/plugins/mpi/pmix/pmixp_client.c
@@ -475,16 +475,14 @@ static void _set_localinfo(List lresp)
 extern int pmixp_libpmix_init(void)
 {
 	int rc;
-	mode_t rights = (S_IRUSR | S_IWUSR | S_IXUSR) |
-			(S_IRGRP | S_IWGRP | S_IXGRP);
 
-	if (0 != (rc = pmixp_mkdir(pmixp_info_tmpdir_lib(), rights))) {
+	if (0 != (rc = pmixp_mkdir(pmixp_info_tmpdir_lib()))) {
 		PMIXP_ERROR_STD("Cannot create server lib tmpdir: \"%s\"",
 				pmixp_info_tmpdir_lib());
 		return errno;
 	}
 
-	if (0 != (rc = pmixp_mkdir(pmixp_info_tmpdir_cli(), rights))) {
+	if (0 != (rc = pmixp_mkdir(pmixp_info_tmpdir_cli()))) {
 		PMIXP_ERROR_STD("Cannot create client cli tmpdir: \"%s\"",
 				pmixp_info_tmpdir_cli());
 		return errno;
@@ -499,12 +497,6 @@ extern int pmixp_libpmix_init(void)
 	/* TODO: must be deleted in future once info-key approach harden */
 	setenv(PMIXP_PMIXLIB_TMPDIR, pmixp_info_tmpdir_lib(), 1);
 
-	/*
-	if( pmixp_fixrights(pmixp_info_tmpdir_lib(),
-		(uid_t) pmixp_info_jobuid(), rights) ){
-	}
-	*/
-
 	return 0;
 }
 
@@ -514,14 +506,14 @@ extern int pmixp_libpmix_finalize(void)
 
 	rc = pmixp_lib_finalize();
 
-	rc1 = pmixp_rmdir_recursively(pmixp_info_tmpdir_lib());
+	rc1 = rmdir_recursive(pmixp_info_tmpdir_lib(), true);
 	if (0 != rc1) {
 		PMIXP_ERROR_STD("Failed to remove %s\n",
 				pmixp_info_tmpdir_lib());
 		/* Not considering this as fatal error */
 	}
 
-	rc1 = pmixp_rmdir_recursively(pmixp_info_tmpdir_cli());
+	rc1 = rmdir_recursive(pmixp_info_tmpdir_cli(), true);
 	if (0 != rc1) {
 		PMIXP_ERROR_STD("Failed to remove %s\n",
 				pmixp_info_tmpdir_cli());
diff --git a/src/plugins/mpi/pmix/pmixp_utils.c b/src/plugins/mpi/pmix/pmixp_utils.c
index 50aae40ca9..79ed60a1d6 100644
--- a/src/plugins/mpi/pmix/pmixp_utils.c
+++ b/src/plugins/mpi/pmix/pmixp_utils.c
@@ -479,128 +479,12 @@ int pmixp_p2p_send(const char *nodename, const char *address, const char *data,
 	return rc;
 }
 
-static int _is_dir(char *path)
+int pmixp_mkdir(char *path)
 {
-	struct stat stat_buf;
-	int rc;
-	if (0 > (rc = stat(path, &stat_buf))) {
-		PMIXP_ERROR_STD("Cannot stat() path=\"%s\"", path);
-		return rc;
-	} else if (!S_ISDIR(stat_buf.st_mode)) {
-		return 0;
-	}
-	return 1;
-}
-
-int pmixp_rmdir_recursively(char *path)
-{
-	char nested_path[PATH_MAX];
-	DIR *dp;
-	struct dirent *ent;
-
-	int rc;
-
-	/*
-	 * Make sure that "directory" exists and is a directory.
-	 */
-	if (1 != (rc = _is_dir(path))) {
-		PMIXP_ERROR("path=\"%s\" is not a directory", path);
-		return (rc == 0) ? -1 : rc;
-	}
-
-	if ((dp = opendir(path)) == NULL) {
-		PMIXP_ERROR_STD("cannot open path=\"%s\"", path);
-		return -1;
-	}
-
-	while ((ent = readdir(dp)) != NULL) {
-		if (0 == xstrcmp(ent->d_name, ".")
-		    || 0 == xstrcmp(ent->d_name, "..")) {
-			/* skip special dir's */
-			continue;
-		}
-		snprintf(nested_path, sizeof(nested_path), "%s/%s", path,
-			 ent->d_name);
-		if (_is_dir(nested_path)) {
-			pmixp_rmdir_recursively(nested_path);
-		} else {
-			unlink(nested_path);
-		}
-	}
-	closedir(dp);
-	if ((rc = rmdir(path))) {
-		PMIXP_ERROR_STD("Cannot remove path=\"%s\"", path);
-	}
-	return rc;
-}
-
-static inline int _file_fix_rights(char *path, uid_t uid, mode_t mode)
-{
-	if (chmod(path, mode) < 0) {
-		PMIXP_ERROR("chown(%s): %m", path);
-		return errno;
-	}
-
-	if (chown(path, uid, (gid_t) -1) < 0) {
-		PMIXP_ERROR("chown(%s): %m", path);
-		return errno;
-	}
-	return 0;
-}
+	char *base = NULL, *newdir = NULL, *slash;
+	int dirfd;
+	mode_t rights = (S_IRUSR | S_IWUSR | S_IXUSR);
 
-int pmixp_fixrights(char *path, uid_t uid, mode_t mode)
-{
-	char nested_path[PATH_MAX];
-	DIR *dp;
-	struct dirent *ent;
-	int rc = 0;
-
-	/*
-	 * Make sure that "directory" exists and is a directory.
-	 */
-	if (1 != (rc = _is_dir(path))) {
-		PMIXP_ERROR("path=\"%s\" is not a directory", path);
-		return (rc == 0) ? -1 : rc;
-	}
-
-	if ((dp = opendir(path)) == NULL) {
-		PMIXP_ERROR_STD("cannot open path=\"%s\"", path);
-		return -1;
-	}
-
-	while ((ent = readdir(dp)) != NULL) {
-		if (0 == xstrcmp(ent->d_name, ".")
-		    || 0 == xstrcmp(ent->d_name, "..")) {
-			/* skip special dir's */
-			continue;
-		}
-		snprintf(nested_path, sizeof(nested_path), "%s/%s", path,
-			 ent->d_name);
-		if (_is_dir(nested_path)) {
-			if ((rc = _file_fix_rights(nested_path, uid, mode))) {
-				PMIXP_ERROR_STD("cannot fix permissions for "
-						"\"%s\"",
-						nested_path);
-				goto exit;
-			}
-			pmixp_rmdir_recursively(nested_path);
-		} else {
-			if ((rc = _file_fix_rights(nested_path, uid, mode))) {
-				PMIXP_ERROR_STD("cannot fix permissions for "
-						"\"%s\"",
-						nested_path);
-				goto exit;
-			}
-		}
-	}
-
-exit:
-	closedir(dp);
-	return rc;
-}
-
-int pmixp_mkdir(char *path, mode_t rights)
-{
 	/* NOTE: we need user who owns the job to access PMIx usock
 	 * file. According to 'man 7 unix':
 	 * "... In the Linux implementation, sockets which are visible in the
@@ -610,26 +494,57 @@ int pmixp_mkdir(char *path, mode_t rights)
 	 * access to the unix socket we do the following:
 	 * 1. Owner ID is set to the job owner.
 	 * 2. Group ID corresponds to slurmstepd.
-	 * 3. Set 0770 access mode
+	 * 3. Set 0700 access mode
 	 */
 
-	if (0 != mkdir(path, rights) ) {
-		PMIXP_ERROR_STD("Cannot create directory \"%s\"",
-				path);
-		return errno;
+	base = xstrdup(path);
+	/* split into base and new directory name */
+	while ((slash = strrchr(base, '/'))) {
+		/* fix a path with one or more trailing slashes */
+		if (slash[1] == '\0')
+			slash[0] = '\0';
+		else
+			break;
+	}
+
+	if (!slash) {
+		PMIXP_ERROR_STD("Invalid directory \"%s\"", path);
+		xfree(base);
+		return EINVAL;
+	}
+
+	slash[0] = '\0';
+	newdir = slash + 1;
+
+	/* a directory directly under "/" leaves an empty base */
+	dirfd = open(base[0] ? base : "/", O_DIRECTORY | O_NOFOLLOW);
+	if (dirfd < 0) {
+		/* logging and cleanup may clobber errno; save it first */
+		int saved_errno = errno;
+		PMIXP_ERROR_STD("Could not open parent directory \"%s\"", base);
+		xfree(base);
+		return saved_errno;
 	}
 
-	/* There might be umask that will drop essential rights.
-	 * Fix it explicitly.
-	 * TODO: is there more elegant solution? */
-	if (chmod(path, rights) < 0) {
-		error("%s: chown(%s): %m", __func__, path);
-		return errno;
+	if (mkdirat(dirfd, newdir, rights) < 0) {
+		int saved_errno = errno;
+		PMIXP_ERROR_STD("Cannot create directory \"%s\"",
+				path);
+		close(dirfd);
+		xfree(base);
+		return saved_errno;
 	}
 
-	if (chown(path, (uid_t) pmixp_info_jobuid(), (gid_t) -1) < 0) {
-		error("%s: chown(%s): %m", __func__, path);
-		return errno;
+	if (fchownat(dirfd, newdir, (uid_t) pmixp_info_jobuid(), (gid_t) -1,
+		     AT_SYMLINK_NOFOLLOW) < 0) {
+		int saved_errno = errno;
+		error("%s: fchownat(%s): %m", __func__, path);
+		close(dirfd);
+		xfree(base);
+		return saved_errno;
 	}
+
+	close(dirfd);
+	xfree(base);
 	return 0;
 }
diff --git a/src/plugins/mpi/pmix/pmixp_utils.h b/src/plugins/mpi/pmix/pmixp_utils.h
index 524740b401..1be71d0727 100644
--- a/src/plugins/mpi/pmix/pmixp_utils.h
+++ b/src/plugins/mpi/pmix/pmixp_utils.h
@@ -61,9 +61,7 @@ int pmixp_stepd_send(const char *nodelist, const char *address,
 int pmixp_p2p_send(const char *nodename, const char *address, const char *data,
 		   uint32_t len, unsigned int start_delay,
 		   unsigned int retry_cnt, int silent);
-int pmixp_rmdir_recursively(char *path);
-int pmixp_fixrights(char *path, uid_t uid, mode_t mode);
-int pmixp_mkdir(char *path, mode_t rights);
+int pmixp_mkdir(char *path);
 
 /* lightweight pmix list of pointers */
 #define PMIXP_LIST_DEBUG 0
diff --git a/src/plugins/switch/cray_aries/iaa.c b/src/plugins/switch/cray_aries/iaa.c
index 2a008b3e5c..20ffed6fda 100644
--- a/src/plugins/switch/cray_aries/iaa.c
+++ b/src/plugins/switch/cray_aries/iaa.c
@@ -65,7 +65,7 @@ int write_iaa_file(stepd_step_rec_t *job, slurm_cray_jobinfo_t *sw_job,
 		}
 
 		// chown the file to the job user
-		rc = chown(fname, job->uid, job->gid);
+		rc = lchown(fname, job->uid, job->gid);
 		if (rc == -1) {
 			CRAY_ERR("chown(%s, %d, %d) failed: %m",
 				 fname, (int)job->uid, (int)job->gid);
diff --git a/src/plugins/switch/cray_aries/switch_cray_aries.h b/src/plugins/switch/cray_aries/switch_cray_aries.h
index 9c539b7328..0d9fe15a43 100644
--- a/src/plugins/switch/cray_aries/switch_cray_aries.h
+++ b/src/plugins/switch/cray_aries/switch_cray_aries.h
@@ -45,6 +45,7 @@
 #include <stdint.h>
 
 #include "src/common/bitstring.h"
+#include "src/common/fd.h"
 #include "src/common/log.h"
 #include "src/common/slurm_protocol_defs.h"
 #include "src/slurmd/slurmstepd/slurmstepd_job.h"
diff --git a/src/plugins/switch/cray_aries/util.c b/src/plugins/switch/cray_aries/util.c
index 090cd17752..0f4ebf20c0 100644
--- a/src/plugins/switch/cray_aries/util.c
+++ b/src/plugins/switch/cray_aries/util.c
@@ -50,7 +50,6 @@
 
 
 #if defined(HAVE_NATIVE_CRAY) || defined(HAVE_CRAY_NETWORK)
-static void _recursive_rmdir(const char *dirnm);
 
 /*
  * Create APID directory with given uid/gid as the owner.
@@ -69,7 +68,7 @@ int create_apid_dir(uint64_t apid, uid_t uid, gid_t gid)
 		return SLURM_ERROR;
 	}
 
-	rc = chown(apid_dir, uid, gid);
+	rc = lchown(apid_dir, uid, gid);
 	if (rc) {
 		CRAY_ERR("chown %s, %d, %d failed: %m",
 			 apid_dir, (int)uid, (int)gid);
@@ -116,7 +115,7 @@ int remove_spool_files(uint64_t apid)
 
 	// Remove the apid directory LEGACY_SPOOL_DIR/<APID>
 	path_name = xstrdup_printf(LEGACY_SPOOL_DIR "%" PRIu64, apid);
-	_recursive_rmdir(path_name);
+	rmdir_recursive(path_name, true);
 	xfree(path_name);
 
 	// Remove the backwards compatibility ALPS placement file
@@ -299,75 +298,6 @@ int list_str_to_array(char *list, int *cnt, int32_t **numbers)
 	return ret;
 }
 
-/*
- * Recursive directory delete
- *
- * Call with a directory name and this function will delete
- * all files and directories rooted in this name. Finally
- * the named directory will be deleted.
- * If called with a file name, only that file will be deleted.
- */
-static void _recursive_rmdir(const char *dirnm)
-{
-	int st;
-	size_t dirnm_len, fnm_len, name_len;
-	char *fnm = 0;
-	DIR *dirp;
-	struct dirent *dir;
-	struct stat st_buf;
-
-	/* Don't do anything if there is no directory name */
-	if (!dirnm) {
-		return;
-	}
-	dirp = opendir(dirnm);
-	if (!dirp) {
-		if (errno == ENOTDIR)
-			goto fileDel;
-		CRAY_ERR("Error opening directory %s", dirnm);
-		return;
-	}
-
-	dirnm_len = strlen(dirnm);
-	if (dirnm_len == 0)
-		return;
-	while ((dir = readdir(dirp))) {
-		name_len = strlen(dir->d_name);
-		if (name_len == 1 && dir->d_name[0] == '.')
-			continue;
-		if (name_len == 2 && xstrcmp(dir->d_name, "..") == 0)
-			continue;
-		fnm_len = dirnm_len + name_len + 2;
-		free(fnm);
-		fnm = malloc(fnm_len);
-		snprintf(fnm, fnm_len, "%s/%s", dirnm, dir->d_name);
-		st = stat(fnm, &st_buf);
-		if (st < 0) {
-			CRAY_ERR("stat of %s", fnm);
-			continue;
-		}
-		if (st_buf.st_mode & S_IFDIR) {
-			_recursive_rmdir(fnm);
-		} else {
-
-			st = unlink(fnm);
-			if (st < 0 && errno == EISDIR)
-				st = rmdir(fnm);
-			if (st < 0 && errno != ENOENT) {
-				CRAY_ERR("Error removing %s", fnm);
-			}
-		}
-	}
-	free(fnm);
-	closedir(dirp);
-fileDel: st = unlink(dirnm);
-	if (st < 0 && errno == EISDIR)
-		st = rmdir(dirnm);
-	if (st < 0 && errno != ENOENT) {
-		CRAY_ERR("Error removing %s", dirnm);
-	}
-}
-
 void print_jobinfo(slurm_cray_jobinfo_t *job)
 {
 	int i;
diff --git a/src/slurmd/slurmstepd/mgr.c b/src/slurmd/slurmstepd/mgr.c
index eb1acbad66..c3f9bdde36 100644
--- a/src/slurmd/slurmstepd/mgr.c
+++ b/src/slurmd/slurmstepd/mgr.c
@@ -126,14 +126,6 @@
 #define RETRY_DELAY 15		/* retry every 15 seconds */
 #define MAX_RETRY   240		/* retry 240 times (one hour max) */
 
-struct priv_state {
-	uid_t	saved_uid;
-	gid_t	saved_gid;
-	gid_t *	gid_list;
-	int	ngids;
-	char	saved_cwd [4096];
-};
-
 step_complete_t step_complete = {
 	PTHREAD_COND_INITIALIZER,
 	PTHREAD_MUTEX_INITIALIZER,
@@ -172,9 +164,6 @@ static int  _fork_all_tasks(stepd_step_rec_t *job, bool *io_initialized);
 static int  _become_user(stepd_step_rec_t *job, struct priv_state *ps);
 static void  _set_prio_process (stepd_step_rec_t *job);
 static int  _setup_normal_io(stepd_step_rec_t *job);
-static int  _drop_privileges(stepd_step_rec_t *job, bool do_setuid,
-			     struct priv_state *state, bool get_list);
-static int  _reclaim_privileges(struct priv_state *state);
 static void _send_launch_resp(stepd_step_rec_t *job, int rc);
 static int  _slurmd_job_log_init(stepd_step_rec_t *job);
 static void _wait_for_io(stepd_step_rec_t *job);
@@ -458,7 +447,7 @@ _setup_normal_io(stepd_step_rec_t *job)
 	 * descriptors (which may be connected to files), then
 	 * reclaim privileges.
 	 */
-	if (_drop_privileges(job, true, &sprivs, true) < 0)
+	if (drop_privileges(job, true, &sprivs, true) < 0)
 		return ESLURMD_SET_UID_OR_GID_ERROR;
 
 	if (io_init_tasks_stdio(job) != SLURM_SUCCESS) {
@@ -572,7 +561,7 @@ _setup_normal_io(stepd_step_rec_t *job)
 	}
 
 claim:
-	if (_reclaim_privileges(&sprivs) < 0) {
+	if (reclaim_privileges(&sprivs) < 0) {
 		error("sete{u/g}id(%lu/%lu): %m",
 		      (u_long) sprivs.saved_uid, (u_long) sprivs.saved_gid);
 	}
@@ -927,7 +916,7 @@ static void _shutdown_x11_forward(stepd_step_rec_t *job)
 {
 	struct priv_state sprivs = { 0 };
 
-	if (_drop_privileges(job, true, &sprivs, false) < 0) {
+	if (drop_privileges(job, true, &sprivs, false) < 0) {
 		error("%s: Unable to drop privileges", __func__);
 		return;
 	}
@@ -935,7 +924,7 @@ static void _shutdown_x11_forward(stepd_step_rec_t *job)
 	if (shutdown_x11_forward(job) != SLURM_SUCCESS)
 		error("%s: x11 forward shutdown failed", __func__);
 
-	if (_reclaim_privileges(&sprivs) < 0)
+	if (reclaim_privileges(&sprivs) < 0)
 		error("%s: Unable to reclaim privileges", __func__);
 }
 
@@ -996,7 +985,7 @@ static int _set_xauthority(stepd_step_rec_t *job)
 {
 	struct priv_state sprivs = { 0 };
 
-	if (_drop_privileges(job, true, &sprivs, false) < 0) {
+	if (drop_privileges(job, true, &sprivs, false) < 0) {
 		error("%s: Unable to drop privileges before xauth", __func__);
 		return SLURM_ERROR;
 	}
@@ -1007,7 +996,7 @@ static int _set_xauthority(stepd_step_rec_t *job)
 		return SLURM_ERROR;
 	}
 
-	if (_reclaim_privileges(&sprivs) < 0) {
+	if (reclaim_privileges(&sprivs) < 0) {
 		error("%s: Unable to reclaim privileges after xauth", __func__);
 		return SLURM_ERROR;
 	}
@@ -1058,7 +1047,7 @@ static int _spawn_job_container(stepd_step_rec_t *job)
 	if (job->x11) {
 		struct priv_state sprivs = { 0 };
 
-		if (_drop_privileges(job, true, &sprivs, false) < 0) {
+		if (drop_privileges(job, true, &sprivs, false) < 0) {
 			error ("Unable to drop privileges");
 			return SLURM_ERROR;
 		}
@@ -1067,7 +1056,7 @@ static int _spawn_job_container(stepd_step_rec_t *job)
 			error("x11 port forwarding setup failed");
 			_exit(127);
 		}
-		if (_reclaim_privileges(&sprivs) < 0) {
+		if (reclaim_privileges(&sprivs) < 0) {
 			error ("Unable to reclaim privileges");
 			return SLURM_ERROR;
 		}
@@ -1558,7 +1547,7 @@ static int _pre_task_child_privileged(
 	int setwd = 0; /* set working dir */
 	int rc = 0;
 
-	if (_reclaim_privileges(sp) < 0)
+	if (reclaim_privileges(sp) < 0)
 		return SLURM_ERROR;
 
 	set_oom_adj(0); /* the tasks may be killed by OOM */
@@ -1585,9 +1574,9 @@ static int _pre_task_child_privileged(
 		return error("spank_task_init_privileged failed");
 
 	/* sp->gid_list should already be initialized */
-	rc = _drop_privileges(job, true, sp, false);
+	rc = drop_privileges(job, true, sp, false);
 	if (rc) {
-		error ("_drop_privileges: %m");
+		error ("drop_privileges: %m");
 		return rc;
 	}
 
@@ -1825,7 +1814,7 @@ _fork_all_tasks(stepd_step_rec_t *job, bool *io_initialized)
 	 * Temporarily drop effective privileges, except for the euid.
 	 * We need to wait until after pam_setup() to drop euid.
 	 */
-	if (_drop_privileges (job, false, &sprivs, true) < 0)
+	if (drop_privileges (job, false, &sprivs, true) < 0)
 		return ESLURMD_SET_UID_OR_GID_ERROR;
 
 	if (pam_setup(job->user_name, conf->hostname)
@@ -1837,7 +1826,7 @@ _fork_all_tasks(stepd_step_rec_t *job, bool *io_initialized)
 	/*
 	 * Reclaim privileges to do the io setup
 	 */
-	_reclaim_privileges(&sprivs);
+	reclaim_privileges(&sprivs);
 	if (rc)
 		goto fail1; /* pam_setup error */
 
@@ -1885,7 +1874,7 @@ _fork_all_tasks(stepd_step_rec_t *job, bool *io_initialized)
 	/*
 	 * Temporarily drop effective privileges
 	 */
-	if (_drop_privileges (job, true, &sprivs, true) < 0) {
+	if (drop_privileges (job, true, &sprivs, true) < 0) {
 		error ("_drop_privileges: %m");
 		rc = SLURM_ERROR;
 		goto fail2;
@@ -1943,7 +1932,7 @@ _fork_all_tasks(stepd_step_rec_t *job, bool *io_initialized)
 			 * Reclaim privileges for the child and call any plugin
 			 * hooks that may require elevated privs
 			 * sprivs.gid_list is already set from the
-			 * _drop_privileges call above, no not reinitialize.
+			 * drop_privileges call above, do not reinitialize.
 			 * NOTE: Only put things in here that are self contained
 			 * and belong in the child.
 			 */
@@ -2008,7 +1997,7 @@ _fork_all_tasks(stepd_step_rec_t *job, bool *io_initialized)
 	/*
 	 * Reclaim privileges
 	 */
-	if (_reclaim_privileges(&sprivs) < 0) {
+	if (reclaim_privileges(&sprivs) < 0) {
 		error ("Unable to reclaim privileges");
 		/* Don't bother erroring out here */
 	}
@@ -2119,7 +2108,7 @@ fail4:
 		error ("Unable to return to working directory");
 	}
 fail3:
-	_reclaim_privileges (&sprivs);
+	reclaim_privileges (&sprivs);
 fail2:
 	FREE_NULL_LIST(exec_wait_list);
 	io_close_task_fds(job);
@@ -2684,9 +2673,8 @@ _send_complete_batch_script_msg(stepd_step_rec_t *job, int err, int status)
 /* If get_list is false make sure ps->gid_list is initialized before
  * hand to prevent xfree.
  */
-static int
-_drop_privileges(stepd_step_rec_t *job, bool do_setuid,
-		 struct priv_state *ps, bool get_list)
+extern int drop_privileges(stepd_step_rec_t *job, bool do_setuid,
+			   struct priv_state *ps, bool get_list)
 {
 	ps->saved_uid = getuid();
 	ps->saved_gid = getgid();
@@ -2736,8 +2724,7 @@ _drop_privileges(stepd_step_rec_t *job, bool do_setuid,
 	return SLURM_SUCCESS;
 }
 
-static int
-_reclaim_privileges(struct priv_state *ps)
+extern int reclaim_privileges(struct priv_state *ps)
 {
 	int rc = SLURM_SUCCESS;
 
@@ -2999,7 +2986,7 @@ _run_script_as_user(const char *name, const char *path, stepd_step_rec_t *job,
 #endif
 
 		sprivs.gid_list = NULL;	/* initialize to prevent xfree */
-		if (_drop_privileges(job, true, &sprivs, false) < 0) {
+		if (drop_privileges(job, true, &sprivs, false) < 0) {
 			error("run_script_as_user _drop_privileges: %m");
 			/* child process, should not return */
 			exit(127);
diff --git a/src/slurmd/slurmstepd/mgr.h b/src/slurmd/slurmstepd/mgr.h
index ebac81c1a0..2ce8eb9e77 100644
--- a/src/slurmd/slurmstepd/mgr.h
+++ b/src/slurmd/slurmstepd/mgr.h
@@ -85,4 +85,16 @@ int job_manager(stepd_step_rec_t *job);
 extern void init_initgroups(int);
 
 
+struct priv_state {
+	uid_t saved_uid;	/* getuid() value stored by drop_privileges() */
+	gid_t saved_gid;	/* getgid() value stored by drop_privileges() */
+	gid_t *gid_list;	/* caller must init when get_list is false */
+	int ngids;		/* presumably length of gid_list; TODO confirm */
+	char saved_cwd[4096];	/* presumably cwd before drop; TODO confirm */
+};
+
+extern int drop_privileges(stepd_step_rec_t *step, bool do_setuid,
+			   struct priv_state *state, bool get_list);
+extern int reclaim_privileges(struct priv_state *state);
+
 #endif
diff --git a/src/slurmd/slurmstepd/slurmstepd.c b/src/slurmd/slurmstepd/slurmstepd.c
index 27f9e10522..90bc1b9a1d 100644
--- a/src/slurmd/slurmstepd/slurmstepd.c
+++ b/src/slurmd/slurmstepd/slurmstepd.c
@@ -747,7 +747,18 @@ _step_setup(slurm_addr_t *cli, slurm_addr_t *self, slurm_msg_t *msg)
 	}
 
 	if (job->container) {
-		int rc = setup_container(job);
+	        struct priv_state sprivs;
+		int rc;
+
+		if (drop_privileges(job, false, &sprivs, true) < 0) {
+			error("%s: drop_privileges failed", __func__);
+			return NULL;
+		}
+		rc = setup_container(job);
+		if (reclaim_privileges(&sprivs) < 0) {
+			error("%s: reclaim_privileges failed", __func__);
+			return NULL;
+		}
 
 		if (rc == ESLURM_CONTAINER_NOT_CONFIGURED) {
 			debug2("%s: container %s requested but containers are not configured on this node",
