;;; GNU Guix --- Functional package management for GNU ;;; Copyright © 2013, 2014, 2015 Ludovic Courtès ;;; Copyright © 2014, 2015 Mark H Weaver ;;; Copyright © 2015 Andreas Enge ;;; Copyright © 2017, 2018, 2019, 2020 Efraim Flashner ;;; Copyright © 2017 Leo Famulari ;;; Copyright © 2018 Tobias Geerinckx-Rice ;;; Copyright © 2018 Marius Bakke ;;; Copyright © 2020 Mark Wielaard ;;; ;;; This file is part of GNU Guix. ;;; ;;; GNU Guix is free software; you can redistribute it and/or modify it ;;; under the terms of the GNU General Public License as published by ;;; the Free Software Foundation; either version 3 of the License, or (at ;;; your option) any later version. ;;; ;;; GNU Guix is distributed in the hope that it will be useful, but ;;; WITHOUT ANY WARRANTY; without even the implied warranty of ;;; MERCHANTABILITY or FITNESS FOR A PARTICULA
aboutsummaryrefslogtreecommitdiff
;;; GNU Guix --- Functional package management for GNU
;;; Copyright © 2015 David Thompson <davet@gnu.org>
;;; Copyright © 2017-2019, 2022, 2023 Ludovic Courtès <ludo@gnu.org>
;;;
;;; This file is part of GNU Guix.
;;;
;;; GNU Guix is free software; you can redistribute it and/or modify it
;;; under the terms of the GNU General Public License as published by
;;; the Free Software Foundation; either version 3 of the License, or (at
;;; your option) any later version.
;;;
;;; GNU Guix is distributed in the hope that it will be useful, but
;;; WITHOUT ANY WARRANTY; without even the implied warranty of
;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;;; GNU General Public License for more details.
;;;
;;; You should have received a copy of the GNU General Public License
;;; along with GNU Guix.  If not, see <http://www.gnu.org/licenses/>.

(define-module (gnu build linux-container)
  #:use-module (ice-9 format)
  #:use-module (ice-9 match)
  #:use-module (ice-9 rdelim)
  #:use-module (srfi srfi-98)
  #:use-module (guix build utils)
  #:use-module (guix build syscalls)
  #:use-module (gnu system file-systems)          ;<file-system>
  #:use-module ((gnu build file-systems) #:select (mount-file-system))
  #:export (user-namespace-supported?
            unprivileged-user-namespace-supported?
            setgroups-supported?
            %namespaces
            run-container
            call-with-container
            container-excursion
            container-excursion*))

(define (user-namespace-supported?)
  "Return #t if user namespaces are supported on this system."
  (file-exists? "/proc/self/ns/user"))

(define (unprivileged-user-namespace-supported?)
  "Return #t if user namespaces can be created by unprivileged users."
  (let ((userns-file "/proc/sys/kernel/unprivileged_userns_clone"))
    (if (file-exists? userns-file)
        (eqv? #\1 (call-with-input-file userns-file read-char))
        #t)))

(define (setgroups-supported?)
  "Return #t if the setgroups proc file, introduced in Linux-libre 3.19,
exists."
  (file-exists? "/proc/self/setgroups"))

(define %namespaces
  '(cgroup mnt pid ipc uts user net))

(define (call-with-clean-exit thunk)
  "Apply THUNK, but exit with a status code of 1 if it fails."
  (dynamic-wind
    (const #t)
    (lambda ()
      (thunk)

      ;; XXX: Somehow we sometimes get EBADF from write(2) or close(2) upon
      ;; exit (coming from fd finalizers) when used by the Shepherd.  To work
      ;; around that, exit forcefully so fd finalizers don't have a chance to
      ;; run and fail.
      (primitive-_exit 0))
    (lambda ()
      (primitive-_exit 1))))

(define (purify-environment)
  "Unset all environment variables."
  (for-each unsetenv
            (match (get-environment-variables)
              (((names . _) ...) names))))

;; The container setup procedure closely resembles that of the Docker
;; specification:
;; https://raw.githubusercontent.com/docker/libcontainer/master/SPEC.md
(define* (mount-file-systems root mounts #:key mount-/sys? mount-/proc?)
  "Mount the essential file systems and the those in MOUNTS, a list of
<file-system> objects, relative to ROOT; then make ROOT the new root directory
for the process."
  (define (scope dir)
    (string-append root dir))

  (define (touch file-name)
    (call-with-output-file file-name (const #t)))

  (define (bind-mount src dest)
    (mount src dest "none" MS_BIND))

  ;; Like mount, but creates the mount point if it doesn't exist.
  (define* (mount* source target type #:optional (flags 0) options
                   #:key (update-mtab? #f))
    (mkdir-p target)
    (mount source target type flags options #:update-mtab? update-mtab?))

  ;; The container's file system is completely ephemeral, sans directories
  ;; bind-mounted from the host.
  (mount "none" root "tmpfs")

  ;; A proc mount requires a new pid namespace.
  (when mount-/proc?
    (mount* "none" (scope "/proc") "proc"
            (logior MS_NOEXEC MS_NOSUID MS_NODEV)))

  ;; A sysfs mount requires the user to have the CAP_SYS_ADMIN capability in
  ;; the current network namespace.
  (when mount-/sys?
    (catch 'system-error
      (lambda ()
        (mount* "none" (scope "/sys") "sysfs"
                (logior MS_NOEXEC MS_NOSUID MS_NODEV MS_RDONLY)))
      (lambda args
        ;; EPERM means that CAP_SYS_ADMIN is missing.  Ignore.
        (unless (= EPERM (system-error-errno args))
          (apply throw args)))))

  (mount* "none" (scope "/dev") "tmpfs"
          (logior MS_NOEXEC MS_STRICTATIME)
          "mode=755")

  ;; Create essential device nodes via bind-mounting them from the
  ;; host, because a process within a user namespace cannot create
  ;; device nodes.
  (for-each (lambda (device)
              (when (file-exists? device)
                ;; Create the mount point file.
                (touch (scope device))
                (bind-mount device (scope device))))
            '("/dev/null"
              "/dev/zero"
              "/dev/full"
              "/dev/random"
              "/dev/urandom"
              "/dev/tty"
              "/dev/fuse"))

  ;; Mount a new devpts instance on /dev/pts.
  (when (file-exists? "/dev/ptmx")
    (mount* "none" (scope "/dev/pts") "devpts" 0
            "newinstance,mode=0620")
    (symlink "/dev/pts/ptmx" (scope "/dev/ptmx")))

  ;; Setup the container's /dev/console by bind mounting the pseudo-terminal
  ;; associated with standard input when there is one.
  (let* ((in      (current-input-port))
         (tty     (catch 'system-error
                    (lambda ()
                      ;; This call throws if IN does not correspond to a tty.
                      ;; This is more reliable than 'isatty?'.
                      (ttyname in))
                    (const #f)))
         (console (scope "/dev/console")))
    (when tty
      (touch console)
      (chmod console #o600)
      (bind-mount tty console)))

  ;; Setup standard input/output/error.
  (symlink "/proc/self/fd"   (scope "/dev/fd"))
  (symlink "/proc/self/fd/0" (scope "/dev/stdin"))
  (symlink "/proc/self/fd/1" (scope "/dev/stdout"))
  (symlink "/proc/self/fd/2" (scope "/dev/stderr"))

  ;; Mount user-specified file systems.
  (for-each (lambda (file-system)
              (mount-file-system file-system #:root root))
            mounts)

  ;; Jail the process inside the container's root file system.
  (let ((put-old (string-append root "/real-root")))
    (mkdir put-old)
    (pivot-root root put-old)
    (chdir "/")
    (umount "real-root" MNT_DETACH)
    (rmdir "real-root")
    (chmod "/" #o755)))

(define* (initialize-user-namespace pid host-uids
                                    #:key (guest-uid 0) (guest-gid 0))
  "Configure the user namespace for PID.  HOST-UIDS specifies the number of
host user identifiers to map into the user namespace.  GUEST-UID and GUEST-GID
specify the first UID (respectively GID) that host UIDs (respectively GIDs)
map to in the namespace."
  (define proc-dir
    (string-append "/proc/" (number->string pid)))

  (define (scope file)
    (string-append proc-dir file))

  (let ((uid (getuid))
        (gid (getgid)))

    ;; Only root can write to the gid map without first disabling the
    ;; setgroups syscall.
    (unless (and (zero? uid) (zero? gid))
      (call-with-output-file (scope "/setgroups")
        (lambda (port)
          (display "deny" port))))

    ;; Map the user/group that created the container to the root user
    ;; within the container.
    (call-with-output-file (scope "/uid_map")
      (lambda (port)
        (format port "~d ~d ~d" guest-uid uid host-uids)))
    (call-with-output-file (scope "/gid_map")
      (lambda (port)
        (format port "~d ~d ~d" guest-gid gid host-uids)))))

(define (namespaces->bit-mask namespaces)
  "Return the number suitable for the 'flags' argument of 'clone' that
corresponds to the symbols in NAMESPACES."
  ;; Use the same flags as fork(3) in addition to the namespace flags.
  (apply logior SIGCHLD
         (map (match-lambda
               ('cgroup  CLONE_NEWCGROUP)
               ('mnt  CLONE_NEWNS)
               ('uts  CLONE_NEWUTS)
               ('ipc  CLONE_NEWIPC)
               ('user CLONE_NEWUSER)
               ('pid  CLONE_NEWPID)
               ('net  CLONE_NEWNET))
              namespaces)))

(define* (run-container root mounts namespa