Add support for running a container with a private network interface

For example, the following sets up a container named ‘foo’.  The
container will have a single network interface eth0, with IP address
10.231.136.2.  The host will have an interface c-foo with IP address
10.231.136.1.

  systemd.containers.foo =
    { privateNetwork = true;
      hostAddress = "10.231.136.1";
      localAddress = "10.231.136.2";
      config =
        { services.openssh.enable = true; };
    };

With ‘privateNetwork = true’, the container has the CAP_NET_ADMIN
capability, allowing it to do arbitrary network configuration, such as
setting up firewall rules.  This is secure because it cannot touch the
interfaces of the host.

The helper program ‘run-in-netns’ is needed at the moment because ‘ip
netns exec’ doesn't quite do the right thing: it remounts /sys without
bind-mounting the original /sys/fs/cgroup hierarchy back into place.
This commit is contained in:
Eelco Dolstra 2014-03-18 10:49:25 +01:00
parent ac215779dd
commit 895bcdd1cb
3 changed files with 155 additions and 13 deletions

View file

@ -34,8 +34,9 @@ let
# Ignore peth* devices; on Xen, they're renamed physical
# Ethernet cards used for bridging. Likewise for vif* and tap*
# (Xen) and virbr* and vnet* (libvirt).
denyinterfaces ${toString ignoredInterfaces} peth* vif* tap* tun* virbr* vnet* vboxnet*
# (Xen) and virbr* and vnet* (libvirt) and c-* and ctmp-* (NixOS
# containers).
denyinterfaces ${toString ignoredInterfaces} peth* vif* tap* tun* virbr* vnet* vboxnet* c-* ctmp-*
${config.networking.dhcpcd.extraConfig}
'';

View file

@ -2,6 +2,20 @@
with pkgs.lib;
let
# Helper binary built from ./run-in-netns.c.  It is installed as
# $out/bin/run-in-netns.
runInNetns = pkgs.stdenv.mkDerivation {
name = "run-in-netns";
# Nothing to unpack: the single C source file is referenced directly
# from the build phase below.
unpackPhase = "true";
# Compile straight into $out/bin, which is why the install phase is a no-op.
buildPhase = ''
mkdir -p $out/bin
gcc ${./run-in-netns.c} -o $out/bin/run-in-netns
'';
installPhase = "true";
};
in
{
options = {
@ -45,6 +59,39 @@ with pkgs.lib;
'';
};
# Boolean switch, off by default: unless it is explicitly enabled the
# container shares the host's network interfaces.
privateNetwork = mkOption {
type = types.bool;
default = false;
description = ''
Whether to give the container its own private virtual
Ethernet interface. The interface is called
<literal>eth0</literal>, and is hooked up to the interface
<literal>c-<replaceable>container-name</replaceable></literal>
on the host. If this option is not set, then the
container shares the network interfaces of the host,
and can bind to any port on any interface.
'';
};
# Optional host-side address; null (the default) means none is given.
# The type accepts any string, so the value is not validated as an
# IPv4 address here.
# NOTE(review): types.string may merge multiple definitions by
# concatenation; types.str is probably the better fit — confirm against
# the nixpkgs version in use.
hostAddress = mkOption {
type = types.nullOr types.string;
default = null;
example = "10.231.136.1";
description = ''
The IPv4 address assigned to the host interface.
'';
};
# Optional container-side address; null (the default) means none is given.
# The type accepts any string, so the value is not validated as an
# IPv4 address here.
# NOTE(review): types.string may merge multiple definitions by
# concatenation; types.str is probably the better fit — confirm against
# the nixpkgs version in use.
localAddress = mkOption {
type = types.nullOr types.string;
default = null;
example = "10.231.136.2";
description = ''
The IPv4 address assigned to <literal>eth0</literal>
in the container.
'';
};
};
config = mkMerge
@ -97,32 +144,70 @@ with pkgs.lib;
config = {
systemd.services = mapAttrs' (name: container: nameValuePair "container-${name}"
{ description = "Container '${name}'";
systemd.services = mapAttrs' (name: cfg:
let
# FIXME: interface names have a maximum length.
ifaceHost = "c-${name}";
ifaceCont = "ctmp-${name}";
ns = "net-${name}";
in
nameValuePair "container-${name}" {
description = "Container '${name}'";
wantedBy = [ "multi-user.target" ];
unitConfig.RequiresMountsFor = [ container.root ];
unitConfig.RequiresMountsFor = [ cfg.root ];
path = [ pkgs.iproute ];
preStart =
''
mkdir -p -m 0755 ${container.root}/etc
if ! [ -e ${container.root}/etc/os-release ]; then
touch ${container.root}/etc/os-release
mkdir -p -m 0755 ${cfg.root}/etc
if ! [ -e ${cfg.root}/etc/os-release ]; then
touch ${cfg.root}/etc/os-release
fi
mkdir -p -m 0755 \
/nix/var/nix/profiles/per-container/${name} \
/nix/var/nix/gcroots/per-container/${name}
''
+ optionalString cfg.privateNetwork ''
# Cleanup from last time.
ip netns del ${ns} 2> /dev/null || true
ip link del ${ifaceHost} 2> /dev/null || true
ip link del ${ifaceCont} 2> /dev/null || true
# Create a pair of virtual ethernet devices. On the host,
# we get c-<container-name, and on the guest, we get
# eth0.
set -x
ip link add ${ifaceHost} type veth peer name ${ifaceCont}
ip netns add ${ns}
ip link set ${ifaceCont} netns ${ns}
ip netns exec ${ns} ip link set ${ifaceCont} name eth0
ip netns exec ${ns} ip link set dev eth0 up
ip link set dev ${ifaceHost} up
${optionalString (cfg.hostAddress != null) ''
ip addr add ${cfg.hostAddress} dev ${ifaceHost}
ip netns exec ${ns} ip route add ${cfg.hostAddress} dev eth0
ip netns exec ${ns} ip route add default via ${cfg.hostAddress}
''}
${optionalString (cfg.localAddress != null) ''
ip netns exec ${ns} ip addr add ${cfg.localAddress} dev eth0
ip route add ${cfg.localAddress} dev ${ifaceHost}
''}
'';
serviceConfig.ExecStart =
"${config.systemd.package}/bin/systemd-nspawn"
+ " -M ${name} -D ${container.root}"
(optionalString cfg.privateNetwork "${runInNetns}/bin/run-in-netns ${ns} ")
+ "${config.systemd.package}/bin/systemd-nspawn"
+ (optionalString cfg.privateNetwork " --capability=CAP_NET_ADMIN")
+ " -M ${name} -D ${cfg.root}"
+ " --bind-ro=/nix/store --bind-ro=/nix/var/nix/db --bind-ro=/nix/var/nix/daemon-socket"
+ " --bind=/nix/var/nix/profiles/per-container/${name}:/nix/var/nix/profiles"
+ " --bind=/nix/var/nix/gcroots/per-container/${name}:/nix/var/nix/gcroots"
+ " ${container.path}/init";
+ " ${cfg.path}/init";
preStop =
''
@ -146,10 +231,16 @@ with pkgs.lib;
serviceConfig.ExecReload =
"${pkgs.bash}/bin/bash -c '"
+ "echo ${container.path}/bin/switch-to-configuration test "
+ "| ${pkgs.socat}/bin/socat unix:${container.root}/var/lib/root-shell.socket -'";
+ "echo ${cfg.path}/bin/switch-to-configuration test "
+ "| ${pkgs.socat}/bin/socat unix:${cfg.root}/var/lib/root-shell.socket -'";
}) config.systemd.containers;
# Generate /etc/hosts entries for the containers: one line per container
# that sets localAddress, mapping that address to the host name
# "<name>.containers".  Containers whose localAddress is null contribute
# nothing.
networking.extraHosts = concatStrings (mapAttrsToList (name: cfg: optionalString (cfg.localAddress != null)
''
${cfg.localAddress} ${name}.containers
'') config.systemd.containers);
};
}

View file

@ -0,0 +1,50 @@
#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <unistd.h>
#include <sched.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/mount.h>
#include <fcntl.h>
#include <linux/limits.h>
/* Usage: run-in-netns NAMESPACE PROGRAM [ARGS...]
 *
 * Joins the network namespace registered as /run/netns/NAMESPACE,
 * removes that registration (lazy unmount followed by unlink), and then
 * replaces this process with PROGRAM.  argv[2] is passed to execv()
 * as-is, so PROGRAM is not looked up in $PATH.
 *
 * Returns 1 (after printing a message to stderr) if the arguments are
 * missing, the namespace name does not fit in a path buffer, or any of
 * open/setns/unlink/execv fails.  On success it does not return.
 */
int main(int argc, char * * argv)
{
    if (argc < 3) {
        fprintf(stderr, "%s: missing arguments\n", argv[0]);
        return 1;
    }

    /* Build the namespace path with a bounded write: argv[1] is
       caller-supplied and an unchecked sprintf() could overflow nsPath. */
    char nsPath[PATH_MAX];
    int len = snprintf(nsPath, sizeof nsPath, "/run/netns/%s", argv[1]);
    if (len < 0 || (size_t) len >= sizeof nsPath) {
        fprintf(stderr, "%s: network namespace name too long\n", argv[0]);
        return 1;
    }

    int fd = open(nsPath, O_RDONLY);
    if (fd == -1) {
        fprintf(stderr, "%s: opening network namespace: %s\n", argv[0], strerror(errno));
        return 1;
    }

    if (setns(fd, CLONE_NEWNET) == -1) {
        fprintf(stderr, "%s: setting network namespace: %s\n", argv[0], strerror(errno));
        return 1;
    }

    /* We are inside the namespace now; drop the descriptor so that it is
       not inherited by the program we exec below. */
    close(fd);

    /* Best effort: detach the mount on the namespace file.  The result
       is deliberately not checked here; the unlink() below is the step
       whose failure is reported. */
    umount2(nsPath, MNT_DETACH);

    if (unlink(nsPath) == -1) {
        fprintf(stderr, "%s: unlinking network namespace: %s\n", argv[0], strerror(errno));
        return 1;
    }

    /* FIXME: Remount /sys so that /sys/class/net reflects the
       interfaces visible in the network namespace.  This requires
       bind-mounting /sys/fs/cgroup etc. */

    execv(argv[2], argv + 2);

    /* execv() only returns on failure. */
    fprintf(stderr, "%s: running command: %s\n", argv[0], strerror(errno));
    return 1;
}