{ config, lib, ... }: let
  pkgs = config.node.pkgs;

  commonConfig = ./common/acme/client;

  dnsServerIP = nodes: nodes.dnsserver.networking.primaryIPAddress;
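
  # DNS-01 hook handed to lego's "exec" provider. As the body below shows,
  # lego invokes it as dns-hook.sh (present|cleanup) <fqdn> <value>, and we
  # forward the TXT record to the challtestsrv management API on dnsserver.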
  dnsScript = nodes: let
    dnsAddress = dnsServerIP nodes;
  in pkgs.writeShellScript "dns-hook.sh" ''
    set -euo pipefail
    echo '[INFO]' "[$2]" 'dns-hook.sh' $*
    if [ "$1" = "present" ]; then
      ${pkgs.curl}/bin/curl --data '{"host": "'"$2"'", "value": "'"$3"'"}' http://${dnsAddress}:8055/set-txt
    else
      ${pkgs.curl}/bin/curl --data '{"host": "'"$2"'"}' http://${dnsAddress}:8055/clear-txt
    fi
  '';
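
  # Shared DNS-01 defaults for lego's exec provider. The EXEC_* intervals are
  # dropped to 1s so DNS validation does not slow the test down.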
  dnsConfig = nodes: {
    dnsProvider = "exec";
    dnsPropagationCheck = false;
    environmentFile = pkgs.writeText "wildcard.env" ''
      EXEC_PATH=${dnsScript nodes}
      EXEC_POLLING_INTERVAL=1
      EXEC_PROPAGATION_TIMEOUT=1
      EXEC_SEQUENCE_INTERVAL=1
    '';
  };

  documentRoot = pkgs.runCommand "docroot" {} ''
    mkdir -p "$out"
    echo hello world > "$out/index.html"
  '';

  vhostBase = {
    forceSSL = true;
    locations."/".root = documentRoot;
  };

  vhostBaseHttpd = {
    forceSSL = true;
    inherit documentRoot;
  };
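
  # Smallest possible ACME setup: lego answers the HTTP-01 challenge itself
  # on port 80 (listenHTTP), with no web server in front of it.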
  simpleConfig = {
    security.acme = {
      certs."http.example.test" = {
        listenHTTP = ":80";
      };
    };

    networking.firewall.allowedTCPPorts = [ 80 ];
  };

  # Base specialisation config for testing general ACME features
  webserverBasicConfig = {
    services.nginx.enable = true;
    services.nginx.virtualHosts."a.example.test" = vhostBase // {
      enableACME = true;
    };
  };

  # Generate specialisations for testing a web server
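  # For instance, the nginx invocation near the end of this file yields the
  # "nginx", "nginx-remove-alias" and "nginx-change-acme-conf" specialisations.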
  mkServerConfigs = { server, group, vhostBaseData, extraConfig ? {} }: let
    baseConfig = { nodes, config, specialConfig ? {} }: lib.mkMerge [
      {
        security.acme = {
          defaults = (dnsConfig nodes);
          # One manual wildcard cert
          certs."example.test" = {
            domain = "*.example.test";
          };
        };

        users.users."${config.services."${server}".user}".extraGroups = ["acme"];

        services."${server}" = {
          enable = true;
          virtualHosts = {
            # Run-of-the-mill vhost using HTTP-01 validation
            "${server}-http.example.test" = vhostBaseData // {
              serverAliases = [ "${server}-http-alias.example.test" ];
              enableACME = true;
            };

            # Another which inherits the DNS-01 config
            "${server}-dns.example.test" = vhostBaseData // {
              serverAliases = [ "${server}-dns-alias.example.test" ];
              enableACME = true;
              # Set acmeRoot to null instead of using the default of "/var/lib/acme/acme-challenge";
              # webroot + dnsProvider are mutually exclusive.
              acmeRoot = null;
            };

            # One using the wildcard certificate
            "${server}-wildcard.example.test" = vhostBaseData // {
              serverAliases = [ "${server}-wildcard-alias.example.test" ];
              useACMEHost = "example.test";
            };
          } // (lib.optionalAttrs (server == "nginx") {
            # The nginx module supports using a different key than the hostname
            different-key = vhostBaseData // {
              serverName = "${server}-different-key.example.test";
              serverAliases = [ "${server}-different-key-alias.example.test" ];
              enableACME = true;
            };
          });
        };

        # Used to determine if service reload was triggered
        systemd.targets."test-renew-${server}" = {
          wants = [ "acme-${server}-http.example.test.service" ];
          after = [ "acme-${server}-http.example.test.service" "${server}-config-reload.service" ];
        };
      }
      specialConfig
      extraConfig
    ];
  in {
    "${server}".configuration = { nodes, config, ... }: baseConfig {
      inherit nodes config;
    };

    # Test that the server reloads when an alias is removed (and subsequently that removal works in ACME)
    "${server}-remove-alias".configuration = { nodes, config, ... }: baseConfig {
      inherit nodes config;
      specialConfig = {
        # Remove an alias, but create a standalone vhost in its place for testing.
        # This configuration results in certificate errors as useACMEHost does not
        # imply appending extraDomains, and thus we can validate the SAN is removed.
        services."${server}" = {
          virtualHosts."${server}-http.example.test".serverAliases = lib.mkForce [];
          virtualHosts."${server}-http-alias.example.test" = vhostBaseData // {
            useACMEHost = "${server}-http.example.test";
          };
        };
      };
    };

    # Test that the server reloads when only the ACME configuration is changed.
    "${server}-change-acme-conf".configuration = { nodes, config, ... }: baseConfig {
      inherit nodes config;
      specialConfig = {
        security.acme.certs."${server}-http.example.test" = {
          keyType = "ec384";
          # Also test that postRun is exec'd as root
          postRun = "id | grep root";
        };
      };
    };
  };
in {
  name = "acme";

  meta = {
    maintainers = lib.teams.acme.members;
    # Hard timeout in seconds. Average run time is about 7 minutes.
    timeout = 1800;
  };

  nodes = {
    # The fake ACME server which will respond to client requests
    acme = { nodes, ... }: {
      imports = [ ./common/acme/server ];
      networking.nameservers = lib.mkForce [ (dnsServerIP nodes) ];
    };

    # A fake DNS server which can be configured with records as desired
    # Used to test DNS-01 challenge
    dnsserver = { nodes, ... }: {
      networking.firewall.allowedTCPPorts = [ 8055 53 ];
      networking.firewall.allowedUDPPorts = [ 53 ];
      systemd.services.pebble-challtestsrv = {
        enable = true;
        description = "Pebble ACME challenge test server";
        wantedBy = [ "network.target" ];
        serviceConfig = {
          ExecStart = "${pkgs.pebble}/bin/pebble-challtestsrv -dns01 ':53' -defaultIPv6 '' -defaultIPv4 '${nodes.webserver.networking.primaryIPAddress}'";
          # Required to bind on privileged ports.
          AmbientCapabilities = [ "CAP_NET_BIND_SERVICE" ];
        };
      };
    };

    # A web server which will be the node requesting certs
    webserver = { nodes, config, ... }: {
      imports = [ commonConfig ];
      networking.nameservers = lib.mkForce [ (dnsServerIP nodes) ];
      networking.firewall.allowedTCPPorts = [ 80 443 ];

      # OpenSSL will be used for more thorough certificate validation
      environment.systemPackages = [ pkgs.openssl ];

      # Set log level to info so that we can see when the service is reloaded
      services.nginx.logError = "stderr info";

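      # Each specialisation below is one self-contained test scenario; the test
      # script activates them with switch_to rather than rebooting the node.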
      specialisation = {
        # Tests HTTP-01 verification using Lego's built-in web server
        http01lego.configuration = simpleConfig;

        # Tests account hash generation with the default server from <= 23.11
        http01lego_legacyAccountHash.configuration = lib.mkMerge [
          simpleConfig
          {
            security.acme.defaults.server = lib.mkForce null;
          }
        ];

        renew.configuration = lib.mkMerge [
          simpleConfig
          {
            # Pebble provides 5-year-long certs, so validMinDays needs to be
            # higher than that to trigger a renewal
            security.acme.certs."http.example.test".validMinDays = 9999;
          }
        ];

        # Tests that account creds can be safely changed.
        accountchange.configuration = lib.mkMerge [
          simpleConfig
          {
            security.acme.certs."http.example.test".email = "admin@example.test";
          }
        ];

        # First specialisation used to test general ACME features
        general.configuration = { ... }: let
          caDomain = nodes.acme.test-support.acme.caDomain;
          email = config.security.acme.defaults.email;
          # Exit 99 to make it easier to track down if this is the reason a renewal failed
          accountCreateTester = ''
            test -e accounts/${caDomain}/${email}/account.json || exit 99
          '';
        in lib.mkMerge [
          webserverBasicConfig
          {
            # Used to test that account creation is collated into one service.
            # These should not run until after acme-finished-a.example.test.target
            systemd.services."b.example.test".preStart = accountCreateTester;
            systemd.services."c.example.test".preStart = accountCreateTester;

            services.nginx.virtualHosts."b.example.test" = vhostBase // {
              enableACME = true;
            };
            services.nginx.virtualHosts."c.example.test" = vhostBase // {
              enableACME = true;
            };
          }
        ];

        # Test OCSP stapling
        ocsp-stapling.configuration = { ... }: lib.mkMerge [
          webserverBasicConfig
          {
            security.acme.certs."a.example.test".ocspMustStaple = true;
            services.nginx.virtualHosts."a.example.test" = {
              extraConfig = ''
                ssl_stapling on;
                ssl_stapling_verify on;
              '';
            };
          }
        ];

        # Validate service relationships by adding a slow-starting service to nginx's wants.
        # Reproducer for https://github.com/NixOS/nixpkgs/issues/81842
        slow-startup.configuration = { ... }: lib.mkMerge [
          webserverBasicConfig
          {
            systemd.services.my-slow-service = {
              wantedBy = [ "multi-user.target" "nginx.service" ];
              before = [ "nginx.service" ];
              preStart = "sleep 5";
              script = "${pkgs.python3}/bin/python -m http.server";
            };

            services.nginx.virtualHosts."slow.example.test" = {
              forceSSL = true;
              enableACME = true;
              locations."/".proxyPass = "http://localhost:8000";
            };
          }
        ];
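
        # Smoke test for security.acme.maxConcurrentRenewals: with a limit of 1,
        # at most one acme-*.service may be activating at any time. The
        # ExecStartPre probes below assert no sibling renewal is in flight.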
        concurrency-limit.configuration = { pkgs, ... }: lib.mkMerge [
          webserverBasicConfig
          {
            security.acme.maxConcurrentRenewals = 1;

            services.nginx.virtualHosts = {
              "f.example.test" = vhostBase // {
                enableACME = true;
              };
              "g.example.test" = vhostBase // {
                enableACME = true;
              };
              "h.example.test" = vhostBase // {
                enableACME = true;
              };
            };

            systemd.services = {
              # Check for mutual exclusion of starting renew services
              "acme-f.example.test".serviceConfig.ExecStartPre = "+" + (pkgs.writeShellScript "test-f" ''
                test "$(systemctl is-active acme-{g,h}.example.test.service | grep activating | wc -l)" -le 0
              '');
              "acme-g.example.test".serviceConfig.ExecStartPre = "+" + (pkgs.writeShellScript "test-g" ''
                test "$(systemctl is-active acme-{f,h}.example.test.service | grep activating | wc -l)" -le 0
              '');
              "acme-h.example.test".serviceConfig.ExecStartPre = "+" + (pkgs.writeShellScript "test-h" ''
                test "$(systemctl is-active acme-{g,f}.example.test.service | grep activating | wc -l)" -le 0
              '');
            };
          }
        ];

        # Test lego's internal web server (the listenHTTP option)
        # Also tests the useRoot option
        lego-server.configuration = { ... }: {
          security.acme.useRoot = true;
          security.acme.certs."lego.example.test" = {
            listenHTTP = ":80";
            group = "nginx";
          };
          services.nginx.enable = true;
          services.nginx.virtualHosts."lego.example.test" = {
            useACMEHost = "lego.example.test";
            onlySSL = true;
          };
        };

      # Test compatibility with Caddy.
      # It only supports useACMEHost, hence not using mkServerConfigs
      } // (let
        baseCaddyConfig = { nodes, config, ... }: {
          security.acme = {
            defaults = (dnsConfig nodes);
            # One manual wildcard cert
            certs."example.test" = {
              domain = "*.example.test";
            };
          };

          users.users."${config.services.caddy.user}".extraGroups = ["acme"];

          services.caddy = {
            enable = true;
            virtualHosts."a.example.test" = {
              useACMEHost = "example.test";
              extraConfig = ''
                root * ${documentRoot}
              '';
            };
          };
        };
      in {
        caddy.configuration = baseCaddyConfig;

        # Test that the server reloads when only the ACME configuration is changed.
        "caddy-change-acme-conf".configuration = { nodes, config, ... }: lib.mkMerge [
          (baseCaddyConfig {
            inherit nodes config;
          })
          {
            security.acme.certs."example.test" = {
              keyType = "ec384";
            };
          }
        ];

      # Test compatibility with Nginx
      }) // (mkServerConfigs {
        server = "nginx";
        group = "nginx";
        vhostBaseData = vhostBase;
      })

      # Test compatibility with Apache HTTPD
      // (mkServerConfigs {
        server = "httpd";
        group = "wwwrun";
        vhostBaseData = vhostBaseHttpd;
        extraConfig = {
          services.httpd.adminAddr = config.security.acme.defaults.email;
        };
      });
    };

    # The client will be used to curl the webserver to validate configuration
    client = { nodes, ... }: {
      imports = [ commonConfig ];
      networking.nameservers = lib.mkForce [ (dnsServerIP nodes) ];

      # OpenSSL will be used for more thorough certificate validation
      environment.systemPackages = [ pkgs.openssl ];
    };
  };

  testScript = { nodes, ... }:
    let
      caDomain = nodes.acme.test-support.acme.caDomain;
    in
    # Note: wait_for_unit does not work for oneshot services that do not have RemainAfterExit=true;
    # this is because a oneshot goes from inactive => activating => inactive, and never
    # reaches the active state. Targets do not have this issue.
    ''
      import time

      TOTAL_RETRIES = 20

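      # Exponential-backoff helper shared by the retrying check_* functions
      # below; it aborts the test once TOTAL_RETRIES attempts have failed.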
      class BackoffTracker(object):
          delay = 1
          increment = 1

          def handle_fail(self, retries, message) -> int:
              assert retries < TOTAL_RETRIES, message

              print(f"Retrying in {self.delay}s, {retries + 1}/{TOTAL_RETRIES}")
              time.sleep(self.delay)

              # Only increment after the first try
              if retries == 0:
                  self.delay += self.increment
                  self.increment *= 2

              return retries + 1


      backoff = BackoffTracker()

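      # Activate the named specialisation via switch-to-configuration, falling
      # back to the /tmp/specialisation symlink when the currently running
      # system no longer carries that specialisation.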
      def switch_to(node, name, allow_fail=False):
          # On first switch, this will create a symlink to the current system so that we can
          # quickly switch between derivations
          root_specs = "/tmp/specialisation"
          node.execute(
              f"test -e {root_specs}"
              f" || ln -s $(readlink /run/current-system)/specialisation {root_specs}"
          )

          switcher_path = f"/run/current-system/specialisation/{name}/bin/switch-to-configuration"
          rc, _ = node.execute(f"test -e '{switcher_path}'")
          if rc > 0:
              switcher_path = f"/tmp/specialisation/{name}/bin/switch-to-configuration"

          if not allow_fail:
              node.succeed(f"{switcher_path} test")
          else:
              node.execute(f"{switcher_path} test")

      # Ensures the issuer of our cert matches the chain
      # and matches the issuer we expect it to be.
      # It's a good validation to ensure the cert.pem and fullchain.pem
      # are not still self-signed after verification
      def check_issuer(node, cert_name, issuer):
          for fname in ("cert.pem", "fullchain.pem"):
              actual_issuer = node.succeed(
                  f"openssl x509 -noout -issuer -in /var/lib/acme/{cert_name}/{fname}"
              ).partition("=")[2]
              print(f"{fname} issuer: {actual_issuer}")
              assert issuer.lower() in actual_issuer.lower()


      # Ensure the cert comes before the chain in fullchain.pem
      def check_fullchain(node, cert_name):
          subject_data = node.succeed(
              f"openssl crl2pkcs7 -nocrl -certfile /var/lib/acme/{cert_name}/fullchain.pem"
              " | openssl pkcs7 -print_certs -noout"
          )
          for line in subject_data.lower().split("\n"):
              if "subject" in line:
                  print(f"First subject in fullchain.pem: {line}")
                  assert cert_name.lower() in line
                  return

          assert False, f"Could not find the subject for {cert_name} in fullchain.pem"

      def check_connection(node, domain, retries=0):
          result = node.succeed(
              "openssl s_client -brief -verify 2 -CAfile /tmp/ca.crt"
              f" -servername {domain} -connect {domain}:443 < /dev/null 2>&1"
          )

          for line in result.lower().split("\n"):
              if "verification" in line and "error" in line:
                  retries = backoff.handle_fail(retries, f"Failed to connect to https://{domain}")
                  return check_connection(node, domain, retries)

      def check_connection_key_bits(node, domain, bits, retries=0):
          result = node.succeed(
              "openssl s_client -CAfile /tmp/ca.crt"
              f" -servername {domain} -connect {domain}:443 < /dev/null"
              " | openssl x509 -noout -text | grep -i Public-Key"
          )
          print("Key type:", result)

          if bits not in result:
              retries = backoff.handle_fail(retries, f"Did not find expected number of bits ({bits}) in key")
              return check_connection_key_bits(node, domain, bits, retries)

      def check_stapling(node, domain, retries=0):
          # Pebble doesn't provide a full OCSP responder, so just check the URL
          result = node.succeed(
              "openssl s_client -CAfile /tmp/ca.crt"
              f" -servername {domain} -connect {domain}:443 < /dev/null"
              " | openssl x509 -noout -ocsp_uri"
          )
          print("OCSP Responder URL:", result)

          if "${caDomain}:4002" not in result.lower():
              retries = backoff.handle_fail(retries, "OCSP Stapling check failed")
              return check_stapling(node, domain, retries)
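
      # Pebble generates a fresh root and intermediate CA on every startup, so
      # the trust bundle must be re-downloaded from its management API each run.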
      def download_ca_certs(node, retries=0):
          exit_code, _ = node.execute("curl https://${caDomain}:15000/roots/0 > /tmp/ca.crt")
          exit_code_2, _ = node.execute(
              "curl https://${caDomain}:15000/intermediate-keys/0 >> /tmp/ca.crt"
          )

          if exit_code + exit_code_2 > 0:
              retries = backoff.handle_fail(retries, "Failed to connect to pebble to download root CA certs")
              return download_ca_certs(node, retries)

      start_all()

      dnsserver.wait_for_unit("pebble-challtestsrv.service")
      client.wait_for_unit("default.target")

      client.succeed(
          'curl --data \'{"host": "${caDomain}", "addresses": ["${nodes.acme.networking.primaryIPAddress}"]}\' http://${dnsServerIP nodes}:8055/add-a'
      )

      acme.systemctl("start network-online.target")
      acme.wait_for_unit("network-online.target")
      acme.wait_for_unit("pebble.service")

      download_ca_certs(client)

      # Perform the http-01 test with lego's built-in web server first
      with subtest("Can request certificate with lego's built-in web server"):
          switch_to(webserver, "http01lego")
          webserver.wait_for_unit("acme-finished-http.example.test.target")
          check_fullchain(webserver, "http.example.test")
          check_issuer(webserver, "http.example.test", "pebble")

      # Perform account hash test
      with subtest("Assert that account hash didn't unexpectedly change"):
          hash = webserver.succeed("ls /var/lib/acme/.lego/accounts/")
          print("Account hash: " + hash)
          assert hash.strip() == "d590213ed52603e9128d"

      # Perform renewal test
      with subtest("Can renew certificates when they expire"):
          hash = webserver.succeed("sha256sum /var/lib/acme/http.example.test/cert.pem")
          switch_to(webserver, "renew")
          webserver.wait_for_unit("acme-finished-http.example.test.target")
          check_fullchain(webserver, "http.example.test")
          check_issuer(webserver, "http.example.test", "pebble")
          hash_after = webserver.succeed("sha256sum /var/lib/acme/http.example.test/cert.pem")
          assert hash != hash_after

      # Perform account change test
      with subtest("Handles email change correctly"):
          hash = webserver.succeed("sha256sum /var/lib/acme/http.example.test/cert.pem")
          switch_to(webserver, "accountchange")
          webserver.wait_for_unit("acme-finished-http.example.test.target")
          check_fullchain(webserver, "http.example.test")
          check_issuer(webserver, "http.example.test", "pebble")
          hash_after = webserver.succeed("sha256sum /var/lib/acme/http.example.test/cert.pem")
          # Has to do a full run to register the new account, which creates new certs.
          assert hash != hash_after

      # Perform general tests
      switch_to(webserver, "general")

      with subtest("Can request certificate with HTTP-01 challenge"):
          webserver.wait_for_unit("acme-finished-a.example.test.target")
          check_fullchain(webserver, "a.example.test")
          check_issuer(webserver, "a.example.test", "pebble")
          webserver.wait_for_unit("nginx.service")
          check_connection(client, "a.example.test")

      with subtest("Runs 1 cert for account creation before others"):
          webserver.wait_for_unit("acme-finished-b.example.test.target")
          webserver.wait_for_unit("acme-finished-c.example.test.target")
          check_connection(client, "b.example.test")
          check_connection(client, "c.example.test")

      with subtest("Certificates and accounts have safe + valid permissions"):
          # Nginx will set the group appropriately when enableACME is used
          group = "nginx"
          webserver.succeed(
              f"test $(stat -L -c '%a %U %G' /var/lib/acme/a.example.test/*.pem | tee /dev/stderr | grep '640 acme {group}' | wc -l) -eq 5"
          )
          webserver.succeed(
              f"test $(stat -L -c '%a %U %G' /var/lib/acme/.lego/a.example.test/**/a.example.test* | tee /dev/stderr | grep '600 acme {group}' | wc -l) -eq 4"
          )
          webserver.succeed(
              f"test $(stat -L -c '%a %U %G' /var/lib/acme/a.example.test | tee /dev/stderr | grep '750 acme {group}' | wc -l) -eq 1"
          )
          webserver.succeed(
              f"test $(find /var/lib/acme/accounts -type f -exec stat -L -c '%a %U %G' {{}} \\; | tee /dev/stderr | grep -v '600 acme {group}' | wc -l) -eq 0"
          )

      # Self-signed cert tests happen late so we aren't fighting the system init triggering cert renewal
      with subtest("Can generate valid selfsigned certs"):
          webserver.succeed("systemctl clean acme-a.example.test.service --what=state")
          webserver.succeed("systemctl start acme-selfsigned-a.example.test.service")
          check_fullchain(webserver, "a.example.test")
          check_issuer(webserver, "a.example.test", "minica")
          # Check self-signed permissions
          webserver.succeed(
              f"test $(stat -L -c '%a %U %G' /var/lib/acme/a.example.test/*.pem | tee /dev/stderr | grep '640 acme {group}' | wc -l) -eq 5"
          )
          # Will succeed if nginx can load the certs
          webserver.succeed("systemctl start nginx-config-reload.service")

      with subtest("Correctly implements OCSP stapling"):
          switch_to(webserver, "ocsp-stapling")
          webserver.wait_for_unit("acme-finished-a.example.test.target")
          check_stapling(client, "a.example.test")

      with subtest("Can request certificate with HTTP-01 using lego's internal web server"):
          switch_to(webserver, "lego-server")
          webserver.wait_for_unit("acme-finished-lego.example.test.target")
          webserver.wait_for_unit("nginx.service")
          webserver.succeed("echo HENLO && systemctl cat nginx.service")
          webserver.succeed("test \"$(stat -c '%U' /var/lib/acme/* | uniq)\" = \"root\"")
          check_connection(client, "a.example.test")
          check_connection(client, "lego.example.test")

      with subtest("Can request certificate with HTTP-01 when nginx startup is delayed"):
          webserver.execute("systemctl stop nginx")
          switch_to(webserver, "slow-startup")
          webserver.wait_for_unit("acme-finished-slow.example.test.target")
          check_issuer(webserver, "slow.example.test", "pebble")
          webserver.wait_for_unit("nginx.service")
          check_connection(client, "slow.example.test")

      with subtest("Can limit concurrency of running renewals"):
          switch_to(webserver, "concurrency-limit")
          webserver.wait_for_unit("acme-finished-f.example.test.target")
          webserver.wait_for_unit("acme-finished-g.example.test.target")
          webserver.wait_for_unit("acme-finished-h.example.test.target")
          check_connection(client, "f.example.test")
          check_connection(client, "g.example.test")
          check_connection(client, "h.example.test")

      with subtest("Works with caddy"):
          switch_to(webserver, "caddy")
          webserver.wait_for_unit("acme-finished-example.test.target")
          webserver.wait_for_unit("caddy.service")
          # FIXME reloading caddy is not sufficient to load new certs.
          # Restart it manually until this is fixed.
          webserver.succeed("systemctl restart caddy.service")
          check_connection(client, "a.example.test")

      with subtest("security.acme changes reflect on caddy"):
          switch_to(webserver, "caddy-change-acme-conf")
          webserver.wait_for_unit("acme-finished-example.test.target")
          webserver.wait_for_unit("caddy.service")
          # FIXME reloading caddy is not sufficient to load new certs.
          # Restart it manually until this is fixed.
          webserver.succeed("systemctl restart caddy.service")
          check_connection_key_bits(client, "a.example.test", "384")

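      # Walk the full mkServerConfigs matrix for each supported web server. The
      # wildcard vhost has no acme-finished target of its own because it reuses
      # the manually-defined example.test certificate.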
      common_domains = ["http", "dns", "wildcard"]
      for server, logsrc, domains in [
          ("nginx", "journalctl -n 30 -u nginx.service", common_domains + ["different-key"]),
          ("httpd", "tail -n 30 /var/log/httpd/*.log", common_domains),
      ]:
          wait_for_server = lambda: webserver.wait_for_unit(f"{server}.service")
          with subtest(f"Works with {server}"):
              try:
                  switch_to(webserver, server)
                  for domain in domains:
                      if domain != "wildcard":
                          webserver.wait_for_unit(
                              f"acme-finished-{server}-{domain}.example.test.target"
                          )
              except Exception as err:
                  _, output = webserver.execute(
                      f"{logsrc} && ls -al /var/lib/acme/acme-challenge"
                  )
                  print(output)
                  raise err

              wait_for_server()

              for domain in domains:
                  if domain != "wildcard":
                      check_issuer(webserver, f"{server}-{domain}.example.test", "pebble")
              for domain in domains:
                  check_connection(client, f"{server}-{domain}.example.test")
                  check_connection(client, f"{server}-{domain}-alias.example.test")

          test_domain = f"{server}-{domains[0]}.example.test"

          with subtest(f"Can reload {server} when timer triggers renewal"):
              # Switch to selfsigned first
              webserver.succeed(f"systemctl clean acme-{test_domain}.service --what=state")
              webserver.succeed(f"systemctl start acme-selfsigned-{test_domain}.service")
              check_issuer(webserver, test_domain, "minica")
              webserver.succeed(f"systemctl start {server}-config-reload.service")
              webserver.succeed(f"systemctl start test-renew-{server}.target")
              check_issuer(webserver, test_domain, "pebble")
              check_connection(client, test_domain)

          with subtest("Can remove an alias from a domain + cert is updated"):
              test_alias = f"{server}-{domains[0]}-alias.example.test"
              switch_to(webserver, f"{server}-remove-alias")
              webserver.wait_for_unit(f"acme-finished-{test_domain}.target")
              wait_for_server()
              check_connection(client, test_domain)
              rc, _s = client.execute(
                  f"openssl s_client -CAfile /tmp/ca.crt -connect {test_alias}:443"
                  " </dev/null 2>/dev/null | openssl x509 -noout -text"
                  f" | grep DNS: | grep {test_alias}"
              )
              assert rc > 0, "Removed extraDomainName was not removed from the cert"

          with subtest("security.acme changes reflect on web server"):
              # Switch back to the normal server config first, to reset everything.
              switch_to(webserver, server)
              wait_for_server()
              switch_to(webserver, f"{server}-change-acme-conf")
              webserver.wait_for_unit(f"acme-finished-{test_domain}.target")
              wait_for_server()
              check_connection_key_bits(client, test_domain, "384")

      # Perform the http-01 w/ lego test again, but using the pre-24.05 account hashing
      # (see https://github.com/NixOS/nixpkgs/pull/317257)
      with subtest("Check account hashing compatibility with pre-24.05 settings"):
          webserver.succeed("rm -rf /var/lib/acme/.lego/accounts/*")
          switch_to(webserver, "http01lego_legacyAccountHash", allow_fail=True)
          # The unit is failed, but in a way that throws no exception:
          try:
              webserver.wait_for_unit("acme-finished-http.example.test.target")
          except Exception:
              # The unit is allowed, or even expected, to fail due to not being able to
              # reach the actual letsencrypt server. We only use it for serialising the
              # test execution, such that the account check is done after the service run
              # involving the account creation has been executed at least once.
              pass
          hash = webserver.succeed("ls /var/lib/acme/.lego/accounts/")
          print("Account hash: " + hash)
          assert hash.strip() == "1ccf607d9aa280e9af00"
    '';
}