diff --git a/app/Console/Kernel.php b/app/Console/Kernel.php index 3ec59adb3..665553fcb 100644 --- a/app/Console/Kernel.php +++ b/app/Console/Kernel.php @@ -8,6 +8,7 @@ use App\Jobs\CheckHelperImageJob; use App\Jobs\CheckTraefikVersionJob; use App\Jobs\CleanupInstanceStuffsJob; use App\Jobs\CleanupOrphanedPreviewContainersJob; +use App\Jobs\CleanupStaleMultiplexedConnections; use App\Jobs\PullChangelog; use App\Jobs\PullTemplatesFromCDN; use App\Jobs\RegenerateSslCertJob; @@ -40,7 +41,7 @@ class Kernel extends ConsoleKernel $this->instanceTimezone = config('app.timezone'); } - // $this->scheduleInstance->job(new CleanupStaleMultiplexedConnections)->hourly(); + $this->scheduleInstance->job(new CleanupStaleMultiplexedConnections)->hourly()->onOneServer(); $this->scheduleInstance->command('cleanup:redis --clear-locks')->daily(); $this->scheduleInstance->command('sanctum:prune-expired --hours=1')->hourly()->onOneServer(); $this->scheduleInstance->job(new ApiTokenExpirationWarningJob)->hourly()->onOneServer(); diff --git a/app/Helpers/SshMultiplexingHelper.php b/app/Helpers/SshMultiplexingHelper.php index 4629df571..78084a157 100644 --- a/app/Helpers/SshMultiplexingHelper.php +++ b/app/Helpers/SshMultiplexingHelper.php @@ -4,6 +4,7 @@ namespace App\Helpers; use App\Models\PrivateKey; use App\Models\Server; +use Illuminate\Contracts\Cache\LockTimeoutException; use Illuminate\Support\Facades\Cache; use Illuminate\Support\Facades\Hash; use Illuminate\Support\Facades\Log; @@ -30,37 +31,99 @@ class SshMultiplexingHelper return false; } + // Fast path: a usable master already exists, no need to lock. + if (self::connectionIsReusable($server)) { + return true; + } + + // Slow path: establishing or refreshing the master. Serialize per server + // so concurrent workers do not each spawn their own master process, + // leaving orphaned non-master ssh connections that ControlPersist never reaps. + try { + return Cache::lock( + self::connectionLockKey($server), + config('constants.ssh.mux_lock_ttl') + )->block(config('constants.ssh.mux_lock_timeout'), function () use ($server) { + // Double-checked: another worker may have established the master + // while we were waiting for the lock. + if (self::connectionIsReusable($server)) { + return true; + } + + // A master exists but is stale or expired: close and re-establish. + if (self::masterConnectionExists($server)) { + return self::refreshMultiplexedConnection($server); + } + + return self::establishNewMultiplexedConnection($server); + }); + } catch (LockTimeoutException) { + Log::warning('SSH multiplexing lock timeout, falling back to non-multiplexed connection', [ + 'server' => $server->name ?? $server->ip, + ]); + + return false; + } catch (\Throwable $e) { + Log::warning('SSH multiplexing lock unavailable, falling back to non-multiplexed connection', [ + 'server' => $server->name ?? $server->ip, + 'error' => $e->getMessage(), + ]); + + return false; + } + } + + /** + * Per-server, per-host lock key for serializing master establishment. + * + * The mux socket is a host-local unix socket, so the lock is scoped to the + * current Coolify host: workers on the same host share a master and must + * serialize, while workers on other hosts manage their own masters and must + * not block on each other. + */ + private static function connectionLockKey(Server $server): string + { + return 'ssh_mux_lock_'.(gethostname() ?: 'unknown').'_'.$server->uuid; + } + + /** + * Check whether a multiplexed master connection currently exists for the server. + */ + private static function masterConnectionExists(Server $server): bool + { $sshConfig = self::serverSshConfiguration($server); $muxSocket = $sshConfig['muxFilename']; - // Check if connection exists $checkCommand = "ssh -O check -o ControlPath=$muxSocket "; if (data_get($server, 'settings.is_cloudflare_tunnel')) { $checkCommand .= '-o ProxyCommand="cloudflared access ssh --hostname %h" '; } $checkCommand .= self::escapedUserAtHost($server); - $process = Process::run($checkCommand); - if ($process->exitCode() !== 0) { - return self::establishNewMultiplexedConnection($server); + return Process::run($checkCommand)->exitCode() === 0; + } + + /** + * Determine whether the existing master connection can be reused as-is + * (it exists, has not exceeded its max age, and passes the health check). + */ + private static function connectionIsReusable(Server $server): bool + { + if (! self::masterConnectionExists($server)) { + return false; } - // Connection exists, ensure we have metadata for age tracking + // Existing connection but no metadata, store current time as fallback. if (self::getConnectionAge($server) === null) { - // Existing connection but no metadata, store current time as fallback self::storeConnectionMetadata($server); } - // Connection exists, check if it needs refresh due to age if (self::isConnectionExpired($server)) { - return self::refreshMultiplexedConnection($server); + return false; } - // Perform health check if enabled - if (config('constants.ssh.mux_health_check_enabled')) { - if (! self::isConnectionHealthy($server)) { - return self::refreshMultiplexedConnection($server); - } + if (config('constants.ssh.mux_health_check_enabled') && ! self::isConnectionHealthy($server)) { + return false; } return true; @@ -75,7 +138,10 @@ class SshMultiplexingHelper $serverInterval = config('constants.ssh.server_interval'); $muxPersistTime = config('constants.ssh.mux_persist_time'); - $establishCommand = "ssh -fNM -o ControlMaster=auto -o ControlPath=$muxSocket -o ControlPersist={$muxPersistTime} "; + // No -M: it forces master mode and overrides ControlMaster=auto. When a + // socket already exists -M leaves an orphaned non-master ssh -fN process + // that ControlPersist never reaps. ControlMaster=auto reuses instead. + $establishCommand = "ssh -fN -o ControlMaster=auto -o ControlPath=$muxSocket -o ControlPersist={$muxPersistTime} "; if (data_get($server, 'settings.is_cloudflare_tunnel')) { $establishCommand .= ' -o ProxyCommand="cloudflared access ssh --hostname %h" '; @@ -176,6 +242,10 @@ class SshMultiplexingHelper $ssh_command .= "-o ControlMaster=auto -o ControlPath=$muxSocket -o ControlPersist={$muxPersistTime} "; } } catch (\Exception $e) { + Log::warning('SSH multiplexing failed, falling back to non-multiplexed connection', [ + 'server' => $server->name ?? $server->ip, + 'error' => $e->getMessage(), + ]); // Continue without multiplexing } } diff --git a/app/Jobs/CleanupStaleMultiplexedConnections.php b/app/Jobs/CleanupStaleMultiplexedConnections.php index 6d49bee4b..0d3029c66 100644 --- a/app/Jobs/CleanupStaleMultiplexedConnections.php +++ b/app/Jobs/CleanupStaleMultiplexedConnections.php @@ -9,6 +9,7 @@ use Illuminate\Contracts\Queue\ShouldQueue; use Illuminate\Foundation\Bus\Dispatchable; use Illuminate\Queue\InteractsWithQueue; use Illuminate\Queue\SerializesModels; +use Illuminate\Support\Facades\Log; use Illuminate\Support\Facades\Process; use Illuminate\Support\Facades\Storage; @@ -20,6 +21,132 @@ class CleanupStaleMultiplexedConnections implements ShouldQueue { $this->cleanupStaleConnections(); $this->cleanupNonExistentServerConnections(); + $this->cleanupOrphanedSshProcesses(); + $this->cleanupOrphanedCloudflaredProcesses(); + } + + /** + * Kill backgrounded ssh master processes that lost the ControlPath socket + * race. Such processes are not masters, so ControlPersist never reaps them + * and they leak memory until the container restarts. A legitimate master + * always owns its socket file; an orphan has none. + * + * Processes younger than the minimum age are skipped: a freshly forked + * master creates its socket a few milliseconds after starting, so a young + * process with no socket may simply be mid-establish rather than orphaned. + */ + private function cleanupOrphanedSshProcesses(): void + { + $muxDir = storage_path('app/ssh/mux'); + $minAge = (int) config('constants.ssh.mux_orphan_min_age'); + + foreach ($this->listProcesses() as $process) { + // Backgrounded ssh master: current `ssh -fN` or legacy `ssh -fNM`. + if (! preg_match('#(^|/)ssh -fN#', $process['args'])) { + continue; + } + + // Only ever touch ssh processes pointing at Coolify's mux directory. + if (! preg_match('#ControlPath=('.preg_quote($muxDir, '#').'/\S+)#', $process['args'], $pathMatch)) { + continue; + } + + if ($process['etimes'] >= $minAge && ! file_exists($pathMatch[1])) { + $this->reapOrphan('ssh', $process); + } + } + } + + /** + * Kill orphaned `cloudflared access ssh` proxy processes. Each is spawned + * as the SSH ProxyCommand transport for a Cloudflare Tunnel server and must + * die with its parent ssh. When that ssh is killed or orphaned (e.g. a lost + * mux master), the cloudflared process can leak and accumulate. A legitimate + * proxy always has a live ssh parent; one without is safe to reap. + * + * Processes younger than the minimum age are skipped so a proxy whose parent + * ssh is still starting up, or a transient `ssh -O check` proxy mid-exit, is + * never mistaken for an orphan. + */ + private function cleanupOrphanedCloudflaredProcesses(): void + { + $minAge = (int) config('constants.ssh.mux_orphan_min_age'); + $processes = $this->listProcesses(); + + $sshPids = []; + foreach ($processes as $process) { + // The ssh binary itself, not `cloudflared access ssh` (space before ssh). + if (preg_match('#(^|/)ssh\s#', $process['args'])) { + $sshPids[$process['pid']] = true; + } + } + + foreach ($processes as $process) { + // `cloudflared access ssh`, never the `cloudflared tunnel` daemon. + if (! str_contains($process['args'], 'cloudflared access ssh')) { + continue; + } + + // Orphaned when no live ssh process is its parent. + if ($process['etimes'] >= $minAge && ! isset($sshPids[$process['ppid']])) { + $this->reapOrphan('cloudflared', $process); + } + } + } + + /** + * Reap a detected orphan process. When orphan reaping is disabled (the + * default), the orphan is only logged — a dry-run mode that lets operators + * verify what would be killed before enabling it for real. + * + * @param array{pid: string, ppid: string, etimes: int, args: string} $process + */ + private function reapOrphan(string $kind, array $process): void + { + if (! config('constants.ssh.mux_orphan_reap_enabled')) { + Log::info("Orphaned {$kind} process detected (dry-run, not killed)", [ + 'pid' => $process['pid'], + 'etimes' => $process['etimes'], + 'command' => $process['args'], + ]); + + return; + } + + Process::run('kill '.escapeshellarg($process['pid'])); + Log::info("Killed orphaned {$kind} process", [ + 'pid' => $process['pid'], + 'etimes' => $process['etimes'], + 'command' => $process['args'], + ]); + } + + /** + * Snapshot of running processes. + * + * @return list + */ + private function listProcesses(): array + { + $ps = Process::run('ps -ww -eo pid=,ppid=,etimes=,args='); + if ($ps->exitCode() !== 0) { + return []; + } + + $processes = []; + foreach (explode("\n", trim($ps->output())) as $line) { + if (! preg_match('/^\s*(\d+)\s+(\d+)\s+(\d+)\s+(.*)$/', $line, $matches)) { + continue; + } + $processes[] = [ + 'pid' => $matches[1], + 'ppid' => $matches[2], + 'etimes' => (int) $matches[3], + 'args' => $matches[4], + ]; + } + + return $processes; } private function cleanupStaleConnections() @@ -31,7 +158,7 @@ class CleanupStaleMultiplexedConnections implements ShouldQueue $server = Server::where('uuid', $serverUuid)->first(); if (! $server) { - $this->removeMultiplexFile($muxFile); + $this->removeMultiplexFile($muxFile, 'server_not_found'); continue; } @@ -41,14 +168,14 @@ class CleanupStaleMultiplexedConnections implements ShouldQueue $checkProcess = Process::run($checkCommand); if ($checkProcess->exitCode() !== 0) { - $this->removeMultiplexFile($muxFile); + $this->removeMultiplexFile($muxFile, 'connection_check_failed'); } else { $muxContent = Storage::disk('ssh-mux')->get($muxFile); $establishedAt = Carbon::parse(substr($muxContent, 37)); $expirationTime = $establishedAt->addSeconds(config('constants.ssh.mux_persist_time')); if (Carbon::now()->isAfter($expirationTime)) { - $this->removeMultiplexFile($muxFile); + $this->removeMultiplexFile($muxFile, 'expired'); } } } @@ -62,7 +189,7 @@ class CleanupStaleMultiplexedConnections implements ShouldQueue foreach ($muxFiles as $muxFile) { $serverUuid = $this->extractServerUuidFromMuxFile($muxFile); if (! in_array($serverUuid, $existingServerUuids)) { - $this->removeMultiplexFile($muxFile); + $this->removeMultiplexFile($muxFile, 'server_does_not_exist'); } } } @@ -72,11 +199,30 @@ class CleanupStaleMultiplexedConnections implements ShouldQueue return substr($muxFile, 4); } - private function removeMultiplexFile($muxFile) + /** + * Close and delete a stale mux socket file. When orphan reaping is disabled + * (the default), the file is only logged — a dry-run mode that lets operators + * verify what would be removed before enabling it for real. + */ + private function removeMultiplexFile(string $muxFile, string $reason): void { + if (! config('constants.ssh.mux_orphan_reap_enabled')) { + Log::info('Stale mux file detected (dry-run, not removed)', [ + 'file' => $muxFile, + 'reason' => $reason, + ]); + + return; + } + $muxSocket = "/var/www/html/storage/app/ssh/mux/{$muxFile}"; $closeCommand = "ssh -O exit -o ControlPath={$muxSocket} localhost 2>/dev/null"; Process::run($closeCommand); Storage::disk('ssh-mux')->delete($muxFile); + + Log::info('Removed stale mux file', [ + 'file' => $muxFile, + 'reason' => $reason, + ]); } } diff --git a/config/constants.php b/config/constants.php index 10db1085c..7721ff213 100644 --- a/config/constants.php +++ b/config/constants.php @@ -70,6 +70,10 @@ return [ 'mux_health_check_enabled' => env('SSH_MUX_HEALTH_CHECK_ENABLED', true), 'mux_health_check_timeout' => env('SSH_MUX_HEALTH_CHECK_TIMEOUT', 5), 'mux_max_age' => env('SSH_MUX_MAX_AGE', 1800), // 30 minutes + 'mux_lock_ttl' => env('SSH_MUX_LOCK_TTL', 30), // lock auto-release, seconds + 'mux_lock_timeout' => env('SSH_MUX_LOCK_TIMEOUT', 10), // max wait for lock, seconds + 'mux_orphan_min_age' => env('SSH_MUX_ORPHAN_MIN_AGE', 600), // min process age before reaping orphans, seconds + 'mux_orphan_reap_enabled' => env('SSH_MUX_ORPHAN_REAP_ENABLED', false), // false = dry-run, only log orphans 'connection_timeout' => 10, 'server_interval' => 20, 'command_timeout' => 3600, diff --git a/tests/Feature/SshMultiplexingLockTest.php b/tests/Feature/SshMultiplexingLockTest.php new file mode 100644 index 000000000..8d56f5235 --- /dev/null +++ b/tests/Feature/SshMultiplexingLockTest.php @@ -0,0 +1,219 @@ +create(); + $team = $user->teams()->first(); + + $privateKey = PrivateKey::create([ + 'name' => 'mux-test-key-'.uniqid(), + 'private_key' => "-----BEGIN OPENSSH PRIVATE KEY-----\n". + "b3BlbnNzaC1rZXktdjEAAAAABG5vbmUAAAAEbm9uZQAAAAAAAAABAAAAMwAAAAtzc2gtZW\n". + "QyNTUxOQAAACBbhpqHhqv6aI67Mj9abM3DVbmcfYhZAhC7ca4d9UCevAAAAJi/QySHv0Mk\n". + "hwAAAAtzc2gtZWQyNTUxOQAAACBbhpqHhqv6aI67Mj9abM3DVbmcfYhZAhC7ca4d9UCevA\n". + "AAAECBQw4jg1WRT2IGHMncCiZhURCts2s24HoDS0thHnnRKVuGmoeGq/pojrsyP1pszcNV\n". + "uZx9iFkCELtxrh31QJ68AAAAEXNhaWxANzZmZjY2ZDJlMmRkAQIDBA==\n". + '-----END OPENSSH PRIVATE KEY-----', + 'team_id' => $team->id, + ]); + + return Server::factory()->create([ + 'team_id' => $team->id, + 'private_key_id' => $privateKey->id, + ]); +} + +it('establishes a master with ssh -fN and never the orphan-prone ssh -fNM', function () { + config(['constants.ssh.mux_enabled' => true]); + $server = makeMuxServer(); + + Process::fake([ + '*-O check*' => Process::result(exitCode: 1), // no existing master + '*-fN *' => Process::result(exitCode: 0), // establish succeeds + ]); + + expect(SshMultiplexingHelper::ensureMultiplexedConnection($server))->toBeTrue(); + + Process::assertRan(fn ($process) => str_contains($process->command, 'ssh -fN ') + && ! str_contains($process->command, 'ssh -fNM')); +}); + +it('reuses an existing healthy master without spawning a new one', function () { + config([ + 'constants.ssh.mux_enabled' => true, + 'constants.ssh.mux_health_check_enabled' => true, + ]); + $server = makeMuxServer(); + + Process::fake([ + '*-O check*' => Process::result(exitCode: 0), + '*health_check_ok*' => Process::result(output: 'health_check_ok', exitCode: 0), + ]); + + expect(SshMultiplexingHelper::ensureMultiplexedConnection($server))->toBeTrue(); + + Process::assertNotRan(fn ($process) => str_contains($process->command, 'ssh -fN')); +}); + +it('does not spawn a master when the per-server lock is already held', function () { + config([ + 'constants.ssh.mux_enabled' => true, + 'constants.ssh.mux_lock_timeout' => 0, + ]); + $server = makeMuxServer(); + + Process::fake([ + '*-O check*' => Process::result(exitCode: 1), // forces the slow path + ]); + + // Simulate another worker on the same host holding the lock for this server. + $lockKey = 'ssh_mux_lock_'.(gethostname() ?: 'unknown').'_'.$server->uuid; + $held = Cache::lock($lockKey, 30); + expect($held->get())->toBeTrue(); + + expect(SshMultiplexingHelper::ensureMultiplexedConnection($server))->toBeFalse(); + + Process::assertNotRan(fn ($process) => str_contains($process->command, 'ssh -fN ')); + + $held->release(); +}); + +it('returns false and runs no ssh when multiplexing is disabled', function () { + config(['constants.ssh.mux_enabled' => false]); + $server = makeMuxServer(); + + Process::fake(); + + expect(SshMultiplexingHelper::ensureMultiplexedConnection($server))->toBeFalse(); + + Process::assertNothingRan(); +}); + +it('kills only old orphaned ssh masters whose control socket no longer exists', function () { + config(['constants.ssh.mux_orphan_reap_enabled' => true]); + $muxDir = storage_path('app/ssh/mux'); + File::ensureDirectoryExists($muxDir); + + $liveSocket = $muxDir.'/mux_live_'.uniqid(); + $orphanSocket = $muxDir.'/mux_orphan_'.uniqid(); + $youngSocket = $muxDir.'/mux_young_'.uniqid(); + File::put($liveSocket, 'x'); // live master owns its socket file; the others do not + + // Columns: pid ppid etimes args + Process::fake([ + 'ps*' => Process::result(output: "111 1 5000 ssh -fN -o ControlMaster=auto -o ControlPath={$liveSocket} root@1.2.3.4\n". + "222 1 5000 ssh -fN -o ControlMaster=auto -o ControlPath={$orphanSocket} root@1.2.3.4\n". + "333 1 30 ssh -fN -o ControlMaster=auto -o ControlPath={$youngSocket} root@1.2.3.4\n"), + 'kill*' => Process::result(exitCode: 0), + ]); + + $job = new CleanupStaleMultiplexedConnections; + $method = new ReflectionMethod($job, 'cleanupOrphanedSshProcesses'); + $method->setAccessible(true); + $method->invoke($job); + + // Old orphan: killed. + Process::assertRan(fn ($process) => str_contains($process->command, 'kill') && str_contains($process->command, '222')); + // Live master (socket exists): spared. + Process::assertNotRan(fn ($process) => str_contains($process->command, 'kill') && str_contains($process->command, '111')); + // Young process (may be mid-establish): spared despite missing socket. + Process::assertNotRan(fn ($process) => str_contains($process->command, 'kill') && str_contains($process->command, '333')); + + File::delete($liveSocket); +}); + +it('kills only old orphaned cloudflared proxies whose parent ssh is gone', function () { + config(['constants.ssh.mux_orphan_reap_enabled' => true]); + // pid 100 = live ssh master; 200 = its legit child; 300 = old orphan; + // 400 = young orphan (parent ssh may still be starting up). + Process::fake([ + 'ps*' => Process::result(output: "100 1 5000 ssh -fN -o ControlMaster=auto root@1.2.3.4\n". + "200 100 5000 cloudflared access ssh --hostname host.example.com\n". + "300 2176 5000 cloudflared access ssh --hostname host.example.com\n". + "400 2176 30 cloudflared access ssh --hostname host.example.com\n". + "2176 1 9000 /usr/bin/some-supervisor\n"), + 'kill*' => Process::result(exitCode: 0), + ]); + + $job = new CleanupStaleMultiplexedConnections; + $method = new ReflectionMethod($job, 'cleanupOrphanedCloudflaredProcesses'); + $method->setAccessible(true); + $method->invoke($job); + + // Old orphan (parent not ssh): killed. + Process::assertRan(fn ($process) => str_contains($process->command, 'kill') && str_contains($process->command, '300')); + // Legit proxy (parent ssh alive): spared. + Process::assertNotRan(fn ($process) => str_contains($process->command, 'kill') && str_contains($process->command, '200')); + // Young orphan: spared. + Process::assertNotRan(fn ($process) => str_contains($process->command, 'kill') && str_contains($process->command, '400')); +}); + +it('dry-run mode logs orphans but kills nothing when reaping is disabled', function () { + config(['constants.ssh.mux_orphan_reap_enabled' => false]); + $muxDir = storage_path('app/ssh/mux'); + File::ensureDirectoryExists($muxDir); + + $orphanSocket = $muxDir.'/mux_orphan_'.uniqid(); // no file: a real old orphan + + Process::fake([ + 'ps*' => Process::result(output: "222 1 5000 ssh -fN -o ControlMaster=auto -o ControlPath={$orphanSocket} root@1.2.3.4\n"), + 'kill*' => Process::result(exitCode: 0), + ]); + + $job = new CleanupStaleMultiplexedConnections; + $method = new ReflectionMethod($job, 'cleanupOrphanedSshProcesses'); + $method->setAccessible(true); + $method->invoke($job); + + // Orphan detected, but dry-run: nothing killed. + Process::assertNotRan(fn ($process) => str_contains($process->command, 'kill')); +}); + +it('removes mux files for non-existent servers when reaping is enabled', function () { + config(['constants.ssh.mux_orphan_reap_enabled' => true]); + Storage::fake('ssh-mux'); + $file = 'mux_ghost'.uniqid(); + Storage::disk('ssh-mux')->put($file, 'x'); + Process::fake(); // the `ssh -O exit` close command + + $job = new CleanupStaleMultiplexedConnections; + $method = new ReflectionMethod($job, 'cleanupNonExistentServerConnections'); + $method->setAccessible(true); + $method->invoke($job); + + expect(Storage::disk('ssh-mux')->exists($file))->toBeFalse(); +}); + +it('keeps mux files for non-existent servers in dry-run mode', function () { + config(['constants.ssh.mux_orphan_reap_enabled' => false]); + Storage::fake('ssh-mux'); + $file = 'mux_ghost'.uniqid(); + Storage::disk('ssh-mux')->put($file, 'x'); + Process::fake(); + + $job = new CleanupStaleMultiplexedConnections; + $method = new ReflectionMethod($job, 'cleanupNonExistentServerConnections'); + $method->setAccessible(true); + $method->invoke($job); + + expect(Storage::disk('ssh-mux')->exists($file))->toBeTrue(); + Process::assertNothingRan(); +});