diff --git a/app/Helpers/SshRetryHandler.php b/app/Helpers/SshRetryHandler.php new file mode 100644 index 000000000..aaaf4252a --- /dev/null +++ b/app/Helpers/SshRetryHandler.php @@ -0,0 +1,34 @@ +executeWithSshRetry($callback, $context, $throwError); + } +} diff --git a/app/Models/Server.php b/app/Models/Server.php index 0f92bd390..736a59be4 100644 --- a/app/Models/Server.php +++ b/app/Models/Server.php @@ -1082,6 +1082,7 @@ $schema://$host { public function validateConnection(bool $justCheckingNewKey = false) { + ray('validateConnection', $this->id); $this->disableSshMux(); if ($this->skipServer()) { diff --git a/app/Traits/ExecuteRemoteCommand.php b/app/Traits/ExecuteRemoteCommand.php index a228a5d10..0e7961368 100644 --- a/app/Traits/ExecuteRemoteCommand.php +++ b/app/Traits/ExecuteRemoteCommand.php @@ -11,6 +11,8 @@ use Illuminate\Support\Facades\Process; trait ExecuteRemoteCommand { + use SshRetryable; + public ?string $save = null; public static int $batch_counter = 0; @@ -43,76 +45,169 @@ trait ExecuteRemoteCommand $command = parseLineForSudo($command, $this->server); } } - $remote_command = SshMultiplexingHelper::generateSshCommand($this->server, $command); - $process = Process::timeout(3600)->idleTimeout(3600)->start($remote_command, function (string $type, string $output) use ($command, $hidden, $customType, $append) { - $output = str($output)->trim(); - if ($output->startsWith('╔')) { - $output = "\n".$output; - } - // Sanitize output to ensure valid UTF-8 encoding before JSON encoding - $sanitized_output = sanitize_utf8_text($output); - - $new_log_entry = [ - 'command' => remove_iip($command), - 'output' => remove_iip($sanitized_output), - 'type' => $customType ?? $type === 'err' ? 'stderr' : 'stdout', - 'timestamp' => Carbon::now('UTC'), - 'hidden' => $hidden, - 'batch' => static::$batch_counter, - ]; - if (! $this->application_deployment_queue->logs) { - $new_log_entry['order'] = 1; - } else { - try { - $previous_logs = json_decode($this->application_deployment_queue->logs, associative: true, flags: JSON_THROW_ON_ERROR); - } catch (\JsonException $e) { - // If existing logs are corrupted, start fresh - $previous_logs = []; - $new_log_entry['order'] = 1; - } - if (is_array($previous_logs)) { - $new_log_entry['order'] = count($previous_logs) + 1; - } else { - $previous_logs = []; - $new_log_entry['order'] = 1; - } - } - $previous_logs[] = $new_log_entry; + $maxRetries = config('constants.ssh.max_retries'); + $attempt = 0; + $lastError = null; + $commandExecuted = false; + while ($attempt < $maxRetries && ! $commandExecuted) { try { - $this->application_deployment_queue->logs = json_encode($previous_logs, flags: JSON_THROW_ON_ERROR); - } catch (\JsonException $e) { - // If JSON encoding still fails, use fallback with invalid sequences replacement - $this->application_deployment_queue->logs = json_encode($previous_logs, flags: JSON_INVALID_UTF8_SUBSTITUTE); - } + $this->executeCommandWithProcess($command, $hidden, $customType, $append, $ignore_errors); + $commandExecuted = true; + } catch (\RuntimeException $e) { + $lastError = $e; + $errorMessage = $e->getMessage(); + // Only retry if it's an SSH connection error and we haven't exhausted retries + if ($this->isRetryableSshError($errorMessage) && $attempt < $maxRetries - 1) { + $attempt++; + $delay = $this->calculateRetryDelay($attempt - 1); - $this->application_deployment_queue->save(); + // Track SSH retry event in Sentry + $this->trackSshRetryEvent($attempt, $maxRetries, $delay, $errorMessage, [ + 'server' => $this->server->name ?? $this->server->ip ?? 'unknown', + 'command' => remove_iip($command), + 'trait' => 'ExecuteRemoteCommand', + ]); - if ($this->save) { - if (data_get($this->saved_outputs, $this->save, null) === null) { - data_set($this->saved_outputs, $this->save, str()); - } - if ($append) { - $this->saved_outputs[$this->save] .= str($sanitized_output)->trim(); - $this->saved_outputs[$this->save] = str($this->saved_outputs[$this->save]); + // Add log entry for the retry + if (isset($this->application_deployment_queue)) { + $this->addRetryLogEntry($attempt, $maxRetries, $delay, $errorMessage); + } + + sleep($delay); } else { - $this->saved_outputs[$this->save] = str($sanitized_output)->trim(); + // Not retryable or max retries reached + throw $e; } } - }); - $this->application_deployment_queue->update([ - 'current_process_id' => $process->id(), - ]); + } - $process_result = $process->wait(); - if ($process_result->exitCode() !== 0) { - if (! $ignore_errors) { - $this->application_deployment_queue->status = ApplicationDeploymentStatus::FAILED->value; - $this->application_deployment_queue->save(); - throw new \RuntimeException($process_result->errorOutput()); - } + // If we exhausted all retries and still failed + if (! $commandExecuted && $lastError) { + throw $lastError; } }); } + + /** + * Execute the actual command with process handling + */ + private function executeCommandWithProcess($command, $hidden, $customType, $append, $ignore_errors) + { + $remote_command = SshMultiplexingHelper::generateSshCommand($this->server, $command); + $process = Process::timeout(3600)->idleTimeout(3600)->start($remote_command, function (string $type, string $output) use ($command, $hidden, $customType, $append) { + $output = str($output)->trim(); + if ($output->startsWith('╔')) { + $output = "\n".$output; + } + + // Sanitize output to ensure valid UTF-8 encoding before JSON encoding + $sanitized_output = sanitize_utf8_text($output); + + $new_log_entry = [ + 'command' => remove_iip($command), + 'output' => remove_iip($sanitized_output), + 'type' => $customType ?? $type === 'err' ? 'stderr' : 'stdout', + 'timestamp' => Carbon::now('UTC'), + 'hidden' => $hidden, + 'batch' => static::$batch_counter, + ]; + if (! $this->application_deployment_queue->logs) { + $new_log_entry['order'] = 1; + } else { + try { + $previous_logs = json_decode($this->application_deployment_queue->logs, associative: true, flags: JSON_THROW_ON_ERROR); + } catch (\JsonException $e) { + // If existing logs are corrupted, start fresh + $previous_logs = []; + $new_log_entry['order'] = 1; + } + if (is_array($previous_logs)) { + $new_log_entry['order'] = count($previous_logs) + 1; + } else { + $previous_logs = []; + $new_log_entry['order'] = 1; + } + } + $previous_logs[] = $new_log_entry; + + try { + $this->application_deployment_queue->logs = json_encode($previous_logs, flags: JSON_THROW_ON_ERROR); + } catch (\JsonException $e) { + // If JSON encoding still fails, use fallback with invalid sequences replacement + $this->application_deployment_queue->logs = json_encode($previous_logs, flags: JSON_INVALID_UTF8_SUBSTITUTE); + } + + $this->application_deployment_queue->save(); + + if ($this->save) { + if (data_get($this->saved_outputs, $this->save, null) === null) { + data_set($this->saved_outputs, $this->save, str()); + } + if ($append) { + $this->saved_outputs[$this->save] .= str($sanitized_output)->trim(); + $this->saved_outputs[$this->save] = str($this->saved_outputs[$this->save]); + } else { + $this->saved_outputs[$this->save] = str($sanitized_output)->trim(); + } + } + }); + $this->application_deployment_queue->update([ + 'current_process_id' => $process->id(), + ]); + + $process_result = $process->wait(); + if ($process_result->exitCode() !== 0) { + if (! $ignore_errors) { + $this->application_deployment_queue->status = ApplicationDeploymentStatus::FAILED->value; + $this->application_deployment_queue->save(); + throw new \RuntimeException($process_result->errorOutput()); + } + } + } + + /** + * Add a log entry for SSH retry attempts + */ + private function addRetryLogEntry(int $attempt, int $maxRetries, int $delay, string $errorMessage) + { + $retryMessage = "SSH connection failed. Retrying... (Attempt {$attempt}/{$maxRetries}, waiting {$delay}s)\nError: {$errorMessage}"; + + $new_log_entry = [ + 'output' => remove_iip($retryMessage), + 'type' => 'stdout', + 'timestamp' => Carbon::now('UTC'), + 'hidden' => false, + 'batch' => static::$batch_counter, + ]; + + if (! $this->application_deployment_queue->logs) { + $new_log_entry['order'] = 1; + $previous_logs = []; + } else { + try { + $previous_logs = json_decode($this->application_deployment_queue->logs, associative: true, flags: JSON_THROW_ON_ERROR); + } catch (\JsonException $e) { + $previous_logs = []; + $new_log_entry['order'] = 1; + } + if (is_array($previous_logs)) { + $new_log_entry['order'] = count($previous_logs) + 1; + } else { + $previous_logs = []; + $new_log_entry['order'] = 1; + } + } + + $previous_logs[] = $new_log_entry; + + try { + $this->application_deployment_queue->logs = json_encode($previous_logs, flags: JSON_THROW_ON_ERROR); + } catch (\JsonException $e) { + $this->application_deployment_queue->logs = json_encode($previous_logs, flags: JSON_INVALID_UTF8_SUBSTITUTE); + } + + $this->application_deployment_queue->save(); + } } diff --git a/app/Traits/SshRetryable.php b/app/Traits/SshRetryable.php new file mode 100644 index 000000000..a26481056 --- /dev/null +++ b/app/Traits/SshRetryable.php @@ -0,0 +1,174 @@ +getMessage(); + + // Check if it's retryable and not the last attempt + if ($this->isRetryableSshError($lastErrorMessage) && $attempt < $maxRetries - 1) { + $delay = $this->calculateRetryDelay($attempt); + + // Track SSH retry event in Sentry + $this->trackSshRetryEvent($attempt + 1, $maxRetries, $delay, $lastErrorMessage, $context); + + // Add deployment log if available (for ExecuteRemoteCommand trait) + if (isset($this->application_deployment_queue) && method_exists($this, 'addRetryLogEntry')) { + $this->addRetryLogEntry($attempt + 1, $maxRetries, $delay, $lastErrorMessage); + } + + sleep($delay); + + continue; + } + + // Not retryable or max retries reached + break; + } + } + + // All retries exhausted + if ($attempt >= $maxRetries) { + Log::error('SSH operation failed after all retries', array_merge($context, [ + 'attempts' => $attempt, + 'error' => $lastErrorMessage, + ])); + } + + if ($throwError && $lastError) { + // If the error message is empty, provide a more meaningful one + if (empty($lastErrorMessage) || trim($lastErrorMessage) === '') { + $contextInfo = isset($context['server']) ? " to server {$context['server']}" : ''; + $attemptInfo = $attempt > 1 ? " after {$attempt} attempts" : ''; + throw new \RuntimeException("SSH connection failed{$contextInfo}{$attemptInfo}", $lastError->getCode()); + } + throw $lastError; + } + + return null; + } + + /** + * Track SSH retry event in Sentry + */ + protected function trackSshRetryEvent(int $attempt, int $maxRetries, int $delay, string $errorMessage, array $context = []): void + { + // Only track in production/cloud instances + if (isDev() || ! config('constants.sentry.sentry_dsn')) { + return; + } + + try { + app('sentry')->captureMessage( + 'SSH connection retry triggered', + \Sentry\Severity::warning(), + [ + 'extra' => [ + 'attempt' => $attempt, + 'max_retries' => $maxRetries, + 'delay_seconds' => $delay, + 'error_message' => $errorMessage, + 'context' => $context, + 'retryable_error' => true, + ], + 'tags' => [ + 'component' => 'ssh_retry', + 'error_type' => 'connection_retry', + ], + ] + ); + } catch (\Throwable $e) { + // Don't let Sentry tracking errors break the SSH retry flow + Log::warning('Failed to track SSH retry event in Sentry', [ + 'error' => $e->getMessage(), + 'original_attempt' => $attempt, + ]); + } + } +} diff --git a/bootstrap/helpers/remoteProcess.php b/bootstrap/helpers/remoteProcess.php index 6c1e2beab..b5bdeff49 100644 --- a/bootstrap/helpers/remoteProcess.php +++ b/bootstrap/helpers/remoteProcess.php @@ -60,15 +60,28 @@ function remote_process( function instant_scp(string $source, string $dest, Server $server, $throwError = true) { - $scp_command = SshMultiplexingHelper::generateScpCommand($server, $source, $dest); - $process = Process::timeout(config('constants.ssh.command_timeout'))->run($scp_command); - $output = trim($process->output()); - $exitCode = $process->exitCode(); - if ($exitCode !== 0) { - return $throwError ? excludeCertainErrors($process->errorOutput(), $exitCode) : null; - } + return \App\Helpers\SshRetryHandler::retry( + function () use ($source, $dest, $server) { + $scp_command = SshMultiplexingHelper::generateScpCommand($server, $source, $dest); + $process = Process::timeout(config('constants.ssh.command_timeout'))->run($scp_command); - return $output === 'null' ? null : $output; + $output = trim($process->output()); + $exitCode = $process->exitCode(); + + if ($exitCode !== 0) { + excludeCertainErrors($process->errorOutput(), $exitCode); + } + + return $output === 'null' ? null : $output; + }, + [ + 'server' => $server->ip, + 'source' => $source, + 'dest' => $dest, + 'function' => 'instant_scp', + ], + $throwError + ); } function instant_remote_process_with_timeout(Collection|array $command, Server $server, bool $throwError = true, bool $no_sudo = false): ?string @@ -79,25 +92,30 @@ function instant_remote_process_with_timeout(Collection|array $command, Server $ } $command_string = implode("\n", $command); - // $start_time = microtime(true); - $sshCommand = SshMultiplexingHelper::generateSshCommand($server, $command_string); - $process = Process::timeout(30)->run($sshCommand); - // $end_time = microtime(true); + return \App\Helpers\SshRetryHandler::retry( + function () use ($server, $command_string) { + $sshCommand = SshMultiplexingHelper::generateSshCommand($server, $command_string); + $process = Process::timeout(30)->run($sshCommand); - // $execution_time = ($end_time - $start_time) * 1000; // Convert to milliseconds - // ray('SSH command execution time:', $execution_time.' ms')->orange(); + $output = trim($process->output()); + $exitCode = $process->exitCode(); - $output = trim($process->output()); - $exitCode = $process->exitCode(); + if ($exitCode !== 0) { + excludeCertainErrors($process->errorOutput(), $exitCode); + } - if ($exitCode !== 0) { - return $throwError ? excludeCertainErrors($process->errorOutput(), $exitCode) : null; - } + // Sanitize output to ensure valid UTF-8 encoding + $output = $output === 'null' ? null : sanitize_utf8_text($output); - // Sanitize output to ensure valid UTF-8 encoding - $output = $output === 'null' ? null : sanitize_utf8_text($output); - - return $output; + return $output; + }, + [ + 'server' => $server->ip, + 'command_preview' => substr($command_string, 0, 100), + 'function' => 'instant_remote_process_with_timeout', + ], + $throwError + ); } function instant_remote_process(Collection|array $command, Server $server, bool $throwError = true, bool $no_sudo = false): ?string @@ -108,25 +126,30 @@ function instant_remote_process(Collection|array $command, Server $server, bool } $command_string = implode("\n", $command); - // $start_time = microtime(true); - $sshCommand = SshMultiplexingHelper::generateSshCommand($server, $command_string); - $process = Process::timeout(config('constants.ssh.command_timeout'))->run($sshCommand); - // $end_time = microtime(true); + return \App\Helpers\SshRetryHandler::retry( + function () use ($server, $command_string) { + $sshCommand = SshMultiplexingHelper::generateSshCommand($server, $command_string); + $process = Process::timeout(config('constants.ssh.command_timeout'))->run($sshCommand); - // $execution_time = ($end_time - $start_time) * 1000; // Convert to milliseconds - // ray('SSH command execution time:', $execution_time.' ms')->orange(); + $output = trim($process->output()); + $exitCode = $process->exitCode(); - $output = trim($process->output()); - $exitCode = $process->exitCode(); + if ($exitCode !== 0) { + excludeCertainErrors($process->errorOutput(), $exitCode); + } - if ($exitCode !== 0) { - return $throwError ? excludeCertainErrors($process->errorOutput(), $exitCode) : null; - } + // Sanitize output to ensure valid UTF-8 encoding + $output = $output === 'null' ? null : sanitize_utf8_text($output); - // Sanitize output to ensure valid UTF-8 encoding - $output = $output === 'null' ? null : sanitize_utf8_text($output); - - return $output; + return $output; + }, + [ + 'server' => $server->ip, + 'command_preview' => substr($command_string, 0, 100), + 'function' => 'instant_remote_process', + ], + $throwError + ); } function excludeCertainErrors(string $errorOutput, ?int $exitCode = null) @@ -136,11 +159,18 @@ function excludeCertainErrors(string $errorOutput, ?int $exitCode = null) 'Could not resolve hostname', ]); $ignored = $ignoredErrors->contains(fn ($error) => Str::contains($errorOutput, $error)); + + // Ensure we always have a meaningful error message + $errorMessage = trim($errorOutput); + if (empty($errorMessage)) { + $errorMessage = "SSH command failed with exit code: $exitCode"; + } + if ($ignored) { // TODO: Create new exception and disable in sentry - throw new \RuntimeException($errorOutput, $exitCode); + throw new \RuntimeException($errorMessage, $exitCode); } - throw new \RuntimeException($errorOutput, $exitCode); + throw new \RuntimeException($errorMessage, $exitCode); } function decode_remote_command_output(?ApplicationDeploymentQueue $application_deployment_queue = null): Collection diff --git a/config/constants.php b/config/constants.php index 9c1b8b274..652af5ff4 100644 --- a/config/constants.php +++ b/config/constants.php @@ -62,6 +62,10 @@ return [ 'connection_timeout' => 10, 'server_interval' => 20, 'command_timeout' => 7200, + 'max_retries' => env('SSH_MAX_RETRIES', 3), + 'retry_base_delay' => env('SSH_RETRY_BASE_DELAY', 2), // seconds + 'retry_max_delay' => env('SSH_RETRY_MAX_DELAY', 30), // seconds + 'retry_multiplier' => env('SSH_RETRY_MULTIPLIER', 2), ], 'invitation' => [ diff --git a/tests/Unit/SshRetryMechanismTest.php b/tests/Unit/SshRetryMechanismTest.php new file mode 100644 index 000000000..23e1b867f --- /dev/null +++ b/tests/Unit/SshRetryMechanismTest.php @@ -0,0 +1,189 @@ +assertTrue(class_exists(\App\Helpers\SshRetryHandler::class)); + } + + public function test_ssh_retryable_trait_exists() + { + $this->assertTrue(trait_exists(\App\Traits\SshRetryable::class)); + } + + public function test_retry_on_ssh_connection_errors() + { + $handler = new class + { + use SshRetryable; + + // Make methods public for testing + public function test_is_retryable_ssh_error($error) + { + return $this->isRetryableSshError($error); + } + }; + + // Test various SSH error patterns + $sshErrors = [ + 'kex_exchange_identification: read: Connection reset by peer', + 'Connection refused', + 'Connection timed out', + 'ssh_exchange_identification: Connection closed by remote host', + 'Broken pipe', + 'No route to host', + 'Network is unreachable', + ]; + + foreach ($sshErrors as $error) { + $this->assertTrue( + $handler->test_is_retryable_ssh_error($error), + "Failed to identify as retryable: $error" + ); + } + } + + public function test_non_ssh_errors_are_not_retryable() + { + $handler = new class + { + use SshRetryable; + + // Make methods public for testing + public function test_is_retryable_ssh_error($error) + { + return $this->isRetryableSshError($error); + } + }; + + // Test non-SSH errors + $nonSshErrors = [ + 'Command not found', + 'Permission denied', + 'File not found', + 'Syntax error', + 'Invalid argument', + ]; + + foreach ($nonSshErrors as $error) { + $this->assertFalse( + $handler->test_is_retryable_ssh_error($error), + "Incorrectly identified as retryable: $error" + ); + } + } + + public function test_exponential_backoff_calculation() + { + $handler = new class + { + use SshRetryable; + + // Make method public for testing + public function test_calculate_retry_delay($attempt) + { + return $this->calculateRetryDelay($attempt); + } + }; + + // Test with default config values + config(['constants.ssh.retry_base_delay' => 2]); + config(['constants.ssh.retry_max_delay' => 30]); + config(['constants.ssh.retry_multiplier' => 2]); + + // Attempt 0: 2 seconds + $this->assertEquals(2, $handler->test_calculate_retry_delay(0)); + + // Attempt 1: 4 seconds + $this->assertEquals(4, $handler->test_calculate_retry_delay(1)); + + // Attempt 2: 8 seconds + $this->assertEquals(8, $handler->test_calculate_retry_delay(2)); + + // Attempt 3: 16 seconds + $this->assertEquals(16, $handler->test_calculate_retry_delay(3)); + + // Attempt 4: Should be capped at 30 seconds + $this->assertEquals(30, $handler->test_calculate_retry_delay(4)); + + // Attempt 5: Should still be capped at 30 seconds + $this->assertEquals(30, $handler->test_calculate_retry_delay(5)); + } + + public function test_retry_succeeds_after_failures() + { + $attemptCount = 0; + + config(['constants.ssh.max_retries' => 3]); + + // Simulate a function that fails twice then succeeds using the public static method + $result = SshRetryHandler::retry( + function () use (&$attemptCount) { + $attemptCount++; + if ($attemptCount < 3) { + throw new \RuntimeException('kex_exchange_identification: Connection reset by peer'); + } + + return 'success'; + }, + ['test' => 'retry_test'], + true + ); + + $this->assertEquals('success', $result); + $this->assertEquals(3, $attemptCount); + } + + public function test_retry_fails_after_max_attempts() + { + $attemptCount = 0; + + config(['constants.ssh.max_retries' => 3]); + + $this->expectException(\RuntimeException::class); + $this->expectExceptionMessage('Connection reset by peer'); + + // Simulate a function that always fails using the public static method + SshRetryHandler::retry( + function () use (&$attemptCount) { + $attemptCount++; + throw new \RuntimeException('Connection reset by peer'); + }, + ['test' => 'retry_test'], + true + ); + } + + public function test_non_retryable_errors_fail_immediately() + { + $attemptCount = 0; + + config(['constants.ssh.max_retries' => 3]); + + $this->expectException(\RuntimeException::class); + $this->expectExceptionMessage('Command not found'); + + try { + // Simulate a non-retryable error using the public static method + SshRetryHandler::retry( + function () use (&$attemptCount) { + $attemptCount++; + throw new \RuntimeException('Command not found'); + }, + ['test' => 'non_retryable_test'], + true + ); + } catch (\RuntimeException $e) { + // Should only attempt once since it's not retryable + $this->assertEquals(1, $attemptCount); + throw $e; + } + } +}