Mirror of https://github.com/portainer/portainer.git

fix(tunnels): make the tunnels more robust EE-7042 (#11877)

Commit c5a1d7e051 (parent aaab2fa9d8)
20 changed files with 389 additions and 355 deletions
@@ -19,7 +19,6 @@ import (
const (
tunnelCleanupInterval = 10 * time.Second
requiredTimeout = 15 * time.Second
activeTimeout = 4*time.Minute + 30*time.Second
pingTimeout = 3 * time.Second
)
@@ -28,32 +27,54 @@ const (
// It is used to start a reverse tunnel server and to manage the connection status of each tunnel
// connected to the tunnel server.
type Service struct {
serverFingerprint string
serverPort string
tunnelDetailsMap map[portainer.EndpointID]*portainer.TunnelDetails
dataStore dataservices.DataStore
snapshotService portainer.SnapshotService
chiselServer *chserver.Server
shutdownCtx context.Context
ProxyManager *proxy.Manager
mu sync.Mutex
fileService portainer.FileService
serverFingerprint string
serverPort string
activeTunnels map[portainer.EndpointID]*portainer.TunnelDetails
edgeJobs map[portainer.EndpointID][]portainer.EdgeJob
dataStore dataservices.DataStore
snapshotService portainer.SnapshotService
chiselServer *chserver.Server
shutdownCtx context.Context
ProxyManager *proxy.Manager
mu sync.RWMutex
fileService portainer.FileService
defaultCheckinInterval int
}

// NewService returns a pointer to a new instance of Service
func NewService(dataStore dataservices.DataStore, shutdownCtx context.Context, fileService portainer.FileService) *Service {
defaultCheckinInterval := portainer.DefaultEdgeAgentCheckinIntervalInSeconds

settings, err := dataStore.Settings().Settings()
if err == nil {
defaultCheckinInterval = settings.EdgeAgentCheckinInterval
} else {
log.Error().Err(err).Msg("unable to retrieve the settings from the database")
}

return &Service{
tunnelDetailsMap: make(map[portainer.EndpointID]*portainer.TunnelDetails),
dataStore: dataStore,
shutdownCtx: shutdownCtx,
fileService: fileService,
activeTunnels: make(map[portainer.EndpointID]*portainer.TunnelDetails),
edgeJobs: make(map[portainer.EndpointID][]portainer.EdgeJob),
dataStore: dataStore,
shutdownCtx: shutdownCtx,
fileService: fileService,
defaultCheckinInterval: defaultCheckinInterval,
}
}

// pingAgent ping the given agent so that the agent can keep the tunnel alive
func (service *Service) pingAgent(endpointID portainer.EndpointID) error {
tunnel := service.GetTunnelDetails(endpointID)
requestURL := fmt.Sprintf("http://127.0.0.1:%d/ping", tunnel.Port)
endpoint, err := service.dataStore.Endpoint().Endpoint(endpointID)
if err != nil {
return err
}

tunnelAddr, err := service.TunnelAddr(endpoint)
if err != nil {
return err
}

requestURL := fmt.Sprintf("http://%s/ping", tunnelAddr)
req, err := http.NewRequest(http.MethodHead, requestURL, nil)
if err != nil {
return err
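
The struct refactor above replaces the single tunnelDetailsMap with dedicated activeTunnels and edgeJobs maps, guarded by a sync.RWMutex instead of a sync.Mutex, so read-heavy paths no longer serialize behind writers. As a minimal sketch (not part of the commit; the helper name is hypothetical), a read-side accessor in the same package could look like this:

// activeTunnel is a hypothetical read-side helper illustrating the RWMutex
// pattern the new struct enables: many readers may hold RLock concurrently,
// while writers still take mu.Lock().
func (service *Service) activeTunnel(endpointID portainer.EndpointID) (*portainer.TunnelDetails, bool) {
	service.mu.RLock()
	defer service.mu.RUnlock()

	tunnel, ok := service.activeTunnels[endpointID]
	return tunnel, ok
}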
@@ -76,47 +97,49 @@ func (service *Service) pingAgent(endpointID portainer.EndpointID) error {
// KeepTunnelAlive keeps the tunnel of the given environment for maxAlive duration, or until ctx is done
func (service *Service) KeepTunnelAlive(endpointID portainer.EndpointID, ctx context.Context, maxAlive time.Duration) {
go func() {
log.Debug().
Int("endpoint_id", int(endpointID)).
Float64("max_alive_minutes", maxAlive.Minutes()).
Msg("KeepTunnelAlive: start")
go service.keepTunnelAlive(endpointID, ctx, maxAlive)
}

maxAliveTicker := time.NewTicker(maxAlive)
defer maxAliveTicker.Stop()
func (service *Service) keepTunnelAlive(endpointID portainer.EndpointID, ctx context.Context, maxAlive time.Duration) {
log.Debug().
Int("endpoint_id", int(endpointID)).
Float64("max_alive_minutes", maxAlive.Minutes()).
Msg("KeepTunnelAlive: start")

pingTicker := time.NewTicker(tunnelCleanupInterval)
defer pingTicker.Stop()
maxAliveTicker := time.NewTicker(maxAlive)
defer maxAliveTicker.Stop()

for {
select {
case <-pingTicker.C:
service.SetTunnelStatusToActive(endpointID)
err := service.pingAgent(endpointID)
if err != nil {
log.Debug().
Int("endpoint_id", int(endpointID)).
Err(err).
Msg("KeepTunnelAlive: ping agent")
}
case <-maxAliveTicker.C:
log.Debug().
Int("endpoint_id", int(endpointID)).
Float64("timeout_minutes", maxAlive.Minutes()).
Msg("KeepTunnelAlive: tunnel keep alive timeout")
pingTicker := time.NewTicker(tunnelCleanupInterval)
defer pingTicker.Stop()

return
case <-ctx.Done():
err := ctx.Err()
for {
select {
case <-pingTicker.C:
service.UpdateLastActivity(endpointID)

if err := service.pingAgent(endpointID); err != nil {
log.Debug().
Int("endpoint_id", int(endpointID)).
Err(err).
Msg("KeepTunnelAlive: tunnel stop")

return
Msg("KeepTunnelAlive: ping agent")
}
case <-maxAliveTicker.C:
log.Debug().
Int("endpoint_id", int(endpointID)).
Float64("timeout_minutes", maxAlive.Minutes()).
Msg("KeepTunnelAlive: tunnel keep alive timeout")

return
case <-ctx.Done():
err := ctx.Err()
log.Debug().
Int("endpoint_id", int(endpointID)).
Err(err).
Msg("KeepTunnelAlive: tunnel stop")

return
}
}()
}
}

// StartTunnelServer starts a tunnel server on the specified addr and port.
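
The change above moves the keepalive loop out of an anonymous goroutine into a named keepTunnelAlive method that pings on the short tunnelCleanupInterval while watching both a max-alive timer and the caller's context. The following standalone sketch (not from the commit, using only the standard context and time packages; all names are hypothetical) shows the generic ticker-plus-context shape such a loop takes:

// keepAlive is an illustrative, simplified loop of the same general shape:
// ping on a short interval, stop when the keepalive window expires, when a
// ping fails, or when the caller cancels the context.
func keepAlive(ctx context.Context, interval, maxAlive time.Duration, ping func() error) {
	pingTicker := time.NewTicker(interval)
	defer pingTicker.Stop()

	deadline := time.NewTimer(maxAlive)
	defer deadline.Stop()

	for {
		select {
		case <-pingTicker.C:
			if err := ping(); err != nil {
				return // the agent stopped answering
			}
		case <-deadline.C:
			return // keepalive window expired
		case <-ctx.Done():
			return // caller cancelled
		}
	}
}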
@@ -126,7 +149,6 @@ func (service *Service) KeepTunnelAlive(endpointID portainer.EndpointID, ctx context.Context, maxAlive time.Duration) {
// The snapshotter is used in the tunnel status verification process.
func (service *Service) StartTunnelServer(addr, port string, snapshotService portainer.SnapshotService) error {
privateKeyFile, err := service.retrievePrivateKeyFile()

if err != nil {
return err
}
@@ -144,21 +166,21 @@ func (service *Service) StartTunnelServer(addr, port string, snapshotService portainer.SnapshotService) error {
service.serverFingerprint = chiselServer.GetFingerprint()
service.serverPort = port

err = chiselServer.Start(addr, port)
if err != nil {
if err := chiselServer.Start(addr, port); err != nil {
return err
}

service.chiselServer = chiselServer

// TODO: work-around Chisel default behavior.
// By default, Chisel will allow anyone to connect if no user exists.
username, password := generateRandomCredentials()
err = service.chiselServer.AddUser(username, password, "127.0.0.1")
if err != nil {
if err = service.chiselServer.AddUser(username, password, "127.0.0.1"); err != nil {
return err
}

service.snapshotService = snapshotService

go service.startTunnelVerificationLoop()

return nil
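
StartTunnelServer keeps Chisel from accepting anonymous clients by registering a throwaway user bound to 127.0.0.1. The diff does not include the body of generateRandomCredentials; the sketch below is one plausible implementation (an assumption for illustration, not the commit's code), drawing both values from crypto/rand and hex-encoding them:

import (
	"crypto/rand"
	"encoding/hex"
	"fmt"
)

// generateRandomCredentials (body assumed for illustration) returns a random
// username/password pair: 16 bytes each, read from crypto/rand and hex-encoded.
func generateRandomCredentials() (string, string) {
	buf := make([]byte, 32)
	if _, err := rand.Read(buf); err != nil {
		// crypto/rand failing leaves no safe way to build credentials
		panic(fmt.Errorf("unable to generate random credentials: %w", err))
	}

	return hex.EncodeToString(buf[:16]), hex.EncodeToString(buf[16:])
}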
@@ -172,37 +194,39 @@ func (service *Service) StopTunnelServer() error {
func (service *Service) retrievePrivateKeyFile() (string, error) {
privateKeyFile := service.fileService.GetDefaultChiselPrivateKeyPath()

exist, _ := service.fileService.FileExists(privateKeyFile)
if !exist {
log.Debug().
Str("private-key", privateKeyFile).
Msg("Chisel private key file does not exist")

privateKey, err := ccrypto.GenerateKey("")
if err != nil {
log.Error().
Err(err).
Msg("Failed to generate chisel private key")
return "", err
}

err = service.fileService.StoreChiselPrivateKey(privateKey)
if err != nil {
log.Error().
Err(err).
Msg("Failed to save Chisel private key to disk")
return "", err
} else {
log.Info().
Str("private-key", privateKeyFile).
Msg("Generated a new Chisel private key file")
}
} else {
if exists, _ := service.fileService.FileExists(privateKeyFile); exists {
log.Info().
Str("private-key", privateKeyFile).
Msg("Found Chisel private key file on disk")
Msg("found Chisel private key file on disk")

return privateKeyFile, nil
}

log.Debug().
Str("private-key", privateKeyFile).
Msg("chisel private key file does not exist")

privateKey, err := ccrypto.GenerateKey("")
if err != nil {
log.Error().
Err(err).
Msg("failed to generate chisel private key")

return "", err
}

if err = service.fileService.StoreChiselPrivateKey(privateKey); err != nil {
log.Error().
Err(err).
Msg("failed to save Chisel private key to disk")

return "", err
}

log.Info().
Str("private-key", privateKeyFile).
Msg("generated a new Chisel private key file")

return privateKeyFile, nil
}
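
The rewrite above flattens the nested if/else into guard clauses: return the existing key path immediately, otherwise generate, persist, and fall through to a single success path. In generic form (an illustration only; exists, generate, and store are hypothetical stand-ins, not Portainer APIs):

// getOrCreate illustrates the guard-clause shape of the refactored function:
// short-circuit when the artifact already exists, otherwise create and persist
// it, with every error path returning early.
func getOrCreate(path string, exists func(string) bool, generate func() ([]byte, error), store func([]byte) error) (string, error) {
	if exists(path) {
		return path, nil
	}

	key, err := generate()
	if err != nil {
		return "", err
	}

	if err := store(key); err != nil {
		return "", err
	}

	return path, nil
}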
@@ -230,63 +254,45 @@ func (service *Service) startTunnelVerificationLoop() {
}
}

// checkTunnels finds the first tunnel that has not had any activity recently
// and attempts to take a snapshot, then closes it and returns
func (service *Service) checkTunnels() {
tunnels := make(map[portainer.EndpointID]portainer.TunnelDetails)
service.mu.RLock()

service.mu.Lock()
for key, tunnel := range service.tunnelDetailsMap {
if tunnel.LastActivity.IsZero() || tunnel.Status == portainer.EdgeAgentIdle {
continue
}

if tunnel.Status == portainer.EdgeAgentManagementRequired && time.Since(tunnel.LastActivity) < requiredTimeout {
continue
}

if tunnel.Status == portainer.EdgeAgentActive && time.Since(tunnel.LastActivity) < activeTimeout {
continue
}

tunnels[key] = *tunnel
}
service.mu.Unlock()

for endpointID, tunnel := range tunnels {
for endpointID, tunnel := range service.activeTunnels {
elapsed := time.Since(tunnel.LastActivity)
log.Debug().
Int("endpoint_id", int(endpointID)).
Str("status", tunnel.Status).
Float64("status_time_seconds", elapsed.Seconds()).
Float64("last_activity_seconds", elapsed.Seconds()).
Msg("environment tunnel monitoring")

if tunnel.Status == portainer.EdgeAgentManagementRequired && elapsed > requiredTimeout {
log.Debug().
Int("endpoint_id", int(endpointID)).
Str("status", tunnel.Status).
Float64("status_time_seconds", elapsed.Seconds()).
Float64("timeout_seconds", requiredTimeout.Seconds()).
Msg("REQUIRED state timeout exceeded")
if tunnel.Status == portainer.EdgeAgentManagementRequired && elapsed < activeTimeout {
continue
}

if tunnel.Status == portainer.EdgeAgentActive && elapsed > activeTimeout {
log.Debug().
Int("endpoint_id", int(endpointID)).
Str("status", tunnel.Status).
Float64("status_time_seconds", elapsed.Seconds()).
Float64("timeout_seconds", activeTimeout.Seconds()).
Msg("ACTIVE state timeout exceeded")
tunnelPort := tunnel.Port

err := service.snapshotEnvironment(endpointID, tunnel.Port)
if err != nil {
log.Error().
Int("endpoint_id", int(endpointID)).
Err(err).
Msg("unable to snapshot Edge environment")
}
service.mu.RUnlock()

log.Debug().
Int("endpoint_id", int(endpointID)).
Float64("last_activity_seconds", elapsed.Seconds()).
Float64("timeout_seconds", activeTimeout.Seconds()).
Msg("last activity timeout exceeded")

if err := service.snapshotEnvironment(endpointID, tunnelPort); err != nil {
log.Error().
Int("endpoint_id", int(endpointID)).
Err(err).
Msg("unable to snapshot Edge environment")
}

service.SetTunnelStatusToIdle(portainer.EndpointID(endpointID))
service.close(portainer.EndpointID(endpointID))

return
}

service.mu.RUnlock()
}

func (service *Service) snapshotEnvironment(endpointID portainer.EndpointID, tunnelPort int) error {
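
The rewritten checkTunnels scans activeTunnels under the read lock, skips tunnels that are still within their timeout, and handles at most one expired tunnel per pass, releasing the read lock before snapshotting and closing it. Stripped of logging, that locking choreography looks roughly like the sketch below (an illustration, not the commit's exact code; sweepOnce is a hypothetical name and the status checks are simplified):

func (service *Service) sweepOnce() {
	service.mu.RLock()

	for endpointID, tunnel := range service.activeTunnels {
		if time.Since(tunnel.LastActivity) < activeTimeout {
			continue // still within its grace period
		}

		// Release the read lock before calling helpers that are assumed to
		// take the write lock themselves.
		service.mu.RUnlock()

		service.SetTunnelStatusToIdle(endpointID)
		service.close(endpointID)

		return // handle at most one expired tunnel per sweep
	}

	service.mu.RUnlock()
}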