mirror of
https://github.com/portainer/portainer.git
synced 2025-08-05 05:45:22 +02:00
feat(gpu): rework docker GPU for UI performance [EE-4918] (#8518)
This commit is contained in:
parent
769c8372fb
commit
fd916bc8a2
52 changed files with 692 additions and 285 deletions
|
@ -689,20 +689,16 @@ func buildServer(flags *portainer.CLIFlags) portainer.Server {
|
|||
log.Fatal().Err(err).Msg("failed initializing upgrade service")
|
||||
}
|
||||
|
||||
// FIXME: In 2.16 we changed the way ingress controller permissions are
|
||||
// stored. Instead of being stored as annotation on an ingress rule, we keep
|
||||
// them in our database. However, in order to run the migration we need an
|
||||
// admin kube client to run lookup the old ingress rules and compare them
|
||||
// with the current existing ingress classes.
|
||||
//
|
||||
// Unfortunately, our migrations run as part of the database initialization
|
||||
// and our kubeclients require an initialized database. So it is not
|
||||
// possible to do this migration as part of our normal flow. We DO have a
|
||||
// migration which toggles a boolean in kubernetes configuration that
|
||||
// indicated that this "post init" migration should be run. If/when this is
|
||||
// resolved we can remove this function.
|
||||
err = kubernetesClientFactory.PostInitMigrateIngresses()
|
||||
if err != nil {
|
||||
// Our normal migrations run as part of the database initialization
|
||||
// but some more complex migrations require access to a kubernetes or docker
|
||||
// client. Therefore we run a separate migration process just before
|
||||
// starting the server.
|
||||
postInitMigrator := datastore.NewPostInitMigrator(
|
||||
kubernetesClientFactory,
|
||||
dockerClientFactory,
|
||||
dataStore,
|
||||
)
|
||||
if err := postInitMigrator.PostInitMigrate(); err != nil {
|
||||
log.Fatal().Err(err).Msg("failure during post init migrations")
|
||||
}
|
||||
|
||||
|
|
116
api/datastore/migrate_post_init.go
Normal file
116
api/datastore/migrate_post_init.go
Normal file
|
@ -0,0 +1,116 @@
|
|||
package datastore
|
||||
|
||||
import (
|
||||
"context"
|
||||
|
||||
"github.com/docker/docker/api/types"
|
||||
portainer "github.com/portainer/portainer/api"
|
||||
"github.com/portainer/portainer/api/dataservices"
|
||||
"github.com/portainer/portainer/api/docker"
|
||||
"github.com/portainer/portainer/api/kubernetes/cli"
|
||||
"github.com/rs/zerolog/log"
|
||||
)
|
||||
|
||||
type PostInitMigrator struct {
|
||||
kubeFactory *cli.ClientFactory
|
||||
dockerFactory *docker.ClientFactory
|
||||
dataStore dataservices.DataStore
|
||||
}
|
||||
|
||||
func NewPostInitMigrator(
|
||||
kubeFactory *cli.ClientFactory,
|
||||
dockerFactory *docker.ClientFactory,
|
||||
dataStore dataservices.DataStore,
|
||||
) *PostInitMigrator {
|
||||
return &PostInitMigrator{
|
||||
kubeFactory: kubeFactory,
|
||||
dockerFactory: dockerFactory,
|
||||
dataStore: dataStore,
|
||||
}
|
||||
}
|
||||
|
||||
func (migrator *PostInitMigrator) PostInitMigrate() error {
|
||||
if err := migrator.PostInitMigrateIngresses(); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
migrator.PostInitMigrateGPUs()
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (migrator *PostInitMigrator) PostInitMigrateIngresses() error {
|
||||
endpoints, err := migrator.dataStore.Endpoint().Endpoints()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
for i := range endpoints {
|
||||
// Early exit if we do not need to migrate!
|
||||
if endpoints[i].PostInitMigrations.MigrateIngresses == false {
|
||||
return nil
|
||||
}
|
||||
|
||||
err := migrator.kubeFactory.MigrateEndpointIngresses(&endpoints[i])
|
||||
if err != nil {
|
||||
log.Debug().Err(err).Msg("failure migrating endpoint ingresses")
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// PostInitMigrateGPUs will check all docker endpoints for containers with GPUs and set EnableGPUManagement to true if any are found
|
||||
// If there's an error getting the containers, we'll log it and move on
|
||||
func (migrator *PostInitMigrator) PostInitMigrateGPUs() {
|
||||
environments, err := migrator.dataStore.Endpoint().Endpoints()
|
||||
if err != nil {
|
||||
log.Err(err).Msg("failure getting endpoints")
|
||||
return
|
||||
}
|
||||
for i := range environments {
|
||||
if environments[i].Type == portainer.DockerEnvironment {
|
||||
// // Early exit if we do not need to migrate!
|
||||
if environments[i].PostInitMigrations.MigrateGPUs == false {
|
||||
return
|
||||
}
|
||||
|
||||
// set the MigrateGPUs flag to false so we don't run this again
|
||||
environments[i].PostInitMigrations.MigrateGPUs = false
|
||||
migrator.dataStore.Endpoint().UpdateEndpoint(environments[i].ID, &environments[i])
|
||||
|
||||
// create a docker client
|
||||
dockerClient, err := migrator.dockerFactory.CreateClient(&environments[i], "", nil)
|
||||
if err != nil {
|
||||
log.Err(err).Msg("failure creating docker client for environment: " + environments[i].Name)
|
||||
return
|
||||
}
|
||||
defer dockerClient.Close()
|
||||
|
||||
// get all containers
|
||||
containers, err := dockerClient.ContainerList(context.Background(), types.ContainerListOptions{All: true})
|
||||
if err != nil {
|
||||
log.Err(err).Msg("failed to list containers")
|
||||
return
|
||||
}
|
||||
|
||||
// check for a gpu on each container. If even one GPU is found, set EnableGPUManagement to true for the whole endpoint
|
||||
containersLoop:
|
||||
for _, container := range containers {
|
||||
// https://www.sobyte.net/post/2022-10/go-docker/ has nice documentation on the docker client with GPUs
|
||||
containerDetails, err := dockerClient.ContainerInspect(context.Background(), container.ID)
|
||||
if err != nil {
|
||||
log.Err(err).Msg("failed to inspect container")
|
||||
return
|
||||
}
|
||||
deviceRequests := containerDetails.HostConfig.Resources.DeviceRequests
|
||||
for _, deviceRequest := range deviceRequests {
|
||||
if deviceRequest.Driver == "nvidia" {
|
||||
environments[i].EnableGPUManagement = true
|
||||
migrator.dataStore.Endpoint().UpdateEndpoint(environments[i].ID, &environments[i])
|
||||
break containersLoop
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
|
@ -3,11 +3,16 @@ package migrator
|
|||
import (
|
||||
"github.com/rs/zerolog/log"
|
||||
|
||||
portainer "github.com/portainer/portainer/api"
|
||||
portainerDsErrors "github.com/portainer/portainer/api/dataservices/errors"
|
||||
)
|
||||
|
||||
func (m *Migrator) migrateDBVersionToDB90() error {
|
||||
if err := m.updateUserThemForDB90(); err != nil {
|
||||
if err := m.updateUserThemeForDB90(); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if err := m.updateEnableGpuManagementFeatures(); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
|
@ -39,7 +44,7 @@ func (m *Migrator) updateEdgeStackStatusForDB90() error {
|
|||
return nil
|
||||
}
|
||||
|
||||
func (m *Migrator) updateUserThemForDB90() error {
|
||||
func (m *Migrator) updateUserThemeForDB90() error {
|
||||
log.Info().Msg("updating existing user theme settings")
|
||||
|
||||
users, err := m.userService.Users()
|
||||
|
@ -60,3 +65,28 @@ func (m *Migrator) updateUserThemForDB90() error {
|
|||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (m *Migrator) updateEnableGpuManagementFeatures() error {
|
||||
// get all environments
|
||||
environments, err := m.endpointService.Endpoints()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
for _, environment := range environments {
|
||||
if environment.Type == portainer.DockerEnvironment {
|
||||
// set the PostInitMigrations.MigrateGPUs to true on this environment to run the migration only on the 2.18 upgrade
|
||||
environment.PostInitMigrations.MigrateGPUs = true
|
||||
// if there's one or more gpu, set the EnableGpuManagement setting to true
|
||||
gpuList := environment.Gpus
|
||||
if len(gpuList) > 0 {
|
||||
environment.EnableGPUManagement = true
|
||||
}
|
||||
// update the environment
|
||||
if err := m.endpointService.UpdateEndpoint(environment.ID, &environment); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
|
|
@ -46,6 +46,7 @@
|
|||
},
|
||||
"EdgeCheckinInterval": 0,
|
||||
"EdgeKey": "",
|
||||
"EnableGPUManagement": false,
|
||||
"Gpus": [],
|
||||
"GroupId": 1,
|
||||
"Id": 1,
|
||||
|
@ -71,6 +72,7 @@
|
|||
"LastCheckInDate": 0,
|
||||
"Name": "local",
|
||||
"PostInitMigrations": {
|
||||
"MigrateGPUs": true,
|
||||
"MigrateIngresses": true
|
||||
},
|
||||
"PublicURL": "",
|
||||
|
|
|
@ -28,6 +28,10 @@ type endpointSettingsUpdatePayload struct {
|
|||
AllowSysctlSettingForRegularUsers *bool `json:"allowSysctlSettingForRegularUsers" example:"true"`
|
||||
// Whether host management features are enabled
|
||||
EnableHostManagementFeatures *bool `json:"enableHostManagementFeatures" example:"true"`
|
||||
|
||||
EnableGPUManagement *bool `json:"enableGPUManagement" example:"false"`
|
||||
|
||||
Gpus []portainer.Pair `json:"gpus"`
|
||||
}
|
||||
|
||||
func (payload *endpointSettingsUpdatePayload) Validate(r *http.Request) error {
|
||||
|
@ -107,6 +111,14 @@ func (handler *Handler) endpointSettingsUpdate(w http.ResponseWriter, r *http.Re
|
|||
securitySettings.EnableHostManagementFeatures = *payload.EnableHostManagementFeatures
|
||||
}
|
||||
|
||||
if payload.EnableGPUManagement != nil {
|
||||
endpoint.EnableGPUManagement = *payload.EnableGPUManagement
|
||||
}
|
||||
|
||||
if payload.Gpus != nil {
|
||||
endpoint.Gpus = payload.Gpus
|
||||
}
|
||||
|
||||
endpoint.SecuritySettings = securitySettings
|
||||
|
||||
err = handler.DataStore.Endpoint().UpdateEndpoint(portainer.EndpointID(endpointID), endpoint)
|
||||
|
|
|
@ -12,7 +12,6 @@ import (
|
|||
"github.com/pkg/errors"
|
||||
portainer "github.com/portainer/portainer/api"
|
||||
"github.com/portainer/portainer/api/dataservices"
|
||||
"github.com/rs/zerolog/log"
|
||||
"k8s.io/client-go/kubernetes"
|
||||
"k8s.io/client-go/rest"
|
||||
"k8s.io/client-go/tools/clientcmd"
|
||||
|
@ -221,27 +220,7 @@ func buildLocalClient() (*kubernetes.Clientset, error) {
|
|||
return kubernetes.NewForConfig(config)
|
||||
}
|
||||
|
||||
func (factory *ClientFactory) PostInitMigrateIngresses() error {
|
||||
endpoints, err := factory.dataStore.Endpoint().Endpoints()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
for i := range endpoints {
|
||||
// Early exit if we do not need to migrate!
|
||||
if endpoints[i].PostInitMigrations.MigrateIngresses == false {
|
||||
return nil
|
||||
}
|
||||
|
||||
err := factory.migrateEndpointIngresses(&endpoints[i])
|
||||
if err != nil {
|
||||
log.Debug().Err(err).Msg("failure migrating endpoint ingresses")
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (factory *ClientFactory) migrateEndpointIngresses(e *portainer.Endpoint) error {
|
||||
func (factory *ClientFactory) MigrateEndpointIngresses(e *portainer.Endpoint) error {
|
||||
// classes is a list of controllers which have been manually added to the
|
||||
// cluster setup view. These need to all be allowed globally, but then
|
||||
// blocked in specific namespaces which they were not previously allowed in.
|
||||
|
|
|
@ -402,6 +402,8 @@ type (
|
|||
Version string `example:"1.0.0"`
|
||||
}
|
||||
|
||||
EnableGPUManagement bool `json:"EnableGPUManagement"`
|
||||
|
||||
// Deprecated fields
|
||||
// Deprecated in DBVersion == 4
|
||||
TLS bool `json:"TLS,omitempty"`
|
||||
|
@ -502,6 +504,7 @@ type (
|
|||
// EndpointPostInitMigrations
|
||||
EndpointPostInitMigrations struct {
|
||||
MigrateIngresses bool `json:"MigrateIngresses"`
|
||||
MigrateGPUs bool `json:"MigrateGPUs"`
|
||||
}
|
||||
|
||||
// Extension represents a deprecated Portainer extension
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue