1
0
Fork 0
mirror of https://github.com/portainer/portainer.git synced 2025-08-05 05:45:22 +02:00

feat(gpu): rework docker GPU for UI performance [EE-4918] (#8518)

This commit is contained in:
Ali 2023-03-03 14:47:10 +13:00 committed by GitHub
parent 769c8372fb
commit fd916bc8a2
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
52 changed files with 692 additions and 285 deletions

View file

@ -689,20 +689,16 @@ func buildServer(flags *portainer.CLIFlags) portainer.Server {
log.Fatal().Err(err).Msg("failed initializing upgrade service")
}
// FIXME: In 2.16 we changed the way ingress controller permissions are
// stored. Instead of being stored as annotation on an ingress rule, we keep
// them in our database. However, in order to run the migration we need an
// admin kube client to run lookup the old ingress rules and compare them
// with the current existing ingress classes.
//
// Unfortunately, our migrations run as part of the database initialization
// and our kubeclients require an initialized database. So it is not
// possible to do this migration as part of our normal flow. We DO have a
// migration which toggles a boolean in kubernetes configuration that
// indicated that this "post init" migration should be run. If/when this is
// resolved we can remove this function.
err = kubernetesClientFactory.PostInitMigrateIngresses()
if err != nil {
// Our normal migrations run as part of the database initialization
// but some more complex migrations require access to a kubernetes or docker
// client. Therefore we run a separate migration process just before
// starting the server.
postInitMigrator := datastore.NewPostInitMigrator(
kubernetesClientFactory,
dockerClientFactory,
dataStore,
)
if err := postInitMigrator.PostInitMigrate(); err != nil {
log.Fatal().Err(err).Msg("failure during post init migrations")
}

View file

@ -0,0 +1,116 @@
package datastore
import (
"context"
"github.com/docker/docker/api/types"
portainer "github.com/portainer/portainer/api"
"github.com/portainer/portainer/api/dataservices"
"github.com/portainer/portainer/api/docker"
"github.com/portainer/portainer/api/kubernetes/cli"
"github.com/rs/zerolog/log"
)
type PostInitMigrator struct {
kubeFactory *cli.ClientFactory
dockerFactory *docker.ClientFactory
dataStore dataservices.DataStore
}
func NewPostInitMigrator(
kubeFactory *cli.ClientFactory,
dockerFactory *docker.ClientFactory,
dataStore dataservices.DataStore,
) *PostInitMigrator {
return &PostInitMigrator{
kubeFactory: kubeFactory,
dockerFactory: dockerFactory,
dataStore: dataStore,
}
}
func (migrator *PostInitMigrator) PostInitMigrate() error {
if err := migrator.PostInitMigrateIngresses(); err != nil {
return err
}
migrator.PostInitMigrateGPUs()
return nil
}
func (migrator *PostInitMigrator) PostInitMigrateIngresses() error {
endpoints, err := migrator.dataStore.Endpoint().Endpoints()
if err != nil {
return err
}
for i := range endpoints {
// Early exit if we do not need to migrate!
if endpoints[i].PostInitMigrations.MigrateIngresses == false {
return nil
}
err := migrator.kubeFactory.MigrateEndpointIngresses(&endpoints[i])
if err != nil {
log.Debug().Err(err).Msg("failure migrating endpoint ingresses")
}
}
return nil
}
// PostInitMigrateGPUs will check all docker endpoints for containers with GPUs and set EnableGPUManagement to true if any are found
// If there's an error getting the containers, we'll log it and move on
func (migrator *PostInitMigrator) PostInitMigrateGPUs() {
environments, err := migrator.dataStore.Endpoint().Endpoints()
if err != nil {
log.Err(err).Msg("failure getting endpoints")
return
}
for i := range environments {
if environments[i].Type == portainer.DockerEnvironment {
// // Early exit if we do not need to migrate!
if environments[i].PostInitMigrations.MigrateGPUs == false {
return
}
// set the MigrateGPUs flag to false so we don't run this again
environments[i].PostInitMigrations.MigrateGPUs = false
migrator.dataStore.Endpoint().UpdateEndpoint(environments[i].ID, &environments[i])
// create a docker client
dockerClient, err := migrator.dockerFactory.CreateClient(&environments[i], "", nil)
if err != nil {
log.Err(err).Msg("failure creating docker client for environment: " + environments[i].Name)
return
}
defer dockerClient.Close()
// get all containers
containers, err := dockerClient.ContainerList(context.Background(), types.ContainerListOptions{All: true})
if err != nil {
log.Err(err).Msg("failed to list containers")
return
}
// check for a gpu on each container. If even one GPU is found, set EnableGPUManagement to true for the whole endpoint
containersLoop:
for _, container := range containers {
// https://www.sobyte.net/post/2022-10/go-docker/ has nice documentation on the docker client with GPUs
containerDetails, err := dockerClient.ContainerInspect(context.Background(), container.ID)
if err != nil {
log.Err(err).Msg("failed to inspect container")
return
}
deviceRequests := containerDetails.HostConfig.Resources.DeviceRequests
for _, deviceRequest := range deviceRequests {
if deviceRequest.Driver == "nvidia" {
environments[i].EnableGPUManagement = true
migrator.dataStore.Endpoint().UpdateEndpoint(environments[i].ID, &environments[i])
break containersLoop
}
}
}
}
}
}

View file

@ -3,11 +3,16 @@ package migrator
import (
"github.com/rs/zerolog/log"
portainer "github.com/portainer/portainer/api"
portainerDsErrors "github.com/portainer/portainer/api/dataservices/errors"
)
func (m *Migrator) migrateDBVersionToDB90() error {
if err := m.updateUserThemForDB90(); err != nil {
if err := m.updateUserThemeForDB90(); err != nil {
return err
}
if err := m.updateEnableGpuManagementFeatures(); err != nil {
return err
}
@ -39,7 +44,7 @@ func (m *Migrator) updateEdgeStackStatusForDB90() error {
return nil
}
func (m *Migrator) updateUserThemForDB90() error {
func (m *Migrator) updateUserThemeForDB90() error {
log.Info().Msg("updating existing user theme settings")
users, err := m.userService.Users()
@ -60,3 +65,28 @@ func (m *Migrator) updateUserThemForDB90() error {
return nil
}
func (m *Migrator) updateEnableGpuManagementFeatures() error {
// get all environments
environments, err := m.endpointService.Endpoints()
if err != nil {
return err
}
for _, environment := range environments {
if environment.Type == portainer.DockerEnvironment {
// set the PostInitMigrations.MigrateGPUs to true on this environment to run the migration only on the 2.18 upgrade
environment.PostInitMigrations.MigrateGPUs = true
// if there's one or more gpu, set the EnableGpuManagement setting to true
gpuList := environment.Gpus
if len(gpuList) > 0 {
environment.EnableGPUManagement = true
}
// update the environment
if err := m.endpointService.UpdateEndpoint(environment.ID, &environment); err != nil {
return err
}
}
}
return nil
}

View file

@ -46,6 +46,7 @@
},
"EdgeCheckinInterval": 0,
"EdgeKey": "",
"EnableGPUManagement": false,
"Gpus": [],
"GroupId": 1,
"Id": 1,
@ -71,6 +72,7 @@
"LastCheckInDate": 0,
"Name": "local",
"PostInitMigrations": {
"MigrateGPUs": true,
"MigrateIngresses": true
},
"PublicURL": "",

View file

@ -28,6 +28,10 @@ type endpointSettingsUpdatePayload struct {
AllowSysctlSettingForRegularUsers *bool `json:"allowSysctlSettingForRegularUsers" example:"true"`
// Whether host management features are enabled
EnableHostManagementFeatures *bool `json:"enableHostManagementFeatures" example:"true"`
EnableGPUManagement *bool `json:"enableGPUManagement" example:"false"`
Gpus []portainer.Pair `json:"gpus"`
}
func (payload *endpointSettingsUpdatePayload) Validate(r *http.Request) error {
@ -107,6 +111,14 @@ func (handler *Handler) endpointSettingsUpdate(w http.ResponseWriter, r *http.Re
securitySettings.EnableHostManagementFeatures = *payload.EnableHostManagementFeatures
}
if payload.EnableGPUManagement != nil {
endpoint.EnableGPUManagement = *payload.EnableGPUManagement
}
if payload.Gpus != nil {
endpoint.Gpus = payload.Gpus
}
endpoint.SecuritySettings = securitySettings
err = handler.DataStore.Endpoint().UpdateEndpoint(portainer.EndpointID(endpointID), endpoint)

View file

@ -12,7 +12,6 @@ import (
"github.com/pkg/errors"
portainer "github.com/portainer/portainer/api"
"github.com/portainer/portainer/api/dataservices"
"github.com/rs/zerolog/log"
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/rest"
"k8s.io/client-go/tools/clientcmd"
@ -221,27 +220,7 @@ func buildLocalClient() (*kubernetes.Clientset, error) {
return kubernetes.NewForConfig(config)
}
func (factory *ClientFactory) PostInitMigrateIngresses() error {
endpoints, err := factory.dataStore.Endpoint().Endpoints()
if err != nil {
return err
}
for i := range endpoints {
// Early exit if we do not need to migrate!
if endpoints[i].PostInitMigrations.MigrateIngresses == false {
return nil
}
err := factory.migrateEndpointIngresses(&endpoints[i])
if err != nil {
log.Debug().Err(err).Msg("failure migrating endpoint ingresses")
}
}
return nil
}
func (factory *ClientFactory) migrateEndpointIngresses(e *portainer.Endpoint) error {
func (factory *ClientFactory) MigrateEndpointIngresses(e *portainer.Endpoint) error {
// classes is a list of controllers which have been manually added to the
// cluster setup view. These need to all be allowed globally, but then
// blocked in specific namespaces which they were not previously allowed in.

View file

@ -402,6 +402,8 @@ type (
Version string `example:"1.0.0"`
}
EnableGPUManagement bool `json:"EnableGPUManagement"`
// Deprecated fields
// Deprecated in DBVersion == 4
TLS bool `json:"TLS,omitempty"`
@ -502,6 +504,7 @@ type (
// EndpointPostInitMigrations
EndpointPostInitMigrations struct {
MigrateIngresses bool `json:"MigrateIngresses"`
MigrateGPUs bool `json:"MigrateGPUs"`
}
// Extension represents a deprecated Portainer extension