From 05279f747247c4aa6d845d0f882f8bb11bcb4cbd Mon Sep 17 00:00:00 2001 From: Stavros Kois <47820033+stavros-k@users.noreply.github.com> Date: Sat, 24 Feb 2024 11:37:33 +0200 Subject: [PATCH] chore(resources): remove scaleGPU support (#709) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit **Description** ⚒️ Fixes # **⚙️ Type of change** - [ ] ⚙️ Feature/App addition - [ ] 🪛 Bugfix - [ ] ⚠️ Breaking change (fix or feature that would cause existing functionality to not work as expected) - [x] 🔃 Refactor of current code **🧪 How Has This Been Tested?** **📃 Notes:** **✔️ Checklist:** - [x] ⚖️ My code follows the style guidelines of this project - [x] 👀 I have performed a self-review of my own code - [x] #️⃣ I have commented my code, particularly in hard-to-understand areas - [x] 📄 I have made corresponding changes to the documentation - [x] ⚠️ My changes generate no new warnings - [x] 🧪 I have added tests to this description that prove my fix is effective or that my feature works - [x] ⬆️ I increased versions for any altered app according to semantic versioning **➕ App addition** If this PR is an app addition please make sure you have done the following. - [ ] 🖼️ I have added an icon in the Chart's root directory called `icon.png` --- _Please don't blindly check all the boxes. Read them and only check those that apply. 
Those checkboxes are there for the reviewer to see what is this all about and the status of this PR with a quick glance._ --------- Co-authored-by: Kjeld Schouten --- .github/workflows/common_library_tests.yaml | 2 +- library/common-test/Chart.yaml | 2 +- .../tests/container/envFixed_test .yaml | 18 +- .../tests/container/resources_test.yaml | 540 +----------------- .../tests/pod/host_users_test.yaml | 42 +- .../tests/pod/runtime_class_name_test.yaml | 176 ++---- .../tests/pod/securityContext.yaml | 108 ++-- .../templates/lib/container/_fixedEnv.tpl | 2 +- .../templates/lib/container/_resources.tpl | 117 ++-- .../templates/lib/pod/_podSecurityContext.tpl | 15 +- .../templates/lib/pod/_runtimeClassName.tpl | 62 +- library/common/values.yaml | 3 - 12 files changed, 191 insertions(+), 896 deletions(-) diff --git a/.github/workflows/common_library_tests.yaml b/.github/workflows/common_library_tests.yaml index 0a599122..6462a747 100644 --- a/.github/workflows/common_library_tests.yaml +++ b/.github/workflows/common_library_tests.yaml @@ -56,7 +56,7 @@ jobs: name: Unit Tests runs-on: ubuntu-22.04 env: - helmUnitVersion: 0.4.1 + helmUnitVersion: 0.4.2 strategy: fail-fast: false matrix: diff --git a/library/common-test/Chart.yaml b/library/common-test/Chart.yaml index f2298282..acd5543b 100644 --- a/library/common-test/Chart.yaml +++ b/library/common-test/Chart.yaml @@ -3,7 +3,7 @@ appVersion: "" dependencies: - name: common repository: file://../common - version: ~17.5.0 + version: ~18.0.0 deprecated: false description: Helper chart to test different use cases of the common library home: https://github.com/truecharts/apps/tree/master/charts/library/common-test diff --git a/library/common-test/tests/container/envFixed_test .yaml b/library/common-test/tests/container/envFixed_test .yaml index eff52385..62c945f1 100644 --- a/library/common-test/tests/container/envFixed_test .yaml +++ b/library/common-test/tests/container/envFixed_test .yaml @@ -224,12 +224,6 @@ tests: - it: 
should create the correct fixed envs with GPU set: - scaleGPU: - - gpu: - nvidia.com/gpu: 1 - targetSelector: - workload-name: - - container-name1 image: *image TZ: Europe/London containerOptions: @@ -248,6 +242,9 @@ tests: primary: true imageSelector: image probes: *probes + resources: + limits: + nvidia.com/gpu: 1 asserts: - documentIndex: &deploymentDoc 0 isKind: @@ -273,12 +270,6 @@ tests: - it: should create the correct fixed envs with GPU and overridden on container level set: - scaleGPU: - - gpu: - nvidia.com/gpu: 1 - targetSelector: - workload-name: - - container-name1 image: *image TZ: Europe/London containerOptions: @@ -297,6 +288,9 @@ tests: primary: true imageSelector: image probes: *probes + resources: + limits: + nvidia.com/gpu: 1 fixedEnv: NVIDIA_CAPS: - all diff --git a/library/common-test/tests/container/resources_test.yaml b/library/common-test/tests/container/resources_test.yaml index c98e243a..2c914598 100644 --- a/library/common-test/tests/container/resources_test.yaml +++ b/library/common-test/tests/container/resources_test.yaml @@ -67,6 +67,7 @@ tests: limits: cpu: 2000m memory: 4Gi + nvidia.com/gpu: 1 some-resource: 1 some-other-resource: 0 asserts: @@ -84,6 +85,7 @@ tests: limits: cpu: 2000m memory: 4Gi + nvidia.com/gpu: 1 some-resource: 1 some-other-resource: 0 requests: @@ -113,7 +115,8 @@ tests: limits: cpu: 2000m memory: 4Gi - some-resource: 2 + some-resource: 0 + some-other-resource: 2 asserts: - documentIndex: &deploymentDoc 0 isKind: @@ -129,8 +132,8 @@ tests: limits: cpu: 2000m memory: 4Gi - some-resource: 2 - some-other-resource: 0 + some-resource: 0 + some-other-resource: 2 requests: cpu: 10m memory: 50Mi @@ -393,439 +396,6 @@ tests: cpu: 10m memory: 1Gi - - it: should assign GPU on the primary pod/container - set: - image: *image - scaleGPU: - - gpu: - nvidia.com/gpu: 1 - global: - namespace: ix-namespace - ixChartContext: - addNvidiaRuntimeClass: true - nvidiaRuntimeClassName: nvidia - workload: - workload-name1: - enabled: true 
- primary: true - type: Deployment - podSpec: - containers: - container-name1: - enabled: true - primary: true - imageSelector: image - probes: *probes - container-name2: - enabled: true - primary: false - imageSelector: image - probes: *probes - workload-name2: - enabled: true - primary: false - type: Deployment - podSpec: - containers: - container-name1: - enabled: true - primary: true - imageSelector: image - probes: *probes - asserts: - - documentIndex: &deploymentDoc 0 - isKind: - of: Deployment - - documentIndex: *deploymentDoc - isAPIVersion: - of: apps/v1 - - documentIndex: *deploymentDoc - equal: - path: spec.template.spec.runtimeClassName - value: nvidia - - documentIndex: *deploymentDoc - isSubset: - path: spec.template.spec.containers[0] - content: - resources: - limits: - cpu: 4000m - memory: 8Gi - nvidia.com/gpu: "1" - requests: - cpu: 10m - memory: 50Mi - - documentIndex: *deploymentDoc - isSubset: - path: spec.template.spec.containers[1] - content: - resources: - limits: - cpu: 4000m - memory: 8Gi - requests: - cpu: 10m - memory: 50Mi - - documentIndex: &otherDeploymentDoc 1 - isKind: - of: Deployment - - documentIndex: *otherDeploymentDoc - isAPIVersion: - of: apps/v1 - - documentIndex: *otherDeploymentDoc - isNullOrEmpty: - path: spec.template.spec.runtimeClassName - - documentIndex: *otherDeploymentDoc - isSubset: - path: spec.template.spec.containers[0] - content: - resources: - limits: - cpu: 4000m - memory: 8Gi - requests: - cpu: 10m - memory: 50Mi - - - it: should assign GPU on the selected pod/container - set: - image: *image - global: - namespace: ix-namespace - ixChartContext: - addNvidiaRuntimeClass: true - nvidiaRuntimeClassName: nvidia - scaleGPU: - - gpu: - nvidia.com/gpu: 1 - targetSelector: - workload-name2: - - container-name1 - workload: - workload-name1: - enabled: true - primary: true - type: Deployment - podSpec: - containers: - container-name1: - enabled: true - primary: true - imageSelector: image - probes: *probes - 
container-name2: - enabled: true - primary: false - imageSelector: image - probes: *probes - workload-name2: - enabled: true - primary: false - type: Deployment - podSpec: - containers: - container-name1: - enabled: true - primary: true - imageSelector: image - probes: *probes - asserts: - - documentIndex: &deploymentDoc 0 - isKind: - of: Deployment - - documentIndex: *deploymentDoc - isAPIVersion: - of: apps/v1 - - documentIndex: *deploymentDoc - isNullOrEmpty: - path: spec.template.spec.runtimeClassName - - documentIndex: *deploymentDoc - isSubset: - path: spec.template.spec.containers[0] - content: - resources: - limits: - cpu: 4000m - memory: 8Gi - requests: - cpu: 10m - memory: 50Mi - - documentIndex: *deploymentDoc - isSubset: - path: spec.template.spec.containers[1] - content: - resources: - limits: - cpu: 4000m - memory: 8Gi - requests: - cpu: 10m - memory: 50Mi - - documentIndex: &otherDeploymentDoc 1 - isKind: - of: Deployment - - documentIndex: *otherDeploymentDoc - isAPIVersion: - of: apps/v1 - - documentIndex: *otherDeploymentDoc - equal: - path: spec.template.spec.runtimeClassName - value: nvidia - - documentIndex: *otherDeploymentDoc - isSubset: - path: spec.template.spec.containers[0] - content: - resources: - limits: - cpu: 4000m - memory: 8Gi - nvidia.com/gpu: "1" - requests: - cpu: 10m - memory: 50Mi - - - it: should assign GPU on the selected pods/containers - set: - image: *image - global: - namespace: ix-namespace - ixChartContext: - addNvidiaRuntimeClass: true - nvidiaRuntimeClassName: nvidia - scaleGPU: - - gpu: - nvidia.com/gpu: 1 - targetSelector: - workload-name1: - - container-name1 - - container-name2 - workload: - workload-name1: - enabled: true - primary: true - type: Deployment - podSpec: - containers: - container-name1: - enabled: true - primary: true - imageSelector: image - probes: *probes - container-name2: - enabled: true - primary: false - imageSelector: image - probes: *probes - workload-name2: - enabled: true - primary: false 
- type: Deployment - podSpec: - containers: - container-name1: - enabled: true - primary: true - imageSelector: image - probes: *probes - asserts: - - documentIndex: &deploymentDoc 0 - isKind: - of: Deployment - - documentIndex: *deploymentDoc - isAPIVersion: - of: apps/v1 - - documentIndex: *deploymentDoc - equal: - path: spec.template.spec.runtimeClassName - value: nvidia - - documentIndex: *deploymentDoc - isSubset: - path: spec.template.spec.containers[0] - content: - resources: - limits: - cpu: 4000m - memory: 8Gi - nvidia.com/gpu: "1" - requests: - cpu: 10m - memory: 50Mi - - documentIndex: *deploymentDoc - isSubset: - path: spec.template.spec.containers[1] - content: - resources: - limits: - cpu: 4000m - memory: 8Gi - nvidia.com/gpu: "1" - requests: - cpu: 10m - memory: 50Mi - - documentIndex: &otherDeploymentDoc 1 - isKind: - of: Deployment - - documentIndex: *otherDeploymentDoc - isAPIVersion: - of: apps/v1 - - documentIndex: *otherDeploymentDoc - isNullOrEmpty: - path: spec.template.spec.runtimeClassName - - documentIndex: *otherDeploymentDoc - isSubset: - path: spec.template.spec.containers[0] - content: - resources: - limits: - cpu: 4000m - memory: 8Gi - requests: - cpu: 10m - memory: 50Mi - - - it: should assign GPU on the selected pod/container with multiple GPUs - set: - image: *image - global: - namespace: ix-namespace - ixChartContext: - addNvidiaRuntimeClass: true - nvidiaRuntimeClassName: nvidia - scaleGPU: - - gpu: - nvidia.com/gpu: 1 - amd.com/gpu: 0 - targetSelector: - workload-name1: - - container-name1 - - container-name2 - workload: - workload-name1: - enabled: true - primary: true - type: Deployment - podSpec: - containers: - container-name1: - enabled: true - primary: true - imageSelector: image - probes: *probes - container-name2: - enabled: true - primary: false - imageSelector: image - probes: *probes - asserts: - - documentIndex: &deploymentDoc 0 - isKind: - of: Deployment - - documentIndex: *deploymentDoc - isAPIVersion: - of: 
apps/v1 - - documentIndex: *deploymentDoc - equal: - path: spec.template.spec.runtimeClassName - value: nvidia - - documentIndex: *deploymentDoc - isSubset: - path: spec.template.spec.containers[0] - content: - resources: - limits: - cpu: 4000m - memory: 8Gi - nvidia.com/gpu: "1" - requests: - cpu: 10m - memory: 50Mi - - documentIndex: *deploymentDoc - isSubset: - path: spec.template.spec.containers[1] - content: - resources: - limits: - cpu: 4000m - memory: 8Gi - nvidia.com/gpu: "1" - requests: - cpu: 10m - memory: 50Mi - - - it: should assign multiple GPU on the selected pod/container with multiple selected GPUs - set: - image: *image - global: - namespace: ix-namespace - ixChartContext: - addNvidiaRuntimeClass: true - nvidiaRuntimeClassName: nvidia - scaleGPU: - - gpu: - nvidia.com/gpu: 1 - amd.com/gpu: 0 - targetSelector: - workload-name1: - - container-name1 - - gpu: - nvidia.com/gpu: 0 - amd.com/gpu: 1 - targetSelector: - workload-name1: - - container-name2 - workload: - workload-name1: - enabled: true - primary: true - type: Deployment - podSpec: - containers: - container-name1: - enabled: true - primary: true - imageSelector: image - probes: *probes - container-name2: - enabled: true - primary: false - imageSelector: image - probes: *probes - asserts: - - documentIndex: &deploymentDoc 0 - isKind: - of: Deployment - - documentIndex: *deploymentDoc - isAPIVersion: - of: apps/v1 - - documentIndex: *deploymentDoc - equal: - path: spec.template.spec.runtimeClassName - value: nvidia - - documentIndex: *deploymentDoc - isSubset: - path: spec.template.spec.containers[0] - content: - resources: - limits: - cpu: 4000m - memory: 8Gi - nvidia.com/gpu: "1" - requests: - cpu: 10m - memory: 50Mi - - documentIndex: *deploymentDoc - isSubset: - path: spec.template.spec.containers[1] - content: - resources: - limits: - cpu: 4000m - memory: 8Gi - amd.com/gpu: "1" - requests: - cpu: 10m - memory: 50Mi - # Failures - it: should fail with empty requests set: @@ -993,101 +563,3 
@@ tests: asserts: - failedTemplate: errorMessage: Container - Expected [resources.limits.memory] to have one of the following formats [(Suffixed with E/P/T/G/M/K - eg. 1G), (Suffixed with Ei/Pi/Ti/Gi/Mi/Ki - eg. 1Gi), (Plain Integer in bytes - eg. 1024), (Exponent - eg. 134e6)], but got [8GB] - - - it: should fail with empty gpu in defined entry - set: - image: *image - scaleGPU: - - gpu: - targetSelector: - workload-name1: - - container-name1 - workload: - workload-name1: - enabled: true - primary: true - type: Deployment - podSpec: - containers: - container-name1: - enabled: true - primary: true - imageSelector: image - probes: *probes - asserts: - - failedTemplate: - errorMessage: Container - Expected non-empty [scaleGPU.gpu] - - - it: should fail with empty list under workload in targetSelector - set: - image: *image - scaleGPU: - - gpu: - key: value - targetSelector: - workload-name1: [] - workload: - workload-name1: - enabled: true - primary: true - type: Deployment - podSpec: - containers: - container-name1: - enabled: true - primary: true - imageSelector: image - probes: *probes - asserts: - - failedTemplate: - errorMessage: Container - Expected non-empty list under pod in [scaleGPU.targetSelector] - - - it: should fail with empty value in gpu - set: - image: *image - scaleGPU: - - gpu: - key: "" - targetSelector: - workload-name1: - - container-name1 - workload: - workload-name1: - enabled: true - primary: true - type: Deployment - podSpec: - containers: - container-name1: - enabled: true - primary: true - imageSelector: image - probes: *probes - asserts: - - failedTemplate: - errorMessage: Container - Expected non-empty [scaleGPU] [value] - - - it: should fail with no value in gpu - set: - image: *image - scaleGPU: - - gpu: - key: - targetSelector: - workload-name1: - - container-name1 - workload: - workload-name1: - enabled: true - primary: true - type: Deployment - podSpec: - containers: - container-name1: - enabled: true - primary: true - 
imageSelector: image - probes: *probes - asserts: - - failedTemplate: - errorMessage: Container - Expected non-empty [scaleGPU] [value] diff --git a/library/common-test/tests/pod/host_users_test.yaml b/library/common-test/tests/pod/host_users_test.yaml index 6e43c03d..66aa8f67 100644 --- a/library/common-test/tests/pod/host_users_test.yaml +++ b/library/common-test/tests/pod/host_users_test.yaml @@ -141,11 +141,45 @@ tests: path: spec.template.spec.hostUsers value: true - - it: should pass with enabled hostUsers because of gpu + - it: should pass with enabled hostUsers because of nvidia gpu set: - scaleGPU: - - gpu: - nvidia: "1" + resources: + limits: + nvidia.com/gpu: 1 + workload: + workload-name1: + enabled: true + primary: true + type: Deployment + podSpec: {} + asserts: + - documentIndex: *deploymentDoc + equal: + path: spec.template.spec.hostUsers + value: true + + - it: should pass with enabled hostUsers because of amd gpu + set: + resources: + limits: + amd.com/gpu: 1 + workload: + workload-name1: + enabled: true + primary: true + type: Deployment + podSpec: {} + asserts: + - documentIndex: *deploymentDoc + equal: + path: spec.template.spec.hostUsers + value: true + + - it: should pass with enabled hostUsers because of intel gpu + set: + resources: + limits: + intel.com/i915: 1 workload: workload-name1: enabled: true diff --git a/library/common-test/tests/pod/runtime_class_name_test.yaml b/library/common-test/tests/pod/runtime_class_name_test.yaml index adde23ca..bb408445 100644 --- a/library/common-test/tests/pod/runtime_class_name_test.yaml +++ b/library/common-test/tests/pod/runtime_class_name_test.yaml @@ -74,149 +74,6 @@ tests: path: spec.template.spec.runtimeClassName value: some-other-runtime-class - - it: should pass with runtimeClassName from ixChartContext with targetSelector - set: - scaleGPU: - - gpu: - key: value - targetSelector: - workload-name1: - - container-name1 - workload-name3: - - container-name1 - global: - namespace: ix-namespace - 
ixChartContext: - addNvidiaRuntimeClass: true - nvidiaRuntimeClassName: ix-runtime - podOptions: - runtimeClassName: some-class - workload: - workload-name1: - enabled: true - primary: true - type: Deployment - podSpec: - runtimeClassName: some-other-class - workload-name2: - enabled: true - primary: false - type: DaemonSet - podSpec: - runtimeClassName: some-class - workload-name3: - enabled: true - primary: false - type: StatefulSet - podSpec: - runtimeClassName: some-class - asserts: - - documentIndex: *deploymentDoc - equal: - path: spec.template.spec.runtimeClassName - value: ix-runtime - - documentIndex: &daemonSetDoc 1 - isKind: - of: DaemonSet - - documentIndex: *daemonSetDoc - equal: - path: spec.template.spec.runtimeClassName - value: some-class - - documentIndex: &statefulSetDoc 2 - isKind: - of: StatefulSet - - documentIndex: *statefulSetDoc - equal: - path: spec.template.spec.runtimeClassName - value: ix-runtime - - - it: should not add runtimeClassName with gpu value 0 - set: - scaleGPU: - - gpu: - key: 0 - global: - namespace: ix-namespace - ixChartContext: - addNvidiaRuntimeClass: true - nvidiaRuntimeClassName: ix-runtime - workload: - workload-name1: - enabled: true - primary: true - type: Deployment - podSpec: {} - asserts: - - documentIndex: *deploymentDoc - isNullOrEmpty: - path: spec.template.spec.runtimeClassName - - - it: should pass with runtimeClassName from ixChartContext without targetSelector (on primary workload) - set: - scaleGPU: - - gpu: - key: value - global: - namespace: ix-namespace - ixChartContext: - addNvidiaRuntimeClass: true - nvidiaRuntimeClassName: ix-runtime - workload: - workload-name1: - enabled: true - primary: true - type: Job - podSpec: - runtimeClassName: some-other-class - workload-name2: - enabled: true - primary: false - type: CronJob - schedule: "* * * * *" - podSpec: {} - asserts: - - documentIndex: &jobDoc 0 - isKind: - of: Job - - documentIndex: *jobDoc - equal: - path: spec.template.spec.runtimeClassName - 
value: ix-runtime - - documentIndex: &cronJobDoc 1 - isKind: - of: CronJob - - documentIndex: *cronJobDoc - isNullOrEmpty: - path: spec.jobTemplate.spec.template.spec.runtimeClassName - - - it: should pass with runtimeClassName not set from ixChartContext without gpu value - set: - scaleGPU: - - gpu: {} - targetSelector: - workload-name1: - - container-name1 - global: - namespace: ix-namespace - ixChartContext: - addNvidiaRuntimeClass: true - nvidiaRuntimeClassName: ix-runtime - workload: - workload-name1: - enabled: true - primary: true - type: Job - podSpec: - runtimeClassName: some-other-class - asserts: - - documentIndex: *jobDoc - isKind: - of: Job - - documentIndex: *jobDoc - equal: - path: spec.template.spec.runtimeClassName - value: some-other-class - - it: should pass with runtimeClass set to nvidia when in SCALE and using the container "resources" directly set: global: @@ -250,6 +107,39 @@ tests: path: spec.template.spec.runtimeClassName value: nvidia + - it: should pass with runtimeClass NOT set when in SCALE and 0 gpu + set: + global: + namespace: ix-namespace + ixChartContext: + addNvidiaRuntimeClass: true + nvidiaRuntimeClassName: ix-runtime + workload: + workload-name1: + enabled: true + primary: true + type: Deployment + podSpec: + containers: + container-name: + enabled: true + primary: true + probes: + liveness: + enabled: false + readiness: + enabled: false + startup: + enabled: false + resources: + limits: + nvidia.com/gpu: + asserts: + - documentIndex: *deploymentDoc + equal: + path: spec.template.spec.runtimeClassName + value: null + - it: should pass with runtimeClass set to nvidia when in SCALE and using the top level "resources" directly set: global: diff --git a/library/common-test/tests/pod/securityContext.yaml b/library/common-test/tests/pod/securityContext.yaml index 7bc2fc9b..d8a2ad38 100644 --- a/library/common-test/tests/pod/securityContext.yaml +++ b/library/common-test/tests/pod/securityContext.yaml @@ -354,7 +354,7 @@ tests: - 568 
sysctls: [] - - it: should pass with with gpu assigned to primary pod + - it: should pass with gpu assigned to single container set: workload: workload-name1: @@ -365,14 +365,25 @@ tests: securityContext: supplementalGroups: - 1000 + containers: + container1: + enabled: true + primary: true + probes: + liveness: + enabled: false + readiness: + enabled: false + startup: + enabled: false + resources: + limits: + nvidia.com/gpu: 1 workload-name2: enabled: true primary: false type: Deployment podSpec: {} - scaleGPU: - - gpu: - nvidia: "1" asserts: - documentIndex: &deploymentDoc 0 isKind: @@ -402,7 +413,7 @@ tests: - 568 sysctls: [] - - it: should pass with with gpu assigned to specific pod + - it: should pass with gpu assigned to multiple pods set: workload: workload-name1: @@ -413,70 +424,39 @@ tests: securityContext: supplementalGroups: - 1000 + containers: + container1: + enabled: true + primary: true + probes: + liveness: + enabled: false + readiness: + enabled: false + startup: + enabled: false + resources: + limits: + nvidia.com/gpu: 1 workload-name2: enabled: true primary: false type: Deployment - podSpec: {} - scaleGPU: - - gpu: - nvidia: "1" - targetSelector: - workload-name1: - - container-name1 - asserts: - - documentIndex: &deploymentDoc 0 - isKind: - of: Deployment - - documentIndex: *deploymentDoc - equal: - path: spec.template.spec.securityContext - value: - fsGroup: 568 - fsGroupChangePolicy: OnRootMismatch - supplementalGroups: - - 1000 - - 44 - - 107 - - 568 - sysctls: [] - - documentIndex: &otherDeploymentDoc 1 - isKind: - of: Deployment - - documentIndex: *otherDeploymentDoc - equal: - path: spec.template.spec.securityContext - value: - fsGroup: 568 - fsGroupChangePolicy: OnRootMismatch - supplementalGroups: - - 568 - sysctls: [] - - - it: should pass with with gpu assigned to multiple pod - set: - workload: - workload-name1: - enabled: true - primary: true - type: Deployment podSpec: - securityContext: - supplementalGroups: - - 1000 -
workload-name2: - enabled: true - primary: false - type: Deployment - podSpec: {} - scaleGPU: - - gpu: - nvidia: "1" - targetSelector: - workload-name1: - - container-name1 - workload-name2: - - container-name1 + containers: + container1: + enabled: true + primary: true + probes: + liveness: + enabled: false + readiness: + enabled: false + startup: + enabled: false + resources: + limits: + nvidia.com/gpu: 1 asserts: - documentIndex: &deploymentDoc 0 isKind: diff --git a/library/common/templates/lib/container/_fixedEnv.tpl b/library/common/templates/lib/container/_fixedEnv.tpl index df9f359a..a25887e5 100644 --- a/library/common/templates/lib/container/_fixedEnv.tpl +++ b/library/common/templates/lib/container/_fixedEnv.tpl @@ -47,7 +47,7 @@ objectData: The object data to be used to render the container. {{- $fixed = mustAppend $fixed (dict "k" "UMASK_SET" "v" $UMASK) -}} {{- $nvidia := false -}} - {{- if eq (include "tc.v1.common.lib.container.resources.gpu" (dict "rootCtx" $rootCtx "objectData" $objectData "returnBool" true)) "true" -}} + {{- if eq (include "tc.v1.common.lib.container.resources.hasGPU" (dict "rootCtx" $rootCtx "objectData" $objectData "gpuType" "nvidia.com/gpu")) "true" -}} {{- $nvidia = true -}} {{- end -}} diff --git a/library/common/templates/lib/container/_resources.tpl b/library/common/templates/lib/container/_resources.tpl index 5910c53a..d7eb3469 100644 --- a/library/common/templates/lib/container/_resources.tpl +++ b/library/common/templates/lib/container/_resources.tpl @@ -27,73 +27,12 @@ limits: {{- with $resources.limits.memory }} {{/* Passing 0, will not render it, meaning unlimited */}} memory: {{ . 
}} {{- end -}} - {{- include "tc.v1.common.lib.container.resources.gpu" (dict "rootCtx" $rootCtx "objectData" $objectData) | trim | nindent 2 -}} {{- range $k, $v := (omit $resources.limits "cpu" "memory") }} {{/* Omit cpu and memory, as they are handled above */}} {{ $k }}: {{ $v }} {{- end -}} {{- end -}} {{- end -}} -{{/* Returns GPU resource */}} -{{/* Call this template: -{{ include "tc.v1.common.lib.container.resources.gpu" (dict "rootCtx" $rootCtx "objectData" $objectData) }} -rootCtx: The root context of the chart. -objectData: The object data to be used to render the container. -*/}} -{{- define "tc.v1.common.lib.container.resources.gpu" -}} - {{- $objectData := .objectData -}} - {{- $rootCtx := .rootCtx -}} - {{- $returnBool := .returnBool -}} - - {{- $gpuResource := list -}} - - {{- range $GPUValues := $rootCtx.Values.scaleGPU -}} - {{- if not $GPUValues.gpu -}} - {{- fail "Container - Expected non-empty [scaleGPU.gpu]" -}} - {{- end -}} - - {{- $selected := false -}} - - {{/* Parse selector if defined */}} - {{- if $GPUValues.targetSelector -}} - {{- range $podName, $containers := $GPUValues.targetSelector -}} - {{- if not $containers -}} - {{- fail "Container - Expected non-empty list under pod in [scaleGPU.targetSelector]" -}} - {{- end -}} - - {{- if and (eq $podName $objectData.podShortName) (mustHas $objectData.shortName $containers) -}} - {{- $selected = true -}} - {{- end -}} - {{- end -}} - {{/* If no selector, select primary pod/container */}} - {{- else if and $objectData.podPrimary $objectData.primary -}} - {{- $selected = true -}} - {{- end -}} - - {{- if $selected -}} - {{- $gpuResource = mustAppend $gpuResource $GPUValues.gpu -}} - {{- end -}} - {{- end -}} - - {{- if not $returnBool -}} - {{- range $gpu := $gpuResource -}} - {{- range $k, $v := $gpu -}} - {{- if or (kindIs "invalid" $v) (eq (toString $v) "") -}} - {{- fail "Container - Expected non-empty [scaleGPU] [value]" -}} - {{- end -}} {{/* Don't try to schedule 0 GPUs */}} - {{- if 
gt (int $v) 0 }} -{{ $k }}: {{ $v | quote }} - {{- end -}} - {{- end -}} - {{- end -}} - {{- else -}} - {{- if $gpuResource -}} - {{- "true" -}} - {{- end -}} - {{- end -}} - -{{- end -}} - {{/* Validates resources to match a pattern */}} {{/* Call this template: {{ include "tc.v1.common.lib.container.resources.validation" (dict "resources" $resources) }} @@ -141,3 +80,59 @@ resources: The resources object {{- end -}} {{- end -}} {{- end -}} + +{{- define "tc.v1.common.lib.pod.resources.hasGPU" -}} + {{- $rootCtx := .rootCtx -}} + {{- $objectData := .objectData -}} + {{- $gpuType := .gpuType -}} + + {{- $types := (list "nvidia.com/gpu" "amd.com/gpu" "intel.com/i915") -}} + {{- if $gpuType -}} + {{- $types = (list $gpuType) -}} + {{- end -}} + + {{- $gpu := false -}} + + {{- if and ($rootCtx.Values.resources) ($rootCtx.Values.resources.limits) -}} + {{- range $t := $types -}} + {{- if gt ((get $rootCtx.Values.resources.limits $t) | int) 0 -}} + {{- $gpu = true -}} + {{- break -}} + {{- end -}} + {{- end -}} + {{- end -}} + + {{- if $objectData.podSpec -}} + {{- range $k, $v := $objectData.podSpec.containers -}} + {{- if not $v.enabled -}} + {{- continue -}} + {{- end -}} + + {{- range $t := $types -}} + {{- if eq (include "tc.v1.common.lib.container.resources.hasGPU" (dict "rootCtx" $rootCtx "objectData" $v "gpuType" $t)) "true" -}} + {{- $gpu = true -}} + {{- break -}} + {{- end -}} + {{- end -}} + + {{- end -}} + {{- end -}} + + {{- $gpu | toString -}} +{{- end -}} + +{{- define "tc.v1.common.lib.container.resources.hasGPU" -}} + {{- $rootCtx := .rootCtx -}} + {{- $objectData := .objectData -}} + {{- $gpuType := .gpuType -}} + + {{- $gpu := false -}} + + {{- if and ($objectData.resources) ($objectData.resources.limits) -}} + {{- if gt ((get $objectData.resources.limits $gpuType) | int) 0 -}} + {{- $gpu = true -}} + {{- end -}} + {{- end -}} + + {{- $gpu | toString -}} +{{- end -}} diff --git a/library/common/templates/lib/pod/_podSecurityContext.tpl 
b/library/common/templates/lib/pod/_podSecurityContext.tpl index 9dc5ac6d..87db62d5 100644 --- a/library/common/templates/lib/pod/_podSecurityContext.tpl +++ b/library/common/templates/lib/pod/_podSecurityContext.tpl @@ -20,18 +20,7 @@ objectData: The object data to be used to render the Pod. {{- $secContext = mustMergeOverwrite $secContext . -}} {{- end -}} - {{- $gpuAdded := false -}} - {{- range $GPUValues := $rootCtx.Values.scaleGPU -}} - {{/* If there is a selector and pod is selected */}} - {{- if $GPUValues.targetSelector -}} - {{- if mustHas $objectData.shortName ($GPUValues.targetSelector | keys) -}} - {{- $gpuAdded = true -}} - {{- end -}} - {{/* If there is not a selector, but pod is primary */}} - {{- else if $objectData.primary -}} - {{- $gpuAdded = true -}} - {{- end -}} - {{- end -}} + {{- $gpu := (include "tc.v1.common.lib.pod.resources.hasGPU" (dict "rootCtx" $rootCtx "objectData" $objectData)) -}} {{- $deviceGroups := (list 5 10 20 24) -}} {{- $deviceAdded := false -}} @@ -84,7 +73,7 @@ objectData: The object data to be used to render the Pod. {{- end -}} {{- end -}} - {{- if $gpuAdded -}} + {{- if eq $gpu "true" -}} {{- $_ := set $secContext "supplementalGroups" (concat $secContext.supplementalGroups (list 44 107)) -}} {{- $hostUsers = true -}} {{- end -}} diff --git a/library/common/templates/lib/pod/_runtimeClassName.tpl b/library/common/templates/lib/pod/_runtimeClassName.tpl index 4d4efc55..b419acd8 100644 --- a/library/common/templates/lib/pod/_runtimeClassName.tpl +++ b/library/common/templates/lib/pod/_runtimeClassName.tpl @@ -34,69 +34,13 @@ objectData: The object data to be used to render the Pod. 
{{- define "tc.v1.common.lib.pod.runtimeClassName.scale" -}} {{- $rootCtx := .rootCtx -}} {{- $objectData := .objectData -}} + {{- $runtime := "" -}} - - {{- $nvidia := false -}} - - {{- if and ($rootCtx.Values.resources) ($rootCtx.Values.resources.limits) -}} - {{- if gt ((get $rootCtx.Values.resources.limits "nvidia.com/gpu") | int) 0 -}} - {{- $nvidia = true -}} - {{- end -}} - {{- end -}} - - {{- range $rootCtx.Values.workload -}} - {{- if not .podSpec -}} - {{- continue -}} - {{- end -}} - - {{- range $k, $v := .podSpec.containers -}} - {{- if or (not $v.resources) (not $v.resources.limits) -}} - {{- continue -}} - {{- end -}} - - {{- if gt ((get $v.resources.limits "nvidia.com/gpu") | int) 0 -}} - {{- $nvidia = true -}} - {{- break -}} - {{- end -}} - {{- end -}} - {{- end -}} - - {{- if $nvidia -}} + {{- $nvidia := (include "tc.v1.common.lib.pod.resources.hasGPU" (dict "rootCtx" $rootCtx "objectData" $objectData "gpuType" "nvidia.com/gpu")) -}} {{/* Only nvidia.com/gpu limits may select the nvidia runtime class; the helper also matches AMD and Intel GPUs when gpuType is omitted */}} + {{- if eq $nvidia "true" -}} {{/* https://github.com/truenas/middleware/blob/0bfc05166c3f95b1ab4ca4a9614691f14303db2e/src/middlewared/middlewared/plugins/kubernetes_linux/utils.py#L16 */}} {{- $runtime = "nvidia" -}} {{- end -}} - {{/* Keep backwards compat with .scaleGPU */}} - {{- if $rootCtx.Values.global.ixChartContext.addNvidiaRuntimeClass -}} - {{- range $rootCtx.Values.scaleGPU -}} - {{- if .gpu -}} {{/* Make sure it has a value...
*/}} - {{- $scaleGPU := false -}} - {{- range $k, $v := .gpu -}} - {{- if $v -}} {{/* Make sure value is not "0" or "" */}} - {{- $scaleGPU = true -}} - {{- break -}} - {{- end -}} - {{- end -}} - - {{- if $scaleGPU -}} - - {{- if (kindIs "map" .targetSelector) -}} - {{- range $podName, $containers := .targetSelector -}} - - {{- if eq $objectData.shortName $podName -}} {{/* If the pod is selected */}} - {{- $runtime = $rootCtx.Values.global.ixChartContext.nvidiaRuntimeClassName -}} - {{- end -}} - - {{- end -}} - {{- else if $objectData.primary -}} - {{/* If the pod is primary and no targetSelector is given, assign to primary */}} - {{- $runtime = $rootCtx.Values.global.ixChartContext.nvidiaRuntimeClassName -}} - {{- end -}} - - {{- end -}} - {{- end -}} - {{- end -}} - {{- end -}} - {{- $runtime -}} {{- end -}} diff --git a/library/common/values.yaml b/library/common/values.yaml index 6d443420..35cf570a 100644 --- a/library/common/values.yaml +++ b/library/common/values.yaml @@ -385,9 +385,6 @@ scaleExternalInterface: [] # -- (docs/scaleCertificate.md) scaleCertificate: {} -# -- (docs/scaleGPU.md) -scaleGPU: [] - # NOTES.txt notes: header: |