diff --git a/library/common-test/tests/container/envFixed_test .yaml b/library/common-test/tests/container/envFixed_test .yaml index eff52385..d55847b2 100644 --- a/library/common-test/tests/container/envFixed_test .yaml +++ b/library/common-test/tests/container/envFixed_test .yaml @@ -222,107 +222,6 @@ tests: - name: GID value: "568" - - it: should create the correct fixed envs with GPU - set: - scaleGPU: - - gpu: - nvidia.com/gpu: 1 - targetSelector: - workload-name: - - container-name1 - image: *image - TZ: Europe/London - containerOptions: - NVIDIA_CAPS: - - compute - - video - workload: - workload-name: - enabled: true - primary: true - type: Deployment - podSpec: - containers: - container-name1: - enabled: true - primary: true - imageSelector: image - probes: *probes - asserts: - - documentIndex: &deploymentDoc 0 - isKind: - of: Deployment - - documentIndex: *deploymentDoc - isAPIVersion: - of: apps/v1 - - documentIndex: *deploymentDoc - isSubset: - path: spec.template.spec.containers[0] - content: - env: - - name: TZ - value: Europe/London - - name: UMASK - value: "0022" - - name: UMASK_SET - value: "0022" - - name: NVIDIA_DRIVER_CAPABILITIES - value: "compute,video" - - name: S6_READ_ONLY_ROOT - value: "1" - - - it: should create the correct fixed envs with GPU and overridden on container level - set: - scaleGPU: - - gpu: - nvidia.com/gpu: 1 - targetSelector: - workload-name: - - container-name1 - image: *image - TZ: Europe/London - containerOptions: - NVIDIA_CAPS: - - compute - - video - workload: - workload-name: - enabled: true - primary: true - type: Deployment - podSpec: - containers: - container-name1: - enabled: true - primary: true - imageSelector: image - probes: *probes - fixedEnv: - NVIDIA_CAPS: - - all - asserts: - - documentIndex: &deploymentDoc 0 - isKind: - of: Deployment - - documentIndex: *deploymentDoc - isAPIVersion: - of: apps/v1 - - documentIndex: *deploymentDoc - isSubset: - path: spec.template.spec.containers[0] - content: - env: - - 
name: TZ - value: Europe/London - - name: UMASK - value: "0022" - - name: UMASK_SET - value: "0022" - - name: NVIDIA_DRIVER_CAPABILITIES - value: "all" - - name: S6_READ_ONLY_ROOT - value: "1" - - it: should create the correct fixed envs with PUID set to 0 on container level set: image: *image diff --git a/library/common-test/tests/container/resources_test.yaml b/library/common-test/tests/container/resources_test.yaml index c98e243a..f6c74204 100644 --- a/library/common-test/tests/container/resources_test.yaml +++ b/library/common-test/tests/container/resources_test.yaml @@ -393,439 +393,6 @@ tests: cpu: 10m memory: 1Gi - - it: should assign GPU on the primary pod/container - set: - image: *image - scaleGPU: - - gpu: - nvidia.com/gpu: 1 - global: - namespace: ix-namespace - ixChartContext: - addNvidiaRuntimeClass: true - nvidiaRuntimeClassName: nvidia - workload: - workload-name1: - enabled: true - primary: true - type: Deployment - podSpec: - containers: - container-name1: - enabled: true - primary: true - imageSelector: image - probes: *probes - container-name2: - enabled: true - primary: false - imageSelector: image - probes: *probes - workload-name2: - enabled: true - primary: false - type: Deployment - podSpec: - containers: - container-name1: - enabled: true - primary: true - imageSelector: image - probes: *probes - asserts: - - documentIndex: &deploymentDoc 0 - isKind: - of: Deployment - - documentIndex: *deploymentDoc - isAPIVersion: - of: apps/v1 - - documentIndex: *deploymentDoc - equal: - path: spec.template.spec.runtimeClassName - value: nvidia - - documentIndex: *deploymentDoc - isSubset: - path: spec.template.spec.containers[0] - content: - resources: - limits: - cpu: 4000m - memory: 8Gi - nvidia.com/gpu: "1" - requests: - cpu: 10m - memory: 50Mi - - documentIndex: *deploymentDoc - isSubset: - path: spec.template.spec.containers[1] - content: - resources: - limits: - cpu: 4000m - memory: 8Gi - requests: - cpu: 10m - memory: 50Mi - - documentIndex: 
&otherDeploymentDoc 1 - isKind: - of: Deployment - - documentIndex: *otherDeploymentDoc - isAPIVersion: - of: apps/v1 - - documentIndex: *otherDeploymentDoc - isNullOrEmpty: - path: spec.template.spec.runtimeClassName - - documentIndex: *otherDeploymentDoc - isSubset: - path: spec.template.spec.containers[0] - content: - resources: - limits: - cpu: 4000m - memory: 8Gi - requests: - cpu: 10m - memory: 50Mi - - - it: should assign GPU on the selected pod/container - set: - image: *image - global: - namespace: ix-namespace - ixChartContext: - addNvidiaRuntimeClass: true - nvidiaRuntimeClassName: nvidia - scaleGPU: - - gpu: - nvidia.com/gpu: 1 - targetSelector: - workload-name2: - - container-name1 - workload: - workload-name1: - enabled: true - primary: true - type: Deployment - podSpec: - containers: - container-name1: - enabled: true - primary: true - imageSelector: image - probes: *probes - container-name2: - enabled: true - primary: false - imageSelector: image - probes: *probes - workload-name2: - enabled: true - primary: false - type: Deployment - podSpec: - containers: - container-name1: - enabled: true - primary: true - imageSelector: image - probes: *probes - asserts: - - documentIndex: &deploymentDoc 0 - isKind: - of: Deployment - - documentIndex: *deploymentDoc - isAPIVersion: - of: apps/v1 - - documentIndex: *deploymentDoc - isNullOrEmpty: - path: spec.template.spec.runtimeClassName - - documentIndex: *deploymentDoc - isSubset: - path: spec.template.spec.containers[0] - content: - resources: - limits: - cpu: 4000m - memory: 8Gi - requests: - cpu: 10m - memory: 50Mi - - documentIndex: *deploymentDoc - isSubset: - path: spec.template.spec.containers[1] - content: - resources: - limits: - cpu: 4000m - memory: 8Gi - requests: - cpu: 10m - memory: 50Mi - - documentIndex: &otherDeploymentDoc 1 - isKind: - of: Deployment - - documentIndex: *otherDeploymentDoc - isAPIVersion: - of: apps/v1 - - documentIndex: *otherDeploymentDoc - equal: - path: 
spec.template.spec.runtimeClassName - value: nvidia - - documentIndex: *otherDeploymentDoc - isSubset: - path: spec.template.spec.containers[0] - content: - resources: - limits: - cpu: 4000m - memory: 8Gi - nvidia.com/gpu: "1" - requests: - cpu: 10m - memory: 50Mi - - - it: should assign GPU on the selected pods/containers - set: - image: *image - global: - namespace: ix-namespace - ixChartContext: - addNvidiaRuntimeClass: true - nvidiaRuntimeClassName: nvidia - scaleGPU: - - gpu: - nvidia.com/gpu: 1 - targetSelector: - workload-name1: - - container-name1 - - container-name2 - workload: - workload-name1: - enabled: true - primary: true - type: Deployment - podSpec: - containers: - container-name1: - enabled: true - primary: true - imageSelector: image - probes: *probes - container-name2: - enabled: true - primary: false - imageSelector: image - probes: *probes - workload-name2: - enabled: true - primary: false - type: Deployment - podSpec: - containers: - container-name1: - enabled: true - primary: true - imageSelector: image - probes: *probes - asserts: - - documentIndex: &deploymentDoc 0 - isKind: - of: Deployment - - documentIndex: *deploymentDoc - isAPIVersion: - of: apps/v1 - - documentIndex: *deploymentDoc - equal: - path: spec.template.spec.runtimeClassName - value: nvidia - - documentIndex: *deploymentDoc - isSubset: - path: spec.template.spec.containers[0] - content: - resources: - limits: - cpu: 4000m - memory: 8Gi - nvidia.com/gpu: "1" - requests: - cpu: 10m - memory: 50Mi - - documentIndex: *deploymentDoc - isSubset: - path: spec.template.spec.containers[1] - content: - resources: - limits: - cpu: 4000m - memory: 8Gi - nvidia.com/gpu: "1" - requests: - cpu: 10m - memory: 50Mi - - documentIndex: &otherDeploymentDoc 1 - isKind: - of: Deployment - - documentIndex: *otherDeploymentDoc - isAPIVersion: - of: apps/v1 - - documentIndex: *otherDeploymentDoc - isNullOrEmpty: - path: spec.template.spec.runtimeClassName - - documentIndex: *otherDeploymentDoc - 
isSubset: - path: spec.template.spec.containers[0] - content: - resources: - limits: - cpu: 4000m - memory: 8Gi - requests: - cpu: 10m - memory: 50Mi - - - it: should assign GPU on the selected pod/container with multiple GPUs - set: - image: *image - global: - namespace: ix-namespace - ixChartContext: - addNvidiaRuntimeClass: true - nvidiaRuntimeClassName: nvidia - scaleGPU: - - gpu: - nvidia.com/gpu: 1 - amd.com/gpu: 0 - targetSelector: - workload-name1: - - container-name1 - - container-name2 - workload: - workload-name1: - enabled: true - primary: true - type: Deployment - podSpec: - containers: - container-name1: - enabled: true - primary: true - imageSelector: image - probes: *probes - container-name2: - enabled: true - primary: false - imageSelector: image - probes: *probes - asserts: - - documentIndex: &deploymentDoc 0 - isKind: - of: Deployment - - documentIndex: *deploymentDoc - isAPIVersion: - of: apps/v1 - - documentIndex: *deploymentDoc - equal: - path: spec.template.spec.runtimeClassName - value: nvidia - - documentIndex: *deploymentDoc - isSubset: - path: spec.template.spec.containers[0] - content: - resources: - limits: - cpu: 4000m - memory: 8Gi - nvidia.com/gpu: "1" - requests: - cpu: 10m - memory: 50Mi - - documentIndex: *deploymentDoc - isSubset: - path: spec.template.spec.containers[1] - content: - resources: - limits: - cpu: 4000m - memory: 8Gi - nvidia.com/gpu: "1" - requests: - cpu: 10m - memory: 50Mi - - - it: should assign multiple GPU on the selected pod/container with multiple selected GPUs - set: - image: *image - global: - namespace: ix-namespace - ixChartContext: - addNvidiaRuntimeClass: true - nvidiaRuntimeClassName: nvidia - scaleGPU: - - gpu: - nvidia.com/gpu: 1 - amd.com/gpu: 0 - targetSelector: - workload-name1: - - container-name1 - - gpu: - nvidia.com/gpu: 0 - amd.com/gpu: 1 - targetSelector: - workload-name1: - - container-name2 - workload: - workload-name1: - enabled: true - primary: true - type: Deployment - podSpec: - 
containers: - container-name1: - enabled: true - primary: true - imageSelector: image - probes: *probes - container-name2: - enabled: true - primary: false - imageSelector: image - probes: *probes - asserts: - - documentIndex: &deploymentDoc 0 - isKind: - of: Deployment - - documentIndex: *deploymentDoc - isAPIVersion: - of: apps/v1 - - documentIndex: *deploymentDoc - equal: - path: spec.template.spec.runtimeClassName - value: nvidia - - documentIndex: *deploymentDoc - isSubset: - path: spec.template.spec.containers[0] - content: - resources: - limits: - cpu: 4000m - memory: 8Gi - nvidia.com/gpu: "1" - requests: - cpu: 10m - memory: 50Mi - - documentIndex: *deploymentDoc - isSubset: - path: spec.template.spec.containers[1] - content: - resources: - limits: - cpu: 4000m - memory: 8Gi - amd.com/gpu: "1" - requests: - cpu: 10m - memory: 50Mi - # Failures - it: should fail with empty requests set: @@ -993,101 +560,3 @@ tests: asserts: - failedTemplate: errorMessage: Container - Expected [resources.limits.memory] to have one of the following formats [(Suffixed with E/P/T/G/M/K - eg. 1G), (Suffixed with Ei/Pi/Ti/Gi/Mi/Ki - eg. 1Gi), (Plain Integer in bytes - eg. 1024), (Exponent - eg. 
134e6)], but got [8GB] - - - it: should fail with empty gpu in defined entry - set: - image: *image - scaleGPU: - - gpu: - targetSelector: - workload-name1: - - container-name1 - workload: - workload-name1: - enabled: true - primary: true - type: Deployment - podSpec: - containers: - container-name1: - enabled: true - primary: true - imageSelector: image - probes: *probes - asserts: - - failedTemplate: - errorMessage: Container - Expected non-empty [scaleGPU.gpu] - - - it: should fail with empty list under workload in targetSelector - set: - image: *image - scaleGPU: - - gpu: - key: value - targetSelector: - workload-name1: [] - workload: - workload-name1: - enabled: true - primary: true - type: Deployment - podSpec: - containers: - container-name1: - enabled: true - primary: true - imageSelector: image - probes: *probes - asserts: - - failedTemplate: - errorMessage: Container - Expected non-empty list under pod in [scaleGPU.targetSelector] - - - it: should fail with empty value in gpu - set: - image: *image - scaleGPU: - - gpu: - key: "" - targetSelector: - workload-name1: - - container-name1 - workload: - workload-name1: - enabled: true - primary: true - type: Deployment - podSpec: - containers: - container-name1: - enabled: true - primary: true - imageSelector: image - probes: *probes - asserts: - - failedTemplate: - errorMessage: Container - Expected non-empty [scaleGPU] [value] - - - it: should fail with no value in gpu - set: - image: *image - scaleGPU: - - gpu: - key: - targetSelector: - workload-name1: - - container-name1 - workload: - workload-name1: - enabled: true - primary: true - type: Deployment - podSpec: - containers: - container-name1: - enabled: true - primary: true - imageSelector: image - probes: *probes - asserts: - - failedTemplate: - errorMessage: Container - Expected non-empty [scaleGPU] [value] diff --git a/library/common-test/tests/pod/host_users_test.yaml b/library/common-test/tests/pod/host_users_test.yaml index 6e43c03d..66aa8f67 100644 
--- a/library/common-test/tests/pod/host_users_test.yaml +++ b/library/common-test/tests/pod/host_users_test.yaml @@ -141,11 +141,45 @@ tests: path: spec.template.spec.hostUsers value: true - - it: should pass with enabled hostUsers because of gpu + - it: should pass with enabled hostUsers because of nvidia gpu set: - scaleGPU: - - gpu: - nvidia: "1" + resources: + limits: + nvidia.com/gpu: 1 + workload: + workload-name1: + enabled: true + primary: true + type: Deployment + podSpec: {} + asserts: + - documentIndex: *deploymentDoc + equal: + path: spec.template.spec.hostUsers + value: true + + - it: should pass with enabled hostUsers because of amd gpu + set: + resources: + limits: + amd.com/gpu: 1 + workload: + workload-name1: + enabled: true + primary: true + type: Deployment + podSpec: {} + asserts: + - documentIndex: *deploymentDoc + equal: + path: spec.template.spec.hostUsers + value: true + + - it: should pass with enabled hostUsers because of intel gpu + set: + resources: + limits: + intel.com/i915: 1 workload: workload-name1: enabled: true diff --git a/library/common-test/tests/pod/runtime_class_name_test.yaml b/library/common-test/tests/pod/runtime_class_name_test.yaml index adde23ca..e4f69351 100644 --- a/library/common-test/tests/pod/runtime_class_name_test.yaml +++ b/library/common-test/tests/pod/runtime_class_name_test.yaml @@ -74,149 +74,6 @@ tests: path: spec.template.spec.runtimeClassName value: some-other-runtime-class - - it: should pass with runtimeClassName from ixChartContext with targetSelector - set: - scaleGPU: - - gpu: - key: value - targetSelector: - workload-name1: - - container-name1 - workload-name3: - - container-name1 - global: - namespace: ix-namespace - ixChartContext: - addNvidiaRuntimeClass: true - nvidiaRuntimeClassName: ix-runtime - podOptions: - runtimeClassName: some-class - workload: - workload-name1: - enabled: true - primary: true - type: Deployment - podSpec: - runtimeClassName: some-other-class - workload-name2: - enabled: 
true - primary: false - type: DaemonSet - podSpec: - runtimeClassName: some-class - workload-name3: - enabled: true - primary: false - type: StatefulSet - podSpec: - runtimeClassName: some-class - asserts: - - documentIndex: *deploymentDoc - equal: - path: spec.template.spec.runtimeClassName - value: ix-runtime - - documentIndex: &daemonSetDoc 1 - isKind: - of: DaemonSet - - documentIndex: *daemonSetDoc - equal: - path: spec.template.spec.runtimeClassName - value: some-class - - documentIndex: &statefulSetDoc 2 - isKind: - of: StatefulSet - - documentIndex: *statefulSetDoc - equal: - path: spec.template.spec.runtimeClassName - value: ix-runtime - - - it: should not add runtimeClassName with gpu value 0 - set: - scaleGPU: - - gpu: - key: 0 - global: - namespace: ix-namespace - ixChartContext: - addNvidiaRuntimeClass: true - nvidiaRuntimeClassName: ix-runtime - workload: - workload-name1: - enabled: true - primary: true - type: Deployment - podSpec: {} - asserts: - - documentIndex: *deploymentDoc - isNullOrEmpty: - path: spec.template.spec.runtimeClassName - - - it: should pass with runtimeClassName from ixChartContext without targetSelector (on primary workload) - set: - scaleGPU: - - gpu: - key: value - global: - namespace: ix-namespace - ixChartContext: - addNvidiaRuntimeClass: true - nvidiaRuntimeClassName: ix-runtime - workload: - workload-name1: - enabled: true - primary: true - type: Job - podSpec: - runtimeClassName: some-other-class - workload-name2: - enabled: true - primary: false - type: CronJob - schedule: "* * * * *" - podSpec: {} - asserts: - - documentIndex: &jobDoc 0 - isKind: - of: Job - - documentIndex: *jobDoc - equal: - path: spec.template.spec.runtimeClassName - value: ix-runtime - - documentIndex: &cronJobDoc 1 - isKind: - of: CronJob - - documentIndex: *cronJobDoc - isNullOrEmpty: - path: spec.jobTemplate.spec.template.spec.runtimeClassName - - - it: should pass with runtimeClassName not set from ixChartContext without gpu value - set: - 
scaleGPU: - - gpu: {} - targetSelector: - workload-name1: - - container-name1 - global: - namespace: ix-namespace - ixChartContext: - addNvidiaRuntimeClass: true - nvidiaRuntimeClassName: ix-runtime - workload: - workload-name1: - enabled: true - primary: true - type: Job - podSpec: - runtimeClassName: some-other-class - asserts: - - documentIndex: *jobDoc - isKind: - of: Job - - documentIndex: *jobDoc - equal: - path: spec.template.spec.runtimeClassName - value: some-other-class - - it: should pass with runtimeClass set to nvidia when in SCALE and using the container "resources" directly set: global: diff --git a/library/common-test/tests/pod/securityContext.yaml b/library/common-test/tests/pod/securityContext.yaml index 7bc2fc9b..d8a2ad38 100644 --- a/library/common-test/tests/pod/securityContext.yaml +++ b/library/common-test/tests/pod/securityContext.yaml @@ -354,7 +354,7 @@ tests: - 568 sysctls: [] - - it: should pass with with gpu assigned to primary pod + - it: should pass with with gpu assigned to single container set: workload: workload-name1: @@ -365,14 +365,25 @@ tests: securityContext: supplementalGroups: - 1000 + containers: + container1: + enabled: true + primary: true + probes: + liveness: + enabled: false + readiness: + enabled: false + startup: + enabled: false + resources: + limits: + nvidia.com/gpu: 1 workload-name2: enabled: true primary: false type: Deployment podSpec: {} - scaleGPU: - - gpu: - nvidia: "1" asserts: - documentIndex: &deploymentDoc 0 isKind: @@ -402,7 +413,7 @@ tests: - 568 sysctls: [] - - it: should pass with with gpu assigned to specific pod + - it: should pass with with gpu assigned to multiple pods set: workload: workload-name1: @@ -413,70 +424,39 @@ tests: securityContext: supplementalGroups: - 1000 + containers: + container1: + enabled: true + primary: true + probes: + liveness: + enabled: false + readiness: + enabled: false + startup: + enabled: false + resources: + limits: + nvidia.com/gpu: 1 workload-name2: enabled: true 
primary: false type: Deployment - podSpec: {} - scaleGPU: - - gpu: - nvidia: "1" - targetSelector: - workload-name1: - - container-name1 - asserts: - - documentIndex: &deploymentDoc 0 - isKind: - of: Deployment - - documentIndex: *deploymentDoc - equal: - path: spec.template.spec.securityContext - value: - fsGroup: 568 - fsGroupChangePolicy: OnRootMismatch - supplementalGroups: - - 1000 - - 44 - - 107 - - 568 - sysctls: [] - - documentIndex: &otherDeploymentDoc 1 - isKind: - of: Deployment - - documentIndex: *otherDeploymentDoc - equal: - path: spec.template.spec.securityContext - value: - fsGroup: 568 - fsGroupChangePolicy: OnRootMismatch - supplementalGroups: - - 568 - sysctls: [] - - - it: should pass with with gpu assigned to multiple pod - set: - workload: - workload-name1: - enabled: true - primary: true - type: Deployment podSpec: - securityContext: - supplementalGroups: - - 1000 - workload-name2: - enabled: true - primary: false - type: Deployment - podSpec: {} - scaleGPU: - - gpu: - nvidia: "1" - targetSelector: - workload-name1: - - container-name1 - workload-name2: - - container-name1 + containers: + container1: + enabled: true + primary: true + probes: + liveness: + enabled: false + readiness: + enabled: false + startup: + enabled: false + resources: + limits: + nvidia.com/gpu: 1 asserts: - documentIndex: &deploymentDoc 0 isKind: diff --git a/library/common/templates/lib/container/_fixedEnv.tpl b/library/common/templates/lib/container/_fixedEnv.tpl index df9f359a..a25887e5 100644 --- a/library/common/templates/lib/container/_fixedEnv.tpl +++ b/library/common/templates/lib/container/_fixedEnv.tpl @@ -47,7 +47,7 @@ objectData: The object data to be used to render the container. 
{{- $fixed = mustAppend $fixed (dict "k" "UMASK_SET" "v" $UMASK) -}} {{- $nvidia := false -}} - {{- if eq (include "tc.v1.common.lib.container.resources.gpu" (dict "rootCtx" $rootCtx "objectData" $objectData "returnBool" true)) "true" -}} + {{- if eq (include "tc.v1.common.lib.container.resources.hasGPU" (dict "rootCtx" $rootCtx "objectData" $objectData "gpuType" "nvidia.com/gpu")) "true" -}} {{- $nvidia = true -}} {{- end -}} diff --git a/library/common/templates/lib/container/_resources.tpl b/library/common/templates/lib/container/_resources.tpl index 5910c53a..d7eb3469 100644 --- a/library/common/templates/lib/container/_resources.tpl +++ b/library/common/templates/lib/container/_resources.tpl @@ -27,73 +27,12 @@ limits: {{- with $resources.limits.memory }} {{/* Passing 0, will not render it, meaning unlimited */}} memory: {{ . }} {{- end -}} - {{- include "tc.v1.common.lib.container.resources.gpu" (dict "rootCtx" $rootCtx "objectData" $objectData) | trim | nindent 2 -}} {{- range $k, $v := (omit $resources.limits "cpu" "memory") }} {{/* Omit cpu and memory, as they are handled above */}} {{ $k }}: {{ $v }} {{- end -}} {{- end -}} {{- end -}} -{{/* Returns GPU resource */}} -{{/* Call this template: -{{ include "tc.v1.common.lib.container.resources.gpu" (dict "rootCtx" $rootCtx "objectData" $objectData) }} -rootCtx: The root context of the chart. -objectData: The object data to be used to render the container. 
-*/}} -{{- define "tc.v1.common.lib.container.resources.gpu" -}} - {{- $objectData := .objectData -}} - {{- $rootCtx := .rootCtx -}} - {{- $returnBool := .returnBool -}} - - {{- $gpuResource := list -}} - - {{- range $GPUValues := $rootCtx.Values.scaleGPU -}} - {{- if not $GPUValues.gpu -}} - {{- fail "Container - Expected non-empty [scaleGPU.gpu]" -}} - {{- end -}} - - {{- $selected := false -}} - - {{/* Parse selector if defined */}} - {{- if $GPUValues.targetSelector -}} - {{- range $podName, $containers := $GPUValues.targetSelector -}} - {{- if not $containers -}} - {{- fail "Container - Expected non-empty list under pod in [scaleGPU.targetSelector]" -}} - {{- end -}} - - {{- if and (eq $podName $objectData.podShortName) (mustHas $objectData.shortName $containers) -}} - {{- $selected = true -}} - {{- end -}} - {{- end -}} - {{/* If no selector, select primary pod/container */}} - {{- else if and $objectData.podPrimary $objectData.primary -}} - {{- $selected = true -}} - {{- end -}} - - {{- if $selected -}} - {{- $gpuResource = mustAppend $gpuResource $GPUValues.gpu -}} - {{- end -}} - {{- end -}} - - {{- if not $returnBool -}} - {{- range $gpu := $gpuResource -}} - {{- range $k, $v := $gpu -}} - {{- if or (kindIs "invalid" $v) (eq (toString $v) "") -}} - {{- fail "Container - Expected non-empty [scaleGPU] [value]" -}} - {{- end -}} {{/* Don't try to schedule 0 GPUs */}} - {{- if gt (int $v) 0 }} -{{ $k }}: {{ $v | quote }} - {{- end -}} - {{- end -}} - {{- end -}} - {{- else -}} - {{- if $gpuResource -}} - {{- "true" -}} - {{- end -}} - {{- end -}} - -{{- end -}} - {{/* Validates resources to match a pattern */}} {{/* Call this template: {{ include "tc.v1.common.lib.container.resources.validation" (dict "resources" $resources) }} @@ -141,3 +80,59 @@ resources: The resources object {{- end -}} {{- end -}} {{- end -}} + +{{- define "tc.v1.common.lib.pod.resources.hasGPU" -}} + {{- $rootCtx := .rootCtx -}} + {{- $objectData := .objectData -}} + {{- $gpuType := 
.gpuType -}} + + {{- $types := (list "nvidia.com/gpu" "amd.com/gpu" "intel.com/i915") -}} + {{- if $gpuType -}} + {{- $types = (list $gpuType) -}} + {{- end -}} + + {{- $gpu := false -}} + + {{- if and ($rootCtx.Values.resources) ($rootCtx.Values.resources.limits) -}} + {{- range $t := $types -}} + {{- if gt ((get $rootCtx.Values.resources.limits $t) | int) 0 -}} + {{- $gpu = true -}} + {{- break -}} + {{- end -}} + {{- end -}} + {{- end -}} + + {{- if $objectData.podSpec -}} + {{- range $k, $v := $objectData.podSpec.containers -}} + {{- if not $v.enabled -}} + {{- continue -}} + {{- end -}} + + {{- range $t := $types -}} + {{- if eq (include "tc.v1.common.lib.container.resources.hasGPU" (dict "rootCtx" $rootCtx "objectData" $v "gpuType" $t)) "true" -}} + {{- $gpu = true -}} + {{- break -}} + {{- end -}} + {{- end -}} + + {{- end -}} + {{- end -}} + + {{- $gpu | toString -}} +{{- end -}} + +{{- define "tc.v1.common.lib.container.resources.hasGPU" -}} + {{- $rootCtx := .rootCtx -}} + {{- $objectData := .objectData -}} + {{- $gpuType := .gpuType -}} + + {{- $gpu := false -}} + + {{- if and ($objectData.resources) ($objectData.resources.limits) -}} + {{- if gt ((get $objectData.resources.limits $gpuType) | int) 0 -}} + {{- $gpu = true -}} + {{- end -}} + {{- end -}} + + {{- $gpu | toString -}} +{{- end -}} diff --git a/library/common/templates/lib/pod/_podSecurityContext.tpl b/library/common/templates/lib/pod/_podSecurityContext.tpl index 9dc5ac6d..87db62d5 100644 --- a/library/common/templates/lib/pod/_podSecurityContext.tpl +++ b/library/common/templates/lib/pod/_podSecurityContext.tpl @@ -20,18 +20,7 @@ objectData: The object data to be used to render the Pod. {{- $secContext = mustMergeOverwrite $secContext . 
-}} {{- end -}} - {{- $gpuAdded := false -}} - {{- range $GPUValues := $rootCtx.Values.scaleGPU -}} - {{/* If there is a selector and pod is selected */}} - {{- if $GPUValues.targetSelector -}} - {{- if mustHas $objectData.shortName ($GPUValues.targetSelector | keys) -}} - {{- $gpuAdded = true -}} - {{- end -}} - {{/* If there is not a selector, but pod is primary */}} - {{- else if $objectData.primary -}} - {{- $gpuAdded = true -}} - {{- end -}} - {{- end -}} + {{- $gpu := (include "tc.v1.common.lib.pod.resources.hasGPU" (dict "rootCtx" $rootCtx "objectData" $objectData)) -}} {{- $deviceGroups := (list 5 10 20 24) -}} {{- $deviceAdded := false -}} @@ -84,7 +73,7 @@ objectData: The object data to be used to render the Pod. {{- end -}} {{- end -}} - {{- if $gpuAdded -}} + {{- if eq $gpu "true" -}} {{- $_ := set $secContext "supplementalGroups" (concat $secContext.supplementalGroups (list 44 107)) -}} {{- $hostUsers = true -}} {{- end -}} diff --git a/library/common/templates/lib/pod/_runtimeClassName.tpl b/library/common/templates/lib/pod/_runtimeClassName.tpl index 4d4efc55..b419acd8 100644 --- a/library/common/templates/lib/pod/_runtimeClassName.tpl +++ b/library/common/templates/lib/pod/_runtimeClassName.tpl @@ -34,69 +34,13 @@ objectData: The object data to be used to render the Pod. 
{{- define "tc.v1.common.lib.pod.runtimeClassName.scale" -}} {{- $rootCtx := .rootCtx -}} {{- $objectData := .objectData -}} + {{- $runtime := "" -}} - - {{- $nvidia := false -}} - - {{- if and ($rootCtx.Values.resources) ($rootCtx.Values.resources.limits) -}} - {{- if gt ((get $rootCtx.Values.resources.limits "nvidia.com/gpu") | int) 0 -}} - {{- $nvidia = true -}} - {{- end -}} - {{- end -}} - - {{- range $rootCtx.Values.workload -}} - {{- if not .podSpec -}} - {{- continue -}} - {{- end -}} - - {{- range $k, $v := .podSpec.containers -}} - {{- if or (not $v.resources) (not $v.resources.limits) -}} - {{- continue -}} - {{- end -}} - - {{- if gt ((get $v.resources.limits "nvidia.com/gpu") | int) 0 -}} - {{- $nvidia = true -}} - {{- break -}} - {{- end -}} - {{- end -}} - {{- end -}} - - {{- if $nvidia -}} + {{/* Restrict the check to "nvidia.com/gpu": without gpuType, hasGPU also matches amd.com/gpu and intel.com/i915, which must not get the nvidia runtime class */}} {{- $nvidia := (include "tc.v1.common.lib.pod.resources.hasGPU" (dict "rootCtx" $rootCtx "objectData" $objectData "gpuType" "nvidia.com/gpu")) -}} + {{- if eq $nvidia "true" -}} {{/* https://github.com/truenas/middleware/blob/0bfc05166c3f95b1ab4ca4a9614691f14303db2e/src/middlewared/middlewared/plugins/kubernetes_linux/utils.py#L16 */}} {{- $runtime = "nvidia" -}} {{- end -}} - {{/* Keep backwards compat with .scaleGPU */}} - {{- if $rootCtx.Values.global.ixChartContext.addNvidiaRuntimeClass -}} - {{- range $rootCtx.Values.scaleGPU -}} - {{- if .gpu -}} {{/* Make sure it has a value... 
*/}} - {{- $scaleGPU := false -}} - {{- range $k, $v := .gpu -}} - {{- if $v -}} {{/* Make sure value is not "0" or "" */}} - {{- $scaleGPU = true -}} - {{- break -}} - {{- end -}} - {{- end -}} - - {{- if $scaleGPU -}} - - {{- if (kindIs "map" .targetSelector) -}} - {{- range $podName, $containers := .targetSelector -}} - - {{- if eq $objectData.shortName $podName -}} {{/* If the pod is selected */}} - {{- $runtime = $rootCtx.Values.global.ixChartContext.nvidiaRuntimeClassName -}} - {{- end -}} - - {{- end -}} - {{- else if $objectData.primary -}} - {{/* If the pod is primary and no targetSelector is given, assign to primary */}} - {{- $runtime = $rootCtx.Values.global.ixChartContext.nvidiaRuntimeClassName -}} - {{- end -}} - - {{- end -}} - {{- end -}} - {{- end -}} - {{- end -}} - {{- $runtime -}} {{- end -}} diff --git a/library/common/values.yaml b/library/common/values.yaml index 6d443420..35cf570a 100644 --- a/library/common/values.yaml +++ b/library/common/values.yaml @@ -385,9 +385,6 @@ scaleExternalInterface: [] # -- (docs/scaleCertificate.md) scaleCertificate: {} -# -- (docs/scaleGPU.md) -scaleGPU: [] - # NOTES.txt notes: header: |