From 816548049f014406a8b836e1f1c324a17a5a1eae Mon Sep 17 00:00:00 2001 From: Siddhant Jain Date: Sat, 23 May 2026 17:22:32 +0530 Subject: [PATCH 1/6] fix(hub): deploy model registry to user profile namespace instead of default Three of the four hub kustomizations referenced by example/kustomization.yaml had no namespace directive, causing kustomize build to emit resources without namespace metadata. kubectl apply then deployed them to the default namespace. This was introduced by #3318 and masked by the CI install script which passes -n kubeflow explicitly. Per pboyd direction, model registry must live in the user profile namespace (kubeflow-user-example-com) for dashboard integration. Deploying to kubeflow breaks the dashboard (hub#1045). Changes: - Add applications/hub/overlays/kustomization.yaml wrapping all 4 upstream resources with namespace: kubeflow-user-example-com and 6 Istio patches: - 2 gateway refs: kubeflow-gateway -> kubeflow/kubeflow-gateway (Istio resolves gateway names relative to VirtualService namespace) - 4 FQDN hosts: .kubeflow.svc.cluster.local -> .kubeflow-user-example-com.svc.cluster.local (services move with overlay) - Simplify example/kustomization.yaml from 4 upstream paths to 1 overlay - Update tests/model_registry_install.sh to use overlay and target kubeflow-user-example-com namespace - Fix tests/model_registry_test.sh port-forward to target correct namespace - Add applications/hub/overlays/** to model_registry_test.yaml CI triggers Fixes #3457 Signed-off-by: Siddhant Jain --- .github/workflows/model_registry_test.yaml | 1 + applications/hub/overlays/kustomization.yaml | 61 ++++++++++++++++ example/kustomization.yaml | 10 +-- tests/model_registry_install.sh | 74 ++++++++------------ tests/model_registry_test.sh | 2 +- 5 files changed, 96 insertions(+), 52 deletions(-) create mode 100644 applications/hub/overlays/kustomization.yaml diff --git a/.github/workflows/model_registry_test.yaml b/.github/workflows/model_registry_test.yaml index 
353b1417d4..f8290ac32e 100644 --- a/.github/workflows/model_registry_test.yaml +++ b/.github/workflows/model_registry_test.yaml @@ -7,6 +7,7 @@ on: - tests/install_KinD_create_KinD_cluster_install_kustomize.sh - .github/workflows/model_registry_test.yaml - applications/hub/upstream/** + - applications/hub/overlays/** - tests/istio* - tests/multi_tenancy_install.sh - tests/profile_controller_install.sh diff --git a/applications/hub/overlays/kustomization.yaml b/applications/hub/overlays/kustomization.yaml new file mode 100644 index 0000000000..2147db57f6 --- /dev/null +++ b/applications/hub/overlays/kustomization.yaml @@ -0,0 +1,61 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +namespace: kubeflow-user-example-com + +resources: +- ../upstream/overlays/postgres +- ../upstream/options/istio +- ../upstream/options/ui/overlays/istio +- ../upstream/options/catalog/overlays/demo + +patches: +# Gateway references: Istio resolves gateway names relative to the +# VirtualService namespace. kubeflow-gateway lives in the kubeflow namespace, +# so VirtualServices in kubeflow-user-example-com must use the fully-qualified +# cross-namespace reference. +- target: + group: networking.istio.io + version: v1alpha3 + kind: VirtualService + name: model-registry + patch: |- + - op: replace + path: /spec/gateways/0 + value: kubeflow/kubeflow-gateway + - op: replace + path: /spec/http/0/route/0/destination/host + value: model-registry-service.kubeflow-user-example-com.svc.cluster.local +- target: + group: networking.istio.io + version: v1alpha3 + kind: VirtualService + name: model-registry-ui + patch: |- + - op: replace + path: /spec/gateways/0 + value: kubeflow/kubeflow-gateway + - op: replace + path: /spec/http/0/route/0/destination/host + value: model-registry-ui-service.kubeflow-user-example-com.svc.cluster.local + +# Destination host FQDNs: services move to kubeflow-user-example-com with the +# namespace override. 
DestinationRule hosts must reference the correct namespace. +- target: + group: networking.istio.io + version: v1alpha3 + kind: DestinationRule + name: model-registry-service + patch: |- + - op: replace + path: /spec/host + value: model-registry-service.kubeflow-user-example-com.svc.cluster.local +- target: + group: networking.istio.io + version: v1alpha3 + kind: DestinationRule + name: model-registry-ui + patch: |- + - op: replace + path: /spec/host + value: model-registry-ui-service.kubeflow-user-example-com.svc.cluster.local diff --git a/example/kustomization.yaml b/example/kustomization.yaml index e1940e41ad..1b60f500bd 100644 --- a/example/kustomization.yaml +++ b/example/kustomization.yaml @@ -86,14 +86,8 @@ resources: # Spark Operator - ../applications/spark/spark-operator/overlays/kubeflow -# Model Registry -- ../applications/hub/upstream/overlays/postgres -# Model Registry Istio networking (VirtualService for /api/model_registry/) -- ../applications/hub/upstream/options/istio -# Model Registry UI -- ../applications/hub/upstream/options/ui/overlays/istio -# Model Catalog (demo) -- ../applications/hub/upstream/options/catalog/overlays/demo +# Hub (Model Registry + Model Catalog) +- ../applications/hub/overlays # Ray is an experimental integration # Here is the documentation for Ray: https://docs.ray.io/en/latest/ diff --git a/tests/model_registry_install.sh b/tests/model_registry_install.sh index dd09ab6e0f..a0561309b0 100755 --- a/tests/model_registry_install.sh +++ b/tests/model_registry_install.sh @@ -3,84 +3,72 @@ set -euxo pipefail # Install Model Registry server, UI, database, and catalog components # This script can be used for local testing without GitHub Actions -# Prerequisites: kubeflow namespace must exist, kustomize must be installed +# Prerequisites: kubeflow-user-example-com namespace must exist (created by Profile controller), +# kustomize must be installed # Usage: ./tests/model_registry_install.sh echo "Installing Model Registry 
components..." -# Build and apply Model Registry server with database -echo "Deploying Model Registry server (with database)..." -kustomize build applications/hub/upstream/overlays/postgres \ - | kubectl apply -n kubeflow -f - - -# Build and apply Model Registry Istio networking -echo "Deploying Model Registry Istio resources..." -kustomize build applications/hub/upstream/options/istio \ - | kubectl apply -n kubeflow -f - - -# Build and apply Model Registry UI with Istio integration -echo "Deploying Model Registry UI..." -kustomize build applications/hub/upstream/options/ui/overlays/istio \ - | kubectl apply -n kubeflow -f - - -# Build and apply Model Catalog (demo overlay) -echo "Deploying Model Catalog..." -kustomize build applications/hub/upstream/options/catalog/overlays/demo \ - | kubectl apply -n kubeflow -f - +# Build and apply all Hub components (Model Registry + Istio + UI + Catalog) +# The overlay sets namespace: kubeflow-user-example-com and patches Istio +# gateway references and destination host FQDNs. +echo "Deploying Hub components to kubeflow-user-example-com..." +kustomize build applications/hub/overlays \ + | kubectl apply -f - # Wait for Model Registry database deployment echo "Waiting for Model Registry database to become ready..." -if ! kubectl wait --for=condition=available -n kubeflow deployment/model-registry-db --timeout=120s; then +if ! 
kubectl wait --for=condition=available -n kubeflow-user-example-com deployment/model-registry-db --timeout=120s; then echo "ERROR: Model Registry database deployment failed" - kubectl get pods -n kubeflow -l component=db - kubectl describe deployment/model-registry-db -n kubeflow - kubectl logs deployment/model-registry-db -n kubeflow + kubectl get pods -n kubeflow-user-example-com -l component=db + kubectl describe deployment/model-registry-db -n kubeflow-user-example-com + kubectl logs deployment/model-registry-db -n kubeflow-user-example-com exit 1 fi # Wait for Model Registry server deployment echo "Waiting for Model Registry server to become ready..." -if ! kubectl wait --for=condition=available -n kubeflow deployment/model-registry-deployment --timeout=120s; then +if ! kubectl wait --for=condition=available -n kubeflow-user-example-com deployment/model-registry-deployment --timeout=120s; then echo "ERROR: Model Registry server deployment failed" - kubectl get pods -n kubeflow -l component=model-registry-server - kubectl describe deployment/model-registry-deployment -n kubeflow - kubectl logs deployment/model-registry-deployment -n kubeflow --all-containers + kubectl get pods -n kubeflow-user-example-com -l component=model-registry-server + kubectl describe deployment/model-registry-deployment -n kubeflow-user-example-com + kubectl logs deployment/model-registry-deployment -n kubeflow-user-example-com --all-containers exit 1 fi # Wait for Model Registry UI deployment echo "Waiting for Model Registry UI to become ready..." -if ! kubectl wait --for=condition=available -n kubeflow deployment/model-registry-ui --timeout=120s; then +if ! 
kubectl wait --for=condition=available -n kubeflow-user-example-com deployment/model-registry-ui --timeout=120s; then echo "ERROR: Model Registry UI deployment failed" - kubectl get pods -n kubeflow -l app=model-registry-ui - kubectl describe deployment/model-registry-ui -n kubeflow - kubectl logs deployment/model-registry-ui -n kubeflow --all-containers + kubectl get pods -n kubeflow-user-example-com -l app=model-registry-ui + kubectl describe deployment/model-registry-ui -n kubeflow-user-example-com + kubectl logs deployment/model-registry-ui -n kubeflow-user-example-com --all-containers exit 1 fi # Wait for Model Catalog PostgreSQL StatefulSet echo "Waiting for Model Catalog database to become ready..." -if ! kubectl wait --for=condition=ready -n kubeflow pod \ +if ! kubectl wait --for=condition=ready -n kubeflow-user-example-com pod \ -l app.kubernetes.io/name=postgres,app.kubernetes.io/part-of=model-catalog \ --timeout=120s; then echo "ERROR: Model Catalog database pod failed" - kubectl get pods -n kubeflow -l app.kubernetes.io/part-of=model-catalog - kubectl describe statefulset/model-catalog-postgres -n kubeflow - kubectl logs statefulset/model-catalog-postgres -n kubeflow + kubectl get pods -n kubeflow-user-example-com -l app.kubernetes.io/part-of=model-catalog + kubectl describe statefulset/model-catalog-postgres -n kubeflow-user-example-com + kubectl logs statefulset/model-catalog-postgres -n kubeflow-user-example-com exit 1 fi # Wait for Model Catalog server deployment echo "Waiting for Model Catalog server to become ready..." -if ! kubectl wait --for=condition=available -n kubeflow deployment/model-catalog-server --timeout=120s; then +if ! 
kubectl wait --for=condition=available -n kubeflow-user-example-com deployment/model-catalog-server --timeout=120s; then echo "ERROR: Model Catalog server deployment failed" - kubectl get pods -n kubeflow -l app.kubernetes.io/part-of=model-catalog - kubectl describe deployment/model-catalog-server -n kubeflow - kubectl logs deployment/model-catalog-server -n kubeflow --all-containers + kubectl get pods -n kubeflow-user-example-com -l app.kubernetes.io/part-of=model-catalog + kubectl describe deployment/model-catalog-server -n kubeflow-user-example-com + kubectl logs deployment/model-catalog-server -n kubeflow-user-example-com --all-containers exit 1 fi echo "Model Registry installation complete!" -kubectl get pods -n kubeflow -l component=model-registry-server -kubectl get pods -n kubeflow -l app=model-registry-ui -kubectl get pods -n kubeflow -l app.kubernetes.io/part-of=model-catalog +kubectl get pods -n kubeflow-user-example-com -l component=model-registry-server +kubectl get pods -n kubeflow-user-example-com -l app=model-registry-ui +kubectl get pods -n kubeflow-user-example-com -l app.kubernetes.io/part-of=model-catalog diff --git a/tests/model_registry_test.sh b/tests/model_registry_test.sh index f8dfdf10a7..7b3310def9 100755 --- a/tests/model_registry_test.sh +++ b/tests/model_registry_test.sh @@ -17,7 +17,7 @@ echo "=== Model Registry Integration Tests ===" # enforced here. These tests validate the Model Registry REST API functionality. # AuthorizationPolicy enforcement is validated through the gateway tests below. echo "Test 1: Direct Model Registry API access..." 
-nohup kubectl port-forward svc/model-registry-service -n kubeflow 8081:8080 & +nohup kubectl port-forward svc/model-registry-service -n kubeflow-user-example-com 8081:8080 & timeout 30s bash -c 'until curl -s localhost:8081 > /dev/null 2>&1; do sleep 1; done' # ---- Test 2: Create a RegisteredModel ---- From fa7d82755378ec5efffd31e1717bb4252c8995ae Mon Sep 17 00:00:00 2001 From: Siddhant Jain Date: Sat, 23 May 2026 17:54:57 +0530 Subject: [PATCH 2/6] refactor(tests): address review feedback on hub namespace PR - Add trap-based port-forward cleanup in model_registry_test.sh matching the established pattern from model_catalog_test.sh (PID capture + explicit kill) and swfs_namespace_isolation_test.sh (trap cleanup EXIT) - Add fail-fast namespace existence check in model_registry_install.sh for local environments where the Profile controller may not have provisioned kubeflow-user-example-com - Update example/kustomization.yaml comment to reflect full overlay scope (Model Registry + UI + Istio resources + Model Catalog) Signed-off-by: Siddhant Jain --- example/kustomization.yaml | 2 +- tests/model_registry_install.sh | 6 ++++++ tests/model_registry_test.sh | 12 +++++++++++- 3 files changed, 18 insertions(+), 2 deletions(-) diff --git a/example/kustomization.yaml b/example/kustomization.yaml index 1b60f500bd..13904eca6f 100644 --- a/example/kustomization.yaml +++ b/example/kustomization.yaml @@ -86,7 +86,7 @@ resources: # Spark Operator - ../applications/spark/spark-operator/overlays/kubeflow -# Hub (Model Registry + Model Catalog) +# Hub (Model Registry + UI + Istio resources + Model Catalog) - ../applications/hub/overlays # Ray is an experimental integration diff --git a/tests/model_registry_install.sh b/tests/model_registry_install.sh index a0561309b0..14d2e40ab9 100755 --- a/tests/model_registry_install.sh +++ b/tests/model_registry_install.sh @@ -9,6 +9,12 @@ set -euxo pipefail echo "Installing Model Registry components..." 
+# Fail fast if the profile namespace has not been provisioned +if ! kubectl get namespace kubeflow-user-example-com >/dev/null 2>&1; then + echo "ERROR: namespace kubeflow-user-example-com does not exist. Create a Kubeflow Profile first." + exit 1 +fi + # Build and apply all Hub components (Model Registry + Istio + UI + Catalog) # The overlay sets namespace: kubeflow-user-example-com and patches Istio # gateway references and destination host FQDNs. diff --git a/tests/model_registry_test.sh b/tests/model_registry_test.sh index 7b3310def9..ce2936dd88 100755 --- a/tests/model_registry_test.sh +++ b/tests/model_registry_test.sh @@ -17,7 +17,17 @@ echo "=== Model Registry Integration Tests ===" # enforced here. These tests validate the Model Registry REST API functionality. # AuthorizationPolicy enforcement is validated through the gateway tests below. echo "Test 1: Direct Model Registry API access..." -nohup kubectl port-forward svc/model-registry-service -n kubeflow-user-example-com 8081:8080 & +nohup kubectl port-forward svc/model-registry-service -n kubeflow-user-example-com 8081:8080 >/dev/null 2>&1 & +PORT_FORWARD_PID=$! + +cleanup_port_forward() { + if [ -n "$PORT_FORWARD_PID" ]; then + kill "$PORT_FORWARD_PID" 2>/dev/null + wait "$PORT_FORWARD_PID" 2>/dev/null + fi +} +trap cleanup_port_forward EXIT + timeout 30s bash -c 'until curl -s localhost:8081 > /dev/null 2>&1; do sleep 1; done' # ---- Test 2: Create a RegisteredModel ---- From 75807121d02126087ef268a17bf25ec529b63764 Mon Sep 17 00:00:00 2001 From: Siddhant Jain Date: Sat, 23 May 2026 18:03:03 +0530 Subject: [PATCH 3/6] fix(tests): remove wait from port-forward cleanup trap wait returns the exit code of the killed process (143 = 128 + SIGTERM). Because the script runs under set -euxo pipefail, that non-zero status trips errexit (-e) inside the EXIT trap and fails the script despite all 8 tests passing. Both model_catalog_test.sh and swfs_namespace_isolation_test.sh use kill without wait in their cleanup functions.
Signed-off-by: Siddhant Jain --- tests/model_registry_test.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/model_registry_test.sh b/tests/model_registry_test.sh index ce2936dd88..70466b75d6 100755 --- a/tests/model_registry_test.sh +++ b/tests/model_registry_test.sh @@ -23,7 +23,6 @@ PORT_FORWARD_PID=$! cleanup_port_forward() { if [ -n "$PORT_FORWARD_PID" ]; then kill "$PORT_FORWARD_PID" 2>/dev/null - wait "$PORT_FORWARD_PID" 2>/dev/null fi } trap cleanup_port_forward EXIT From db02dfe034bb7466435671984ab33ad7b27d36a4 Mon Sep 17 00:00:00 2001 From: Siddhant Jain Date: Sat, 23 May 2026 18:42:35 +0530 Subject: [PATCH 4/6] fix(tests): update model catalog scripts namespace to kubeflow-user-example-com MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The hub overlay moves all resources (including Model Catalog) to kubeflow-user-example-com. Both model_catalog_test.sh and model_catalog_install.sh still referenced -n kubeflow for deployment checks, service checks, pod listing, port-forward, and kubectl wait — causing failures since the resources no longer exist in the kubeflow namespace. Signed-off-by: Siddhant Jain --- tests/model_catalog_install.sh | 4 ++-- tests/model_catalog_test.sh | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/model_catalog_install.sh b/tests/model_catalog_install.sh index f8008f04ac..5bdc6e1e99 100755 --- a/tests/model_catalog_install.sh +++ b/tests/model_catalog_install.sh @@ -3,7 +3,7 @@ set -euxo pipefail ( cd applications/hub/upstream/options/catalog/base - kustomize build . | kubectl apply -n kubeflow -f - + kustomize build . 
| kubectl apply -n kubeflow-user-example-com -f - ) -kubectl wait --for=condition=Available deployment/model-catalog-server -n kubeflow --timeout=120s +kubectl wait --for=condition=Available deployment/model-catalog-server -n kubeflow-user-example-com --timeout=120s diff --git a/tests/model_catalog_test.sh b/tests/model_catalog_test.sh index 06669405b6..e30310ece7 100755 --- a/tests/model_catalog_test.sh +++ b/tests/model_catalog_test.sh @@ -2,19 +2,19 @@ set -euxo pipefail -if ! kubectl get deployment/model-catalog-server -n kubeflow; then +if ! kubectl get deployment/model-catalog-server -n kubeflow-user-example-com; then echo "ERROR: Model Catalog deployment not found" exit 1 fi -if ! kubectl get svc/model-catalog -n kubeflow; then +if ! kubectl get svc/model-catalog -n kubeflow-user-example-com; then echo "ERROR: Model Catalog service not found" exit 1 fi -kubectl get pods -n kubeflow -l app.kubernetes.io/name=model-catalog,app.kubernetes.io/component=server +kubectl get pods -n kubeflow-user-example-com -l app.kubernetes.io/name=model-catalog,app.kubernetes.io/component=server -nohup kubectl port-forward svc/model-catalog -n kubeflow 8082:8080 & +nohup kubectl port-forward svc/model-catalog -n kubeflow-user-example-com 8082:8080 & PORT_FORWARD_PID=$! MAX_RETRIES=30 From f7db495c80d2539d09b8c03bc7d812a5c955cd8b Mon Sep 17 00:00:00 2001 From: Siddhant Jain Date: Sun, 24 May 2026 18:15:12 +0530 Subject: [PATCH 5/6] fix(ci): broaden hub path filter and add default namespace isolation guard Applies the review suggestion from julius to consolidate applications/hub/upstream/** and applications/hub/overlays/** into applications/hub/** for the model registry test workflow trigger. Adds a default namespace isolation verification step to the full integration test. The step fails the build if any pods are found in the default namespace and prints diagnostic output for debugging.
Signed-off-by: Siddhant Jain --- .../full_kubeflow_integration_test.yaml | 17 +++++++++++++++++ .github/workflows/model_registry_test.yaml | 3 +-- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/.github/workflows/full_kubeflow_integration_test.yaml b/.github/workflows/full_kubeflow_integration_test.yaml index da228374ee..4942e3258c 100644 --- a/.github/workflows/full_kubeflow_integration_test.yaml +++ b/.github/workflows/full_kubeflow_integration_test.yaml @@ -202,6 +202,23 @@ jobs: - name: Apply Pod Security Standards Restricted run: ./tests/PSS_enable.sh restricted + - name: Verify No Components in Default Namespace + run: | + echo "==== Checking for leaked resources in default namespace ====" + DEFAULT_PODS=$(kubectl get pods -n default --no-headers --ignore-not-found 2>/dev/null | grep -v "^$" | wc -l) + if [ "${DEFAULT_PODS}" -gt 0 ]; then + echo "ERROR: Found ${DEFAULT_PODS} pod(s) in the default namespace." + echo "No Kubeflow component should deploy to the default namespace." + echo "" + echo "==== Diagnostic dump ====" + kubectl get all -n default + echo "" + echo "==== Pod details ====" + kubectl get pods -n default -o wide + exit 1 + fi + echo "PASS: default namespace is clean (0 pods)." 
+ - name: Verify Components run: kubectl get pods --all-namespaces | grep -E '(Error|CrashLoopBackOff)' && exit 1 || true diff --git a/.github/workflows/model_registry_test.yaml b/.github/workflows/model_registry_test.yaml index f8290ac32e..f672f3ad43 100644 --- a/.github/workflows/model_registry_test.yaml +++ b/.github/workflows/model_registry_test.yaml @@ -6,8 +6,7 @@ on: paths: - tests/install_KinD_create_KinD_cluster_install_kustomize.sh - .github/workflows/model_registry_test.yaml - - applications/hub/upstream/** - - applications/hub/overlays/** + - applications/hub/** - tests/istio* - tests/multi_tenancy_install.sh - tests/profile_controller_install.sh From d4a42e7e69473e589c6c3a0d9c0972cd366a0977 Mon Sep 17 00:00:00 2001 From: Siddhant Jain Date: Sun, 24 May 2026 23:59:07 +0530 Subject: [PATCH 6/6] fix(ci): apply review feedback on default namespace guard wording Applies all 4 review suggestions from julius on the default namespace verification step: renames the step to state its failure behavior explicitly, rewords the echo output to remove jargon, removes the parenthetical from the pod count, and updates the pass message phrasing.
Signed-off-by: Siddhant Jain --- .github/workflows/full_kubeflow_integration_test.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/full_kubeflow_integration_test.yaml b/.github/workflows/full_kubeflow_integration_test.yaml index 4942e3258c..31f5ef820b 100644 --- a/.github/workflows/full_kubeflow_integration_test.yaml +++ b/.github/workflows/full_kubeflow_integration_test.yaml @@ -202,12 +202,12 @@ jobs: - name: Apply Pod Security Standards Restricted run: ./tests/PSS_enable.sh restricted - - name: Verify No Components in Default Namespace + - name: Fail if there are resources in the "default" namespace run: | - echo "==== Checking for leaked resources in default namespace ====" + echo "==== Checking for resources in the default namespace ====" DEFAULT_PODS=$(kubectl get pods -n default --no-headers --ignore-not-found 2>/dev/null | grep -v "^$" | wc -l) if [ "${DEFAULT_PODS}" -gt 0 ]; then - echo "ERROR: Found ${DEFAULT_PODS} pod(s) in the default namespace." + echo "ERROR: Found ${DEFAULT_PODS} pods in the default namespace." echo "No Kubeflow component should deploy to the default namespace." echo "" echo "==== Diagnostic dump ====" @@ -217,7 +217,7 @@ jobs: kubectl get pods -n default -o wide exit 1 fi - echo "PASS: default namespace is clean (0 pods)." + echo "PASSED: The default namespace does not contain pods." - name: Verify Components run: kubectl get pods --all-namespaces | grep -E '(Error|CrashLoopBackOff)' && exit 1 || true