Skip to content

Conversation

robertgshaw2-redhat
Copy link
Contributor

SUMMARY:

  • previously, the installer used the same resources for both P and D
  • this enables the Helm chart to have P and D resources specified separately

@robertgshaw2-redhat
Copy link
Contributor Author

robertgshaw2-redhat commented Jun 8, 2025

  • Seems to be working with:
# Values for a prefill/decode (P/D) deployment in which each role gets its
# own replica count, resource requests/limits, and extra server arguments.
sampleApplication:
  baseConfigMapRefName: basic-gpu-with-nixl-preset
  model:
    modelArtifactURI: hf://RedHatAI/Llama-4-Scout-17B-16E-Instruct-FP8-dynamic
    modelName: "RedHatAI/Llama-4-Scout-17B-16E-Instruct-FP8-dynamic"
    auth:
      # Secret name and key holding the Hugging Face token.
      hfToken:
        name: llm-d-hf-token
        key: HF_TOKEN

  # Prefill role: 4 replicas, 2 GPUs each, tensor-parallel size 2.
  prefill:
    replicas: 4
    resources:
      limits:
        nvidia.com/gpu: 2
        rdma/ib: 1
      requests:
        nvidia.com/gpu: 2
        rdma/ib: 1
        cpu: "16"
        memory: 64Gi
    # Flags and their values are separate list items, in order.
    extraArgs:
      - "--tensor-parallel-size"
      - "2"
      - "--disable-log-requests"
      - "--max-model-len"
      - "32768"
      - "--block-size"
      - "128"
      - "--max-num-batched-tokens"
      - "32768"

  # Decode role: 1 replica, 8 GPUs, tensor-parallel size 8.
  decode:
    replicas: 1
    resources:
      limits:
        nvidia.com/gpu: 8
        rdma/ib: 1
      requests:
        nvidia.com/gpu: 8
        rdma/ib: 1
        cpu: "16"
        memory: 64Gi
    extraArgs:
      - "--tensor-parallel-size"
      - "8"
      - "--disable-log-requests"
      - "--max-model-len"
      - "32768"
      - "--block-size"
      - "128"

modelservice:
  # Custom vLLM image used for this test.
  vllm:
    image:
      registry: docker.io
      repository: robertgshaw2/vllm-nixl
      tag: het-tp-dev-0.1
  epp:
    # Environment-variable overrides; values are quoted so they stay strings.
    defaultEnvVarsOverride:
      - name: ENABLE_KVCACHE_AWARE_SCORER
        value: "false"
      - name: ENABLE_PREFIX_AWARE_SCORER
        value: "true"
      - name: ENABLE_LOAD_AWARE_SCORER
        value: "true"
      - name: ENABLE_SESSION_AWARE_SCORER
        value: "false"
      - name: PD_ENABLED
        value: "true"
      - name: PD_PROMPT_LEN_THRESHOLD
        value: "10"
      - name: PREFILL_ENABLE_KVCACHE_AWARE_SCORER
        value: "false"
      - name: PREFILL_ENABLE_LOAD_AWARE_SCORER
        value: "true"
      - name: PREFILL_ENABLE_PREFIX_AWARE_SCORER
        value: "true"
      - name: PREFILL_ENABLE_SESSION_AWARE_SCORER
        value: "false"

redis:
  enabled: false
  • installed with my justfile:
# Kubernetes namespace that the recipes below pass to kubectl and the installer.
NAMESPACE := "pete-davidson"
# Model identifier.
# NOTE(review): no recipe visible in this file references MODEL — confirm it is still needed.
MODEL := "RedHatAI/Llama-4-Scout-17B-16E-Instruct-FP8-dynamic"

# Follow the logs of POD, dropping lines that contain "GET /metrics HTTP/1.1".
# The namespace is passed explicitly, like every other kubectl recipe in this
# file, so the recipe does not depend on the current kubectl context.
logs POD:
    kubectl logs -f -n {{NAMESPACE}} {{POD}} | grep -v "GET /metrics HTTP/1.1"

# Print the pod IP of each decode/prefill pod in NAMESPACE, one per line.
#
# Name and IP are read via custom-columns instead of counting fields in
# `-o wide` output: the RESTARTS column can contain spaces (e.g. "1 (5m ago)"),
# which moves the IP out of field 6. Pods are selected by the "-decode" /
# "-prefill" role segment of their name rather than by a hard-coded model
# prefix, so the recipe keeps working when a different model is deployed.
get-ips:
    kubectl get pods -n {{NAMESPACE}} --no-headers \
        -o custom-columns=NAME:.metadata.name,IP:.status.podIP \
        | awk '$1 ~ /-(decode|prefill)(-|$)/ {print $2}'
# List the pods in NAMESPACE using kubectl's wide output format.
get-pods:
    kubectl get pods --namespace {{NAMESPACE}} --output wide

# Create the hf-token-secret Secret in NAMESPACE from the caller's HF_TOKEN
# environment variable.
#
# The value is double-quoted so the shell expands it; inside single quotes the
# literal text "$HF_TOKEN" would be stored instead of the token. The `:?` form
# makes the recipe fail with a message when HF_TOKEN is unset or empty.
hf-token:
    kubectl create secret generic hf-token-secret \
        --from-literal=HF_TOKEN="${HF_TOKEN:?HF_TOKEN must be set in the environment}" \
        -n {{NAMESPACE}}

# Run llmd-installer.sh from the quickstart directory for NAMESPACE, using the
# values file VALUES from ./examples/rob-benchmarking/.
[working-directory: '/home/rshaw/llm-d-deployer/quickstart']
install VALUES:
    ./llmd-installer.sh --namespace {{NAMESPACE}} \
        --storage-class shared-vast \
        --storage-size 300Gi \
        --values-file ./examples/rob-benchmarking/{{VALUES}}

# Run install, hf-token, and start-bench in that order; the chain stops at the
# first step that fails.
start VALUES:
    just install {{VALUES}} && just hf-token && just start-bench

# Run llmd-installer.sh with --uninstall from the quickstart directory for
# NAMESPACE.
# NOTE(review): the values file is resolved under $PWD/../project/, while the
# install recipe reads ./examples/rob-benchmarking/ — confirm this is intended.
[working-directory: '/home/rshaw/llm-d-deployer/quickstart']
uninstall VALUES:
    ./llmd-installer.sh --namespace {{NAMESPACE}} \
        --storage-class shared-vast \
        --storage-size 300Gi \
        --values-file $PWD/../project/{{VALUES}} \
        --uninstall

# Create the gh-token-secret Secret in NAMESPACE with key GH_TOKEN set to the
# GH_TOKEN recipe argument.
gh-token GH_TOKEN:
    kubectl create secret generic gh-token-secret \
        --from-literal=GH_TOKEN='{{GH_TOKEN}}' \
        --namespace {{NAMESPACE}}

# Interactive benchmark commands:

# Apply benchmark-interactive-pod.yaml in NAMESPACE.
start-bench:
    kubectl apply --namespace {{NAMESPACE}} --filename benchmark-interactive-pod.yaml

# Delete the benchmark-interactive pod from NAMESPACE.
delete-bench:
    kubectl delete pod benchmark-interactive --namespace {{NAMESPACE}}

# Copy reset_prefixes.sh and Justfile.remote (as /app/Justfile) into the
# benchmark-interactive pod, then open an interactive bash shell in it.
# Each step runs only if the previous one succeeded.
exec-bench:
    kubectl cp reset_prefixes.sh {{NAMESPACE}}/benchmark-interactive:/app/reset_prefixes.sh \
        && kubectl cp Justfile.remote {{NAMESPACE}}/benchmark-interactive:/app/Justfile \
        && kubectl exec --stdin --tty --namespace {{NAMESPACE}} benchmark-interactive -- /bin/bash

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

Successfully merging this pull request may close these issues.

2 participants