Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 21 additions & 2 deletions charts/model-engine/values_sample.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -156,18 +156,37 @@ config:
ml_account_id: "000000000000"
# docker_repo_prefix [required] is the prefix for AWS ECR repositories
docker_repo_prefix: "000000000000.dkr.ecr.us-east-1.amazonaws.com"
# redis_host [required] is the hostname of the redis cluster you wish to connect
# redis_host [required if redis_aws_secret_name not present] is the hostname of the redis cluster you wish to connect to
redis_host: llm-engine-prod-cache.use1.cache.amazonaws.com
# redis_aws_secret_name [optional] is the AWS secret that contains the connection info of the Redis cluster.
# The information provided should be as follows:
# scheme (optional): either redis:// or rediss://, will default to redis://
# auth_token (optional): an auth token for the Redis cluster
# host: the hostname of the Redis cluster
# port: the port of the Redis cluster
# query_params (optional): additional query parameters for the Redis cluster, will default to ""
# The url will be built as follows:
# {scheme}{host}:{port}/{db_index}{query_params} if auth_token is not provided,
# {scheme}:{auth_token}@{host}:{port}/{db_index}{query_params} if auth_token is provided
# db_index will be filled in by LLM Engine.
# This secret must be accessible by the default LLM Engine AWS role
# e.g. what is set by profile_ml_worker if provided
# redis_aws_secret_name: sample-prod/redis-credentials
# s3_bucket [required] is the S3 bucket you wish to connect to
s3_bucket: "llm-engine"
launch:
# endpoint_namespace [required] is the K8s namespace the endpoints will be created in
endpoint_namespace: llm-engine
# cache_redis_aws_url is the full url for the redis cluster you wish to connect to,
# cache_redis_azure_host is the redis cluster host when using cloud_provider azure
# one of cache_redis_aws_url and cache_redis_azure_host must be provided
# cache_redis_aws_secret_name is an AWS secret that contains the Redis credentials.
# It has a field "cache-url" with the full URL of the Redis cluster (including db number).
# Other fields are ignored; e.g. you can use the secret for multiple purposes.
# This secret must be accessible by the default LLM Engine AWS role
# exactly one of cache_redis_aws_url, cache_redis_azure_host, or cache_redis_aws_secret_name must be provided
cache_redis_aws_url: redis://llm-engine-prod-cache.use1.cache.amazonaws.com:6379/15
cache_redis_azure_host: llm-engine-cache.redis.cache.windows.net:6380
cache_redis_aws_secret_name: sample-prod/redis-credentials
# s3_file_llm_fine_tuning_job_repository [required] is the S3 URI for the S3 bucket/key that you wish to save fine-tuned assets
s3_file_llm_fine_tuning_job_repository: "s3://llm-engine/llm-ft-job-repository"
# dd_trace_enabled specifies whether to enable datadog tracing, datadog must be installed in the cluster
Expand Down
16 changes: 16 additions & 0 deletions model-engine/model_engine_server/common/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

import yaml
from azure.identity import DefaultAzureCredential
from model_engine_server.core.aws.secrets import get_key_file
from model_engine_server.core.config import infra_config
from model_engine_server.core.loggers import logger_name, make_logger

Expand Down Expand Up @@ -68,8 +69,12 @@ class HostedModelInferenceServiceConfig:
user_inference_tensorflow_repository: str
docker_image_layer_cache_repository: str
sensitive_log_mode: bool
# Exactly one of the following three must be specified
cache_redis_aws_url: Optional[str] = None # also using this to store sync autoscaling metrics
cache_redis_azure_host: Optional[str] = None
cache_redis_aws_secret_name: Optional[
str
] = None # Not an env var because the redis cache info is already here

@classmethod
def from_yaml(cls, yaml_path):
Expand All @@ -80,7 +85,18 @@ def from_yaml(cls, yaml_path):
@property
def cache_redis_url(self) -> str:
if self.cache_redis_aws_url:
assert infra_config().cloud_provider == "aws", "cache_redis_aws_url is only for AWS"
if self.cache_redis_aws_secret_name:
logger.warning(
"Both cache_redis_aws_url and cache_redis_aws_secret_name are set. Using cache_redis_aws_url"
)
return self.cache_redis_aws_url
elif self.cache_redis_aws_secret_name:
assert (
infra_config().cloud_provider == "aws"
), "cache_redis_aws_secret_name is only for AWS"
creds = get_key_file(self.cache_redis_aws_secret_name) # Use default role
return creds["cache-url"]

assert self.cache_redis_azure_host and infra_config().cloud_provider == "azure"
username = os.getenv("AZURE_OBJECT_ID")
Expand Down
12 changes: 12 additions & 0 deletions model-engine/model_engine_server/core/celery/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from celery.app.control import Inspect
from celery.result import AsyncResult
from model_engine_server.core.aws.roles import session
from model_engine_server.core.aws.secrets import get_key_file
from model_engine_server.core.config import infra_config
from model_engine_server.core.loggers import (
CustomJSONFormatter,
Expand Down Expand Up @@ -195,6 +196,17 @@ def get_redis_host_port():


def get_redis_endpoint(db_index: int = 0) -> str:
if infra_config().redis_aws_secret_name is not None:
logger.info("Using infra_config().redis_aws_secret_name for Redis endpoint")
creds = get_key_file(infra_config().redis_aws_secret_name) # Use default role
scheme = creds.get("scheme", "redis://")
host = creds["host"]
port = creds["port"]
query_params = creds.get("query_params", "")
auth_token = creds.get("auth_token", None)
if auth_token is not None:
return f"{scheme}:{auth_token}@{host}:{port}/{db_index}{query_params}"
return f"{scheme}{host}:{port}/{db_index}{query_params}"
host, port = get_redis_host_port()
auth_token = os.getenv("REDIS_AUTH_TOKEN")
if auth_token:
Expand Down
3 changes: 2 additions & 1 deletion model-engine/model_engine_server/core/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,9 @@ class InfraConfig:
default_region: str
ml_account_id: str
docker_repo_prefix: str
redis_host: str
s3_bucket: str
redis_host: Optional[str] = None
redis_aws_secret_name: Optional[str] = None
profile_ml_worker: str = "default"
profile_ml_inference_worker: str = "default"
identity_service_url: Optional[str] = None
Expand Down