diff --git a/build/build-image.sh b/build/build-image.sh index 7b8b426e1c..c58a52839e 100755 --- a/build/build-image.sh +++ b/build/build-image.sh @@ -33,7 +33,7 @@ if [[ "$image" == *"-slim" ]]; then build_args="--build-arg SLIM=true" fi -if [ "${image}" == "python-predictor-gpu-slim" ]; then +if [[ "${image}" == *"gpu-slim" ]]; then cuda=("10.0" "10.1" "10.1" "10.2" "10.2" "11.0" "11.1") cudnn=("7" "7" "8" "7" "8" "8" "8") for i in ${!cudnn[@]}; do diff --git a/cli/cluster/delete.go b/cli/cluster/delete.go index eb7d4937ba..e81624f98d 100644 --- a/cli/cluster/delete.go +++ b/cli/cluster/delete.go @@ -25,6 +25,7 @@ import ( "github.com/cortexlabs/cortex/pkg/lib/prompt" s "github.com/cortexlabs/cortex/pkg/lib/strings" "github.com/cortexlabs/cortex/pkg/operator/schema" + "github.com/cortexlabs/cortex/pkg/types/userconfig" ) func Delete(operatorConfig OperatorConfig, apiName string, keepCache bool, force bool) (schema.DeleteResponse, error) { @@ -73,13 +74,20 @@ func getReadyRealtimeAPIReplicasOrNil(operatorConfig OperatorConfig, apiName str return &totalReady } -func StopJob(operatorConfig OperatorConfig, apiName string, jobID string) (schema.DeleteResponse, error) { +func StopJob(operatorConfig OperatorConfig, kind userconfig.Kind, apiName string, jobID string) (schema.DeleteResponse, error) { params := map[string]string{ "apiName": apiName, "jobID": jobID, } - httpRes, err := HTTPDelete(operatorConfig, path.Join("/batch", apiName), params) + var endpointComponent string + if kind == userconfig.BatchAPIKind { + endpointComponent = "batch" + } else { + endpointComponent = "tasks" + } + + httpRes, err := HTTPDelete(operatorConfig, path.Join("/"+endpointComponent, apiName), params) if err != nil { return schema.DeleteResponse{}, err } diff --git a/cli/cluster/get.go b/cli/cluster/get.go index 699bfc9171..47a24aa0a3 100644 --- a/cli/cluster/get.go +++ b/cli/cluster/get.go @@ -65,16 +65,31 @@ 
func GetAPIByID(operatorConfig OperatorConfig, apiName string, apiID string) ([] return apiRes, nil } -func GetJob(operatorConfig OperatorConfig, apiName string, jobID string) (schema.JobResponse, error) { +func GetBatchJob(operatorConfig OperatorConfig, apiName string, jobID string) (schema.BatchJobResponse, error) { endpoint := path.Join("/batch", apiName) httpRes, err := HTTPGet(operatorConfig, endpoint, map[string]string{"jobID": jobID}) if err != nil { - return schema.JobResponse{}, err + return schema.BatchJobResponse{}, err } - var jobRes schema.JobResponse + var jobRes schema.BatchJobResponse if err = json.Unmarshal(httpRes, &jobRes); err != nil { - return schema.JobResponse{}, errors.Wrap(err, endpoint, string(httpRes)) + return schema.BatchJobResponse{}, errors.Wrap(err, endpoint, string(httpRes)) + } + + return jobRes, nil +} + +func GetTaskJob(operatorConfig OperatorConfig, apiName string, jobID string) (schema.TaskJobResponse, error) { + endpoint := path.Join("/tasks", apiName) + httpRes, err := HTTPGet(operatorConfig, endpoint, map[string]string{"jobID": jobID}) + if err != nil { + return schema.TaskJobResponse{}, err + } + + var jobRes schema.TaskJobResponse + if err = json.Unmarshal(httpRes, &jobRes); err != nil { + return schema.TaskJobResponse{}, errors.Wrap(err, endpoint, string(httpRes)) } return jobRes, nil diff --git a/cli/cluster/logs.go b/cli/cluster/logs.go index 615355cc15..c914c049f5 100644 --- a/cli/cluster/logs.go +++ b/cli/cluster/logs.go @@ -42,20 +42,6 @@ func StreamJobLogs(operatorConfig OperatorConfig, apiName string, jobID string) return streamLogs(operatorConfig, "/logs/"+apiName, map[string]string{"jobID": jobID}) } -func GetGCPLogsURL(operatorConfig OperatorConfig, apiName string) (schema.GCPLogsResponse, error) { - httpRes, err := HTTPGet(operatorConfig, "/logs/"+apiName) - if err != nil { - return schema.GCPLogsResponse{}, err - } - - var gcpLogsResponse schema.GCPLogsResponse - if err = json.Unmarshal(httpRes, 
&gcpLogsResponse); err != nil { - return schema.GCPLogsResponse{}, errors.Wrap(err, "/logs/"+apiName, string(httpRes)) - } - - return gcpLogsResponse, nil -} - func streamLogs(operatorConfig OperatorConfig, path string, qParams ...map[string]string) error { interrupt := make(chan os.Signal, 1) signal.Notify(interrupt, os.Interrupt) diff --git a/cli/cmd/delete.go b/cli/cmd/delete.go index 06eb1620af..0621efcb2a 100644 --- a/cli/cmd/delete.go +++ b/cli/cmd/delete.go @@ -70,7 +70,12 @@ var _deleteCmd = &cobra.Command{ var deleteResponse schema.DeleteResponse if len(args) == 2 { - deleteResponse, err = cluster.StopJob(MustGetOperatorConfig(env.Name), args[0], args[1]) + apisRes, err := cluster.GetAPI(MustGetOperatorConfig(env.Name), args[0]) + if err != nil { + exit.Error(err) + } + + deleteResponse, err = cluster.StopJob(MustGetOperatorConfig(env.Name), apisRes[0].Spec.Kind, args[0], args[1]) if err != nil { exit.Error(err) } diff --git a/cli/cmd/get.go b/cli/cmd/get.go index ddb5a4a05b..391ba841fc 100644 --- a/cli/cmd/get.go +++ b/cli/cmd/get.go @@ -127,7 +127,17 @@ var _getCmd = &cobra.Command{ return "", err } - jobTable, err := getJob(env, args[0], args[1]) + apisRes, err := cluster.GetAPI(MustGetOperatorConfig(envName), args[0]) + if err != nil { + return "", err + } + + var jobTable string + if apisRes[0].Spec.Kind == userconfig.BatchAPIKind { + jobTable, err = getBatchJob(env, args[0], args[1]) + } else { + jobTable, err = getTaskJob(env, args[0], args[1]) + } if err != nil { return "", err } @@ -189,6 +199,8 @@ func getAPIsInAllEnvironments() (string, error) { var allRealtimeAPIEnvs []string var allBatchAPIs []schema.APIResponse var allBatchAPIEnvs []string + var allTaskAPIs []schema.APIResponse + var allTaskAPIEnvs []string var allTrafficSplitters []schema.APIResponse var allTrafficSplitterEnvs []string @@ -219,6 +231,9 @@ func getAPIsInAllEnvironments() (string, error) { case userconfig.RealtimeAPIKind: allRealtimeAPIEnvs = append(allRealtimeAPIEnvs, 
env.Name) allRealtimeAPIs = append(allRealtimeAPIs, api) + case userconfig.TaskAPIKind: + allTaskAPIEnvs = append(allTaskAPIEnvs, env.Name) + allTaskAPIs = append(allTaskAPIs, api) case userconfig.TrafficSplitterKind: allTrafficSplitterEnvs = append(allTrafficSplitterEnvs, env.Name) allTrafficSplitters = append(allTrafficSplitters, api) @@ -243,7 +258,7 @@ func getAPIsInAllEnvironments() (string, error) { out := "" - if len(allRealtimeAPIs) == 0 && len(allBatchAPIs) == 0 && len(allTrafficSplitters) == 0 { + if len(allRealtimeAPIs) == 0 && len(allBatchAPIs) == 0 && len(allTrafficSplitters) == 0 && len(allTaskAPIs) == 0 { // check if any environments errorred if len(errorsMap) != len(cliConfig.Environments) { if len(errorsMap) == 0 { @@ -271,20 +286,26 @@ func getAPIsInAllEnvironments() (string, error) { out += t.MustFormat() } - if len(allRealtimeAPIs) > 0 { - t := realtimeAPIsTable(allRealtimeAPIs, allRealtimeAPIEnvs) - + if len(allTaskAPIs) > 0 { + t := taskAPIsTable(allTaskAPIs, allTaskAPIEnvs) if len(allBatchAPIs) > 0 { out += "\n" } + out += t.MustFormat() + } + if len(allRealtimeAPIs) > 0 { + t := realtimeAPIsTable(allRealtimeAPIs, allRealtimeAPIEnvs) + if len(allBatchAPIs) > 0 || len(allTaskAPIs) > 0 { + out += "\n" + } out += t.MustFormat() } if len(allTrafficSplitters) > 0 { t := trafficSplitterListTable(allTrafficSplitters, allTrafficSplitterEnvs) - if len(allRealtimeAPIs) > 0 || len(allBatchAPIs) > 0 { + if len(allRealtimeAPIs) > 0 || len(allBatchAPIs) > 0 || len(allTaskAPIs) > 0 { out += "\n" } @@ -319,12 +340,15 @@ func getAPIsByEnv(env cliconfig.Environment, printEnv bool) (string, error) { var allRealtimeAPIs []schema.APIResponse var allBatchAPIs []schema.APIResponse + var allTaskAPIs []schema.APIResponse var allTrafficSplitters []schema.APIResponse for _, api := range apisRes { switch api.Spec.Kind { case userconfig.BatchAPIKind: allBatchAPIs = append(allBatchAPIs, api) + case userconfig.TaskAPIKind: + allTaskAPIs = append(allTaskAPIs, api) case 
userconfig.RealtimeAPIKind: allRealtimeAPIs = append(allRealtimeAPIs, api) case userconfig.TrafficSplitterKind: @@ -332,7 +356,7 @@ func getAPIsByEnv(env cliconfig.Environment, printEnv bool) (string, error) { } } - if len(allRealtimeAPIs) == 0 && len(allBatchAPIs) == 0 && len(allTrafficSplitters) == 0 { + if len(allRealtimeAPIs) == 0 && len(allBatchAPIs) == 0 && len(allTaskAPIs) == 0 && len(allTrafficSplitters) == 0 { return console.Bold("no apis are deployed"), nil } @@ -350,6 +374,22 @@ func getAPIsByEnv(env cliconfig.Environment, printEnv bool) (string, error) { out += t.MustFormat() } + if len(allTaskAPIs) > 0 { + envNames := []string{} + for range allTaskAPIs { + envNames = append(envNames, env.Name) + } + + t := taskAPIsTable(allTaskAPIs, envNames) + t.FindHeaderByTitle(_titleEnvironment).Hidden = true + + if len(allBatchAPIs) > 0 { + out += "\n" + } + + out += t.MustFormat() + } + if len(allRealtimeAPIs) > 0 { envNames := []string{} for range allRealtimeAPIs { @@ -359,7 +399,7 @@ func getAPIsByEnv(env cliconfig.Environment, printEnv bool) (string, error) { t := realtimeAPIsTable(allRealtimeAPIs, envNames) t.FindHeaderByTitle(_titleEnvironment).Hidden = true - if len(allBatchAPIs) > 0 { + if len(allBatchAPIs) > 0 || len(allTaskAPIs) > 0 { out += "\n" } @@ -375,7 +415,7 @@ func getAPIsByEnv(env cliconfig.Environment, printEnv bool) (string, error) { t := trafficSplitterListTable(allTrafficSplitters, envNames) t.FindHeaderByTitle(_titleEnvironment).Hidden = true - if len(allBatchAPIs) > 0 || len(allRealtimeAPIs) > 0 { + if len(allBatchAPIs) > 0 || len(allTaskAPIs) > 0 || len(allRealtimeAPIs) > 0 { out += "\n" } @@ -412,6 +452,8 @@ func getAPI(env cliconfig.Environment, apiName string) (string, error) { return trafficSplitterTable(apiRes, env) case userconfig.BatchAPIKind: return batchAPITable(apiRes), nil + case userconfig.TaskAPIKind: + return taskAPITable(apiRes), nil default: return "", errors.ErrorUnexpected(fmt.Sprintf("encountered unexpected kind %s for 
api %s", apiRes.Spec.Kind, apiRes.Spec.Name)) } diff --git a/cli/cmd/lib_batch_apis.go b/cli/cmd/lib_batch_apis.go index 841da5bec2..a80cfbf5a0 100644 --- a/cli/cmd/lib_batch_apis.go +++ b/cli/cmd/lib_batch_apis.go @@ -49,7 +49,7 @@ func batchAPIsTable(batchAPIs []schema.APIResponse, envNames []string) table.Tab latestJobID := "-" runningJobs := 0 - for _, job := range batchAPI.JobStatuses { + for _, job := range batchAPI.BatchJobStatuses { if job.StartTime.After(latestStartTime) { latestStartTime = job.StartTime latestJobID = job.ID + fmt.Sprintf(" (submitted %s ago)", libtime.SinceStr(&latestStartTime)) @@ -82,14 +82,14 @@ func batchAPIsTable(batchAPIs []schema.APIResponse, envNames []string) table.Tab } func batchAPITable(batchAPI schema.APIResponse) string { - jobRows := make([][]interface{}, 0, len(batchAPI.JobStatuses)) + jobRows := make([][]interface{}, 0, len(batchAPI.BatchJobStatuses)) out := "" - if len(batchAPI.JobStatuses) == 0 { - out = console.Bold("no submitted jobs\n") + if len(batchAPI.BatchJobStatuses) == 0 { + out = console.Bold("no submitted batch jobs\n") } else { totalFailed := 0 - for _, job := range batchAPI.JobStatuses { + for _, job := range batchAPI.BatchJobStatuses { succeeded := 0 failed := 0 @@ -144,8 +144,8 @@ func batchAPITable(batchAPI schema.APIResponse) string { return out } -func getJob(env cliconfig.Environment, apiName string, jobID string) (string, error) { - resp, err := cluster.GetJob(MustGetOperatorConfig(env.Name), apiName, jobID) +func getBatchJob(env cliconfig.Environment, apiName string, jobID string) (string, error) { + resp, err := cluster.GetBatchJob(MustGetOperatorConfig(env.Name), apiName, jobID) if err != nil { return "", err } @@ -216,12 +216,11 @@ func getJob(env cliconfig.Environment, apiName string, jobID string) (string, er out += titleStr("batch stats") + t.MustFormat(&table.Opts{BoldHeader: pointer.Bool(false)}) if job.Status == status.JobEnqueuing { - out += "\nstill enqueuing, workers have not been 
allocated for this job yet\n" + out += "\n" + "still enqueuing, workers have not been allocated for this job yet\n" } else if job.Status.IsCompleted() { - out += "\nworker stats are not available because this job is not currently running\n" + out += "\n" + "worker stats are not available because this job is not currently running\n" } else { out += titleStr("worker stats") - if job.WorkerCounts != nil { t := table.Table{ Headers: []table.Header{ @@ -253,7 +252,7 @@ func getJob(env cliconfig.Environment, apiName string, jobID string) (string, er out += "\n" + console.Bold("job endpoint: ") + resp.Endpoint + "\n" - jobSpecStr, err := libjson.Pretty(job.Job) + jobSpecStr, err := libjson.Pretty(job.BatchJob) if err != nil { return "", err } diff --git a/cli/cmd/lib_task_apis.go b/cli/cmd/lib_task_apis.go new file mode 100644 index 0000000000..de63526021 --- /dev/null +++ b/cli/cmd/lib_task_apis.go @@ -0,0 +1,212 @@ +/* +Copyright 2021 Cortex Labs, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package cmd + +import ( + "fmt" + "time" + + "github.com/cortexlabs/cortex/cli/cluster" + "github.com/cortexlabs/cortex/cli/types/cliconfig" + "github.com/cortexlabs/cortex/cli/types/flags" + "github.com/cortexlabs/cortex/pkg/lib/console" + libjson "github.com/cortexlabs/cortex/pkg/lib/json" + "github.com/cortexlabs/cortex/pkg/lib/pointer" + "github.com/cortexlabs/cortex/pkg/lib/table" + libtime "github.com/cortexlabs/cortex/pkg/lib/time" + "github.com/cortexlabs/cortex/pkg/operator/schema" + "github.com/cortexlabs/cortex/pkg/types" +) + +const ( + _titleTaskAPI = "task api" + _titleTaskJobCount = "running jobs" + _titleLatestTaskJobID = "latest job id" +) + +func taskAPIsTable(taskAPIs []schema.APIResponse, envNames []string) table.Table { + rows := make([][]interface{}, 0, len(taskAPIs)) + + for i, taskAPI := range taskAPIs { + lastAPIUpdated := time.Unix(taskAPI.Spec.LastUpdated, 0) + latestStartTime := time.Time{} + latestJobID := "-" + runningJobs := 0 + + for _, job := range taskAPI.TaskJobStatuses { + if job.StartTime.After(latestStartTime) { + latestStartTime = job.StartTime + latestJobID = job.ID + fmt.Sprintf(" (submitted %s ago)", libtime.SinceStr(&latestStartTime)) + } + + if job.Status.IsInProgress() { + runningJobs++ + } + } + + rows = append(rows, []interface{}{ + envNames[i], + taskAPI.Spec.Name, + runningJobs, + latestJobID, + libtime.SinceStr(&lastAPIUpdated), + }) + } + + return table.Table{ + Headers: []table.Header{ + {Title: _titleEnvironment}, + {Title: _titleTaskAPI}, + {Title: _titleTaskJobCount}, + {Title: _titleLatestTaskJobID}, + {Title: _titleLastupdated}, + }, + Rows: rows, + } +} + +func taskAPITable(taskAPI schema.APIResponse) string { + jobRows := make([][]interface{}, 0, len(taskAPI.TaskJobStatuses)) + + out := "" + if 
len(taskAPI.TaskJobStatuses) == 0 { + out = console.Bold("no submitted task jobs\n") + } else { + for _, job := range taskAPI.TaskJobStatuses { + jobEndTime := time.Now() + if job.EndTime != nil { + jobEndTime = *job.EndTime + } + + duration := jobEndTime.Sub(job.StartTime).Truncate(time.Second).String() + + jobRows = append(jobRows, []interface{}{ + job.ID, + job.Status.Message(), + job.StartTime.Format(_timeFormat), + duration, + }) + } + + t := table.Table{ + Headers: []table.Header{ + {Title: "task job id"}, + {Title: "status"}, + {Title: "start time"}, + {Title: "duration"}, + }, + Rows: jobRows, + } + + out += t.MustFormat() + } + + out += "\n" + console.Bold("endpoint: ") + taskAPI.Endpoint + "\n" + + out += "\n" + apiHistoryTable(taskAPI.APIVersions) + + if !_flagVerbose { + return out + } + + out += titleStr("task api configuration") + taskAPI.Spec.UserStr(types.AWSProviderType) + + return out +} + +func getTaskJob(env cliconfig.Environment, apiName string, jobID string) (string, error) { + resp, err := cluster.GetTaskJob(MustGetOperatorConfig(env.Name), apiName, jobID) + if err != nil { + return "", err + } + + if _flagOutput == flags.JSONOutputType { + bytes, err := libjson.Marshal(resp) + if err != nil { + return "", err + } + return string(bytes), nil + } + + job := resp.JobStatus + + out := "" + + jobIntroTable := table.KeyValuePairs{} + jobIntroTable.Add("job id", job.ID) + jobIntroTable.Add("status", job.Status.Message()) + out += jobIntroTable.String(&table.KeyValuePairOpts{BoldKeys: pointer.Bool(true)}) + + jobTimingTable := table.KeyValuePairs{} + jobTimingTable.Add("start time", job.StartTime.Format(_timeFormat)) + + jobEndTime := time.Now() + if job.EndTime != nil { + jobTimingTable.Add("end time", job.EndTime.Format(_timeFormat)) + jobEndTime = *job.EndTime + } else { + jobTimingTable.Add("end time", "-") + } + duration := jobEndTime.Sub(job.StartTime).Truncate(time.Second).String() + jobTimingTable.Add("duration", duration) + + out += "\n" + 
jobTimingTable.String(&table.KeyValuePairOpts{BoldKeys: pointer.Bool(true)}) + + if job.Status.IsCompleted() { + out += "\n" + "worker stats are not available because this job is not currently running\n" + } else { + out += titleStr("worker stats") + if job.WorkerCounts != nil { + t := table.Table{ + Headers: []table.Header{ + {Title: "requested"}, + {Title: "pending", Hidden: job.WorkerCounts.Pending == 0}, + {Title: "initializing", Hidden: job.WorkerCounts.Initializing == 0}, + {Title: "stalled", Hidden: job.WorkerCounts.Stalled == 0}, + {Title: "running"}, + {Title: "failed", Hidden: job.WorkerCounts.Failed == 0}, + {Title: "succeeded"}, + }, + Rows: [][]interface{}{ + { + job.Workers, + job.WorkerCounts.Pending, + job.WorkerCounts.Initializing, + job.WorkerCounts.Stalled, + job.WorkerCounts.Running, + job.WorkerCounts.Failed, + job.WorkerCounts.Succeeded, + }, + }, + } + out += t.MustFormat(&table.Opts{BoldHeader: pointer.Bool(false)}) + } else { + out += "unable to get worker stats\n" + } + } + + out += "\n" + console.Bold("job endpoint: ") + resp.Endpoint + "\n" + + jobSpecStr, err := libjson.Pretty(job.TaskJob) + if err != nil { + return "", err + } + + out += titleStr("job configuration") + jobSpecStr + + return out, nil +} diff --git a/cli/cmd/logs.go b/cli/cmd/logs.go index 726e92f5ab..ce598d7a28 100644 --- a/cli/cmd/logs.go +++ b/cli/cmd/logs.go @@ -62,12 +62,12 @@ var _logsCmd = &cobra.Command{ operatorConfig := MustGetOperatorConfig(env.Name) apiName := args[0] - if len(args) == 1 { - apiResponse, err := cluster.GetAPI(operatorConfig, apiName) - if err != nil { - exit.Error(err) - } + apiResponse, err := cluster.GetAPI(operatorConfig, apiName) + if err != nil { + exit.Error(err) + } + if len(args) == 1 { if apiResponse[0].Spec.Kind == userconfig.RealtimeAPIKind && apiResponse[0].Status.Requested > 1 && !_flagLogsDisallowPrompt { prompt.YesOrExit("logs from a single random replica will be streamed\n\nfor aggregated logs please visit your cloud 
provider's logging dashboard; see https://docs.cortex.dev for details", "", "") } @@ -77,14 +77,17 @@ var _logsCmd = &cobra.Command{ exit.Error(err) } } - if len(args) == 2 { - jobResponse, err := cluster.GetJob(operatorConfig, apiName, args[1]) - if err != nil { - exit.Error(err) - } - if jobResponse.JobStatus.Job.Workers > 1 && !_flagLogsDisallowPrompt { - prompt.YesOrExit("logs from a single random worker will be streamed\n\nfor aggregated logs please visit your cloud provider's logging dashboard; see https://docs.cortex.dev for details", "", "") + if len(args) == 2 { + if apiResponse[0].Spec.Kind == userconfig.BatchAPIKind { + jobResponse, err := cluster.GetBatchJob(operatorConfig, apiName, args[1]) + if err != nil { + exit.Error(err) + } + + if jobResponse.JobStatus.Workers > 1 && !_flagLogsDisallowPrompt { + prompt.YesOrExit("logs from a single random worker will be streamed\n\nfor aggregated logs please visit your cloud provider's logging dashboard; see https://docs.cortex.dev for details", "", "") + } } err = cluster.StreamJobLogs(operatorConfig, apiName, args[1]) @@ -92,6 +95,5 @@ var _logsCmd = &cobra.Command{ exit.Error(err) } } - }, } diff --git a/dev/format.sh b/dev/format.sh index b385f93046..44ba815cd5 100755 --- a/dev/format.sh +++ b/dev/format.sh @@ -29,9 +29,9 @@ if ! command -v black >/dev/null 2>&1; then exit 1 fi -gofmt -s -w "$ROOT" +gofmt -s -w "$ROOT"/cli "${ROOT}"/pkg -black --quiet --line-length=100 "$ROOT" +black --quiet --line-length=100 --exclude .idea/ "$ROOT" # Trim trailing whitespace if [[ "$OSTYPE" == "darwin"* ]]; then @@ -39,6 +39,7 @@ if [[ "$OSTYPE" == "darwin"* ]]; then ! -path "./vendor/*" \ ! -path "./bin/*" \ ! -path "./.git/*" \ + ! -path "./.idea/*" \ ! -name ".*" \ -print0 | \ xargs -0 sed -i '' -e's/[[:space:]]*$//') @@ -47,6 +48,7 @@ else ! -path "./vendor/*" \ ! -path "./bin/*" \ ! -path "./.git/*" \ + ! -path "./.idea/*" \ ! -name ".*" \ -print0 | \ xargs -0 sed -i 's/[[:space:]]*$//') @@ -57,6 +59,7 @@ fi ! 
-path "./vendor/*" \ ! -path "./bin/*" \ ! -path "./.git/*" \ +! -path "./.idea/*" \ ! -name ".*" \ -print0 | \ xargs -0 -L1 bash -c 'test "$(tail -c 1 "$0")" && echo "" >> "$0"' || true) @@ -66,6 +69,7 @@ xargs -0 -L1 bash -c 'test "$(tail -c 1 "$0")" && echo "" >> "$0"' || true) ! -path "./vendor/*" \ ! -path "./bin/*" \ ! -path "./.git/*" \ +! -path "./.idea/*" \ ! -name ".*" \ -print0 | \ xargs -0 -L1 bash -c 'test "$(tail -c 2 "$0")" || [ ! -s "$0" ] || (trimmed=$(printf "%s" "$(< $0)") && echo "$trimmed" > "$0")' || true) @@ -75,6 +79,7 @@ xargs -0 -L1 bash -c 'test "$(tail -c 2 "$0")" || [ ! -s "$0" ] || (trimmed=$(pr ! -path "./vendor/*" \ ! -path "./bin/*" \ ! -path "./.git/*" \ +! -path "./.idea/*" \ ! -name ".*" \ -print0 | \ xargs -0 -L1 bash -c 'test "$(head -c 1 "$0")" || [ ! -s "$0" ] || (trimmed=$(sed '"'"'/./,$!d'"'"' "$0") && echo "$trimmed" > "$0")' || true) diff --git a/docs/clients/python.md b/docs/clients/python.md index 903896ec2b..a845612f06 100644 --- a/docs/clients/python.md +++ b/docs/clients/python.md @@ -84,7 +84,7 @@ Delete an environment configured on this machine. ```python - | create_api(api_spec: dict, predictor=None, requirements=[], conda_packages=[], project_dir: Optional[str] = None, force: bool = True, wait: bool = False) -> list + | create_api(api_spec: dict, predictor=None, task=None, requirements=[], conda_packages=[], project_dir: Optional[str] = None, force: bool = True, wait: bool = False) -> list ``` Deploy an API. @@ -93,6 +93,7 @@ Deploy an API. - `api_spec` - A dictionary defining a single Cortex API. See https://docs.cortex.dev/v/master/ for schema. - `predictor` - A Cortex Predictor class implementation. Not required when deploying a traffic splitter. +- `task` - A callable class/function implementation. Not required for RealtimeAPI/BatchAPI/TrafficSplitter kinds. - `requirements` - A list of PyPI dependencies that will be installed before the predictor class implementation is invoked. 
- `conda_packages` - A list of Conda dependencies that will be installed before the predictor class implementation is invoked. - `project_dir` - Path to a python project. @@ -143,7 +144,7 @@ Get information about a submitted job. **Arguments**: -- `api_name` - Name of the Batch API. +- `api_name` - Name of the Batch/Task API. - `job_id` - Job ID. @@ -200,7 +201,7 @@ Stop a running job. **Arguments**: -- `api_name` - Name of the Batch API. +- `api_name` - Name of the Batch/Task API. - `job_id` - ID of the Job to stop. ## stream\_api\_logs diff --git a/docs/clusters/aws/logs.md b/docs/clusters/aws/logs.md index d84392f6d3..3b80eab02b 100644 --- a/docs/clusters/aws/logs.md +++ b/docs/clusters/aws/logs.md @@ -23,4 +23,15 @@ fields @timestamp, log | limit 1000 ``` +TaskAPI: + +```text +fields @timestamp, log +| filter labels.apiName="" +| filter labels.jobID="" +| filter labels.apiKind="TaskAPI" +| sort @timestamp asc +| limit 1000 +``` + Please make sure to select the log group for your cluster and adjust the time range accordingly before running the queries. diff --git a/docs/clusters/gcp/logs.md b/docs/clusters/gcp/logs.md index 75c7b6f5ce..bf772d5127 100644 --- a/docs/clusters/gcp/logs.md +++ b/docs/clusters/gcp/logs.md @@ -1,8 +1,8 @@ # Logs -By default, logs will be pushed to [StackDriver](https://console.cloud.google.com/logs/query) using fluent-bit. API logs are tagged with labels to help with log aggregation and filtering. +By default, logs will be pushed to [StackDriver](https://console.cloud.google.com/logs/query) using fluent-bit. API logs are tagged with labels to help with log aggregation and filtering. 
Below are some sample Stackdriver queries: -Example query for a RealtimeAPI: +RealtimeAPI: ```text resource.type="k8s_container" @@ -11,4 +11,14 @@ jsonPayload.labels.apiKind="RealtimeAPI" jsonPayload.labels.apiName="" ``` +TaskAPI: + +```text +resource.type="k8s_container" +resource.labels.cluster_name="" +jsonPayload.labels.apiKind="TaskAPI" +jsonPayload.labels.apiName="" +jsonPayload.labels.jobID="" +``` + Please make sure to navigate to the project containing your cluster and adjust the time range accordingly before running queries. diff --git a/docs/summary.md b/docs/summary.md index 7c596c0f3a..088e294377 100644 --- a/docs/summary.md +++ b/docs/summary.md @@ -21,6 +21,12 @@ * [Configuration](workloads/batch/configuration.md) * [Jobs](workloads/batch/jobs.md) * [Statuses](workloads/batch/statuses.md) +* Task + * [Example](workloads/task/example.md) + * [Definition](workloads/task/definitions.md) + * [Configuration](workloads/task/configuration.md) + * [Jobs](workloads/task/jobs.md) + * [Statuses](workloads/task/statuses.md) * Multi-model * [Example](workloads/multi-model/example.md) * [Configuration](workloads/multi-model/configuration.md) diff --git a/docs/workloads/batch/jobs.md b/docs/workloads/batch/jobs.md index 2f16481da8..b652a45301 100644 --- a/docs/workloads/batch/jobs.md +++ b/docs/workloads/batch/jobs.md @@ -53,6 +53,7 @@ RESPONSE: { "job_id": , "api_name": , + "kind": "BatchAPI", "workers": , "config": {: }, "api_id": , @@ -103,6 +104,7 @@ RESPONSE: { "job_id": , "api_name": , + "kind": "BatchAPI", "workers": , "config": {: }, "api_id": , @@ -152,6 +154,7 @@ RESPONSE: { "job_id": , "api_name": , + "kind": "BatchAPI", "workers": , "config": {: }, "api_id": , @@ -179,6 +182,7 @@ RESPONSE: "job_status": { "job_id": , "api_name": , + "kind": "BatchAPI", "workers": , "config": {: }, "api_id": , @@ -202,8 +206,10 @@ RESPONSE: "start_time": # e.g. 2020-07-16T14:56:10.276007415Z "end_time": (optional) # e.g. 
2020-07-16T14:56:10.276007415Z (only present if the job has completed) }, - "api_spec": , # a base64 encoded string of your api configuration YAML that has been encoded in msgpack "endpoint": # endpoint for this job + "api_spec": { + ... + } } ``` diff --git a/docs/workloads/realtime/configuration.md b/docs/workloads/realtime/configuration.md index 907ca5e0ab..d55faa4da7 100644 --- a/docs/workloads/realtime/configuration.md +++ b/docs/workloads/realtime/configuration.md @@ -25,7 +25,7 @@ threads_per_process: # the number of threads per process (default: 1) config: # arbitrary dictionary passed to the constructor of the Predictor (optional) python_path: # path to the root of your Python folder that will be appended to PYTHONPATH (default: folder containing cortex.yaml) - image: # docker image to use for the Predictor (default: quay.io/cortexlabs/python-predictor-cpu:master or quay.io/cortexlabs/python-predictor-gpu:master based on compute) + image: # docker image to use for the Predictor (default: quay.io/cortexlabs/python-predictor-cpu:master, quay.io/cortexlabs/python-predictor-gpu:master or quay.io/cortexlabs/python-predictor-inf:master based on compute) env: # dictionary of environment variables log_level: # log level that can be "debug", "info", "warning" or "error" (default: "info") shm_size: # size of shared memory (/dev/shm) for sharing data between multiple processes, e.g. 
64Mi or 1Gi (default: Null) @@ -82,7 +82,7 @@ config: # arbitrary dictionary passed to the constructor of the Predictor (optional) python_path: # path to the root of your Python folder that will be appended to PYTHONPATH (default: folder containing cortex.yaml) image: # docker image to use for the Predictor (default: quay.io/cortexlabs/tensorflow-predictor:master) - tensorflow_serving_image: # docker image to use for the TensorFlow Serving container (default: quay.io/cortexlabs/tensorflow-serving-gpu:master or quay.io/cortexlabs/tensorflow-serving-cpu:master based on compute) + tensorflow_serving_image: # docker image to use for the TensorFlow Serving container (default: quay.io/cortexlabs/tensorflow-serving-gpu:master, quay.io/cortexlabs/tensorflow-serving-cpu:master or quay.io/cortexlabs/tensorflow-serving-inf:master based on compute) env: # dictionary of environment variables log_level: # log level that can be "debug", "info", "warning" or "error" (default: "info") shm_size: # size of shared memory (/dev/shm) for sharing data between multiple processes, e.g. 
64Mi or 1Gi (default: Null) @@ -133,7 +133,7 @@ threads_per_process: # the number of threads per process (default: 1) config: # arbitrary dictionary passed to the constructor of the Predictor (optional) python_path: # path to the root of your Python folder that will be appended to PYTHONPATH (default: folder containing cortex.yaml) - image: # docker image to use for the Predictor (default: quay.io/cortexlabs/onnx-predictor-gpu:master or quay.io/cortexlabs/onnx-predictor-cpu:master based on compute) + image: # docker image to use for the Predictor (default: quay.io/cortexlabs/onnx-predictor-gpu:master, quay.io/cortexlabs/onnx-predictor-cpu:master or quay.io/cortexlabs/onnx-predictor-inf:master based on compute) env: # dictionary of environment variables log_level: # log level that can be "debug", "info", "warning" or "error" (default: "info") shm_size: # size of shared memory (/dev/shm) for sharing data between multiple processes, e.g. 64Mi or 1Gi (default: Null) diff --git a/docs/workloads/task/configuration.md b/docs/workloads/task/configuration.md new file mode 100644 index 0000000000..b94f89c71b --- /dev/null +++ b/docs/workloads/task/configuration.md @@ -0,0 +1,21 @@ +# Task API configuration + + +```yaml +- name: # API name (required) + kind: TaskAPI + definition: + path: # path to a python file with a Task class definition, relative to the Cortex root (required) + config: # arbitrary dictionary passed to the callable method of the Task class (can be overridden by config passed in job submission) (optional) + python_path: # path to the root of your Python folder that will be appended to PYTHONPATH (default: folder containing cortex.yaml) + image: # docker image to use for the Task (default: quay.io/cortexlabs/python-predictor-cpu:master, quay.io/cortexlabs/python-predictor-gpu:master or quay.io/cortexlabs/python-predictor-inf:master based on compute) + env: # dictionary of environment variables + log_level: # log level that can be "debug", "info", "warning" or 
"error" (default: "info") + networking: + endpoint: # the endpoint for the API (default: ) + compute: + cpu: # CPU request per worker. One unit of CPU corresponds to one virtual CPU; fractional requests are allowed, and can be specified as a floating point number or via the "m" suffix (default: 200m) + gpu: # GPU request per worker. One unit of GPU corresponds to one virtual GPU (default: 0) + inf: # Inferentia request per worker. One unit corresponds to one Inferentia ASIC with 4 NeuronCores and 8GB of cache memory. Each process will have one NeuronCore Group with (4 * inf / processes_per_replica) NeuronCores, so your model should be compiled to run on (4 * inf / processes_per_replica) NeuronCores. (default: 0) (aws only) + mem: # memory request per worker. One unit of memory is one byte and can be expressed as an integer or by using one of these suffixes: K, M, G, T (or their power-of two counterparts: Ki, Mi, Gi, Ti) (default: Null) +``` diff --git a/docs/workloads/task/definitions.md b/docs/workloads/task/definitions.md new file mode 100644 index 0000000000..486f3cd1dd --- /dev/null +++ b/docs/workloads/task/definitions.md @@ -0,0 +1,53 @@ +# Task implementation + +## Project files + +Cortex makes all files in the project directory (i.e. the directory which contains `cortex.yaml`) available for use in your Task implementation. Python bytecode files (`*.pyc`, `*.pyo`, `*.pyd`), files or folders that start with `.`, and the api configuration file (e.g. `cortex.yaml`) are excluded. + +The following files can also be added at the root of the project's directory: + +* `.cortexignore` file, which follows the same syntax and behavior as a [.gitignore file](https://git-scm.com/docs/gitignore). +* `.env` file, which exports environment variables that can be used in the task. Each line of this file must follow the `VARIABLE=value` format. 
+ +For example, if your directory looks like this: + +```text +./my-classifier/ +├── cortex.yaml +├── values.json +├── task.py +├── ... +└── requirements.txt +``` + +You can access `values.json` in your Task like this: + +```python +import json + +class Task: + def __call__(self, config): + with open('values.json', 'r') as values_file: + values = json.load(values_file) + self.values = values +``` + +## Task + +### Interface + +```python +# initialization code and variables can be declared here in global scope + +class Task: + def __call__(self, config): + """(Required) Task runnable. + + Args: + config (required): Dictionary passed from API configuration (if + specified) merged with configuration passed in with Job + Submission API. If there are conflicting keys, values in + configuration specified in Job submission takes precedence. + """ + pass +``` diff --git a/docs/workloads/task/example.md b/docs/workloads/task/example.md new file mode 100644 index 0000000000..9a9ce176cb --- /dev/null +++ b/docs/workloads/task/example.md @@ -0,0 +1,117 @@ +# TaskAPI + +Deploy a task API that trains a model on the iris flower dataset and uploads it to an S3 bucket. 
+ +## Key features + +* Lambda-style execution +* Task monitoring +* Scale to 0 + +## How it works + +### Install cortex + +```bash +$ pip install cortex +``` + +### Spin up a cluster on AWS + +```bash +$ cortex cluster up +``` + +### Define a task API + +```python +# task.py + +import cortex + +def train_iris_model(config): + import os + + import boto3, pickle + from sklearn.datasets import load_iris + from sklearn.model_selection import train_test_split + from sklearn.linear_model import LogisticRegression + + s3_filepath = config["dest_s3_dir"] + bucket, key = s3_filepath.replace("s3://", "").split("/", 1) + + # get iris flower dataset + iris = load_iris() + data, labels = iris.data, iris.target + training_data, test_data, training_labels, test_labels = train_test_split(data, labels) + + # train the model + model = LogisticRegression(solver="lbfgs", multi_class="multinomial") + model.fit(training_data, training_labels) + accuracy = model.score(test_data, test_labels) + print("accuracy: {:.2f}".format(accuracy)) + + # upload the model + pickle.dump(model, open("model.pkl", "wb")) + s3 = boto3.client("s3") + s3.upload_file("model.pkl", bucket, os.path.join(key, "model.pkl")) + +requirements = ["scikit-learn==0.23.2", "boto3"] + +api_spec = { + "name": "trainer", + "kind": "TaskAPI", +} + +cx = cortex.client("aws") +cx.create_api(api_spec, task=train_iris_model, requirements=requirements) +``` + +### Deploy to your Cortex cluster on AWS + +```bash +$ python task.py +``` + +### Describe the task API + +```bash +$ cortex get trainer +``` + +### Submit a job + +```python +import cortex +import requests + +cx = cortex.client("aws") +task_endpoint = cx.get_api("trainer")["endpoint"] + +dest_s3_dir = # specify S3 directory where the trained model will get pushed to +job_spec = { + "config": { + "dest_s3_dir": dest_s3_dir + } +} +response = requests.post(task_endpoint, json=job_spec) + +print(response.text) +# > {"job_id":"69b183ed6bdf3e9b","api_name":"trainer", "config": 
{"dest_s3_dir": ...}} +``` + +### Monitor the job + +```bash +$ cortex get trainer 69b183ed6bdf3e9b +``` + +### View the results + +Once the job is complete, you should be able to find the trained model of the task job in the S3 directory you've specified. + +### Delete the Task API + +```bash +$ cortex delete trainer +``` diff --git a/docs/workloads/task/jobs.md b/docs/workloads/task/jobs.md new file mode 100644 index 0000000000..bf4c1e87b3 --- /dev/null +++ b/docs/workloads/task/jobs.md @@ -0,0 +1,76 @@ +# Task API endpoint + +A deployed Task API endpoint supports the following: + +1. Submitting a task job +1. Getting the status of a job +1. Stopping a job + +You can find the url for your Task API using Cortex CLI command `cortex get `. + +## Submit a Job + +```yaml +POST : +{ + "timeout": , # duration in seconds since the submission of a job before it is terminated (optional) + "config": { # custom fields for this specific job (will override values in `config` specified in your api configuration) (optional) + "string": + } +} + +RESPONSE: +{ + "job_id": , + "api_name": , + "kind": "TaskAPI", + "workers": 1, + "config": {: }, + "api_id": , + "timeout": , + "created_time": # e.g. 2020-07-16T14:56:10.276007415Z +} +``` + +## Job status + +You can get the status of a job by making a GET request to `/` (note that you can also get a job's status with the Cortex CLI command `cortex get `). + +See [Job Status Codes](statuses.md) for a list of the possible job statuses and what they mean. + +```yaml +GET ?jobID=: + +RESPONSE: +{ + "job_status": { + "job_id": , + "api_name": , + "kind": "TaskAPI", + "workers": 1, + "config": {: }, + "api_id": , + "status": , # will be one of the following values: status_unknown|status_running|status_succeeded|status_unexpected_error|status_worker_error|status_worker_oom|status_timed_out|status_stopped + "created_time": # e.g. 2020-07-16T14:56:10.276007415Z + "start_time": # e.g. 
2020-07-16T14:56:10.276007415Z + "end_time": (optional) # e.g. 2020-07-16T14:56:10.276007415Z (only present if the job has completed) + }, + "endpoint": # endpoint for this job + "api_spec": { + ... + } +} +``` + +## Stop a Job + +Stop a job in progress. You can also use the Cortex CLI command `cortex delete `. + +You stop a running job by making a DELETE request to `/` (note that you can also delete a job with the Cortex CLI command `cortex delete `). + +```yaml +DELETE ?jobID=: + +RESPONSE: +{"message":"stopped job "} +``` diff --git a/docs/workloads/task/statuses.md b/docs/workloads/task/statuses.md new file mode 100644 index 0000000000..b51eaf010f --- /dev/null +++ b/docs/workloads/task/statuses.md @@ -0,0 +1,10 @@ +# Statuses + +| Status | Meaning | +| :--- | :--- | +| running | Task is running | +| succeeded | Task has finished without errors | +| worker error | The task has experienced an irrecoverable error, causing the job to fail; check job logs for more details | +| out of memory | The task has run out of memory, causing the job to fail; check job logs for more details | +| timed out | Job was terminated after the specified timeout has elapsed | +| stopped | Job was stopped by the user or the Task API was deleted | diff --git a/pkg/cortex/client/cortex/client.py b/pkg/cortex/client/cortex/client.py index e520b260ee..ddbc97b1ab 100644 --- a/pkg/cortex/client/cortex/client.py +++ b/pkg/cortex/client/cortex/client.py @@ -56,6 +56,7 @@ def create_api( self, api_spec: dict, predictor=None, + task=None, requirements=[], conda_packages=[], project_dir: Optional[str] = None, @@ -67,7 +68,8 @@ def create_api( Args: api_spec: A dictionary defining a single Cortex API. See https://docs.cortex.dev/v/master/ for schema. - predictor: A Cortex Predictor class implementation. Not required when deploying a traffic splitter. + predictor: A Cortex Predictor class implementation. Not required for TaskAPI/TrafficSplitter kinds. + task: A callable class/function implementation.
Not required for RealtimeAPI/BatchAPI/TrafficSplitter kinds. requirements: A list of PyPI dependencies that will be installed before the predictor class implementation is invoked. conda_packages: A list of Conda dependencies that will be installed before the predictor class implementation is invoked. project_dir: Path to a python project. @@ -83,10 +85,15 @@ def create_api( "`wait` flag is not supported for clusters on GCP, please set the `wait` flag to false" ) - if project_dir is not None and predictor is not None: - raise ValueError( - "`predictor` and `project_dir` parameters cannot be specified at the same time, please choose one" - ) + if project_dir is not None: + if predictor is not None: + raise ValueError( + "`predictor` and `project_dir` parameters cannot be specified at the same time, please choose one" + ) + if task is not None: + raise ValueError( + "`task` and `project_dir` parameters cannot be specified at the same time, please choose one" + ) if project_dir is not None: cortex_yaml_path = os.path.join(project_dir, f".cortex-{uuid.uuid4()}.yaml") @@ -95,6 +102,27 @@ def create_api( yaml.dump([api_spec], f) # write a list return self._deploy(cortex_yaml_path, force, wait) + api_kind = api_spec.get("kind") + if api_kind == "TrafficSplitter": + if predictor: + raise ValueError(f"`predictor` parameter cannot be specified for {api_kind} kind") + if task: + raise ValueError(f"`task` parameter cannot be specified for {api_kind} kind") + elif api_kind == "TaskAPI": + if predictor: + raise ValueError(f"`predictor` parameter cannot be specified for {api_kind} kind") + if task is None: + raise ValueError(f"`task` parameter must be specified for {api_kind} kind") + elif api_kind in ["BatchAPI", "RealtimeAPI"]: + if not predictor: + raise ValueError(f"`predictor` parameter must be specified for {api_kind}") + if task: + raise ValueError(f"`task` parameter cannot be specified for {api_kind}") + else: + raise ValueError( + f"invalid {api_kind} kind, `api_spec`
must have the `kind` field set to one of the following kinds: {['TrafficSplitter', 'TaskAPI', 'BatchAPI', 'RealtimeAPI']}" + ) + if api_spec.get("name") is None: raise ValueError("`api_spec` must have the `name` key set") @@ -107,7 +135,7 @@ def create_api( cortex_yaml_path = os.path.join(project_dir, "cortex.yaml") - if predictor is None: + if api_kind == "TrafficSplitter": # for deploying a traffic splitter with open(cortex_yaml_path, "w") as f: yaml.dump([api_spec], f) # write a list @@ -134,23 +162,35 @@ def create_api( with open(project_dir / "conda-packages.txt", "w") as conda_file: conda_file.write("\n".join(conda_packages)) - if not inspect.isclass(predictor): - raise ValueError("predictor parameter must be a class definition") - - with open(project_dir / "predictor.pickle", "wb") as pickle_file: - dill.dump(predictor, pickle_file) - if api_spec.get("predictor") is None: - api_spec["predictor"] = {} - - if predictor.__name__ == "PythonPredictor": - predictor_type = "python" - if predictor.__name__ == "TensorFlowPredictor": - predictor_type = "tensorflow" - if predictor.__name__ == "ONNXPredictor": - predictor_type = "onnx" - - api_spec["predictor"]["path"] = "predictor.pickle" - api_spec["predictor"]["type"] = predictor_type + if api_kind in ["BatchAPI", "RealtimeAPI"]: + if not inspect.isclass(predictor): + raise ValueError("`predictor` parameter must be a class definition") + + with open(project_dir / "predictor.pickle", "wb") as pickle_file: + dill.dump(predictor, pickle_file) + if api_spec.get("predictor") is None: + api_spec["predictor"] = {} + + if predictor.__name__ == "PythonPredictor": + predictor_type = "python" + if predictor.__name__ == "TensorFlowPredictor": + predictor_type = "tensorflow" + if predictor.__name__ == "ONNXPredictor": + predictor_type = "onnx" + + api_spec["predictor"]["path"] = "predictor.pickle" + api_spec["predictor"]["type"] = predictor_type + + if api_kind == "TaskAPI": + if not callable(task): + raise ValueError( + "`task` 
parameter must be a callable (e.g. a function definition or a class definition called `Task` with a `__call__` method implemented)" + ) + with open(project_dir / "task.pickle", "wb") as pickle_file: + dill.dump(task, pickle_file) + if api_spec.get("definition") is None: + api_spec["definition"] = {} + api_spec["definition"]["path"] = "task.pickle" with open(cortex_yaml_path, "w") as f: yaml.dump([api_spec], f) # write a list @@ -267,7 +307,7 @@ def get_job(self, api_name: str, job_id: str) -> dict: Get information about a submitted job. Args: - api_name: Name of the Batch API. + api_name: Name of the Batch/Task API. job_id: Job ID. Returns: @@ -342,7 +382,7 @@ def stop_job(self, api_name: str, job_id: str, keep_cache: bool = False): Stop a running job. Args: - api_name: Name of the Batch API. + api_name: Name of the Batch/Task API. job_id: ID of the Job to stop. """ args = [ diff --git a/pkg/cortex/serve/cortex_internal/lib/api/__init__.py b/pkg/cortex/serve/cortex_internal/lib/api/__init__.py index 77e7e0edc0..b87ad5bf7f 100644 --- a/pkg/cortex/serve/cortex_internal/lib/api/__init__.py +++ b/pkg/cortex/serve/cortex_internal/lib/api/__init__.py @@ -14,3 +14,4 @@ from cortex_internal.lib.api.predictor import Predictor from cortex_internal.lib.api.api import API, get_api, get_spec +from cortex_internal.lib.api.task import TaskAPI diff --git a/pkg/cortex/serve/cortex_internal/lib/api/api.py b/pkg/cortex/serve/cortex_internal/lib/api/api.py index add576b5ad..683920c7cf 100644 --- a/pkg/cortex/serve/cortex_internal/lib/api/api.py +++ b/pkg/cortex/serve/cortex_internal/lib/api/api.py @@ -136,6 +136,7 @@ def get_spec( spec_path: str, cache_dir: str, region: Optional[str] = None, + spec_name: str = "api_spec.json", ) -> Tuple[Union[S3, GCS], dict]: """ Args: provider: "aws" or "gcp". spec_path: Path to API spec (i.e. "s3://cortex-dev-0/apis/iris-classifier/api/69b93378fa5c0218-jy1fjtyihu-9fcc10739e7fc8050cefa8ca27ece1ee/master-spec.json").
cache_dir: Local directory where the API spec gets saved to. region: Region of the bucket. Only required for "S3" provider. + spec_name: File name of the spec as it is saved on disk. """ if provider == "aws": @@ -154,7 +156,7 @@ def get_spec( else: raise ValueError('invalid "provider" argument') - local_spec_path = os.path.join(cache_dir, "api_spec.json") + local_spec_path = os.path.join(cache_dir, spec_name) if not os.path.isfile(local_spec_path): storage.download_file(key, local_spec_path) diff --git a/pkg/cortex/serve/cortex_internal/lib/api/task.py b/pkg/cortex/serve/cortex_internal/lib/api/task.py new file mode 100644 index 0000000000..e1487ba0ca --- /dev/null +++ b/pkg/cortex/serve/cortex_internal/lib/api/task.py @@ -0,0 +1,131 @@ +# Copyright 2021 Cortex Labs, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import imp +import inspect +import dill + +from cortex_internal.lib.log import logger +from cortex_internal.lib.exceptions import CortexException, UserException, UserRuntimeException + + +class TaskAPI: + def __init__(self, provider: str, api_spec: dict): + """ + Args: + provider: "aws" or "gcp". + api_spec: API configuration. 
+ """ + + self.provider = provider + + self.path = api_spec["definition"]["path"] + self.config = api_spec["definition"].get("config", {}) + self.api_spec = api_spec + + def get_callable(self, project_dir: str): + impl = self._get_impl(project_dir) + if inspect.isclass(impl): + return impl() + return impl + + def _get_impl(self, project_dir: str): + try: + task_callable = self._read_impl( + "cortex_task", os.path.join(project_dir, self.path), "Task" + ) + except CortexException as e: + e.wrap("error in " + self.path) + raise + + try: + self._validate_impl(task_callable) + except CortexException as e: + e.wrap("error in " + self.path) + raise + return task_callable + + def _read_impl(self, module_name: str, impl_path: str, target_class_name: str): + if impl_path.endswith(".pickle"): + try: + with open(impl_path, "rb") as pickle_file: + return dill.load(pickle_file) + except Exception as e: + raise UserException("unable to load pickle", str(e)) from e + + try: + impl = imp.load_source(module_name, impl_path) + except Exception as e: + raise UserException(str(e)) from e + + classes = inspect.getmembers(impl, inspect.isclass) + callables = inspect.getmembers(impl, callable) + + if len(classes) > 0: + task_class = None + for class_df in classes: + if class_df[0] == target_class_name: + if task_class is not None: + raise UserException( + f"multiple definitions for {target_class_name} class found; please check your imports and class definitions and ensure that there is only one task class definition" + ) + task_class = class_df[1] + if task_class is None: + raise UserException(f"{target_class_name} class is not defined") + return task_class + elif len(callables) == 0: + raise UserException("no callable class or function were provided") + else: + return callables[0] + + def _validate_impl(self, impl): + if inspect.isclass(impl): + target_class_name = impl.__name__ + + constructor_fn = getattr(impl, "__init__", None) + if constructor_fn: + argspec = 
inspect.getfullargspec(constructor_fn) + if not (len(argspec.args) == 1 and argspec.args[0] == "self"): + raise UserException( + f"class {target_class_name}", + f'invalid signature for method "__init__"', + f'only "self" parameter must be present in method\'s signature', + ) + + callable_fn = getattr(impl, "__call__", None) + if callable_fn: + argspec = inspect.getfullargspec(callable_fn) + if not ( + len(argspec.args) == 2 + and argspec.args[0] == "self" + and argspec.args[1] == "config" + ): + raise UserException( + f"class {target_class_name}", + f'invalid signature for method "__call__"', + f'the following parameters must be present in method\'s signature: "self", "config"', + ) + else: + raise UserException( + f"class {target_class_name}", + f'"__call__" method not defined', + ) + else: + callable_fn = impl + argspec = inspect.getfullargspec(callable_fn) + if not (len(argspec.args) == 1 and argspec.args[0] == "config"): + raise UserException( + f'callable function must have the "config" parameter in its signature', + ) diff --git a/pkg/cortex/serve/init/bootloader.sh b/pkg/cortex/serve/init/bootloader.sh index 829b00e2b1..f7fcbedf73 100755 --- a/pkg/cortex/serve/init/bootloader.sh +++ b/pkg/cortex/serve/init/bootloader.sh @@ -182,10 +182,13 @@ if [ "$CORTEX_KIND" = "RealtimeAPI" ]; then # generate nginx conf /opt/conda/envs/env/bin/python -c 'from cortex_internal.lib import util; import os; generated = util.render_jinja_template("/src/cortex/serve/nginx.conf.j2", os.environ); print(generated);' > /run/nginx.conf -# prepare batch otherwise -else + # create the python initialization service + create_s6_service "py_init" "cd /mnt/project && /opt/conda/envs/env/bin/python /src/cortex/serve/init/script.py" +elif [ "$CORTEX_KIND" = "BatchAPI" ]; then create_s6_task "batch" "cd /mnt/project && $source_env_file_cmd && PYTHONUNBUFFERED=TRUE PYTHONPATH=$PYTHONPATH:$CORTEX_PYTHON_PATH /opt/conda/envs/env/bin/python /src/cortex/serve/start/batch.py" -fi -# create the 
python initialization service -create_s6_service "py_init" "cd /mnt/project && /opt/conda/envs/env/bin/python /src/cortex/serve/init/script.py" + # create the python initialization service + create_s6_service "py_init" "cd /mnt/project && /opt/conda/envs/env/bin/python /src/cortex/serve/init/script.py" +elif [ "$CORTEX_KIND" = "TaskAPI" ]; then + create_s6_task "task" "cd /mnt/project && $source_env_file_cmd && PYTHONUNBUFFERED=TRUE PYTHONPATH=$PYTHONPATH:$CORTEX_PYTHON_PATH /opt/conda/envs/env/bin/python /src/cortex/serve/start/task.py" +fi diff --git a/pkg/cortex/serve/start/batch.py b/pkg/cortex/serve/start/batch.py index 2b5e3af4a2..de69c3cc43 100644 --- a/pkg/cortex/serve/start/batch.py +++ b/pkg/cortex/serve/start/batch.py @@ -12,30 +12,23 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys -import os -import argparse import inspect -import time import json -import threading -import math +import os import pathlib +import threading +import time import uuid import boto3 import botocore - -from cortex_internal.lib.log import configure_logger - -logger = configure_logger("cortex", os.environ["CORTEX_LOG_CONFIG_FILE"]) - -from cortex_internal import consts -from cortex_internal.lib import util -from cortex_internal.lib.api import API, get_spec, get_api +from cortex_internal.lib.api import get_api, get_spec from cortex_internal.lib.concurrency import LockedFile from cortex_internal.lib.storage import S3 from cortex_internal.lib.exceptions import UserRuntimeException +from cortex_internal.lib.log import configure_logger + +logger = configure_logger("cortex", os.environ["CORTEX_LOG_CONFIG_FILE"]) SQS_POLL_WAIT_TIME = 10 # seconds MESSAGE_NOT_FOUND_SLEEP = 10 # seconds @@ -48,13 +41,9 @@ "job_spec": None, "provider": None, "predictor_impl": None, - "predict_route": None, - "client": None, - "class_set": set(), "sqs_client": None, } - receipt_handle_mutex = threading.Lock() stop_renewal = set() diff 
--git a/pkg/cortex/serve/start/task.py b/pkg/cortex/serve/start/task.py new file mode 100644 index 0000000000..7eab8b4dcb --- /dev/null +++ b/pkg/cortex/serve/start/task.py @@ -0,0 +1,52 @@ +# Copyright 2021 Cortex Labs, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import os +from copy import deepcopy + +from cortex_internal.lib import util +from cortex_internal.lib.api import get_spec, TaskAPI +from cortex_internal.lib.log import configure_logger + +logger = configure_logger("cortex", os.environ["CORTEX_LOG_CONFIG_FILE"]) + + +def start(): + cache_dir = os.environ["CORTEX_CACHE_DIR"] + provider = os.environ["CORTEX_PROVIDER"] + project_dir = os.environ["CORTEX_PROJECT_DIR"] + region = os.getenv("AWS_REGION") + + api_spec_path = os.environ["CORTEX_API_SPEC"] + task_spec_path = os.environ["CORTEX_TASK_SPEC"] + + _, api_spec = get_spec(provider, api_spec_path, cache_dir, region) + _, task_spec = get_spec(provider, task_spec_path, cache_dir, region, spec_name="task-spec.json") + + logger.info("loading the task definition from {}".format(api_spec["definition"]["path"])) + task_api = TaskAPI(provider, api_spec) + + logger.info("executing the task definition from {}".format(api_spec["definition"]["path"])) + callable_fn = task_api.get_callable(project_dir) + + config = deepcopy(api_spec["definition"]["config"]) + if task_spec is not None and task_spec.get("config") is not None: + util.merge_dicts_in_place_overwrite(config, task_spec["config"]) + + 
callable_fn(config) + + +if __name__ == "__main__": + start() diff --git a/pkg/lib/gcp/gcs.go b/pkg/lib/gcp/gcs.go index 527970d3db..a5cc8b66fa 100644 --- a/pkg/lib/gcp/gcs.go +++ b/pkg/lib/gcp/gcs.go @@ -114,6 +114,20 @@ func (c *Client) IsGCSFile(bucket string, key string) (bool, error) { return true, nil } +func (c *Client) UploadStringToGCS(str string, bucket string, key string) error { + gcsClient, err := c.GCS() + if err != nil { + return err + } + objectWriter := gcsClient.Bucket(bucket).Object(key).NewWriter(context.Background()) + objectWriter.ContentType = "text/plain" + defer objectWriter.Close() + if _, err := objectWriter.Write([]byte(str)); err != nil { + return errors.WithStack(err) + } + return nil +} + func (c *Client) ReadJSONFromGCS(objPtr interface{}, bucket string, key string) error { jsonBytes, err := c.ReadBytesFromGCS(bucket, key) if err != nil { @@ -215,19 +229,32 @@ func (c *Client) ReadBytesFromGCS(bucket string, key string) ([]byte, error) { return buf.Bytes(), nil } -func (c *Client) ListGCSDir(bucket string, gcsDir string, maxResults *int64) ([]string, error) { +func ConvertGCSObjectsToKeys(gcsObjects ...*storage.ObjectAttrs) []string { + paths := make([]string, 0, len(gcsObjects)) + for _, object := range gcsObjects { + if object != nil { + paths = append(paths, object.Name) + } + } + return paths +} + +func (c *Client) ListGCSDir(bucket string, gcsDir string, maxResults *int64) ([]*storage.ObjectAttrs, error) { + gcsDir = s.EnsureSuffix(gcsDir, "/") + return c.ListGCSPrefix(bucket, gcsDir, maxResults) +} + +func (c *Client) ListGCSPrefix(bucket string, prefix string, maxResults *int64) ([]*storage.ObjectAttrs, error) { gcsClient, err := c.GCS() if err != nil { return nil, err } - gcsDir = s.EnsureSuffix(gcsDir, "/") objectIterator := gcsClient.Bucket(bucket).Objects(context.Background(), &storage.Query{ - Prefix: gcsDir, + Prefix: prefix, }) - allNames := strset.New() - + var gcsObjects []*storage.ObjectAttrs for { attrs, err := 
objectIterator.Next() if err != nil { @@ -236,16 +263,19 @@ func (c *Client) ListGCSDir(bucket string, gcsDir string, maxResults *int64) ([] } return nil, errors.WithStack(err) } - allNames.Add(attrs.Name) - if maxResults != nil && int64(len(allNames)) >= *maxResults { + if attrs == nil { + continue + } + gcsObjects = append(gcsObjects, attrs) + if maxResults != nil && int64(len(gcsObjects)) >= *maxResults { break } } - return allNames.SliceSorted(), nil + return gcsObjects, nil } -func (c *Client) ListGCSPathDir(gcsDirPath string, maxResults *int64) ([]string, error) { +func (c *Client) ListGCSPathDir(gcsDirPath string, maxResults *int64) ([]*storage.ObjectAttrs, error) { bucket, gcsDir, err := SplitGCSPath(gcsDirPath) if err != nil { return nil, err diff --git a/pkg/operator/config/wrappers.go b/pkg/operator/config/wrappers.go index ba7c5a906b..58085f0620 100644 --- a/pkg/operator/config/wrappers.go +++ b/pkg/operator/config/wrappers.go @@ -17,8 +17,11 @@ limitations under the License. 
package config import ( + "cloud.google.com/go/storage" + "github.com/aws/aws-sdk-go/service/s3" "github.com/cortexlabs/cortex/pkg/lib/aws" "github.com/cortexlabs/cortex/pkg/lib/gcp" + s "github.com/cortexlabs/cortex/pkg/lib/strings" "github.com/cortexlabs/cortex/pkg/types" ) @@ -132,6 +135,16 @@ func UploadBytesToBucket(data []byte, key string) error { return nil } +func UploadStringToBucket(str string, key string) error { + switch Provider { + case types.AWSProviderType: + return AWS.UploadStringToS3(str, Cluster.Bucket, key) + case types.GCPProviderType: + return GCP.UploadStringToGCS(str, GCPCluster.Bucket, key) + } + return nil +} + func UploadJSONToBucket(obj interface{}, key string) error { switch Provider { case types.AWSProviderType: @@ -152,6 +165,39 @@ func ListBucketDirOneLevel(dir string, maxResults *int64) ([]string, error) { return nil, nil } +func ListBucketDir(dir string, maxResults *int64) ([]*storage.ObjectAttrs, []*s3.Object, error) { + dir = s.EnsureSuffix(dir, "/") + return ListBucketPrefix(dir, maxResults) +} + +func ListBucketPrefix(prefix string, maxResults *int64) ([]*storage.ObjectAttrs, []*s3.Object, error) { + switch Provider { + case types.AWSProviderType: + s3Objects, err := AWS.ListS3Prefix(Cluster.Bucket, prefix, false, maxResults) + if err != nil { + return nil, nil, err + } + return nil, s3Objects, nil + case types.GCPProviderType: + gcsObjects, err := GCP.ListGCSPrefix(GCPCluster.Bucket, prefix, maxResults) + if err != nil { + return nil, nil, err + } + return gcsObjects, nil, nil + } + return nil, nil, nil +} + +func DeleteBucketFile(file string) error { + switch Provider { + case types.AWSProviderType: + return AWS.DeleteS3File(Cluster.Bucket, file) + case types.GCPProviderType: + return GCP.DeleteGCSFile(GCPCluster.Bucket, file) + } + return nil +} + func DeleteBucketDir(dir string, continueIfFailure bool) error { switch 
Provider { case types.AWSProviderType: @@ -162,6 +208,16 @@ func DeleteBucketDir(dir string, continueIfFailure bool) error { return nil } +func DeleteBucketPrefix(prefix string, continueIfFailure bool) error { + switch Provider { + case types.AWSProviderType: + return AWS.DeleteS3Prefix(Cluster.Bucket, prefix, continueIfFailure) + case types.GCPProviderType: + return GCP.DeleteGCSPrefix(GCPCluster.Bucket, prefix, continueIfFailure) + } + return nil +} + func ImageDownloader() string { switch Provider { case types.AWSProviderType: diff --git a/pkg/operator/endpoints/get_job.go b/pkg/operator/endpoints/get_batch_job.go similarity index 72% rename from pkg/operator/endpoints/get_job.go rename to pkg/operator/endpoints/get_batch_job.go index c56f5f9e71..a292cfa826 100644 --- a/pkg/operator/endpoints/get_job.go +++ b/pkg/operator/endpoints/get_batch_job.go @@ -18,18 +18,18 @@ package endpoints import ( "net/http" + "net/url" - "github.com/cortexlabs/cortex/pkg/lib/urls" "github.com/cortexlabs/cortex/pkg/operator/operator" "github.com/cortexlabs/cortex/pkg/operator/resources" - "github.com/cortexlabs/cortex/pkg/operator/resources/batchapi" + "github.com/cortexlabs/cortex/pkg/operator/resources/job/batchapi" "github.com/cortexlabs/cortex/pkg/operator/schema" "github.com/cortexlabs/cortex/pkg/types/spec" "github.com/cortexlabs/cortex/pkg/types/userconfig" "github.com/gorilla/mux" ) -func GetJob(w http.ResponseWriter, r *http.Request) { +func GetBatchJob(w http.ResponseWriter, r *http.Request) { vars := mux.Vars(r) apiName := vars["apiName"] jobID, err := getRequiredQueryParam("jobID", r) @@ -48,7 +48,11 @@ func GetJob(w http.ResponseWriter, r *http.Request) { return } - jobKey := spec.JobKey{APIName: apiName, ID: jobID} + jobKey := spec.JobKey{ + APIName: apiName, + ID: jobID, + Kind: 
userconfig.BatchAPIKind, + } jobStatus, err := batchapi.GetJobStatus(jobKey) if err != nil { @@ -56,22 +60,30 @@ func GetJob(w http.ResponseWriter, r *http.Request) { return } - spec, err := operator.DownloadAPISpec(jobStatus.APIName, jobStatus.APIID) + apiSpec, err := operator.DownloadAPISpec(jobStatus.APIName, jobStatus.APIID) if err != nil { respondError(w, r, err) return } - endpoint, err := operator.APIEndpoint(spec) + endpoint, err := operator.APIEndpoint(apiSpec) if err != nil { respondError(w, r, err) return } - response := schema.JobResponse{ + parsedURL, err := url.Parse(endpoint) + if err != nil { + respondError(w, r, err) + } + q := parsedURL.Query() + q.Add("jobID", jobKey.ID) + parsedURL.RawQuery = q.Encode() + + response := schema.BatchJobResponse{ JobStatus: *jobStatus, - APISpec: *spec, - Endpoint: urls.Join(endpoint, jobKey.ID), + APISpec: *apiSpec, + Endpoint: parsedURL.String(), } respond(w, response) diff --git a/pkg/operator/endpoints/get_task_job.go b/pkg/operator/endpoints/get_task_job.go new file mode 100644 index 0000000000..75b08aeaec --- /dev/null +++ b/pkg/operator/endpoints/get_task_job.go @@ -0,0 +1,90 @@ +/* +Copyright 2021 Cortex Labs, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package endpoints + +import ( + "net/http" + "net/url" + + "github.com/cortexlabs/cortex/pkg/operator/operator" + "github.com/cortexlabs/cortex/pkg/operator/resources" + "github.com/cortexlabs/cortex/pkg/operator/resources/job/taskapi" + "github.com/cortexlabs/cortex/pkg/operator/schema" + "github.com/cortexlabs/cortex/pkg/types/spec" + "github.com/cortexlabs/cortex/pkg/types/userconfig" + "github.com/gorilla/mux" +) + +func GetTaskJob(w http.ResponseWriter, r *http.Request) { + vars := mux.Vars(r) + apiName := vars["apiName"] + jobID, err := getRequiredQueryParam("jobID", r) + if err != nil { + respondError(w, r, err) + return + } + + deployedResource, err := resources.GetDeployedResourceByName(apiName) + if err != nil { + respondError(w, r, err) + return + } + if deployedResource.Kind != userconfig.TaskAPIKind { + respondError(w, r, resources.ErrorOperationIsOnlySupportedForKind(*deployedResource, userconfig.TaskAPIKind)) + return + } + + jobKey := spec.JobKey{ + APIName: apiName, + ID: jobID, + Kind: userconfig.TaskAPIKind, + } + + jobStatus, err := taskapi.GetJobStatus(jobKey) + if err != nil { + respondError(w, r, err) + return + } + + apiSpec, err := operator.DownloadAPISpec(jobStatus.APIName, jobStatus.APIID) + if err != nil { + respondError(w, r, err) + return + } + + endpoint, err := operator.APIEndpoint(apiSpec) + if err != nil { + respondError(w, r, err) + return + } + + parsedURL, err := url.Parse(endpoint) + if err != nil { + respondError(w, r, err) + } + q := parsedURL.Query() + q.Add("jobID", jobKey.ID) + parsedURL.RawQuery = q.Encode() + + response := schema.TaskJobResponse{ + JobStatus: *jobStatus, + APISpec: *apiSpec, + Endpoint: parsedURL.String(), + } + + respond(w, response) +} diff --git a/pkg/operator/endpoints/logs.go b/pkg/operator/endpoints/logs.go index 8e89db381d..ad0a690cbc 100644 --- 
a/pkg/operator/endpoints/logs.go +++ b/pkg/operator/endpoints/logs.go @@ -41,7 +41,7 @@ func ReadLogs(w http.ResponseWriter, r *http.Request) { return } - if deployedResource.Kind == userconfig.BatchAPIKind { + if deployedResource.Kind == userconfig.BatchAPIKind || deployedResource.Kind == userconfig.TaskAPIKind { respondError(w, r, ErrorLogsJobIDRequired(*deployedResource)) return } else if deployedResource.Kind != userconfig.RealtimeAPIKind { diff --git a/pkg/operator/endpoints/logs_job.go b/pkg/operator/endpoints/logs_job.go index c00416a125..28a6d9dd13 100644 --- a/pkg/operator/endpoints/logs_job.go +++ b/pkg/operator/endpoints/logs_job.go @@ -39,8 +39,8 @@ func ReadJobLogs(w http.ResponseWriter, r *http.Request) { respondError(w, r, err) return } - if deployedResource.Kind != userconfig.BatchAPIKind { - respondError(w, r, resources.ErrorOperationIsOnlySupportedForKind(*deployedResource, userconfig.BatchAPIKind)) + if deployedResource.Kind != userconfig.BatchAPIKind && deployedResource.Kind != userconfig.TaskAPIKind { + respondError(w, r, resources.ErrorOperationIsOnlySupportedForKind(*deployedResource, userconfig.BatchAPIKind, userconfig.TaskAPIKind)) return } diff --git a/pkg/operator/endpoints/stop_batch_job.go b/pkg/operator/endpoints/stop_batch_job.go new file mode 100644 index 0000000000..98cc4d33a6 --- /dev/null +++ b/pkg/operator/endpoints/stop_batch_job.go @@ -0,0 +1,52 @@ +/* +Copyright 2021 Cortex Labs, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package endpoints + +import ( + "fmt" + "net/http" + + "github.com/cortexlabs/cortex/pkg/operator/resources/job/batchapi" + "github.com/cortexlabs/cortex/pkg/operator/schema" + "github.com/cortexlabs/cortex/pkg/types/spec" + "github.com/cortexlabs/cortex/pkg/types/userconfig" + "github.com/gorilla/mux" +) + +func StopBatchJob(w http.ResponseWriter, r *http.Request) { + vars := mux.Vars(r) + apiName := vars["apiName"] + jobID, err := getRequiredQueryParam("jobID", r) + if err != nil { + respondError(w, r, err) + return + } + + err = batchapi.StopJob(spec.JobKey{ + APIName: apiName, + ID: jobID, + Kind: userconfig.BatchAPIKind, + }) + if err != nil { + respondError(w, r, err) + return + } + + respond(w, schema.DeleteResponse{ + Message: fmt.Sprintf("stopped job %s", jobID), + }) +} diff --git a/pkg/operator/endpoints/stop_job.go b/pkg/operator/endpoints/stop_task_job.go similarity index 78% rename from pkg/operator/endpoints/stop_job.go rename to pkg/operator/endpoints/stop_task_job.go index 84a9527648..9e9d09e852 100644 --- a/pkg/operator/endpoints/stop_job.go +++ b/pkg/operator/endpoints/stop_task_job.go @@ -20,13 +20,14 @@ import ( "fmt" "net/http" - "github.com/cortexlabs/cortex/pkg/operator/resources/batchapi" + "github.com/cortexlabs/cortex/pkg/operator/resources/job/taskapi" "github.com/cortexlabs/cortex/pkg/operator/schema" "github.com/cortexlabs/cortex/pkg/types/spec" + "github.com/cortexlabs/cortex/pkg/types/userconfig" "github.com/gorilla/mux" ) -func StopJob(w http.ResponseWriter, r *http.Request) { +func StopTaskJob(w http.ResponseWriter, r *http.Request) { vars := mux.Vars(r) apiName := vars["apiName"] jobID, err := getRequiredQueryParam("jobID", r) @@ -35,7 +36,11 @@ func StopJob(w http.ResponseWriter, r *http.Request) { return } 
- err = batchapi.StopJob(spec.JobKey{APIName: apiName, ID: jobID}) + err = taskapi.StopJob(spec.JobKey{ + APIName: apiName, + ID: jobID, + Kind: userconfig.TaskAPIKind, + }) if err != nil { respondError(w, r, err) return diff --git a/pkg/operator/endpoints/submit_job.go b/pkg/operator/endpoints/submit_batch.go similarity index 87% rename from pkg/operator/endpoints/submit_job.go rename to pkg/operator/endpoints/submit_batch.go index 577874a729..65c96751b2 100644 --- a/pkg/operator/endpoints/submit_job.go +++ b/pkg/operator/endpoints/submit_batch.go @@ -26,13 +26,13 @@ import ( "github.com/cortexlabs/cortex/pkg/consts" "github.com/cortexlabs/cortex/pkg/lib/errors" "github.com/cortexlabs/cortex/pkg/operator/resources" - "github.com/cortexlabs/cortex/pkg/operator/resources/batchapi" + "github.com/cortexlabs/cortex/pkg/operator/resources/job/batchapi" "github.com/cortexlabs/cortex/pkg/operator/schema" "github.com/cortexlabs/cortex/pkg/types/userconfig" "github.com/gorilla/mux" ) -func SubmitJob(w http.ResponseWriter, r *http.Request) { +func SubmitBatchJob(w http.ResponseWriter, r *http.Request) { vars := mux.Vars(r) apiName := vars["apiName"] dryRun := getOptionalBoolQParam("dryRun", false, r) @@ -56,7 +56,7 @@ func SubmitJob(w http.ResponseWriter, r *http.Request) { return } - submission := schema.JobSubmission{} + submission := schema.BatchJobSubmission{} err = json.Unmarshal(bodyBytes, &submission) if err != nil { @@ -71,7 +71,7 @@ func SubmitJob(w http.ResponseWriter, r *http.Request) { fileNames, err := batchapi.DryRun(&submission) if err != nil { w.WriteHeader(http.StatusBadRequest) - io.WriteString(w, "\n"+err.Error()+"\n") + _, _ = io.WriteString(w, "\n"+err.Error()+"\n") return } @@ -79,12 +79,12 @@ func SubmitJob(w http.ResponseWriter, r *http.Request) { _, err := io.WriteString(w, fileName+"\n") if 
err != nil { w.WriteHeader(http.StatusBadRequest) - io.WriteString(w, "\n"+err.Error()+"\n") + _, _ = io.WriteString(w, "\n"+err.Error()+"\n") return } } - io.WriteString(w, "validations passed") + _, _ = io.WriteString(w, "validations passed") return } diff --git a/pkg/operator/endpoints/submit_task.go b/pkg/operator/endpoints/submit_task.go new file mode 100644 index 0000000000..0a122eb81b --- /dev/null +++ b/pkg/operator/endpoints/submit_task.go @@ -0,0 +1,78 @@ +/* +Copyright 2021 Cortex Labs, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package endpoints + +import ( + "encoding/json" + "fmt" + "io/ioutil" + "net/http" + + "github.com/cortexlabs/cortex/pkg/consts" + "github.com/cortexlabs/cortex/pkg/lib/errors" + "github.com/cortexlabs/cortex/pkg/operator/resources" + "github.com/cortexlabs/cortex/pkg/operator/resources/job/taskapi" + "github.com/cortexlabs/cortex/pkg/operator/schema" + "github.com/cortexlabs/cortex/pkg/types/spec" + "github.com/cortexlabs/cortex/pkg/types/userconfig" + "github.com/gorilla/mux" +) + +func SubmitTaskJob(w http.ResponseWriter, r *http.Request) { + vars := mux.Vars(r) + apiName := vars["apiName"] + + deployedResource, err := resources.GetDeployedResourceByName(apiName) + if err != nil { + respondError(w, r, err) + return + } + if deployedResource.Kind != userconfig.TaskAPIKind { + respondError(w, r, resources.ErrorOperationIsOnlySupportedForKind(*deployedResource, userconfig.TaskAPIKind)) + return + } + + // max payload size, same as API Gateway + rw := http.MaxBytesReader(w, r.Body, 10<<20) + + bodyBytes, err := ioutil.ReadAll(rw) + if err != nil { + respondError(w, r, err) + return + } + + submission := schema.TaskJobSubmission{ + RuntimeTaskJobConfig: spec.RuntimeTaskJobConfig{Workers: 1}, + } + + err = json.Unmarshal(bodyBytes, &submission) + if err != nil { + respondError(w, r, errors.Append(err, + fmt.Sprintf("\n\ntask job submission schema can be found at https://docs.cortex.dev/v/%s/", + consts.CortexVersionMinor)), + ) + return + } + + jobSpec, err := taskapi.SubmitJob(apiName, &submission) + if err != nil { + respondError(w, r, err) + return + } + + respond(w, jobSpec) +} diff --git a/pkg/operator/lib/logging/logging.go b/pkg/operator/lib/logging/logging.go index d5dc732958..ba8e4330df 100644 --- a/pkg/operator/lib/logging/logging.go +++ b/pkg/operator/lib/logging/logging.go @@ -61,19 +61,24 @@ 
func DefaultZapConfig(level userconfig.LogLevel, fields ...map[string]interface{ encoderConfig := zap.NewProductionEncoderConfig() encoderConfig.MessageKey = "message" - initialFields := map[string]interface{}{} + labels := map[string]interface{}{} for _, m := range fields { for k, v := range m { - initialFields[k] = v + labels[k] = v } } + initialFields := map[string]interface{}{} + if len(labels) > 0 { + initialFields["labels"] = labels + } + return zap.Config{ Level: zap.NewAtomicLevelAt(userconfig.ToZapLogLevel(level)), Encoding: "json", EncoderConfig: encoderConfig, OutputPaths: []string{"stdout"}, ErrorOutputPaths: []string{"stderr"}, - InitialFields: map[string]interface{}{"labels": initialFields}, + InitialFields: initialFields, } } diff --git a/pkg/operator/main.go b/pkg/operator/main.go index 7d566b565b..85ea9621d5 100644 --- a/pkg/operator/main.go +++ b/pkg/operator/main.go @@ -28,7 +28,8 @@ import ( "github.com/cortexlabs/cortex/pkg/operator/lib/exit" "github.com/cortexlabs/cortex/pkg/operator/lib/logging" "github.com/cortexlabs/cortex/pkg/operator/operator" - "github.com/cortexlabs/cortex/pkg/operator/resources/batchapi" + "github.com/cortexlabs/cortex/pkg/operator/resources/job/batchapi" + "github.com/cortexlabs/cortex/pkg/operator/resources/job/taskapi" "github.com/cortexlabs/cortex/pkg/operator/resources/realtimeapi" "github.com/cortexlabs/cortex/pkg/types" "github.com/cortexlabs/cortex/pkg/types/userconfig" @@ -80,8 +81,9 @@ func main() { } } - cron.Run(batchapi.ManageJobResources, operator.ErrorHandler("manage jobs"), batchapi.ManageJobResourcesCronPeriod) + cron.Run(batchapi.ManageJobResources, operator.ErrorHandler("manage batch jobs"), batchapi.ManageJobResourcesCronPeriod) } + cron.Run(taskapi.ManageJobResources, operator.ErrorHandler("manage task jobs"), 
taskapi.ManageJobResourcesCronPeriod) router := mux.NewRouter() @@ -90,10 +92,13 @@ func main() { routerWithoutAuth.HandleFunc("/verifycortex", endpoints.VerifyCortex).Methods("GET") if config.Provider == types.AWSProviderType { - routerWithoutAuth.HandleFunc("/batch/{apiName}", endpoints.SubmitJob).Methods("POST") - routerWithoutAuth.HandleFunc("/batch/{apiName}", endpoints.GetJob).Methods("GET") - routerWithoutAuth.HandleFunc("/batch/{apiName}", endpoints.StopJob).Methods("DELETE") + routerWithoutAuth.HandleFunc("/batch/{apiName}", endpoints.SubmitBatchJob).Methods("POST") + routerWithoutAuth.HandleFunc("/batch/{apiName}", endpoints.GetBatchJob).Methods("GET") + routerWithoutAuth.HandleFunc("/batch/{apiName}", endpoints.StopBatchJob).Methods("DELETE") } + routerWithoutAuth.HandleFunc("/tasks/{apiName}", endpoints.SubmitTaskJob).Methods("POST") + routerWithoutAuth.HandleFunc("/tasks/{apiName}", endpoints.GetTaskJob).Methods("GET") + routerWithoutAuth.HandleFunc("/tasks/{apiName}", endpoints.StopTaskJob).Methods("DELETE") routerWithAuth := router.NewRoute().Subrouter() diff --git a/pkg/operator/operator/k8s.go b/pkg/operator/operator/k8s.go index 3ca1d952af..fd82ca0bc2 100644 --- a/pkg/operator/operator/k8s.go +++ b/pkg/operator/operator/k8s.go @@ -36,7 +36,6 @@ import ( istioclientnetworking "istio.io/client-go/pkg/apis/networking/v1beta1" kcore "k8s.io/api/core/v1" kresource "k8s.io/apimachinery/pkg/api/resource" - kmeta "k8s.io/apimachinery/pkg/apis/meta/v1" intstr "k8s.io/apimachinery/pkg/util/intstr" ) @@ -91,6 +90,23 @@ type downloadContainerArg struct { HideUnzippingLog bool `json:"hide_unzipping_log"` // if true, don't log when unzipping } +func TaskInitContainer(api *spec.API) kcore.Container { + return kcore.Container{ + Name: _downloaderInitContainerName, + Image: config.ImageDownloader(), + ImagePullPolicy: "Always", + Args: []string{"--download=" + pythonDownloadArgs(api)}, + EnvFrom: baseEnvVars(), + Env: []kcore.EnvVar{ + { + Name: 
"CORTEX_LOG_LEVEL", + Value: strings.ToUpper(api.TaskDefinition.LogLevel.String()), + }, + }, + VolumeMounts: defaultVolumeMounts(), + } +} + func InitContainer(api *spec.API) kcore.Container { downloadArgs := "" @@ -114,12 +130,72 @@ func InitContainer(api *spec.API) kcore.Container { } } +func TaskContainers(api *spec.API) ([]kcore.Container, []kcore.Volume) { + apiPodResourceList := kcore.ResourceList{} + apiPodResourceLimitsList := kcore.ResourceList{} + apiPodVolumeMounts := defaultVolumeMounts() + volumes := DefaultVolumes() + var containers []kcore.Container + + if api.Compute.GPU > 0 { + apiPodResourceList["nvidia.com/gpu"] = *kresource.NewQuantity(api.Compute.GPU, kresource.DecimalSI) + apiPodResourceLimitsList["nvidia.com/gpu"] = *kresource.NewQuantity(api.Compute.GPU, kresource.DecimalSI) + } else { + volumes = append(volumes, kcore.Volume{ + Name: "neuron-sock", + }) + rtdVolumeMounts := []kcore.VolumeMount{ + { + Name: "neuron-sock", + MountPath: "/sock", + }, + } + apiPodVolumeMounts = append(apiPodVolumeMounts, rtdVolumeMounts...) 
+ neuronContainer := *neuronRuntimeDaemonContainer(api, rtdVolumeMounts) + + if api.Compute.CPU != nil { + q1, q2 := k8s.SplitInTwo(k8s.QuantityPtr(api.Compute.CPU.Quantity.DeepCopy())) + apiPodResourceList[kcore.ResourceCPU] = *q1 + neuronContainer.Resources.Requests[kcore.ResourceCPU] = *q2 + } + + if api.Compute.Mem != nil { + q1, q2 := k8s.SplitInTwo(k8s.QuantityPtr(api.Compute.Mem.Quantity.DeepCopy())) + apiPodResourceList[kcore.ResourceMemory] = *q1 + neuronContainer.Resources.Requests[kcore.ResourceMemory] = *q2 + } + + containers = append(containers, neuronContainer) + } + + containers = append(containers, kcore.Container{ + Name: APIContainerName, + Image: api.TaskDefinition.Image, + ImagePullPolicy: kcore.PullAlways, + Env: getTaskEnvVars(api, APIContainerName), + EnvFrom: baseEnvVars(), + VolumeMounts: apiPodVolumeMounts, + Resources: kcore.ResourceRequirements{ + Requests: apiPodResourceList, + Limits: apiPodResourceLimitsList, + }, + Ports: []kcore.ContainerPort{ + {ContainerPort: DefaultPortInt32}, + }, + SecurityContext: &kcore.SecurityContext{ + Privileged: pointer.Bool(true), + }}, + ) + + return containers, volumes +} + func PythonPredictorContainers(api *spec.API) ([]kcore.Container, []kcore.Volume) { apiPodResourceList := kcore.ResourceList{} apiPodResourceLimitsList := kcore.ResourceList{} apiPodVolumeMounts := defaultVolumeMounts() volumes := DefaultVolumes() - containers := []kcore.Container{} + var containers []kcore.Container if api.Compute.Inf == 0 { if api.Compute.CPU != nil { @@ -216,7 +292,7 @@ func TensorFlowPredictorContainers(api *spec.API) ([]kcore.Container, []kcore.Vo tfServingLimitsList := kcore.ResourceList{} volumeMounts := defaultVolumeMounts() volumes := DefaultVolumes() - containers := []kcore.Container{} + var containers []kcore.Container if api.Compute.Inf == 0 { if api.Compute.CPU != nil { @@ -326,7 +402,7 @@ func ONNXPredictorContainers(api *spec.API) ([]kcore.Container, []kcore.Volume) resourceLimitsList := 
kcore.ResourceList{} apiPodVolumeMounts := defaultVolumeMounts() volumes := DefaultVolumes() - containers := []kcore.Container{} + var containers []kcore.Container if api.Compute.CPU != nil { userPodCPURequest := k8s.QuantityPtr(api.Compute.CPU.Quantity.DeepCopy()) @@ -385,6 +461,74 @@ func ONNXPredictorContainers(api *spec.API) ([]kcore.Container, []kcore.Volume) return containers, volumes } +func getTaskEnvVars(api *spec.API, container string) []kcore.EnvVar { + envVars := []kcore.EnvVar{ + { + Name: "CORTEX_KIND", + Value: api.Kind.String(), + }, + { + Name: "CORTEX_LOG_LEVEL", + Value: strings.ToUpper(api.TaskDefinition.LogLevel.String()), + }, + } + + for name, val := range api.TaskDefinition.Env { + envVars = append(envVars, kcore.EnvVar{ + Name: name, + Value: val, + }) + } + + if container == APIContainerName { + envVars = append(envVars, + kcore.EnvVar{ + Name: "HOST_IP", + ValueFrom: &kcore.EnvVarSource{ + FieldRef: &kcore.ObjectFieldSelector{ + FieldPath: "status.hostIP", + }, + }, + }, + kcore.EnvVar{ + Name: "CORTEX_PROJECT_DIR", + Value: path.Join(_emptyDirMountPath, "project"), + }, + kcore.EnvVar{ + Name: "CORTEX_CACHE_DIR", + Value: _specCacheDir, + }, + kcore.EnvVar{ + Name: "CORTEX_API_SPEC", + Value: config.BucketPath(api.Key), + }, + ) + + cortexPythonPath := path.Join(_emptyDirMountPath, "project") + if api.TaskDefinition.PythonPath != nil { + cortexPythonPath = path.Join(_emptyDirMountPath, "project", *api.TaskDefinition.PythonPath) + } + envVars = append(envVars, kcore.EnvVar{ + Name: "CORTEX_PYTHON_PATH", + Value: cortexPythonPath, + }) + + if api.Compute.Inf > 0 { + envVars = append(envVars, + kcore.EnvVar{ + Name: "NEURONCORE_GROUP_SIZES", + Value: s.Int64(api.Compute.Inf * consts.NeuronCoresPerInf), + }, + kcore.EnvVar{ + Name: "NEURON_RTD_ADDRESS", + Value: fmt.Sprintf("unix:%s", _neuronRTDSocket), + }, + ) + } + } + return envVars +} + func getEnvVars(api *spec.API, container string) []kcore.EnvVar { if container == 
_requestMonitorContainerName || container == _downloaderInitContainerName { return []kcore.EnvVar{ @@ -979,13 +1123,3 @@ func GetEndpointFromVirtualService(virtualService *istioclientnetworking.Virtual return endpoints.GetOne(), nil } - -func extractCortexAnnotations(obj kmeta.Object) map[string]string { - cortexAnnotations := make(map[string]string) - for key, value := range obj.GetAnnotations() { - if strings.Contains(key, "cortex.dev/") { - cortexAnnotations[key] = value - } - } - return cortexAnnotations -} diff --git a/pkg/operator/operator/logging.go b/pkg/operator/operator/logging.go index 8a8a60033d..bbcae83878 100644 --- a/pkg/operator/operator/logging.go +++ b/pkg/operator/operator/logging.go @@ -133,19 +133,36 @@ func GetJobLogger(jobKey spec.JobKey) (*zap.SugaredLogger, error) { return logger, nil } - jobSpec, err := DownloadJobSpec(jobKey) - if err != nil { - return nil, err - } - - apiSpec, err := DownloadAPISpec(jobKey.APIName, jobSpec.APIID) - if err != nil { - return nil, err + apiName := jobKey.APIName + var logLevel userconfig.LogLevel + switch jobKey.Kind { + case userconfig.BatchAPIKind: + jobSpec, err := DownloadBatchJobSpec(jobKey) + if err != nil { + return nil, err + } + apiSpec, err := DownloadAPISpec(apiName, jobSpec.APIID) + if err != nil { + return nil, err + } + logLevel = apiSpec.Predictor.LogLevel + case userconfig.TaskAPIKind: + jobSpec, err := DownloadTaskJobSpec(jobKey) + if err != nil { + return nil, err + } + apiSpec, err := DownloadAPISpec(apiName, jobSpec.APIID) + if err != nil { + return nil, err + } + logLevel = apiSpec.TaskDefinition.LogLevel + default: + return nil, errors.ErrorUnexpected("unexpected kind", jobKey.Kind.String()) } - return initializeLogger(loggerCacheKey, apiSpec.Predictor.LogLevel, map[string]interface{}{ + return initializeLogger(loggerCacheKey, logLevel, map[string]interface{}{ "apiName": jobKey.APIName, - "apiKind": userconfig.BatchAPIKind.String(), + "apiKind": jobKey.Kind.String(), "jobID": 
jobKey.ID, }) } @@ -159,7 +176,7 @@ func GetJobLoggerFromSpec(apiSpec *spec.API, jobKey spec.JobKey) (*zap.SugaredLo return initializeLogger(loggerCacheKey, apiSpec.Predictor.LogLevel, map[string]interface{}{ "apiName": jobKey.APIName, - "apiKind": userconfig.BatchAPIKind.String(), + "apiKind": jobKey.Kind.String(), "jobID": jobKey.ID, }) } diff --git a/pkg/operator/operator/storage.go b/pkg/operator/operator/storage.go index 1f30644f85..0125bf7632 100644 --- a/pkg/operator/operator/storage.go +++ b/pkg/operator/operator/storage.go @@ -58,8 +58,17 @@ func DownloadAPISpecs(apiNames []string, apiIDs []string) ([]spec.API, error) { return apis, nil } -func DownloadJobSpec(jobKey spec.JobKey) (*spec.Job, error) { - jobSpec := spec.Job{} +func DownloadBatchJobSpec(jobKey spec.JobKey) (*spec.BatchJob, error) { + jobSpec := spec.BatchJob{} + err := config.ReadJSONFromBucket(&jobSpec, jobKey.SpecFilePath(config.Cluster.ClusterName)) + if err != nil { + return nil, errors.Wrap(err, "unable to download job specification", jobKey.UserString()) + } + return &jobSpec, nil +} + +func DownloadTaskJobSpec(jobKey spec.JobKey) (*spec.TaskJob, error) { + jobSpec := spec.TaskJob{} err := config.ReadJSONFromBucket(&jobSpec, jobKey.SpecFilePath(config.Cluster.ClusterName)) if err != nil { return nil, errors.Wrap(err, "unable to download job specification", jobKey.UserString()) diff --git a/pkg/operator/resources/batchapi/in_progress_cache.go b/pkg/operator/resources/batchapi/in_progress_cache.go deleted file mode 100644 index 76f37cac87..0000000000 --- a/pkg/operator/resources/batchapi/in_progress_cache.go +++ /dev/null @@ -1,93 +0,0 @@ -/* -Copyright 2021 Cortex Labs, Inc. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package batchapi - -import ( - "path" - "strings" - - "github.com/cortexlabs/cortex/pkg/operator/config" - "github.com/cortexlabs/cortex/pkg/types/spec" -) - -var ( - _inProgressFilePrefix = "in_progress_jobs" -) - -func inProgressS3Key(jobKey spec.JobKey) string { - return path.Join(config.Cluster.ClusterName, _inProgressFilePrefix, jobKey.APIName, jobKey.ID) -} - -func jobKeyFromInProgressS3Key(s3Key string) spec.JobKey { - s3PathSplit := strings.Split(s3Key, "/") - apiName := s3PathSplit[len(s3PathSplit)-2] - jobID := s3PathSplit[len(s3PathSplit)-1] - - return spec.JobKey{APIName: apiName, ID: jobID} -} - -func uploadInProgressFile(jobKey spec.JobKey) error { - err := config.AWS.UploadStringToS3("", config.Cluster.Bucket, inProgressS3Key(jobKey)) - if err != nil { - return err - } - return nil -} - -func deleteInProgressFile(jobKey spec.JobKey) error { - err := config.AWS.DeleteS3File(config.Cluster.Bucket, inProgressS3Key(jobKey)) - if err != nil { - return err - } - return nil -} - -func deleteAllInProgressFilesByAPI(apiName string) error { - err := config.AWS.DeleteS3Prefix(config.Cluster.Bucket, path.Join(config.Cluster.ClusterName, _inProgressFilePrefix, apiName), true) - if err != nil { - return err - } - return nil -} - -func listAllInProgressJobKeys() ([]spec.JobKey, error) { - s3Objects, err := config.AWS.ListS3Dir(config.Cluster.Bucket, path.Join(config.Cluster.ClusterName, _inProgressFilePrefix), false, nil) - if err != nil { - return nil, err - } - - jobKeys := make([]spec.JobKey, 0, len(s3Objects)) - for _, 
obj := range s3Objects { - jobKeys = append(jobKeys, jobKeyFromInProgressS3Key(*obj.Key)) - } - - return jobKeys, nil -} - -func listAllInProgressJobKeysByAPI(apiName string) ([]spec.JobKey, error) { - s3Objects, err := config.AWS.ListS3Dir(config.Cluster.Bucket, path.Join(config.Cluster.ClusterName, _inProgressFilePrefix, apiName), false, nil) - if err != nil { - return nil, err - } - - jobKeys := make([]spec.JobKey, 0, len(s3Objects)) - for _, obj := range s3Objects { - jobKeys = append(jobKeys, jobKeyFromInProgressS3Key(*obj.Key)) - } - - return jobKeys, nil -} diff --git a/pkg/operator/resources/batchapi/job_state.go b/pkg/operator/resources/batchapi/job_state.go deleted file mode 100644 index 7967c5669c..0000000000 --- a/pkg/operator/resources/batchapi/job_state.go +++ /dev/null @@ -1,440 +0,0 @@ -/* -Copyright 2021 Cortex Labs, Inc. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ - -package batchapi - -import ( - "path" - "path/filepath" - "time" - - "github.com/cortexlabs/cortex/pkg/lib/errors" - "github.com/cortexlabs/cortex/pkg/lib/pointer" - "github.com/cortexlabs/cortex/pkg/operator/config" - "github.com/cortexlabs/cortex/pkg/operator/operator" - "github.com/cortexlabs/cortex/pkg/types/spec" - "github.com/cortexlabs/cortex/pkg/types/status" - kbatch "k8s.io/api/batch/v1" - kcore "k8s.io/api/core/v1" -) - -const ( - _averageFilesPerJobState = 10 -) - -type JobState struct { - spec.JobKey - Status status.JobCode - LastUpdatedMap map[string]time.Time - EndTime *time.Time -} - -func (j JobState) GetLastUpdated() time.Time { - lastUpdated := time.Time{} - - for _, fileLastUpdated := range j.LastUpdatedMap { - if lastUpdated.After(fileLastUpdated) { - lastUpdated = fileLastUpdated - } - } - - return lastUpdated -} - -func (j JobState) GetFirstCreated() time.Time { - firstCreated := time.Unix(1<<63-62135596801, 999999999) // Max time - - for _, fileLastUpdated := range j.LastUpdatedMap { - if firstCreated.After(fileLastUpdated) { - firstCreated = fileLastUpdated - } - } - - return firstCreated -} - -// Doesn't assume only status files are present. The order below matters. 
-func getStatusCode(lastUpdatedMap map[string]time.Time) status.JobCode { - if _, ok := lastUpdatedMap[status.JobStopped.String()]; ok { - return status.JobStopped - } - - if _, ok := lastUpdatedMap[status.JobTimedOut.String()]; ok { - return status.JobTimedOut - } - - if _, ok := lastUpdatedMap[status.JobWorkerOOM.String()]; ok { - return status.JobWorkerOOM - } - - if _, ok := lastUpdatedMap[status.JobWorkerError.String()]; ok { - return status.JobWorkerError - } - - if _, ok := lastUpdatedMap[status.JobEnqueueFailed.String()]; ok { - return status.JobEnqueueFailed - } - - if _, ok := lastUpdatedMap[status.JobUnexpectedError.String()]; ok { - return status.JobUnexpectedError - } - - if _, ok := lastUpdatedMap[status.JobCompletedWithFailures.String()]; ok { - return status.JobCompletedWithFailures - } - - if _, ok := lastUpdatedMap[status.JobSucceeded.String()]; ok { - return status.JobSucceeded - } - - if _, ok := lastUpdatedMap[status.JobRunning.String()]; ok { - return status.JobRunning - } - - if _, ok := lastUpdatedMap[status.JobEnqueuing.String()]; ok { - return status.JobEnqueuing - } - - return status.JobUnknown -} - -func getJobState(jobKey spec.JobKey) (*JobState, error) { - s3Objects, err := config.AWS.ListS3Prefix(config.Cluster.Bucket, jobKey.Prefix(config.Cluster.ClusterName), false, nil) - if err != nil { - return nil, errors.Wrap(err, "failed to get job state", jobKey.UserString()) - } - - if len(s3Objects) == 0 { - return nil, errors.Wrap(ErrorJobNotFound(jobKey), "failed to get job state") - } - - lastUpdatedMap := map[string]time.Time{} - - for _, s3Object := range s3Objects { - lastUpdatedMap[filepath.Base(*s3Object.Key)] = *s3Object.LastModified - } - - jobState := getJobStateFromFiles(jobKey, lastUpdatedMap) - return &jobState, nil -} - -func getJobStateFromFiles(jobKey spec.JobKey, lastUpdatedFileMap map[string]time.Time) JobState { - statusCode := getStatusCode(lastUpdatedFileMap) - - var jobEndTime *time.Time - if statusCode.IsCompleted() 
{ - if endTime, ok := lastUpdatedFileMap[statusCode.String()]; ok { - jobEndTime = &endTime - } - } - - return JobState{ - JobKey: jobKey, - LastUpdatedMap: lastUpdatedFileMap, - Status: statusCode, - EndTime: jobEndTime, - } -} - -func getMostRecentlySubmittedJobStates(apiName string, count int) ([]*JobState, error) { - // a single job state may include 5 files on average, overshoot the number of files needed - s3Objects, err := config.AWS.ListS3Prefix(config.Cluster.Bucket, spec.BatchAPIJobPrefix(apiName, config.Cluster.ClusterName), false, pointer.Int64(int64(count*_averageFilesPerJobState))) - if err != nil { - return nil, err - } - - // job id -> file name -> last update timestamp - lastUpdatedMaps := map[string]map[string]time.Time{} - - jobIDOrder := []string{} - for _, s3Object := range s3Objects { - fileName := filepath.Base(*s3Object.Key) - jobID := filepath.Base(filepath.Dir(*s3Object.Key)) - - if _, ok := lastUpdatedMaps[jobID]; !ok { - jobIDOrder = append(jobIDOrder, jobID) - lastUpdatedMaps[jobID] = map[string]time.Time{fileName: *s3Object.LastModified} - } else { - lastUpdatedMaps[jobID][fileName] = *s3Object.LastModified - } - } - - jobStates := make([]*JobState, 0, count) - - jobStateCount := 0 - for _, jobID := range jobIDOrder { - jobState := getJobStateFromFiles(spec.JobKey{APIName: apiName, ID: jobID}, lastUpdatedMaps[jobID]) - jobStates = append(jobStates, &jobState) - - jobStateCount++ - if jobStateCount == count { - break - } - } - - return jobStates, nil -} - -func setStatusForJob(jobKey spec.JobKey, jobStatus status.JobCode) error { - switch jobStatus { - case status.JobEnqueuing: - return setEnqueuingStatus(jobKey) - case status.JobRunning: - return setRunningStatus(jobKey) - case status.JobEnqueueFailed: - return setEnqueueFailedStatus(jobKey) - case status.JobCompletedWithFailures: - return setCompletedWithFailuresStatus(jobKey) - case status.JobSucceeded: - return setSucceededStatus(jobKey) - case status.JobUnexpectedError: - return 
setUnexpectedErrorStatus(jobKey) - case status.JobWorkerError: - return setWorkerErrorStatus(jobKey) - case status.JobWorkerOOM: - return setWorkerOOMStatus(jobKey) - case status.JobTimedOut: - return setTimedOutStatus(jobKey) - case status.JobStopped: - return setStoppedStatus(jobKey) - } - return nil -} - -func setEnqueuingStatus(jobKey spec.JobKey) error { - err := updateLiveness(jobKey) - if err != nil { - return err - } - - err = config.AWS.UploadStringToS3("", config.Cluster.Bucket, path.Join(jobKey.Prefix(config.Cluster.ClusterName), status.JobEnqueuing.String())) - if err != nil { - return err - } - - err = uploadInProgressFile(jobKey) - if err != nil { - return err - } - - return nil -} - -func setRunningStatus(jobKey spec.JobKey) error { - err := config.AWS.UploadStringToS3("", config.Cluster.Bucket, path.Join(jobKey.Prefix(config.Cluster.ClusterName), status.JobRunning.String())) - if err != nil { - return err - } - - err = uploadInProgressFile(jobKey) // in progress file should already be there but just in case - if err != nil { - return err - } - - return nil -} - -func setStoppedStatus(jobKey spec.JobKey) error { - err := config.AWS.UploadStringToS3("", config.Cluster.Bucket, path.Join(jobKey.Prefix(config.Cluster.ClusterName), status.JobStopped.String())) - if err != nil { - return err - } - - err = deleteInProgressFile(jobKey) - if err != nil { - return err - } - - return nil -} - -func setSucceededStatus(jobKey spec.JobKey) error { - err := config.AWS.UploadStringToS3("", config.Cluster.Bucket, path.Join(jobKey.Prefix(config.Cluster.ClusterName), status.JobSucceeded.String())) - if err != nil { - return err - } - - err = deleteInProgressFile(jobKey) - if err != nil { - return err - } - - return nil -} - -func setCompletedWithFailuresStatus(jobKey spec.JobKey) error { - err := config.AWS.UploadStringToS3("", config.Cluster.Bucket, path.Join(jobKey.Prefix(config.Cluster.ClusterName), status.JobCompletedWithFailures.String())) - if err != nil { - 
return err - } - - err = deleteInProgressFile(jobKey) - if err != nil { - return err - } - - return nil -} - -func setWorkerErrorStatus(jobKey spec.JobKey) error { - err := config.AWS.UploadStringToS3("", config.Cluster.Bucket, path.Join(jobKey.Prefix(config.Cluster.ClusterName), status.JobWorkerError.String())) - if err != nil { - return err - } - - err = deleteInProgressFile(jobKey) - if err != nil { - return err - } - - return nil -} - -func setWorkerOOMStatus(jobKey spec.JobKey) error { - err := config.AWS.UploadStringToS3("", config.Cluster.Bucket, path.Join(jobKey.Prefix(config.Cluster.ClusterName), status.JobWorkerOOM.String())) - if err != nil { - return err - } - - err = deleteInProgressFile(jobKey) - if err != nil { - return err - } - - return nil -} - -func setEnqueueFailedStatus(jobKey spec.JobKey) error { - err := config.AWS.UploadStringToS3("", config.Cluster.Bucket, path.Join(jobKey.Prefix(config.Cluster.ClusterName), status.JobEnqueueFailed.String())) - if err != nil { - return err - } - - err = deleteInProgressFile(jobKey) - if err != nil { - return err - } - - return nil -} - -func setUnexpectedErrorStatus(jobKey spec.JobKey) error { - err := config.AWS.UploadStringToS3("", config.Cluster.Bucket, path.Join(jobKey.Prefix(config.Cluster.ClusterName), status.JobUnexpectedError.String())) - if err != nil { - return err - } - - err = deleteInProgressFile(jobKey) - if err != nil { - return err - } - - return nil -} - -func setTimedOutStatus(jobKey spec.JobKey) error { - err := config.AWS.UploadStringToS3("", config.Cluster.Bucket, path.Join(jobKey.Prefix(config.Cluster.ClusterName), status.JobTimedOut.String())) - if err != nil { - return err - } - - err = deleteInProgressFile(jobKey) - if err != nil { - return err - } - - return nil -} - -func getJobStatusFromJobState(jobState *JobState, k8sJob *kbatch.Job, pods []kcore.Pod) (*status.JobStatus, error) { - jobKey := jobState.JobKey - - jobSpec, err := operator.DownloadJobSpec(jobKey) - if err != nil { - 
return nil, err - } - - jobStatus := status.JobStatus{ - Job: *jobSpec, - EndTime: jobState.EndTime, - Status: jobState.Status, - } - - if jobState.Status.IsInProgress() { - queueMetrics, err := getQueueMetrics(jobKey) - if err != nil { - return nil, err - } - - jobStatus.BatchesInQueue = queueMetrics.TotalUserMessages() - - if jobState.Status == status.JobEnqueuing { - jobStatus.TotalBatchCount = queueMetrics.TotalUserMessages() - } - - if jobState.Status == status.JobRunning { - metrics, err := getRealTimeBatchMetrics(jobKey) - if err != nil { - return nil, err - } - jobStatus.BatchMetrics = metrics - - // There can be race conditions where the job state is temporarily out of sync with the cluster state - if k8sJob != nil { - workerCounts := getWorkerCountsForJob(*k8sJob, pods) - jobStatus.WorkerCounts = &workerCounts - } - } - } - - if jobState.Status.IsCompleted() { - metrics, err := getCompletedBatchMetrics(jobKey, jobSpec.StartTime, *jobState.EndTime) - if err != nil { - return nil, err - } - jobStatus.BatchMetrics = metrics - } - - return &jobStatus, nil -} - -func GetJobStatus(jobKey spec.JobKey) (*status.JobStatus, error) { - jobState, err := getJobState(jobKey) - if err != nil { - return nil, err - } - - k8sJob, err := config.K8s.GetJob(jobKey.K8sName()) - if err != nil { - return nil, err - } - - pods, err := config.K8s.ListPodsByLabels(map[string]string{"apiName": jobKey.APIName, "jobID": jobKey.ID}) - if err != nil { - return nil, err - } - - return getJobStatusFromJobState(jobState, k8sJob, pods) -} - -func getJobStatusFromK8sJob(jobKey spec.JobKey, k8sJob *kbatch.Job, pods []kcore.Pod) (*status.JobStatus, error) { - jobState, err := getJobState(jobKey) - if err != nil { - return nil, err - } - - return getJobStatusFromJobState(jobState, k8sJob, pods) -} diff --git a/pkg/operator/resources/batchapi/api.go b/pkg/operator/resources/job/batchapi/api.go similarity index 79% rename from pkg/operator/resources/batchapi/api.go rename to 
pkg/operator/resources/job/batchapi/api.go index 6e02cf5f55..ceae3b3a0f 100644 --- a/pkg/operator/resources/batchapi/api.go +++ b/pkg/operator/resources/job/batchapi/api.go @@ -26,6 +26,7 @@ import ( "github.com/cortexlabs/cortex/pkg/operator/config" "github.com/cortexlabs/cortex/pkg/operator/lib/routines" "github.com/cortexlabs/cortex/pkg/operator/operator" + "github.com/cortexlabs/cortex/pkg/operator/resources/job" "github.com/cortexlabs/cortex/pkg/operator/schema" "github.com/cortexlabs/cortex/pkg/types/spec" "github.com/cortexlabs/cortex/pkg/types/status" @@ -33,8 +34,6 @@ import ( istioclientnetworking "istio.io/client-go/pkg/apis/networking/v1beta1" kbatch "k8s.io/api/batch/v1" kcore "k8s.io/api/core/v1" - kmeta "k8s.io/apimachinery/pkg/apis/meta/v1" - klabels "k8s.io/apimachinery/pkg/labels" ) func UpdateAPI(apiConfig *userconfig.API, projectID string) (*spec.API, string, error) { @@ -43,7 +42,7 @@ func UpdateAPI(apiConfig *userconfig.API, projectID string) (*spec.API, string, return nil, "", err } - api := spec.GetAPISpec(apiConfig, projectID, "", config.Cluster.ClusterName) // Deployment ID not needed for BatchAPI spec + api := spec.GetAPISpec(apiConfig, projectID, "", config.ClusterName()) // Deployment ID not needed for BatchAPI spec if prevVirtualService == nil { if err := config.AWS.UploadJSONToS3(api, config.Cluster.Bucket, api.Key); err != nil { @@ -101,48 +100,33 @@ func DeleteAPI(apiName string, keepCache bool) error { return nil } -func deleteK8sResources(apiName string) error { - return parallel.RunFirstErr( - func() error { - _, err := config.K8s.DeleteJobs(&kmeta.ListOptions{ - LabelSelector: klabels.SelectorFromSet(map[string]string{"apiName": apiName}).String(), - }) - return err - }, - func() error { - _, err := config.K8s.DeleteVirtualService(operator.K8sName(apiName)) - return err - }, - ) -} - func 
deleteS3Resources(apiName string) error { return parallel.RunFirstErr( func() error { - prefix := filepath.Join(config.Cluster.ClusterName, "apis", apiName) + prefix := filepath.Join(config.ClusterName(), "apis", apiName) return config.AWS.DeleteS3Dir(config.Cluster.Bucket, prefix, true) }, func() error { - prefix := spec.BatchAPIJobPrefix(apiName, config.Cluster.ClusterName) + prefix := spec.JobAPIPrefix(config.ClusterName(), userconfig.BatchAPIKind, apiName) routines.RunWithPanicHandler(func() { config.AWS.DeleteS3Dir(config.Cluster.Bucket, prefix, true) // deleting job files may take a while }) return nil }, func() error { - deleteAllInProgressFilesByAPI(apiName) // not useful xml error is thrown, swallow the error + _ = job.DeleteAllInProgressFilesByAPI(userconfig.BatchAPIKind, apiName) // not useful xml error is thrown, swallow the error return nil }, ) } -// Returns all batch apis, for each API returning the most recently submitted job and all running jobs +// GetAllAPIs returns all batch apis, for each API returning the most recently submitted job and all running jobs func GetAllAPIs(virtualServices []istioclientnetworking.VirtualService, k8sJobs []kbatch.Job, pods []kcore.Pod) ([]schema.APIResponse, error) { batchAPIsMap := map[string]*schema.APIResponse{} jobIDToK8sJobMap := map[string]*kbatch.Job{} - for _, job := range k8sJobs { - jobIDToK8sJobMap[job.Labels["jobID"]] = &job + for _, kJob := range k8sJobs { + jobIDToK8sJobMap[kJob.Labels["jobID"]] = &kJob } jobIDToPodsMap := map[string][]kcore.Pod{} @@ -166,9 +150,9 @@ func GetAllAPIs(virtualServices []istioclientnetworking.VirtualService, k8sJobs return nil, err } - jobStates, err := getMostRecentlySubmittedJobStates(apiName, 1) + jobStates, err := job.GetMostRecentlySubmittedJobStates(apiName, 1, userconfig.BatchAPIKind) - jobStatuses := []status.JobStatus{} + var jobStatuses []status.BatchJobStatus if len(jobStates) > 0 { jobStatus, err := getJobStatusFromJobState(jobStates[0], 
jobIDToK8sJobMap[jobStates[0].ID], jobIDToPodsMap[jobStates[0].ID]) if err != nil { @@ -179,13 +163,13 @@ func GetAllAPIs(virtualServices []istioclientnetworking.VirtualService, k8sJobs } batchAPIsMap[apiName] = &schema.APIResponse{ - Spec: *api, - Endpoint: endpoint, - JobStatuses: jobStatuses, + Spec: *api, + Endpoint: endpoint, + BatchJobStatuses: jobStatuses, } } - inProgressJobKeys, err := listAllInProgressJobKeys() + inProgressJobKeys, err := job.ListAllInProgressJobKeys(userconfig.BatchAPIKind) if err != nil { return nil, err } @@ -197,7 +181,7 @@ func GetAllAPIs(virtualServices []istioclientnetworking.VirtualService, k8sJobs continue } - for _, jobStatus := range batchAPIsMap[jobKey.APIName].JobStatuses { + for _, jobStatus := range batchAPIsMap[jobKey.APIName].BatchJobStatuses { if jobStatus.ID == jobKey.ID { alreadyAdded = true break @@ -214,7 +198,7 @@ func GetAllAPIs(virtualServices []istioclientnetworking.VirtualService, k8sJobs } if jobStatus.Status.IsInProgress() { - batchAPIsMap[jobKey.APIName].JobStatuses = append(batchAPIsMap[jobKey.APIName].JobStatuses, *jobStatus) + batchAPIsMap[jobKey.APIName].BatchJobStatuses = append(batchAPIsMap[jobKey.APIName].BatchJobStatuses, *jobStatus) } } @@ -242,8 +226,8 @@ func GetAPIByName(deployedResource *operator.DeployedResource) ([]schema.APIResp } jobIDToK8sJobMap := map[string]*kbatch.Job{} - for _, job := range k8sJobs { - jobIDToK8sJobMap[job.Labels["jobID"]] = &job + for _, kJob := range k8sJobs { + jobIDToK8sJobMap[kJob.Labels["jobID"]] = &kJob } endpoint, err := operator.APIEndpoint(api) @@ -261,12 +245,12 @@ func GetAPIByName(deployedResource *operator.DeployedResource) ([]schema.APIResp jobIDToPodsMap[pod.Labels["jobID"]] = append(jobIDToPodsMap[pod.Labels["jobID"]], pod) } - inProgressJobKeys, err := listAllInProgressJobKeysByAPI(deployedResource.Name) + inProgressJobKeys, err := job.ListAllInProgressJobKeysByAPI(userconfig.BatchAPIKind, deployedResource.Name) if err != nil { return nil, err } - 
jobStatuses := []status.JobStatus{} + var jobStatuses []status.BatchJobStatus jobIDSet := strset.New() for _, jobKey := range inProgressJobKeys { jobStatus, err := getJobStatusFromK8sJob(jobKey, jobIDToK8sJobMap[jobKey.ID], jobIDToPodsMap[jobKey.ID]) @@ -279,7 +263,7 @@ func GetAPIByName(deployedResource *operator.DeployedResource) ([]schema.APIResp } if len(jobStatuses) < 10 { - jobStates, err := getMostRecentlySubmittedJobStates(deployedResource.Name, 10+len(jobStatuses)) + jobStates, err := job.GetMostRecentlySubmittedJobStates(deployedResource.Name, 10+len(jobStatuses), userconfig.BatchAPIKind) if err != nil { return nil, err } @@ -303,9 +287,9 @@ func GetAPIByName(deployedResource *operator.DeployedResource) ([]schema.APIResp return []schema.APIResponse{ { - Spec: *api, - JobStatuses: jobStatuses, - Endpoint: endpoint, + Spec: *api, + BatchJobStatuses: jobStatuses, + Endpoint: endpoint, }, }, nil } diff --git a/pkg/operator/resources/batchapi/batch_enqueuer.go b/pkg/operator/resources/job/batchapi/batch_enqueuer.go similarity index 100% rename from pkg/operator/resources/batchapi/batch_enqueuer.go rename to pkg/operator/resources/job/batchapi/batch_enqueuer.go diff --git a/pkg/operator/resources/batchapi/manage_resources_cron.go b/pkg/operator/resources/job/batchapi/cron.go similarity index 86% rename from pkg/operator/resources/batchapi/manage_resources_cron.go rename to pkg/operator/resources/job/batchapi/cron.go index 2e880ebfa2..3ac97e486b 100644 --- a/pkg/operator/resources/batchapi/manage_resources_cron.go +++ b/pkg/operator/resources/job/batchapi/cron.go @@ -29,9 +29,13 @@ import ( "github.com/cortexlabs/cortex/pkg/lib/telemetry" "github.com/cortexlabs/cortex/pkg/operator/config" "github.com/cortexlabs/cortex/pkg/operator/operator" + "github.com/cortexlabs/cortex/pkg/operator/resources/job" "github.com/cortexlabs/cortex/pkg/types/spec" 
"github.com/cortexlabs/cortex/pkg/types/status" + "github.com/cortexlabs/cortex/pkg/types/userconfig" kbatch "k8s.io/api/batch/v1" + kmeta "k8s.io/apimachinery/pkg/apis/meta/v1" + klabels "k8s.io/apimachinery/pkg/labels" ) const ( @@ -41,11 +45,11 @@ const ( _k8sJobExistenceGracePeriod = 10 * time.Second ) -var _jobsToDelete strset.Set = strset.New() -var _inProgressJobSpecMap = map[string]*spec.Job{} +var _jobsToDelete = strset.New() +var _inProgressJobSpecMap = map[string]*spec.BatchJob{} func ManageJobResources() error { - inProgressJobKeys, err := listAllInProgressJobKeys() + inProgressJobKeys, err := job.ListAllInProgressJobKeys(userconfig.BatchAPIKind) if err != nil { return err } @@ -75,17 +79,22 @@ func ManageJobResources() error { queueURLMap[jobKey.ID] = queueURL } - jobs, err := config.K8s.ListJobs(nil) + jobs, err := config.K8s.ListJobs( + &kmeta.ListOptions{ + LabelSelector: klabels.SelectorFromSet( + map[string]string{"apiKind": userconfig.BatchAPIKind.String()}, + ).String(), + }, + ) if err != nil { return err } k8sJobMap := map[string]*kbatch.Job{} k8sJobIDSet := strset.Set{} - for i := range jobs { - job := jobs[i] - k8sJobMap[job.Labels["jobID"]] = &job - k8sJobIDSet.Add(job.Labels["jobID"]) + for _, kJob := range jobs { + k8sJobMap[kJob.Labels["jobID"]] = &kJob + k8sJobIDSet.Add(kJob.Labels["jobID"]) } for _, jobKey := range inProgressJobKeys { @@ -103,12 +112,12 @@ func ManageJobResources() error { continue } - jobState, err := getJobState(jobKey) + jobState, err := job.GetJobState(jobKey) if err != nil { - jobLogger.Error(err.Error()) + jobLogger.Error(err) jobLogger.Error("terminating job and cleaning up job resources") err := errors.FirstError( - deleteInProgressFile(jobKey), + job.DeleteInProgressFile(jobKey), deleteJobRuntimeResources(jobKey), ) if err != nil { @@ -121,8 +130,8 @@ func ManageJobResources() error { if !jobState.Status.IsInProgress() { // best effort cleanup - 
deleteInProgressFile(jobKey) - deleteJobRuntimeResources(jobKey) + _ = job.DeleteInProgressFile(jobKey) + _ = deleteJobRuntimeResources(jobKey) continue } @@ -134,7 +143,7 @@ func ManageJobResources() error { } if newStatusCode != jobState.Status { jobLogger.Error(msg) - err := setStatusForJob(jobKey, newStatusCode) + err := job.SetStatusForJob(jobKey, newStatusCode) if err != nil { telemetry.Error(err) operatorLogger.Error(err) @@ -147,18 +156,17 @@ func ManageJobResources() error { } if _, ok := _inProgressJobSpecMap[jobKey.ID]; !ok { - jobSpec, err := operator.DownloadJobSpec(jobKey) + jobSpec, err := operator.DownloadBatchJobSpec(jobKey) if err != nil { - jobLogger.Error(err.Error()) + jobLogger.Error(err) jobLogger.Error("terminating job and cleaning up job resources") err := errors.FirstError( - deleteInProgressFile(jobKey), + job.DeleteInProgressFile(jobKey), deleteJobRuntimeResources(jobKey), ) if err != nil { telemetry.Error(err) operatorLogger.Error(err) - continue } continue } @@ -170,7 +178,7 @@ func ManageJobResources() error { if jobSpec.Timeout != nil && time.Since(jobSpec.StartTime) > time.Second*time.Duration(*jobSpec.Timeout) { jobLogger.Errorf("terminating job after exceeding the specified timeout of %d seconds", *jobSpec.Timeout) err := errors.FirstError( - setTimedOutStatus(jobKey), + job.SetTimedOutStatus(jobKey), deleteJobRuntimeResources(jobKey), ) if err != nil { @@ -241,7 +249,7 @@ func ManageJobResources() error { } // verifies that queue exists for an in progress job and k8s job exists for a job in running status, if verification fails return the a job code to reflect the state -func reconcileInProgressJob(jobState *JobState, queueURL *string, k8sJob *kbatch.Job) (status.JobCode, string, error) { +func reconcileInProgressJob(jobState *job.State, queueURL *string, k8sJob *kbatch.Job) (status.JobCode, string, error) { jobKey := jobState.JobKey if queueURL == nil { @@ -258,7 +266,7 @@ func reconcileInProgressJob(jobState *JobState, queueURL 
*string, k8sJob *kbatch return status.JobUnexpectedError, fmt.Sprintf("terminating job %s; sqs queue with url %s was not found", jobKey.UserString(), expectedQueueURL), nil } - if jobState.Status == status.JobEnqueuing && time.Since(jobState.LastUpdatedMap[_enqueuingLivenessFile]) >= _enqueuingLivenessPeriod+_enqueuingLivenessBuffer { + if jobState.Status == status.JobEnqueuing && time.Since(jobState.LastUpdatedMap[job.LivenessFile()]) >= _enqueuingLivenessPeriod+_enqueuingLivenessBuffer { return status.JobEnqueueFailed, fmt.Sprintf("terminating job %s; enqueuing liveness check failed", jobKey.UserString()), nil } @@ -277,7 +285,7 @@ func reconcileInProgressJob(jobState *JobState, queueURL *string, k8sJob *kbatch func checkIfJobCompleted(jobKey spec.JobKey, queueURL string, k8sJob *kbatch.Job) error { if int(k8sJob.Status.Failed) > 0 { - return investigateJobFailure(jobKey, k8sJob) + return investigateJobFailure(jobKey) } queueMessages, err := getQueueMetricsFromURL(queueURL) @@ -297,7 +305,7 @@ func checkIfJobCompleted(jobKey spec.JobKey, queueURL string, k8sJob *kbatch.Job _jobsToDelete.Remove(jobKey.ID) jobLogger.Error("unexpected job status because cluster state indicates job has completed but metrics indicate that job is still in progress") return errors.FirstError( - setUnexpectedErrorStatus(jobKey), + job.SetUnexpectedErrorStatus(jobKey), deleteJobRuntimeResources(jobKey), ) } @@ -311,7 +319,7 @@ func checkIfJobCompleted(jobKey spec.JobKey, queueURL string, k8sJob *kbatch.Job return err } - jobSpec, err := operator.DownloadJobSpec(jobKey) + jobSpec, err := operator.DownloadBatchJobSpec(jobKey) if err != nil { return err } @@ -319,7 +327,7 @@ func checkIfJobCompleted(jobKey spec.JobKey, queueURL string, k8sJob *kbatch.Job if jobSpec.TotalBatchCount == batchMetrics.Succeeded { _jobsToDelete.Remove(jobKey.ID) return errors.FirstError( - setSucceededStatus(jobKey), + job.SetSucceededStatus(jobKey), deleteJobRuntimeResources(jobKey), ) } @@ -328,7 +336,7 @@ func 
checkIfJobCompleted(jobKey spec.JobKey, queueURL string, k8sJob *kbatch.Job if _jobsToDelete.Has(jobKey.ID) { _jobsToDelete.Remove(jobKey.ID) return errors.FirstError( - setCompletedWithFailuresStatus(jobKey), + job.SetCompletedWithFailuresStatus(jobKey), deleteJobRuntimeResources(jobKey), ) } @@ -341,7 +349,7 @@ func checkIfJobCompleted(jobKey spec.JobKey, queueURL string, k8sJob *kbatch.Job return nil } -func investigateJobFailure(jobKey spec.JobKey, k8sJob *kbatch.Job) error { +func investigateJobFailure(jobKey spec.JobKey) error { reasonFound := false jobLogger, err := operator.GetJobLogger(jobKey) @@ -354,7 +362,7 @@ func investigateJobFailure(jobKey spec.JobKey, k8sJob *kbatch.Job) error { if k8s.WasPodOOMKilled(&pod) { jobLogger.Error("at least one worker was killed because it ran out of out of memory") return errors.FirstError( - setWorkerOOMStatus(jobKey), + job.SetWorkerOOMStatus(jobKey), deleteJobRuntimeResources(jobKey), ) } @@ -380,7 +388,7 @@ func investigateJobFailure(jobKey spec.JobKey, k8sJob *kbatch.Job) error { return errors.FirstError( err, - setWorkerErrorStatus(jobKey), + job.SetWorkerErrorStatus(jobKey), deleteJobRuntimeResources(jobKey), ) } diff --git a/pkg/operator/resources/batchapi/enqueue.go b/pkg/operator/resources/job/batchapi/enqueue.go similarity index 90% rename from pkg/operator/resources/batchapi/enqueue.go rename to pkg/operator/resources/job/batchapi/enqueue.go index 69d0c33936..0aebf995b0 100644 --- a/pkg/operator/resources/batchapi/enqueue.go +++ b/pkg/operator/resources/job/batchapi/enqueue.go @@ -21,7 +21,6 @@ import ( "encoding/json" "fmt" "io" - "path" "time" "github.com/aws/aws-sdk-go/aws" @@ -35,12 +34,12 @@ import ( "github.com/cortexlabs/cortex/pkg/operator/config" "github.com/cortexlabs/cortex/pkg/operator/lib/logging" "github.com/cortexlabs/cortex/pkg/operator/operator" + 
"github.com/cortexlabs/cortex/pkg/operator/resources/job" "github.com/cortexlabs/cortex/pkg/operator/schema" "github.com/cortexlabs/cortex/pkg/types/spec" ) const ( - _enqueuingLivenessFile = "enqueuing_liveness" _enqueuingLivenessPeriod = 20 * time.Second _s3DownloadChunkSize = 32 * 1024 * 1024 ) @@ -51,18 +50,9 @@ func randomMessageID() string { return random.String(40) // maximum is 80 (for sqs.SendMessageBatchRequestEntry.Id) but this ID may show up in a user error message } -func updateLiveness(jobKey spec.JobKey) error { - s3Key := path.Join(jobKey.Prefix(config.Cluster.ClusterName), _enqueuingLivenessFile) - err := config.AWS.UploadJSONToS3(time.Now(), config.Cluster.Bucket, s3Key) - if err != nil { - return errors.Wrap(err, "failed to update liveness", jobKey.UserString()) - } - return nil -} - -func enqueue(jobSpec *spec.Job, submission *schema.JobSubmission) (int, error) { +func enqueue(jobSpec *spec.BatchJob, submission *schema.BatchJobSubmission) (int, error) { livenessUpdater := func() error { - return updateLiveness(jobSpec.JobKey) + return job.UpdateLiveness(jobSpec.JobKey) } livenessCron := cron.Run(livenessUpdater, operator.ErrorHandler(fmt.Sprintf("liveness check for %s", jobSpec.UserString())), _enqueuingLivenessPeriod) @@ -115,7 +105,7 @@ func enqueue(jobSpec *spec.Job, submission *schema.JobSubmission) (int, error) { return totalBatches, nil } -func enqueueItems(jobSpec *spec.Job, itemList *schema.ItemList) (int, error) { +func enqueueItems(jobSpec *spec.BatchJob, itemList *schema.ItemList) (int, error) { batchCount := len(itemList.Items) / itemList.BatchSize if len(itemList.Items)%itemList.BatchSize != 0 { batchCount++ @@ -165,7 +155,7 @@ func enqueueItems(jobSpec *spec.Job, itemList *schema.ItemList) (int, error) { return uploader.TotalBatches, nil } -func enqueueS3Paths(jobSpec *spec.Job, s3PathsLister *schema.FilePathLister) (int, error) { +func enqueueS3Paths(jobSpec 
*spec.BatchJob, s3PathsLister *schema.FilePathLister) (int, error) { jobLogger, err := operator.GetJobLogger(jobSpec.JobKey) if err != nil { return 0, err @@ -248,7 +238,7 @@ func (j *jsonBuffer) Length() int { return len(j.messageList) } -func enqueueS3FileContents(jobSpec *spec.Job, delimitedFiles *schema.DelimitedFiles) (int, error) { +func enqueueS3FileContents(jobSpec *spec.BatchJob, delimitedFiles *schema.DelimitedFiles) (int, error) { jobLogger, err := operator.GetJobLogger(jobSpec.JobKey) if err != nil { return 0, err @@ -305,7 +295,7 @@ func enqueueS3FileContents(jobSpec *spec.Job, delimitedFiles *schema.DelimitedFi return uploader.TotalBatches, nil } -func streamJSONToQueue(jobSpec *spec.Job, uploader *sqsBatchUploader, bytesBuffer *bytes.Buffer, jsonMessageList *jsonBuffer, itemIndex *int) error { +func streamJSONToQueue(jobSpec *spec.BatchJob, uploader *sqsBatchUploader, bytesBuffer *bytes.Buffer, jsonMessageList *jsonBuffer, itemIndex *int) error { jobLogger, err := operator.GetJobLogger(jobSpec.JobKey) if err != nil { return err diff --git a/pkg/operator/resources/batchapi/errors.go b/pkg/operator/resources/job/batchapi/errors.go similarity index 57% rename from pkg/operator/resources/batchapi/errors.go rename to pkg/operator/resources/job/batchapi/errors.go index c48db380f6..7f5b23904c 100644 --- a/pkg/operator/resources/batchapi/errors.go +++ b/pkg/operator/resources/job/batchapi/errors.go @@ -20,44 +20,16 @@ import ( "fmt" "github.com/cortexlabs/cortex/pkg/lib/errors" - s "github.com/cortexlabs/cortex/pkg/lib/strings" - "github.com/cortexlabs/cortex/pkg/types/spec" ) const ( - ErrJobNotFound = "batchapi.job_not_found" - ErrJobIsNotInProgress = "batchapi.job_is_not_in_progress" - ErrJobHasAlreadyBeenStopped = "batchapi.job_has_already_been_stopped" ErrNoS3FilesFound = "batchapi.no_s3_files_found" ErrNoDataFoundInJobSubmission = "batchapi.no_data_found_in_job_submission" 
ErrFailedToEnqueueMessages = "batchapi.failed_to_enqueue_messages" ErrMessageExceedsMaxSize = "batchapi.message_exceeds_max_size" - ErrConflictingFields = "batchapi.conflicting_fields" ErrBatchItemSizeExceedsLimit = "batchapi.item_size_exceeds_limit" - ErrSpecifyExactlyOneKey = "batchapi.specify_exactly_one_key" ) -func ErrorJobNotFound(jobKey spec.JobKey) error { - return errors.WithStack(&errors.Error{ - Kind: ErrJobNotFound, - Message: fmt.Sprintf("unable to find batch job %s", jobKey.UserString()), - }) -} - -func ErrorJobIsNotInProgress() error { - return errors.WithStack(&errors.Error{ - Kind: ErrJobIsNotInProgress, - Message: "cannot stop batch job because it is not in progress", - }) -} - -func ErrorJobHasAlreadyBeenStopped() error { - return errors.WithStack(&errors.Error{ - Kind: ErrJobHasAlreadyBeenStopped, - Message: "batch job has already been stopped", - }) -} - func ErrorNoS3FilesFound() error { return errors.WithStack(&errors.Error{ Kind: ErrNoS3FilesFound, @@ -86,26 +58,9 @@ func ErrorMessageExceedsMaxSize(messageSize int, messageLimit int) error { }) } -func ErrorConflictingFields(key string, keys ...string) error { - allKeys := append([]string{key}, keys...) - - return errors.WithStack(&errors.Error{ - Kind: ErrConflictingFields, - Message: fmt.Sprintf("please specify either the %s field (but not more than one at the same time)", s.StrsOr(allKeys)), - }) -} - func ErrorItemSizeExceedsLimit(index int, size int, limit int) error { return errors.WithStack(&errors.Error{ Kind: ErrBatchItemSizeExceedsLimit, Message: fmt.Sprintf("item %d has size %d bytes which exceeds the limit (%d bytes)", index, size, limit), }) } - -func ErrorSpecifyExactlyOneKey(key string, keys ...string) error { - allKeys := append([]string{key}, keys...) 
- return errors.WithStack(&errors.Error{ - Kind: ErrSpecifyExactlyOneKey, - Message: fmt.Sprintf("specify exactly one of the following keys: %s", s.StrsOr(allKeys)), - }) -} diff --git a/pkg/operator/resources/batchapi/job.go b/pkg/operator/resources/job/batchapi/job.go similarity index 77% rename from pkg/operator/resources/batchapi/job.go rename to pkg/operator/resources/job/batchapi/job.go index c2fb677580..c8b15ad1d6 100644 --- a/pkg/operator/resources/batchapi/job.go +++ b/pkg/operator/resources/job/batchapi/job.go @@ -24,13 +24,12 @@ import ( "github.com/cortexlabs/cortex/pkg/operator/config" "github.com/cortexlabs/cortex/pkg/operator/lib/routines" "github.com/cortexlabs/cortex/pkg/operator/operator" + "github.com/cortexlabs/cortex/pkg/operator/resources/job" "github.com/cortexlabs/cortex/pkg/operator/schema" "github.com/cortexlabs/cortex/pkg/types/spec" - kmeta "k8s.io/apimachinery/pkg/apis/meta/v1" - klabels "k8s.io/apimachinery/pkg/labels" ) -func DryRun(submission *schema.JobSubmission) ([]string, error) { +func DryRun(submission *schema.BatchJobSubmission) ([]string, error) { err := validateJobSubmission(submission) if err != nil { return nil, err @@ -57,7 +56,7 @@ func DryRun(submission *schema.JobSubmission) ([]string, error) { return nil, nil } -func SubmitJob(apiName string, submission *schema.JobSubmission) (*spec.Job, error) { +func SubmitJob(apiName string, submission *schema.BatchJobSubmission) (*spec.BatchJob, error) { err := validateJobSubmission(submission) if err != nil { return nil, err @@ -80,6 +79,7 @@ func SubmitJob(apiName string, submission *schema.JobSubmission) (*spec.Job, err jobKey := spec.JobKey{ APIName: apiSpec.Name, ID: jobID, + Kind: apiSpec.Kind, } tags := map[string]string{ @@ -93,14 +93,14 @@ func SubmitJob(apiName string, submission *schema.JobSubmission) (*spec.Job, err return nil, err } - jobSpec := 
spec.Job{ - RuntimeJobConfig: submission.RuntimeJobConfig, - JobKey: jobKey, - APIID: apiSpec.ID, - SpecID: apiSpec.SpecID, - PredictorID: apiSpec.PredictorID, - SQSUrl: queueURL, - StartTime: time.Now(), + jobSpec := spec.BatchJob{ + RuntimeBatchJobConfig: submission.RuntimeBatchJobConfig, + JobKey: jobKey, + APIID: apiSpec.ID, + SpecID: apiSpec.SpecID, + PredictorID: apiSpec.PredictorID, + SQSUrl: queueURL, + StartTime: time.Now(), } err = uploadJobSpec(&jobSpec) @@ -115,7 +115,7 @@ func SubmitJob(apiName string, submission *schema.JobSubmission) (*spec.Job, err return nil, err } - err = setEnqueuingStatus(jobKey) + err = job.SetEnqueuingStatus(jobKey) if err != nil { deleteQueueByURL(queueURL) return nil, err @@ -130,15 +130,15 @@ func SubmitJob(apiName string, submission *schema.JobSubmission) (*spec.Job, err return &jobSpec, nil } -func uploadJobSpec(jobSpec *spec.Job) error { - err := config.AWS.UploadJSONToS3(jobSpec, config.Cluster.Bucket, jobSpec.SpecFilePath(config.Cluster.ClusterName)) +func uploadJobSpec(jobSpec *spec.BatchJob) error { + err := config.AWS.UploadJSONToS3(jobSpec, config.Cluster.Bucket, jobSpec.SpecFilePath(config.ClusterName())) if err != nil { return err } return nil } -func deployJob(apiSpec *spec.API, jobSpec *spec.Job, submission *schema.JobSubmission) { +func deployJob(apiSpec *spec.API, jobSpec *spec.BatchJob, submission *schema.BatchJobSubmission) { jobLogger, err := operator.GetJobLoggerFromSpec(apiSpec, jobSpec.JobKey) if err != nil { telemetry.Error(err) @@ -151,7 +151,7 @@ func deployJob(apiSpec *spec.API, jobSpec *spec.Job, submission *schema.JobSubmi jobLogger.Error(errors.Wrap(err, "failed to enqueue all batches").Error()) err := errors.FirstError( - setEnqueueFailedStatus(jobSpec.JobKey), + job.SetEnqueueFailedStatus(jobSpec.JobKey), deleteJobRuntimeResources(jobSpec.JobKey), ) if err != nil { @@ -167,7 +167,7 @@ func deployJob(apiSpec *spec.API, jobSpec *spec.Job, submission *schema.JobSubmi if submission.DelimitedFiles 
!= nil { jobLogger.Error("please verify that the files are not empty (the files being read can be retrieved by providing `dryRun=true` query param with your job submission") } - errs = append(errs, setEnqueueFailedStatus(jobSpec.JobKey)) + errs = append(errs, job.SetEnqueueFailedStatus(jobSpec.JobKey)) errs = append(errs, deleteJobRuntimeResources(jobSpec.JobKey)) err := errors.FirstError(errs...) @@ -194,7 +194,7 @@ func deployJob(apiSpec *spec.API, jobSpec *spec.Job, submission *schema.JobSubmi handleJobSubmissionError(jobSpec.JobKey, err) } - err = setRunningStatus(jobSpec.JobKey) + err = job.SetRunningStatus(jobSpec.JobKey) if err != nil { handleJobSubmissionError(jobSpec.JobKey, err) return @@ -211,7 +211,7 @@ func handleJobSubmissionError(jobKey spec.JobKey, jobErr error) { jobLogger.Error(jobErr.Error()) err = errors.FirstError( - setUnexpectedErrorStatus(jobKey), + job.SetUnexpectedErrorStatus(jobKey), deleteJobRuntimeResources(jobKey), ) if err != nil { @@ -220,31 +220,6 @@ func handleJobSubmissionError(jobKey spec.JobKey, jobErr error) { } } -func createK8sJob(apiSpec *spec.API, jobSpec *spec.Job) error { - job, err := k8sJobSpec(apiSpec, jobSpec) - if err != nil { - return err - } - - _, err = config.K8s.CreateJob(job) - if err != nil { - return err - } - - return nil -} - -func deleteK8sJob(jobKey spec.JobKey) error { - _, err := config.K8s.DeleteJobs(&kmeta.ListOptions{ - LabelSelector: klabels.SelectorFromSet(map[string]string{"apiName": jobKey.APIName, "jobID": jobKey.ID}).String(), - }) - if err != nil { - return err - } - - return nil -} - func deleteJobRuntimeResources(jobKey spec.JobKey) error { err := errors.FirstError( deleteK8sJob(jobKey), @@ -259,7 +234,7 @@ func deleteJobRuntimeResources(jobKey spec.JobKey) error { } func StopJob(jobKey spec.JobKey) error { - jobState, err := getJobState(jobKey) + jobState, err := job.GetJobState(jobKey) if err != nil { routines.RunWithPanicHandler(func() { deleteJobRuntimeResources(jobKey) @@ -271,7 +246,7 
@@ func StopJob(jobKey spec.JobKey) error { routines.RunWithPanicHandler(func() { deleteJobRuntimeResources(jobKey) }) - return errors.Wrap(ErrorJobIsNotInProgress(), jobKey.UserString()) + return errors.Wrap(job.ErrorJobIsNotInProgress(jobKey.Kind), jobKey.UserString()) } jobLogger, err := operator.GetJobLogger(jobKey) @@ -281,6 +256,6 @@ func StopJob(jobKey spec.JobKey) error { return errors.FirstError( deleteJobRuntimeResources(jobKey), - setStoppedStatus(jobKey), + job.SetStoppedStatus(jobKey), ) } diff --git a/pkg/operator/resources/job/batchapi/job_status.go b/pkg/operator/resources/job/batchapi/job_status.go new file mode 100644 index 0000000000..1ceefdf73a --- /dev/null +++ b/pkg/operator/resources/job/batchapi/job_status.go @@ -0,0 +1,107 @@ +/* +Copyright 2021 Cortex Labs, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package batchapi + +import ( + "github.com/cortexlabs/cortex/pkg/operator/config" + "github.com/cortexlabs/cortex/pkg/operator/operator" + "github.com/cortexlabs/cortex/pkg/operator/resources/job" + "github.com/cortexlabs/cortex/pkg/types/spec" + "github.com/cortexlabs/cortex/pkg/types/status" + kbatch "k8s.io/api/batch/v1" + kcore "k8s.io/api/core/v1" +) + +func GetJobStatus(jobKey spec.JobKey) (*status.BatchJobStatus, error) { + jobState, err := job.GetJobState(jobKey) + if err != nil { + return nil, err + } + + k8sJob, err := config.K8s.GetJob(jobKey.K8sName()) + if err != nil { + return nil, err + } + + pods, err := config.K8s.ListPodsByLabels(map[string]string{"apiName": jobKey.APIName, "jobID": jobKey.ID}) + if err != nil { + return nil, err + } + + return getJobStatusFromJobState(jobState, k8sJob, pods) +} + +func getJobStatusFromJobState(jobState *job.State, k8sJob *kbatch.Job, pods []kcore.Pod) (*status.BatchJobStatus, error) { + jobKey := jobState.JobKey + + jobSpec, err := operator.DownloadBatchJobSpec(jobKey) + if err != nil { + return nil, err + } + + jobStatus := status.BatchJobStatus{ + BatchJob: *jobSpec, + EndTime: jobState.EndTime, + Status: jobState.Status, + } + + if jobState.Status.IsInProgress() { + queueMetrics, err := getQueueMetrics(jobKey) + if err != nil { + return nil, err + } + + jobStatus.BatchesInQueue = queueMetrics.TotalUserMessages() + + if jobState.Status == status.JobEnqueuing { + jobStatus.TotalBatchCount = queueMetrics.TotalUserMessages() + } + + if jobState.Status == status.JobRunning { + metrics, err := getRealTimeBatchMetrics(jobKey) + if err != nil { + return nil, err + } + jobStatus.BatchMetrics = metrics + + // There can be race conditions where the job state is temporarily out of sync with the cluster state + if k8sJob != nil { + workerCounts := job.GetWorkerCountsForJob(*k8sJob, pods) + jobStatus.WorkerCounts = 
&workerCounts + } + } + } + + if jobState.Status.IsCompleted() { + metrics, err := getCompletedBatchMetrics(jobKey, jobSpec.StartTime, *jobState.EndTime) + if err != nil { + return nil, err + } + jobStatus.BatchMetrics = metrics + } + + return &jobStatus, nil +} + +func getJobStatusFromK8sJob(jobKey spec.JobKey, k8sJob *kbatch.Job, pods []kcore.Pod) (*status.BatchJobStatus, error) { + jobState, err := job.GetJobState(jobKey) + if err != nil { + return nil, err + } + + return getJobStatusFromJobState(jobState, k8sJob, pods) +} diff --git a/pkg/operator/resources/batchapi/k8s_specs.go b/pkg/operator/resources/job/batchapi/k8s_specs.go similarity index 81% rename from pkg/operator/resources/batchapi/k8s_specs.go rename to pkg/operator/resources/job/batchapi/k8s_specs.go index dac3e7bd6b..84ae83509c 100644 --- a/pkg/operator/resources/batchapi/k8s_specs.go +++ b/pkg/operator/resources/job/batchapi/k8s_specs.go @@ -20,6 +20,7 @@ import ( "path" "github.com/cortexlabs/cortex/pkg/lib/k8s" + "github.com/cortexlabs/cortex/pkg/lib/parallel" "github.com/cortexlabs/cortex/pkg/lib/pointer" "github.com/cortexlabs/cortex/pkg/operator/config" "github.com/cortexlabs/cortex/pkg/operator/operator" @@ -28,11 +29,13 @@ import ( istioclientnetworking "istio.io/client-go/pkg/apis/networking/v1beta1" kbatch "k8s.io/api/batch/v1" kcore "k8s.io/api/core/v1" + kmeta "k8s.io/apimachinery/pkg/apis/meta/v1" + klabels "k8s.io/apimachinery/pkg/labels" ) const _operatorService = "operator" -func k8sJobSpec(api *spec.API, job *spec.Job) (*kbatch.Job, error) { +func k8sJobSpec(api *spec.API, job *spec.BatchJob) (*kbatch.Job, error) { switch api.Predictor.Type { case userconfig.TensorFlowPredictorType: return tensorFlowPredictorJobSpec(api, job) @@ -45,13 +48,13 @@ func k8sJobSpec(api *spec.API, job *spec.Job) (*kbatch.Job, error) { } } -func pythonPredictorJobSpec(api *spec.API, job *spec.Job) 
(*kbatch.Job, error) { +func pythonPredictorJobSpec(api *spec.API, job *spec.BatchJob) (*kbatch.Job, error) { containers, volumes := operator.PythonPredictorContainers(api) for i, container := range containers { if container.Name == operator.APIContainerName { containers[i].Env = append(container.Env, kcore.EnvVar{ Name: "CORTEX_JOB_SPEC", - Value: "s3://" + config.Cluster.Bucket + "/" + job.SpecFilePath(config.Cluster.ClusterName), + Value: "s3://" + config.Cluster.Bucket + "/" + job.SpecFilePath(config.ClusterName()), }) } } @@ -94,13 +97,13 @@ func pythonPredictorJobSpec(api *spec.API, job *spec.Job) (*kbatch.Job, error) { }), nil } -func tensorFlowPredictorJobSpec(api *spec.API, job *spec.Job) (*kbatch.Job, error) { +func tensorFlowPredictorJobSpec(api *spec.API, job *spec.BatchJob) (*kbatch.Job, error) { containers, volumes := operator.TensorFlowPredictorContainers(api) for i, container := range containers { if container.Name == operator.APIContainerName { containers[i].Env = append(container.Env, kcore.EnvVar{ Name: "CORTEX_JOB_SPEC", - Value: "s3://" + config.Cluster.Bucket + "/" + job.SpecFilePath(config.Cluster.ClusterName), + Value: "s3://" + config.Cluster.Bucket + "/" + job.SpecFilePath(config.ClusterName()), }) } } @@ -143,14 +146,14 @@ func tensorFlowPredictorJobSpec(api *spec.API, job *spec.Job) (*kbatch.Job, erro }), nil } -func onnxPredictorJobSpec(api *spec.API, job *spec.Job) (*kbatch.Job, error) { +func onnxPredictorJobSpec(api *spec.API, job *spec.BatchJob) (*kbatch.Job, error) { containers, volumes := operator.ONNXPredictorContainers(api) for i, container := range containers { if container.Name == operator.APIContainerName { containers[i].Env = append(container.Env, kcore.EnvVar{ Name: "CORTEX_JOB_SPEC", - Value: "s3://" + config.Cluster.Bucket + "/" + job.SpecFilePath(config.Cluster.ClusterName), + Value: "s3://" + config.Cluster.Bucket + "/" + job.SpecFilePath(config.ClusterName()), }) } } @@ -226,3 +229,43 @@ func applyK8sResources(api 
*spec.API, prevVirtualService *istioclientnetworking. _, err := config.K8s.UpdateVirtualService(prevVirtualService, newVirtualService) return err } + +func deleteK8sResources(apiName string) error { + return parallel.RunFirstErr( + func() error { + _, err := config.K8s.DeleteJobs(&kmeta.ListOptions{ + LabelSelector: klabels.SelectorFromSet(map[string]string{"apiName": apiName}).String(), + }) + return err + }, + func() error { + _, err := config.K8s.DeleteVirtualService(operator.K8sName(apiName)) + return err + }, + ) +} + +func deleteK8sJob(jobKey spec.JobKey) error { + _, err := config.K8s.DeleteJobs(&kmeta.ListOptions{ + LabelSelector: klabels.SelectorFromSet(map[string]string{"apiName": jobKey.APIName, "jobID": jobKey.ID}).String(), + }) + if err != nil { + return err + } + + return nil +} + +func createK8sJob(apiSpec *spec.API, jobSpec *spec.BatchJob) error { + kJob, err := k8sJobSpec(apiSpec, jobSpec) + if err != nil { + return err + } + + _, err = config.K8s.CreateJob(kJob) + if err != nil { + return err + } + + return nil +} diff --git a/pkg/operator/resources/batchapi/metrics.go b/pkg/operator/resources/job/batchapi/metrics.go similarity index 96% rename from pkg/operator/resources/batchapi/metrics.go rename to pkg/operator/resources/job/batchapi/metrics.go index e7a76ca70c..70f34c389f 100644 --- a/pkg/operator/resources/batchapi/metrics.go +++ b/pkg/operator/resources/job/batchapi/metrics.go @@ -181,7 +181,7 @@ func batchMetricsDef(jobKey *spec.JobKey, period int64) []*cloudwatch.MetricData Label: aws.String("Succeeded"), MetricStat: &cloudwatch.MetricStat{ Metric: &cloudwatch.Metric{ - Namespace: aws.String(config.Cluster.ClusterName), + Namespace: aws.String(config.ClusterName()), MetricName: aws.String("Succeeded"), Dimensions: getJobDimensionsCounter(jobKey), }, @@ -194,7 +194,7 @@ func batchMetricsDef(jobKey *spec.JobKey, period int64) []*cloudwatch.MetricData Label: aws.String("Failed"), MetricStat: &cloudwatch.MetricStat{ Metric: 
&cloudwatch.Metric{ - Namespace: aws.String(config.Cluster.ClusterName), + Namespace: aws.String(config.ClusterName()), MetricName: aws.String("Failed"), Dimensions: getJobDimensionsCounter(jobKey), }, @@ -207,7 +207,7 @@ func batchMetricsDef(jobKey *spec.JobKey, period int64) []*cloudwatch.MetricData Label: aws.String("AverageTimePerBatch"), MetricStat: &cloudwatch.MetricStat{ Metric: &cloudwatch.Metric{ - Namespace: aws.String(config.Cluster.ClusterName), + Namespace: aws.String(config.ClusterName()), MetricName: aws.String("TimePerBatch"), Dimensions: getJobDimensionsHistogram(jobKey), }, @@ -220,7 +220,7 @@ func batchMetricsDef(jobKey *spec.JobKey, period int64) []*cloudwatch.MetricData Label: aws.String("Total"), MetricStat: &cloudwatch.MetricStat{ Metric: &cloudwatch.Metric{ - Namespace: aws.String(config.Cluster.ClusterName), + Namespace: aws.String(config.ClusterName()), MetricName: aws.String("TimePerBatch"), Dimensions: getJobDimensionsHistogram(jobKey), }, diff --git a/pkg/operator/resources/batchapi/queue.go b/pkg/operator/resources/job/batchapi/queue.go similarity index 97% rename from pkg/operator/resources/batchapi/queue.go rename to pkg/operator/resources/job/batchapi/queue.go index 3b44e7f11c..a3317cefb7 100644 --- a/pkg/operator/resources/batchapi/queue.go +++ b/pkg/operator/resources/job/batchapi/queue.go @@ -158,16 +158,16 @@ func getQueueMetricsFromURL(queueURL string) (*metrics.QueueMetrics, error) { return nil, errors.Wrap(err, "failed to get queue metrics") } - metrics := metrics.QueueMetrics{} + qMetrics := metrics.QueueMetrics{} parsedInt, ok := s.ParseInt(attributes["ApproximateNumberOfMessages"]) if ok { - metrics.Visible = parsedInt + qMetrics.Visible = parsedInt } parsedInt, ok = s.ParseInt(attributes["ApproximateNumberOfMessagesNotVisible"]) if ok { - metrics.NotVisible = parsedInt + qMetrics.NotVisible = parsedInt } - return &metrics, nil + return &qMetrics, nil } diff --git a/pkg/operator/resources/batchapi/s3_iterator.go 
b/pkg/operator/resources/job/batchapi/s3_iterator.go similarity index 100% rename from pkg/operator/resources/batchapi/s3_iterator.go rename to pkg/operator/resources/job/batchapi/s3_iterator.go diff --git a/pkg/operator/resources/batchapi/validations.go b/pkg/operator/resources/job/batchapi/validations.go similarity index 92% rename from pkg/operator/resources/batchapi/validations.go rename to pkg/operator/resources/job/batchapi/validations.go index de48e6afd2..d1ea72aff0 100644 --- a/pkg/operator/resources/batchapi/validations.go +++ b/pkg/operator/resources/job/batchapi/validations.go @@ -24,11 +24,12 @@ import ( awslib "github.com/cortexlabs/cortex/pkg/lib/aws" cr "github.com/cortexlabs/cortex/pkg/lib/configreader" "github.com/cortexlabs/cortex/pkg/lib/errors" + "github.com/cortexlabs/cortex/pkg/operator/resources/job" "github.com/cortexlabs/cortex/pkg/operator/schema" "github.com/gobwas/glob" ) -func validateJobSubmissionSchema(submission *schema.JobSubmission) error { +func validateJobSubmissionSchema(submission *schema.BatchJobSubmission) error { providedKeys := []string{} if submission.ItemList != nil { providedKeys = append(providedKeys, schema.ItemListKey) @@ -41,11 +42,11 @@ func validateJobSubmissionSchema(submission *schema.JobSubmission) error { } if len(providedKeys) == 0 { - return ErrorSpecifyExactlyOneKey(schema.ItemListKey, schema.FilePathListerKey, schema.DelimitedFilesKey) + return job.ErrorSpecifyExactlyOneKey(schema.ItemListKey, schema.FilePathListerKey, schema.DelimitedFilesKey) } if len(providedKeys) > 1 { - return ErrorConflictingFields(providedKeys[0], providedKeys[1:]...) + return job.ErrorConflictingFields(providedKeys[0], providedKeys[1:]...) 
} if submission.ItemList != nil { @@ -96,7 +97,7 @@ func validateJobSubmissionSchema(submission *schema.JobSubmission) error { return nil } -func validateJobSubmission(submission *schema.JobSubmission) error { +func validateJobSubmission(submission *schema.BatchJobSubmission) error { err := validateJobSubmissionSchema(submission) if err != nil { return errors.Append(err, fmt.Sprintf("\n\njob submission schema can be found at https://docs.cortex.dev/v/%s/", consts.CortexVersionMinor)) diff --git a/pkg/operator/resources/job/cache.go b/pkg/operator/resources/job/cache.go new file mode 100644 index 0000000000..25b02f4620 --- /dev/null +++ b/pkg/operator/resources/job/cache.go @@ -0,0 +1,121 @@ +/* +Copyright 2021 Cortex Labs, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package job + +import ( + "path" + "strings" + + "github.com/cortexlabs/cortex/pkg/operator/config" + "github.com/cortexlabs/cortex/pkg/types/spec" + "github.com/cortexlabs/cortex/pkg/types/userconfig" +) + +func ListAllInProgressJobKeysByAPI(kind userconfig.Kind, apiName string) ([]spec.JobKey, error) { + return listAllInProgressJobKeysByAPI(kind, &apiName) +} + +func ListAllInProgressJobKeys(kind userconfig.Kind) ([]spec.JobKey, error) { + return listAllInProgressJobKeysByAPI(kind, nil) +} + +func DeleteInProgressFile(jobKey spec.JobKey) error { + err := config.DeleteBucketFile(inProgressKey(jobKey)) + if err != nil { + return err + } + return nil +} + +func DeleteAllInProgressFilesByAPI(kind userconfig.Kind, apiName string) error { + err := config.DeleteBucketPrefix(allInProgressForAPIKey(kind, apiName), true) + if err != nil { + return err + } + return nil +} + +func listAllInProgressJobKeysByAPI(kind userconfig.Kind, apiName *string) ([]spec.JobKey, error) { + _, ok := _jobKinds[kind] + if !ok { + return nil, ErrorInvalidJobKind(kind) + } + + var jobPath string + if apiName != nil { + jobPath = allInProgressForAPIKey(kind, *apiName) + } else { + jobPath = allInProgressKey(kind) + } + + gcsObjects, s3Objects, err := config.ListBucketDir(jobPath, nil) + if err != nil { + return nil, err + } + + if len(gcsObjects) > 0 { + jobKeys := make([]spec.JobKey, 0, len(gcsObjects)) + for _, obj := range gcsObjects { + if obj != nil { + jobKeys = append(jobKeys, jobKeyFromInProgressKey(obj.Name)) + } + } + return jobKeys, nil + } + jobKeys := make([]spec.JobKey, 0, len(s3Objects)) + for _, obj := range s3Objects { + if obj != nil { + jobKeys = append(jobKeys, jobKeyFromInProgressKey(*obj.Key)) + } + } + return jobKeys, nil +} + +func uploadInProgressFile(jobKey spec.JobKey) error { + err := config.UploadStringToBucket("", inProgressKey(jobKey)) + if err != nil { + return err + } + return nil +} + +// e.g. 
/jobs//in_progress +func allInProgressKey(kind userconfig.Kind) string { + return path.Join( + config.ClusterName(), _jobsPrefix, kind.String(), _inProgressFilePrefix, + ) +} + +// e.g. /jobs//in_progress/ +func allInProgressForAPIKey(kind userconfig.Kind, apiName string) string { + return path.Join(allInProgressKey(kind), apiName) +} + +// e.g. /jobs//in_progress// +func inProgressKey(jobKey spec.JobKey) string { + return path.Join(allInProgressForAPIKey(jobKey.Kind, jobKey.APIName), jobKey.ID) +} + +func jobKeyFromInProgressKey(s3Key string) spec.JobKey { + pathSplit := strings.Split(s3Key, "/") + + kind := pathSplit[len(pathSplit)-4] + apiName := pathSplit[len(pathSplit)-2] + jobID := pathSplit[len(pathSplit)-1] + + return spec.JobKey{APIName: apiName, ID: jobID, Kind: userconfig.KindFromString(kind)} +} diff --git a/pkg/operator/resources/job/consts.go b/pkg/operator/resources/job/consts.go new file mode 100644 index 0000000000..aba04fe271 --- /dev/null +++ b/pkg/operator/resources/job/consts.go @@ -0,0 +1,34 @@ +/* +Copyright 2021 Cortex Labs, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package job + +import "github.com/cortexlabs/cortex/pkg/types/userconfig" + +const ( + _jobsPrefix = "jobs" + _inProgressFilePrefix = "in_progress" + _enqueuingLivenessFile = "enqueuing_liveness" +) + +var _jobKinds = map[userconfig.Kind]bool{ + userconfig.TaskAPIKind: true, + userconfig.BatchAPIKind: true, +} + +func LivenessFile() string { + return _enqueuingLivenessFile +} diff --git a/pkg/operator/resources/job/errors.go b/pkg/operator/resources/job/errors.go new file mode 100644 index 0000000000..11d5f5e844 --- /dev/null +++ b/pkg/operator/resources/job/errors.go @@ -0,0 +1,80 @@ +/* +Copyright 2021 Cortex Labs, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package job + +import ( + "fmt" + + "github.com/cortexlabs/cortex/pkg/lib/errors" + s "github.com/cortexlabs/cortex/pkg/lib/strings" + "github.com/cortexlabs/cortex/pkg/types/spec" + "github.com/cortexlabs/cortex/pkg/types/userconfig" +) + +const ( + ErrInvalidJobKind = "job.invalid_kind" + ErrJobNotFound = "job.not_found" + ErrJobIsNotInProgress = "job.job_is_not_in_progress" + ErrJobHasAlreadyBeenStopped = "job.job_has_already_been_stopped" + ErrConflictingFields = "job.conflicting_fields" + ErrSpecifyExactlyOneKey = "job.specify_exactly_one_key" +) + +func ErrorInvalidJobKind(kind userconfig.Kind) error { + return errors.WithStack(&errors.Error{ + Kind: ErrInvalidJobKind, + Message: fmt.Sprintf("invalid job kind %s", kind.String()), + }) +} + +func ErrorJobNotFound(jobKey spec.JobKey) error { + return errors.WithStack(&errors.Error{ + Kind: ErrJobNotFound, + Message: fmt.Sprintf("unable to find %s job %s", jobKey.Kind.String(), jobKey.UserString()), + }) +} + +func ErrorJobIsNotInProgress(kind userconfig.Kind) error { + return errors.WithStack(&errors.Error{ + Kind: ErrJobIsNotInProgress, + Message: fmt.Sprintf("cannot stop %s job because it is not in progress", kind.String()), + }) +} + +func ErrorJobHasAlreadyBeenStopped(kind userconfig.Kind) error { + return errors.WithStack(&errors.Error{ + Kind: ErrJobHasAlreadyBeenStopped, + Message: fmt.Sprintf("%s job has already been stopped", kind.String()), + }) +} + +func ErrorConflictingFields(key string, keys ...string) error { + allKeys := append([]string{key}, keys...) + + return errors.WithStack(&errors.Error{ + Kind: ErrConflictingFields, + Message: fmt.Sprintf("please specify either the %s field (but not more than one at the same time)", s.StrsOr(allKeys)), + }) +} + +func ErrorSpecifyExactlyOneKey(key string, keys ...string) error { + allKeys := append([]string{key}, keys...) 
+ return errors.WithStack(&errors.Error{ + Kind: ErrSpecifyExactlyOneKey, + Message: fmt.Sprintf("specify exactly one of the following keys: %s", s.StrsOr(allKeys)), + }) +} diff --git a/pkg/operator/resources/job/state.go b/pkg/operator/resources/job/state.go new file mode 100644 index 0000000000..41e3c012b4 --- /dev/null +++ b/pkg/operator/resources/job/state.go @@ -0,0 +1,412 @@ +/* +Copyright 2021 Cortex Labs, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package job + +import ( + "path" + "path/filepath" + "time" + + "github.com/cortexlabs/cortex/pkg/lib/errors" + "github.com/cortexlabs/cortex/pkg/lib/pointer" + "github.com/cortexlabs/cortex/pkg/operator/config" + "github.com/cortexlabs/cortex/pkg/types/spec" + "github.com/cortexlabs/cortex/pkg/types/status" + "github.com/cortexlabs/cortex/pkg/types/userconfig" +) + +const ( + _averageFilesPerJobState = 10 +) + +type State struct { + spec.JobKey + Status status.JobCode + LastUpdatedMap map[string]time.Time + EndTime *time.Time +} + +func (j State) GetLastUpdated() time.Time { + lastUpdated := time.Time{} + + for _, fileLastUpdated := range j.LastUpdatedMap { + if fileLastUpdated.After(lastUpdated) { + lastUpdated = fileLastUpdated + } + } + + return lastUpdated +} + +func (j State) GetFirstCreated() time.Time { + firstCreated := time.Unix(1<<63-62135596801, 999999999) // Max time + + for _,
fileLastUpdated := range j.LastUpdatedMap { + if firstCreated.After(fileLastUpdated) { + firstCreated = fileLastUpdated + } + } + + return firstCreated +} + +// Doesn't assume only status files are present. The order below matters. +func GetStatusCode(lastUpdatedMap map[string]time.Time) status.JobCode { + if _, ok := lastUpdatedMap[status.JobStopped.String()]; ok { + return status.JobStopped + } + + if _, ok := lastUpdatedMap[status.JobTimedOut.String()]; ok { + return status.JobTimedOut + } + + if _, ok := lastUpdatedMap[status.JobWorkerOOM.String()]; ok { + return status.JobWorkerOOM + } + + if _, ok := lastUpdatedMap[status.JobWorkerError.String()]; ok { + return status.JobWorkerError + } + + if _, ok := lastUpdatedMap[status.JobEnqueueFailed.String()]; ok { + return status.JobEnqueueFailed + } + + if _, ok := lastUpdatedMap[status.JobUnexpectedError.String()]; ok { + return status.JobUnexpectedError + } + + if _, ok := lastUpdatedMap[status.JobCompletedWithFailures.String()]; ok { + return status.JobCompletedWithFailures + } + + if _, ok := lastUpdatedMap[status.JobSucceeded.String()]; ok { + return status.JobSucceeded + } + + if _, ok := lastUpdatedMap[status.JobRunning.String()]; ok { + return status.JobRunning + } + + if _, ok := lastUpdatedMap[status.JobEnqueuing.String()]; ok { + return status.JobEnqueuing + } + + return status.JobUnknown +} + +func GetJobState(jobKey spec.JobKey) (*State, error) { + gcsObjects, s3Objects, err := config.ListBucketPrefix(jobKey.Prefix(config.ClusterName()), nil) + if err != nil { + return nil, errors.Wrap(err, "failed to get job state", jobKey.UserString()) + } + + if len(gcsObjects) == 0 && len(s3Objects) == 0 { + return nil, errors.Wrap(ErrorJobNotFound(jobKey), "failed to get job state") + } + + lastUpdatedMap := map[string]time.Time{} + if len(gcsObjects) > 0 { + for _, object := range gcsObjects { + if object != nil { + lastUpdatedMap[filepath.Base(object.Name)] = object.Updated + } + } + } else { + for _, object := 
range s3Objects { + lastUpdatedMap[filepath.Base(*object.Key)] = *object.LastModified + } + } + + jobState := getJobStateFromFiles(jobKey, lastUpdatedMap) + return &jobState, nil +} + +func getJobStateFromFiles(jobKey spec.JobKey, lastUpdatedFileMap map[string]time.Time) State { + statusCode := GetStatusCode(lastUpdatedFileMap) + + var jobEndTime *time.Time + if statusCode.IsCompleted() { + if endTime, ok := lastUpdatedFileMap[statusCode.String()]; ok { + jobEndTime = &endTime + } + } + + return State{ + JobKey: jobKey, + LastUpdatedMap: lastUpdatedFileMap, + Status: statusCode, + EndTime: jobEndTime, + } +} + +func GetMostRecentlySubmittedJobStates(apiName string, count int, kind userconfig.Kind) ([]*State, error) { + // a single job state may include 5 files on average, overshoot the number of files needed + gcsObjects, s3Objects, err := config.ListBucketPrefix( + spec.JobAPIPrefix(config.ClusterName(), kind, apiName), + pointer.Int64(int64(count*_averageFilesPerJobState)), + ) + if err != nil { + return nil, err + } + + // job id -> file name -> last update timestamp + lastUpdatedMaps := map[string]map[string]time.Time{} + jobIDOrder := []string{} + if len(gcsObjects) > 0 { + for _, object := range gcsObjects { + if object == nil { + continue + } + fileName := filepath.Base(object.Name) + jobID := filepath.Base(filepath.Dir(object.Name)) + if _, ok := lastUpdatedMaps[jobID]; !ok { + jobIDOrder = append(jobIDOrder, jobID) + lastUpdatedMaps[jobID] = map[string]time.Time{fileName: object.Updated} + } else { + lastUpdatedMaps[jobID][fileName] = object.Updated + } + } + } else { + for _, object := range s3Objects { + if object == nil { + continue + } + fileName := filepath.Base(*object.Key) + jobID := filepath.Base(filepath.Dir(*object.Key)) + if _, ok := lastUpdatedMaps[jobID]; !ok { + jobIDOrder = append(jobIDOrder, jobID) + lastUpdatedMaps[jobID] = map[string]time.Time{fileName: *object.LastModified} + } else { + lastUpdatedMaps[jobID][fileName] = 
*object.LastModified + } + } + } + + jobStates := make([]*State, 0, count) + + jobStateCount := 0 + for _, jobID := range jobIDOrder { + jobState := getJobStateFromFiles(spec.JobKey{ + APIName: apiName, + ID: jobID, + Kind: kind, + }, lastUpdatedMaps[jobID]) + jobStates = append(jobStates, &jobState) + + jobStateCount++ + if jobStateCount == count { + break + } + } + + return jobStates, nil +} + +func SetStatusForJob(jobKey spec.JobKey, jobStatus status.JobCode) error { + switch jobStatus { + case status.JobEnqueuing: + return SetEnqueuingStatus(jobKey) + case status.JobRunning: + return SetRunningStatus(jobKey) + case status.JobEnqueueFailed: + return SetEnqueueFailedStatus(jobKey) + case status.JobCompletedWithFailures: + return SetCompletedWithFailuresStatus(jobKey) + case status.JobSucceeded: + return SetSucceededStatus(jobKey) + case status.JobUnexpectedError: + return SetUnexpectedErrorStatus(jobKey) + case status.JobWorkerError: + return SetWorkerErrorStatus(jobKey) + case status.JobWorkerOOM: + return SetWorkerOOMStatus(jobKey) + case status.JobTimedOut: + return SetTimedOutStatus(jobKey) + case status.JobStopped: + return SetStoppedStatus(jobKey) + } + return nil +} + +func UpdateLiveness(jobKey spec.JobKey) error { + s3Key := path.Join(jobKey.Prefix(config.ClusterName()), _enqueuingLivenessFile) + err := config.UploadJSONToBucket(time.Now(), s3Key) + if err != nil { + return errors.Wrap(err, "failed to update liveness", jobKey.UserString()) + } + return nil +} + +func SetEnqueuingStatus(jobKey spec.JobKey) error { + err := UpdateLiveness(jobKey) + if err != nil { + return err + } + + err = config.UploadStringToBucket("", path.Join(jobKey.Prefix(config.ClusterName()), status.JobEnqueuing.String())) + if err != nil { + return err + } + + err = uploadInProgressFile(jobKey) + if err != nil { + return err + } + + return nil +} + +func SetFailedStatus(jobKey spec.JobKey) error { + err := config.UploadStringToBucket("", 
path.Join(jobKey.Prefix(config.ClusterName()), status.JobEnqueueFailed.String())) + if err != nil { + return err + } + + err = DeleteInProgressFile(jobKey) + if err != nil { + return err + } + + return nil +} + +func SetRunningStatus(jobKey spec.JobKey) error { + err := config.UploadStringToBucket("", path.Join(jobKey.Prefix(config.ClusterName()), status.JobRunning.String())) + if err != nil { + return err + } + + err = uploadInProgressFile(jobKey) // in progress file should already be there but just in case + if err != nil { + return err + } + + return nil +} + +func SetStoppedStatus(jobKey spec.JobKey) error { + err := config.UploadStringToBucket("", path.Join(jobKey.Prefix(config.ClusterName()), status.JobStopped.String())) + if err != nil { + return err + } + + err = DeleteInProgressFile(jobKey) + if err != nil { + return err + } + + return nil +} + +func SetSucceededStatus(jobKey spec.JobKey) error { + err := config.UploadStringToBucket("", path.Join(jobKey.Prefix(config.ClusterName()), status.JobSucceeded.String())) + if err != nil { + return err + } + + err = DeleteInProgressFile(jobKey) + if err != nil { + return err + } + + return nil +} + +func SetCompletedWithFailuresStatus(jobKey spec.JobKey) error { + err := config.UploadStringToBucket("", path.Join(jobKey.Prefix(config.ClusterName()), status.JobCompletedWithFailures.String())) + if err != nil { + return err + } + + err = DeleteInProgressFile(jobKey) + if err != nil { + return err + } + + return nil +} + +func SetWorkerErrorStatus(jobKey spec.JobKey) error { + err := config.UploadStringToBucket("", path.Join(jobKey.Prefix(config.ClusterName()), status.JobWorkerError.String())) + if err != nil { + return err + } + + err = DeleteInProgressFile(jobKey) + if err != nil { + return err + } + + return nil +} + +func SetWorkerOOMStatus(jobKey spec.JobKey) error { + err := config.UploadStringToBucket("", path.Join(jobKey.Prefix(config.ClusterName()), status.JobWorkerOOM.String())) + if err != nil { + return err 
+ } + + err = DeleteInProgressFile(jobKey) + if err != nil { + return err + } + + return nil +} + +func SetEnqueueFailedStatus(jobKey spec.JobKey) error { + err := config.UploadStringToBucket("", path.Join(jobKey.Prefix(config.ClusterName()), status.JobEnqueueFailed.String())) + if err != nil { + return err + } + + err = DeleteInProgressFile(jobKey) + if err != nil { + return err + } + + return nil +} + +func SetUnexpectedErrorStatus(jobKey spec.JobKey) error { + err := config.UploadStringToBucket("", path.Join(jobKey.Prefix(config.ClusterName()), status.JobUnexpectedError.String())) + if err != nil { + return err + } + + err = DeleteInProgressFile(jobKey) + if err != nil { + return err + } + + return nil +} + +func SetTimedOutStatus(jobKey spec.JobKey) error { + err := config.UploadStringToBucket("", path.Join(jobKey.Prefix(config.ClusterName()), status.JobTimedOut.String())) + if err != nil { + return err + } + + err = DeleteInProgressFile(jobKey) + if err != nil { + return err + } + + return nil +} diff --git a/pkg/operator/resources/job/taskapi/api.go b/pkg/operator/resources/job/taskapi/api.go new file mode 100644 index 0000000000..0ae98cb3d9 --- /dev/null +++ b/pkg/operator/resources/job/taskapi/api.go @@ -0,0 +1,289 @@ +/* +Copyright 2021 Cortex Labs, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package taskapi + +import ( + "fmt" + "path/filepath" + + "github.com/cortexlabs/cortex/pkg/lib/errors" + "github.com/cortexlabs/cortex/pkg/lib/parallel" + "github.com/cortexlabs/cortex/pkg/lib/sets/strset" + "github.com/cortexlabs/cortex/pkg/operator/config" + "github.com/cortexlabs/cortex/pkg/operator/lib/routines" + "github.com/cortexlabs/cortex/pkg/operator/operator" + "github.com/cortexlabs/cortex/pkg/operator/resources/job" + "github.com/cortexlabs/cortex/pkg/operator/schema" + "github.com/cortexlabs/cortex/pkg/types/spec" + "github.com/cortexlabs/cortex/pkg/types/status" + "github.com/cortexlabs/cortex/pkg/types/userconfig" + istioclientnetworking "istio.io/client-go/pkg/apis/networking/v1beta1" + kbatch "k8s.io/api/batch/v1" + kcore "k8s.io/api/core/v1" +) + +// UpdateAPI deploys or update a task api without triggering any task +func UpdateAPI(apiConfig *userconfig.API, projectID string) (*spec.API, string, error) { + prevVirtualService, err := config.K8s.GetVirtualService(operator.K8sName(apiConfig.Name)) + if err != nil { + return nil, "", err + } + + api := spec.GetAPISpec(apiConfig, projectID, "", config.ClusterName()) // Deployment ID not needed for TaskAPI spec + + if prevVirtualService == nil { + if err := config.UploadJSONToBucket(api, api.Key); err != nil { + return nil, "", errors.Wrap(err, "upload api spec") + } + + err = applyK8sResources(api, prevVirtualService) + if err != nil { + routines.RunWithPanicHandler(func() { + deleteK8sResources(api.Name) + }) + return nil, "", err + } + + return api, fmt.Sprintf("created %s", api.Resource.UserString()), nil + } + + if prevVirtualService.Labels["specID"] != api.SpecID { + if err := config.UploadJSONToBucket(api, api.Key); err != nil { + return nil, "", errors.Wrap(err, "upload 
api spec") + } + + err = applyK8sResources(api, prevVirtualService) + if err != nil { + return nil, "", err + } + + return api, fmt.Sprintf("updated %s", api.Resource.UserString()), nil + } + + return api, fmt.Sprintf("%s is up to date", api.Resource.UserString()), nil +} + +// DeleteAPI deletes a task api +func DeleteAPI(apiName string, keepCache bool) error { + err := parallel.RunFirstErr( + func() error { + return deleteK8sResources(apiName) + }, + func() error { + if keepCache { + return nil + } + return deleteS3Resources(apiName) + }, + ) + + if err != nil { + return err + } + + return nil +} + +func deleteS3Resources(apiName string) error { + return parallel.RunFirstErr( + func() error { + prefix := filepath.Join(config.ClusterName(), "apis", apiName) + return config.DeleteBucketDir(prefix, true) + }, + func() error { + prefix := spec.JobAPIPrefix(config.ClusterName(), userconfig.TaskAPIKind, apiName) + go func() { + _ = config.DeleteBucketDir(prefix, true) // deleting job files may take a while + }() + return nil + }, + func() error { + job.DeleteAllInProgressFilesByAPI(userconfig.TaskAPIKind, apiName) // not useful xml error is thrown, swallow the error + return nil + }, + ) +} + +// GetAllAPIs returns all task APIs, for each API returning the most recently submitted job and all running jobs +func GetAllAPIs(virtualServices []istioclientnetworking.VirtualService, k8sJobs []kbatch.Job, pods []kcore.Pod) ([]schema.APIResponse, error) { + taskAPIsMap := map[string]*schema.APIResponse{} + + jobIDToK8sJobMap := map[string]*kbatch.Job{} + for _, job := range k8sJobs { + jobIDToK8sJobMap[job.Labels["jobID"]] = &job + } + + jobIDToPodsMap := map[string][]kcore.Pod{} + for _, pod := range pods { + if pod.Labels["jobID"] != "" { + jobIDToPodsMap[pod.Labels["jobID"]] = append(jobIDToPodsMap[pod.Labels["jobID"]], pod) + } + } + + for _, virtualService := range virtualServices { + apiName := virtualService.Labels["apiName"] + apiID := virtualService.Labels["apiID"] + + 
api, err := operator.DownloadAPISpec(apiName, apiID) + if err != nil { + return nil, err + } + + endpoint, err := operator.APIEndpoint(api) + if err != nil { + return nil, err + } + + jobStates, err := job.GetMostRecentlySubmittedJobStates(apiName, 1, userconfig.TaskAPIKind) + + jobStatuses := []status.TaskJobStatus{} + if len(jobStates) > 0 { + jobStatus, err := getJobStatusFromJobState(jobStates[0], jobIDToK8sJobMap[jobStates[0].ID], jobIDToPodsMap[jobStates[0].ID]) + if err != nil { + return nil, err + } + + jobStatuses = append(jobStatuses, *jobStatus) + } + + taskAPIsMap[apiName] = &schema.APIResponse{ + Spec: *api, + Endpoint: endpoint, + TaskJobStatuses: jobStatuses, + } + } + + inProgressJobKeys, err := job.ListAllInProgressJobKeys(userconfig.TaskAPIKind) + if err != nil { + return nil, err + } + + for _, jobKey := range inProgressJobKeys { + alreadyAdded := false + for _, jobStatus := range taskAPIsMap[jobKey.APIName].TaskJobStatuses { + if jobStatus.ID == jobKey.ID { + alreadyAdded = true + break + } + } + + if alreadyAdded { + continue + } + + jobStatus, err := getJobStatusFromK8sJob(jobKey, jobIDToK8sJobMap[jobKey.ID], jobIDToPodsMap[jobKey.ID]) + if err != nil { + return nil, err + } + + if jobStatus.Status.IsInProgress() { + taskAPIsMap[jobKey.APIName].TaskJobStatuses = append(taskAPIsMap[jobKey.APIName].TaskJobStatuses, *jobStatus) + } + } + + taskAPIList := make([]schema.APIResponse, 0, len(taskAPIsMap)) + + for _, batchAPI := range taskAPIsMap { + taskAPIList = append(taskAPIList, *batchAPI) + } + + return taskAPIList, nil +} + +// GetAllAPIs returns a single task API and its most recently submitted job along with all running task jobs +func GetAPIByName(deployedResource *operator.DeployedResource) ([]schema.APIResponse, error) { + virtualService := deployedResource.VirtualService + + apiID := virtualService.Labels["apiID"] + api, err := operator.DownloadAPISpec(deployedResource.Name, apiID) + if err != nil { + return nil, err + } + + k8sJobs, err 
:= config.K8s.ListJobsByLabel("apiName", deployedResource.Name) + if err != nil { + return nil, err + } + + jobIDToK8sJobMap := map[string]*kbatch.Job{} + for _, job := range k8sJobs { + jobIDToK8sJobMap[job.Labels["jobID"]] = &job + } + + endpoint, err := operator.APIEndpoint(api) + if err != nil { + return nil, err + } + + pods, err := config.K8s.ListPodsByLabel("apiName", deployedResource.Name) + if err != nil { + return nil, err + } + + jobIDToPodsMap := map[string][]kcore.Pod{} + for _, pod := range pods { + jobIDToPodsMap[pod.Labels["jobID"]] = append(jobIDToPodsMap[pod.Labels["jobID"]], pod) + } + + inProgressJobKeys, err := job.ListAllInProgressJobKeysByAPI(userconfig.TaskAPIKind, deployedResource.Name) + if err != nil { + return nil, err + } + + jobStatuses := []status.TaskJobStatus{} + jobIDSet := strset.New() + for _, jobKey := range inProgressJobKeys { + jobStatus, err := getJobStatusFromK8sJob(jobKey, jobIDToK8sJobMap[jobKey.ID], jobIDToPodsMap[jobKey.ID]) + if err != nil { + return nil, err + } + + jobStatuses = append(jobStatuses, *jobStatus) + jobIDSet.Add(jobKey.ID) + } + + if len(jobStatuses) < 10 { + jobStates, err := job.GetMostRecentlySubmittedJobStates(deployedResource.Name, 10+len(jobStatuses), userconfig.TaskAPIKind) + if err != nil { + return nil, err + } + for _, jobState := range jobStates { + if jobIDSet.Has(jobState.ID) { + continue + } + jobIDSet.Add(jobState.ID) + + jobStatus, err := getJobStatusFromJobState(jobState, nil, nil) + if err != nil { + return nil, err + } + + jobStatuses = append(jobStatuses, *jobStatus) + if len(jobStatuses) == 10 { + break + } + } + } + + return []schema.APIResponse{ + { + Spec: *api, + TaskJobStatuses: jobStatuses, + Endpoint: endpoint, + }, + }, nil +} diff --git a/pkg/operator/resources/job/taskapi/cron.go b/pkg/operator/resources/job/taskapi/cron.go new file mode 100644 index 0000000000..ab2de9d895 --- /dev/null +++ b/pkg/operator/resources/job/taskapi/cron.go @@ -0,0 +1,229 @@ +/* +Copyright 2021 
Cortex Labs, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package taskapi + +import ( + "fmt" + "time" + + "github.com/cortexlabs/cortex/pkg/lib/errors" + "github.com/cortexlabs/cortex/pkg/lib/k8s" + "github.com/cortexlabs/cortex/pkg/lib/sets/strset" + "github.com/cortexlabs/cortex/pkg/lib/telemetry" + "github.com/cortexlabs/cortex/pkg/operator/config" + "github.com/cortexlabs/cortex/pkg/operator/lib/logging" + "github.com/cortexlabs/cortex/pkg/operator/operator" + "github.com/cortexlabs/cortex/pkg/operator/resources/job" + "github.com/cortexlabs/cortex/pkg/types/spec" + "github.com/cortexlabs/cortex/pkg/types/status" + "github.com/cortexlabs/cortex/pkg/types/userconfig" + kbatch "k8s.io/api/batch/v1" + kmeta "k8s.io/apimachinery/pkg/apis/meta/v1" + klabels "k8s.io/apimachinery/pkg/labels" +) + +const ( + ManageJobResourcesCronPeriod = 60 * time.Second + _k8sJobExistenceGracePeriod = 10 * time.Second +) + +var operatorLogger = logging.GetOperatorLogger() +var _inProgressJobSpecMap = map[string]*spec.TaskJob{} + +func ManageJobResources() error { + inProgressJobKeys, err := job.ListAllInProgressJobKeys(userconfig.TaskAPIKind) + if err != nil { + return err + } + + inProgressJobIDSet := strset.Set{} + for _, jobKey := range inProgressJobKeys { + 
inProgressJobIDSet.Add(jobKey.ID) + } + + for jobID := range _inProgressJobSpecMap { + if !inProgressJobIDSet.Has(jobID) { + delete(_inProgressJobSpecMap, jobID) + } + } + + jobs, err := config.K8s.ListJobs( + &kmeta.ListOptions{ + LabelSelector: klabels.SelectorFromSet( + map[string]string{"apiKind": userconfig.TaskAPIKind.String()}, + ).String(), + }, + ) + if err != nil { + return err + } + + k8sJobMap := map[string]*kbatch.Job{} + k8sJobIDSet := strset.Set{} + for _, kJob := range jobs { + k8sJobMap[kJob.Labels["jobID"]] = &kJob + k8sJobIDSet.Add(kJob.Labels["jobID"]) + } + + for _, jobKey := range inProgressJobKeys { + jobLogger, err := operator.GetJobLogger(jobKey) + if err != nil { + telemetry.Error(err) + operatorLogger.Error(err) + continue + } + + k8sJob := k8sJobMap[jobKey.ID] + + jobState, err := job.GetJobState(jobKey) + if err != nil { + jobLogger.Error(err) + jobLogger.Error("terminating job and cleaning up job resources") + err := errors.FirstError( + job.DeleteInProgressFile(jobKey), + deleteJobRuntimeResources(jobKey), + ) + if err != nil { + telemetry.Error(err) + operatorLogger.Error(err) + } + continue + } + + if !jobState.Status.IsInProgress() { + // best effort cleanup + _ = job.DeleteInProgressFile(jobKey) + _ = deleteJobRuntimeResources(jobKey) + continue + } + + // reconcile job state and k8s job + newStatusCode, msg, err := reconcileInProgressJob(jobState, k8sJob) + if err != nil { + telemetry.Error(err) + operatorLogger.Error(err) + continue + } + if newStatusCode != jobState.Status { + jobLogger.Error(msg) + err := job.SetStatusForJob(jobKey, newStatusCode) + if err != nil { + telemetry.Error(err) + operatorLogger.Error(err) + continue + } + } + + if _, ok := _inProgressJobSpecMap[jobKey.ID]; !ok { + jobSpec, err := operator.DownloadTaskJobSpec(jobKey) + if err != nil { + jobLogger.Error(err) + jobLogger.Error("terminating job and cleaning up job resources") + err := errors.FirstError( + job.DeleteInProgressFile(jobKey), + 
deleteJobRuntimeResources(jobKey), + ) + if err != nil { + telemetry.Error(err) + operatorLogger.Error(err) + } + continue + } + _inProgressJobSpecMap[jobKey.ID] = jobSpec + } + jobSpec := _inProgressJobSpecMap[jobKey.ID] + + if jobSpec.Timeout != nil && time.Since(jobSpec.StartTime) > time.Second*time.Duration(*jobSpec.Timeout) { + jobLogger.Errorf("terminating job after exceeding the specified timeout of %d seconds", *jobSpec.Timeout) + err := errors.FirstError( + job.SetTimedOutStatus(jobKey), + deleteJobRuntimeResources(jobKey), + ) + if err != nil { + telemetry.Error(err) + operatorLogger.Error(err) + } + continue + } + + if jobState.Status == status.JobRunning { + err = checkIfJobCompleted(jobKey, k8sJob) + if err != nil { + telemetry.Error(err) + operatorLogger.Error(err) + } + } + } + + // existing K8s job but job is not in progress + for jobID := range strset.Difference(k8sJobIDSet, inProgressJobIDSet) { + jobKey := spec.JobKey{ + APIName: k8sJobMap[jobID].Labels["apiName"], + ID: k8sJobMap[jobID].Labels["jobID"], + } + + err := deleteJobRuntimeResources(jobKey) + if err != nil { + telemetry.Error(err) + operatorLogger.Error(err) + } + } + + return nil +} + +// verifies k8s job exists for a job in running status, if verification fails return a job code to reflect the state +func reconcileInProgressJob(jobState *job.State, k8sJob *kbatch.Job) (status.JobCode, string, error) { + if jobState.Status == status.JobRunning { + if time.Now().Sub(jobState.LastUpdatedMap[status.JobRunning.String()]) <= _k8sJobExistenceGracePeriod { + return jobState.Status, "", nil + } + + if k8sJob == nil { // unexpected k8s job missing + return status.JobUnexpectedError, fmt.Sprintf("terminating job %s; unable to find kubernetes job", jobState.JobKey.UserString()), nil + } + } + + return jobState.Status, "", nil +} + +func checkIfJobCompleted(jobKey spec.JobKey, k8sJob *kbatch.Job) error { + if int(k8sJob.Status.Failed) == 1 { + pods, _ := config.K8s.ListPodsByLabel("jobID", 
jobKey.ID) + for _, pod := range pods { + if k8s.WasPodOOMKilled(&pod) { + return errors.FirstError( + job.SetWorkerOOMStatus(jobKey), + deleteJobRuntimeResources(jobKey), + ) + } + } + return errors.FirstError( + job.SetWorkerErrorStatus(jobKey), + deleteJobRuntimeResources(jobKey), + ) + } + + if int(k8sJob.Status.Succeeded) == 1 { + return errors.FirstError( + job.SetSucceededStatus(jobKey), + deleteJobRuntimeResources(jobKey), + ) + } + + return nil +} diff --git a/pkg/operator/resources/job/taskapi/job.go b/pkg/operator/resources/job/taskapi/job.go new file mode 100644 index 0000000000..058f3f092d --- /dev/null +++ b/pkg/operator/resources/job/taskapi/job.go @@ -0,0 +1,150 @@ +/* +Copyright 2021 Cortex Labs, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package taskapi + +import ( + "time" + + "github.com/cortexlabs/cortex/pkg/lib/errors" + "github.com/cortexlabs/cortex/pkg/lib/telemetry" + "github.com/cortexlabs/cortex/pkg/operator/config" + "github.com/cortexlabs/cortex/pkg/operator/lib/routines" + "github.com/cortexlabs/cortex/pkg/operator/operator" + "github.com/cortexlabs/cortex/pkg/operator/resources/job" + "github.com/cortexlabs/cortex/pkg/operator/schema" + "github.com/cortexlabs/cortex/pkg/types/spec" +) + +func SubmitJob(apiName string, submission *schema.TaskJobSubmission) (*spec.TaskJob, error) { + err := validateJobSubmission(submission) + if err != nil { + return nil, err + } + + virtualService, err := config.K8s.GetVirtualService(operator.K8sName(apiName)) + if err != nil { + return nil, err + } + + apiID := virtualService.Labels["apiID"] + + apiSpec, err := operator.DownloadAPISpec(apiName, apiID) + if err != nil { + return nil, err + } + + jobID := spec.MonotonicallyDecreasingID() + + jobKey := spec.JobKey{ + APIName: apiSpec.Name, + ID: jobID, + Kind: apiSpec.Kind, + } + + jobSpec := spec.TaskJob{ + JobKey: jobKey, + RuntimeTaskJobConfig: submission.RuntimeTaskJobConfig, + APIID: apiSpec.ID, + SpecID: apiSpec.SpecID, + PredictorID: apiSpec.PredictorID, + StartTime: time.Now(), + } + + if err := uploadJobSpec(&jobSpec); err != nil { + return nil, err + } + + deployJob(apiSpec, &jobSpec) + + return &jobSpec, nil +} + +func uploadJobSpec(jobSpec *spec.TaskJob) error { + if err := config.UploadJSONToBucket( + jobSpec, jobSpec.SpecFilePath(config.ClusterName()), + ); err != nil { + return err + } + + return nil +} + +func deployJob(apiSpec *spec.API, jobSpec *spec.TaskJob) { + err := createK8sJob(apiSpec, jobSpec) + if err != nil { + handleJobSubmissionError(jobSpec.JobKey, err) + } + + err = job.SetRunningStatus(jobSpec.JobKey) + if err 
!= nil { + handleJobSubmissionError(jobSpec.JobKey, err) + } +} + +func handleJobSubmissionError(jobKey spec.JobKey, jobErr error) { + jobLogger, err := operator.GetJobLogger(jobKey) + if err != nil { + telemetry.Error(err) + operatorLogger.Error(err) + return + } + jobLogger.Error(jobErr.Error()) + err = errors.FirstError( + job.SetUnexpectedErrorStatus(jobKey), + deleteJobRuntimeResources(jobKey), + ) + if err != nil { + telemetry.Error(err) + errors.PrintError(err) + } +} + +func deleteJobRuntimeResources(jobKey spec.JobKey) error { + err := deleteK8sJob(jobKey) + if err != nil { + return err + } + + return nil +} + +func StopJob(jobKey spec.JobKey) error { + jobState, err := job.GetJobState(jobKey) + if err != nil { + routines.RunWithPanicHandler(func() { + deleteJobRuntimeResources(jobKey) + }) + return err + } + + if !jobState.Status.IsInProgress() { + routines.RunWithPanicHandler(func() { + deleteJobRuntimeResources(jobKey) + }) + return errors.Wrap(job.ErrorJobIsNotInProgress(jobKey.Kind), jobKey.UserString()) + } + + jobLogger, err := operator.GetJobLogger(jobKey) + if err == nil { + jobLogger.Warn("request received to stop job; performing cleanup...") + } + + return errors.FirstError( + deleteJobRuntimeResources(jobKey), + job.SetStoppedStatus(jobKey), + ) +} diff --git a/pkg/operator/resources/job/taskapi/job_status.go b/pkg/operator/resources/job/taskapi/job_status.go new file mode 100644 index 0000000000..2bc459524a --- /dev/null +++ b/pkg/operator/resources/job/taskapi/job_status.go @@ -0,0 +1,77 @@ +/* +Copyright 2021 Cortex Labs, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package taskapi + +import ( + "github.com/cortexlabs/cortex/pkg/operator/config" + "github.com/cortexlabs/cortex/pkg/operator/operator" + "github.com/cortexlabs/cortex/pkg/operator/resources/job" + "github.com/cortexlabs/cortex/pkg/types/spec" + "github.com/cortexlabs/cortex/pkg/types/status" + kbatch "k8s.io/api/batch/v1" + kcore "k8s.io/api/core/v1" +) + +func GetJobStatus(jobKey spec.JobKey) (*status.TaskJobStatus, error) { + jobState, err := job.GetJobState(jobKey) + if err != nil { + return nil, err + } + + k8sJob, err := config.K8s.GetJob(jobKey.K8sName()) + if err != nil { + return nil, err + } + + pods, err := config.K8s.ListPodsByLabels(map[string]string{"apiName": jobKey.APIName, "jobID": jobKey.ID}) + if err != nil { + return nil, err + } + + return getJobStatusFromJobState(jobState, k8sJob, pods) +} + +func getJobStatusFromK8sJob(jobKey spec.JobKey, k8sJob *kbatch.Job, pods []kcore.Pod) (*status.TaskJobStatus, error) { + jobState, err := job.GetJobState(jobKey) + if err != nil { + return nil, err + } + + return getJobStatusFromJobState(jobState, k8sJob, pods) +} + +func getJobStatusFromJobState(jobState *job.State, k8sJob *kbatch.Job, pods []kcore.Pod) (*status.TaskJobStatus, error) { + jobKey := jobState.JobKey + + jobSpec, err := operator.DownloadTaskJobSpec(jobKey) + if err != nil { + return nil, err + } + + jobStatus := status.TaskJobStatus{ + TaskJob: *jobSpec, + EndTime: jobState.EndTime, + Status: jobState.Status, + } + + if 
jobState.Status.IsInProgress() && k8sJob != nil { + workerCounts := job.GetWorkerCountsForJob(*k8sJob, pods) + jobStatus.WorkerCounts = &workerCounts + } + + return &jobStatus, nil +} diff --git a/pkg/operator/resources/job/taskapi/k8s_specs.go b/pkg/operator/resources/job/taskapi/k8s_specs.go new file mode 100644 index 0000000000..abb1864d6c --- /dev/null +++ b/pkg/operator/resources/job/taskapi/k8s_specs.go @@ -0,0 +1,164 @@ +/* +Copyright 2021 Cortex Labs, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package taskapi + +import ( + "path" + + "github.com/cortexlabs/cortex/pkg/lib/k8s" + "github.com/cortexlabs/cortex/pkg/lib/parallel" + "github.com/cortexlabs/cortex/pkg/lib/pointer" + "github.com/cortexlabs/cortex/pkg/operator/config" + "github.com/cortexlabs/cortex/pkg/operator/operator" + "github.com/cortexlabs/cortex/pkg/types/spec" + "github.com/cortexlabs/cortex/pkg/types/userconfig" + istioclientnetworking "istio.io/client-go/pkg/apis/networking/v1beta1" + kbatch "k8s.io/api/batch/v1" + kcore "k8s.io/api/core/v1" + kmeta "k8s.io/apimachinery/pkg/apis/meta/v1" + klabels "k8s.io/apimachinery/pkg/labels" +) + +const _operatorService = "operator" + +func virtualServiceSpec(api *spec.API) *istioclientnetworking.VirtualService { + return k8s.VirtualService(&k8s.VirtualServiceSpec{ + Name: operator.K8sName(api.Name), + Gateways: []string{"apis-gateway"}, + Destinations: []k8s.Destination{{ + ServiceName: _operatorService, + Weight: 100, + Port: uint32(operator.DefaultPortInt32), + }}, + PrefixPath: api.Networking.Endpoint, + Rewrite: pointer.String(path.Join("tasks", api.Name)), + Annotations: api.ToK8sAnnotations(), + Labels: map[string]string{ + "apiName": api.Name, + "apiID": api.ID, + "specID": api.SpecID, + "predictorID": api.PredictorID, + "apiKind": api.Kind.String(), + }, + }) +} + +func k8sJobSpec(api *spec.API, job *spec.TaskJob) (*kbatch.Job, error) { + containers, volumes := operator.TaskContainers(api) + for i, container := range containers { + if container.Name == operator.APIContainerName { + containers[i].Env = append(container.Env, kcore.EnvVar{ + Name: "CORTEX_TASK_SPEC", + Value: config.BucketPath(job.SpecFilePath(config.ClusterName())), + }) + } + } + + return k8s.Job(&k8s.JobSpec{ + Name: job.JobKey.K8sName(), + Parallelism: int32(job.Workers), + Labels: map[string]string{ + "apiName": api.Name, + 
"apiID": api.ID, + "specID": api.SpecID, + "predictorID": api.PredictorID, + "jobID": job.ID, + "apiKind": api.Kind.String(), + }, + PodSpec: k8s.PodSpec{ + Labels: map[string]string{ + "apiName": api.Name, + "predictorID": api.PredictorID, + "jobID": job.ID, + "apiKind": api.Kind.String(), + }, + Annotations: map[string]string{ + "traffic.sidecar.istio.io/excludeOutboundIPRanges": "0.0.0.0/0", + }, + K8sPodSpec: kcore.PodSpec{ + RestartPolicy: "Never", + InitContainers: []kcore.Container{ + operator.TaskInitContainer(api), + }, + Containers: containers, + NodeSelector: map[string]string{ + "workload": "true", + }, + Tolerations: operator.Tolerations, + Volumes: volumes, + ServiceAccountName: "default", + }, + }, + }), nil +} + +func applyK8sResources(api *spec.API, prevVirtualService *istioclientnetworking.VirtualService) error { + newVirtualService := virtualServiceSpec(api) + + if prevVirtualService == nil { + _, err := config.K8s.CreateVirtualService(newVirtualService) + return err + } + + _, err := config.K8s.UpdateVirtualService(prevVirtualService, newVirtualService) + return err +} + +func deleteK8sResources(apiName string) error { + return parallel.RunFirstErr( + func() error { + _, err := config.K8s.DeleteJobs(&kmeta.ListOptions{ + LabelSelector: klabels.SelectorFromSet( + map[string]string{ + "apiName": apiName, + "apiKind": userconfig.TaskAPIKind.String(), + }).String(), + }) + return err + }, + func() error { + _, err := config.K8s.DeleteVirtualService(operator.K8sName(apiName)) + return err + }, + ) +} + +func deleteK8sJob(jobKey spec.JobKey) error { + _, err := config.K8s.DeleteJobs(&kmeta.ListOptions{ + LabelSelector: klabels.SelectorFromSet( + map[string]string{ + "apiName": jobKey.APIName, + "apiKind": userconfig.TaskAPIKind.String(), + "jobID": jobKey.ID, + }).String(), + }) + return err +} + +func createK8sJob(apiSpec *spec.API, jobSpec *spec.TaskJob) error { + k8sJob, err := k8sJobSpec(apiSpec, jobSpec) + if err != nil { + return err + } + + _, 
err = config.K8s.CreateJob(k8sJob) + if err != nil { + return err + } + + return nil +} diff --git a/pkg/operator/endpoints/logs_gcp.go b/pkg/operator/resources/job/taskapi/validations.go similarity index 50% rename from pkg/operator/endpoints/logs_gcp.go rename to pkg/operator/resources/job/taskapi/validations.go index c040e3aa08..0584e1b3fa 100644 --- a/pkg/operator/endpoints/logs_gcp.go +++ b/pkg/operator/resources/job/taskapi/validations.go @@ -14,21 +14,18 @@ See the License for the specific language governing permissions and limitations under the License. */ -package endpoints +package taskapi import ( - "github.com/cortexlabs/cortex/pkg/operator/config" + cr "github.com/cortexlabs/cortex/pkg/lib/configreader" + "github.com/cortexlabs/cortex/pkg/lib/errors" + "github.com/cortexlabs/cortex/pkg/operator/schema" ) -func gcpLogsQueryParams(apiName string) map[string]string { - queryParams := make(map[string]string) +func validateJobSubmission(submission *schema.TaskJobSubmission) error { + if submission.Workers != 1 { + return errors.Wrap(cr.ErrorInvalidInt(submission.Workers, 1), schema.WorkersKey) + } - queryParams["resource.type"] = "k8s_container" - queryParams["resource.labels.namespace_name"] = "default" - queryParams["resource.labels.project_id"] = *config.GCPCluster.Project - queryParams["resource.labels.location"] = *config.GCPCluster.Zone - queryParams["resource.labels.cluster_name"] = config.GCPCluster.ClusterName - queryParams["labels.k8s-pod/apiName"] = apiName - - return queryParams + return nil } diff --git a/pkg/operator/resources/batchapi/worker_stats.go b/pkg/operator/resources/job/worker_stats.go similarity index 96% rename from pkg/operator/resources/batchapi/worker_stats.go rename to pkg/operator/resources/job/worker_stats.go index cd356680d6..ead07b7822 100644 --- a/pkg/operator/resources/batchapi/worker_stats.go +++ b/pkg/operator/resources/job/worker_stats.go 
@@ -14,7 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. */ -package batchapi +package job import ( "time" @@ -27,7 +27,7 @@ import ( const _stalledPodTimeout = 10 * time.Minute -func getWorkerCountsForJob(k8sJob kbatch.Job, pods []kcore.Pod) status.WorkerCounts { +func GetWorkerCountsForJob(k8sJob kbatch.Job, pods []kcore.Pod) status.WorkerCounts { if k8sJob.Status.Failed > 0 { return status.WorkerCounts{ Failed: *k8sJob.Spec.Parallelism, // When one worker fails, the rest of the pods get deleted so you won't be able to get their statuses diff --git a/pkg/operator/resources/realtimeapi/api.go b/pkg/operator/resources/realtimeapi/api.go index 01a79228d2..f97485904c 100644 --- a/pkg/operator/resources/realtimeapi/api.go +++ b/pkg/operator/resources/realtimeapi/api.go @@ -75,7 +75,7 @@ func UpdateAPI(apiConfig *userconfig.API, projectID string, force bool) (*spec.A } if config.Provider == types.AWSProviderType { - err = addAPIToDashboard(config.Cluster.ClusterName, api.Name) + err = addAPIToDashboard(config.ClusterName(), api.Name) if err != nil { errors.PrintError(err) } @@ -189,7 +189,7 @@ func DeleteAPI(apiName string, keepCache bool) error { for i, virtualService := range virtualServices { allAPINames[i] = virtualService.Labels["apiName"] } - err = removeAPIFromDashboard(allAPINames, config.Cluster.ClusterName, apiName) + err = removeAPIFromDashboard(allAPINames, config.ClusterName(), apiName) if err != nil { return errors.Wrap(err, "failed to delete API from dashboard") } diff --git a/pkg/operator/resources/realtimeapi/autoscaler.go b/pkg/operator/resources/realtimeapi/autoscaler.go index c9c9f9d9f2..d9cc662607 100644 --- a/pkg/operator/resources/realtimeapi/autoscaler.go +++ b/pkg/operator/resources/realtimeapi/autoscaler.go @@ -241,7 +241,7 @@ func getInflightRequests(apiName string, window time.Duration) (*float64, error) Label: aws.String("InFlight"), MetricStat: &cloudwatch.MetricStat{ Metric: 
&cloudwatch.Metric{ - Namespace: aws.String(config.Cluster.ClusterName), + Namespace: aws.String(config.ClusterName()), MetricName: aws.String("in-flight"), Dimensions: []*cloudwatch.Dimension{ { diff --git a/pkg/operator/resources/realtimeapi/dashboard.go b/pkg/operator/resources/realtimeapi/dashboard.go index 02988fbbd0..0899b0eca2 100644 --- a/pkg/operator/resources/realtimeapi/dashboard.go +++ b/pkg/operator/resources/realtimeapi/dashboard.go @@ -147,7 +147,7 @@ func statusCodeMetric(dashboardName string, apiName string) []interface{} { func DashboardURL() string { if config.Provider == types.AWSProviderType { - return fmt.Sprintf("https://%s.console.aws.amazon.com/cloudwatch/home#dashboards:name=%s", *config.Cluster.Region, config.Cluster.ClusterName) + return fmt.Sprintf("https://%s.console.aws.amazon.com/cloudwatch/home#dashboards:name=%s", *config.Cluster.Region, config.ClusterName()) } return "" } diff --git a/pkg/operator/resources/realtimeapi/metrics.go b/pkg/operator/resources/realtimeapi/metrics.go index 6139c19790..3e33429490 100644 --- a/pkg/operator/resources/realtimeapi/metrics.go +++ b/pkg/operator/resources/realtimeapi/metrics.go @@ -221,7 +221,7 @@ func getNetworkStatsDef(api *spec.API, period int64) []*cloudwatch.MetricDataQue Label: aws.String(code), MetricStat: &cloudwatch.MetricStat{ Metric: &cloudwatch.Metric{ - Namespace: aws.String(config.Cluster.ClusterName), + Namespace: aws.String(config.ClusterName()), MetricName: aws.String("StatusCode"), Dimensions: statusCodeDimensions, }, @@ -236,7 +236,7 @@ func getNetworkStatsDef(api *spec.API, period int64) []*cloudwatch.MetricDataQue Label: aws.String("Latency"), MetricStat: &cloudwatch.MetricStat{ Metric: &cloudwatch.Metric{ - Namespace: aws.String(config.Cluster.ClusterName), + Namespace: aws.String(config.ClusterName()), MetricName: aws.String("Latency"), Dimensions: getAPIDimensionsHistogram(api), }, @@ -250,7 +250,7 @@ func getNetworkStatsDef(api *spec.API, period int64) 
[]*cloudwatch.MetricDataQue Label: aws.String("RequestCount"), MetricStat: &cloudwatch.MetricStat{ Metric: &cloudwatch.Metric{ - Namespace: aws.String(config.Cluster.ClusterName), + Namespace: aws.String(config.ClusterName()), MetricName: aws.String("Latency"), Dimensions: getAPIDimensionsHistogram(api), }, diff --git a/pkg/operator/resources/resources.go b/pkg/operator/resources/resources.go index 1037c746b5..0bc838f9b6 100644 --- a/pkg/operator/resources/resources.go +++ b/pkg/operator/resources/resources.go @@ -30,7 +30,8 @@ import ( "github.com/cortexlabs/cortex/pkg/operator/config" "github.com/cortexlabs/cortex/pkg/operator/lib/routines" "github.com/cortexlabs/cortex/pkg/operator/operator" - "github.com/cortexlabs/cortex/pkg/operator/resources/batchapi" + "github.com/cortexlabs/cortex/pkg/operator/resources/job/batchapi" + "github.com/cortexlabs/cortex/pkg/operator/resources/job/taskapi" "github.com/cortexlabs/cortex/pkg/operator/resources/realtimeapi" "github.com/cortexlabs/cortex/pkg/operator/resources/trafficsplitter" "github.com/cortexlabs/cortex/pkg/operator/schema" @@ -41,6 +42,8 @@ import ( kapps "k8s.io/api/apps/v1" kbatch "k8s.io/api/batch/v1" kcore "k8s.io/api/core/v1" + kmeta "k8s.io/apimachinery/pkg/apis/meta/v1" + klabels "k8s.io/apimachinery/pkg/labels" ) // Returns an error if resource doesn't exist @@ -159,10 +162,17 @@ func UpdateAPI(apiConfig *userconfig.API, projectID string, force bool) (*schema api, msg, err = realtimeapi.UpdateAPI(apiConfig, projectID, force) case userconfig.BatchAPIKind: api, msg, err = batchapi.UpdateAPI(apiConfig, projectID) + case userconfig.TaskAPIKind: + api, msg, err = taskapi.UpdateAPI(apiConfig, projectID) case userconfig.TrafficSplitterKind: api, msg, err = trafficsplitter.UpdateAPI(apiConfig, force) default: - return nil, "", 
ErrorOperationIsOnlySupportedForKind(*deployedResource, userconfig.RealtimeAPIKind, userconfig.BatchAPIKind, userconfig.TrafficSplitterKind) // unexpected + return nil, "", ErrorOperationIsOnlySupportedForKind( + *deployedResource, userconfig.RealtimeAPIKind, + userconfig.BatchAPIKind, + userconfig.TrafficSplitterKind, + userconfig.TaskAPIKind, + ) // unexpected } if err == nil && api != nil { @@ -198,7 +208,7 @@ func Patch(configBytes []byte, configFileName string, force bool) ([]schema.Depl apiConfig := &apiConfigs[i] result := schema.DeployResult{} - apiSpec, msg, err := patchAPI(apiConfig, configFileName, force) + apiSpec, msg, err := patchAPI(apiConfig, force) if err == nil && apiSpec != nil { apiEndpoint, _ := operator.APIEndpoint(apiSpec) @@ -218,7 +228,7 @@ func Patch(configBytes []byte, configFileName string, force bool) ([]schema.Depl return results, nil } -func patchAPI(apiConfig *userconfig.API, configFileName string, force bool) (*spec.API, string, error) { +func patchAPI(apiConfig *userconfig.API, force bool) (*spec.API, string, error) { deployedResource, err := GetDeployedResourceByName(apiConfig.Name) if err != nil { return nil, "", err @@ -262,6 +272,8 @@ func patchAPI(apiConfig *userconfig.API, configFileName string, force bool) (*sp return realtimeapi.UpdateAPI(apiConfig, prevAPISpec.ProjectID, force) case userconfig.BatchAPIKind: return batchapi.UpdateAPI(apiConfig, prevAPISpec.ProjectID) + case userconfig.TaskAPIKind: + return taskapi.UpdateAPI(apiConfig, prevAPISpec.ProjectID) default: return trafficsplitter.UpdateAPI(apiConfig, force) } @@ -305,6 +317,9 @@ func DeleteAPI(apiName string, keepCache bool) (*schema.DeleteResponse, error) { } return nil }, + func() error { + return taskapi.DeleteAPI(apiName, keepCache) + }, ) if err != nil { telemetry.Error(err) @@ -333,6 +348,11 @@ func DeleteAPI(apiName string, keepCache bool) (*schema.DeleteResponse, error) { if err != nil { return nil, err } + case userconfig.TaskAPIKind: + err := 
taskapi.DeleteAPI(apiName, keepCache) + if err != nil { + return nil, err + } default: return nil, ErrorOperationIsOnlySupportedForKind(*deployedResource, userconfig.RealtimeAPIKind, userconfig.BatchAPIKind, userconfig.TrafficSplitterKind) // unexpected } @@ -344,7 +364,8 @@ func DeleteAPI(apiName string, keepCache bool) (*schema.DeleteResponse, error) { func GetAPIs() ([]schema.APIResponse, error) { var deployments []kapps.Deployment - var k8sJobs []kbatch.Job + var k8sBatchJobs []kbatch.Job + var k8sTaskJobs []kbatch.Job var pods []kcore.Pod var virtualServices []istioclientnetworking.VirtualService @@ -361,7 +382,28 @@ func GetAPIs() ([]schema.APIResponse, error) { }, func() error { var err error - k8sJobs, err = config.K8s.ListJobsWithLabelKeys("apiName") + k8sBatchJobs, err = config.K8s.ListJobs( + &kmeta.ListOptions{ + LabelSelector: klabels.SelectorFromSet( + map[string]string{ + "apiKind": userconfig.BatchAPIKind.String(), + }, + ).String(), + }, + ) + return err + }, + func() error { + var err error + k8sTaskJobs, err = config.K8s.ListJobs( + &kmeta.ListOptions{ + LabelSelector: klabels.SelectorFromSet( + map[string]string{ + "apiKind": userconfig.TaskAPIKind.String(), + }, + ).String(), + }, + ) return err }, func() error { @@ -376,16 +418,20 @@ func GetAPIs() ([]schema.APIResponse, error) { realtimeAPIPods := []kcore.Pod{} batchAPIPods := []kcore.Pod{} + taskAPIPods := []kcore.Pod{} for _, pod := range pods { switch pod.Labels["apiKind"] { case userconfig.RealtimeAPIKind.String(): realtimeAPIPods = append(realtimeAPIPods, pod) case userconfig.BatchAPIKind.String(): batchAPIPods = append(batchAPIPods, pod) + case userconfig.TaskAPIKind.String(): + taskAPIPods = append(taskAPIPods, pod) } } var batchAPIVirtualServices []istioclientnetworking.VirtualService + var taskAPIVirtualServices []istioclientnetworking.VirtualService var trafficSplitterVirtualServices []istioclientnetworking.VirtualService for _, vs := range virtualServices { @@ -394,6 +440,8 @@ func 
GetAPIs() ([]schema.APIResponse, error) { batchAPIVirtualServices = append(batchAPIVirtualServices, vs) case userconfig.TrafficSplitterKind.String(): trafficSplitterVirtualServices = append(trafficSplitterVirtualServices, vs) + case userconfig.TaskAPIKind.String(): + taskAPIVirtualServices = append(taskAPIVirtualServices, vs) } } @@ -402,9 +450,15 @@ func GetAPIs() ([]schema.APIResponse, error) { return nil, err } + var taskAPIList []schema.APIResponse + taskAPIList, err = taskapi.GetAllAPIs(taskAPIVirtualServices, k8sTaskJobs, taskAPIPods) + if err != nil { + return nil, err + } + var batchAPIList []schema.APIResponse if config.Provider == types.AWSProviderType { - batchAPIList, err = batchapi.GetAllAPIs(batchAPIVirtualServices, k8sJobs, batchAPIPods) + batchAPIList, err = batchapi.GetAllAPIs(batchAPIVirtualServices, k8sBatchJobs, batchAPIPods) if err != nil { return nil, err } @@ -419,6 +473,7 @@ func GetAPIs() ([]schema.APIResponse, error) { response = append(response, realtimeAPIList...) response = append(response, batchAPIList...) + response = append(response, taskAPIList...) response = append(response, trafficSplitterList...) 
return response, nil @@ -443,6 +498,11 @@ func GetAPI(apiName string) ([]schema.APIResponse, error) { if err != nil { return nil, err } + case userconfig.TaskAPIKind: + apiResponse, err = taskapi.GetAPIByName(deployedResource) + if err != nil { + return nil, err + } case userconfig.TrafficSplitterKind: apiResponse, err = trafficsplitter.GetAPIByName(deployedResource) if err != nil { @@ -471,7 +531,7 @@ func GetAPIByID(apiName string, apiID string) ([]schema.APIResponse, error) { } // search for the API spec with the old ID - spec, err := operator.DownloadAPISpec(apiName, apiID) + apiSpec, err := operator.DownloadAPISpec(apiName, apiID) if err != nil { if aws.IsGenericNotFoundErr(err) { return nil, ErrorAPIIDNotFound(apiName, apiID) @@ -481,7 +541,7 @@ func GetAPIByID(apiName string, apiID string) ([]schema.APIResponse, error) { return []schema.APIResponse{ { - Spec: *spec, + Spec: *apiSpec, }, }, nil } diff --git a/pkg/operator/resources/trafficsplitter/api.go b/pkg/operator/resources/trafficsplitter/api.go index 9d08594134..5b861ef9f4 100644 --- a/pkg/operator/resources/trafficsplitter/api.go +++ b/pkg/operator/resources/trafficsplitter/api.go @@ -38,7 +38,7 @@ func UpdateAPI(apiConfig *userconfig.API, force bool) (*spec.API, string, error) return nil, "", err } - api := spec.GetAPISpec(apiConfig, "", "", config.Cluster.ClusterName) + api := spec.GetAPISpec(apiConfig, "", "", config.ClusterName()) if prevVirtualService == nil { if err := config.AWS.UploadJSONToS3(api, config.Cluster.Bucket, api.Key); err != nil { return nil, "", errors.Wrap(err, "upload api spec") @@ -172,6 +172,6 @@ func deleteK8sResources(apiName string) error { } func deleteS3Resources(apiName string) error { - prefix := filepath.Join(config.Cluster.ClusterName, "apis", apiName) + prefix := filepath.Join(config.ClusterName(), "apis", apiName) return config.AWS.DeleteS3Dir(config.Cluster.Bucket, prefix, true) } diff --git a/pkg/operator/resources/validations.go 
b/pkg/operator/resources/validations.go index 518eaf2720..259ef7b1d2 100644 --- a/pkg/operator/resources/validations.go +++ b/pkg/operator/resources/validations.go @@ -40,11 +40,11 @@ type ProjectFiles struct { } func (projectFiles ProjectFiles) AllPaths() []string { - files := make([]string, 0, len(projectFiles.ProjectByteMap)) + pFiles := make([]string, 0, len(projectFiles.ProjectByteMap)) for path := range projectFiles.ProjectByteMap { - files = append(files, path) + pFiles = append(pFiles, path) } - return files + return pFiles } func (projectFiles ProjectFiles) GetFile(path string) ([]byte, error) { @@ -92,7 +92,7 @@ func ValidateClusterAPIs(apis []userconfig.API, projectFiles spec.ProjectFiles) for i := range apis { api := &apis[i] - if api.Kind == userconfig.RealtimeAPIKind || api.Kind == userconfig.BatchAPIKind { + if api.Kind == userconfig.RealtimeAPIKind || api.Kind == userconfig.BatchAPIKind || api.Kind == userconfig.TaskAPIKind { if err := spec.ValidateAPI(api, nil, projectFiles, config.Provider, config.AWS, config.GCP, config.K8s); err != nil { return errors.Wrap(err, api.Identify()) } @@ -102,7 +102,7 @@ func ValidateClusterAPIs(apis []userconfig.API, projectFiles spec.ProjectFiles) } if api.Kind == userconfig.TrafficSplitterKind { - if err := spec.ValidateTrafficSplitter(api, config.Provider, config.AWS); err != nil { + if err := spec.ValidateTrafficSplitter(api); err != nil { return errors.Wrap(err, api.Identify()) } if err := checkIfAPIExists(api.APIs, realtimeAPIs, deployedRealtimeAPIs); err != nil { diff --git a/pkg/operator/schema/job_submission.go b/pkg/operator/schema/job_submission.go index 8214bfbcb6..7fadf3b163 100644 --- a/pkg/operator/schema/job_submission.go +++ b/pkg/operator/schema/job_submission.go @@ -43,9 +43,13 @@ type DelimitedFiles struct { BatchSize int `json:"batch_size"` } -type JobSubmission struct { - spec.RuntimeJobConfig +type BatchJobSubmission struct { + spec.RuntimeBatchJobConfig ItemList *ItemList `json:"item_list"` 
FilePathLister *FilePathLister `json:"file_path_lister"` DelimitedFiles *DelimitedFiles `json:"delimited_files"` } + +type TaskJobSubmission struct { + spec.RuntimeTaskJobConfig +} diff --git a/pkg/operator/schema/schema.go b/pkg/operator/schema/schema.go index 36077ee55f..0a46cda848 100644 --- a/pkg/operator/schema/schema.go +++ b/pkg/operator/schema/schema.go @@ -53,19 +53,26 @@ type DeployResult struct { } type APIResponse struct { - Spec spec.API `json:"spec"` - Status *status.Status `json:"status,omitempty"` - Metrics *metrics.Metrics `json:"metrics,omitempty"` - Endpoint string `json:"endpoint"` - DashboardURL *string `json:"dashboard_url,omitempty"` - JobStatuses []status.JobStatus `json:"job_statuses,omitempty"` - APIVersions []APIVersion `json:"api_versions,omitempty"` -} - -type JobResponse struct { - APISpec spec.API `json:"api_spec"` - JobStatus status.JobStatus `json:"job_status"` - Endpoint string `json:"endpoint"` + Spec spec.API `json:"spec"` + Status *status.Status `json:"status,omitempty"` + Metrics *metrics.Metrics `json:"metrics,omitempty"` + Endpoint string `json:"endpoint"` + DashboardURL *string `json:"dashboard_url,omitempty"` + BatchJobStatuses []status.BatchJobStatus `json:"batch_job_statuses,omitempty"` + TaskJobStatuses []status.TaskJobStatus `json:"task_job_statuses,omitempty"` + APIVersions []APIVersion `json:"api_versions,omitempty"` +} + +type BatchJobResponse struct { + APISpec spec.API `json:"api_spec"` + JobStatus status.BatchJobStatus `json:"job_status"` + Endpoint string `json:"endpoint"` +} + +type TaskJobResponse struct { + APISpec spec.API `json:"api_spec"` + JobStatus status.TaskJobStatus `json:"job_status"` + Endpoint string `json:"endpoint"` } type DeleteResponse struct { diff --git a/pkg/types/spec/api.go b/pkg/types/spec/api.go index cb09aaa86b..39f98b4a41 100644 --- a/pkg/types/spec/api.go +++ b/pkg/types/spec/api.go @@ -65,6 +65,7 @@ APIID (uniquely identifies an api configuration for a given deployment) * PredictorID 
(used to determine when rolling updates need to happen) * Resource * Predictor + * TaskDefinition * Compute * ProjectID * Deployment Strategy @@ -78,6 +79,7 @@ func GetAPISpec(apiConfig *userconfig.API, projectID string, deploymentID string buf.WriteString(s.Obj(apiConfig.Resource)) buf.WriteString(s.Obj(apiConfig.Predictor)) + buf.WriteString(s.Obj(apiConfig.TaskDefinition)) buf.WriteString(projectID) if apiConfig.Compute != nil { buf.WriteString(s.Obj(apiConfig.Compute.Normalized())) diff --git a/pkg/types/spec/job_spec.go b/pkg/types/spec/job.go similarity index 60% rename from pkg/types/spec/job_spec.go rename to pkg/types/spec/job.go index f32626211c..e50f85822e 100644 --- a/pkg/types/spec/job_spec.go +++ b/pkg/types/spec/job.go @@ -24,25 +24,27 @@ import ( "github.com/cortexlabs/cortex/pkg/consts" s "github.com/cortexlabs/cortex/pkg/lib/strings" + "github.com/cortexlabs/cortex/pkg/types/userconfig" ) type JobKey struct { - ID string `json:"job_id"` - APIName string `json:"api_name"` + ID string `json:"job_id"` + APIName string `json:"api_name"` + Kind userconfig.Kind `json:"kind"` } func (j JobKey) UserString() string { return fmt.Sprintf("%s (%s api)", j.ID, j.APIName) } -// e.g. //jobs////spec.json +// e.g. //jobs/////spec.json func (j JobKey) SpecFilePath(clusterName string) string { return path.Join(j.Prefix(clusterName), "spec.json") } -// e.g. //jobs/// +// e.g. 
//jobs//// func (j JobKey) Prefix(clusterName string) string { - return s.EnsureSuffix(path.Join(BatchAPIJobPrefix(j.APIName, clusterName), j.ID), "/") + return s.EnsureSuffix(path.Join(JobAPIPrefix(clusterName, j.Kind, j.APIName), j.ID), "/") } func (j JobKey) K8sName() string { @@ -54,16 +56,22 @@ type SQSDeadLetterQueue struct { MaxReceiveCount int `json:"max_receive_count"` } -type RuntimeJobConfig struct { +type RuntimeBatchJobConfig struct { Workers int `json:"workers"` SQSDeadLetterQueue *SQSDeadLetterQueue `json:"sqs_dead_letter_queue"` Config map[string]interface{} `json:"config"` Timeout *int `json:"timeout"` } -type Job struct { +type RuntimeTaskJobConfig struct { + Workers int `json:"workers"` + Config map[string]interface{} `json:"config"` + Timeout *int `json:"timeout"` +} + +type BatchJob struct { JobKey - RuntimeJobConfig + RuntimeBatchJobConfig APIID string `json:"api_id"` SpecID string `json:"spec_id"` PredictorID string `json:"predictor_id"` @@ -72,6 +80,16 @@ type Job struct { StartTime time.Time `json:"start_time"` } -func BatchAPIJobPrefix(apiName string, clusterName string) string { - return filepath.Join(clusterName, "jobs", consts.CortexVersion, apiName) +type TaskJob struct { + JobKey + RuntimeTaskJobConfig + APIID string `json:"api_id"` + SpecID string `json:"spec_id"` + PredictorID string `json:"predictor_id"` + StartTime time.Time `json:"start_time"` +} + +// e.g. 
//jobs/// +func JobAPIPrefix(clusterName string, kind userconfig.Kind, apiName string) string { + return filepath.Join(clusterName, "jobs", kind.String(), consts.CortexVersion, apiName) } diff --git a/pkg/types/spec/utils.go b/pkg/types/spec/utils.go index 2ec6a09527..e63bfd0504 100644 --- a/pkg/types/spec/utils.go +++ b/pkg/types/spec/utils.go @@ -142,10 +142,11 @@ func validateDirModels( return nil, err } - modelDirPaths, err = gcpClient.ListGCSPathDir(modelPath, nil) + gcsObjects, err := gcpClient.ListGCSPathDir(modelPath, nil) if err != nil { return nil, err } + modelDirPaths = gcp.ConvertGCSObjectsToKeys(gcsObjects...) } if len(modelDirPaths) == 0 { return nil, errorForPredictorType(dirPrefix, modelDirPaths) @@ -265,10 +266,11 @@ func validateModels( } modelPrefix = s.EnsureSuffix(modelPrefix, "/") - modelPaths, err = gcpClient.ListGCSPathDir(modelPath, nil) + gcsObjects, err := gcpClient.ListGCSPathDir(modelPath, nil) if err != nil { return nil, errors.Wrap(err, model.Name) } + modelPaths = gcp.ConvertGCSObjectsToKeys(gcsObjects...) 
} if len(modelPaths) == 0 { return nil, errors.Wrap(errorForPredictorType(modelPrefix, modelPaths), model.Name) diff --git a/pkg/types/spec/validations.go b/pkg/types/spec/validations.go index 1478805576..54d2d6ca08 100644 --- a/pkg/types/spec/validations.go +++ b/pkg/types/spec/validations.go @@ -56,8 +56,8 @@ func apiValidation( awsClusterConfig *clusterconfig.Config, gcpClusterConfig *clusterconfig.GCPConfig, ) *cr.StructValidation { + var structFieldValidations []*cr.StructFieldValidation - structFieldValidations := []*cr.StructFieldValidation{} switch resource.Kind { case userconfig.RealtimeAPIKind: structFieldValidations = append(resourceStructValidations, @@ -73,6 +73,12 @@ func apiValidation( networkingValidation(resource.Kind, provider, awsClusterConfig, gcpClusterConfig), computeValidation(provider), ) + case userconfig.TaskAPIKind: + structFieldValidations = append(resourceStructValidations, + taskDefinitionValidation(), + networkingValidation(resource.Kind, provider, awsClusterConfig, gcpClusterConfig), + computeValidation(provider), + ) case userconfig.TrafficSplitterKind: structFieldValidations = append(resourceStructValidations, multiAPIsValidation(), @@ -245,6 +251,73 @@ func predictorValidation() *cr.StructFieldValidation { } } +func taskDefinitionValidation() *cr.StructFieldValidation { + return &cr.StructFieldValidation{ + StructField: "TaskDefinition", + StructValidation: &cr.StructValidation{ + Required: true, + StructFieldValidations: []*cr.StructFieldValidation{ + { + StructField: "Path", + StringValidation: &cr.StringValidation{ + Required: true, + }, + }, + { + StructField: "PythonPath", + StringPtrValidation: &cr.StringPtrValidation{ + AllowEmpty: false, + DisallowedValues: []string{".", "./", "./."}, + Validator: func(path string) (string, error) { + if files.IsAbsOrTildePrefixed(path) { + return "", ErrorMustBeRelativeProjectPath(path) + } + path = strings.TrimPrefix(path, "./") + path = s.EnsureSuffix(path, "/") + return path, nil + }, 
+ }, + }, + { + StructField: "Image", + StringValidation: &cr.StringValidation{ + Required: false, + AllowEmpty: true, + DockerImageOrEmpty: true, + }, + }, + { + StructField: "LogLevel", + StringValidation: &cr.StringValidation{ + Default: "info", + AllowedValues: userconfig.LogLevelTypes(), + }, + Parser: func(str string) (interface{}, error) { + return userconfig.LogLevelFromString(str), nil + }, + }, + { + StructField: "Config", + InterfaceMapValidation: &cr.InterfaceMapValidation{ + StringKeysOnly: true, + AllowEmpty: true, + AllowExplicitNull: true, + ConvertNullToEmpty: true, + Default: map[string]interface{}{}, + }, + }, + { + StructField: "Env", + StringMapValidation: &cr.StringMapValidation{ + Default: map[string]string{}, + AllowEmpty: true, + }, + }, + }, + }, + } +} + func networkingValidation( kind userconfig.Kind, provider types.ProviderType, @@ -673,7 +746,8 @@ func ExtractAPIConfigs( return nil, errors.Append(err, fmt.Sprintf("\n\napi configuration schema can be found at https://docs.cortex.dev/v/%s/", consts.CortexVersionMinor)) } - if resourceStruct.Kind == userconfig.BatchAPIKind || resourceStruct.Kind == userconfig.TrafficSplitterKind { + if resourceStruct.Kind == userconfig.BatchAPIKind || + resourceStruct.Kind == userconfig.TrafficSplitterKind { if provider == types.GCPProviderType { return nil, errors.Wrap(ErrorKindIsNotSupportedByProvider(resourceStruct.Kind, provider), userconfig.IdentifyAPI(configFileName, resourceStruct.Name, resourceStruct.Kind, i)) } @@ -697,7 +771,9 @@ func ExtractAPIConfigs( api.SubmittedAPISpec = interfaceMap - if resourceStruct.Kind == userconfig.RealtimeAPIKind || resourceStruct.Kind == userconfig.BatchAPIKind { + if resourceStruct.Kind == userconfig.RealtimeAPIKind || + resourceStruct.Kind == userconfig.BatchAPIKind || + resourceStruct.Kind == userconfig.TaskAPIKind { api.ApplyDefaultDockerPaths() } @@ -726,8 +802,15 @@ func ValidateAPI( api.Networking.Endpoint = pointer.String("/" + api.Name) } - if err := 
validatePredictor(api, models, projectFiles, provider, awsClient, gcpClient, k8sClient); err != nil { - return errors.Wrap(err, userconfig.PredictorKey) + switch api.Kind { + case userconfig.TaskAPIKind: + if err := validateTaskDefinition(api, projectFiles, provider, awsClient, k8sClient); err != nil { + return errors.Wrap(err, userconfig.TaskDefinitionKey) + } + default: + if err := validatePredictor(api, models, projectFiles, provider, awsClient, gcpClient, k8sClient); err != nil { + return errors.Wrap(err, userconfig.PredictorKey) + } } if api.Autoscaling != nil { @@ -755,12 +838,42 @@ func ValidateAPI( return nil } -func ValidateTrafficSplitter( +func validateTaskDefinition( api *userconfig.API, + projectFiles ProjectFiles, provider types.ProviderType, awsClient *aws.Client, + k8sClient *k8s.Client, ) error { + taskDefinition := api.TaskDefinition + + if err := validateDockerImagePath(taskDefinition.Image, provider, awsClient, k8sClient); err != nil { + return errors.Wrap(err, userconfig.ImageKey) + } + + for key := range taskDefinition.Env { + if strings.HasPrefix(key, "CORTEX_") { + return errors.Wrap(ErrorCortexPrefixedEnvVarNotAllowed(), userconfig.EnvKey, key) + } + } + + if !projectFiles.HasFile(taskDefinition.Path) { + return errors.Wrap(files.ErrorFileDoesNotExist(taskDefinition.Path), userconfig.PathKey) + } + + if taskDefinition.PythonPath != nil { + if !projectFiles.HasDir(*taskDefinition.PythonPath) { + return errors.Wrap( + ErrorPythonPathNotFound(*taskDefinition.PythonPath), + userconfig.PythonPathKey, + ) + } + } + + return nil +} +func ValidateTrafficSplitter(api *userconfig.API) error { if api.Networking.Endpoint == nil { api.Networking.Endpoint = pointer.String("/" + api.Name) } diff --git a/pkg/types/status/job_status.go b/pkg/types/status/job_status.go index 32ad349962..06936b7514 100644 --- a/pkg/types/status/job_status.go +++ b/pkg/types/status/job_status.go @@ -23,11 +23,18 @@ import ( 
"github.com/cortexlabs/cortex/pkg/types/spec" ) -type JobStatus struct { - spec.Job +type BatchJobStatus struct { + spec.BatchJob EndTime *time.Time `json:"end_time"` Status JobCode `json:"status"` BatchesInQueue int `json:"batches_in_queue"` BatchMetrics *metrics.BatchMetrics `json:"batch_metrics"` WorkerCounts *WorkerCounts `json:"worker_counts"` } + +type TaskJobStatus struct { + spec.TaskJob + EndTime *time.Time `json:"end_time"` + Status JobCode `json:"status"` + WorkerCounts *WorkerCounts `json:"worker_counts"` +} diff --git a/pkg/types/userconfig/api.go b/pkg/types/userconfig/api.go index 2fa367a84c..8039172acf 100644 --- a/pkg/types/userconfig/api.go +++ b/pkg/types/userconfig/api.go @@ -34,6 +34,7 @@ type API struct { Resource APIs []*TrafficSplit `json:"apis" yaml:"apis"` Predictor *Predictor `json:"predictor" yaml:"predictor"` + TaskDefinition *TaskDefinition `json:"definition" yaml:"definition"` Networking *Networking `json:"networking" yaml:"networking"` Compute *Compute `json:"compute" yaml:"compute"` Autoscaling *Autoscaling `json:"autoscaling" yaml:"autoscaling"` @@ -62,6 +63,15 @@ type Predictor struct { Env map[string]string `json:"env" yaml:"env"` } +type TaskDefinition struct { + Path string `json:"path" yaml:"path"` + PythonPath *string `json:"python_path" yaml:"python_path"` + Image string `json:"image" yaml:"image"` + LogLevel LogLevel `json:"log_level" yaml:"log_level"` + Config map[string]interface{} `json:"config" yaml:"config"` + Env map[string]string `json:"env" yaml:"env"` +} + type MultiModels struct { Path *string `json:"path" yaml:"path"` Paths []*ModelResource `json:"paths" yaml:"paths"` @@ -134,6 +144,15 @@ func (api *API) ApplyDefaultDockerPaths() { usesGPU := api.Compute.GPU > 0 usesInf := api.Compute.Inf > 0 + switch api.Kind { + case RealtimeAPIKind, BatchAPIKind: + api.applyPredictorDefaultDockerPaths(usesGPU, usesInf) + case TaskAPIKind: + api.applyTaskDefaultDockerPaths(usesGPU, usesInf) + } +} + +func 
(api *API) applyPredictorDefaultDockerPaths(usesGPU, usesInf bool) { predictor := api.Predictor switch predictor.Type { case PythonPredictorType: @@ -170,6 +189,19 @@ func (api *API) ApplyDefaultDockerPaths() { } } +func (api *API) applyTaskDefaultDockerPaths(usesGPU, usesInf bool) { + task := api.TaskDefinition + if task.Image == "" { + if usesGPU { + task.Image = consts.DefaultImagePythonPredictorGPU + } else if usesInf { + task.Image = consts.DefaultImagePythonPredictorInf + } else { + task.Image = consts.DefaultImagePythonPredictorCPU + } + } +} + func IdentifyAPI(filePath string, name string, kind Kind, index int) string { str := "" @@ -301,6 +333,11 @@ func (api *API) UserStr(provider types.ProviderType) string { } } + if api.TaskDefinition != nil { + sb.WriteString(fmt.Sprintf("%s:\n", TaskDefinitionKey)) + sb.WriteString(s.Indent(api.TaskDefinition.UserStr(), " ")) + } + if api.Predictor != nil { sb.WriteString(fmt.Sprintf("%s:\n", PredictorKey)) sb.WriteString(s.Indent(api.Predictor.UserStr(), " ")) @@ -336,6 +373,29 @@ func (trafficSplit *TrafficSplit) UserStr() string { return sb.String() } +func (task *TaskDefinition) UserStr() string { + var sb strings.Builder + + sb.WriteString(fmt.Sprintf("%s: %s\n", PathKey, task.Path)) + if task.PythonPath != nil { + sb.WriteString(fmt.Sprintf("%s: %s\n", PythonPathKey, *task.PythonPath)) + } + sb.WriteString(fmt.Sprintf("%s: %s\n", ImageKey, task.Image)) + sb.WriteString(fmt.Sprintf("%s: %s\n", LogLevelKey, task.LogLevel)) + if len(task.Config) > 0 { + sb.WriteString(fmt.Sprintf("%s:\n", ConfigKey)) + d, _ := yaml.Marshal(&task.Config) + sb.WriteString(s.Indent(string(d), " ")) + } + if len(task.Env) > 0 { + sb.WriteString(fmt.Sprintf("%s:\n", EnvKey)) + d, _ := yaml.Marshal(&task.Env) + sb.WriteString(s.Indent(string(d), " ")) + } + + return sb.String() +} + func (predictor *Predictor) UserStr() string { var sb strings.Builder diff --git a/pkg/types/userconfig/config_key.go b/pkg/types/userconfig/config_key.go 
index 4d9ad4dda0..dccf186a53 100644 --- a/pkg/types/userconfig/config_key.go +++ b/pkg/types/userconfig/config_key.go @@ -21,6 +21,7 @@ const ( NameKey = "name" KindKey = "kind" PredictorKey = "predictor" + TaskDefinitionKey = "definition" NetworkingKey = "networking" ComputeKey = "compute" AutoscalingKey = "autoscaling" diff --git a/pkg/types/userconfig/kind.go b/pkg/types/userconfig/kind.go index 2459bc01cb..e2d9ded10c 100644 --- a/pkg/types/userconfig/kind.go +++ b/pkg/types/userconfig/kind.go @@ -23,6 +23,7 @@ const ( RealtimeAPIKind BatchAPIKind TrafficSplitterKind + TaskAPIKind ) var _kinds = []string{ @@ -30,6 +31,7 @@ var _kinds = []string{ "RealtimeAPI", "BatchAPI", "TrafficSplitter", + "TaskAPI", } func KindFromString(s string) Kind { diff --git a/test/apis/task/cortex.yaml b/test/apis/task/cortex.yaml new file mode 100644 index 0000000000..209a847c6c --- /dev/null +++ b/test/apis/task/cortex.yaml @@ -0,0 +1,7 @@ +- name: trainer + kind: TaskAPI + definition: + path: task.py + compute: + cpu: 200m + mem: 500Mi diff --git a/test/apis/task/requirements.txt b/test/apis/task/requirements.txt new file mode 100644 index 0000000000..bbc213cf3e --- /dev/null +++ b/test/apis/task/requirements.txt @@ -0,0 +1,2 @@ +boto3 +scikit-learn==0.21.3 diff --git a/test/apis/task/task.py b/test/apis/task/task.py new file mode 100644 index 0000000000..ad3cb1ec15 --- /dev/null +++ b/test/apis/task/task.py @@ -0,0 +1,28 @@ +import os + +import boto3, pickle +from sklearn.datasets import load_iris +from sklearn.model_selection import train_test_split +from sklearn.linear_model import LogisticRegression + + +class Task: + def __call__(self, config): + # get iris flower dataset + iris = load_iris() + data, labels = iris.data, iris.target + training_data, test_data, training_labels, test_labels = train_test_split(data, labels) + + # train the model + model = LogisticRegression(solver="lbfgs", multi_class="multinomial") + model.fit(training_data, training_labels) + accuracy = 
model.score(test_data, test_labels) + print("accuracy: {:.2f}".format(accuracy)) + + # upload the model if so specified + if config.get("upload_model"): + s3_filepath = config["dest_s3_dir"] + bucket, key = s3_filepath.replace("s3://", "").split("/", 1) + pickle.dump(model, open("model.pkl", "wb")) + s3 = boto3.client("s3") + s3.upload_file("model.pkl", bucket, os.path.join(key, "model.pkl"))