diff --git a/README.md b/README.md index 280b95c..9573343 100644 --- a/README.md +++ b/README.md @@ -113,11 +113,11 @@ Hyperflow provides two key Helm charts: To run a sample workflow on a clean Kubernetes cluster, you should do the following: - Install the `hyperflow-ops` chart ``` -helm upgrade --dependency-update -i hf-ops hyperflow-ops +helm upgrade --dependency-update -i hf-ops charts/hyperflow-ops ``` - Install the `hyperflow-run` chart (prefarably in a separate namespace) ``` -helm upgrade --dependency-update -i hf-run-montage hyperflow-run +helm upgrade --dependency-update -i hf-run-montage charts/hyperflow-run ``` - Once all pods are up and running or completed, you can manually run the workflow as follows: ``` diff --git a/charts/hyperflow-engine/templates/deployment.yml b/charts/hyperflow-engine/templates/deployment.yml index ba81f0c..59a1c9b 100644 --- a/charts/hyperflow-engine/templates/deployment.yml +++ b/charts/hyperflow-engine/templates/deployment.yml @@ -64,9 +64,9 @@ spec: - name: HF_VAR_ENABLE_TRACING value: "false" - name: HF_VAR_ENABLE_OTEL - value: "false" + value: "1" - name: HF_VAR_OPT_URL - value: nil + value: "http://hf-obs-opentelemetry-collector" - name: HF_VAR_function # The source of this function can be found here # https://github.com/hyperflow-wms/hyperflow/blob/master/functions/kubernetes/k8sCommand.js diff --git a/charts/hyperflow-observability/Chart.yaml b/charts/hyperflow-observability/Chart.yaml new file mode 100644 index 0000000..2871c2a --- /dev/null +++ b/charts/hyperflow-observability/Chart.yaml @@ -0,0 +1,20 @@ +apiVersion: v2 +name: hyperflow-observability +description: Helm chart to deploy observability stack +type: application +version: 0.1.0 +appVersion: "1.0" + +dependencies: + - name: opensearch + version: "2.34.0" + repository: https://opensearch-project.github.io/helm-charts/ + - name: opensearch-dashboards + version: "2.30.0" + repository: https://opensearch-project.github.io/helm-charts/ + - name: data-prepper + version: "0.3.1" + repository: https://opensearch-project.github.io/helm-charts/ + - name: opentelemetry-collector + version: "0.126.0" + repository: https://open-telemetry.github.io/opentelemetry-helm-charts diff --git a/charts/hyperflow-observability/README.md b/charts/hyperflow-observability/README.md new file mode 100644 index 0000000..46055f9 --- /dev/null +++ b/charts/hyperflow-observability/README.md @@ -0,0 +1,23 @@ +# HyperFlow K8S monitoring + +``` +helm upgrade --dependency-update -i hf-obs charts/hyperflow-observability +``` + +## Open opensearch dashboards + +``` +kubectl port-forward svc/hf-obs-opensearch-dashboards 5601:5601 +``` + +Navigate to +http://localhost:5601/ + +Go to Dashboards Management -> Index Patterns + +create index patterns +- hyperflow_traces +- hyperflow_metrics +- hyperflow_logs + +Go to Discover and choose one of new index patterns as source diff --git a/charts/hyperflow-observability/templates/metric-rules.yaml b/charts/hyperflow-observability/templates/metric-rules.yaml new file mode 100644 index 0000000..a0aa704 --- /dev/null +++ b/charts/hyperflow-observability/templates/metric-rules.yaml @@ -0,0 +1,40 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: hyperflow-rules + labels: + app: kube-prometheus-stack +spec: + groups: + - name: hyperflow-deployment-metrics + interval: 1s + rules: + - record: hyperflow_deployment_status_replicas_available + expr: | + kube_deployment_status_replicas_available + * on(namespace, deployment) group_left(label_origin) + kube_deployment_labels{label_origin="hyperflow"} + - name: node_cpu_usage + interval: 5s + rules: + - record: node_cpu_usage_percent + expr: | + 100 * ( + sum by (node) ( + rate(container_cpu_usage_seconds_total{container!=""}[1m]) + ) + / + sum by (node) ( + kube_node_status_allocatable{resource="cpu", unit="core"} + ) + ) + - name: node_memory_usage + interval: 5s + rules: + - record: node_memory_usage_percent + expr: | + ( + sum(container_memory_working_set_bytes) by (node) + / + sum(kube_node_status_allocatable{resource="memory"}) by (node) + ) * 100 \ No newline at end of file diff --git a/charts/hyperflow-observability/values.yaml b/charts/hyperflow-observability/values.yaml new file mode 100644 index 0000000..6051aca --- /dev/null +++ b/charts/hyperflow-observability/values.yaml @@ -0,0 +1,220 @@ +opensearch: + replicas: 1 + + config: + opensearch.yml: | + cluster.name: opensearch-cluster + network.host: 0.0.0.0 + plugins: + security: + disabled: true + extraEnvs: + - name: OPENSEARCH_JAVA_OPTS + value: "-Xms512m -Xmx512m" + - name: OPENSEARCH_INITIAL_ADMIN_PASSWORD + value: "Hyperflow1!" + +opensearch-dashboards: + opensearchHosts: "http://opensearch-cluster-master:9200" + + extraEnvs: + - name: DISABLE_SECURITY_DASHBOARDS_PLUGIN + value: "true" + + resources: + requests: + cpu: "200m" + memory: 0.5Gi + limits: + cpu: "1" + memory: 3Gi + +data-prepper: + pipelineConfig: + enabled: true + config: + entry-pipeline: + delay: "100" + source: + otel_trace_source: + ssl: false + sink: + - pipeline: + name: "raw-pipeline" + - pipeline: + name: "service-map-pipeline" + raw-pipeline: + source: + pipeline: + name: "entry-pipeline" + processor: + - otel_trace_raw: + sink: + - opensearch: + hosts: [ "http://opensearch-cluster-master:9200" ] + insecure: true + username: admin + password: "Hyperflow1!" + index_type: custom + index: hyperflow_traces + service-map-pipeline: + delay: "100" + source: + pipeline: + name: "entry-pipeline" + processor: + - service_map_stateful: + sink: + - opensearch: + hosts: [ "http://opensearch-cluster-master:9200" ] + insecure: true + username: admin + password: "Hyperflow1!" + index_type: trace-analytics-service-map + + metrics-pipeline: + source: + otel_metrics_source: + ssl: false + sink: + - opensearch: + hosts: [ "http://opensearch-cluster-master:9200" ] + insecure: true + username: admin + password: "Hyperflow1!" + index_type: custom + index: hyperflow_metrics + + logs-pipeline: + source: + otel_logs_source: + ssl: false + sink: + - opensearch: + hosts: [ "http://opensearch-cluster-master:9200" ] + insecure: true + username: admin + password: "Hyperflow1!" + index: hyperflow_logs + +opentelemetry-collector: + mode: "statefulset" + + image: + repository: "otel/opentelemetry-collector" + tag: "0.123.0" + + command: + name: "otelcol" + + resources: + requests: + cpu: 1 + memory: 5Gi + limits: + cpu: 2 + memory: 5Gi + + config: + extensions: + health_check: + endpoint: 0.0.0.0:13133 + pprof: + endpoint: 0.0.0.0:1777 + zpages: + endpoint: 0.0.0.0:55679 + + receivers: + otlp: + protocols: + http: + endpoint: 0.0.0.0:4318 + prometheus: + config: + scrape_configs: + - job_name: "kube-state-metrics" + scrape_interval: 1s + metrics_path: /federate + honor_labels: true + params: + match[]: + - '{label_origin="hyperflow"}' + static_configs: + - targets: [ "monitoring-prometheus:9090" ] + metric_relabel_configs: + - source_labels: [ __name__ ] + regex: "kube_deployment_labels" + action: drop + - job_name: "cpu-by-node" + scrape_interval: 5s + metrics_path: /federate + honor_labels: true + params: + match[]: + - 'node_cpu_usage_percent' + static_configs: + - targets: [ "monitoring-prometheus:9090" ] + - job_name: "memory-by-node" + scrape_interval: 5s + metrics_path: /federate + honor_labels: true + params: + match[]: + - 'node_memory_usage_percent' + static_configs: + - targets: [ "monitoring-prometheus:9090" ] + - job_name: "rabbitmq-exporter" + scrape_interval: 1s + static_configs: + - targets: [ "hf-ops-prometheus-rabbitmq-exporter:9419" ] + metric_relabel_configs: + - source_labels: [ __name__ ] + regex: "rabbitmq_queue_messages_ready" + action: keep + + processors: + batch: { } + filter: + metrics: + exclude: + match_type: regexp + metric_names: + - "up" + - "scrape_.*" + + + exporters: + otlp/traces: + endpoint: hf-obs-data-prepper:21890 + tls: + insecure: true + insecure_skip_verify: true + otlp/metrics: + endpoint: hf-obs-data-prepper:21891 + tls: + insecure: true + insecure_skip_verify: true + otlp/logs: + endpoint: hf-obs-data-prepper:21892 + tls: + insecure: true + insecure_skip_verify: true + debug: + verbosity: detailed + + service: + pipelines: + traces: + receivers: [ otlp ] + processors: [ batch ] + exporters: [ debug, otlp/traces ] + metrics: + receivers: [ otlp, prometheus ] + processors: [ batch, filter ] + exporters: [ debug, otlp/metrics ] + logs: + receivers: [ otlp ] + processors: [ batch ] + exporters: [ debug, otlp/logs ] + + extensions: [ health_check, pprof, zpages ] \ No newline at end of file diff --git a/charts/hyperflow-ops/values.yaml b/charts/hyperflow-ops/values.yaml index 8bb0511..c8d0aa6 100644 --- a/charts/hyperflow-ops/values.yaml +++ b/charts/hyperflow-ops/values.yaml @@ -6,7 +6,7 @@ worker-pools: enable-rabbitmq: &enable-rabbit-mq true enable-kube-prometheus-stack: &enable-kube-prometheus-stack true enable-alert-manager: &enable-alert-manager false - enable-grafana: &enable-grafana true + enable-grafana: &enable-grafana false enable-prometheus-operator: &enable-prometheus-operator true enable-prometheus: &enable-prometheus true diff --git a/charts/hyperflow-run/values.yaml b/charts/hyperflow-run/values.yaml index b562833..6f7a36a 100644 --- a/charts/hyperflow-run/values.yaml +++ b/charts/hyperflow-run/values.yaml @@ -173,6 +173,8 @@ hyperflow-engine: value: "${enableTracing}" - name: HF_VAR_ENABLE_OTEL value: "${enableOtel}" + - name: HF_VAR_OPT_URL + value: "http://hf-obs-opentelemetry-collector" - name: HF_VAR_OT_PARENT_ID value: "${optParentId}" - name: HF_VAR_OT_TRACE_ID @@ -197,6 +199,26 @@ hyperflow-engine: valueFrom: fieldRef: fieldPath: spec.serviceAccountName + - name: HF_LOG_CPU_REQUEST + valueFrom: + resourceFieldRef: + containerName: test + resource: requests.cpu + - name: HF_LOG_CPU_LIMIT + valueFrom: + resourceFieldRef: + containerName: test + resource: limits.cpu + - name: HF_LOG_MEM_REQUEST + valueFrom: + resourceFieldRef: + containerName: test + resource: requests.memory + - name: HF_LOG_MEM_LIMIT + valueFrom: + resourceFieldRef: + containerName: test + resource: limits.memory - name: HF_VAR_FS_MONIT_ENABLED value: "0" - name: HF_VAR_FS_MONIT_COMMAND diff --git a/charts/hyperflow-worker-pool-operator/values.yaml b/charts/hyperflow-worker-pool-operator/values.yaml index d08a3df..c5d58ef 100644 --- a/charts/hyperflow-worker-pool-operator/values.yaml +++ b/charts/hyperflow-worker-pool-operator/values.yaml @@ -10,6 +10,7 @@ config: namespace: {namespace} labels: app: {poolName} + origin: hyperflow spec: replicas: {minReplicas} selector: @@ -47,6 +48,10 @@ config: value: "0" - name: HF_VAR_NUM_RETRIES value: "1" + - name: HF_VAR_ENABLE_OTEL + value: "1" + - name: HF_VAR_OPT_URL + value: "http://hf-obs-opentelemetry-collector" - name: HF_LOG_NODE_NAME valueFrom: fieldRef: @@ -293,6 +298,8 @@ kube-prometheus-stack: rbac: create: true releaseLabel: true + metricLabelsAllowlist: + - deployments=[origin,app] prometheus: monitor: enabled: true diff --git a/values/monitoring/README.md b/values/monitoring/README.md deleted file mode 100644 index 826e3d2..0000000 --- a/values/monitoring/README.md +++ /dev/null @@ -1,32 +0,0 @@ -# HyperFlow K8S monitoring - - -## Run opensearch stack -``` -helm install opensearch -f ./opensearch/opensearch-values.yaml opensearch/opensearch -helm install opensearch-dashboards -f ./opensearch/dashboards-values.yaml opensearch/opensearch-dashboards -helm install dataprepper -f ./opensearch/dataprepper-values.yaml opensearch/data-prepper -``` - -## Run otel stack -``` -helm install opentelemetry-collector -f ./otel/collector-values.yaml open-telemetry/opentelemetry-collector -``` - -## Open opensearch dashboards - -``` -kubectl port-forward svc/opensearch-dashboards 5601:5601 -``` - -Navigate to -http://localhost:5601/ - -Go to Dashboards Management -> Index Patterns - -create index patterns -- hyperflow_traces -- hyperflow_metrics -- hyperflow_logs - -Go to Discover and choose one of new index patterns as source diff --git a/values/monitoring/opensearch/dashboards-values.yaml b/values/monitoring/opensearch/dashboards-values.yaml deleted file mode 100644 index e7fd038..0000000 --- a/values/monitoring/opensearch/dashboards-values.yaml +++ /dev/null @@ -1,13 +0,0 @@ -opensearchHosts: "http://opensearch-cluster-master:9200" - -extraEnvs: - - name: DISABLE_SECURITY_DASHBOARDS_PLUGIN - value: "true" - -resources: - requests: - cpu: "500m" - memory: 1Gi - limits: - cpu: "500m" - memory: 1Gi diff --git a/values/monitoring/opensearch/dataprepper-values.yaml b/values/monitoring/opensearch/dataprepper-values.yaml deleted file mode 100644 index 66a4bf1..0000000 --- a/values/monitoring/opensearch/dataprepper-values.yaml +++ /dev/null @@ -1,66 +0,0 @@ -pipelineConfig: - enabled: true - config: - entry-pipeline: - delay: "100" - source: - otel_trace_source: - ssl: false - sink: - - pipeline: - name: "raw-pipeline" - - pipeline: - name: "service-map-pipeline" - raw-pipeline: - source: - pipeline: - name: "entry-pipeline" - processor: - - otel_trace_raw: - sink: - - opensearch: - hosts: [ "http://opensearch-cluster-master:9200" ] - insecure: true - username: admin - password: "Hyperflow1!" - index_type: custom - index: hyperflow_traces - service-map-pipeline: - delay: "100" - source: - pipeline: - name: "entry-pipeline" - processor: - - service_map_stateful: - sink: - - opensearch: - hosts: [ "http://opensearch-cluster-master:9200" ] - insecure: true - username: admin - password: "Hyperflow1!" - index_type: trace-analytics-service-map - - metrics-pipeline: - source: - otel_metrics_source: - ssl: false - sink: - - opensearch: - hosts: [ "http://opensearch-cluster-master:9200" ] - insecure: true - username: admin - password: "Hyperflow1!" - index_type: custom - index: hyperflow_metrics - - logs-pipeline: - source: - otel_logs_source: - ssl: false - sink: - - opensearch: - hosts: [ "http://opensearch-cluster-master:9200" ] - insecure: true - username: admin - password: "Hyperflow1!" - index: hyperflow_logs diff --git a/values/monitoring/opensearch/opensearch-values.yaml b/values/monitoring/opensearch/opensearch-values.yaml deleted file mode 100644 index 811c135..0000000 --- a/values/monitoring/opensearch/opensearch-values.yaml +++ /dev/null @@ -1,15 +0,0 @@ -replicas: 1 - -config: - opensearch.yml: | - cluster.name: opensearch-cluster - network.host: 0.0.0.0 - plugins: - security: - disabled: true - -extraEnvs: - - name: OPENSEARCH_JAVA_OPTS - value: "-Xms512m -Xmx512m" - - name: OPENSEARCH_INITIAL_ADMIN_PASSWORD - value: "Hyperflow1!" \ No newline at end of file diff --git a/values/monitoring/otel/collector-values.yaml b/values/monitoring/otel/collector-values.yaml deleted file mode 100644 index e2777b8..0000000 --- a/values/monitoring/otel/collector-values.yaml +++ /dev/null @@ -1,63 +0,0 @@ -mode: "statefulset" - -image: - repository: "otel/opentelemetry-collector" - -command: - name: "otelcol" - -config: - extensions: - health_check: - endpoint: 0.0.0.0:13133 - pprof: - endpoint: 0.0.0.0:1777 - zpages: - endpoint: 0.0.0.0:55679 - - receivers: - otlp: - protocols: - grpc: - endpoint: 0.0.0.0:4317 - http: - endpoint: 0.0.0.0:4318 - - processors: - batch: {} - - exporters: - otlp/traces: - endpoint: dataprepper-data-prepper:21890 - tls: - insecure: true - insecure_skip_verify: true - otlp/metrics: - endpoint: dataprepper-data-prepper:21891 - tls: - insecure: true - insecure_skip_verify: true - otlp/logs: - endpoint: dataprepper-data-prepper:21892 - tls: - insecure: true - insecure_skip_verify: true - debug: - verbosity: detailed - - service: - pipelines: - traces: - receivers: [ otlp ] - processors: [ batch ] - exporters: [ debug, otlp/traces ] - metrics: - receivers: [ otlp ] - processors: [ batch ] - exporters: [ debug, otlp/metrics ] - logs: - receivers: [ otlp ] - processors: [ batch ] - exporters: [ debug, otlp/logs ] - - extensions: [ health_check, pprof, zpages ] \ No newline at end of file