Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 5 additions & 4 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

## Unreleased
### New features
- Add trace to cpuevents to display the payload of network flows. ([#442](https://github.com/KindlingProject/kindling/pull/442))
- Support Attach Agent for NoAPM Java Application. ([#431](https://github.com/KindlingProject/kindling/pull/431))

### Enhancements
Expand Down Expand Up @@ -38,9 +39,9 @@
- Add the missing timestamp of TCP connect data and filter the incorrect one without srcPort.([#405](https://github.com/KindlingProject/kindling/pull/405))
- Fix the bug that multiple events cannot be correlated when they are in one ON-CPU data. ([#395](https://github.com/KindlingProject/kindling/pull/395))
- Add the missed latency field for `cgoEvent` to fix the bug where the `request_sent_time` in `single_net_request_metric_group` is always 0. ([#394](https://github.com/KindlingProject/kindling/pull/394))
- Fix http-100 request is detected as NOSUPPORT([393](https://github.com/KindlingProject/kindling/pull/393))
- Fix the wrong thread name in the trace profiling function. ([#385])(https://github.com/KindlingProject/kindling/pull/385)
- Remove "reset" method of ScheduledTaskRoutine to fix a potential dead-lock issue. ([#369])(https://github.com/KindlingProject/kindling/pull/369)
- Fix http-100 request is detected as NOSUPPORT([#393](https://github.com/KindlingProject/kindling/pull/393))
- Fix the wrong thread name in the trace profiling function. ([#385](https://github.com/KindlingProject/kindling/pull/385))
- Remove "reset" method of ScheduledTaskRoutine to fix a potential dead-lock issue. ([#369](https://github.com/KindlingProject/kindling/pull/369))
- Fix the bug where the pod metadata with persistent IP in the map is deleted incorrectly due to the deleting mechanism with a delay. ([#374](https://github.com/KindlingProject/kindling/pull/374))
- Fix the bug that when the response is nil, the NAT IP and port are not added to the labels of the "DataGroup". ([#378](https://github.com/KindlingProject/kindling/pull/378))
- Fix potential deadlock of exited thread delay queue. ([#373](https://github.com/KindlingProject/kindling/pull/373))
Expand Down Expand Up @@ -124,7 +125,7 @@

## v0.2.0 - 2022-05-07
### Features
- Provide a kindling Prometheus exporter that can support integration with Prometheus easily. See kindling's metrics from the kindling website[http://kindling.harmonycloud.cn/docs/usage/prometheus-metric/]
- Provide a kindling Prometheus exporter that can support integration with Prometheus easily. See kindling's metrics from the kindling [website](http://kindling.harmonycloud.cn/docs/usage/prometheus-metric/).
- Support network performance, DNS performance, service network maps, and workload performance analysis.
- Support HTTP, MySQL, and REDIS request analysis.
- Provide a Grafana-plugin with four built-in dashboards to support basic analysis features.
Expand Down
2 changes: 1 addition & 1 deletion collector/docker/kindling-collector-config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ analyzers:
networkanalyzer:
connect_timeout: 100
# How many seconds to wait until we consider a request as complete.
fd_reuse_timeout: 15
fd_reuse_timeout: 2
# How many seconds to wait until we consider a request as no response.
no_response_threshold: 120
# How many milliseconds to wait until we consider a request-response as slow.
Expand Down
2 changes: 1 addition & 1 deletion collector/internal/application/application.go
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ func (a *Application) Shutdown() error {

func (a *Application) registerFactory() {
a.componentsFactory.RegisterReceiver(cgoreceiver.Cgo, cgoreceiver.NewCgoReceiver, &cgoreceiver.Config{})
a.componentsFactory.RegisterAnalyzer(network.Network.String(), network.NewNetworkAnalyzer, &network.Config{})
a.componentsFactory.RegisterAnalyzer(network.Network.String(), network.NewNetworkAnalyzer, network.NewDefaultConfig())
a.componentsFactory.RegisterAnalyzer(cpuanalyzer.CpuProfile.String(), cpuanalyzer.NewCpuAnalyzer, cpuanalyzer.NewDefaultConfig())
a.componentsFactory.RegisterProcessor(k8sprocessor.K8sMetadata, k8sprocessor.NewKubernetesProcessor, &k8sprocessor.DefaultConfig)
a.componentsFactory.RegisterExporter(otelexporter.Otel, otelexporter.NewExporter, &otelexporter.Config{})
Expand Down
12 changes: 12 additions & 0 deletions collector/pkg/component/analyzer/cpuanalyzer/cpu_analyzer.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ import (
"github.com/Kindling-project/kindling/collector/pkg/component/analyzer"
"github.com/Kindling-project/kindling/collector/pkg/component/consumer"
"github.com/Kindling-project/kindling/collector/pkg/model"
"github.com/Kindling-project/kindling/collector/pkg/model/constlabels"
"github.com/Kindling-project/kindling/collector/pkg/model/constnames"
)

Expand Down Expand Up @@ -130,6 +131,17 @@ func (ca *CpuAnalyzer) ConsumeSpanEvent(event *model.KindlingEvent) {
ca.PutEventToSegments(event.GetPid(), event.Ctx.ThreadInfo.GetTid(), event.Ctx.ThreadInfo.Comm, ev)
}

// ConsumeTraces stores a finished trace as an InnerCall timed event, attaching
// it to the time segments of the thread that issued the request. The thread is
// identified by the RequestTid and Comm labels carried on the trace's DataGroup.
func (ca *CpuAnalyzer) ConsumeTraces(trace SendTriggerEvent) {
	labels := trace.OriginalData.Labels
	requestTid := labels.GetIntValue(constlabels.RequestTid)
	comm := labels.GetStringValue(constlabels.Comm)
	call := &InnerCall{
		StartTime: trace.StartTime,
		EndTime:   trace.StartTime + trace.SpendTime,
		Trace:     trace.OriginalData,
	}
	ca.PutEventToSegments(trace.Pid, uint32(requestTid), comm, call)
}

func (ca *CpuAnalyzer) ConsumeCpuEvent(event *model.KindlingEvent) {
ev := new(CpuEvent)
for i := 0; i < int(event.ParamsNumber); i++ {
Expand Down
30 changes: 29 additions & 1 deletion collector/pkg/component/analyzer/cpuanalyzer/model.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,15 @@ const (
TimedJavaFutexEventKind
TimedTransactionIdEventKind
TimedApmSpanEventKind
TimedInnerCallEventKind
)

const (
CpuEventLabel = "cpuEvents"
JavaFutexEventLabel = "javaFutexEvents"
TransactionIdEventLabel = "transactionIds"
SpanLabel = "spans"
InnerCallLabel = "innerCalls"
)

type TimedEvent interface {
Expand Down Expand Up @@ -49,6 +51,7 @@ type Segment struct {
JavaFutexEvents []TimedEvent `json:"javaFutexEvents"`
TransactionIds []TimedEvent `json:"transactionIds"`
Spans []TimedEvent `json:"spans"`
InnerCalls []TimedEvent `json:"innerCalls"`
IsSend int
IndexTimestamp string `json:"indexTimestamp"`
}
Expand All @@ -61,6 +64,7 @@ func newSegment(startTime uint64, endTime uint64) *Segment {
JavaFutexEvents: make([]TimedEvent, 0),
TransactionIds: make([]TimedEvent, 0),
Spans: make([]TimedEvent, 0),
InnerCalls: make([]TimedEvent, 0),
IsSend: 0,
IndexTimestamp: "",
}
Expand All @@ -76,6 +80,8 @@ func (s *Segment) putTimedEvent(event TimedEvent) {
s.TransactionIds = append(s.TransactionIds, event)
case TimedApmSpanEventKind:
s.Spans = append(s.Spans, event)
case TimedInnerCallEventKind:
s.InnerCalls = append(s.InnerCalls, event)
}
}

Expand Down Expand Up @@ -103,6 +109,10 @@ func (s *Segment) toDataGroup(parent *TimeSegments) *model.DataGroup {
if err == nil {
labels.AddStringValue(SpanLabel, string(spanEventString))
}
innerCallString, err := json.Marshal(s.InnerCalls)
if err == nil {
labels.AddStringValue(InnerCallLabel, string(innerCallString))
}
return model.NewDataGroup(constnames.CameraEventGroupName, labels, s.StartTime)
}

Expand Down Expand Up @@ -183,4 +193,22 @@ func (j *ApmSpanEvent) EndTimestamp() uint64 {

func (j *ApmSpanEvent) Kind() TimedEventKind {
return TimedApmSpanEventKind
}
}

// InnerCall is a TimedEvent that wraps a completed trace (its original
// DataGroup) so it can be stored alongside the other timed events of the
// thread that produced it. Timestamps are uint64 values in the same units
// as the trace's StartTime/SpendTime fields.
type InnerCall struct {
	StartTime uint64           `json:"startTime"` // when the call began
	EndTime   uint64           `json:"endTime"`   // when the call finished
	Trace     *model.DataGroup `json:"trace"`     // the original trace data this event was built from
}

// StartTimestamp implements TimedEvent.
func (c *InnerCall) StartTimestamp() uint64 {
	return c.StartTime
}

// EndTimestamp implements TimedEvent.
func (c *InnerCall) EndTimestamp() uint64 {
	return c.EndTime
}

// Kind implements TimedEvent; the TimedInnerCallEventKind value routes
// InnerCall events into the Segment's InnerCalls bucket.
func (c *InnerCall) Kind() TimedEventKind {
	return TimedInnerCallEventKind
}
14 changes: 13 additions & 1 deletion collector/pkg/component/analyzer/cpuanalyzer/send_trigger.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ func ReceiveDataGroupAsSignal(data *model.DataGroup) {
})
return
}
if data.Labels.GetBoolValue(constlabels.IsSlow) {
if data.Labels.GetBoolValue(constlabels.IsSlow) || data.Labels.GetBoolValue(constlabels.IsError) {
duration, ok := data.GetMetric(constvalues.RequestTotalTime)
if !ok {
return
Expand All @@ -54,6 +54,18 @@ type SendTriggerEvent struct {
func (ca *CpuAnalyzer) ReceiveSendSignal() {
// Break the for loop if the channel is closed
for sendContent := range sendChannel {
// CpuAnalyzer consumes all traces from the client-side to add them to TimeSegments
// These traces are not considered as signals, so we skip them here. Note they won't
// be consumed by the following consumers.
if !sendContent.OriginalData.Labels.GetBoolValue(constlabels.IsServer) {
ca.ConsumeTraces(sendContent)
continue
}
// Only send the slow traces as the signals
if !sendContent.OriginalData.Labels.GetBoolValue(constlabels.IsSlow) {
continue
}
// Store the traces first
for _, nexConsumer := range ca.nextConsumers {
_ = nexConsumer.Consume(sendContent.OriginalData)
}
Expand Down
2 changes: 1 addition & 1 deletion deploy/agent/kindling-collector-config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ analyzers:
networkanalyzer:
connect_timeout: 100
# How many seconds to wait until we consider a request as complete.
fd_reuse_timeout: 15
fd_reuse_timeout: 2
# How many seconds to wait until we consider a request as no response.
no_response_threshold: 120
# How many milliseconds to wait until we consider a request-response as slow.
Expand Down