diff --git a/headers/vmlinux/vmlinux_net.h b/headers/vmlinux/vmlinux_net.h index 64b26212..0d5ffc0b 100644 --- a/headers/vmlinux/vmlinux_net.h +++ b/headers/vmlinux/vmlinux_net.h @@ -3,6 +3,15 @@ typedef __u32 __wsum; +typedef struct { + struct net *net; +} possible_net_t; + +struct net_device { + int ifindex; + possible_net_t nd_net; +}; + typedef unsigned int sk_buff_data_t; // Assumes 64-bit. FIXME see below /* // BITS_PER_LONG can be wrong with -target bpf @@ -17,6 +26,22 @@ typedef unsigned char *sk_buff_data_t; #endif */ +struct sk_buff_list { + struct sk_buff *next; + struct sk_buff *prev; +}; + +struct sk_buff_head { + union { + struct { + struct sk_buff *next; + struct sk_buff *prev; + }; + struct sk_buff_list list; + }; + __u32 qlen; +}; + struct sk_buff { union { struct { @@ -147,7 +172,28 @@ enum ip_conntrack_status { }; struct scm_timestamping_internal { - struct timespec64 ts[3]; + struct timespec64 ts[3]; +}; + +struct ns_common { + struct dentry *stashed; + unsigned int inum; +}; + +struct net { + struct ns_common ns; +}; + +struct sock_common { + possible_net_t skc_net; +}; + +struct sock { + struct sock_common __sk_common; + struct sk_buff_head sk_receive_queue; + struct dst_entry *sk_rx_dst; + int sk_rx_dst_ifindex; + u32 sk_rx_dst_cookie; }; #endif /* __VMLINUX_NET_H__ */ diff --git a/netstacklat/fill_filter_maps.sh b/netstacklat/fill_filter_maps.sh new file mode 100755 index 00000000..3b7f7ccc --- /dev/null +++ b/netstacklat/fill_filter_maps.sh @@ -0,0 +1,131 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0-or-later + +declare -rA bpf_maps=( + [pid]="netstack_pidfil" + [iface]="netstack_ifinde" + [cgroup]="netstack_cgroup" +) + +declare -rA key_converters=( + [pid]=pid_to_bpftool + [iface]=iface_to_bpftool + [cgroup]=cgroup_to_bpftool +) + +print_usage() +{ + echo "usage: $0 TYPE val1 [val2 val3 val4...]" + echo "TYPE: { $(echo "${!bpf_maps[@]}" | tr ' ' '\|') }" +} + +pid_to_bpftool() +{ + local val="$1" + + uint_to_bpftool_u32 "$val" +} + +# Supports ifname or ifindex +iface_to_bpftool() +{ + local val="$1" + + if ! is_uint "$val"; then + val="$(ifname_to_idx "$val")" + fi + + uint_to_bpftool_u32 "$val" +} + +# Supports full cgroup path or direct cgroup id (inode) +cgroup_to_bpftool() +{ + local val="$1" + + if ! is_uint "$val"; then + val="$(cgroup_path_to_id "$val")" + fi + + uint_to_bpftool_u64 "$val" +} + +is_uint() +{ + local val="$1" + + [[ "$val" == +([0-9]) ]] +} + +ifname_to_idx() +{ + local ifname="$1" + local ifindex=0 + + ifindex="$(ip address show "$ifname" | grep "[0-9]*: ${ifname}:")" + ifindex="${ifindex%%:*}" + + if [[ -z "$ifindex" ]]; then + return 1 + fi + + echo "$ifindex" +} + +cgroup_path_to_id() +{ + local cpath="$1" + + stat -L -c '%i' "$(realpath "$cpath")" +} + +# When providing keys/values to bpftool map update, it basically wants one +# argument for each byte in the key/value. So if you have a u32 key (as in any +# array map) and you want to update key 1234, then you will have to provide +# key 0xd2 0x04 0x00 0x00 (1234 in hex split up as the 4 bytes in a u32 in +# little-endian order). These helpers assume you're on a little endian machine. 
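+#
+# For example, inserting key 1234 into the pid filter map ends up running a
+# command along the lines of:
+#
+#   bpftool map update name netstack_pidfil key 0xd2 0x04 0x00 0x00 value 1
+#
+# (0xd2 0x04 0x00 0x00 being 1234 split into its 4 little-endian bytes)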
+uint_to_bpftool_u32()
+{
+	local val="$1"
+
+	printf "0x%02x 0x%02x 0x%02x 0x%02x\n" \
+		$((val & 0xff)) $(((val >> 8) & 0xff)) $(((val >> 16) & 0xff)) $(((val >> 24) & 0xff))
+}
+
+uint_to_bpftool_u64()
+{
+	local val="$1"
+
+	printf "0x%02x 0x%02x 0x%02x 0x%02x 0x%02x 0x%02x 0x%02x 0x%02x\n" \
+		$((val & 0xff)) $(((val >> 8) & 0xff)) $(((val >> 16) & 0xff)) $(((val >> 24) & 0xff)) \
+		$(((val >> 32) & 0xff)) $(((val >> 40) & 0xff)) $(((val >> 48) & 0xff)) $(((val >> 56) & 0xff))
+}
+
+# All the filter maps use a u8 as the value, so just set that single byte to 1
+add_to_filter_map()
+{
+	local map="$1"
+	local key="$2"
+
+	bpftool map update name "$map" key $key value 1
+}
+
+if (( $# < 2 )); then
+	print_usage
+	exit 1
+fi
+
+type=$1
+if [[ -z "${bpf_maps[$type]}" ]]; then
+	echo "Error: unrecognized type $type, must be one of: ${!bpf_maps[*]}"
+	exit 1
+fi
+
+map=${bpf_maps[$type]}
+converter=${key_converters[$type]}
+
+for val in "${@:2}"; do
+	key=$($converter "$val")
+	if ! add_to_filter_map "$map" "$key"; then
+		echo "Error adding $val ($key) to map $map"
+		exit 1
+	fi
+done
diff --git a/netstacklat/netstacklat.bpf.c b/netstacklat/netstacklat.bpf.c
index 513ddb73..f889acea 100644
--- a/netstacklat/netstacklat.bpf.c
+++ b/netstacklat/netstacklat.bpf.c
@@ -14,7 +14,13 @@ char LICENSE[] SEC("license") = "GPL";
 volatile const __s64 TAI_OFFSET = (37LL * NS_PER_S);
 
 volatile const struct netstacklat_bpf_config user_config = {
+	.network_ns = 0,
 	.filter_pid = false,
+	.filter_ifindex = false,
+	.filter_cgroup = false,
+	.filter_nonempty_sockqueue = false,
+	.groupby_ifindex = false,
+	.groupby_cgroup = false,
 };
 
 /*
@@ -30,74 +36,47 @@ struct sk_buff___old {
 	__u8 mono_delivery_time: 1;
 } __attribute__((preserve_access_index));
 
-/*
- * To be compatible with ebpf-exporter, all histograms need a key struct whose final
- * member is named "bucket" and is the histogram bucket index.
- * As we store the histograms in array maps, the key type for each array map
- * below has to be a u32 (and not a struct), but as this struct consists of a
- * single u32 member we can still use a pointer to the hist_key struct in
- * lookup-functions, and the u32 bucket index will implicitly be mapped to the
- * array map index.
- */ -struct hist_key { - u32 bucket; -}; - -struct { - __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); - __uint(max_entries, HIST_NBUCKETS); - __type(key, u32); - __type(value, u64); -} netstack_latency_ip_start_seconds SEC(".maps"); - struct { - __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); - __uint(max_entries, HIST_NBUCKETS); - __type(key, u32); + __uint(type, BPF_MAP_TYPE_PERCPU_HASH); + __uint(max_entries, HIST_NBUCKETS * NETSTACKLAT_N_HOOKS * 64); + __type(key, struct hist_key); __type(value, u64); -} netstack_latency_tcp_start_seconds SEC(".maps"); +} netstack_latency_seconds SEC(".maps"); struct { - __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); - __uint(max_entries, HIST_NBUCKETS); + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, PID_MAX_LIMIT); __type(key, u32); - __type(value, u64); -} netstack_latency_udp_start_seconds SEC(".maps"); + __type(value, u8); +} netstack_pidfilter SEC(".maps"); struct { - __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); - __uint(max_entries, HIST_NBUCKETS); + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, IFINDEX_MAX); __type(key, u32); - __type(value, u64); -} netstack_latency_tcp_sock_enqueued_seconds SEC(".maps"); + __type(value, u8); +} netstack_ifindexfilter SEC(".maps"); struct { - __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); - __uint(max_entries, HIST_NBUCKETS); - __type(key, u32); - __type(value, u64); -} netstack_latency_udp_sock_enqueued_seconds SEC(".maps"); + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, MAX_TRACKED_CGROUPS); + __type(key, u64); + __type(value, u8); +} netstack_cgroupfilter SEC(".maps"); -struct { - __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); - __uint(max_entries, HIST_NBUCKETS); - __type(key, u32); - __type(value, u64); -} netstack_latency_tcp_sock_read_seconds SEC(".maps"); +static u64 *lookup_or_zeroinit_histentry(void *map, const struct hist_key *key) +{ + u64 zero = 0; + u64 *val; -struct { - __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); - __uint(max_entries, HIST_NBUCKETS); - __type(key, u32); - __type(value, u64); -} netstack_latency_udp_sock_read_seconds SEC(".maps"); + val = bpf_map_lookup_elem(map, key); + if (val) + return val; -struct { - __uint(type, BPF_MAP_TYPE_ARRAY); - __uint(max_entries, PID_MAX_LIMIT); - __type(key, u32); - __type(value, u8); -} netstack_pidfilter SEC(".maps"); + // Key not in map - try insert it and lookup again + bpf_map_update_elem(map, key, &zero, BPF_NOEXIST); + return bpf_map_lookup_elem(map, key); +} static u32 get_exp2_histogram_bucket_idx(u64 value, u32 max_bucket) { @@ -130,7 +109,7 @@ static void increment_exp2_histogram_nosync(void *map, struct hist_key key, // Increment histogram key.bucket = get_exp2_histogram_bucket_idx(value, max_bucket); - bucket_count = bpf_map_lookup_elem(map, &key); + bucket_count = lookup_or_zeroinit_histentry(map, &key); if (bucket_count) (*bucket_count)++; @@ -139,33 +118,11 @@ static void increment_exp2_histogram_nosync(void *map, struct hist_key key, return; key.bucket = max_bucket + 1; - bucket_count = bpf_map_lookup_elem(map, &key); + bucket_count = lookup_or_zeroinit_histentry(map, &key); if (bucket_count) *bucket_count += value; } -static void *hook_to_histmap(enum netstacklat_hook hook) -{ - switch (hook) { - case NETSTACKLAT_HOOK_IP_RCV: - return &netstack_latency_ip_start_seconds; - case NETSTACKLAT_HOOK_TCP_START: - return &netstack_latency_tcp_start_seconds; - case NETSTACKLAT_HOOK_UDP_START: - return &netstack_latency_udp_start_seconds; - case NETSTACKLAT_HOOK_TCP_SOCK_ENQUEUED: - return &netstack_latency_tcp_sock_enqueued_seconds; - case 
NETSTACKLAT_HOOK_UDP_SOCK_ENQUEUED: - return &netstack_latency_udp_sock_enqueued_seconds; - case NETSTACKLAT_HOOK_TCP_SOCK_READ: - return &netstack_latency_tcp_sock_read_seconds; - case NETSTACKLAT_HOOK_UDP_SOCK_READ: - return &netstack_latency_udp_sock_read_seconds; - default: - return NULL; - } -} - static ktime_t time_since(ktime_t tstamp) { ktime_t now; @@ -180,22 +137,60 @@ static ktime_t time_since(ktime_t tstamp) return now - tstamp; } -static void record_latency(ktime_t latency, enum netstacklat_hook hook) +static void record_latency(ktime_t latency, const struct hist_key *key) { - struct hist_key key = { 0 }; - increment_exp2_histogram_nosync(hook_to_histmap(hook), key, latency, + increment_exp2_histogram_nosync(&netstack_latency_seconds, *key, latency, HIST_MAX_LATENCY_SLOT); } -static void record_latency_since(ktime_t tstamp, enum netstacklat_hook hook) +static void record_latency_since(ktime_t tstamp, const struct hist_key *key) { ktime_t latency = time_since(tstamp); if (latency >= 0) - record_latency(latency, hook); + record_latency(latency, key); +} + +static bool filter_ifindex(u32 ifindex) +{ + u8 *ifindex_ok; + + if (!user_config.filter_ifindex) + // No ifindex filter - all ok + return true; + + ifindex_ok = bpf_map_lookup_elem(&netstack_ifindexfilter, &ifindex); + if (!ifindex_ok) + return false; + + return *ifindex_ok > 0; } -static void record_skb_latency(struct sk_buff *skb, enum netstacklat_hook hook) +static bool filter_network_ns(u32 ns) { + if (user_config.network_ns == 0) + return true; + + return ns == user_config.network_ns; +} + +static __u64 get_network_ns(struct sk_buff *skb, struct sock *sk) +{ + /* + * Favor reading from sk due to less redirection (fewer probe reads) + * and skb->dev is not always set. + */ + if (sk) + return BPF_CORE_READ(sk->__sk_common.skc_net.net, ns.inum); + else if (skb) + return BPF_CORE_READ(skb->dev, nd_net.net, ns.inum); + return 0; +} + +static void record_skb_latency(struct sk_buff *skb, struct sock *sk, enum netstacklat_hook hook) +{ + struct hist_key key = { .hook = hook }; + u32 ifindex; + if (bpf_core_field_exists(skb->tstamp_type)) { /* * For kernels >= v6.11 the tstamp_type being non-zero @@ -219,7 +214,17 @@ static void record_skb_latency(struct sk_buff *skb, enum netstacklat_hook hook) return; } - record_latency_since(skb->tstamp, hook); + ifindex = skb->skb_iif; + if (!filter_ifindex(ifindex)) + return; + + if (!filter_network_ns(get_network_ns(skb, sk))) + return; + + if (user_config.groupby_ifindex) + key.ifindex = ifindex; + + record_latency_since(skb->tstamp, &key); } static bool filter_pid(u32 pid) @@ -237,31 +242,88 @@ static bool filter_pid(u32 pid) return *pid_ok > 0; } -static bool filter_current_task(void) +static bool filter_cgroup(u64 cgroup_id) { + if (!user_config.filter_cgroup) + // No cgroup filter - all cgroups ok + return true; + + return bpf_map_lookup_elem(&netstack_cgroupfilter, &cgroup_id) != NULL; +} + +static bool filter_current_task(u64 cgroup) +{ + bool ok = true; __u32 tgid; - if (!user_config.filter_pid) + if (user_config.filter_pid) { + tgid = bpf_get_current_pid_tgid() >> 32; + ok = ok && filter_pid(tgid); + } + + if (user_config.filter_cgroup) + ok = ok && filter_cgroup(cgroup); + + return ok; +} + +/** + * skb_queue_empty - check if a queue is empty + * @list: queue head + * + * Returns true if the queue is empty, false otherwise. 
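+ *
+ * An empty sk_buff_head points back at itself (head->next == head), so a
+ * single pointer comparison is enough - no NULL check is needed.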
+ * + * Copied from /include/linux/skbuff.h + */ +static inline int skb_queue_empty(const struct sk_buff_head *list) +{ + return list->next == (const struct sk_buff *)list; +} + +static bool filter_nonempty_sockqueue(struct sock *sk) +{ + if (!user_config.filter_nonempty_sockqueue) return true; - tgid = bpf_get_current_pid_tgid() >> 32; - return filter_pid(tgid); + return !skb_queue_empty(&sk->sk_receive_queue); } -static void record_socket_latency(struct sock *sk, ktime_t tstamp, - enum netstacklat_hook hook) +static void record_socket_latency(struct sock *sk, struct sk_buff *skb, + ktime_t tstamp, enum netstacklat_hook hook) { - if (!filter_current_task()) + struct hist_key key = { .hook = hook }; + u64 cgroup = 0; + u32 ifindex; + + if (!filter_nonempty_sockqueue(sk)) return; - record_latency_since(tstamp, hook); + if (user_config.filter_cgroup || user_config.groupby_cgroup) + cgroup = bpf_get_current_cgroup_id(); + + if (!filter_current_task(cgroup)) + return; + + ifindex = skb ? skb->skb_iif : sk->sk_rx_dst_ifindex; + if (!filter_ifindex(ifindex)) + return; + + if (!filter_network_ns(get_network_ns(skb, sk))) + return; + + if (user_config.groupby_ifindex) + key.ifindex = ifindex; + if (user_config.groupby_cgroup) + key.cgroup = cgroup; + + record_latency_since(tstamp, &key); } SEC("fentry/ip_rcv_core") int BPF_PROG(netstacklat_ip_rcv_core, struct sk_buff *skb, void *block, void *tp, void *res, bool compat_mode) { - record_skb_latency(skb, NETSTACKLAT_HOOK_IP_RCV); + record_skb_latency(skb, NULL, NETSTACKLAT_HOOK_IP_RCV); return 0; } @@ -269,58 +331,51 @@ SEC("fentry/ip6_rcv_core") int BPF_PROG(netstacklat_ip6_rcv_core, struct sk_buff *skb, void *block, void *tp, void *res, bool compat_mode) { - record_skb_latency(skb, NETSTACKLAT_HOOK_IP_RCV); + record_skb_latency(skb, NULL, NETSTACKLAT_HOOK_IP_RCV); return 0; } SEC("fentry/tcp_v4_rcv") int BPF_PROG(netstacklat_tcp_v4_rcv, struct sk_buff *skb) { - record_skb_latency(skb, NETSTACKLAT_HOOK_TCP_START); + record_skb_latency(skb, NULL, NETSTACKLAT_HOOK_TCP_START); return 0; } SEC("fentry/tcp_v6_rcv") int BPF_PROG(netstacklat_tcp_v6_rcv, struct sk_buff *skb) { - record_skb_latency(skb, NETSTACKLAT_HOOK_TCP_START); + record_skb_latency(skb, NULL, NETSTACKLAT_HOOK_TCP_START); return 0; } SEC("fentry/udp_rcv") int BPF_PROG(netstacklat_udp_rcv, struct sk_buff *skb) { - record_skb_latency(skb, NETSTACKLAT_HOOK_UDP_START); + record_skb_latency(skb, NULL, NETSTACKLAT_HOOK_UDP_START); return 0; } SEC("fentry/udpv6_rcv") int BPF_PROG(netstacklat_udpv6_rcv, struct sk_buff *skb) { - record_skb_latency(skb, NETSTACKLAT_HOOK_UDP_START); - return 0; -} - -SEC("fexit/tcp_data_queue") -int BPF_PROG(netstacklat_tcp_data_queue, struct sock *sk, struct sk_buff *skb) -{ - record_skb_latency(skb, NETSTACKLAT_HOOK_TCP_SOCK_ENQUEUED); + record_skb_latency(skb, NULL, NETSTACKLAT_HOOK_UDP_START); return 0; } -SEC("fexit/udp_queue_rcv_one_skb") -int BPF_PROG(netstacklat_udp_queue_rcv_one_skb, struct sock *sk, - struct sk_buff *skb) +SEC("fexit/tcp_queue_rcv") +int BPF_PROG(netstacklat_tcp_queue_rcv, struct sock *sk, struct sk_buff *skb) { - record_skb_latency(skb, NETSTACKLAT_HOOK_UDP_SOCK_ENQUEUED); + record_skb_latency(skb, sk, NETSTACKLAT_HOOK_TCP_SOCK_ENQUEUED); return 0; } -SEC("fexit/udpv6_queue_rcv_one_skb") -int BPF_PROG(netstacklat_udpv6_queue_rcv_one_skb, struct sock *sk, - struct sk_buff *skb) +SEC("fexit/__udp_enqueue_schedule_skb") +int BPF_PROG(netstacklat_udp_enqueue_schedule_skb, struct sock *sk, + struct sk_buff *skb, int retval) { - 
record_skb_latency(skb, NETSTACKLAT_HOOK_UDP_SOCK_ENQUEUED); + if (retval == 0) + record_skb_latency(skb, sk, NETSTACKLAT_HOOK_UDP_SOCK_ENQUEUED); return 0; } @@ -329,7 +384,8 @@ int BPF_PROG(netstacklat_tcp_recv_timestamp, void *msg, struct sock *sk, struct scm_timestamping_internal *tss) { struct timespec64 *ts = &tss->ts[0]; - record_socket_latency(sk, (ktime_t)ts->tv_sec * NS_PER_S + ts->tv_nsec, + record_socket_latency(sk, NULL, + (ktime_t)ts->tv_sec * NS_PER_S + ts->tv_nsec, NETSTACKLAT_HOOK_TCP_SOCK_READ); return 0; } @@ -338,6 +394,7 @@ SEC("fentry/skb_consume_udp") int BPF_PROG(netstacklat_skb_consume_udp, struct sock *sk, struct sk_buff *skb, int len) { - record_socket_latency(sk, skb->tstamp, NETSTACKLAT_HOOK_UDP_SOCK_READ); + record_socket_latency(sk, skb, skb->tstamp, + NETSTACKLAT_HOOK_UDP_SOCK_READ); return 0; } diff --git a/netstacklat/netstacklat.c b/netstacklat/netstacklat.c index 80567689..773f1ebd 100644 --- a/netstacklat/netstacklat.c +++ b/netstacklat/netstacklat.c @@ -2,6 +2,7 @@ static const char *__doc__ = "Netstacklat - Monitor latency to various points in the ingress network stack"; +#define _GNU_SOURCE // to get name_to_handle_at #include #include #include @@ -10,11 +11,14 @@ static const char *__doc__ = #include #include #include +#include +#include #include #include #include #include #include +#include #include #include @@ -44,31 +48,57 @@ static const char *__doc__ = #define MAX_BUCKETCOUNT_STRLEN 10 #define MAX_BAR_STRLEN (80 - 6 - MAX_BUCKETSPAN_STRLEN - MAX_BUCKETCOUNT_STRLEN) +#define LOOKUP_BATCH_SIZE 128 + #define MAX_HOOK_PROGS 4 -// Maximum number of different pids that can be filtered for -#define MAX_FILTER_PIDS 4096 +// Maximum number of PIDs to read from user +#define MAX_PARSED_PIDS 4096 +#define MAX_PARSED_IFACES 4096 + +typedef int (*t_parse_val_func)(const char *, void *); struct hook_prog_collection { struct bpf_program *progs[MAX_HOOK_PROGS]; int nprogs; }; +struct histogram_entry { + struct hist_key key; + __u64 *buckets; +}; + +struct histogram_buffer { + struct histogram_entry *hists; + size_t max_size; + size_t current_size; +}; + struct netstacklat_config { struct netstacklat_bpf_config bpf_conf; double report_interval_s; bool enabled_hooks[NETSTACKLAT_N_HOOKS]; int npids; - __u32 pids[MAX_FILTER_PIDS]; + int nifindices; + int ncgroups; + __u32 *pids; + __u32 *ifindices; + __u64 *cgroups; }; static const struct option long_options[] = { - { "help", no_argument, NULL, 'h' }, - { "report-interval", required_argument, NULL, 'r' }, - { "list-probes", no_argument, NULL, 'l' }, - { "enable-probes", required_argument, NULL, 'e' }, - { "disable-probes", required_argument, NULL, 'd' }, - { "pids", required_argument, NULL, 'p' }, + { "help", no_argument, NULL, 'h' }, + { "report-interval", required_argument, NULL, 'r' }, + { "list-probes", no_argument, NULL, 'l' }, + { "enable-probes", required_argument, NULL, 'e' }, + { "disable-probes", required_argument, NULL, 'd' }, + { "pids", required_argument, NULL, 'p' }, + { "interfaces", required_argument, NULL, 'i' }, + { "network-namespace", required_argument, NULL, 'n' }, + { "cgroups", required_argument, NULL, 'c' }, + { "nonempty-queue", no_argument, NULL, 'q' }, + { "groupby-interface", no_argument, NULL, 'I' }, + { "groupby-cgroup", no_argument, NULL, 'C' }, { 0, 0, 0, 0 } }; @@ -201,35 +231,6 @@ static const char *hook_to_description(enum netstacklat_hook hook) } } -static int hook_to_histmap(enum netstacklat_hook hook, - const struct netstacklat_bpf *obj) -{ - switch (hook) { - case 
NETSTACKLAT_HOOK_IP_RCV: - return bpf_map__fd(obj->maps.netstack_latency_ip_start_seconds); - case NETSTACKLAT_HOOK_TCP_START: - return bpf_map__fd( - obj->maps.netstack_latency_tcp_start_seconds); - case NETSTACKLAT_HOOK_UDP_START: - return bpf_map__fd( - obj->maps.netstack_latency_udp_start_seconds); - case NETSTACKLAT_HOOK_TCP_SOCK_ENQUEUED: - return bpf_map__fd( - obj->maps.netstack_latency_tcp_sock_enqueued_seconds); - case NETSTACKLAT_HOOK_UDP_SOCK_ENQUEUED: - return bpf_map__fd( - obj->maps.netstack_latency_udp_sock_enqueued_seconds); - case NETSTACKLAT_HOOK_TCP_SOCK_READ: - return bpf_map__fd( - obj->maps.netstack_latency_tcp_sock_read_seconds); - case NETSTACKLAT_HOOK_UDP_SOCK_READ: - return bpf_map__fd( - obj->maps.netstack_latency_udp_sock_read_seconds); - default: - return -EINVAL; - } -} - static void hook_to_progs(struct hook_prog_collection *progs, enum netstacklat_hook hook, const struct netstacklat_bpf *obj) @@ -251,14 +252,13 @@ static void hook_to_progs(struct hook_prog_collection *progs, progs->nprogs = 2; break; case NETSTACKLAT_HOOK_TCP_SOCK_ENQUEUED: - progs->progs[0] = obj->progs.netstacklat_tcp_data_queue; + progs->progs[0] = obj->progs.netstacklat_tcp_queue_rcv; progs->nprogs = 1; break; case NETSTACKLAT_HOOK_UDP_SOCK_ENQUEUED: - progs->progs[0] = obj->progs.netstacklat_udp_queue_rcv_one_skb; - progs->progs[1] = - obj->progs.netstacklat_udpv6_queue_rcv_one_skb; - progs->nprogs = 2; + progs->progs[0] = + obj->progs.netstacklat_udp_enqueue_schedule_skb; + progs->nprogs = 1; break; case NETSTACKLAT_HOOK_TCP_SOCK_READ: progs->progs[0] = obj->progs.netstacklat_tcp_recv_timestamp; @@ -284,6 +284,18 @@ static void list_hooks(FILE *stream) hook_to_description(hook)); } +static long long get_current_network_ns(void) +{ + struct stat ns_stat; + int err; + + err = stat("/proc/self/ns/net", &ns_stat); + if (err) + return -errno; + + return ns_stat.st_ino; +} + static int parse_bounded_double(double *res, const char *str, double low, double high, const char *name) { @@ -335,72 +347,206 @@ static int parse_bounded_long(long long *res, const char *str, long long low, return 0; } +static int parse_strlist_to_arr(const char *_str, void *arr, size_t nelem, + size_t elem_size, const char *delim, + t_parse_val_func parse_func) +{ + char *tokstr, *str; + char *saveptr = NULL; + int err = 0, i = 0; + + str = malloc(strlen(_str) + 1); + if (!str) + return -ENOMEM; + strcpy(str, _str); + + tokstr = strtok_r(str, delim, &saveptr); + while (tokstr && i < nelem) { + err = parse_func(tokstr, (char *)arr + i * elem_size); + if (err) + goto exit; + + tokstr = strtok_r(NULL, delim, &saveptr); + i++; + } + + if (tokstr) + // Parsed size values, but more still remain + err = -E2BIG; + +exit: + free(str); + return err ?: i; +} + +int parse_hook(const char *str, void *hookout) +{ + enum netstacklat_hook hook; + + hook = str_to_hook(str); + if (hook == NETSTACKLAT_HOOK_INVALID) { + fprintf(stderr, "%s is not a valid hook\n", str); + return -EINVAL; + } + + *(enum netstacklat_hook *)hookout = hook; + return 0; +} + /* * Parses a comma-delimited string of hook-names, and sets the positions for * the hooks that appear in the string to true. 
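 * E.g. "tcp-start,udp-start" (hook names as printed by --list-probes)
 * enables exactly those two hooks; all hooks not listed are set to false.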
*/ -static int parse_hooks(bool hooks[NETSTACKLAT_N_HOOKS], const char *_str) +static int parse_hooks(bool hooks[NETSTACKLAT_N_HOOKS], const char *str) { - enum netstacklat_hook hook; - char *tokp = NULL; - char str[1024]; - char *hookstr; - int i; + enum netstacklat_hook ehooks[NETSTACKLAT_N_HOOKS * 2]; + int len, i; + + len = parse_strlist_to_arr(str, ehooks, ARRAY_SIZE(ehooks), + sizeof(*ehooks), ",", parse_hook); + if (len < 0) + return len; for (i = 0; i < NETSTACKLAT_N_HOOKS; i++) hooks[i] = false; - if (strlen(_str) >= sizeof(str)) - return -E2BIG; - strcpy(str, _str); + for (i = 0; i < len; i++) + hooks[ehooks[i]] = true; - hookstr = strtok_r(str, ",", &tokp); - while (hookstr) { - hook = str_to_hook(hookstr); - if (hook == NETSTACKLAT_HOOK_INVALID) { - fprintf(stderr, "%s is not a valid hook\n", hookstr); - return -EINVAL; - } + return 0; +} - hooks[hook] = true; +static int parse_pid(const char *str, void *pidout) +{ + long long lval; + int err; - hookstr = strtok_r(NULL, ",", &tokp); - } + err = parse_bounded_long(&lval, str, 1, PID_MAX_LIMIT, "pid"); + if (err) + return err; + *(__u32 *)pidout = lval; return 0; } -static int parse_pids(size_t size, __u32 arr[size], const char *_str, - const char *name) +static int parse_pids(size_t size, __u32 arr[size], const char *str) { - char *pidstr, *str; - char *tokp = NULL; - int err, i = 0; - long long val; + return parse_strlist_to_arr(str, arr, size, sizeof(*arr), ",", + parse_pid); +} - str = malloc(strlen(_str) + 1); - if (!str) - return -ENOMEM; - strcpy(str, _str); +static int parse_iface(const char *str, void *ifindexout) +{ + int ifindex, err = 0; + long long lval; - pidstr = strtok_r(str, ",", &tokp); - while (pidstr && i < size) { - err = parse_bounded_long(&val, pidstr, 1, PID_MAX_LIMIT, name); - if (err) - goto exit; - arr[i] = val; + ifindex = if_nametoindex(str); + if (ifindex > IFINDEX_MAX) { + fprintf(stderr, + "%s has ifindex %d which is above the supported limit %d\n", + str, ifindex, IFINDEX_MAX); + return -ENOTSUP; + } else if (ifindex == 0) { + // Not a valid interface name - try parsing it as an index instead + err = parse_bounded_long(&lval, str, 1, IFINDEX_MAX, + "interface"); + if (!err) + ifindex = lval; + } - pidstr = strtok_r(NULL, ",", &tokp); - i++; + if (ifindex > 0) + *(__u32 *)ifindexout = ifindex; + else + fprintf(stderr, + "%s is not a recognized interface name, nor a valid interface index\n", + str); + + return err; +} + +static int parse_ifaces(size_t size, __u32 arr[size], const char *str) +{ + return parse_strlist_to_arr(str, arr, size, sizeof(*arr), ",", parse_iface); +} + +/** + * get_cgroup_id_from_path - Get cgroup id for a particular cgroup path + * @cgroup_workdir: The absolute cgroup path + * + * On success, it returns the cgroup id. On failure it returns 0, + * which is an invalid cgroup id, and errno is set. 
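+ *
+ * The first name_to_handle_at() call is made with a zero-sized handle and
+ * is expected to fail with EOVERFLOW while filling in the required
+ * fhp->handle_bytes (a cgroup id is an 8-byte handle); the second call,
+ * with a correctly sized handle, then fetches the id itself.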
+ *
+ * Slightly modified version of get_cgroup_id_from_path from
+ * /tools/testing/selftests/bpf/cgroup_helpers.c that does not
+ * print out the errors
+ */
+static unsigned long long get_cgroup_id_from_path(const char *cgroup_workdir)
+{
+	int dirfd, err, flags, mount_id, fhsize;
+	union {
+		unsigned long long cgid;
+		unsigned char raw_bytes[8];
+	} id;
+	struct file_handle *fhp, *fhp2;
+	unsigned long long ret = 0;
+
+	dirfd = AT_FDCWD;
+	flags = 0;
+	fhsize = sizeof(*fhp);
+	fhp = calloc(1, fhsize);
+	if (!fhp)
+		return 0;
+
+	err = name_to_handle_at(dirfd, cgroup_workdir, fhp, &mount_id, flags);
+	if (err >= 0 || fhp->handle_bytes != 8) {
+		errno = EBADE;
+		goto free_mem;
+	}
+
+	fhsize = sizeof(struct file_handle) + fhp->handle_bytes;
+	fhp2 = realloc(fhp, fhsize);
+	if (!fhp2)
+		goto free_mem;
+
+	err = name_to_handle_at(dirfd, cgroup_workdir, fhp2, &mount_id, flags);
+	fhp = fhp2;
+	if (err < 0)
+		goto free_mem;
+
+	memcpy(id.raw_bytes, fhp->f_handle, 8);
+	ret = id.cgid;
+
+free_mem:
+	free(fhp);
+	return ret;
+}
+
+static int parse_cgroup(const char *str, void *cgroupout)
+{
+	long long lval;
+	__u64 cgroup;
+	int err = 0;
+
+	cgroup = get_cgroup_id_from_path(str);
+
+	if (cgroup == 0) {
+		// Not a valid cgroup path - try parsing it as a cgroup ID instead
+		err = parse_bounded_long(&lval, str, 0, INT64_MAX, "cgroup");
+		if (!err)
+			cgroup = lval;
+	}
+
+	if (cgroup != 0)
+		*(__u64 *)cgroupout = cgroup;
+	else
+		fprintf(stderr, "%s is not a valid cgroup path or ID\n", str);
+
+	return err;
+}
+
+static int parse_cgroups(size_t size, __u64 arr[size], const char *str)
+{
+	return parse_strlist_to_arr(str, arr, size, sizeof(*arr), ",", parse_cgroup);
+}
 
 static int parse_arguments(int argc, char *argv[],
@@ -408,12 +554,24 @@
 {
 	bool hooks_on = false, hooks_off = false;
 	bool hooks[NETSTACKLAT_N_HOOKS];
+	long long network_ns = 0;
 	int opt, err, ret, i;
 	char optstr[64];
 	double fval;
 
 	conf->npids = 0;
+	conf->nifindices = 0;
 	conf->bpf_conf.filter_pid = false;
+	conf->bpf_conf.filter_ifindex = false;
+	conf->bpf_conf.filter_nonempty_sockqueue = false;
+	conf->bpf_conf.groupby_ifindex = false;
+	conf->bpf_conf.groupby_cgroup = false;
+
+	conf->pids = calloc(MAX_PARSED_PIDS, sizeof(*conf->pids));
+	conf->ifindices = calloc(MAX_PARSED_IFACES, sizeof(*conf->ifindices));
+	conf->cgroups = calloc(MAX_TRACKED_CGROUPS, sizeof(*conf->cgroups));
+	if (!conf->pids || !conf->ifindices || !conf->cgroups)
+		return -ENOMEM;
 
 	for (i = 0; i < NETSTACKLAT_N_HOOKS; i++)
 		// All probes enabled by default
@@ -459,16 +617,52 @@
 			conf->enabled_hooks[i] = !hooks[i];
 			hooks_off = true;
 			break;
-		case 'p': // filter-pids
-			ret = parse_pids(ARRAY_SIZE(conf->pids) - conf->npids,
-					 conf->pids + conf->npids, optarg,
-					 optval_to_longopt(opt)->name);
+		case 'p': // pids
+			ret = parse_pids(MAX_PARSED_PIDS - conf->npids,
+					 conf->pids + conf->npids, optarg);
 			if (ret < 0)
 				return ret;
 
 			conf->npids += ret;
 			conf->bpf_conf.filter_pid = true;
 			break;
+		case 'i': // interfaces
+			ret = parse_ifaces(MAX_PARSED_IFACES - conf->nifindices,
+					   conf->ifindices + conf->nifindices,
+					   optarg);
+			if (ret < 0)
+				return ret;
+
+			conf->nifindices += ret;
+			conf->bpf_conf.filter_ifindex = true;
+			break;
+		case 'n': // network-namespace
+			err = parse_bounded_long(&network_ns, optarg, -1,
+						 UINT32_MAX,
+						 optval_to_longopt(opt)->name);
+			if (err)
+				return err;
+			break;
+		case 'c': // cgroups
+			ret = parse_cgroups(MAX_TRACKED_CGROUPS -
+						    conf->ncgroups,
+					    conf->cgroups + conf->ncgroups,
+					    optarg);
+			if (ret < 0)
+				return ret;
+
+			conf->ncgroups += ret;
+			conf->bpf_conf.filter_cgroup = true;
+			break;
+
+		case 'q': // nonempty-queue
+			conf->bpf_conf.filter_nonempty_sockqueue = true;
+			break;
+		case 'I': // groupby-interface
+			conf->bpf_conf.groupby_ifindex = true;
+			break;
+		case 'C': // groupby-cgroup
+			conf->bpf_conf.groupby_cgroup = true;
+			break;
 		case 'h': // help
 			print_usage(stdout, argv[0]);
 			exit(EXIT_SUCCESS);
@@ -487,6 +681,21 @@
 		return -EINVAL;
 	}
 
+	if (network_ns < 0) {
+		conf->bpf_conf.network_ns = 0;
+	} else if (network_ns == 0) {
+		network_ns = get_current_network_ns();
+		if (network_ns < 0) {
+			fprintf(stderr,
+				"Failed getting current network namespace: %s\n",
+				strerror(-network_ns));
+			return network_ns;
+		}
+		conf->bpf_conf.network_ns = network_ns;
+	} else {
+		conf->bpf_conf.network_ns = network_ns;
+	}
+
 	return 0;
 }
 
@@ -621,97 +830,240 @@ static void print_log2hist(FILE *stream, size_t n, const __u64 hist[n],
 	}
 }
 
-static void merge_percpu_hist(size_t n, int ncpus,
-			      const __u64 percpu_hist[n][ncpus],
-			      __u64 merged_hist[n])
+static void print_histkey(FILE *stream, const struct hist_key *key)
 {
-	int idx, cpu;
+	fprintf(stream, "%s", hook_to_str(key->hook));
 
-	memset(merged_hist, 0, sizeof(__u64) * n);
+	if (key->ifindex)
+		fprintf(stream, ", interface=%u", key->ifindex);
 
-	for (idx = 0; idx < n; idx++) {
-		for (cpu = 0; cpu < ncpus; cpu++) {
-			merged_hist[idx] += percpu_hist[idx][cpu];
-		}
+	if (key->cgroup)
+		fprintf(stream, ", cgroup=%llu", key->cgroup);
+}
+
+static int cmp_histkey(const void *val1, const void *val2)
+{
+	const struct hist_key *key1 = val1, *key2 = val2;
+
+	if (key1->hook != key2->hook)
+		return key1->hook > key2->hook ? 1 : -1;
+
+	if (key1->ifindex != key2->ifindex)
+		return key1->ifindex > key2->ifindex ? 1 : -1;
+
+	if (key1->cgroup != key2->cgroup)
+		return key1->cgroup > key2->cgroup ?
1 : -1; + + return 0; +} + +static int cmp_histentry(const void *val1, const void *val2) +{ + const struct histogram_entry *entry1 = val1, *entry2 = val2; + + return cmp_histkey(&entry1->key, &entry2->key); +} + +static int insert_last_hist_sorted(struct histogram_buffer *buf) +{ + struct histogram_entry *hists = buf->hists; + int i, last = buf->current_size - 1; + struct histogram_entry tmp; + + if (buf->current_size < 2) + return 0; + + i = last; + while (i > 0 && cmp_histentry(&hists[last], &hists[i - 1]) < 0) + i--; + + if (i == last) + // Last hist already in the right place, no need to swap it in + return i; + + // Swap in hist to the correct position + memcpy(&tmp, &hists[last], sizeof(tmp)); + memmove(&hists[i + 1], &hists[i], (last - i) * sizeof(*hists)); + memcpy(&hists[i], &tmp, sizeof(*hists)); + + return i; +} + +static struct histogram_entry * +lookup_or_zeroinit_hist(const struct hist_key *key, + struct histogram_buffer *buf) +{ + struct histogram_entry *hist; + __u64 *buckets; + int i; + + hist = bsearch(key, buf->hists, buf->current_size, sizeof(*buf->hists), + cmp_histentry); + if (hist) + return hist; + + // No matching histogram key found - create new histogram entry and insert it + if (buf->current_size >= buf->max_size) { + errno = ENOSPC; + return NULL; + } + + buckets = calloc(HIST_NBUCKETS, sizeof(*buckets)); + if (!buckets) { + errno = ENOMEM; + return NULL; } + + hist = &buf->hists[buf->current_size++]; + memcpy(&hist->key, key, sizeof(hist->key)); + hist->key.bucket = 0; + hist->buckets = buckets; + + i = insert_last_hist_sorted(buf); + return &buf->hists[i]; +} + +static int update_histogram_entry_bucket(const struct hist_key *key, + __u64 count, + struct histogram_buffer *buf) +{ + struct histogram_entry *hist; + int bucket = key->bucket; + + hist = lookup_or_zeroinit_hist(key, buf); + if (!hist) + return -errno; + + hist->buckets[bucket] = count; + return 0; } -static int fetch_hist_map(int map_fd, __u64 hist[HIST_NBUCKETS]) +static __u64 sum_percpu_vals(int cpus, __u64 vals[cpus]) { - __u32 in_batch, out_batch, count = HIST_NBUCKETS; + __u64 sum = 0; + int i; + + for (i = 0; i < cpus; i++) + sum += vals[i]; + + return sum; +} + +static int fetch_histograms(int map_fd, struct histogram_buffer *buf) +{ + __u32 in_batch, out_batch, count = LOOKUP_BATCH_SIZE; int ncpus = libbpf_num_possible_cpus(); - __u32 idx, buckets_fetched = 0; - __u64 (*percpu_hist)[ncpus]; - __u32 *keys; - int err = 0; + int i, nentries = 0, err, err2 = 0; + __u64(*percpu_buckets)[ncpus]; + bool entries_remain = true; + struct hist_key *keys; - DECLARE_LIBBPF_OPTS(bpf_map_batch_opts, batch_opts, .flags = BPF_EXIST); + DECLARE_LIBBPF_OPTS(bpf_map_batch_opts, batch_opts); - percpu_hist = calloc(HIST_NBUCKETS, sizeof(*percpu_hist)); - keys = calloc(HIST_NBUCKETS, sizeof(*keys)); - if (!percpu_hist || !keys) { + percpu_buckets = calloc(LOOKUP_BATCH_SIZE, sizeof(*percpu_buckets)); + keys = calloc(LOOKUP_BATCH_SIZE, sizeof(*keys)); + if (!percpu_buckets || !keys) { err = -ENOMEM; goto exit; } - while (buckets_fetched < HIST_NBUCKETS) { + while (entries_remain) { err = bpf_map_lookup_batch(map_fd, - buckets_fetched > 0 ? &in_batch : NULL, - &out_batch, keys + buckets_fetched, - percpu_hist + buckets_fetched, &count, - &batch_opts); - if (err == -ENOENT) // All entries fetched + nentries > 0 ? 
&in_batch : NULL, + &out_batch, keys, percpu_buckets, + &count, &batch_opts); + if (err == -ENOENT) { // All entries fetched + entries_remain = false; err = 0; - else if (err) + } else if (err) { goto exit; + } - // Verify keys match expected idx range - for (idx = buckets_fetched; idx < buckets_fetched + count; idx++) { - if (keys[idx] != idx) { - err = -EBADSLT; + for (i = 0; i < count; i++) { + err = update_histogram_entry_bucket( + &keys[i], + sum_percpu_vals(ncpus, percpu_buckets[i]), buf); + if (err == -ENOSPC) { + /* + * Out of histogram entries. + * Record error, but continue. + * Use error code that should not clash with + * bpf_map_lookup_batch + */ + err2 = -ETOOMANYREFS; + err = 0; + } else if (err) { + // Critical error - abort goto exit; } } + nentries += count; + count = LOOKUP_BATCH_SIZE; in_batch = out_batch; - buckets_fetched += count; - count = HIST_NBUCKETS - buckets_fetched; } - merge_percpu_hist(HIST_NBUCKETS, ncpus, percpu_hist, hist); - exit: - free(percpu_hist); + free(percpu_buckets); free(keys); - return err; + return err ?: err2; } -static int report_stats(const struct netstacklat_config *conf, - const struct netstacklat_bpf *obj) +static int report_stats(const struct netstacklat_bpf *obj, + struct histogram_buffer *hist_buf) { - enum netstacklat_hook hook; - __u64 hist[HIST_NBUCKETS] = { 0 }; + int i, err; time_t t; - int err; + + err = fetch_histograms(bpf_map__fd(obj->maps.netstack_latency_seconds), + hist_buf); + if (err == -ETOOMANYREFS) + fprintf(stderr, + "Warning: Histogram buffer ran out of space - some histograms may not be reported\n"); + else if (err) + return err; time(&t); printf("%s", ctime(&t)); - for (hook = 1; hook < NETSTACKLAT_N_HOOKS; hook++) { - if (!conf->enabled_hooks[hook]) - continue; + for (i = 0; i < hist_buf->current_size; i++) { + print_histkey(stdout, &hist_buf->hists[i].key); + printf(":\n"); + print_log2hist(stdout, HIST_NBUCKETS, + hist_buf->hists[i].buckets, 1); + printf("\n"); + } + fflush(stdout); - printf("%s:\n", hook_to_str(hook)); + return 0; +} - err = fetch_hist_map(hook_to_histmap(hook, obj), hist); - if (err) - return err; +static int init_histogram_buffer(struct histogram_buffer *buf, + const struct netstacklat_config *conf) +{ + int max_hists = 0, i; - print_log2hist(stdout, ARRAY_SIZE(hist), hist, 1); - printf("\n"); + for (i = 0; i < NETSTACKLAT_N_HOOKS; i++) { + if (conf->enabled_hooks[i]) + max_hists++; } - fflush(stdout); + if (conf->bpf_conf.groupby_ifindex) + max_hists *= conf->bpf_conf.filter_ifindex ? + min(conf->nifindices, 64) : + 32; + + if (conf->bpf_conf.groupby_cgroup) + max_hists *= conf->bpf_conf.filter_cgroup ? 
+ min(conf->ncgroups, 128) : + 64; + + buf->hists = calloc(max_hists, sizeof(*buf->hists)); + if (!buf->hists) + return -errno; + + buf->max_size = max_hists; + buf->current_size = 0; return 0; } @@ -767,6 +1119,75 @@ static void set_programs_to_load(const struct netstacklat_config *conf, } } +static int set_map_sizes(const struct netstacklat_config *conf, + struct netstacklat_bpf *obj, int max_hists) +{ + __u32 size; + int err, i; + + size = max_hists * HIST_NBUCKETS; + err = bpf_map__set_max_entries(obj->maps.netstack_latency_seconds, + size); + if (err) { + fprintf(stderr, "Failed setting size of histogram map to %u\n", + size); + return err; + } + + // PID filter - arraymap, needs max PID + 1 entries + for (i = 0, size = 1; i < conf->npids; i++) { + if (conf->pids[i] >= size) + size = conf->pids[i] + 1; + } + err = bpf_map__set_max_entries(obj->maps.netstack_pidfilter, size); + if (err) { + fprintf(stderr, "Failed setting size of PID filter map to %u\n", + size); + return err; + } + + // ifindex filter - arraymap, needs max ifindex + 1 entries + for (i = 0, size = 1; i < conf->nifindices; i++) { + if (conf->ifindices[i] >= size) + size = conf->ifindices[i] + 1; + } + err = bpf_map__set_max_entries(obj->maps.netstack_ifindexfilter, size); + if (err) { + fprintf(stderr, + "Failed setting size of ifindex filter map to %u\n", + size); + return err; + } + + // cgroup filter - hashmap, should be ~2x expected number of entries + size = conf->bpf_conf.filter_cgroup ? conf->ncgroups * 2 : 1; + err = bpf_map__set_max_entries(obj->maps.netstack_cgroupfilter, size); + if (err) { + fprintf(stderr, + "Failed setting size of cgroup filter map to %u\n", + size); + return err; + } + + return 0; +} + +static int init_filtermap(int map_fd, void *keys, size_t nelem, + size_t elem_size) +{ + __u8 ok_val = 1; + int i, err; + + for (i = 0; i < nelem; i++) { + err = bpf_map_update_elem(map_fd, (char *)keys + i * elem_size, + &ok_val, 0); + if (err) + return err; + } + + return 0; +} + static int init_signalfd(void) { sigset_t mask; @@ -836,8 +1257,8 @@ static int setup_timer(__u64 interval_ns) return fd; } -static int handle_timer(int timer_fd, const struct netstacklat_config *conf, - const struct netstacklat_bpf *obj) +static int handle_timer(int timer_fd, const struct netstacklat_bpf *obj, + struct histogram_buffer *hist_buf) { __u64 timer_exps; ssize_t size; @@ -854,7 +1275,7 @@ static int handle_timer(int timer_fd, const struct netstacklat_config *conf, fprintf(stderr, "Warning: Missed %llu reporting intervals\n", timer_exps - 1); - return report_stats(conf, obj); + return report_stats(obj, hist_buf); } static int epoll_add_event(int epoll_fd, int fd, __u64 event_type, __u64 value) @@ -894,8 +1315,8 @@ static int setup_epoll_instance(int sig_fd, int timer_fd) return err; } -static int poll_events(int epoll_fd, const struct netstacklat_config *conf, - const struct netstacklat_bpf *obj) +static int poll_events(int epoll_fd, const struct netstacklat_bpf *obj, + struct histogram_buffer *hist_buf) { struct epoll_event events[MAX_EPOLL_EVENTS]; int i, n, fd, err = 0; @@ -914,7 +1335,7 @@ static int poll_events(int epoll_fd, const struct netstacklat_config *conf, err = handle_signal(fd); break; case NETSTACKLAT_EPOLL_TIMER: - err = handle_timer(fd, conf, obj); + err = handle_timer(fd, obj, hist_buf); break; default: fprintf(stderr, "Warning: unexpected epoll data: %lu\n", @@ -929,30 +1350,13 @@ static int poll_events(int epoll_fd, const struct netstacklat_config *conf, return err; } -static int 
init_pidfilter_map(const struct netstacklat_bpf *obj, - const struct netstacklat_config *conf) -{ - __u8 pid_ok_val = 1; - int map_fd, err; - __u32 i; - - map_fd = bpf_map__fd(obj->maps.netstack_pidfilter); - for (i = 0; i < conf->npids; i++) { - err = bpf_map_update_elem(map_fd, &conf->pids[i], &pid_ok_val, - 0); - if (err) - return err; - } - - return 0; -} - int main(int argc, char *argv[]) { int sig_fd, timer_fd, epoll_fd, sock_fd, err; struct netstacklat_config config = { .report_interval_s = 5, }; + struct histogram_buffer hist_buf; struct netstacklat_bpf *obj; char errmsg[128]; @@ -963,6 +1367,13 @@ int main(int argc, char *argv[]) return EXIT_FAILURE; } + err = init_histogram_buffer(&hist_buf, &config); + if (err) { + fprintf(stderr, "Failed allocating buffer for histograms: %s\n", + strerror(-err)); + return EXIT_FAILURE; + } + sock_fd = enable_sw_rx_tstamps(); if (sock_fd < 0) { err = sock_fd; @@ -974,7 +1385,7 @@ int main(int argc, char *argv[]) obj = netstacklat_bpf__open(); if (!obj) { - err = libbpf_get_error(obj); + err = -errno; libbpf_strerror(err, errmsg, sizeof(errmsg)); fprintf(stderr, "Failed opening eBPF object file: %s\n", errmsg); goto exit_sockfd; @@ -985,6 +1396,13 @@ int main(int argc, char *argv[]) set_programs_to_load(&config, obj); + err = set_map_sizes(&config, obj, hist_buf.max_size); + if (err) { + libbpf_strerror(err, errmsg, sizeof(errmsg)); + fprintf(stderr, "Failed configuring map sizes: %s\n", errmsg); + goto exit_destroy_bpf; + } + err = netstacklat_bpf__load(obj); if (err) { libbpf_strerror(err, errmsg, sizeof(errmsg)); @@ -992,7 +1410,9 @@ int main(int argc, char *argv[]) goto exit_destroy_bpf; } - err = init_pidfilter_map(obj, &config); + err = init_filtermap(bpf_map__fd(obj->maps.netstack_pidfilter), + config.pids, config.npids, sizeof(*config.pids)); + if (err) { libbpf_strerror(err, errmsg, sizeof(errmsg)); fprintf(stderr, "Failed filling the pid filter map: %s\n", @@ -1000,6 +1420,26 @@ int main(int argc, char *argv[]) goto exit_destroy_bpf; } + err = init_filtermap(bpf_map__fd(obj->maps.netstack_ifindexfilter), + config.ifindices, config.nifindices, + sizeof(*config.ifindices)); + if (err) { + libbpf_strerror(err, errmsg, sizeof(errmsg)); + fprintf(stderr, "Failed filling the ifindex filter map: %s\n", + errmsg); + goto exit_destroy_bpf; + } + + err = init_filtermap(bpf_map__fd(obj->maps.netstack_cgroupfilter), + config.cgroups, config.ncgroups, + sizeof(*config.cgroups)); + if (err) { + libbpf_strerror(err, errmsg, sizeof(errmsg)); + fprintf(stderr, "Failed filling the cgroup filter map: %s\n", + errmsg); + goto exit_destroy_bpf; + } + err = netstacklat_bpf__attach(obj); if (err) { libbpf_strerror(err, errmsg, sizeof(errmsg)); @@ -1032,12 +1472,12 @@ int main(int argc, char *argv[]) // Report stats until user shuts down program while (true) { - err = poll_events(epoll_fd, &config, obj); + err = poll_events(epoll_fd, obj, &hist_buf); if (err) { if (err == NETSTACKLAT_ABORT) { // Report stats a final time before terminating - err = report_stats(&config, obj); + err = report_stats(obj, &hist_buf); } else { libbpf_strerror(err, errmsg, sizeof(errmsg)); fprintf(stderr, "Failed polling fds: %s\n", diff --git a/netstacklat/netstacklat.h b/netstacklat/netstacklat.h index bb0162a1..4811da4c 100644 --- a/netstacklat/netstacklat.h +++ b/netstacklat/netstacklat.h @@ -15,6 +15,10 @@ // The highest possible PID on a Linux system (from /include/linux/threads.h) #define PID_MAX_LIMIT (4 * 1024 * 1024) +// The highest ifindex we expect to encounter +#define 
IFINDEX_MAX 16384
+// The maximum number of different cgroups we can filter for
+#define MAX_TRACKED_CGROUPS 4096
 
 #ifndef ARRAY_SIZE
 #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof(arr[0]))
@@ -29,6 +33,15 @@
 })
 #endif
 
+#ifndef min
+#define min(a, b)                   \
+	({                          \
+		typeof(a) _a = (a); \
+		typeof(b) _b = (b); \
+		_a < _b ? _a : _b;  \
+	})
+#endif
+
 enum netstacklat_hook {
 	NETSTACKLAT_HOOK_INVALID = 0,
 	NETSTACKLAT_HOOK_IP_RCV,
@@ -41,10 +54,26 @@ enum netstacklat_hook {
 	NETSTACKLAT_N_HOOKS,
 };
 
-struct netstacklat_bpf_config
-{
+/*
+ * Key used for the histogram map.
+ * To be compatible with ebpf-exporter, all histograms need a key struct
+ * whose final member is named "bucket" and is the histogram bucket index.
+ */
+struct hist_key {
+	__u64 cgroup;
+	__u32 ifindex;
+	__u16 hook; // needs a well-defined size for ebpf-exporter to decode
+	__u16 bucket; // needs to be last to be compatible with ebpf-exporter
+};
+
+struct netstacklat_bpf_config {
+	__u32 network_ns;
 	bool filter_pid;
+	bool filter_ifindex;
+	bool filter_cgroup;
+	bool filter_nonempty_sockqueue;
+	bool groupby_ifindex;
+	bool groupby_cgroup;
 };
 
 #endif
-
diff --git a/netstacklat/netstacklat.yaml b/netstacklat/netstacklat.yaml
index 2fb99530..6c6cf152 100644
--- a/netstacklat/netstacklat.yaml
+++ b/netstacklat/netstacklat.yaml
@@ -1,79 +1,39 @@
 metrics:
   histograms:
-    - name: netstack_latency_ip_start_seconds
-      help: Time for packet to reach the start of the IP-stack
+    - name: netstack_latency_seconds
+      help: Latency for packets (skbs) to reach various points in the kernel network stack
       bucket_type: exp2
       bucket_min: 0
       bucket_max: 34
       bucket_multiplier: 0.000000001 # nanoseconds to seconds
       labels:
-        - name: bucket
-          size: 4
+        - name: cgroup
+          size: 8
           decoders:
             - name: uint
-    - name: netstack_latency_tcp_start_seconds
-      help: Time for packet to reach the start of the TCP stack
-      bucket_type: exp2
-      bucket_min: 0
-      bucket_max: 34
-      bucket_multiplier: 0.000000001 # nanoseconds to seconds
-      labels:
-        - name: bucket
+            - name: cgroup
+        - name: iface
           size: 4
           decoders:
-            - name: uint
-    - name: netstack_latency_udp_start_seconds
-      help: Time until packet to reach the start of the UDP stack
-      bucket_type: exp2
-      bucket_min: 0
-      bucket_max: 34
-      bucket_multiplier: 0.000000001 # nanoseconds to seconds
-      labels:
-        - name: bucket
-          size: 4
+            # If the output can include interfaces from a different network
+            # namespace than the one ebpf-exporter runs in, you probably want
+            # to decode this as a plain uint (the ifindex) instead
+            # - name: uint # unlike the other decoders, ifname apparently does not need a preceding uint decoder
+            - name: ifname
+        - name: hook
+          size: 2
           decoders:
             - name: uint
-    - name: netstack_latency_tcp_sock_enqueued_seconds
-      help: Time until packet is queued to TCP socket
-      bucket_type: exp2
-      bucket_min: 0
-      bucket_max: 34
-      bucket_multiplier: 0.000000001 # nanoseconds to seconds
-      labels:
+            - name: static_map
+              static_map:
+                1: "ip-start"
+                2: "tcp-start"
+                3: "udp-start"
+                4: "tcp-socket-enqueued"
+                5: "udp-socket-enqueued"
+                6: "tcp-socket-read"
+                7: "udp-socket-read"
         - name: bucket
-          size: 4
-          decoders:
-            - name: uint
-    - name: netstack_latency_udp_sock_enqueued_seconds
-      help: Time until packet is queued to UDP socket
-      bucket_type: exp2
-      bucket_min: 0
-      bucket_max: 34
-      bucket_multiplier: 0.000000001 # nanoseconds to seconds
-      labels:
-        - name: bucket
-          size: 4
-          decoders:
-            - name: uint
-    - name: netstack_latency_tcp_sock_read_seconds
-      help: Time until packet data is read from TCP socket
-      bucket_type: exp2
-      bucket_min: 0
-      bucket_max: 34
-      bucket_multiplier:
0.000000001 # nanoseconds to seconds
-      labels:
-        - name: bucket
-          size: 4
-          decoders:
-            - name: uint
-    - name: netstack_latency_udp_sock_read_seconds
-      help: Time until packet data is read from UDP socket
-      bucket_type: exp2
-      bucket_min: 0
-      bucket_max: 34
-      bucket_multiplier: 0.000000001 # nanoseconds to seconds
-      labels:
-        - name: bucket
-          size: 4
+          size: 2
           decoders:
             - name: uint
+
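Usage sketch for the new filter-map helper once netstacklat's maps are loaded (the PIDs, interface name, and cgroup path below are illustrative):

    ./netstacklat/fill_filter_maps.sh pid 1234 5678
    ./netstacklat/fill_filter_maps.sh iface eth0 2
    ./netstacklat/fill_filter_maps.sh cgroup /sys/fs/cgroup/system.slice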