Skip to content

Commit ece9f3f

Browse files
committed
Auto merge of #142978 - Kobzol:query-hit, r=oli-obk
Add new self-profiling event to cheaply aggregate query cache hit counts Self-profile can record various types of things, some of them are not enabled, like query cache hits. Rustc currently records cache hits as "instant" measureme events, which records the thread ID, current timestamp, and constructs an individual event for each such cache hit. This is incredibly expensive, in a small hello world benchmark that just depends on serde, it makes compilation with nightly go from ~3s (with `-Zself-profile`) to ~15s (with `-Zself-profile -Zself-profile-events=default,query-cache-hit`). We'd like to add query cache hits to rustc-perf (rust-lang/rustc-perf#2168), but there we only need the actualy cache hit counts, not the timestamp/thread ID metadata associated with it. This PR adds a new `query-cache-hit-count` event. Instead of generating individual instant events, it simply aggregates cache hit counts per *query invocation* (so a combination of a query and its arguments, if I understand it correctly) using an atomic counter. At the end of the compilation session, these counts are then dumped to the self-profile log using integer events (in a similar fashion as how we record artifact sizes). I suppose that we could dedup the query invocations in rustc directly, but I don't think it's really required. In local experiments with the hello world + serde case, the query invocation records generated ~30 KiB more data in the self-profile, which was ~10% increase in this case. With this PR, the overhead of `-Zself-profile` seems to be the same as before, at least on my machine, so I also enabled query cache hit counts by default when self profiling is enabled. We should also modify `analyzeme`, specifically [this](https://github.com/rust-lang/measureme/blob/master/analyzeme/src/analysis.rs#L139), and make it load the integer events with query cache hit counts. I can do that as a follow-up, it's not required to be done in sync with this PR, and it doesn't require changes in rustc. CC `@cjgillot` r? `@oli-obk`
2 parents f51c987 + e8fc30e commit ece9f3f

File tree

2 files changed

+93
-7
lines changed

2 files changed

+93
-7
lines changed

compiler/rustc_data_structures/src/profiling.rs

Lines changed: 92 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,7 @@ use std::fmt::Display;
8888
use std::intrinsics::unlikely;
8989
use std::path::Path;
9090
use std::sync::Arc;
91+
use std::sync::atomic::{AtomicU64, Ordering};
9192
use std::time::{Duration, Instant};
9293
use std::{fs, process};
9394

@@ -105,6 +106,8 @@ bitflags::bitflags! {
105106
struct EventFilter: u16 {
106107
const GENERIC_ACTIVITIES = 1 << 0;
107108
const QUERY_PROVIDERS = 1 << 1;
109+
/// Store detailed instant events, including timestamp and thread ID,
110+
/// per each query cache hit. Note that this is quite expensive.
108111
const QUERY_CACHE_HITS = 1 << 2;
109112
const QUERY_BLOCKED = 1 << 3;
110113
const INCR_CACHE_LOADS = 1 << 4;
@@ -113,16 +116,20 @@ bitflags::bitflags! {
113116
const FUNCTION_ARGS = 1 << 6;
114117
const LLVM = 1 << 7;
115118
const INCR_RESULT_HASHING = 1 << 8;
116-
const ARTIFACT_SIZES = 1 << 9;
119+
const ARTIFACT_SIZES = 1 << 9;
120+
/// Store aggregated counts of cache hits per query invocation.
121+
const QUERY_CACHE_HIT_COUNTS = 1 << 10;
117122

118123
const DEFAULT = Self::GENERIC_ACTIVITIES.bits() |
119124
Self::QUERY_PROVIDERS.bits() |
120125
Self::QUERY_BLOCKED.bits() |
121126
Self::INCR_CACHE_LOADS.bits() |
122127
Self::INCR_RESULT_HASHING.bits() |
123-
Self::ARTIFACT_SIZES.bits();
128+
Self::ARTIFACT_SIZES.bits() |
129+
Self::QUERY_CACHE_HIT_COUNTS.bits();
124130

125131
const ARGS = Self::QUERY_KEYS.bits() | Self::FUNCTION_ARGS.bits();
132+
const QUERY_CACHE_HIT_COMBINED = Self::QUERY_CACHE_HITS.bits() | Self::QUERY_CACHE_HIT_COUNTS.bits();
126133
}
127134
}
128135

@@ -134,6 +141,7 @@ const EVENT_FILTERS_BY_NAME: &[(&str, EventFilter)] = &[
134141
("generic-activity", EventFilter::GENERIC_ACTIVITIES),
135142
("query-provider", EventFilter::QUERY_PROVIDERS),
136143
("query-cache-hit", EventFilter::QUERY_CACHE_HITS),
144+
("query-cache-hit-count", EventFilter::QUERY_CACHE_HITS),
137145
("query-blocked", EventFilter::QUERY_BLOCKED),
138146
("incr-cache-load", EventFilter::INCR_CACHE_LOADS),
139147
("query-keys", EventFilter::QUERY_KEYS),
@@ -411,13 +419,24 @@ impl SelfProfilerRef {
411419
#[inline(never)]
412420
#[cold]
413421
fn cold_call(profiler_ref: &SelfProfilerRef, query_invocation_id: QueryInvocationId) {
414-
profiler_ref.instant_query_event(
415-
|profiler| profiler.query_cache_hit_event_kind,
416-
query_invocation_id,
417-
);
422+
if profiler_ref.event_filter_mask.contains(EventFilter::QUERY_CACHE_HIT_COUNTS) {
423+
profiler_ref
424+
.profiler
425+
.as_ref()
426+
.unwrap()
427+
.increment_query_cache_hit_counters(QueryInvocationId(query_invocation_id.0));
428+
}
429+
if unlikely(profiler_ref.event_filter_mask.contains(EventFilter::QUERY_CACHE_HITS)) {
430+
profiler_ref.instant_query_event(
431+
|profiler| profiler.query_cache_hit_event_kind,
432+
query_invocation_id,
433+
);
434+
}
418435
}
419436

420-
if unlikely(self.event_filter_mask.contains(EventFilter::QUERY_CACHE_HITS)) {
437+
// We check both kinds of query cache hit events at once, to reduce overhead in the
438+
// common case (with self-profile disabled).
439+
if unlikely(self.event_filter_mask.intersects(EventFilter::QUERY_CACHE_HIT_COMBINED)) {
421440
cold_call(self, query_invocation_id);
422441
}
423442
}
@@ -489,6 +508,35 @@ impl SelfProfilerRef {
489508
self.profiler.as_ref().map(|p| p.get_or_alloc_cached_string(s))
490509
}
491510

511+
/// Store query cache hits to the self-profile log.
512+
/// Should be called once at the end of the compilation session.
513+
///
514+
/// The cache hits are stored per **query invocation**, not **per query kind/type**.
515+
/// `analyzeme` can later deduplicate individual query labels from the QueryInvocationId event
516+
/// IDs.
517+
pub fn store_query_cache_hits(&self) {
518+
if self.event_filter_mask.contains(EventFilter::QUERY_CACHE_HIT_COUNTS) {
519+
let profiler = self.profiler.as_ref().unwrap();
520+
let query_hits = profiler.query_hits.read();
521+
let builder = EventIdBuilder::new(&profiler.profiler);
522+
let thread_id = get_thread_id();
523+
for (query_invocation, hit_count) in query_hits.iter().enumerate() {
524+
let hit_count = hit_count.load(Ordering::Relaxed);
525+
// No need to record empty cache hit counts
526+
if hit_count > 0 {
527+
let event_id =
528+
builder.from_label(StringId::new_virtual(query_invocation as u64));
529+
profiler.profiler.record_integer_event(
530+
profiler.query_cache_hit_count_event_kind,
531+
event_id,
532+
thread_id,
533+
hit_count,
534+
);
535+
}
536+
}
537+
}
538+
}
539+
492540
#[inline]
493541
pub fn enabled(&self) -> bool {
494542
self.profiler.is_some()
@@ -537,13 +585,28 @@ pub struct SelfProfiler {
537585

538586
string_cache: RwLock<FxHashMap<String, StringId>>,
539587

588+
/// Recording individual query cache hits as "instant" measureme events
589+
/// is incredibly expensive. Instead of doing that, we simply aggregate
590+
/// cache hit *counts* per query invocation, and then store the final count
591+
/// of cache hits per invocation at the end of the compilation session.
592+
///
593+
/// With this approach, we don't know the individual thread IDs and timestamps
594+
/// of cache hits, but it has very little overhead on top of `-Zself-profile`.
595+
/// Recording the cache hits as individual events made compilation 3-5x slower.
596+
///
597+
/// Query invocation IDs should be monotonic integers, so we can store them in a vec,
598+
/// rather than using a hashmap.
599+
query_hits: RwLock<Vec<AtomicU64>>,
600+
540601
query_event_kind: StringId,
541602
generic_activity_event_kind: StringId,
542603
incremental_load_result_event_kind: StringId,
543604
incremental_result_hashing_event_kind: StringId,
544605
query_blocked_event_kind: StringId,
545606
query_cache_hit_event_kind: StringId,
546607
artifact_size_event_kind: StringId,
608+
/// Total cache hits per query invocation
609+
query_cache_hit_count_event_kind: StringId,
547610
}
548611

549612
impl SelfProfiler {
@@ -573,6 +636,7 @@ impl SelfProfiler {
573636
let query_blocked_event_kind = profiler.alloc_string("QueryBlocked");
574637
let query_cache_hit_event_kind = profiler.alloc_string("QueryCacheHit");
575638
let artifact_size_event_kind = profiler.alloc_string("ArtifactSize");
639+
let query_cache_hit_count_event_kind = profiler.alloc_string("QueryCacheHitCount");
576640

577641
let mut event_filter_mask = EventFilter::empty();
578642

@@ -618,6 +682,8 @@ impl SelfProfiler {
618682
query_blocked_event_kind,
619683
query_cache_hit_event_kind,
620684
artifact_size_event_kind,
685+
query_cache_hit_count_event_kind,
686+
query_hits: Default::default(),
621687
})
622688
}
623689

@@ -627,6 +693,25 @@ impl SelfProfiler {
627693
self.profiler.alloc_string(s)
628694
}
629695

696+
/// Store a cache hit of a query invocation
697+
pub fn increment_query_cache_hit_counters(&self, id: QueryInvocationId) {
698+
// Fast path: assume that the query was already encountered before, and just record
699+
// a cache hit.
700+
let mut guard = self.query_hits.upgradable_read();
701+
let query_hits = &guard;
702+
let index = id.0 as usize;
703+
if index < query_hits.len() {
704+
// We only want to increment the count, no other synchronization is required
705+
query_hits[index].fetch_add(1, Ordering::Relaxed);
706+
} else {
707+
// If not, we need to extend the query hit map to the highest observed ID
708+
guard.with_upgraded(|vec| {
709+
vec.resize_with(index + 1, || AtomicU64::new(0));
710+
vec[index] = AtomicU64::from(1);
711+
});
712+
}
713+
}
714+
630715
/// Gets a `StringId` for the given string. This method makes sure that
631716
/// any strings going through it will only be allocated once in the
632717
/// profiling data.

compiler/rustc_query_impl/src/profiling_support.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -259,4 +259,5 @@ pub fn alloc_self_profile_query_strings(tcx: TyCtxt<'_>) {
259259
for alloc in super::ALLOC_SELF_PROFILE_QUERY_STRINGS.iter() {
260260
alloc(tcx, &mut string_cache)
261261
}
262+
tcx.sess.prof.store_query_cache_hits();
262263
}

0 commit comments

Comments
 (0)