Skip to content

Commit 50f536d

Browse files
byrnedj and Copilot authored
Improvements to the DTO_WAIT_METHODS (#16)
* Improvements to the DTO_WAIT_METHODS 1. use busypoll and autotune as defaults 2. add tpause instruction 3. change to use compiler intrinsics for UMWAIT and TPAUSE 4. update the documentation for using UMWAIT 5. clean up wait_method paths * Update README.md Co-authored-by: Copilot <[email protected]> * umwait and tpause come from waitpkg * update size --------- Co-authored-by: Copilot <[email protected]>
1 parent 74c1ad5 commit 50f536d

File tree

3 files changed

+117
-58
lines changed

3 files changed

+117
-58
lines changed

Makefile

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,10 +7,10 @@ all: libdto dto-test-wodto
77
DML_LIB_CXX=-D_GNU_SOURCE
88

99
libdto: dto.c
10-
gcc -shared -fPIC -Wl,-soname,libdto.so dto.c $(DML_LIB_CXX) -DDTO_STATS_SUPPORT -o libdto.so.1.0 -laccel-config -ldl -lnuma
10+
gcc -shared -fPIC -Wl,-soname,libdto.so dto.c $(DML_LIB_CXX) -DDTO_STATS_SUPPORT -o libdto.so.1.0 -laccel-config -ldl -lnuma -mwaitpkg
1111

1212
libdto_nostats: dto.c
13-
gcc -shared -fPIC -Wl,-soname,libdto.so dto.c $(DML_LIB_CXX) -o libdto.so.1.0 -laccel-config -ldl -lnuma
13+
gcc -shared -fPIC -Wl,-soname,libdto.so dto.c $(DML_LIB_CXX) -o libdto.so.1.0 -laccel-config -ldl -lnuma -mwaitpkg
1414

1515
install:
1616
cp libdto.so.1.0 /usr/lib64/

README.md

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,13 @@ To improve throughput for synchronous offload, DTO uses "pseudo asynchronous" ex
2626
calling thread cpu - cpu-centric numa awareness.
2727
3) In parallel, DTO performs the CPU portion of the job using std library on CPU.
2828
4) DTO waits for DSA to complete (if it hasn't completed already). The wait method can be configured using an environment variable DTO_WAIT_METHOD.
29+
The wait method can be one of the following: yield, busypoll, umwait, or tpause. The default is busypoll.
30+
31+
For some workloads, offloading the entire job to the DSA device can improve performance and reduce power consumption by waiting with the UMWAIT (or TPAUSE) instruction.
32+
In this case, DTO_CPU_SIZE_FRACTION can be set to 0.0, which means that the CPU job is 0 bytes and the entire job is offloaded to DSA; AUTO_ADJUST_KNOBS should also be set to 0.
33+
Then during the wait step, DTO will use the UMWAIT (or TPAUSE) instruction to wait for the DSA job to complete. UMWAIT / TPAUSE are instructions that allow the CPU to enter a low-power state while waiting for the job to complete. For UMWAIT, please refer to this [enabling guide](https://www.intel.com/content/www/us/en/developer/articles/technical/software-security-guidance/technical-documentation/monitor-umonitor-performance-guidance.html) for more details.
34+
35+
2936

3037
DTO also implements a heuristic to auto tune dsa_min_bytes and cpu_size_fraction parameters based on current DSA load. For example, if DSA is heavily loaded,
3138
DTO tries to reduce the DSA load by increasing cpu_size_fraction and dsa_min_bytes. Conversely, if DSA is lightly loaded, DTO tries to increase the DSA load by
@@ -179,4 +186,4 @@ When linking DTO using LD_PRELOAD environment variable special care is required
179186
in the script.
180187
- When the application is started by a script with #!<location of shell> which invokes another script with #!<location of shell>, for
181188
unknown reasons DTO causes a segmentation fault during a memset operation on an 8K sized buffer. This can be avoided by setting the minimum
182-
DTO size above 8K, or by avoiding this invocation sequence.
189+
DTO size above 8K, or by avoiding this invocation sequence.

dto.c

Lines changed: 107 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -30,9 +30,11 @@
3030
// DSA capabilities
3131
#define GENCAP_CC_MEMORY 0x4
3232

33-
#define UMWAIT_DELAY_DEFAULT 100000
34-
/* C0.1 state */
35-
#define UMWAIT_STATE 1
33+
#define UMWAIT_DELAY_DEFAULT 100000 //cycles until umwait timeout
34+
35+
#define C01_STATE 1
36+
#define C02_STATE 0
37+
#define TPAUSE_DELAY 1000
3638

3739
#define USE_ORIG_FUNC(n, use_dsa) (use_std_lib_calls == 1 || !use_dsa || n < dsa_min_size)
3840
#define TS_NS(s, e) (((e.tv_sec*1000000000) + e.tv_nsec) - ((s.tv_sec*1000000000) + s.tv_nsec))
@@ -44,10 +46,15 @@
4446
*/
4547
#define MAX_WQS 32
4648
#define MAX_NUMA_NODES 32
47-
#define DTO_DEFAULT_MIN_SIZE 16384
49+
#define DTO_DEFAULT_MIN_SIZE 65536
4850
#define DTO_INITIALIZED 0
4951
#define DTO_INITIALIZING 1
5052

53+
54+
#define NSEC_PER_SEC (1000000000)
55+
#define MSEC_PER_SEC (1000)
56+
#define NSEC_PER_MSEC (NSEC_PER_SEC/MSEC_PER_SEC)
57+
5158
// thread specific variables
5259
static __thread struct dsa_hw_desc thr_desc;
5360
static __thread struct dsa_completion_record thr_comp __attribute__((aligned(32)));
@@ -79,7 +86,8 @@ struct dto_device {
7986
enum wait_options {
8087
WAIT_BUSYPOLL = 0,
8188
WAIT_UMWAIT,
82-
WAIT_YIELD
89+
WAIT_YIELD,
90+
WAIT_TPAUSE
8391
};
8492

8593
enum numa_aware {
@@ -105,7 +113,7 @@ static atomic_uchar dto_initializing;
105113
static uint8_t use_std_lib_calls;
106114
static enum numa_aware is_numa_aware;
107115
static size_t dsa_min_size = DTO_DEFAULT_MIN_SIZE;
108-
static int wait_method = WAIT_YIELD;
116+
static int wait_method = WAIT_BUSYPOLL;
109117
static size_t cpu_size_fraction; // range of values is 0 to 99
110118

111119
static uint8_t dto_dsa_memcpy = 1;
@@ -114,6 +122,18 @@ static uint8_t dto_dsa_memset = 1;
114122
static uint8_t dto_dsa_memcmp = 1;
115123

116124
static uint8_t dto_dsa_cc = 1;
125+
static bool dto_use_c02 = true; //C02 state is default -
126+
//C02 avg exit latency is ~500 ns
127+
//and C01 is about ~240 ns on SPR
128+
129+
#define TPAUSE_C02_DELAY_NS 6000 //in this case we are offloading so delay can
130+
//be ~6 us as this is around the time a > 64KB
131+
//copy takes to complete
132+
133+
#define TPAUSE_C01_DELAY_NS 1000 //keep smaller because we want to wake up
134+
//with lower latency
135+
136+
static uint64_t tpause_wait_time = TPAUSE_C02_DELAY_NS;
117137

118138
static unsigned long dto_umwait_delay = UMWAIT_DELAY_DEFAULT;
119139

@@ -171,6 +191,7 @@ static const char * const wait_names[] = {
171191
[WAIT_BUSYPOLL] = "busypoll",
172192
[WAIT_UMWAIT] = "umwait",
173193
[WAIT_YIELD] = "yield",
194+
[WAIT_TPAUSE] = "tpause"
174195
};
175196

176197
static int collect_stats;
@@ -223,7 +244,7 @@ static atomic_int fail_counter[HIST_NO_BUCKETS][MAX_FAILURES];
223244
static int init_dto(void) __attribute__((constructor));
224245
static void cleanup_dto(void) __attribute__((destructor));
225246

226-
static int umwait_support;
247+
static int waitpkg_support;
227248

228249
static enum {
229250
LOG_LEVEL_FATAL,
@@ -323,76 +344,78 @@ static __always_inline void movdir64b(struct dsa_hw_desc *desc, volatile void *r
323344
: : "a" (reg), "d" (desc));
324345
}
325346

326-
static __always_inline void umonitor(const volatile void *addr)
327-
{
328-
asm volatile(".byte 0xf3, 0x48, 0x0f, 0xae, 0xf0" : : "a"(addr));
329-
}
330-
331-
static __always_inline int umwait(unsigned long timeout, unsigned int state)
332-
{
333-
uint8_t r;
334-
uint32_t timeout_low = (uint32_t)timeout;
335-
uint32_t timeout_high = (uint32_t)(timeout >> 32);
336-
337-
asm volatile(".byte 0xf2, 0x48, 0x0f, 0xae, 0xf1\t\n"
338-
"setc %0\t\n"
339-
: "=r"(r)
340-
: "c"(state), "a"(timeout_low), "d"(timeout_high));
341-
return r;
342-
}
343-
344347
static __always_inline void dsa_wait_yield(const volatile uint8_t *comp)
345348
{
346349
while (*comp == 0) {
347-
sched_yield();
350+
sched_yield();
348351
}
349352
}
350353

351354
static __always_inline void dsa_wait_busy_poll(const volatile uint8_t *comp)
352355
{
353356
while (*comp == 0) {
354-
_mm_pause();
357+
_mm_pause();
355358
}
356359
}
357360

358-
static __always_inline void __dsa_wait_umwait(const volatile uint8_t *comp)
361+
static __always_inline void dsa_wait_tpause(const volatile uint8_t *comp)
359362
{
360-
umonitor(comp);
363+
while (*comp == 0) {
364+
uint64_t delay = _rdtsc() + tpause_wait_time;
365+
_tpause(C02_STATE, delay);
366+
}
367+
}
361368

362-
// Hardware never writes 0 to this field. Software should initialize this field to 0
363-
// so it can detect when the completion record has been written
364-
if (*comp == 0) {
365-
uint64_t delay = __rdtsc() + dto_umwait_delay;
369+
static __always_inline void __dsa_wait_umwait(const volatile uint8_t *comp)
370+
{
371+
_umonitor((void*)comp);
366372

367-
umwait(delay, UMWAIT_STATE);
368-
}
373+
uint64_t delay = _rdtsc() + UMWAIT_DELAY_DEFAULT;
374+
_umwait(C02_STATE, delay);
369375
}
370376

371377
static __always_inline void dsa_wait_umwait(const volatile uint8_t *comp)
372378
{
373-
374-
while (*comp == 0)
375-
__dsa_wait_umwait(comp);
379+
while (*comp == 0) {
380+
__dsa_wait_umwait(comp);
381+
}
376382
}
377383

378384
static __always_inline void __dsa_wait(const volatile uint8_t *comp)
379385
{
380-
if (wait_method == WAIT_YIELD)
386+
switch(wait_method) {
387+
case WAIT_YIELD:
381388
sched_yield();
382-
else if (wait_method == WAIT_UMWAIT)
383-
__dsa_wait_umwait(comp);
384-
else
385-
_mm_pause();
389+
break;
390+
case WAIT_UMWAIT:
391+
__dsa_wait_umwait(comp);
392+
break;
393+
case WAIT_TPAUSE:
394+
_tpause( C01_STATE, _rdtsc() + TPAUSE_C01_DELAY_NS);
395+
break;
396+
default:
397+
_mm_pause();
398+
}
386399
}
387400

388401
static __always_inline void dsa_wait_no_adjust(const volatile uint8_t *comp)
389402
{
390-
if (wait_method == WAIT_YIELD)
391-
dsa_wait_yield(comp);
392-
else if (wait_method == WAIT_UMWAIT)
393-
dsa_wait_umwait(comp);
394-
else
395-
dsa_wait_busy_poll(comp);
403+
switch (wait_method) {
404+
case WAIT_YIELD:
405+
dsa_wait_yield(comp);
406+
break;
407+
case WAIT_UMWAIT:
408+
dsa_wait_umwait(comp);
409+
break;
410+
case WAIT_BUSYPOLL:
411+
dsa_wait_busy_poll(comp);
412+
break;
413+
case WAIT_TPAUSE:
414+
dsa_wait_tpause(comp);
415+
break;
416+
default:
417+
dsa_wait_busy_poll(comp);
418+
}
396419
}
397420

398421
/* A simple auto-tuning heuristic.
@@ -418,8 +441,9 @@ static __always_inline void dsa_wait_and_adjust(const volatile uint8_t *comp)
418441
uint64_t local_num_waits = 0;
419442

420443
if ((++num_descs & DESCS_PER_RUN) != DESCS_PER_RUN) {
421-
while (*comp == 0)
444+
while (*comp == 0) {
422445
__dsa_wait(comp);
446+
}
423447

424448
return;
425449
}
@@ -1097,13 +1121,13 @@ static int dsa_init(void)
10971121
const char *env_str;
10981122
char wq_list[256];
10991123

1100-
/* detect umwait support */
1124+
/* detect waitpkg support */
11011125
leaf = 7;
11021126
waitpkg = 0;
11031127
if (__get_cpuid(0, &leaf, unused, &waitpkg, unused + 1)) {
11041128
if (waitpkg & 0x20) {
1105-
LOG_TRACE("umwait supported\n");
1106-
umwait_support = 1;
1129+
LOG_TRACE("waitpkg supported\n");
1130+
waitpkg_support = 1;
11071131
}
11081132
}
11091133

@@ -1114,14 +1138,21 @@ static int dsa_init(void)
11141138
min_avg_waits = MIN_AVG_POLL_WAITS;
11151139
max_avg_waits = MAX_AVG_POLL_WAITS;
11161140
} else if (!strncmp(env_str, wait_names[WAIT_UMWAIT], strlen(wait_names[WAIT_UMWAIT]))) {
1117-
if (umwait_support) {
1141+
if (waitpkg_support) {
11181142
wait_method = WAIT_UMWAIT;
11191143
/* Use the same waits as busypoll for now */
11201144
min_avg_waits = MIN_AVG_POLL_WAITS;
11211145
max_avg_waits = MAX_AVG_POLL_WAITS;
11221146
} else
11231147
LOG_ERROR("umwait not supported. Falling back to default wait method\n");
1124-
}
1148+
} else if (!strncmp(env_str, wait_names[WAIT_TPAUSE], strlen(wait_names[WAIT_TPAUSE]))) {
1149+
if (waitpkg_support) {
1150+
wait_method = WAIT_TPAUSE;
1151+
} else {
1152+
LOG_ERROR("tpause not supported. Falling back to busypoll\n");
1153+
wait_method = WAIT_BUSYPOLL;
1154+
}
1155+
}
11251156
}
11261157

11271158
env_str = getenv("DTO_WQ_LIST");
@@ -1341,6 +1372,27 @@ static int init_dto(void)
13411372
use_std_lib_calls = 1;
13421373
}
13431374

1375+
// calculate the wait time for TPAUSE
1376+
if (wait_method == WAIT_TPAUSE) {
1377+
unsigned int num, den, freq;
1378+
unsigned int empty;
1379+
unsigned long long tmp;
1380+
__get_cpuid( 0x15, &den, &num, &freq, &empty );
1381+
freq /= 1000;
1382+
LOG_TRACE( "Core Freq = %u kHz\n", freq );
1383+
LOG_TRACE( "TSC Mult = %u\n", num );
1384+
LOG_TRACE( "TSC Den = %u\n", den );
1385+
freq *= num;
1386+
freq /= den;
1387+
LOG_TRACE( "CPU freq = %u kHz\n", freq );
1388+
1389+
LOG_TRACE( "Requested wait: %llu nsec\n", tpause_wait_time );
1390+
tmp = tpause_wait_time;
1391+
tmp *= freq;
1392+
tpause_wait_time = tmp / NSEC_PER_MSEC;
1393+
LOG_TRACE( "Requested wait duration: %llu cycles\n", tpause_wait_time );
1394+
}
1395+
13441396
// display configuration
13451397
LOG_TRACE("log_level: %d, collect_stats: %d, use_std_lib_calls: %d, dsa_min_size: %lu, "
13461398
"cpu_size_fraction: %.2f, wait_method: %s, auto_adjust_knobs: %d, numa_awareness: %s, dto_dsa_cc: %d\n",

0 commit comments

Comments
 (0)