diff --git a/crypto/Kconfig b/crypto/Kconfig index c0054b9f23cbb..17c113cd5fe52 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -2019,6 +2019,7 @@ config CRYPTO_ANSI_CPRNG tristate "Pseudo Random Number Generation for Cryptographic modules" select CRYPTO_AES select CRYPTO_RNG + select CRYPTO_SHA3 help This option enables the generic pseudo random number generator for cryptographic modules. Uses the Algorithm specified in diff --git a/crypto/aead.c b/crypto/aead.c index 16991095270d2..c4ece86c45bc4 100644 --- a/crypto/aead.c +++ b/crypto/aead.c @@ -35,8 +35,7 @@ static int setkey_unaligned(struct crypto_aead *tfm, const u8 *key, alignbuffer = (u8 *)ALIGN((unsigned long)buffer, alignmask + 1); memcpy(alignbuffer, key, keylen); ret = crypto_aead_alg(tfm)->setkey(tfm, alignbuffer, keylen); - memset(alignbuffer, 0, keylen); - kfree(buffer); + kfree_sensitive(buffer); return ret; } diff --git a/crypto/cipher.c b/crypto/cipher.c index b47141ed4a9f3..395f0c2fbb9ff 100644 --- a/crypto/cipher.c +++ b/crypto/cipher.c @@ -34,8 +34,7 @@ static int setkey_unaligned(struct crypto_cipher *tfm, const u8 *key, alignbuffer = (u8 *)ALIGN((unsigned long)buffer, alignmask + 1); memcpy(alignbuffer, key, keylen); ret = cia->cia_setkey(crypto_cipher_tfm(tfm), alignbuffer, keylen); - memset(alignbuffer, 0, keylen); - kfree(buffer); + kfree_sensitive(buffer); return ret; } diff --git a/crypto/drbg.c b/crypto/drbg.c index accf425de57f7..d14cc09b5d399 100644 --- a/crypto/drbg.c +++ b/crypto/drbg.c @@ -1283,6 +1283,12 @@ static inline int drbg_alloc_state(struct drbg_state *drbg) if (ret < 0) goto err; + /* + * Align to at least a cache line for better performance. This also + * prevents false sharing of cache lines between different instances. + */ + ret = max(ret, L1_CACHE_BYTES - 1); + drbg->Vbuf = kmalloc(drbg_statelen(drbg) + ret, GFP_KERNEL); if (!drbg->Vbuf) { ret = -ENOMEM; diff --git a/crypto/ecdh.c b/crypto/ecdh.c index fe8966511e9d7..85c64f1a40df2 100644 --- a/crypto/ecdh.c +++ b/crypto/ecdh.c @@ -10,6 +10,7 @@ #include #include #include +#include #include "ecc.h" struct ecdh_ctx { @@ -33,6 +34,8 @@ static int ecdh_set_secret(struct crypto_kpp *tfm, const void *buf, params.key_size > sizeof(u64) * ctx->ndigits) return -EINVAL; + memset(ctx->private_key, 0, sizeof(ctx->private_key)); + if (!params.key || !params.key_size) return ecc_gen_privkey(ctx->curve_id, ctx->ndigits, ctx->private_key); @@ -94,6 +97,36 @@ static int ecdh_compute_value(struct kpp_request *req) ctx->private_key, public_key); buf = public_key; nbytes = public_key_sz; + + /* + * SP800-56Arev3, 5.6.2.1.4: ("Owner Assurance of + * Pair-wise Consistency"): recompute the public key + * and check if the results match. 
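For the drbg.c hunk above: ret is used as an alignment mask at that point (note the kmalloc(drbg_statelen(drbg) + ret) that follows), which is why the cache-line size enters as L1_CACHE_BYTES - 1 rather than L1_CACHE_BYTES. A minimal sketch of the over-allocate-and-align idiom it feeds, with a hypothetical helper name, not part of this patch:

#include <linux/align.h>
#include <linux/cache.h>
#include <linux/slab.h>

/*
 * Over-allocate by the alignment mask, then round the pointer up; with a mask
 * of L1_CACHE_BYTES - 1 the usable region starts on a cache-line boundary.
 * The caller keeps @raw for kfree(), just as drbg keeps Vbuf separate from
 * the aligned pointer it actually uses.
 */
static void *alloc_cacheline_aligned(size_t len, void **raw)
{
	unsigned int mask = L1_CACHE_BYTES - 1;

	*raw = kmalloc(len + mask, GFP_KERNEL);
	return *raw ? PTR_ALIGN(*raw, mask + 1) : NULL;
}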
+ */ + if (fips_enabled) { + u64 *public_key_pct; + + if (ret < 0) + goto free_all; + + public_key_pct = kmalloc(public_key_sz, GFP_KERNEL); + if (!public_key_pct) { + ret = -ENOMEM; + goto free_all; + } + + ret = ecc_make_pub_key(ctx->curve_id, ctx->ndigits, + ctx->private_key, + public_key_pct); + if (ret < 0) { + kfree(public_key_pct); + goto free_all; + } + + if (memcmp(public_key, public_key_pct, public_key_sz)) + panic("ECDH PCT failed in FIPS mode"); + kfree(public_key_pct); + } } if (ret < 0) diff --git a/crypto/essiv.c b/crypto/essiv.c index 8bcc5bdcb2a95..ec81bdea25631 100644 --- a/crypto/essiv.c +++ b/crypto/essiv.c @@ -114,13 +114,16 @@ static int essiv_aead_setkey(struct crypto_aead *tfm, const u8 *key, crypto_shash_update(desc, keys.enckey, keys.enckeylen) ?: crypto_shash_finup(desc, keys.authkey, keys.authkeylen, salt); if (err) - return err; + goto out; crypto_cipher_clear_flags(tctx->essiv_cipher, CRYPTO_TFM_REQ_MASK); crypto_cipher_set_flags(tctx->essiv_cipher, crypto_aead_get_flags(tfm) & CRYPTO_TFM_REQ_MASK); - return crypto_cipher_setkey(tctx->essiv_cipher, salt, - crypto_shash_digestsize(tctx->hash)); + err = crypto_cipher_setkey(tctx->essiv_cipher, salt, + crypto_shash_digestsize(tctx->hash)); +out: + memzero_explicit(&keys, sizeof(keys)); + return err; } static int essiv_aead_setauthsize(struct crypto_aead *tfm, diff --git a/crypto/jitterentropy-kcapi.c b/crypto/jitterentropy-kcapi.c index b9edfaa51b273..4b50cbc8a2faf 100644 --- a/crypto/jitterentropy-kcapi.c +++ b/crypto/jitterentropy-kcapi.c @@ -2,7 +2,7 @@ * Non-physical true random number generator based on timing jitter -- * Linux Kernel Crypto API specific code * - * Copyright Stephan Mueller , 2015 + * Copyright Stephan Mueller , 2015 - 2023 * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -37,6 +37,8 @@ * DAMAGE. */ +#include +#include #include #include #include @@ -46,6 +48,8 @@ #include "jitterentropy.h" +#define JENT_CONDITIONING_HASH "sha3-256-generic" + /*************************************************************************** * Helper function ***************************************************************************/ @@ -60,11 +64,6 @@ void jent_zfree(void *ptr) kfree_sensitive(ptr); } -void jent_memcpy(void *dest, const void *src, unsigned int n) -{ - memcpy(dest, src, n); -} - /* * Obtain a high-resolution time stamp value. The time stamp is used to measure * the execution time of a given code path and its variations. Hence, the time @@ -91,6 +90,91 @@ void jent_get_nstime(__u64 *out) *out = tmp; } +int jent_hash_time(void *hash_state, __u64 time, u8 *addtl, + unsigned int addtl_len, __u64 hash_loop_cnt, + unsigned int stuck) +{ + struct shash_desc *hash_state_desc = (struct shash_desc *)hash_state; + SHASH_DESC_ON_STACK(desc, hash_state_desc->tfm); + u8 intermediary[SHA3_256_DIGEST_SIZE]; + __u64 j = 0; + int ret; + + desc->tfm = hash_state_desc->tfm; + + if (sizeof(intermediary) != crypto_shash_digestsize(desc->tfm)) { + pr_warn_ratelimited("Unexpected digest size\n"); + return -EINVAL; + } + + /* + * This loop fills a buffer which is injected into the entropy pool. + * The main reason for this loop is to execute something over which we + * can perform a timing measurement. The injection of the resulting + * data into the pool is performed to ensure the result is used and + * the compiler cannot optimize the loop away in case the result is not + * used at all. 
Yet that data is considered "additional information" + * considering the terminology from SP800-90A without any entropy. + * + * Note, it does not matter which or how much data you inject, we are + * interested in one Keccack1600 compression operation performed with + * the crypto_shash_final. + */ + for (j = 0; j < hash_loop_cnt; j++) { + ret = crypto_shash_init(desc) ?: + crypto_shash_update(desc, intermediary, + sizeof(intermediary)) ?: + crypto_shash_finup(desc, addtl, addtl_len, intermediary); + if (ret) + goto err; + } + + /* + * Inject the data from the previous loop into the pool. This data is + * not considered to contain any entropy, but it stirs the pool a bit. + */ + ret = crypto_shash_update(desc, intermediary, sizeof(intermediary)); + if (ret) + goto err; + + /* + * Insert the time stamp into the hash context representing the pool. + * + * If the time stamp is stuck, do not finally insert the value into the + * entropy pool. Although this operation should not do any harm even + * when the time stamp has no entropy, SP800-90B requires that any + * conditioning operation to have an identical amount of input data + * according to section 3.1.5. + */ + if (!stuck) { + ret = crypto_shash_update(hash_state_desc, (u8 *)&time, + sizeof(__u64)); + } + +err: + shash_desc_zero(desc); + memzero_explicit(intermediary, sizeof(intermediary)); + + return ret; +} + +int jent_read_random_block(void *hash_state, char *dst, unsigned int dst_len) +{ + struct shash_desc *hash_state_desc = (struct shash_desc *)hash_state; + u8 jent_block[SHA3_256_DIGEST_SIZE]; + /* Obtain data from entropy pool and re-initialize it */ + int ret = crypto_shash_final(hash_state_desc, jent_block) ?: + crypto_shash_init(hash_state_desc) ?: + crypto_shash_update(hash_state_desc, jent_block, + sizeof(jent_block)); + + if (!ret && dst_len) + memcpy(dst, jent_block, dst_len); + + memzero_explicit(jent_block, sizeof(jent_block)); + return ret; +} + /*************************************************************************** * Kernel crypto API interface ***************************************************************************/ @@ -98,32 +182,82 @@ void jent_get_nstime(__u64 *out) struct jitterentropy { spinlock_t jent_lock; struct rand_data *entropy_collector; + struct crypto_shash *tfm; + struct shash_desc *sdesc; }; -static int jent_kcapi_init(struct crypto_tfm *tfm) +static void jent_kcapi_cleanup(struct crypto_tfm *tfm) { struct jitterentropy *rng = crypto_tfm_ctx(tfm); - int ret = 0; - rng->entropy_collector = jent_entropy_collector_alloc(1, 0); - if (!rng->entropy_collector) - ret = -ENOMEM; + spin_lock(&rng->jent_lock); - spin_lock_init(&rng->jent_lock); - return ret; -} + if (rng->sdesc) { + shash_desc_zero(rng->sdesc); + kfree(rng->sdesc); + } + rng->sdesc = NULL; -static void jent_kcapi_cleanup(struct crypto_tfm *tfm) -{ - struct jitterentropy *rng = crypto_tfm_ctx(tfm); + if (rng->tfm) + crypto_free_shash(rng->tfm); + rng->tfm = NULL; - spin_lock(&rng->jent_lock); if (rng->entropy_collector) jent_entropy_collector_free(rng->entropy_collector); rng->entropy_collector = NULL; spin_unlock(&rng->jent_lock); } +static int jent_kcapi_init(struct crypto_tfm *tfm) +{ + struct jitterentropy *rng = crypto_tfm_ctx(tfm); + struct crypto_shash *hash; + struct shash_desc *sdesc; + int size, ret = 0; + + spin_lock_init(&rng->jent_lock); + + /* + * Use SHA3-256 as conditioner. We allocate only the generic + * implementation as we are not interested in high-performance. 
The + * execution time of the SHA3 operation is measured and adds to the + * Jitter RNG's unpredictable behavior. If we have a slower hash + * implementation, the execution timing variations are larger. When + * using a fast implementation, we would need to call it more often + * as its variations are lower. + */ + hash = crypto_alloc_shash(JENT_CONDITIONING_HASH, 0, 0); + if (IS_ERR(hash)) { + pr_err("Cannot allocate conditioning digest\n"); + return PTR_ERR(hash); + } + rng->tfm = hash; + + size = sizeof(struct shash_desc) + crypto_shash_descsize(hash); + sdesc = kmalloc(size, GFP_KERNEL); + if (!sdesc) { + ret = -ENOMEM; + goto err; + } + + sdesc->tfm = hash; + crypto_shash_init(sdesc); + rng->sdesc = sdesc; + + rng->entropy_collector = jent_entropy_collector_alloc(1, 0, sdesc); + if (!rng->entropy_collector) { + ret = -ENOMEM; + goto err; + } + + spin_lock_init(&rng->jent_lock); + return 0; + +err: + jent_kcapi_cleanup(tfm); + return ret; +} + static int jent_kcapi_random(struct crypto_rng *tfm, const u8 *src, unsigned int slen, u8 *rdata, unsigned int dlen) @@ -180,15 +314,24 @@ static struct rng_alg jent_alg = { .cra_module = THIS_MODULE, .cra_init = jent_kcapi_init, .cra_exit = jent_kcapi_cleanup, - } }; static int __init jent_mod_init(void) { + SHASH_DESC_ON_STACK(desc, tfm); + struct crypto_shash *tfm; int ret = 0; - ret = jent_entropy_init(); + tfm = crypto_alloc_shash(JENT_CONDITIONING_HASH, 0, 0); + if (IS_ERR(tfm)) + return PTR_ERR(tfm); + + desc->tfm = tfm; + crypto_shash_init(desc); + ret = jent_entropy_init(desc); + shash_desc_zero(desc); + crypto_free_shash(tfm); if (ret) { /* Handle permanent health test error */ if (fips_enabled) diff --git a/crypto/jitterentropy.c b/crypto/jitterentropy.c index 227cedfa4f0ae..5b224d3d7442e 100644 --- a/crypto/jitterentropy.c +++ b/crypto/jitterentropy.c @@ -2,7 +2,7 @@ * Non-physical true random number generator based on timing jitter -- * Jitter RNG standalone code. * - * Copyright Stephan Mueller , 2015 - 2020 + * Copyright Stephan Mueller , 2015 - 2023 * * Design * ====== @@ -57,21 +57,22 @@ typedef unsigned long long __u64; typedef long long __s64; typedef unsigned int __u32; +typedef unsigned char u8; #define NULL ((void *) 0) /* The entropy pool */ struct rand_data { + /* SHA3-256 is used as conditioner */ +#define DATA_SIZE_BITS 256 /* all data values that are vital to maintain the security * of the RNG are marked as SENSITIVE. A user must not * access that information while the RNG executes its loops to * calculate the next random value. */ - __u64 data; /* SENSITIVE Actual random number */ - __u64 old_data; /* SENSITIVE Previous random number */ - __u64 prev_time; /* SENSITIVE Previous time stamp */ -#define DATA_SIZE_BITS ((sizeof(__u64)) * 8) - __u64 last_delta; /* SENSITIVE stuck test */ - __s64 last_delta2; /* SENSITIVE stuck test */ - unsigned int osr; /* Oversample rate */ + void *hash_state; /* SENSITIVE hash state entropy pool */ + __u64 prev_time; /* SENSITIVE Previous time stamp */ + __u64 last_delta; /* SENSITIVE stuck test */ + __s64 last_delta2; /* SENSITIVE stuck test */ + unsigned int osr; /* Oversample rate */ #define JENT_MEMORY_BLOCKS 64 #define JENT_MEMORY_BLOCKSIZE 32 #define JENT_MEMORY_ACCESSLOOPS 128 @@ -301,15 +302,13 @@ static int jent_permanent_health_failure(struct rand_data *ec) * an entropy collection. 
* * Input: - * @ec entropy collector struct -- may be NULL * @bits is the number of low bits of the timer to consider * @min is the number of bits we shift the timer value to the right at * the end to make sure we have a guaranteed minimum value * * @return Newly calculated loop counter */ -static __u64 jent_loop_shuffle(struct rand_data *ec, - unsigned int bits, unsigned int min) +static __u64 jent_loop_shuffle(unsigned int bits, unsigned int min) { __u64 time = 0; __u64 shuffle = 0; @@ -317,12 +316,7 @@ static __u64 jent_loop_shuffle(struct rand_data *ec, unsigned int mask = (1<data; + /* * We fold the time value as much as possible to ensure that as many * bits of the time stamp are included as possible. @@ -344,81 +338,32 @@ static __u64 jent_loop_shuffle(struct rand_data *ec, * execution time jitter * * This function injects the individual bits of the time value into the - * entropy pool using an LFSR. + * entropy pool using a hash. * - * The code is deliberately inefficient with respect to the bit shifting - * and shall stay that way. This function is the root cause why the code - * shall be compiled without optimization. This function not only acts as - * folding operation, but this function's execution is used to measure - * the CPU execution time jitter. Any change to the loop in this function - * implies that careful retesting must be done. - * - * @ec [in] entropy collector struct - * @time [in] time stamp to be injected - * @loop_cnt [in] if a value not equal to 0 is set, use the given value as - * number of loops to perform the folding - * @stuck [in] Is the time stamp identified as stuck? + * ec [in] entropy collector + * time [in] time stamp to be injected + * stuck [in] Is the time stamp identified as stuck? * * Output: - * updated ec->data - * - * @return Number of loops the folding operation is performed + * updated hash context in the entropy collector or error code */ -static void jent_lfsr_time(struct rand_data *ec, __u64 time, __u64 loop_cnt, - int stuck) +static int jent_condition_data(struct rand_data *ec, __u64 time, int stuck) { - unsigned int i; - __u64 j = 0; - __u64 new = 0; -#define MAX_FOLD_LOOP_BIT 4 -#define MIN_FOLD_LOOP_BIT 0 - __u64 fold_loop_cnt = - jent_loop_shuffle(ec, MAX_FOLD_LOOP_BIT, MIN_FOLD_LOOP_BIT); - - /* - * testing purposes -- allow test app to set the counter, not - * needed during runtime - */ - if (loop_cnt) - fold_loop_cnt = loop_cnt; - for (j = 0; j < fold_loop_cnt; j++) { - new = ec->data; - for (i = 1; (DATA_SIZE_BITS) >= i; i++) { - __u64 tmp = time << (DATA_SIZE_BITS - i); - - tmp = tmp >> (DATA_SIZE_BITS - 1); - - /* - * Fibonacci LSFR with polynomial of - * x^64 + x^61 + x^56 + x^31 + x^28 + x^23 + 1 which is - * primitive according to - * http://poincare.matf.bg.ac.rs/~ezivkovm/publications/primpol1.pdf - * (the shift values are the polynomial values minus one - * due to counting bits from 0 to 63). As the current - * position is always the LSB, the polynomial only needs - * to shift data in from the left without wrap. - */ - tmp ^= ((new >> 63) & 1); - tmp ^= ((new >> 60) & 1); - tmp ^= ((new >> 55) & 1); - tmp ^= ((new >> 30) & 1); - tmp ^= ((new >> 27) & 1); - tmp ^= ((new >> 22) & 1); - new <<= 1; - new ^= tmp; - } - } - - /* - * If the time stamp is stuck, do not finally insert the value into - * the entropy pool. 
Although this operation should not do any harm - * even when the time stamp has no entropy, SP800-90B requires that - * any conditioning operation (SP800-90B considers the LFSR to be a - * conditioning operation) to have an identical amount of input - * data according to section 3.1.5. - */ - if (!stuck) - ec->data = new; +#define SHA3_HASH_LOOP (1<<3) + struct { + int rct_count; + unsigned int apt_observations; + unsigned int apt_count; + unsigned int apt_base; + } addtl = { + ec->rct_count, + ec->apt_observations, + ec->apt_count, + ec->apt_base + }; + + return jent_hash_time(ec->hash_state, time, (u8 *)&addtl, sizeof(addtl), + SHA3_HASH_LOOP, stuck); } /* @@ -452,7 +397,7 @@ static void jent_memaccess(struct rand_data *ec, __u64 loop_cnt) #define MAX_ACC_LOOP_BIT 7 #define MIN_ACC_LOOP_BIT 0 __u64 acc_loop_cnt = - jent_loop_shuffle(ec, MAX_ACC_LOOP_BIT, MIN_ACC_LOOP_BIT); + jent_loop_shuffle(MAX_ACC_LOOP_BIT, MIN_ACC_LOOP_BIT); if (NULL == ec || NULL == ec->mem) return; @@ -520,14 +465,15 @@ static int jent_measure_jitter(struct rand_data *ec) stuck = jent_stuck(ec, current_delta); /* Now call the next noise sources which also injects the data */ - jent_lfsr_time(ec, current_delta, 0, stuck); + if (jent_condition_data(ec, current_delta, stuck)) + stuck = 1; return stuck; } /* * Generator of one 64 bit random number - * Function fills rand_data->data + * Function fills rand_data->hash_state * * @ec [in] Reference to entropy collector */ @@ -574,7 +520,7 @@ static void jent_gen_entropy(struct rand_data *ec) * @return 0 when request is fulfilled or an error * * The following error codes can occur: - * -1 entropy_collector is NULL + * -1 entropy_collector is NULL or the generation failed * -2 Intermittent health failure * -3 Permanent health failure */ @@ -604,7 +550,7 @@ int jent_read_entropy(struct rand_data *ec, unsigned char *data, * Perform startup health tests and return permanent * error if it fails. */ - if (jent_entropy_init()) + if (jent_entropy_init(ec->hash_state)) return -3; return -2; @@ -614,7 +560,8 @@ int jent_read_entropy(struct rand_data *ec, unsigned char *data, tocopy = (DATA_SIZE_BITS / 8); else tocopy = len; - jent_memcpy(p, &ec->data, tocopy); + if (jent_read_random_block(ec->hash_state, p, tocopy)) + return -1; len -= tocopy; p += tocopy; @@ -628,7 +575,8 @@ int jent_read_entropy(struct rand_data *ec, unsigned char *data, ***************************************************************************/ struct rand_data *jent_entropy_collector_alloc(unsigned int osr, - unsigned int flags) + unsigned int flags, + void *hash_state) { struct rand_data *entropy_collector; @@ -655,6 +603,8 @@ struct rand_data *jent_entropy_collector_alloc(unsigned int osr, osr = 1; /* minimum sampling rate is 1 */ entropy_collector->osr = osr; + entropy_collector->hash_state = hash_state; + /* fill the data pad with non-zero values */ jent_gen_entropy(entropy_collector); @@ -668,7 +618,7 @@ void jent_entropy_collector_free(struct rand_data *entropy_collector) jent_zfree(entropy_collector); } -int jent_entropy_init(void) +int jent_entropy_init(void *hash_state) { int i; __u64 delta_sum = 0; @@ -681,6 +631,7 @@ int jent_entropy_init(void) /* Required for RCT */ ec.osr = 1; + ec.hash_state = hash_state; /* We could perform statistical tests here, but the problem is * that we only have a few loop counts to do testing. 
These @@ -718,7 +669,7 @@ int jent_entropy_init(void) /* Invoke core entropy collection logic */ jent_get_nstime(&time); ec.prev_time = time; - jent_lfsr_time(&ec, time, 0, 0); + jent_condition_data(&ec, time, 0); jent_get_nstime(&time2); /* test whether timer works */ diff --git a/crypto/jitterentropy.h b/crypto/jitterentropy.h index 5cc583f6bc6b8..b3890ff26a023 100644 --- a/crypto/jitterentropy.h +++ b/crypto/jitterentropy.h @@ -2,14 +2,18 @@ extern void *jent_zalloc(unsigned int len); extern void jent_zfree(void *ptr); -extern void jent_memcpy(void *dest, const void *src, unsigned int n); extern void jent_get_nstime(__u64 *out); +extern int jent_hash_time(void *hash_state, __u64 time, u8 *addtl, + unsigned int addtl_len, __u64 hash_loop_cnt, + unsigned int stuck); +int jent_read_random_block(void *hash_state, char *dst, unsigned int dst_len); struct rand_data; -extern int jent_entropy_init(void); +extern int jent_entropy_init(void *hash_state); extern int jent_read_entropy(struct rand_data *ec, unsigned char *data, unsigned int len); extern struct rand_data *jent_entropy_collector_alloc(unsigned int osr, - unsigned int flags); + unsigned int flags, + void *hash_state); extern void jent_entropy_collector_free(struct rand_data *entropy_collector); diff --git a/crypto/rng.c b/crypto/rng.c index c650678106a7f..b5ab31ef7db9a 100644 --- a/crypto/rng.c +++ b/crypto/rng.c @@ -6,6 +6,9 @@ * * Copyright (c) 2008 Neil Horman * Copyright (c) 2015 Herbert Xu + * + * Copyright (C) 2025 Ctrl IQ, Inc. + * Author: Sultan Alsawaf */ #include @@ -14,7 +17,7 @@ #include #include #include -#include +#include #include #include #include @@ -26,12 +29,38 @@ #include "internal.h" -static ____cacheline_aligned_in_smp DEFINE_MUTEX(crypto_reseed_rng_lock); -static struct crypto_rng *crypto_reseed_rng; -static ____cacheline_aligned_in_smp DEFINE_MUTEX(crypto_default_rng_lock); +static ____cacheline_aligned_in_smp DEFINE_RT_MUTEX(crypto_default_rng_lock); struct crypto_rng *crypto_default_rng; EXPORT_SYMBOL_GPL(crypto_default_rng); -static int crypto_default_rng_refcnt; +static unsigned int crypto_default_rng_refcnt; + +/* + * Per-CPU RNG instances are only used by crypto_devrandom_rng. The global RNG, + * crypto_default_rng, is only used directly by other drivers. + * + * Per-CPU instances of the DRBG are efficient because the DRBG itself supports + * an arbitrary number of instances and can be seeded on a per-CPU basis. + * + * Specifically, the DRBG is seeded by the CRNG and the Jitter RNG. The CRNG is + * globally accessible and is already per-CPU. And while the Jitter RNG _isn't_ + * per-CPU, creating a DRBG instance also creates a Jitter RNG instance; + * therefore, per-CPU DRBG instances implies per-CPU Jitter RNG instances. 
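A condensed sketch of the access pattern this enables, using the structures defined just below; it is simplified from lock_default_rng() further down and is illustration only:

#include <linux/local_lock.h>
#include <linux/percpu.h>

static void pcpu_rng_access_sketch(void)
{
	struct cpu_rng_inst __percpu *pcri = &pcpu_default_rng;
	struct cpu_rng_inst *cri;

	/* On non-PREEMPT_RT kernels this disables preemption, so the task
	 * cannot migrate away from this CPU's instance while holding it. */
	local_lock(&pcri->lock);
	cri = this_cpu_ptr(pcri);

	/* ... use cri->rng; no other context touches this CPU's DRBG ... */

	local_unlock(&cri->lock);
}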
+ */ +struct cpu_rng_inst { + local_lock_t lock; + struct rt_mutex mlock; + struct crypto_rng *rng; + void *page; +}; + +static DEFINE_PER_CPU_ALIGNED(struct cpu_rng_inst, pcpu_default_rng) = { + .lock = INIT_LOCAL_LOCK(pcpu_default_rng.lock), + .mlock = __RT_MUTEX_INITIALIZER(pcpu_default_rng.mlock) +}; +static DEFINE_PER_CPU_ALIGNED(struct cpu_rng_inst, pcpu_reseed_rng) = { + /* The reseed instances don't use the local lock */ + .mlock = __RT_MUTEX_INITIALIZER(pcpu_reseed_rng.mlock) +}; int crypto_rng_reset(struct crypto_rng *tfm, const u8 *seed, unsigned int slen) { @@ -145,11 +174,11 @@ int crypto_get_default_rng(void) { int err; - mutex_lock(&crypto_default_rng_lock); + rt_mutex_lock(&crypto_default_rng_lock); err = crypto_get_rng(&crypto_default_rng); if (!err) crypto_default_rng_refcnt++; - mutex_unlock(&crypto_default_rng_lock); + rt_mutex_unlock(&crypto_default_rng_lock); return err; } @@ -157,41 +186,61 @@ EXPORT_SYMBOL_GPL(crypto_get_default_rng); void crypto_put_default_rng(void) { - mutex_lock(&crypto_default_rng_lock); + rt_mutex_lock(&crypto_default_rng_lock); crypto_default_rng_refcnt--; - mutex_unlock(&crypto_default_rng_lock); + rt_mutex_unlock(&crypto_default_rng_lock); } EXPORT_SYMBOL_GPL(crypto_put_default_rng); #if defined(CONFIG_CRYPTO_RNG) || defined(CONFIG_CRYPTO_RNG_MODULE) -static int crypto_del_rng(struct crypto_rng **rngp, int *refcntp, - struct mutex *lock) -{ - int err = -EBUSY; - - mutex_lock(lock); - if (refcntp && *refcntp) - goto out; - - crypto_free_rng(*rngp); - *rngp = NULL; +#define down_read_del_pcpu_rwsem() down_read(&del_pcpu_rwsem) +#define up_read_del_pcpu_rwsem() up_read(&del_pcpu_rwsem) +static DECLARE_RWSEM(del_pcpu_rwsem); - err = 0; +static void crypto_del_pcpu_rng(struct cpu_rng_inst __percpu *pcri) +{ + int cpu; -out: - mutex_unlock(lock); + for_each_possible_cpu(cpu) { + struct cpu_rng_inst *cri = per_cpu_ptr(pcri, cpu); - return err; + if (cri->rng) { + crypto_free_rng(cri->rng); + cri->rng = NULL; + } + } } int crypto_del_default_rng(void) { - return crypto_del_rng(&crypto_default_rng, &crypto_default_rng_refcnt, - &crypto_default_rng_lock) ?: - crypto_del_rng(&crypto_reseed_rng, NULL, - &crypto_reseed_rng_lock); + bool busy; + + rt_mutex_lock(&crypto_default_rng_lock); + if (!(busy = crypto_default_rng_refcnt)) { + crypto_free_rng(crypto_default_rng); + crypto_default_rng = NULL; + } + rt_mutex_unlock(&crypto_default_rng_lock); + if (busy) + return -EBUSY; + + if (!down_write_trylock(&del_pcpu_rwsem)) + return -EBUSY; + + crypto_del_pcpu_rng(&pcpu_default_rng); + crypto_del_pcpu_rng(&pcpu_reseed_rng); + up_write(&del_pcpu_rwsem); + + return 0; } EXPORT_SYMBOL_GPL(crypto_del_default_rng); +#else +static inline void down_read_del_pcpu_rwsem(void) +{ +} +static inline void up_read_del_pcpu_rwsem(void) +{ +} #endif int crypto_register_rng(struct rng_alg *alg) @@ -244,80 +293,343 @@ void crypto_unregister_rngs(struct rng_alg *algs, int count) } EXPORT_SYMBOL_GPL(crypto_unregister_rngs); +/* + * On non-PREEMPT_RT kernels, local locks disable preemption. When there's no + * rng allocated, one must be allocated by calling crypto_get_rng(), which can + * sleep. Therefore, crypto_get_rng() cannot be called under local_lock(), so if + * our CPU's RNG instance doesn't have an rng allocated, we drop the local lock + * and take a mutex lock instead. 
After the local lock is dropped, the current + * task can be freely migrated to another CPU, which means that calling + * local_lock() again might not result in the same instance getting locked as + * before. That's why this function exists: to loop on calling local_lock() and + * allocating an rng as needed with crypto_get_rng() until the current CPU's + * instance is found to have an rng allocated. If crypto_get_rng() ever fails, + * this function returns an error even if there are instances for other CPUs + * which _do_ have an rng allocated. + */ +static __always_inline struct cpu_rng_inst * +lock_default_rng(struct crypto_rng **rng) __acquires(&cri->lock) +{ + struct cpu_rng_inst __percpu *pcri = &pcpu_default_rng; + struct cpu_rng_inst *cri; + int ret; + + while (1) { + local_lock(&pcri->lock); + cri = this_cpu_ptr(pcri); + /* + * cri->rng may have transitioned from non-NULL to NULL, but + * underneath down_read_del_pcpu_rwsem() it can only transition + * from NULL to non-NULL. This may occur on a different CPU, + * thus cri->rng must be read atomically to prevent data races; + * this elides mlock by pairing with the WRITE_ONCE() in the + * slow path below. + * + * And if cri->rng is non-NULL, then it is good to go. To avoid + * data races due to load speculation on torn cri->rng loads + * _after_ the NULL check, one of the following is required: + * 1. smp_acquire__after_ctrl_dep() in the if-statement + * 2. All cri->rng reads are performed with READ_ONCE() + * 3. cri->rng is never read again outside this function + * + * Option #3 yields the best performance, so this function + * provides the rng pointer as an output for the caller to use. + */ + *rng = READ_ONCE(cri->rng); + if (likely(*rng)) + return cri; + + /* + * Slow path: there's no rng currently allocated to this instance. + * Release the local lock and acquire this instance's mlock to + * perform the allocation. + * + * Note that this task may be migrated to a different CPU now! + */ + local_unlock(&cri->lock); + rt_mutex_lock(&cri->mlock); + if (!cri->rng) { + struct crypto_rng *new_rng = NULL; + + ret = crypto_get_rng(&new_rng); + if (ret) { + rt_mutex_unlock(&cri->mlock); + break; + } + + /* + * Pairs with READ_ONCE() above, because we might not be + * on the same CPU anymore as when we first got `cri`. + */ + WRITE_ONCE(cri->rng, new_rng); + } + rt_mutex_unlock(&cri->mlock); + } + + /* + * Even if this task got migrated to another CPU that _does_ have an rng + * allocated, just bail out if crypto_get_rng() ever fails in order to + * avoid looping forever. + */ + return ERR_PTR(ret); +} + +static __always_inline struct cpu_rng_inst * +lock_reseed_rng(struct crypto_rng **rng) __acquires(&cri->mlock) +{ + struct cpu_rng_inst __percpu *pcri = &pcpu_reseed_rng; + struct cpu_rng_inst *cri; + int ret; + + /* + * Use whichever CPU this task is currently running on, knowing full + * well that the task can freely migrate to other CPUs. The reseed RNG + * requires holding a lock across the entire devrandom read, so that + * another task cannot extract entropy from the same seed. In other + * words, when reseeding is requested, reseeding must be done every time + * every time mlock is acquired. + */ + cri = raw_cpu_ptr(pcri); + rt_mutex_lock(&cri->mlock); + if (likely(cri->rng)) { + /* + * Since this rng instance wasn't just allocated, it needs to be + * explicitly reseeded. New rng instances are seeded on creation + * in crypto_get_rng() and thus don't need explicit reseeding. 
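As noted, fresh instances are seeded inside crypto_get_rng(). For completeness, an existing instance can also be reseeded through the public kcapi interface; a hypothetical caller, separate from the NEED_RESEED fast path used here:

#include <crypto/rng.h>

static int rng_reseed_then_read(struct crypto_rng *rng, u8 *out,
				unsigned int len)
{
	/* A NULL seed makes crypto_rng_reset() pull fresh seed material */
	int err = crypto_rng_reset(rng, NULL, crypto_rng_seedsize(rng));

	return err ?: crypto_rng_get_bytes(rng, out, len);
}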
+ */ + crypto_tfm_set_flags(crypto_rng_tfm(cri->rng), + CRYPTO_TFM_REQ_NEED_RESEED); + } else { + ret = crypto_get_rng(&cri->rng); + if (ret) { + rt_mutex_unlock(&cri->mlock); + return ERR_PTR(ret); + } + } + + *rng = cri->rng; + return cri; +} + +#define lock_local_rng(rng, reseed) \ + ({ (reseed) ? lock_reseed_rng(rng) : lock_default_rng(rng); }) + +#define unlock_local_rng(cri, reseed) \ +do { \ + if (reseed) \ + rt_mutex_unlock(&(cri)->mlock); \ + else \ + local_unlock(&(cri)->lock); \ +} while (0) + +static __always_inline void +clear_rng_page(struct cpu_rng_inst *cri, size_t count) +{ + /* For zeroing a whole page, clear_page() is faster than memset() */ + count < PAGE_SIZE ? memset(cri->page, 0, count) : clear_page(cri->page); +} + static ssize_t crypto_devrandom_read_iter(struct iov_iter *iter, bool reseed) { + /* lock_local_rng() puts us in atomic context for !reseed on non-RT */ + const bool atomic = !reseed && !IS_ENABLED(CONFIG_PREEMPT_RT); + const bool user_no_reseed = !reseed && user_backed_iter(iter); + size_t ulen, page_dirty_len = 0; + struct cpu_rng_inst *cri; struct crypto_rng *rng; - u8 tmp[256]; - ssize_t ret; + void __user *uaddr; + struct page *upage; + ssize_t ret = 0; if (unlikely(!iov_iter_count(iter))) return 0; - if (reseed) { - u32 flags = 0; - - /* If reseeding is requested, acquire a lock on - * crypto_reseed_rng so it is not swapped out until - * the initial random bytes are generated. - * - * The algorithm implementation is also protected with - * a separate mutex (drbg->drbg_mutex) around the - * reseed-and-generate operation. - */ - mutex_lock(&crypto_reseed_rng_lock); - - /* If crypto_default_rng is not set, it will be seeded - * at creation in __crypto_get_default_rng and thus no - * reseeding is needed. - */ - if (crypto_reseed_rng) - flags |= CRYPTO_TFM_REQ_NEED_RESEED; - - ret = crypto_get_rng(&crypto_reseed_rng); - if (ret) { - mutex_unlock(&crypto_reseed_rng_lock); - return ret; + /* Set up the starting user destination address and length */ + if (user_no_reseed) { + if (iter_is_ubuf(iter)) { + uaddr = iter->ubuf + iter->iov_offset; + ulen = iov_iter_count(iter); + } else if (iter_is_iovec(iter)) { + uaddr = iter_iov_addr(iter); + ulen = iter_iov_len(iter); + } else { + /* + * ITER_UBUF and ITER_IOVEC are the only user-backed + * iters. Bug out if a new user-backed iter appears. + */ + BUG(); } + } - rng = crypto_reseed_rng; - crypto_tfm_set_flags(crypto_rng_tfm(rng), flags); - } else { - ret = crypto_get_default_rng(); - if (ret) - return ret; - rng = crypto_default_rng; + /* Prevent rngs from getting deleted from per-CPU RNG instances */ + down_read_del_pcpu_rwsem(); +restart: + /* + * Pin the user page backing the current user destination address, + * potentially prefaulting to allocate a page for the destination. By + * prefaulting without the RNG lock held, the DRBG won't be blocked by + * time spent on page faults for this task, and thus the DRBG can still + * be used by other tasks. 
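A stripped-down sketch of that pin-then-copy-without-faulting idea, with a hypothetical helper; the real loop below additionally handles iov bookkeeping, rescheduling and restarts:

#include <linux/mm.h>
#include <linux/uaccess.h>

static int fill_user_page_nofault(void __user *uaddr, const void *src,
				  size_t len)
{
	struct page *upage;
	int err;

	/* Prefault and pin before taking the (possibly atomic) RNG lock */
	if (pin_user_pages_fast((unsigned long)uaddr, 1, FOLL_WRITE,
				&upage) != 1)
		return -EFAULT;

	/* Never sleeps on a fault; treat any non-zero return as failure */
	err = copy_to_user_nofault(uaddr, src, len) ? -EFAULT : 0;

	unpin_user_page(upage);
	return err;
}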
+ */ + if (user_no_reseed && pin_user_pages_fast((unsigned long)uaddr, 1, + FOLL_WRITE, &upage) != 1) + goto up_rwsem; + + cri = lock_local_rng(&rng, reseed); + if (IS_ERR(cri)) { + if (!ret) + ret = PTR_ERR(cri); + goto unpin_upage; } - for (;;) { - size_t i, copied; + while (1) { + size_t copied, i = min(iov_iter_count(iter), PAGE_SIZE); + bool resched_without_lock = false; int err; - i = min_t(size_t, iov_iter_count(iter), sizeof(tmp)); - err = crypto_rng_get_bytes(rng, tmp, i); + /* + * Generate up to one page at a time, and align to a page + * boundary so we only need to pin one user page at a time. + */ + if (user_no_reseed) + i = min3(i, PAGE_SIZE - offset_in_page(uaddr), ulen); + + /* + * On non-PREEMPT_RT kernels, local locks disable preemption. + * The DRBG's generate() function has a mutex lock, which could + * mean that we'll schedule while atomic if the mutex lock + * sleeps. However, that will never happen if we ensure that + * there's never any contention on the DRBG's mutex lock while + * we're atomic! Our local lock ensures calls to the DRBG are + * always serialized, so there's no contention from here. And + * the DRBG only uses its mutex lock from one other path, when + * an instance of the DRBG is freshly allocated, which we only + * do from crypto_get_rng(). So the DRBG's mutex lock is + * guaranteed to not have contention when we call generate() and + * thus it'll never sleep here. And of course, nothing else in + * generate() ever sleeps. + */ + err = crypto_rng_get_bytes(rng, cri->page, i); if (err) { - ret = ret ?: err; + if (!ret) + ret = err; break; } - copied = copy_to_iter(tmp, i, iter); - ret += copied; + /* + * Record the number of bytes used in cri->page and either copy + * directly to the user address without faulting, or copy to the + * iter which is always backed by kernel memory when !reseed && + * !user_backed_iter(). When reseed == true, the iter may be + * backed by user memory, but we copy to it with the possibility + * of page faults anyway because we need to hold the lock across + * the entire call; this is why a mutex is used instead of a + * local lock for the reseed RNG, to permit sleeping without + * yielding the DRBG instance. + */ + page_dirty_len = max(i, page_dirty_len); + if (user_no_reseed) { + err = copy_to_user_nofault(uaddr, cri->page, i); + if (err >= 0) { + iov_iter_advance(iter, i - err); + ret += i - err; + } + if (err) + break; + } else { + /* + * We know that copying from cri->page is safe, so use + * _copy_to_iter() directly to skip check_copy_size(). + */ + copied = _copy_to_iter(cri->page, i, iter); + ret += copied; + if (copied != i) + break; + } - if (!iov_iter_count(iter) || copied != i) + /* + * Quit when either the requested number of bytes have been + * generated or there is a pending signal. + */ + if (!iov_iter_count(iter) || signal_pending(current)) break; - BUILD_BUG_ON(PAGE_SIZE % sizeof(tmp) != 0); - if (ret % PAGE_SIZE == 0) { - if (signal_pending(current)) - break; - cond_resched(); + /* Compute the next user destination address and length */ + if (user_no_reseed) { + ulen -= i; + if (likely(ulen)) { + uaddr += i; + } else { + /* + * This path is only reachable by ITER_IOVEC + * because ulen is initialized to the request + * size for ITER_UBUF, and therefore ITER_UBUF + * will always quit at the iov_iter_count() + * check above before ulen can become zero. + * + * iter->iov_offset is guaranteed to be zero + * here, so iter_iov_{addr|len}() isn't needed. 
+ */ + uaddr = iter_iov(iter)->iov_base; + ulen = iter_iov(iter)->iov_len; + } + + unpin_user_page(upage); + } + + /* + * Reschedule right now if needed and we're not atomic. If we're + * atomic, then we must first drop the lock to reschedule. + */ + if (need_resched()) { + if (atomic) + resched_without_lock = true; + else + cond_resched(); + } + + /* + * Optimistically try to pin the next user page without + * faulting, so we don't need to clear cri->page and drop the + * lock on every iteration. If this fails, we fall back to + * pinning with the option to prefault. + */ + if (user_no_reseed && !resched_without_lock && + pin_user_pages_fast_only((unsigned long)uaddr, 1, + FOLL_WRITE, &upage) == 1) + continue; + + /* + * Restart if either rescheduling is needed (and requires + * dropping the lock since we're atomic) or the optimistic page + * pinning attempt failed. + * + * This always implies `reseed == false`, so unlock_local_rng() + * can just be passed `false` for reseed to eliminate a branch. + */ + if (resched_without_lock || user_no_reseed) { + /* + * Clear the buffer of our latest random bytes before + * unlocking and potentially migrating CPUs, in which + * case we wouldn't have the same `cri` anymore. + */ + clear_rng_page(cri, page_dirty_len); + unlock_local_rng(cri, false); + page_dirty_len = 0; + if (resched_without_lock) + cond_resched(); + goto restart; } } - if (reseed) - mutex_unlock(&crypto_reseed_rng_lock); - else - crypto_put_default_rng(); - memzero_explicit(tmp, sizeof(tmp)); + if (page_dirty_len) + clear_rng_page(cri, page_dirty_len); + unlock_local_rng(cri, reseed); +unpin_upage: + if (user_no_reseed) + unpin_user_page(upage); +up_rwsem: + up_read_del_pcpu_rwsem(); return ret ? ret : -EFAULT; } @@ -326,16 +638,70 @@ static const struct random_extrng crypto_devrandom_rng = { .owner = THIS_MODULE, }; +static void free_pcpu_inst(struct cpu_rng_inst __percpu *pcri) +{ + int cpu; + + for_each_possible_cpu(cpu) { + struct cpu_rng_inst *cri = per_cpu_ptr(pcri, cpu); + + if (cri->rng) + crypto_free_rng(cri->rng); + + free_page((unsigned long)cri->page); + } +} + +static int __init alloc_pcpu_inst(struct cpu_rng_inst __percpu *pcri) +{ + int cpu; + + for_each_possible_cpu(cpu) { + struct cpu_rng_inst *cri = per_cpu_ptr(pcri, cpu); + + cri->page = (void *)__get_free_page(GFP_KERNEL); + if (!cri->page) + goto err_page_alloc; + + local_lock_init(&cri->lock); + } + + return 0; + +err_page_alloc: + while (cpu--) + free_page((unsigned long)per_cpu_ptr(pcri, cpu)->page); + return -ENOMEM; +} + static int __init crypto_rng_init(void) { - if (fips_enabled) - random_register_extrng(&crypto_devrandom_rng); + int ret; + + if (!fips_enabled) + return 0; + + ret = alloc_pcpu_inst(&pcpu_default_rng); + if (ret) + return ret; + + ret = alloc_pcpu_inst(&pcpu_reseed_rng); + if (ret) + goto free_pcpu_default; + + random_register_extrng(&crypto_devrandom_rng); return 0; + +free_pcpu_default: + free_pcpu_inst(&pcpu_default_rng); + return ret; } static void __exit crypto_rng_exit(void) { random_unregister_extrng(); + free_pcpu_inst(&pcpu_default_rng); + free_pcpu_inst(&pcpu_reseed_rng); } late_initcall(crypto_rng_init); diff --git a/include/linux/mm.h b/include/linux/mm.h index 196c481ec1603..b640f479624c1 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2544,6 +2544,8 @@ int get_user_pages_fast(unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages); int pin_user_pages_fast(unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages); +int 
pin_user_pages_fast_only(unsigned long start, int nr_pages, + unsigned int gup_flags, struct page **pages); void folio_add_pin(struct folio *folio); int account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc); diff --git a/lib/mpi/ec.c b/lib/mpi/ec.c index 40f5908e57a4f..e16dca1e23d52 100644 --- a/lib/mpi/ec.c +++ b/lib/mpi/ec.c @@ -584,6 +584,9 @@ void mpi_ec_init(struct mpi_ec_ctx *ctx, enum gcry_mpi_ec_models model, ctx->a = mpi_copy(a); ctx->b = mpi_copy(b); + ctx->d = NULL; + ctx->t.two_inv_p = NULL; + ctx->t.p_barrett = use_barrett > 0 ? mpi_barrett_init(ctx->p, 0) : NULL; mpi_ec_get_reset(ctx); diff --git a/mm/gup.c b/mm/gup.c index ad7345cfba91d..656f1ead20bb4 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -3353,6 +3353,34 @@ int pin_user_pages_fast(unsigned long start, int nr_pages, } EXPORT_SYMBOL_GPL(pin_user_pages_fast); +/** + * pin_user_pages_fast_only() - pin user pages in memory + * @start: starting user address + * @nr_pages: number of pages from start to pin + * @gup_flags: flags modifying pin behaviour + * @pages: array that receives pointers to the pages pinned. + * Should be at least nr_pages long. + * + * Like pin_user_pages_fast() except it's IRQ-safe in that it won't fall back to + * the regular GUP. + * + * If the architecture does not support this function, simply return with no + * pages pinned. + * + * Careful, careful! COW breaking can go either way, so a non-write + * access can get ambiguous page results. If you call this function without + * 'write' set, you'd better be sure that you're ok with that ambiguity. + */ +int pin_user_pages_fast_only(unsigned long start, int nr_pages, + unsigned int gup_flags, struct page **pages) +{ + if (!is_valid_gup_args(pages, NULL, &gup_flags, + FOLL_PIN | FOLL_FAST_ONLY)) + return -EINVAL; + return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages); +} +EXPORT_SYMBOL_GPL(pin_user_pages_fast_only); + /** * pin_user_pages_remote() - pin pages of a remote process *
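A hedged usage sketch for the new helper, with a hypothetical caller: try the fast-only pin first and fall back to the faulting variant, releasing the pin with unpin_user_page() when done.

#include <linux/mm.h>

static int pin_one_page_example(unsigned long uaddr, struct page **page)
{
	/* IRQ-safe and never falls back to slow GUP; it may pin nothing */
	if (pin_user_pages_fast_only(uaddr, 1, FOLL_WRITE, page) == 1)
		return 0;

	/* Slower path: may fault the page in and sleep */
	if (pin_user_pages_fast(uaddr, 1, FOLL_WRITE, page) == 1)
		return 0;

	return -EFAULT;
}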