Skip to content

Commit f313af3

Browse files
authored
feat: upload multiple asset chunks in the same call (#3947)
1 parent 115a077 commit f313af3

File tree

13 files changed

+291
-33
lines changed

13 files changed

+291
-33
lines changed

CHANGELOG.md

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,20 @@ Valid settings are:
1010
- `--log-viewer` flag with `dfx canister create`
1111
- `canisters[].initialization_values.log_visibility.allowed_viewers` in `dfx.json`
1212

13+
### feat: batch upload assets
14+
15+
The frontend canister sync now tries to batch multiple small content chunks into a single call using the `create_chunks` method added earlier.
16+
This should lead to significantly faster upload times for frontends with many small files.
17+
18+
## Dependencies
19+
20+
### Frontend canister
21+
22+
Bumped `api_version` to `2` to reflect the previously added `create_chunks` method, because the improved file sync relies on it.
23+
24+
- Module hash: 9e4485d4358dd910aebcc025843547d05604cf28c6dc7c2cc2f8c76d083112e8
25+
- https://github.com/dfinity/sdk/pull/3947
26+
1327
# 0.24.1
1428

1529
### feat: More PocketIC flags supported

src/canisters/frontend/ic-asset/src/batch_upload/operations.rs

Lines changed: 20 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,12 @@ use crate::canister_api::types::batch_upload::v1::{BatchOperationKind, CommitBat
1010
use candid::Nat;
1111
use std::collections::HashMap;
1212

13+
use super::plumbing::ChunkUploader;
14+
1315
pub(crate) const BATCH_UPLOAD_API_VERSION: u16 = 1;
1416

15-
pub(crate) fn assemble_batch_operations(
17+
pub(crate) async fn assemble_batch_operations(
18+
chunk_uploader: Option<&ChunkUploader<'_>>,
1619
project_assets: &HashMap<String, ProjectAsset>,
1720
canister_assets: HashMap<String, AssetDetails>,
1821
asset_deletion_reason: AssetDeletionReason,
@@ -30,25 +33,28 @@ pub(crate) fn assemble_batch_operations(
3033
);
3134
create_new_assets(&mut operations, project_assets, &canister_assets);
3235
unset_obsolete_encodings(&mut operations, project_assets, &canister_assets);
33-
set_encodings(&mut operations, project_assets);
36+
set_encodings(&mut operations, chunk_uploader, project_assets).await;
3437
update_properties(&mut operations, project_assets, &canister_asset_properties);
3538

3639
operations
3740
}
3841

39-
pub(crate) fn assemble_commit_batch_arguments(
42+
pub(crate) async fn assemble_commit_batch_arguments(
43+
chunk_uploader: &ChunkUploader<'_>,
4044
project_assets: HashMap<String, ProjectAsset>,
4145
canister_assets: HashMap<String, AssetDetails>,
4246
asset_deletion_reason: AssetDeletionReason,
4347
canister_asset_properties: HashMap<String, AssetProperties>,
4448
batch_id: Nat,
4549
) -> CommitBatchArguments {
4650
let operations = assemble_batch_operations(
51+
Some(chunk_uploader),
4752
&project_assets,
4853
canister_assets,
4954
asset_deletion_reason,
5055
canister_asset_properties,
51-
);
56+
)
57+
.await;
5258
CommitBatchArguments {
5359
operations,
5460
batch_id,
@@ -153,21 +159,28 @@ pub(crate) fn unset_obsolete_encodings(
153159
}
154160
}
155161

156-
pub(crate) fn set_encodings(
162+
pub(crate) async fn set_encodings(
157163
operations: &mut Vec<BatchOperationKind>,
164+
chunk_uploader: Option<&ChunkUploader<'_>>,
158165
project_assets: &HashMap<String, ProjectAsset>,
159166
) {
160167
for (key, project_asset) in project_assets {
161168
for (content_encoding, v) in &project_asset.encodings {
162169
if v.already_in_place {
163170
continue;
164171
}
165-
172+
let chunk_ids = if let Some(uploader) = chunk_uploader {
173+
uploader
174+
.uploader_ids_to_canister_chunk_ids(&v.uploader_chunk_ids)
175+
.await
176+
} else {
177+
vec![]
178+
};
166179
operations.push(BatchOperationKind::SetAssetContent(
167180
SetAssetContentArguments {
168181
key: key.clone(),
169182
content_encoding: content_encoding.clone(),
170-
chunk_ids: v.chunk_ids.clone(),
183+
chunk_ids,
171184
sha256: Some(v.sha256.clone()),
172185
},
173186
));

src/canisters/frontend/ic-asset/src/batch_upload/plumbing.rs

Lines changed: 130 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ use crate::asset::content::Content;
33
use crate::asset::content_encoder::ContentEncoder;
44
use crate::batch_upload::semaphores::Semaphores;
55
use crate::canister_api::methods::chunk::create_chunk;
6+
use crate::canister_api::methods::chunk::create_chunks;
67
use crate::canister_api::types::asset::AssetDetails;
78
use crate::error::CreateChunkError;
89
use crate::error::CreateEncodingError;
@@ -14,10 +15,12 @@ use futures::TryFutureExt;
1415
use ic_utils::Canister;
1516
use mime::Mime;
1617
use slog::{debug, info, Logger};
18+
use std::collections::BTreeMap;
1719
use std::collections::HashMap;
1820
use std::path::PathBuf;
1921
use std::sync::atomic::{AtomicUsize, Ordering};
2022
use std::sync::Arc;
23+
use tokio::sync::Mutex;
2124

2225
const CONTENT_ENCODING_IDENTITY: &str = "identity";
2326

@@ -35,7 +38,7 @@ pub(crate) struct AssetDescriptor {
3538
}
3639

3740
pub(crate) struct ProjectAssetEncoding {
38-
pub(crate) chunk_ids: Vec<Nat>,
41+
pub(crate) uploader_chunk_ids: Vec<usize>,
3942
pub(crate) sha256: Vec<u8>,
4043
pub(crate) already_in_place: bool,
4144
}
@@ -46,30 +49,68 @@ pub(crate) struct ProjectAsset {
4649
pub(crate) encodings: HashMap<String, ProjectAssetEncoding>,
4750
}
4851

52+
type IdMapping = BTreeMap<usize, Nat>;
53+
type UploadQueue = Vec<(usize, Vec<u8>)>;
4954
pub(crate) struct ChunkUploader<'agent> {
5055
canister: Canister<'agent>,
5156
batch_id: Nat,
57+
api_version: u16,
5258
chunks: Arc<AtomicUsize>,
5359
bytes: Arc<AtomicUsize>,
60+
// maps uploader_chunk_id to canister_chunk_id
61+
id_mapping: Arc<Mutex<IdMapping>>,
62+
upload_queue: Arc<Mutex<UploadQueue>>,
5463
}
64+
5565
impl<'agent> ChunkUploader<'agent> {
56-
pub(crate) fn new(canister: Canister<'agent>, batch_id: Nat) -> Self {
66+
pub(crate) fn new(canister: Canister<'agent>, api_version: u16, batch_id: Nat) -> Self {
5767
Self {
5868
canister,
5969
batch_id,
70+
api_version,
6071
chunks: Arc::new(AtomicUsize::new(0)),
6172
bytes: Arc::new(AtomicUsize::new(0)),
73+
id_mapping: Arc::new(Mutex::new(BTreeMap::new())),
74+
upload_queue: Arc::new(Mutex::new(vec![])),
6275
}
6376
}
6477

78+
/// Returns an uploader_chunk_id, which is different from the chunk id on the asset canister.
79+
/// uploader_chunk_id can be mapped to canister_chunk_id using `uploader_ids_to_canister_chunk_ids`
80+
/// once `finalize_upload` has completed.
6581
pub(crate) async fn create_chunk(
6682
&self,
6783
contents: &[u8],
6884
semaphores: &Semaphores,
69-
) -> Result<Nat, CreateChunkError> {
70-
self.chunks.fetch_add(1, Ordering::SeqCst);
85+
) -> Result<usize, CreateChunkError> {
86+
let uploader_chunk_id = self.chunks.fetch_add(1, Ordering::SeqCst);
7187
self.bytes.fetch_add(contents.len(), Ordering::SeqCst);
72-
create_chunk(&self.canister, &self.batch_id, contents, semaphores).await
88+
if contents.len() == MAX_CHUNK_SIZE || self.api_version < 2 {
89+
let canister_chunk_id =
90+
create_chunk(&self.canister, &self.batch_id, contents, semaphores).await?;
91+
let mut map = self.id_mapping.lock().await;
92+
map.insert(uploader_chunk_id, canister_chunk_id);
93+
Ok(uploader_chunk_id)
94+
} else {
95+
self.add_to_upload_queue(uploader_chunk_id, contents).await;
96+
// Larger `max_retained_bytes` leads to batches that are filled closer to the max size.
97+
// `4 * MAX_CHUNK_SIZE` leads to a pretty small memory footprint but still offers solid fill rates.
98+
// Mini experiment:
99+
// - Tested with: `for i in $(seq 1 50); do dd if=/dev/urandom of="src/hello_frontend/assets/file_$i.bin" bs=$(shuf -i 1-2000000 -n 1) count=1; done && dfx deploy hello_frontend`
100+
// - Result: Roughly 15% of batches under 90% full.
101+
// With other byte ranges (e.g. `shuf -i 1-3000000 -n 1`) stats improve significantly
102+
self.upload_chunks(4 * MAX_CHUNK_SIZE, usize::MAX, semaphores)
103+
.await?;
104+
Ok(uploader_chunk_id)
105+
}
106+
}
107+
108+
pub(crate) async fn finalize_upload(
109+
&self,
110+
semaphores: &Semaphores,
111+
) -> Result<(), CreateChunkError> {
112+
self.upload_chunks(0, 0, semaphores).await?;
113+
Ok(())
73114
}
74115

75116
pub(crate) fn bytes(&self) -> usize {
@@ -78,6 +119,80 @@ impl<'agent> ChunkUploader<'agent> {
78119
pub(crate) fn chunks(&self) -> usize {
79120
self.chunks.load(Ordering::SeqCst)
80121
}
122+
123+
/// Call only after `finalize_upload` has completed
124+
pub(crate) async fn uploader_ids_to_canister_chunk_ids(
125+
&self,
126+
uploader_ids: &[usize],
127+
) -> Vec<Nat> {
128+
let mapping = self.id_mapping.lock().await;
129+
uploader_ids
130+
.iter()
131+
.map(|id| {
132+
mapping
133+
.get(id)
134+
.expect("Chunk uploader did not upload all chunks. This is a bug.")
135+
.clone()
136+
})
137+
.collect()
138+
}
139+
140+
async fn add_to_upload_queue(&self, uploader_chunk_id: usize, contents: &[u8]) {
141+
let mut queue = self.upload_queue.lock().await;
142+
queue.push((uploader_chunk_id, contents.into()));
143+
}
144+
145+
/// Calls `upload_chunks` with batches of chunks from `self.upload_queue` until at most `max_retained_bytes`
146+
/// bytes and at most `max_retained_chunks` chunks remain in the upload queue. Larger values
147+
/// will lead to better batch fill rates but also leave a larger memory footprint.
148+
async fn upload_chunks(
149+
&self,
150+
max_retained_bytes: usize,
151+
max_retained_chunks: usize,
152+
semaphores: &Semaphores,
153+
) -> Result<(), CreateChunkError> {
154+
let mut queue = self.upload_queue.lock().await;
155+
156+
let mut batches = vec![];
157+
while queue
158+
.iter()
159+
.map(|(_, content)| content.len())
160+
.sum::<usize>()
161+
> max_retained_bytes
162+
|| queue.len() > max_retained_chunks
163+
{
164+
// Greedily fills batch with the largest chunk that fits
165+
queue.sort_unstable_by_key(|(_, content)| content.len());
166+
let mut batch = vec![];
167+
let mut batch_size = 0;
168+
for (uploader_chunk_id, content) in std::mem::take(&mut *queue).into_iter().rev() {
169+
if content.len() <= MAX_CHUNK_SIZE - batch_size {
170+
batch_size += content.len();
171+
batch.push((uploader_chunk_id, content));
172+
} else {
173+
queue.push((uploader_chunk_id, content));
174+
}
175+
}
176+
batches.push(batch);
177+
}
178+
179+
try_join_all(batches.into_iter().map(|chunks| async move {
180+
let (uploader_chunk_ids, chunks): (Vec<_>, Vec<_>) = chunks.into_iter().unzip();
181+
let canister_chunk_ids =
182+
create_chunks(&self.canister, &self.batch_id, chunks, semaphores).await?;
183+
let mut map = self.id_mapping.lock().await;
184+
for (uploader_id, canister_id) in uploader_chunk_ids
185+
.into_iter()
186+
.zip(canister_chunk_ids.into_iter())
187+
{
188+
map.insert(uploader_id, canister_id);
189+
}
190+
Ok(())
191+
}))
192+
.await?;
193+
194+
Ok(())
195+
}
81196
}
82197

83198
#[allow(clippy::too_many_arguments)]
@@ -110,7 +225,7 @@ async fn make_project_asset_encoding(
110225
false
111226
};
112227

113-
let chunk_ids = if already_in_place {
228+
let uploader_chunk_ids = if already_in_place {
114229
info!(
115230
logger,
116231
" {}{} ({} bytes) sha {} is already installed",
@@ -144,7 +259,7 @@ async fn make_project_asset_encoding(
144259
};
145260

146261
Ok(ProjectAssetEncoding {
147-
chunk_ids,
262+
uploader_chunk_ids,
148263
sha256,
149264
already_in_place,
150265
})
@@ -305,6 +420,13 @@ pub(crate) async fn make_project_assets(
305420
})
306421
.collect();
307422
let project_assets = try_join_all(project_asset_futures).await?;
423+
if let Some(uploader) = chunk_upload_target {
424+
uploader.finalize_upload(&semaphores).await.map_err(|err| {
425+
CreateProjectAssetError::CreateEncodingError(CreateEncodingError::CreateChunkFailed(
426+
err,
427+
))
428+
})?;
429+
}
308430

309431
let mut hm = HashMap::new();
310432
for project_asset in project_assets {
@@ -321,7 +443,7 @@ async fn upload_content_chunks(
321443
content_encoding: &str,
322444
semaphores: &Semaphores,
323445
logger: &Logger,
324-
) -> Result<Vec<Nat>, CreateChunkError> {
446+
) -> Result<Vec<usize>, CreateChunkError> {
325447
if content.data.is_empty() {
326448
let empty = vec![];
327449
let chunk_id = chunk_uploader.create_chunk(&empty, semaphores).await?;

src/canisters/frontend/ic-asset/src/batch_upload/semaphores.rs

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -6,30 +6,31 @@ const MAX_SIMULTANEOUS_LOADED_MB: usize = 50;
66
// How many simultaneous chunks being created at once
77
const MAX_SIMULTANEOUS_CREATE_CHUNK: usize = 50;
88

9-
// How many simultaneous Agent.call() to create_chunk
9+
// How many simultaneous Agent.call() to create_chunk(s)
1010
const MAX_SIMULTANEOUS_CREATE_CHUNK_CALLS: usize = 25;
1111

12-
// How many simultaneous Agent.wait() on create_chunk result
12+
// How many simultaneous Agent.wait() on create_chunk(s) result
1313
const MAX_SIMULTANEOUS_CREATE_CHUNK_WAITS: usize = 25;
1414

15+
#[derive(Debug)]
1516
pub(crate) struct Semaphores {
1617
// The "file" semaphore limits how much file data to load at once. A given loaded file's data
1718
// may be simultaneously encoded (gzip and so forth).
1819
pub file: SharedSemaphore,
1920

20-
// The create_chunk semaphore limits the number of chunks that can be in the process
21-
// of being created at one time. Since each chunk creation can involve retries,
22-
// this focuses those retries on a smaller number of chunks.
21+
// The create_chunk semaphore limits the number of chunk creation calls
22+
// that can be in progress at one time. Since each chunk creation can involve retries,
23+
// this focuses those retries on a smaller number of calls.
2324
// Without this semaphore, every chunk would make its first attempt, before
2425
// any chunk made its second attempt.
2526
pub create_chunk: SharedSemaphore,
2627

2728
// The create_chunk_call semaphore limits the number of simultaneous
28-
// agent.call()s to create_chunk.
29+
// agent.call()s to create_chunk(s).
2930
pub create_chunk_call: SharedSemaphore,
3031

3132
// The create_chunk_wait semaphore limits the number of simultaneous
32-
// agent.wait() calls for outstanding create_chunk requests.
33+
// agent.wait() calls for outstanding create_chunk(s) requests.
3334
pub create_chunk_wait: SharedSemaphore,
3435
}
3536

0 commit comments

Comments
 (0)