Skip to content

Commit fc4cc8d

Browse files
committed
stream rustdoc html content from S3, use streaming rewriter, stream to client
1 parent 7e51511 commit fc4cc8d

File tree

4 files changed

+232
-198
lines changed

4 files changed

+232
-198
lines changed

src/storage/mod.rs

Lines changed: 0 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -253,40 +253,6 @@ impl AsyncStorage {
253253
}
254254
}
255255

256-
/// Fetch a rustdoc file from our blob storage.
257-
/// * `name` - the crate name
258-
/// * `version` - the crate version
259-
/// * `latest_build_id` - the id of the most recent build. used purely to invalidate the local archive
260-
/// index cache, when `archive_storage` is `true`. Without it we wouldn't know that we have
261-
/// to invalidate the locally cached file after a rebuild.
262-
/// * `path` - the wanted path inside the documentation.
263-
/// * `archive_storage` - if `true`, we will assume we have a remote ZIP archive and an index
264-
/// where we can fetch the requested path from inside the ZIP file.
265-
#[instrument]
266-
pub(crate) async fn fetch_rustdoc_file(
267-
&self,
268-
name: &str,
269-
version: &str,
270-
latest_build_id: Option<BuildId>,
271-
path: &str,
272-
archive_storage: bool,
273-
) -> Result<Blob> {
274-
trace!("fetch rustdoc file");
275-
Ok(if archive_storage {
276-
self.get_from_archive(
277-
&rustdoc_archive_path(name, version),
278-
latest_build_id,
279-
path,
280-
self.max_file_size_for(path),
281-
)
282-
.await?
283-
} else {
284-
// Add rustdoc prefix, name and version to the path for accessing the file stored in the database
285-
let remote_path = format!("rustdoc/{name}/{version}/{path}");
286-
self.get(&remote_path, self.max_file_size_for(path)).await?
287-
})
288-
}
289-
290256
/// Fetch a rustdoc file from our blob storage.
291257
/// * `name` - the crate name
292258
/// * `version` - the crate version
@@ -840,23 +806,6 @@ impl Storage {
840806
.block_on(self.inner.set_public_access(path, public))
841807
}
842808

843-
pub(crate) fn fetch_rustdoc_file(
844-
&self,
845-
name: &str,
846-
version: &str,
847-
latest_build_id: Option<BuildId>,
848-
path: &str,
849-
archive_storage: bool,
850-
) -> Result<Blob> {
851-
self.runtime.block_on(self.inner.fetch_rustdoc_file(
852-
name,
853-
version,
854-
latest_build_id,
855-
path,
856-
archive_storage,
857-
))
858-
}
859-
860809
pub(crate) fn fetch_source_file(
861810
&self,
862811
name: &str,

src/utils/html.rs

Lines changed: 195 additions & 91 deletions
Original file line numberDiff line numberDiff line change
@@ -1,109 +1,213 @@
1-
use crate::web::{
2-
page::templates::{Body, Head, Vendored},
3-
rustdoc::RustdocPage,
1+
use crate::{
2+
InstanceMetrics,
3+
web::{
4+
page::{
5+
TemplateData,
6+
templates::{Body, Head, Vendored},
7+
},
8+
rustdoc::RustdocPage,
9+
},
410
};
511
use askama::Template;
12+
use async_stream::stream;
13+
use axum::body::Bytes;
14+
use futures_util::{Stream, StreamExt as _};
615
use lol_html::{element, errors::RewritingError};
16+
use std::sync::Arc;
17+
use tokio::{io::AsyncRead, task::JoinHandle};
18+
use tokio_util::io::ReaderStream;
19+
use tracing::error;
20+
21+
#[derive(thiserror::Error, Debug)]
22+
pub(crate) enum RustdocRewritingError {
23+
#[error("HTML rewriter error: {0}")]
24+
RewritingError(#[from] lol_html::errors::RewritingError),
25+
#[error("generic error while rewriting rustdoc HTML: {0}")]
26+
Other(#[from] anyhow::Error),
27+
}
728

829
/// Rewrite a rustdoc page to have the docs.rs topbar
930
///
1031
/// Given a rustdoc HTML page and a context to serialize it with,
1132
/// render the `rustdoc/` templates with the `html`.
1233
/// The output is an HTML page which has not yet been UTF-8 validated.
1334
/// In practice, the output should always be valid UTF-8.
14-
pub(crate) fn rewrite_lol(
15-
html: &[u8],
35+
pub(crate) fn rewrite_rustdoc_html_stream<R>(
36+
template_data: Arc<TemplateData>,
37+
mut reader: R,
1638
max_allowed_memory_usage: usize,
17-
data: &RustdocPage,
18-
) -> Result<Vec<u8>, RewritingError> {
19-
use lol_html::html_content::{ContentType, Element};
20-
use lol_html::{HtmlRewriter, MemorySettings, Settings};
21-
22-
let head_html = Head::new(data).render().unwrap();
23-
let vendored_html = Vendored.render().unwrap();
24-
let body_html = Body.render().unwrap();
25-
let topbar_html = data.render().unwrap();
26-
27-
// Before: <body> ... rustdoc content ... </body>
28-
// After:
29-
// ```html
30-
// <div id="rustdoc_body_wrapper" class="{{ rustdoc_body_class }}" tabindex="-1">
31-
// ... rustdoc content ...
32-
// </div>
33-
// ```
34-
let body_handler = |rustdoc_body_class: &mut Element| {
35-
// Add the `rustdoc` classes to the html body
36-
let mut tmp;
37-
let klass = if let Some(classes) = rustdoc_body_class.get_attribute("class") {
38-
tmp = classes;
39-
tmp.push_str(" container-rustdoc");
40-
&tmp
41-
} else {
42-
"container-rustdoc"
43-
};
44-
rustdoc_body_class.set_attribute("class", klass)?;
45-
rustdoc_body_class.set_attribute("id", "rustdoc_body_wrapper")?;
46-
rustdoc_body_class.set_attribute("tabindex", "-1")?;
47-
// Change the `body` to a `div`
48-
rustdoc_body_class.set_tag_name("div")?;
49-
// Prepend the askama content
50-
rustdoc_body_class.prepend(&body_html, ContentType::Html);
51-
// Wrap the transformed body and topbar into a <body> element
52-
rustdoc_body_class.before(r#"<body class="rustdoc-page">"#, ContentType::Html);
53-
// Insert the topbar outside of the rustdoc div
54-
rustdoc_body_class.before(&topbar_html, ContentType::Html);
55-
// Finalize body with </body>
56-
rustdoc_body_class.after("</body>", ContentType::Html);
57-
58-
Ok(())
59-
};
60-
61-
let settings = Settings {
62-
element_content_handlers: vec![
63-
// Append `style.css` stylesheet after all head elements.
64-
element!("head", |head: &mut Element| {
65-
head.append(&head_html, ContentType::Html);
66-
Ok(())
67-
}),
68-
element!("body", body_handler),
69-
// Append `vendored.css` before `rustdoc.css`, so that the duplicate copy of
70-
// `normalize.css` will be overridden by the later version.
71-
//
72-
// Later rustdoc has `#mainThemeStyle` that could be used, but pre-2018 docs
73-
// don't have this:
74-
//
75-
// https://github.com/rust-lang/rust/commit/003b2bc1c65251ec2fc80b78ed91c43fb35402ec
76-
//
77-
// Pre-2018 rustdoc also didn't have the resource suffix, but docs.rs was using a fork
78-
// that had implemented it already then, so we can assume the css files are
79-
// `<some path>/rustdoc-<some suffix>.css` and use the `-` to distinguish from the
80-
// `rustdoc.static` path.
81-
element!(
82-
"link[rel='stylesheet'][href*='rustdoc-']",
83-
|rustdoc_css: &mut Element| {
84-
rustdoc_css.before(&vendored_html, ContentType::Html);
39+
data: Arc<RustdocPage>,
40+
metrics: Arc<InstanceMetrics>,
41+
) -> impl Stream<Item = Result<Bytes, RustdocRewritingError>>
42+
where
43+
R: AsyncRead + Unpin + 'static,
44+
{
45+
stream!({
46+
let (input_sender, input_receiver) = std::sync::mpsc::channel::<Option<Vec<u8>>>();
47+
let (result_sender, mut result_receiver) = tokio::sync::mpsc::unbounded_channel::<Bytes>();
48+
49+
let join_handle: JoinHandle<anyhow::Result<_>> = tokio::spawn(async move {
50+
// we're using the rendering threadpool to limit CPU usage on the server, and to
51+
// offload potentially CPU intensive stuff from the tokio runtime.
52+
// Also this lets us limit the threadpool size and through that the CPU usage.
53+
template_data
54+
.render_in_threadpool(move || {
55+
use lol_html::html_content::{ContentType, Element};
56+
use lol_html::{HtmlRewriter, MemorySettings, Settings};
57+
58+
let head_html = Head::new(&data).render().unwrap();
59+
let vendored_html = Vendored.render().unwrap();
60+
let body_html = Body.render().unwrap();
61+
let topbar_html = data.render().unwrap();
62+
63+
// Before: <body> ... rustdoc content ... </body>
64+
// After:
65+
// ```html
66+
// <div id="rustdoc_body_wrapper" class="{{ rustdoc_body_class }}" tabindex="-1">
67+
// ... rustdoc content ...
68+
// </div>
69+
// ```
70+
let body_handler = |rustdoc_body_class: &mut Element| {
71+
// Add the `rustdoc` classes to the html body
72+
let mut tmp;
73+
let klass = if let Some(classes) = rustdoc_body_class.get_attribute("class")
74+
{
75+
tmp = classes;
76+
tmp.push_str(" container-rustdoc");
77+
&tmp
78+
} else {
79+
"container-rustdoc"
80+
};
81+
rustdoc_body_class.set_attribute("class", klass)?;
82+
rustdoc_body_class.set_attribute("id", "rustdoc_body_wrapper")?;
83+
rustdoc_body_class.set_attribute("tabindex", "-1")?;
84+
// Change the `body` to a `div`
85+
rustdoc_body_class.set_tag_name("div")?;
86+
// Prepend the askama content
87+
rustdoc_body_class.prepend(&body_html, ContentType::Html);
88+
// Wrap the transformed body and topbar into a <body> element
89+
rustdoc_body_class
90+
.before(r#"<body class="rustdoc-page">"#, ContentType::Html);
91+
// Insert the topbar outside of the rustdoc div
92+
rustdoc_body_class.before(&topbar_html, ContentType::Html);
93+
// Finalize body with </body>
94+
rustdoc_body_class.after("</body>", ContentType::Html);
95+
96+
Ok(())
97+
};
98+
99+
let settings = Settings {
100+
element_content_handlers: vec![
101+
// Append `style.css` stylesheet after all head elements.
102+
element!("head", |head: &mut Element| {
103+
head.append(&head_html, ContentType::Html);
104+
Ok(())
105+
}),
106+
element!("body", body_handler),
107+
// Append `vendored.css` before `rustdoc.css`, so that the duplicate copy of
108+
// `normalize.css` will be overridden by the later version.
109+
//
110+
// Later rustdoc has `#mainThemeStyle` that could be used, but pre-2018 docs
111+
// don't have this:
112+
//
113+
// https://github.com/rust-lang/rust/commit/003b2bc1c65251ec2fc80b78ed91c43fb35402ec
114+
//
115+
// Pre-2018 rustdoc also didn't have the resource suffix, but docs.rs was using a fork
116+
// that had implemented it already then, so we can assume the css files are
117+
// `<some path>/rustdoc-<some suffix>.css` and use the `-` to distinguish from the
118+
// `rustdoc.static` path.
119+
element!(
120+
"link[rel='stylesheet'][href*='rustdoc-']",
121+
move |rustdoc_css: &mut Element| {
122+
rustdoc_css.before(&vendored_html, ContentType::Html);
123+
Ok(())
124+
}
125+
),
126+
],
127+
memory_settings: MemorySettings {
128+
max_allowed_memory_usage,
129+
..MemorySettings::default()
130+
},
131+
..Settings::default()
132+
};
133+
134+
let mut rewriter = HtmlRewriter::new(settings, move |chunk: &[u8]| {
135+
// send the result back to the main rewriter when it's coming in.
136+
// this can fail only when the receiver is dropped, in which case
137+
// we exit this thread anyways.
138+
let _ = result_sender.send(Bytes::from(chunk.to_vec()));
139+
});
140+
while let Some(chunk) = input_receiver.recv()? {
141+
// receive data from the input receiver.
142+
// `input_receiver` is a non-async one.
143+
// Since we're in a normal background thread, we can use the blocking `.recv`
144+
// here.
145+
// We will get `None` when the reader is done reading,
146+
// so that's our signal to exit this loop and call `rewriter.end()` below.
147+
rewriter.write(&chunk)?;
148+
}
149+
// finalize everything. Will trigger the output sink (and through that,
150+
// sending data to the `result_sender`).
151+
rewriter.end()?;
85152
Ok(())
86-
}
87-
),
88-
],
89-
memory_settings: MemorySettings {
90-
max_allowed_memory_usage,
91-
..MemorySettings::default()
92-
},
93-
..Settings::default()
94-
};
153+
})
154+
.await?;
155+
Ok(())
156+
});
157+
158+
let mut reader_stream = ReaderStream::new(&mut reader);
159+
while let Some(chunk) = reader_stream.next().await {
160+
let chunk = chunk.map_err(|err| {
161+
error!(?err, "error while reading from rustdoc HTML reader");
162+
RustdocRewritingError::Other(err.into())
163+
})?;
95164

96-
// The input and output are always strings, we just use `&[u8]` so we only have to validate once.
97-
let mut buffer = Vec::new();
98-
// TODO: Make the rewriter persistent?
99-
let mut writer = HtmlRewriter::new(settings, |bytes: &[u8]| {
100-
buffer.extend_from_slice(bytes);
101-
});
165+
if let Err(err) = input_sender.send(Some(chunk.to_vec())) {
166+
error!(
167+
?err,
168+
"error when trying to send chunk to html rewriter thread"
169+
);
170+
yield Err(RustdocRewritingError::Other(err.into()));
171+
break;
172+
}
102173

103-
writer.write(html)?;
104-
writer.end()?;
174+
while let Ok(bytes) = result_receiver.try_recv() {
175+
yield Ok(bytes);
176+
}
177+
}
178+
// This signals the renderer thread to finalize & exit.
179+
if let Err(err) = input_sender.send(None) {
180+
error!(
181+
?err,
182+
"error when trying to send end signal to html rewriter thread"
183+
);
184+
yield Err(RustdocRewritingError::Other(err.into()));
185+
}
186+
while let Some(bytes) = result_receiver.recv().await {
187+
yield Ok(bytes);
188+
}
105189

106-
Ok(buffer)
190+
join_handle.await.expect("Task panicked").map_err(|e| {
191+
error!(
192+
?e,
193+
memory_limit = max_allowed_memory_usage,
194+
"error while rewriting rustdoc HTML"
195+
);
196+
// our `render_in_threadpool`, and therefore the async tokio task, returns an `anyhow::Result`.
197+
// In most cases this will be an error from the `HtmlRewriter`, which we'll get as a
198+
// `RewritingError` which we extract here again. The other cases remain an
199+
// `anyhow::Error`.
200+
match e.downcast::<RewritingError>() {
201+
Ok(e) => {
202+
if matches!(e, RewritingError::MemoryLimitExceeded(_)) {
203+
metrics.html_rewrite_ooms.inc();
204+
}
205+
RustdocRewritingError::RewritingError(e)
206+
}
207+
Err(e) => RustdocRewritingError::Other(e),
208+
}
209+
})?;
210+
})
107211
}
108212

109213
#[cfg(test)]

src/utils/mod.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
pub(crate) use self::cargo_metadata::{CargoMetadata, Package as MetadataPackage};
44
pub(crate) use self::copy::copy_dir_all;
55
pub use self::daemon::{start_daemon, watch_registry};
6-
pub(crate) use self::html::rewrite_lol;
6+
pub(crate) use self::html::rewrite_rustdoc_html_stream;
77
pub use self::queue::{
88
get_crate_pattern_and_priority, get_crate_priority, list_crate_priorities,
99
remove_crate_priority, set_crate_priority,

0 commit comments

Comments
 (0)