Skip to content

Commit f3f3f92

Browse files
authored
feat: make wikilink extraction and checking opt-in (#1803)
1 parent 62f2193 commit f3f3f92

File tree

8 files changed

+91
-20
lines changed

8 files changed

+91
-20
lines changed

README.md

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ Available as a command-line utility, a library and a [GitHub Action](https://git
1818

1919
<!-- START doctoc generated TOC please keep comment here to allow auto update -->
2020
<!-- DON'T EDIT THIS SECTION, INSTEAD RE-RUN doctoc TO UPDATE -->
21+
2122
## Table of Contents
2223

2324
- [Development](#development)
@@ -171,7 +172,7 @@ outdated information.
171172
| Language | Rust | Ruby | Go | JS | TypeScript | Python | JS | PHP |
172173
| Async/Parallel | ![yes] | ![yes] | ![yes] | ![yes] | ![yes] | ![yes] | ![yes] | ![yes] |
173174
| JSON output | ![yes] | ![no] | ![yes] | ![yes] | ![yes] | ![maybe]<sup>1</sup> | ![yes] | ![yes] |
174-
| Static binary | ![yes] | ![no] | ![yes] | ![no] | ![no] |![no] | ![no] | ![no] |
175+
| Static binary | ![yes] | ![no] | ![yes] | ![no] | ![no] |![no] | ![no] | ![no] |
175176
| Markdown files | ![yes] | ![yes] | ![no] | ![no] | ![no] | ![yes] | ![yes] | ![no] |
176177
| HTML files | ![yes] | ![no] | ![no] | ![yes] | ![yes] | ![no] | ![yes] | ![no] |
177178
| Text files | ![yes] | ![no] | ![no] | ![no] | ![no] | ![no] | ![no] | ![no] |
@@ -183,7 +184,7 @@ outdated information.
183184
| Relative URLs | ![yes] | ![yes] | ![no] | ![yes] | ![yes] | ![yes] | ![yes] | ![yes] |
184185
| Anchors/Fragments | ![yes] | ![no] | ![no] | ![no] | ![no] | ![yes] | ![yes] | ![no] |
185186
| Skip relative URLs | ![yes] | ![no] | ![no] | ![maybe] | ![no] | ![no] | ![no] | ![no] |
186-
| Include patterns | ![yes] | ![yes] | ![no] | ![yes] | ![no] | ![no] | ![no] | ![no] |
187+
| Include patterns | ![yes]| ![yes] | ![no] | ![yes] | ![no] | ![no] | ![no] | ![no] |
187188
| Exclude patterns | ![yes] | ![no] | ![yes] | ![yes] | ![yes] | ![yes] | ![yes] | ![yes] |
188189
| Handle redirects | ![yes] | ![yes] | ![yes] | ![yes] | ![yes] | ![yes] | ![yes] | ![yes] |
189190
| Ignore insecure SSL | ![yes] | ![yes] | ![yes] | ![no] | ![no] | ![yes] | ![no] | ![yes] |
@@ -587,6 +588,9 @@ Options:
587588
--cookie-jar <COOKIE_JAR>
588589
Tell lychee to read cookies from the given file. Cookies will be stored in the cookie jar and sent with requests. New cookies will be stored in the cookie jar and existing cookies will be updated
589590
591+
--include-wikilinks
592+
Check WikiLinks in Markdown files
593+
590594
-h, --help
591595
Print help (see a summary with '-h')
592596
@@ -710,6 +714,7 @@ which includes usage instructions.
710714
## Pre-commit Usage
711715

712716
Lychee can also be used as a [pre-commit](https://pre-commit.com/) hook.
717+
713718
```yaml
714719
# .pre-commit-config.yaml
715720
repos:
@@ -722,6 +727,7 @@ repos:
722727
```
723728
724729
Rather than running on staged files only, Lychee can be run against an entire repository.
730+
725731
```yaml
726732
- id: lychee
727733
args: ["--no-progress", "."]

fixtures/TEST_WIKI.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
Test file containing only a wikilink (no regular links)
2+
[[LycheeWikilink]]

lychee-bin/src/main.rs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -338,7 +338,9 @@ async fn run(opts: &LycheeOptions) -> Result<i32> {
338338
.headers(HeaderMap::from_header_pairs(&opts.config.header)?)
339339
.excluded_paths(PathExcludes::new(opts.config.exclude_path.clone())?)
340340
// File a bug if you rely on this envvar! It's going to go away eventually.
341-
.use_html5ever(std::env::var("LYCHEE_USE_HTML5EVER").is_ok_and(|x| x == "1"));
341+
.use_html5ever(std::env::var("LYCHEE_USE_HTML5EVER").is_ok_and(|x| x == "1"))
342+
.include_wikilinks(opts.config.include_wikilinks);
343+
342344
collector = if let Some(ref basic_auth) = opts.config.basic_auth {
343345
collector.basic_auth_extractor(BasicAuthExtractor::new(basic_auth)?)
344346
} else {

lychee-bin/src/options.rs

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -734,6 +734,12 @@ separated list of accepted status codes. This example will accept 200, 201,
734734
#[arg(long)]
735735
#[serde(default)]
736736
pub(crate) cookie_jar: Option<PathBuf>,
737+
738+
#[allow(clippy::doc_markdown)]
739+
/// Check WikiLinks in Markdown files
740+
#[arg(long)]
741+
#[serde(default)]
742+
pub(crate) include_wikilinks: bool,
737743
}
738744

739745
impl Config {
@@ -789,6 +795,7 @@ impl Config {
789795
include_fragments: false;
790796
include_mail: false;
791797
include_verbatim: false;
798+
include_wikilinks: false;
792799
include: Vec::<String>::new();
793800
insecure: false;
794801
max_cache_age: humantime::parse_duration(DEFAULT_MAX_CACHE_AGE).unwrap();

lychee-bin/tests/cli.rs

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2442,6 +2442,31 @@ mod cli {
24422442
.stdout(contains("https://www.example.com/smth."));
24432443
}
24442444

2445+
#[test]
2446+
fn test_wikilink_extract_when_specified() {
2447+
let test_path = fixtures_path().join("TEST_WIKI.md");
2448+
2449+
let mut cmd = main_command();
2450+
cmd.arg("--dump")
2451+
.arg("--include-wikilinks")
2452+
.arg(test_path)
2453+
.assert()
2454+
.success()
2455+
.stdout(contains("LycheeWikilink"));
2456+
}
2457+
2458+
#[test]
2459+
fn test_wikilink_dont_extract_when_not_specified() {
2460+
let test_path = fixtures_path().join("TEST_WIKI.md");
2461+
2462+
let mut cmd = main_command();
2463+
cmd.arg("--dump")
2464+
.arg(test_path)
2465+
.assert()
2466+
.success()
2467+
.stdout(is_empty());
2468+
}
2469+
24452470
#[test]
24462471
fn test_index_files_default() {
24472472
let input = fixtures_path().join("filechecker/dir_links.md");

lychee-lib/src/collector.rs

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ pub struct Collector {
2929
skip_ignored: bool,
3030
skip_hidden: bool,
3131
include_verbatim: bool,
32+
include_wikilinks: bool,
3233
use_html5ever: bool,
3334
root_dir: Option<PathBuf>,
3435
base: Option<Base>,
@@ -47,6 +48,7 @@ impl Default for Collector {
4748
basic_auth_extractor: None,
4849
skip_missing_inputs: false,
4950
include_verbatim: false,
51+
include_wikilinks: false,
5052
use_html5ever: false,
5153
skip_hidden: true,
5254
skip_ignored: true,
@@ -76,6 +78,7 @@ impl Collector {
7678
basic_auth_extractor: None,
7779
skip_missing_inputs: false,
7880
include_verbatim: false,
81+
include_wikilinks: false,
7982
use_html5ever: false,
8083
skip_hidden: true,
8184
skip_ignored: true,
@@ -138,6 +141,14 @@ impl Collector {
138141
self
139142
}
140143

144+
#[allow(clippy::doc_markdown)]
145+
/// Check WikiLinks in Markdown files
146+
#[must_use]
147+
pub const fn include_wikilinks(mut self, yes: bool) -> Self {
148+
self.include_wikilinks = yes;
149+
self
150+
}
151+
141152
/// Pass a [`BasicAuthExtractor`] which is capable of matching found
142153
/// URIs to basic auth credentials. These credentials get passed to the
143154
/// request in question.
@@ -268,7 +279,11 @@ impl Collector {
268279
let basic_auth_extractor = self.basic_auth_extractor.clone();
269280
async move {
270281
let content = content?;
271-
let extractor = Extractor::new(self.use_html5ever, self.include_verbatim);
282+
let extractor = Extractor::new(
283+
self.use_html5ever,
284+
self.include_verbatim,
285+
self.include_wikilinks,
286+
);
272287
let uris: Vec<RawUri> = extractor.extract(&content);
273288
let requests = request::create(
274289
uris,

lychee-lib/src/extract/markdown.rs

Lines changed: 19 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,11 @@ fn md_extensions() -> Options {
1414
}
1515

1616
/// Extract unparsed URL strings from a Markdown string.
17-
pub(crate) fn extract_markdown(input: &str, include_verbatim: bool) -> Vec<RawUri> {
17+
pub(crate) fn extract_markdown(
18+
input: &str,
19+
include_verbatim: bool,
20+
include_wikilinks: bool,
21+
) -> Vec<RawUri> {
1822
// In some cases it is undesirable to extract links from within code blocks,
1923
// which is why we keep track of entries and exits while traversing the input.
2024
let mut inside_code_block = false;
@@ -64,6 +68,10 @@ pub(crate) fn extract_markdown(input: &str, include_verbatim: bool) -> Vec<RawUr
6468
Some(extract_raw_uri_from_plaintext(&dest_url)),
6569
// Wiki URL (`[[http://example.com]]`)
6670
LinkType::WikiLink { has_pothole: _ } => {
71+
// Exclude WikiLinks if not explicitly enabled
72+
if !include_wikilinks {
73+
return None;
74+
}
6775
inside_link_block = true;
6876
// Ignore GitLab table-of-contents notation: https://docs.gitlab.com/user/markdown/#table-of-contents
6977
if ["_TOC_".to_string(), "TOC".to_string()].contains(&dest_url.to_string()) {
@@ -280,7 +288,7 @@ or inline like `https://bar.org` for instance.
280288
},
281289
];
282290

283-
let uris = extract_markdown(MD_INPUT, false);
291+
let uris = extract_markdown(MD_INPUT, false, false);
284292
assert_eq!(uris, expected);
285293
}
286294

@@ -309,7 +317,7 @@ or inline like `https://bar.org` for instance.
309317
},
310318
];
311319

312-
let uris = extract_markdown(MD_INPUT, true);
320+
let uris = extract_markdown(MD_INPUT, true, false);
313321
assert_eq!(uris, expected);
314322
}
315323

@@ -325,7 +333,7 @@ Some pre-formatted http://pre.com
325333

326334
let expected = vec![];
327335

328-
let uris = extract_markdown(input, false);
336+
let uris = extract_markdown(input, false, false);
329337
assert_eq!(uris, expected);
330338
}
331339

@@ -358,15 +366,15 @@ $$
358366
[\psi](\mathbf{L})
359367
$$
360368
";
361-
let uris = extract_markdown(input, true);
369+
let uris = extract_markdown(input, true, false);
362370
assert!(uris.is_empty());
363371
}
364372

365373
#[test]
366374
fn test_single_word_footnote_is_not_detected_as_link() {
367375
let markdown = "This footnote is[^actually] a link.\n\n[^actually]: not";
368376
let expected = vec![];
369-
let uris = extract_markdown(markdown, true);
377+
let uris = extract_markdown(markdown, true, false);
370378
assert_eq!(uris, expected);
371379
}
372380

@@ -378,7 +386,7 @@ $$
378386
element: None,
379387
attribute: None,
380388
}];
381-
let uris = extract_markdown(markdown, true);
389+
let uris = extract_markdown(markdown, true, false);
382390
assert_eq!(uris, expected);
383391
}
384392

@@ -390,7 +398,7 @@ $$
390398
element: None,
391399
attribute: None,
392400
}];
393-
let uris = extract_markdown(markdown, true);
401+
let uris = extract_markdown(markdown, true, false);
394402
assert_eq!(uris, expected);
395403
}
396404

@@ -402,7 +410,7 @@ $$
402410
element: Some("a".to_string()),
403411
attribute: Some("href".to_string()),
404412
}];
405-
let uris = extract_markdown(markdown, true);
413+
let uris = extract_markdown(markdown, true, true);
406414
assert_eq!(uris, expected);
407415
}
408416

@@ -421,14 +429,14 @@ $$
421429
attribute: Some("href".to_string()),
422430
},
423431
];
424-
let uris = extract_markdown(markdown, true);
432+
let uris = extract_markdown(markdown, true, true);
425433
assert_eq!(uris, expected);
426434
}
427435

428436
#[test]
429437
fn test_ignore_gitlab_toc() {
430438
let markdown = r"[[_TOC_]][TOC]";
431-
let uris = extract_markdown(markdown, true);
439+
let uris = extract_markdown(markdown, true, true);
432440
assert!(uris.is_empty());
433441
}
434442
}

lychee-lib/src/extract/mod.rs

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ use plaintext::extract_raw_uri_from_plaintext;
1414
pub struct Extractor {
1515
use_html5ever: bool,
1616
include_verbatim: bool,
17+
include_wikilinks: bool,
1718
}
1819

1920
impl Extractor {
@@ -30,10 +31,11 @@ impl Extractor {
3031
/// For more information, consult the `pulldown_cmark` documentation about code blocks
3132
/// [here](https://docs.rs/pulldown-cmark/latest/pulldown_cmark/enum.CodeBlockKind.html)
3233
#[must_use]
33-
pub const fn new(use_html5ever: bool, include_verbatim: bool) -> Self {
34+
pub const fn new(use_html5ever: bool, include_verbatim: bool, include_wikilinks: bool) -> Self {
3435
Self {
3536
use_html5ever,
3637
include_verbatim,
38+
include_wikilinks,
3739
}
3840
}
3941

@@ -42,7 +44,11 @@ impl Extractor {
4244
#[must_use]
4345
pub fn extract(&self, input_content: &InputContent) -> Vec<RawUri> {
4446
match input_content.file_type {
45-
FileType::Markdown => extract_markdown(&input_content.content, self.include_verbatim),
47+
FileType::Markdown => extract_markdown(
48+
&input_content.content,
49+
self.include_verbatim,
50+
self.include_wikilinks,
51+
),
4652
FileType::Html => {
4753
if self.use_html5ever {
4854
html::html5ever::extract_html(&input_content.content, self.include_verbatim)
@@ -72,7 +78,7 @@ mod tests {
7278
fn extract_uris(input: &str, file_type: FileType) -> HashSet<Uri> {
7379
let input_content = InputContent::from_string(input, file_type);
7480

75-
let extractor = Extractor::new(false, false);
81+
let extractor = Extractor::new(false, false, false);
7682
let uris_html5gum: HashSet<Uri> = extractor
7783
.extract(&input_content)
7884
.into_iter()
@@ -84,7 +90,7 @@ mod tests {
8490
uris
8591
};
8692

87-
let extractor = Extractor::new(true, false);
93+
let extractor = Extractor::new(true, false, false);
8894
let uris_html5ever: HashSet<Uri> = extractor
8995
.extract(&input_content)
9096
.into_iter()
@@ -216,7 +222,7 @@ mod tests {
216222
};
217223

218224
for use_html5ever in [true, false] {
219-
let extractor = Extractor::new(use_html5ever, false);
225+
let extractor = Extractor::new(use_html5ever, false, false);
220226
let links = extractor.extract(input_content);
221227

222228
let urls = links

0 commit comments

Comments
 (0)