From c3266a377048498ba273739bea77a04c95d5972c Mon Sep 17 00:00:00 2001 From: Markus Unterwaditzer Date: Thu, 30 Jan 2025 01:45:22 +0100 Subject: [PATCH 01/12] port from argh to bpaf this is a work-in-progress port from argh to bpaf. I am not convinced bpaf is a good solution here. First, I don't see an obvious way to have optional subcommands and global options. For example: hyperlink -j 4 dump-paragraphs -f foo.md ...should parse the same -j as: hyperlink -j 4 ./public/ in argh this worked fine (even though dump-paragraphs does not use -j right now), but in bpaf I didn't find a way to write subcommands without making them mutually exclusive with global options. additionally, a main reason I would like to move away from argh is its poor handling of help texts compared to clap. The helptext for dump-paragraphs currently looks like this: $ ./target/release/hyperlink dump-paragraphs --help Usage: hyperlink dump-paragraphs --file Dump out internal data for markdown or html file. This is mostly useful to figure out why a source file is not properly matched up with its target html file. NOTE: This is a tool for debugging and development. Usage: vimdiff <(hyperlink dump-paragraphs src/foo.md) <(hyperlink dump-paragraphs public/foo.html) Each line on the left represents a Markdown paragraph. Each line on the right represents a HTML paragraph. If there are minor formatting differences in two lines that are supposed to match, you found the issue that needs fixing in `src/paragraph.rs`. There may also be entire lines missing from either side, in which case the logic for detecting paragraphs needs adjustment, either in `src/markdown.rs` or `src/html.rs`. Note that the output for HTML omits paragraphs that do not have links, while for Markdown all paragraphs are dumped. Options: --file markdown or html file --help display usage information This, from argh, is a mess, but bpaf is worse: $ ./target/debug/hyperlink dump-paragraphs --help Dump out internal data for markdown or html file. Usage: hyperlink dump-paragraphs --file=ARG Available options: --file=ARG markdown or html file -h, --help Prints help information Where did all that content go? It seems bpaf just drops all data after the first paragraph. I looked at the docs and found that bpaf has some special handling for two successive blank lines, but I don't have those in my code (only single blank line). Overall I found the API surface overwhelming and confusing. I can mix and match the derive and combinators API, and that led me very far astray when I couldn't figure out easily how to make optional subcommands work. I still don't really know how to make it work exactly the same as in argh/clap. --- Cargo.lock | 53 +++++-------- Cargo.toml | 2 +- src/html/mod.rs | 10 +-- src/main.rs | 198 +++++++++++++++++++++++------------------------- 4 files changed, 118 insertions(+), 145 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index b8597b9..4c65d2f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -23,37 +23,6 @@ version = "1.0.91" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c042108f3ed77fd83760a5fd79b53be043192bb3b9dba91d8c574c0ada7850c8" -[[package]] -name = "argh" -version = "0.1.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7af5ba06967ff7214ce4c7419c7d185be7ecd6cc4965a8f6e1d8ce0398aad219" -dependencies = [ - "argh_derive", - "argh_shared", -] - -[[package]] -name = "argh_derive" -version = "0.1.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "56df0aeedf6b7a2fc67d06db35b09684c3e8da0c95f8f27685cb17e08413d87a" -dependencies = [ - "argh_shared", - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "argh_shared" -version = "0.1.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5693f39141bda5760ecc4111ab08da40565d1771038c4a0250f03457ec707531" -dependencies = [ - "serde", -] - [[package]] name = "arrayref" version = "0.3.9" @@ -122,6 +91,26 @@ dependencies = [ "constant_time_eq", ] +[[package]] +name = "bpaf" +version = "0.9.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "913d667d4716acd286a0dc58824a4c0ec8ce58eeca95cfb58172d17a9ec01035" +dependencies = [ + "bpaf_derive", +] + +[[package]] +name = "bpaf_derive" +version = "0.5.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34d8a24f809c4cda0832689019daa067d0ae927d801429196b238a3e8cb0cd3e" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "bstr" version = "1.10.0" @@ -318,10 +307,10 @@ name = "hyperlink" version = "0.1.43" dependencies = [ "anyhow", - "argh", "assert_cmd", "assert_fs", "blake3", + "bpaf", "bumpalo", "html5gum", "jwalk", diff --git a/Cargo.toml b/Cargo.toml index 5d8f5b0..aeb3127 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -25,8 +25,8 @@ html5gum = "0.7.0" jwalk = "0.8.1" bumpalo = { version = "3.11.1", features = ["collections"] } percent-encoding = "2.1.0" -argh = "0.1.12" num_cpus = "1.15.0" +bpaf = { version = "0.9.16", features = ["derive"] } [dev-dependencies] assert_cmd = "2.0.2" diff --git a/src/html/mod.rs b/src/html/mod.rs index 8e68edb..50aaf8a 100644 --- a/src/html/mod.rs +++ b/src/html/mod.rs @@ -575,10 +575,7 @@ fn test_document_join_bare_html() { fn test_json_script() { use crate::paragraph::ParagraphHasher; - let doc = Document::new( - Path::new("/"), - Path::new("/html5gum/struct.Tokenizer.html"), - ); + let doc = Document::new(Path::new("/"), Path::new("/html5gum/struct.Tokenizer.html")); let html = r#""#; @@ -588,8 +585,5 @@ fn test_json_script() { .links_from_read::<_, ParagraphHasher>(&mut doc_buf, html.as_bytes(), false) .unwrap(); - assert_eq!( - links.collect::>(), - &[] - ); + assert_eq!(links.collect::>(), &[]); } diff --git a/src/main.rs b/src/main.rs index 23b764e..fe00a61 100755 --- a/src/main.rs +++ b/src/main.rs @@ -12,7 +12,7 @@ use std::path::{Path, PathBuf}; use std::process; use anyhow::{anyhow, Context, Error}; -use argh::FromArgs; +use bpaf::*; use jwalk::WalkDirGeneric; use markdown::DocumentSource; use rayon::prelude::*; @@ -26,155 +26,145 @@ use crate::urls::is_external_link; static MARKDOWN_FILES: &[&str] = &["md", "mdx"]; static HTML_FILES: &[&str] = &["htm", "html"]; -#[derive(FromArgs, PartialEq, Debug)] +#[derive(Bpaf, PartialEq, Debug)] /// A command-line tool to find broken links in your static site. -struct Cli { - /// the static file path to check - /// - /// This will be assumed to be the root path of your server as well, so - /// href="/foo" will resolve to that folder's subfolder foo - #[argh(positional)] - base_path: Option, - +struct MainCommand { /// how many threads to use, default is to try and saturate CPU - #[argh(option, short = 'j', long = "jobs")] + #[bpaf(short('j'), long("jobs"))] threads: Option, /// whether to check for valid anchor references - #[argh(switch)] + #[bpaf(long)] check_anchors: bool, /// path to directory of markdown files to use for reporting errors - #[argh(option, long = "sources")] + #[bpaf(long("sources"))] sources_path: Option, /// enable specialized output for GitHub actions - #[argh(switch)] + #[bpaf(long)] github_actions: bool, /// print version information and exit - #[argh(switch, short = 'V')] + #[bpaf(long, short('V'))] version: bool, - #[argh(subcommand)] - subcommand: Option, -} - -#[derive(FromArgs, PartialEq, Debug)] -#[argh(subcommand)] -enum Subcommand { - DumpParagraphs(DumpParagraphs), - MatchAllParagraphs(MatchAllParagraphs), - DumpExternalLinks(DumpExternalLinks), -} - -#[derive(FromArgs, PartialEq, Debug)] -/// Dump out internal data for markdown or html file. This is mostly useful to figure out why -/// a source file is not properly matched up with its target html file. -/// -/// NOTE: This is a tool for debugging and development. -/// -/// Usage: -/// -/// vimdiff <(hyperlink dump-paragraphs src/foo.md) <(hyperlink dump-paragraphs public/foo.html) -/// -/// Each line on the left represents a Markdown paragraph. Each line on the right represents a -/// HTML paragraph. If there are minor formatting differences in two lines that are supposed to -/// match, you found the issue that needs fixing in `src/paragraph.rs`. -/// -/// There may also be entire lines missing from either side, in which case the logic for -/// detecting paragraphs needs adjustment, either in `src/markdown.rs` or `src/html.rs`. -/// -/// Note that the output for HTML omits paragraphs that do not have links, while for Markdown -/// all paragraphs are dumped. -#[argh(subcommand, name = "dump-paragraphs")] -struct DumpParagraphs { - #[argh(option)] - /// markdown or html file - file: PathBuf, -} - -#[derive(FromArgs, PartialEq, Debug)] -/// Attempt to match up all paragraphs from the HTML folder with the Markdown folder and print -/// stats. This can be used to determine whether the source matching is going to be any good. -/// -/// NOTE: This is a tool for debugging and development. -#[argh(subcommand, name = "match-all-paragraphs")] -struct MatchAllParagraphs { - #[argh(option)] - /// base path - base_path: PathBuf, - - #[argh(option)] - /// sources - sources_path: PathBuf, + /// the static file path to check + /// + /// This will be assumed to be the root path of your server as well, so + /// href="/foo" will resolve to that folder's subfolder foo + #[bpaf(positional("BASE-PATH"))] + base_path: Option, } -#[derive(FromArgs, PartialEq, Debug)] -/// Dump out a list and count of _external_ links. hyperlink does not check external links, -/// but this subcommand can be used to get a summary of the external links that exist in your -/// site. -#[argh(subcommand, name = "dump-external-links")] -struct DumpExternalLinks { - #[argh(option)] - /// base path - base_path: PathBuf, +#[derive(Debug, PartialEq, Bpaf)] +#[bpaf(options)] +enum Cli { + /// Dump out internal data for markdown or html file. + /// + /// This is mostly useful to figure out why a source file is not properly matched up with its + /// target html file. + /// + /// NOTE: This is a tool for debugging and development. + /// + /// Usage: + /// + /// vimdiff <(hyperlink dump-paragraphs src/foo.md) <(hyperlink dump-paragraphs public/foo.html) + /// + /// Each line on the left represents a Markdown paragraph. Each line on the right represents a + /// HTML paragraph. If there are minor formatting differences in two lines that are supposed to + /// match, you found the issue that needs fixing in `src/paragraph.rs`. + /// + /// There may also be entire lines missing from either side, in which case the logic for + /// detecting paragraphs needs adjustment, either in `src/markdown.rs` or `src/html.rs`. + /// + /// Note that the output for HTML omits paragraphs that do not have links, while for Markdown + /// all paragraphs are dumped. + #[bpaf(command("dump-paragraphs"))] + DumpParagraphs { + /// markdown or html file + #[bpaf(long)] + file: PathBuf, + }, + + /// Attempt to match up all paragraphs from the HTML folder with the Markdown folder and print + /// stats. This can be used to determine whether the source matching is going to be any good. + /// + /// NOTE: This is a tool for debugging and development. + #[bpaf(command("match-all-paragraphs"))] + MatchAllParagraphs { + /// base path + #[bpaf(long)] + base_path: PathBuf, + + /// sources + #[bpaf(long)] + sources_path: PathBuf, + }, + + /// Dump out a list and count of _external_ links. hyperlink does not check external links, + /// but this subcommand can be used to get a summary of the external links that exist in your + /// site. + #[bpaf(command("dump-external-links"))] + DumpExternalLinks { + /// base path + #[bpaf(long)] + base_path: PathBuf, + }, + + Main(#[bpaf(external(main_command))] MainCommand) } fn main() -> Result<(), Error> { - let Cli { + let args = cli().run(); + + let MainCommand { base_path, threads, check_anchors, sources_path, github_actions, - subcommand, version, - } = argh::from_env(); - - if version { - println!("hyperlink {}", env!("CARGO_PKG_VERSION")); - return Ok(()); - } - - rayon::ThreadPoolBuilder::new() - // most of the work we do is kind of I/O bound. rayon assumes CPU-heavy workload. we could - // look into tokio-uring at some point, but it seems like a hassle wrt ownership - // - // hyperlink seems to deadlock on less than 1 thread. - .num_threads(cmp::max(2, threads.unwrap_or_else(|| 4 * num_cpus::get()))) - .build_global() - .unwrap(); - - match subcommand { - Some(Subcommand::DumpParagraphs(DumpParagraphs { file })) => { + } = match args { + Cli::DumpParagraphs { file } => { return dump_paragraphs(file); } - Some(Subcommand::MatchAllParagraphs(MatchAllParagraphs { + Cli::MatchAllParagraphs { base_path, sources_path, - })) => { + } => { return match_all_paragraphs(base_path, sources_path); } - Some(Subcommand::DumpExternalLinks(DumpExternalLinks { base_path })) => { + Cli::DumpExternalLinks { base_path } => { return dump_external_links(base_path); } - None => {} + Cli::Main(main_command) => main_command, + }; + + if version { + println!("hyperlink {}", env!("CARGO_PKG_VERSION")); + return Ok(()); } let base_path = match base_path { Some(base_path) => base_path, None => { // Invalid invocation. Ultra hack to show help if no arguments are provided. - let help_message = Cli::from_args(&["hyperlink"], &["--help"]) - .map(|_| ()) - .unwrap_err() - .output; + let help_message = cli().run_inner(Args::from(&["--help"])).unwrap_err().unwrap_stdout(); println!("{help_message}"); process::exit(1); } }; + rayon::ThreadPoolBuilder::new() + // most of the work we do is kind of I/O bound. rayon assumes CPU-heavy workload. we could + // look into tokio-uring at some point, but it seems like a hassle wrt ownership + // + // hyperlink seems to deadlock on less than 1 thread. + .num_threads(cmp::max(2, threads.unwrap_or_else(|| 4 * num_cpus::get()))) + .build_global() + .unwrap(); + if sources_path.is_some() { check_links::(base_path, check_anchors, sources_path, github_actions) } else { From 6b383956cb1f507e490f4a5f262f16d20bc8773f Mon Sep 17 00:00:00 2001 From: Markus Unterwaditzer Date: Thu, 30 Jan 2025 01:56:21 +0100 Subject: [PATCH 02/12] fmt --- src/main.rs | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/main.rs b/src/main.rs index fe00a61..b5fe59d 100755 --- a/src/main.rs +++ b/src/main.rs @@ -112,7 +112,7 @@ enum Cli { base_path: PathBuf, }, - Main(#[bpaf(external(main_command))] MainCommand) + Main(#[bpaf(external(main_command))] MainCommand), } fn main() -> Result<(), Error> { @@ -150,7 +150,10 @@ fn main() -> Result<(), Error> { Some(base_path) => base_path, None => { // Invalid invocation. Ultra hack to show help if no arguments are provided. - let help_message = cli().run_inner(Args::from(&["--help"])).unwrap_err().unwrap_stdout(); + let help_message = cli() + .run_inner(Args::from(&["--help"])) + .unwrap_err() + .unwrap_stdout(); println!("{help_message}"); process::exit(1); } From d4da7bcb06d81fb33769b933feb1a8fcb7619aa0 Mon Sep 17 00:00:00 2001 From: Markus Unterwaditzer Date: Thu, 30 Jan 2025 16:40:12 +0100 Subject: [PATCH 03/12] refer subcommand from main cli, instead of other way around --- src/main.rs | 35 +++++++++++++++++++---------------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/src/main.rs b/src/main.rs index b5fe59d..4362e8e 100755 --- a/src/main.rs +++ b/src/main.rs @@ -28,7 +28,7 @@ static HTML_FILES: &[&str] = &["htm", "html"]; #[derive(Bpaf, PartialEq, Debug)] /// A command-line tool to find broken links in your static site. -struct MainCommand { +struct Cli { /// how many threads to use, default is to try and saturate CPU #[bpaf(short('j'), long("jobs"))] threads: Option, @@ -53,13 +53,16 @@ struct MainCommand { /// /// This will be assumed to be the root path of your server as well, so /// href="/foo" will resolve to that folder's subfolder foo - #[bpaf(positional("BASE-PATH"))] + #[bpaf(optional, positional("BASE-PATH"))] base_path: Option, + + #[bpaf(external, optional)] + subcommand: Option } #[derive(Debug, PartialEq, Bpaf)] -#[bpaf(options)] -enum Cli { +#[bpaf(command)] +enum Subcommand { /// Dump out internal data for markdown or html file. /// /// This is mostly useful to figure out why a source file is not properly matched up with its @@ -110,35 +113,34 @@ enum Cli { /// base path #[bpaf(long)] base_path: PathBuf, - }, - - Main(#[bpaf(external(main_command))] MainCommand), + } } fn main() -> Result<(), Error> { - let args = cli().run(); - - let MainCommand { + let Cli { base_path, threads, check_anchors, sources_path, github_actions, version, - } = match args { - Cli::DumpParagraphs { file } => { + subcommand + } = cli().run(); + + match subcommand { + Some(Subcommand::DumpParagraphs { file }) => { return dump_paragraphs(file); } - Cli::MatchAllParagraphs { + Some(Subcommand::MatchAllParagraphs { base_path, sources_path, - } => { + }) => { return match_all_paragraphs(base_path, sources_path); } - Cli::DumpExternalLinks { base_path } => { + Some(Subcommand::DumpExternalLinks { base_path }) => { return dump_external_links(base_path); } - Cli::Main(main_command) => main_command, + None => (), }; if version { @@ -151,6 +153,7 @@ fn main() -> Result<(), Error> { None => { // Invalid invocation. Ultra hack to show help if no arguments are provided. let help_message = cli() + .to_options() .run_inner(Args::from(&["--help"])) .unwrap_err() .unwrap_stdout(); From fe8dcdf40390dfebd6c57a1cabad14118d33ee83 Mon Sep 17 00:00:00 2001 From: Markus Unterwaditzer Date: Thu, 30 Jan 2025 16:53:18 +0100 Subject: [PATCH 04/12] Revert "refer subcommand from main cli, instead of other way around" This reverts commit d4da7bcb06d81fb33769b933feb1a8fcb7619aa0. --- src/main.rs | 35 ++++++++++++++++------------------- 1 file changed, 16 insertions(+), 19 deletions(-) diff --git a/src/main.rs b/src/main.rs index 4362e8e..b5fe59d 100755 --- a/src/main.rs +++ b/src/main.rs @@ -28,7 +28,7 @@ static HTML_FILES: &[&str] = &["htm", "html"]; #[derive(Bpaf, PartialEq, Debug)] /// A command-line tool to find broken links in your static site. -struct Cli { +struct MainCommand { /// how many threads to use, default is to try and saturate CPU #[bpaf(short('j'), long("jobs"))] threads: Option, @@ -53,16 +53,13 @@ struct Cli { /// /// This will be assumed to be the root path of your server as well, so /// href="/foo" will resolve to that folder's subfolder foo - #[bpaf(optional, positional("BASE-PATH"))] + #[bpaf(positional("BASE-PATH"))] base_path: Option, - - #[bpaf(external, optional)] - subcommand: Option } #[derive(Debug, PartialEq, Bpaf)] -#[bpaf(command)] -enum Subcommand { +#[bpaf(options)] +enum Cli { /// Dump out internal data for markdown or html file. /// /// This is mostly useful to figure out why a source file is not properly matched up with its @@ -113,34 +110,35 @@ enum Subcommand { /// base path #[bpaf(long)] base_path: PathBuf, - } + }, + + Main(#[bpaf(external(main_command))] MainCommand), } fn main() -> Result<(), Error> { - let Cli { + let args = cli().run(); + + let MainCommand { base_path, threads, check_anchors, sources_path, github_actions, version, - subcommand - } = cli().run(); - - match subcommand { - Some(Subcommand::DumpParagraphs { file }) => { + } = match args { + Cli::DumpParagraphs { file } => { return dump_paragraphs(file); } - Some(Subcommand::MatchAllParagraphs { + Cli::MatchAllParagraphs { base_path, sources_path, - }) => { + } => { return match_all_paragraphs(base_path, sources_path); } - Some(Subcommand::DumpExternalLinks { base_path }) => { + Cli::DumpExternalLinks { base_path } => { return dump_external_links(base_path); } - None => (), + Cli::Main(main_command) => main_command, }; if version { @@ -153,7 +151,6 @@ fn main() -> Result<(), Error> { None => { // Invalid invocation. Ultra hack to show help if no arguments are provided. let help_message = cli() - .to_options() .run_inner(Args::from(&["--help"])) .unwrap_err() .unwrap_stdout(); From 8974526e32b3569ba759ffaacb8244c9c7610bbd Mon Sep 17 00:00:00 2001 From: Markus Unterwaditzer Date: Thu, 30 Jan 2025 16:58:06 +0100 Subject: [PATCH 05/12] apply dav1ddes suggestions --- src/main.rs | 75 ++++++++++++++++++++++++++++++----------------------- 1 file changed, 42 insertions(+), 33 deletions(-) diff --git a/src/main.rs b/src/main.rs index b5fe59d..9c48f73 100755 --- a/src/main.rs +++ b/src/main.rs @@ -27,12 +27,7 @@ static MARKDOWN_FILES: &[&str] = &["md", "mdx"]; static HTML_FILES: &[&str] = &["htm", "html"]; #[derive(Bpaf, PartialEq, Debug)] -/// A command-line tool to find broken links in your static site. struct MainCommand { - /// how many threads to use, default is to try and saturate CPU - #[bpaf(short('j'), long("jobs"))] - threads: Option, - /// whether to check for valid anchor references #[bpaf(long)] check_anchors: bool, @@ -45,10 +40,6 @@ struct MainCommand { #[bpaf(long)] github_actions: bool, - /// print version information and exit - #[bpaf(long, short('V'))] - version: bool, - /// the static file path to check /// /// This will be assumed to be the root path of your server as well, so @@ -57,9 +48,25 @@ struct MainCommand { base_path: Option, } -#[derive(Debug, PartialEq, Bpaf)] +#[derive(Bpaf, PartialEq, Debug)] #[bpaf(options)] -enum Cli { +/// A command-line tool to find broken links in your static site. +struct Cli { + // TODO: use bpaf-native version option + /// print version information and exit + #[bpaf(long, short('V'))] + version: bool, + + /// how many threads to use, default is to try and saturate CPU + #[bpaf(short('j'), long("jobs"))] + threads: Option, + + #[bpaf(external)] + command: Command, +} + +#[derive(Bpaf, PartialEq, Debug)] +enum Command { /// Dump out internal data for markdown or html file. /// /// This is mostly useful to figure out why a source file is not properly matched up with its @@ -116,36 +123,47 @@ enum Cli { } fn main() -> Result<(), Error> { - let args = cli().run(); + let Cli { + version, + threads, + command + }= cli().run(); + + if version { + println!("hyperlink {}", env!("CARGO_PKG_VERSION")); + return Ok(()); + } + + rayon::ThreadPoolBuilder::new() + // most of the work we do is kind of I/O bound. rayon assumes CPU-heavy workload. we could + // look into tokio-uring at some point, but it seems like a hassle wrt ownership + // + // hyperlink seems to deadlock on less than 1 thread. + .num_threads(cmp::max(2, threads.unwrap_or_else(|| 4 * num_cpus::get()))) + .build_global() + .unwrap(); let MainCommand { base_path, - threads, check_anchors, sources_path, github_actions, - version, - } = match args { - Cli::DumpParagraphs { file } => { + } = match command { + Command::DumpParagraphs { file } => { return dump_paragraphs(file); } - Cli::MatchAllParagraphs { + Command::MatchAllParagraphs { base_path, sources_path, } => { return match_all_paragraphs(base_path, sources_path); } - Cli::DumpExternalLinks { base_path } => { + Command::DumpExternalLinks { base_path } => { return dump_external_links(base_path); } - Cli::Main(main_command) => main_command, + Command::Main(main_command) => main_command, }; - if version { - println!("hyperlink {}", env!("CARGO_PKG_VERSION")); - return Ok(()); - } - let base_path = match base_path { Some(base_path) => base_path, None => { @@ -159,15 +177,6 @@ fn main() -> Result<(), Error> { } }; - rayon::ThreadPoolBuilder::new() - // most of the work we do is kind of I/O bound. rayon assumes CPU-heavy workload. we could - // look into tokio-uring at some point, but it seems like a hassle wrt ownership - // - // hyperlink seems to deadlock on less than 1 thread. - .num_threads(cmp::max(2, threads.unwrap_or_else(|| 4 * num_cpus::get()))) - .build_global() - .unwrap(); - if sources_path.is_some() { check_links::(base_path, check_anchors, sources_path, github_actions) } else { From 5f665c561c3acdf8720666b891797fc68c0f9044 Mon Sep 17 00:00:00 2001 From: Markus Unterwaditzer Date: Thu, 30 Jan 2025 16:58:10 +0100 Subject: [PATCH 06/12] fmt --- src/main.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main.rs b/src/main.rs index 9c48f73..cdbd76d 100755 --- a/src/main.rs +++ b/src/main.rs @@ -126,8 +126,8 @@ fn main() -> Result<(), Error> { let Cli { version, threads, - command - }= cli().run(); + command, + } = cli().run(); if version { println!("hyperlink {}", env!("CARGO_PKG_VERSION")); From b1f10ebc918ffe7056a9b0d27a9b5d26b7aac717 Mon Sep 17 00:00:00 2001 From: Markus Unterwaditzer Date: Thu, 30 Jan 2025 17:01:55 +0100 Subject: [PATCH 07/12] update test --- src/main.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/main.rs b/src/main.rs index cdbd76d..3afdb6d 100755 --- a/src/main.rs +++ b/src/main.rs @@ -690,7 +690,8 @@ $"#, .code(1) .stdout(predicate::str::contains( "\ -Usage: hyperlink [] [-j ] [--check-anchors] [--sources ] [--github-actions] [-V] [] []\ +Usage: [-V] [-j=ARG] (COMMAND ... | [--check-anchors] [--sources=ARG] [--github-actions] [BASE-PATH +]) ", )); } From e7d750e42b99770c64357067f552b3a99639d340 Mon Sep 17 00:00:00 2001 From: Markus Unterwaditzer Date: Thu, 30 Jan 2025 18:50:40 +0100 Subject: [PATCH 08/12] fix help output, add snapshots --- Cargo.lock | 103 +++++++++++++++++++++++++++++++++- Cargo.toml | 2 + src/main.rs | 122 ++++------------------------------------- tests/cli.rs | 78 ++++++++++++++++++++++++++ tests/cli_snapshots.rs | 72 ++++++++++++++++++++++++ 5 files changed, 266 insertions(+), 111 deletions(-) create mode 100644 tests/cli.rs create mode 100644 tests/cli_snapshots.rs diff --git a/Cargo.lock b/Cargo.lock index 4c65d2f..c672da5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1,6 +1,6 @@ # This file is automatically @generated by Cargo. # It is not intended for manual editing. -version = 3 +version = 4 [[package]] name = "aho-corasick" @@ -143,6 +143,18 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" +[[package]] +name = "console" +version = "0.15.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ea3c6ecd8059b57859df5c69830340ed3c41d30e3da0c1cbed90a96ac853041b" +dependencies = [ + "encode_unicode", + "libc", + "once_cell", + "windows-sys 0.59.0", +] + [[package]] name = "constant_time_eq" version = "0.3.1" @@ -229,6 +241,12 @@ version = "1.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0" +[[package]] +name = "encode_unicode" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34aa73646ffb006b8f5147f3dc182bd4bcb190227ce861fc4a4844bf8e3cb2c0" + [[package]] name = "errno" version = "0.3.9" @@ -313,6 +331,8 @@ dependencies = [ "bpaf", "bumpalo", "html5gum", + "insta", + "insta-cmd", "jwalk", "num_cpus", "percent-encoding", @@ -338,6 +358,37 @@ dependencies = [ "winapi-util", ] +[[package]] +name = "insta" +version = "1.42.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "71c1b125e30d93896b365e156c33dadfffab45ee8400afcbba4752f59de08a86" +dependencies = [ + "console", + "linked-hash-map", + "once_cell", + "pin-project", + "serde", + "similar", +] + +[[package]] +name = "insta-cmd" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ffeeefa927925cced49ccb01bf3e57c9d4cd132df21e576eb9415baeab2d3de6" +dependencies = [ + "insta", + "serde", + "serde_json", +] + +[[package]] +name = "itoa" +version = "1.0.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d75a2a4b1b190afb6f5425f10f6a8f959d2ea0b9c2b1d79553551850539e4674" + [[package]] name = "jetscii" version = "0.5.3" @@ -360,6 +411,12 @@ version = "0.2.161" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8e9489c2807c139ffd9c1794f4af0ebe86a828db53ecdc7fea2111d0fed085d1" +[[package]] +name = "linked-hash-map" +version = "0.5.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0717cef1bc8b636c6e1c1bbdefc09e6322da8a9321966e8928ef80d20f7f770f" + [[package]] name = "linux-raw-sys" version = "0.4.14" @@ -415,6 +472,26 @@ version = "2.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" +[[package]] +name = "pin-project" +version = "1.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e2ec53ad785f4d35dac0adea7f7dc6f1bb277ad84a680c7afefeae05d1f5916" +dependencies = [ + "pin-project-internal", +] + +[[package]] +name = "pin-project-internal" +version = "1.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d56a66c0c55993aa927429d0f8a0abfd74f084e4d9c192cffed01e418d83eefb" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "predicates" version = "3.1.2" @@ -554,6 +631,12 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "ryu" +version = "1.0.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ea1a2d0a644769cc99faa24c3ad26b379b786fe7c36fd3c546254801650e6dd" + [[package]] name = "same-file" version = "1.0.6" @@ -583,12 +666,30 @@ dependencies = [ "syn", ] +[[package]] +name = "serde_json" +version = "1.0.138" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d434192e7da787e94a6ea7e9670b26a036d0ca41e0b7efb2676dd32bae872949" +dependencies = [ + "itoa", + "memchr", + "ryu", + "serde", +] + [[package]] name = "shlex" version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" +[[package]] +name = "similar" +version = "2.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbbb5d9659141646ae647b42fe094daf6c6192d1620870b449d9557f748b2daa" + [[package]] name = "syn" version = "2.0.85" diff --git a/Cargo.toml b/Cargo.toml index aeb3127..4629ded 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -31,6 +31,8 @@ bpaf = { version = "0.9.16", features = ["derive"] } [dev-dependencies] assert_cmd = "2.0.2" assert_fs = "1.0.2" +insta = "1.42.1" +insta-cmd = "0.6.0" predicates = "3.1.2" pretty_assertions = "1.0.0" diff --git a/src/main.rs b/src/main.rs index 3afdb6d..68825a5 100755 --- a/src/main.rs +++ b/src/main.rs @@ -68,24 +68,24 @@ struct Cli { #[derive(Bpaf, PartialEq, Debug)] enum Command { /// Dump out internal data for markdown or html file. - /// - /// This is mostly useful to figure out why a source file is not properly matched up with its + /// + /// This is mostly useful to figure out why a source file is not properly matched up with its /// target html file. - /// - /// NOTE: This is a tool for debugging and development. - /// - /// Usage: - /// + /// + /// NOTE: This is a tool for debugging and development. + /// + /// Usage: + /// /// vimdiff <(hyperlink dump-paragraphs src/foo.md) <(hyperlink dump-paragraphs public/foo.html) - /// - /// Each line on the left represents a Markdown paragraph. Each line on the right represents a + /// + /// Each line on the left represents a Markdown paragraph. Each line on the right represents a /// HTML paragraph. If there are minor formatting differences in two lines that are supposed to /// match, you found the issue that needs fixing in `src/paragraph.rs`. /// - /// There may also be entire lines missing from either side, in which case the logic for + /// There may also be entire lines missing from either side, in which case the logic for /// detecting paragraphs needs adjustment, either in `src/markdown.rs` or `src/html.rs`. /// - /// Note that the output for HTML omits paragraphs that do not have links, while for Markdown + /// Note that the output for HTML omits paragraphs that do not have links, while for Markdown /// all paragraphs are dumped. #[bpaf(command("dump-paragraphs"))] DumpParagraphs { @@ -96,8 +96,7 @@ enum Command { /// Attempt to match up all paragraphs from the HTML folder with the Markdown folder and print /// stats. This can be used to determine whether the source matching is going to be any good. - /// - /// NOTE: This is a tool for debugging and development. + /// NOTE: This is a tool for debugging and development. #[bpaf(command("match-all-paragraphs"))] MatchAllParagraphs { /// base path @@ -613,100 +612,3 @@ fn match_all_paragraphs(base_path: PathBuf, sources_path: PathBuf) -> Result<(), Ok(()) } - -#[cfg(test)] -mod tests { - use assert_cmd::Command; - use assert_fs::prelude::*; - use predicates::prelude::*; - - #[test] - fn test_dead_link() { - let site = assert_fs::TempDir::new().unwrap(); - site.child("index.html") - .write_str("") - .unwrap(); - let mut cmd = Command::cargo_bin("hyperlink").unwrap(); - cmd.current_dir(site.path()).arg("."); - - cmd.assert().failure().code(1).stdout( - predicate::str::is_match( - r#"^Reading files -Checking 1 links from 1 files \(1 documents\) -\..index\.html - error: bad link /bar.html - -Found 1 bad links -"#, - ) - .unwrap(), - ); - site.close().unwrap(); - } - - #[test] - fn test_dead_anchor() { - let site = assert_fs::TempDir::new().unwrap(); - site.child("index.html") - .write_str("") - .unwrap(); - site.child("bar.html").touch().unwrap(); - let mut cmd = Command::cargo_bin("hyperlink").unwrap(); - cmd.current_dir(site.path()).arg(".").arg("--check-anchors"); - - cmd.assert().failure().code(2).stdout( - predicate::str::is_match( - r#"^Reading files -Checking 1 links from 2 files \(2 documents\) -\..index\.html - error: bad link /bar.html#goo - -Found 0 bad links -Found 1 bad anchors -$"#, - ) - .unwrap(), - ); - site.close().unwrap(); - } - - #[test] - fn test_version() { - let mut cmd = Command::cargo_bin("hyperlink").unwrap(); - cmd.arg("--version"); - - cmd.assert() - .success() - .code(0) - .stdout(predicate::str::contains("hyperlink ")); - } - - #[test] - fn test_no_args() { - let mut cmd = Command::cargo_bin("hyperlink").unwrap(); - - cmd.assert() - .failure() - .code(1) - .stdout(predicate::str::contains( - "\ -Usage: [-V] [-j=ARG] (COMMAND ... | [--check-anchors] [--sources=ARG] [--github-actions] [BASE-PATH -]) -", - )); - } - - #[test] - fn test_bad_dir() { - let mut cmd = Command::cargo_bin("hyperlink").unwrap(); - cmd.arg("non_existing_dir"); - - cmd.assert() - .failure() - .code(1) - .stdout("Reading files\n") - .stderr(predicate::str::contains( - "Error: IO error for operation on non_existing_dir:", - )); - } -} diff --git a/tests/cli.rs b/tests/cli.rs new file mode 100644 index 0000000..917783e --- /dev/null +++ b/tests/cli.rs @@ -0,0 +1,78 @@ +use assert_cmd::Command; +use assert_fs::prelude::*; +use predicates::prelude::*; + +#[test] +fn test_dead_link() { + let site = assert_fs::TempDir::new().unwrap(); + site.child("index.html") + .write_str("") + .unwrap(); + let mut cmd = Command::cargo_bin("hyperlink").unwrap(); + cmd.current_dir(site.path()).arg("."); + + cmd.assert().failure().code(1).stdout( + predicate::str::is_match( + r#"^Reading files +Checking 1 links from 1 files \(1 documents\) +\..index\.html + error: bad link /bar.html + +Found 1 bad links +"#, + ) + .unwrap(), + ); + site.close().unwrap(); +} + +#[test] +fn test_dead_anchor() { + let site = assert_fs::TempDir::new().unwrap(); + site.child("index.html") + .write_str("") + .unwrap(); + site.child("bar.html").touch().unwrap(); + let mut cmd = Command::cargo_bin("hyperlink").unwrap(); + cmd.current_dir(site.path()).arg(".").arg("--check-anchors"); + + cmd.assert().failure().code(2).stdout( + predicate::str::is_match( + r#"^Reading files +Checking 1 links from 2 files \(2 documents\) +\..index\.html + error: bad link /bar.html#goo + +Found 0 bad links +Found 1 bad anchors +$"#, + ) + .unwrap(), + ); + site.close().unwrap(); +} + +#[test] +fn test_version() { + let mut cmd = Command::cargo_bin("hyperlink").unwrap(); + cmd.arg("--version"); + + cmd.assert() + .success() + .code(0) + .stdout(predicate::str::contains("hyperlink ")); +} + +#[test] +fn test_bad_dir() { + let mut cmd = Command::cargo_bin("hyperlink").unwrap(); + cmd.arg("non_existing_dir"); + + cmd.assert() + .failure() + .code(1) + .stdout("Reading files\n") + .stderr(predicate::str::contains( + "Error: IO error for operation on non_existing_dir:", + )); +} diff --git a/tests/cli_snapshots.rs b/tests/cli_snapshots.rs new file mode 100644 index 0000000..05d313d --- /dev/null +++ b/tests/cli_snapshots.rs @@ -0,0 +1,72 @@ +use insta_cmd::{assert_cmd_snapshot, get_cargo_bin}; +use std::process::Command; + +fn cli() -> Command { + Command::new(get_cargo_bin("hyperlink")) +} + +#[test] +fn test_no_args() { + assert_cmd_snapshot!(cli(), @r###" + success: false + exit_code: 1 + ----- stdout ----- + A command-line tool to find broken links in your static site. + + Usage: [-V] [-j=ARG] (COMMAND ... | [--check-anchors] [--sources=ARG] [--github-actions] [BASE-PATH + ]) + + Available positional items: + BASE-PATH the static file path to check + + Available options: + -V, --version print version information and exit + -j, --jobs=ARG how many threads to use, default is to try and saturate CPU + --check-anchors whether to check for valid anchor references + --sources=ARG path to directory of markdown files to use for reporting errors + --github-actions enable specialized output for GitHub actions + -h, --help Prints help information + + Available commands: + dump-paragraphs Dump out internal data for markdown or html file. + match-all-paragraphs Attempt to match up all paragraphs from the HTML folder with the Markdown + folder and print + dump-external-links Dump out a list and count of _external_ links. hyperlink does not check + external links, + + + ----- stderr ----- + "###); +} + +#[test] +fn test_dump_paragraphs_help() { + assert_cmd_snapshot!(cli().arg("dump-paragraphs").arg("--help"), @r###" + success: true + exit_code: 0 + ----- stdout ----- + Dump out internal data for markdown or html file. + + This is mostly useful to figure out why a source file is not properly matched up with its target + html file. + + NOTE: This is a tool for debugging and development. + + Usage: + + vimdiff <(hyperlink dump-paragraphs src/foo.md) <(hyperlink dump-paragraphs public/foo.html) + + Each line on the left represents a Markdown paragraph. Each line on the right represents a HTML + paragraph. If there are minor formatting differences in two lines that are supposed to match, you + found the issue that needs fixing in `src/paragraph.rs`. + + Usage: hyperlink dump-paragraphs --file=ARG + + Available options: + --file=ARG markdown or html file + -h, --help Prints help information + + + ----- stderr ----- + "###); +} From 7cf8c8381054f0cb819c9856fb8e966c5418e2ef Mon Sep 17 00:00:00 2001 From: Markus Unterwaditzer Date: Thu, 30 Jan 2025 18:58:11 +0100 Subject: [PATCH 09/12] hide version from usage --- src/main.rs | 2 +- tests/cli_snapshots.rs | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/main.rs b/src/main.rs index 68825a5..c5b0dfe 100755 --- a/src/main.rs +++ b/src/main.rs @@ -54,7 +54,7 @@ struct MainCommand { struct Cli { // TODO: use bpaf-native version option /// print version information and exit - #[bpaf(long, short('V'))] + #[bpaf(long, short('V'), hide_usage)] version: bool, /// how many threads to use, default is to try and saturate CPU diff --git a/tests/cli_snapshots.rs b/tests/cli_snapshots.rs index 05d313d..e833a5d 100644 --- a/tests/cli_snapshots.rs +++ b/tests/cli_snapshots.rs @@ -13,8 +13,7 @@ fn test_no_args() { ----- stdout ----- A command-line tool to find broken links in your static site. - Usage: [-V] [-j=ARG] (COMMAND ... | [--check-anchors] [--sources=ARG] [--github-actions] [BASE-PATH - ]) + Usage: [-j=ARG] (COMMAND ... | [--check-anchors] [--sources=ARG] [--github-actions] [BASE-PATH]) Available positional items: BASE-PATH the static file path to check From b7f2deeaef9387d2bac79feae21c15988d9c4de3 Mon Sep 17 00:00:00 2001 From: Markus Unterwaditzer Date: Thu, 30 Jan 2025 19:00:41 +0100 Subject: [PATCH 10/12] add version test --- Cargo.lock | 1 + Cargo.toml | 2 +- tests/cli.rs | 11 ----------- tests/cli_snapshots.rs | 25 +++++++++++++++++++++++++ 4 files changed, 27 insertions(+), 12 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c672da5..b3f6c1d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -368,6 +368,7 @@ dependencies = [ "linked-hash-map", "once_cell", "pin-project", + "regex", "serde", "similar", ] diff --git a/Cargo.toml b/Cargo.toml index 4629ded..89fbe21 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -31,7 +31,7 @@ bpaf = { version = "0.9.16", features = ["derive"] } [dev-dependencies] assert_cmd = "2.0.2" assert_fs = "1.0.2" -insta = "1.42.1" +insta = { version = "1.42.1", features = ["filters"] } insta-cmd = "0.6.0" predicates = "3.1.2" pretty_assertions = "1.0.0" diff --git a/tests/cli.rs b/tests/cli.rs index 917783e..a38cec7 100644 --- a/tests/cli.rs +++ b/tests/cli.rs @@ -52,17 +52,6 @@ $"#, site.close().unwrap(); } -#[test] -fn test_version() { - let mut cmd = Command::cargo_bin("hyperlink").unwrap(); - cmd.arg("--version"); - - cmd.assert() - .success() - .code(0) - .stdout(predicate::str::contains("hyperlink ")); -} - #[test] fn test_bad_dir() { let mut cmd = Command::cargo_bin("hyperlink").unwrap(); diff --git a/tests/cli_snapshots.rs b/tests/cli_snapshots.rs index e833a5d..615081d 100644 --- a/tests/cli_snapshots.rs +++ b/tests/cli_snapshots.rs @@ -69,3 +69,28 @@ fn test_dump_paragraphs_help() { ----- stderr ----- "###); } + +#[test] +fn test_version() { + let mut settings = insta::Settings::clone_current(); + settings.add_filter(r"hyperlink [.\d]+", "hyperlink [VERSION]"); + let _guard = settings.bind_to_scope(); + + assert_cmd_snapshot!(cli().arg("-V"), @r###" + success: true + exit_code: 0 + ----- stdout ----- + hyperlink [VERSION] + + ----- stderr ----- + "###); + + assert_cmd_snapshot!(cli().arg("--version"), @r###" + success: true + exit_code: 0 + ----- stdout ----- + hyperlink [VERSION] + + ----- stderr ----- + "###); +} From 278bc855a1a515f6efc5b974450b2277a4cad352 Mon Sep 17 00:00:00 2001 From: Markus Unterwaditzer Date: Thu, 30 Jan 2025 19:11:12 +0100 Subject: [PATCH 11/12] fix windows test --- tests/cli_snapshots.rs | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tests/cli_snapshots.rs b/tests/cli_snapshots.rs index 615081d..7d1dac6 100644 --- a/tests/cli_snapshots.rs +++ b/tests/cli_snapshots.rs @@ -40,6 +40,10 @@ fn test_no_args() { #[test] fn test_dump_paragraphs_help() { + let mut settings = insta::Settings::clone_current(); + settings.add_filter(r"hyperlink(\.exe)?", "[hyperlink bin]"); + let _guard = settings.bind_to_scope(); + assert_cmd_snapshot!(cli().arg("dump-paragraphs").arg("--help"), @r###" success: true exit_code: 0 @@ -53,13 +57,13 @@ fn test_dump_paragraphs_help() { Usage: - vimdiff <(hyperlink dump-paragraphs src/foo.md) <(hyperlink dump-paragraphs public/foo.html) + vimdiff <([hyperlink bin] dump-paragraphs src/foo.md) <([hyperlink bin] dump-paragraphs public/foo.html) Each line on the left represents a Markdown paragraph. Each line on the right represents a HTML paragraph. If there are minor formatting differences in two lines that are supposed to match, you found the issue that needs fixing in `src/paragraph.rs`. - Usage: hyperlink dump-paragraphs --file=ARG + Usage: [hyperlink bin] dump-paragraphs --file=ARG Available options: --file=ARG markdown or html file From dde3cbcbcbb0744d6451f5952aad7222fec99ee8 Mon Sep 17 00:00:00 2001 From: Markus Unterwaditzer Date: Thu, 30 Jan 2025 19:13:40 +0100 Subject: [PATCH 12/12] remove TODO --- src/main.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/src/main.rs b/src/main.rs index c5b0dfe..6727478 100755 --- a/src/main.rs +++ b/src/main.rs @@ -52,7 +52,6 @@ struct MainCommand { #[bpaf(options)] /// A command-line tool to find broken links in your static site. struct Cli { - // TODO: use bpaf-native version option /// print version information and exit #[bpaf(long, short('V'), hide_usage)] version: bool,