Introduce fragment checking for links to markdown files. (#1126)

- Implemented enhancements to include fragments in file links
- Checked links to markdown files with fragments, generating unique kebab case and heading attributes.
- Made code more idiomatic and added an integration test.
- Updated documentation.
- Fixed issues with heading attributes fragments and ensured proper handling of file errors.
This commit is contained in:
Hugo McNally 2023-07-31 15:04:00 +01:00 committed by GitHub
parent 518fb27db9
commit 8e6369377c
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
18 changed files with 374 additions and 60 deletions

View file

@ -386,6 +386,9 @@ Options:
-a, --accept <ACCEPT>
Comma-separated list of accepted status codes for valid links
--include-fragments
Enable the checking of fragments in links
-t, --timeout <TIMEOUT>
Website timeout in seconds from connect to response finished

2
fixtures/TEST.md vendored
View file

@ -1,7 +1,7 @@
Check file link
![Logo](../assets/banner.svg)
![Anchors should be ignored](#awesome)
![Fragment only link](#awesome)
Normal link, which should work as expected.
[Wikipedia](https://en.wikipedia.org/wiki/Static_program_analysis)

0
fixtures/fragments/empty_file vendored Normal file
View file

42
fixtures/fragments/file1.md vendored Normal file
View file

@ -0,0 +1,42 @@
# Fragment Test File 1
This is a test file for the fragment loader.
## Fragment 1
[Link to fragment 2](#fragment-2)
## Fragment 2
[Link to fragment 1 in file2](file2.md#fragment-1)
## Fragment 3
[Link to missing fragment](#missing-fragment)
[Link to missing fragment in file2](file2.md#missing-fragment)
## HTML Fragments
Explicit fragment links are currently not supported.
Therefore we put the test into a code block for now to prevent false positives.
```
<a name="explicit-fragment"></a>
[Link to explicit fragment](#explicit-fragment)
```
## Custom Fragments
[Custom fragment id in file2](file2.md#custom-id)
# Kebab Case Fragment
[Link to kebab-case fragment](#kebab-case-fragment)
[Link to second kebab-case fragment](#kebab-case-fragment-1)
# Kebab Case Fragment
[Link to another file type](empty_file#fragment)

7
fixtures/fragments/file2.md vendored Normal file
View file

@ -0,0 +1,7 @@
# Fragment Test File 2
This is a test file for the fragment loader.
### Some other heading with custom id {#custom-id}
#### Fragment 1

View file

@ -77,6 +77,7 @@ pub(crate) fn create(cfg: &Config, cookie_jar: Option<&Arc<CookieStoreMutex>>) -
.accepted(accepted)
.require_https(cfg.require_https)
.cookie_jar(cookie_jar.cloned())
.include_fragments(cfg.include_fragments)
.build()
.client()
.context("Failed to create request client")

View file

@ -307,6 +307,11 @@ pub(crate) struct Config {
#[serde(default)]
pub(crate) accept: Option<HashSet<u16>>,
/// Enable the checking of fragments in links.
#[arg(long)]
#[serde(default)]
pub(crate) include_fragments: bool,
/// Website timeout in seconds from connect to response finished
#[arg(short, long, default_value = &TIMEOUT_STR)]
#[serde(default = "timeout")]
@ -426,6 +431,7 @@ impl Config {
output: None;
require_https: false;
cookie_jar: None;
include_fragments: false;
}
if self

View file

@ -222,8 +222,8 @@ mod cli {
.env_clear()
.assert()
.success()
.stdout(contains("3 Total"))
.stdout(contains("3 OK"));
.stdout(contains("4 Total"))
.stdout(contains("4 OK"));
}
#[test]
@ -489,8 +489,8 @@ mod cli {
test_json_output!(
"TEST.md",
MockResponseStats {
total: 11,
successful: 9,
total: 12,
successful: 10,
excludes: 2,
..MockResponseStats::default()
}
@ -518,7 +518,7 @@ mod cli {
// Running the command from the command line will print 9 links,
// because the actual `--dump` command filters out the two
// http(s)://example.com links
assert_eq!(output.lines().count(), 11);
assert_eq!(output.lines().count(), 12);
fs::remove_file(outfile)?;
Ok(())
}
@ -534,7 +534,7 @@ mod cli {
.arg(".*")
.assert()
.success()
.stdout(contains("11 Excluded"));
.stdout(contains("12 Excluded"));
Ok(())
}
@ -1399,4 +1399,30 @@ mod cli {
Ok(())
}
#[test]
fn test_fragments() {
let mut cmd = main_command();
let input = fixtures_path().join("fragments");
cmd.arg("--verbose")
.arg("--include-fragments")
.arg(input)
.assert()
.failure()
.stderr(contains("fixtures/fragments/file1.md#fragment-2"))
.stderr(contains("fixtures/fragments/file2.md#custom-id"))
.stderr(contains("fixtures/fragments/file1.md#missing-fragment"))
.stderr(contains("fixtures/fragments/file2.md#fragment-1"))
.stderr(contains("fixtures/fragments/file1.md#kebab-case-fragment"))
.stderr(contains("fixtures/fragments/file2.md#missing-fragment"))
.stderr(contains("fixtures/fragments/empty_file#fragment"))
.stderr(contains(
"fixtures/fragments/file1.md#kebab-case-fragment-1",
))
.stdout(contains("8 Total"))
.stdout(contains("6 OK"))
// 2 failures because of missing fragments
.stdout(contains("2 Errors"));
}
}

View file

@ -13,7 +13,7 @@
clippy::default_trait_access,
clippy::used_underscore_binding
)]
use std::{collections::HashSet, sync::Arc, time::Duration};
use std::{collections::HashSet, path::Path, sync::Arc, time::Duration};
#[cfg(all(feature = "email-check", feature = "native-tls"))]
use check_if_email_exists::{check_email, CheckEmailInput, Reachable};
@ -22,7 +22,7 @@ use http::{
header::{HeaderMap, HeaderValue, AUTHORIZATION},
StatusCode,
};
use log::debug;
use log::{debug, warn};
use octocrab::Octocrab;
use regex::RegexSet;
use reqwest::{header, redirect, Url};
@ -36,6 +36,7 @@ use crate::{
remap::Remaps,
retry::RetryExt,
types::uri::github::GithubUri,
utils::fragment_checker::FragmentChecker,
BasicAuthCredentials, ErrorKind, Request, Response, Result, Status, Uri,
};
@ -270,6 +271,9 @@ pub struct ClientBuilder {
///
/// See https://docs.rs/reqwest/latest/reqwest/struct.ClientBuilder.html#method.cookie_store
cookie_jar: Option<Arc<CookieStoreMutex>>,
/// Enable the checking of fragments in links.
include_fragments: bool,
}
impl Default for ClientBuilder {
@ -383,6 +387,8 @@ impl ClientBuilder {
accepted: self.accepted,
require_https: self.require_https,
quirks,
include_fragments: self.include_fragments,
fragment_checker: FragmentChecker::new(),
})
}
}
@ -429,6 +435,12 @@ pub struct Client {
/// Override behaviors for certain known issues with special URIs.
quirks: Quirks,
/// Enable the checking of fragments in links.
include_fragments: bool,
/// Caches Fragments
fragment_checker: FragmentChecker,
}
impl Client {
@ -472,7 +484,7 @@ impl Client {
}
let status = match uri.scheme() {
_ if uri.is_file() => self.check_file(uri),
_ if uri.is_file() => self.check_file(uri).await,
_ if uri.is_mail() => self.check_mail(uri).await,
_ => self.check_website(uri, credentials).await?,
};
@ -659,13 +671,30 @@ impl Client {
}
/// Check a `file` URI.
pub fn check_file(&self, uri: &Uri) -> Status {
if let Ok(path) = uri.url.to_file_path() {
if path.exists() {
return Status::Ok(StatusCode::OK);
pub async fn check_file(&self, uri: &Uri) -> Status {
let Ok(path) = uri.url.to_file_path() else {
return ErrorKind::InvalidFilePath(uri.clone()).into();
};
if !path.exists() {
return ErrorKind::InvalidFilePath(uri.clone()).into();
}
if self.include_fragments {
self.check_fragment(&path, uri).await
} else {
Status::Ok(StatusCode::OK)
}
}
/// Checks a `file` URI's fragment.
pub async fn check_fragment(&self, path: &Path, uri: &Uri) -> Status {
match self.fragment_checker.check(path, &uri.url).await {
Ok(true) => Status::Ok(StatusCode::OK),
Ok(false) => ErrorKind::InvalidFragment(uri.clone()).into(),
Err(err) => {
warn!("Skipping fragment check due to the following error: {err}");
Status::Ok(StatusCode::OK)
}
}
ErrorKind::InvalidFilePath(uri.clone()).into()
}
/// Check a mail address, or equivalently a `mailto` URI.

View file

@ -1,4 +1,7 @@
use pulldown_cmark::{Event, Parser, Tag};
//! Extract things from markdown documents
use std::collections::{HashMap, HashSet};
use pulldown_cmark::{Event, Options, Parser, Tag};
use crate::{extract::plaintext::extract_plaintext, types::uri::raw::RawUri};
@ -77,6 +80,79 @@ pub(crate) fn extract_markdown(input: &str, include_verbatim: bool) -> Vec<RawUr
.collect()
}
/// Extract unparsed URL strings from a Markdown string.
pub(crate) fn extract_markdown_fragments(input: &str) -> HashSet<String> {
let mut in_heading = false;
let mut heading = String::new();
let mut id_generator = HeadingIdGenerator::default();
let mut out = HashSet::new();
for event in Parser::new_ext(input, Options::ENABLE_HEADING_ATTRIBUTES) {
match event {
Event::Start(Tag::Heading(..)) => {
in_heading = true;
}
Event::End(Tag::Heading(_level, id, _classes)) => {
if let Some(frag) = id {
out.insert(frag.to_string());
}
if !heading.is_empty() {
let id = id_generator.generate(&heading);
out.insert(id);
heading.clear();
}
in_heading = false;
}
Event::Text(text) => {
if in_heading {
heading.push_str(&text);
};
}
// Silently skip over other events
_ => (),
}
}
out
}
#[derive(Default)]
struct HeadingIdGenerator {
counter: HashMap<String, usize>,
}
impl HeadingIdGenerator {
fn generate(&mut self, heading: &str) -> String {
let mut id = Self::into_kebab_case(heading);
let count = self.counter.entry(id.clone()).or_insert(0);
if *count != 0 {
id = format!("{}-{}", id, *count);
}
*count += 1;
id
}
/// Converts text into kebab case
#[must_use]
fn into_kebab_case(text: &str) -> String {
text.chars()
.filter_map(|ch| {
if ch.is_alphanumeric() || ch == '_' || ch == '-' {
Some(ch.to_ascii_lowercase())
} else if ch.is_whitespace() {
Some('-')
} else {
None
}
})
.collect::<String>()
}
}
#[cfg(test)]
mod tests {
use super::*;
@ -148,12 +224,12 @@ or inline like `https://bar.org` for instance.
#[test]
#[ignore]
fn test_skip_verbatim_html() {
let input = "
let input = "
<code>
http://link.com
</code>
<pre>
Some pre-formatted http://pre.com
Some pre-formatted http://pre.com
</pre>";
let expected = vec![];
@ -161,4 +237,26 @@ Some pre-formatted http://pre.com
let uris = extract_markdown(input, false);
assert_eq!(uris, expected);
}
#[test]
fn test_kebab_case() {
let check = |input, expected| {
let actual = HeadingIdGenerator::into_kebab_case(input);
assert_eq!(actual, expected);
};
check("A Heading", "a-heading");
check(
"This header has a :thumbsup: in it",
"this-header-has-a-thumbsup-in-it",
);
check(
"Header with 한글 characters (using unicode)",
"header-with-한글-characters-using-unicode",
);
check(
"Underscores foo_bar_, dots . and numbers 1.7e-3",
"underscores-foo_bar_-dots--and-numbers-17e-3",
);
check("Many spaces", "many----------spaces");
}
}

View file

@ -1,7 +1,7 @@
use crate::types::{uri::raw::RawUri, FileType, InputContent};
mod html;
mod markdown;
pub mod markdown;
mod plaintext;
use markdown::extract_markdown;

View file

@ -63,6 +63,10 @@ pub enum ErrorKind {
#[error("Cannot find file")]
InvalidFilePath(Uri),
/// The given URI cannot be converted to a file path
#[error("Cannot find fragment")]
InvalidFragment(Uri),
/// The given path cannot be converted to a URI
#[error("Invalid path to URL conversion: {0}")]
InvalidUrlFromPath(PathBuf),
@ -87,7 +91,7 @@ pub enum ErrorKind {
/// The given path does not resolve to a valid file
#[error("Cannot find local file {0}")]
FileNotFound(PathBuf),
InvalidFile(PathBuf),
/// Error while traversing an input directory
#[error("Cannot traverse input directory: {0}")]
@ -132,6 +136,7 @@ pub enum ErrorKind {
/// Basic auth extractor error
#[error("Basic auth extractor error")]
BasicAuthExtractorError(#[from] BasicAuthExtractorError),
/// Cannot load cookies
#[error("Cannot load cookies")]
Cookies(String),
@ -225,6 +230,14 @@ impl PartialEq for ErrorKind {
(Self::Regex(e1), Self::Regex(e2)) => e1.to_string() == e2.to_string(),
(Self::DirTraversal(e1), Self::DirTraversal(e2)) => e1.to_string() == e2.to_string(),
(Self::Channel(_), Self::Channel(_)) => true,
(Self::TooManyRedirects(e1), Self::TooManyRedirects(e2)) => {
e1.to_string() == e2.to_string()
}
(Self::BasicAuthExtractorError(e1), Self::BasicAuthExtractorError(e2)) => {
e1.to_string() == e2.to_string()
}
(Self::Cookies(e1), Self::Cookies(e2)) => e1 == e2,
(Self::InvalidFile(p1), Self::InvalidFile(p2)) => p1 == p2,
_ => false,
}
}
@ -249,13 +262,14 @@ impl Hash for ErrorKind {
Self::GithubRequest(e) => e.to_string().hash(state),
Self::InvalidGithubUrl(s) => s.hash(state),
Self::DirTraversal(e) => e.to_string().hash(state),
Self::FileNotFound(e) => e.to_string_lossy().hash(state),
Self::InvalidFile(e) => e.to_string_lossy().hash(state),
Self::EmptyUrl => "Empty URL".hash(state),
Self::ParseUrl(e, s) => (e.to_string(), s).hash(state),
Self::InvalidURI(u) => u.hash(state),
Self::InvalidUrlFromPath(p) => p.hash(state),
Self::Utf8(e) => e.to_string().hash(state),
Self::InvalidFilePath(u) => u.hash(state),
Self::InvalidFragment(u) => u.hash(state),
Self::UnreachableEmailAddress(u, ..) => u.hash(state),
Self::InsecureURL(u, ..) => u.hash(state),
Self::InvalidBase(base, e) => (base, e).hash(state),

View file

@ -153,7 +153,7 @@ impl Input {
// and exit early if it does
// This check might not be sufficient to cover all cases
// but it catches the most common ones
return Err(ErrorKind::FileNotFound(path));
return Err(ErrorKind::InvalidFile(path));
} else {
// Invalid path; check if a valid URL can be constructed from the input
// by prefixing it with a `http://` scheme.
@ -442,10 +442,7 @@ mod tests {
let input = Input::new(test_file, None, false, None);
assert!(input.is_err());
assert!(matches!(
input,
Err(ErrorKind::FileNotFound(PathBuf { .. }))
));
assert!(matches!(input, Err(ErrorKind::InvalidFile(PathBuf { .. }))));
}
#[test]

View file

@ -0,0 +1,75 @@
use std::{
collections::{hash_map::Entry, HashMap, HashSet},
path::Path,
sync::Arc,
};
use crate::{extract::markdown::extract_markdown_fragments, types::FileType, Result};
use tokio::{fs, sync::Mutex};
use url::Url;
/// Holds a cache of fragments for a given URL.
///
/// Fragments, also known as anchors, are used to link to a specific
/// part of a page. For example, the URL `https://example.com#foo`
/// will link to the element with the `id` of `foo`.
///
/// This cache is used to avoid having to re-parse the same file
/// multiple times when checking if a given URL contains a fragment.
///
/// The cache is stored in a `HashMap` with the URL as the key and
/// a `HashSet` of fragments as the value.
#[derive(Default, Clone, Debug)]
pub(crate) struct FragmentChecker {
cache: Arc<Mutex<HashMap<String, HashSet<String>>>>,
}
impl FragmentChecker {
/// Creates a new `FragmentChecker`.
pub(crate) fn new() -> Self {
Self {
cache: Arc::default(),
}
}
/// Checks the given path contains the given fragment.
///
/// Returns false, if there is a fragment in the link and the path is to a markdown file which
/// doesn't contain the given fragment.
///
/// In all other cases, returns true.
pub(crate) async fn check(&self, path: &Path, url: &Url) -> Result<bool> {
match (FileType::from(path), url.fragment()) {
(FileType::Markdown, Some(fragment)) => {
let url_without_frag = Self::remove_fragment(url.clone());
self.populate_cache_if_vacant(url_without_frag, path, fragment)
.await
}
_ => Ok(true),
}
}
fn remove_fragment(mut url: Url) -> String {
url.set_fragment(None);
url.into()
}
/// Populates the fragment cache with the given URL if it
/// is not already in the cache.
async fn populate_cache_if_vacant(
&self,
url_without_frag: String,
path: &Path,
fragment: &str,
) -> Result<bool> {
let mut fragment_cache = self.cache.lock().await;
match fragment_cache.entry(url_without_frag.clone()) {
Entry::Vacant(entry) => {
let content = fs::read_to_string(path).await?;
let file_frags = extract_markdown_fragments(&content);
Ok(entry.insert(file_frags).contains(fragment))
}
Entry::Occupied(entry) => Ok(entry.get().contains(fragment)),
}
}
}

View file

@ -1,3 +1,4 @@
pub(crate) mod fragment_checker;
pub(crate) mod path;
pub(crate) mod request;
pub(crate) mod reqwest;

View file

@ -45,7 +45,7 @@ pub(crate) fn resolve(src: &Path, dst: &Path, base: &Option<Base>) -> Result<Opt
relative if dst.is_relative() => {
// Find `dst` in the parent directory of `src`
let Some(parent) = src.parent() else {
return Err(ErrorKind::FileNotFound(relative.to_path_buf()))
return Err(ErrorKind::InvalidFile(relative.to_path_buf()))
};
parent.join(relative)
}
@ -62,7 +62,7 @@ pub(crate) fn resolve(src: &Path, dst: &Path, base: &Option<Base>) -> Result<Opt
};
join(dir.to_path_buf(), absolute)
}
_ => return Err(ErrorKind::FileNotFound(dst.to_path_buf())),
_ => return Err(ErrorKind::InvalidFile(dst.to_path_buf())),
};
Ok(Some(absolute_path(resolved)))
}

View file

@ -70,10 +70,19 @@ pub(crate) fn create(
credentials,
)))
} else if let InputSource::FsPath(root) = &input_content.source {
if is_anchor {
// Silently ignore anchor links for now
Ok(None)
} else if let Some(url) = create_uri_from_path(root, &text, base)? {
let path = if is_anchor {
match root.file_name() {
Some(file_name) => match file_name.to_str() {
Some(valid_str) => valid_str.to_string() + &text,
None => return Err(ErrorKind::InvalidFile(root.clone())),
},
None => return Err(ErrorKind::InvalidFile(root.clone())),
}
} else {
text
};
if let Some(url) = create_uri_from_path(root, &path, base)? {
let uri = Uri { url };
let credentials = credentials(extractor, &uri);
@ -122,7 +131,7 @@ fn construct_url(base: &Option<Url>, text: &str) -> Option<Result<Url>> {
}
fn create_uri_from_path(src: &Path, dst: &str, base: &Option<Base>) -> Result<Option<Url>> {
let dst = url::remove_get_params_and_fragment(dst);
let (dst, frag) = url::remove_get_params_and_seperate_fragment(dst);
// Avoid double-encoding already encoded destination paths by removing any
// potential encoding (e.g. `web%20site` becomes `web site`).
// That's because Url::from_file_path will encode the full URL in the end.
@ -136,6 +145,10 @@ fn create_uri_from_path(src: &Path, dst: &str, base: &Option<Base>) -> Result<Op
let resolved = path::resolve(src, &PathBuf::from(&*decoded), base)?;
match resolved {
Some(path) => Url::from_file_path(&path)
.map(|mut url| {
url.set_fragment(frag);
url
})
.map(Some)
.map_err(|_e| ErrorKind::InvalidUrlFromPath(path)),
None => Ok(None),

View file

@ -4,18 +4,18 @@ use once_cell::sync::Lazy;
static LINK_FINDER: Lazy<LinkFinder> = Lazy::new(LinkFinder::new);
/// Remove all GET parameters from a URL.
/// Remove all GET parameters from a URL and seperates out the fragment.
/// The link is not a URL but a String as it may not have a base domain.
pub(crate) fn remove_get_params_and_fragment(url: &str) -> &str {
let path = match url.split_once('#') {
Some((path_without_fragment, _fragment)) => path_without_fragment,
None => url,
pub(crate) fn remove_get_params_and_seperate_fragment(url: &str) -> (&str, Option<&str>) {
let (path, frag) = match url.split_once('#') {
Some((path, fragment)) => (path, Some(fragment)),
None => (url, None),
};
let path = match path.split_once('?') {
Some((path_without_params, _params)) => path_without_params,
None => path,
};
path
(path, frag)
}
// Use `LinkFinder` to offload the raw link searching in plaintext
@ -29,47 +29,49 @@ mod test_fs_tree {
#[test]
fn test_remove_get_params_and_fragment() {
assert_eq!(remove_get_params_and_fragment("/"), "/");
assert_eq!(remove_get_params_and_seperate_fragment("/"), ("/", None));
assert_eq!(
remove_get_params_and_fragment("index.html?foo=bar"),
"index.html"
remove_get_params_and_seperate_fragment("index.html?foo=bar"),
("index.html", None)
);
assert_eq!(
remove_get_params_and_fragment("/index.html?foo=bar"),
"/index.html"
remove_get_params_and_seperate_fragment("/index.html?foo=bar"),
("/index.html", None)
);
assert_eq!(
remove_get_params_and_fragment("/index.html?foo=bar&baz=zorx?bla=blub"),
"/index.html"
remove_get_params_and_seperate_fragment("/index.html?foo=bar&baz=zorx?bla=blub"),
("/index.html", None)
);
assert_eq!(
remove_get_params_and_fragment("https://example.com/index.html?foo=bar"),
"https://example.com/index.html"
remove_get_params_and_seperate_fragment("https://example.com/index.html?foo=bar"),
("https://example.com/index.html", None)
);
assert_eq!(
remove_get_params_and_fragment("test.png?foo=bar"),
"test.png"
remove_get_params_and_seperate_fragment("test.png?foo=bar"),
("test.png", None)
);
assert_eq!(
remove_get_params_and_fragment("https://example.com/index.html#anchor"),
"https://example.com/index.html"
remove_get_params_and_seperate_fragment("https://example.com/index.html#anchor"),
("https://example.com/index.html", Some("anchor"))
);
assert_eq!(
remove_get_params_and_fragment("https://example.com/index.html?foo=bar#anchor"),
"https://example.com/index.html"
remove_get_params_and_seperate_fragment(
"https://example.com/index.html?foo=bar#anchor"
),
("https://example.com/index.html", Some("anchor"))
);
assert_eq!(
remove_get_params_and_fragment("test.png?foo=bar#anchor"),
"test.png"
remove_get_params_and_seperate_fragment("test.png?foo=bar#anchor"),
("test.png", Some("anchor"))
);
assert_eq!(
remove_get_params_and_fragment("test.png#anchor?anchor!?"),
"test.png"
remove_get_params_and_seperate_fragment("test.png#anchor?anchor!?"),
("test.png", Some("anchor?anchor!?"))
);
assert_eq!(
remove_get_params_and_fragment("test.png?foo=bar#anchor?anchor!"),
"test.png"
remove_get_params_and_seperate_fragment("test.png?foo=bar#anchor?anchor!"),
("test.png", Some("anchor?anchor!"))
);
}
}