mirror of
https://github.com/Hopiu/lychee.git
synced 2026-03-16 20:50:25 +00:00
Introduce fragment checking for links to markdown files. (#1126)
- Implemented enhancements to include fragments in file links - Checked links to markdown files with fragments, generating unique kebab case and heading attributes. - Made code more idiomatic and added an integration test. - Updated documentation. - Fixed issues with heading attributes fragments and ensured proper handling of file errors.
This commit is contained in:
parent
518fb27db9
commit
8e6369377c
18 changed files with 374 additions and 60 deletions
|
|
@ -386,6 +386,9 @@ Options:
|
|||
-a, --accept <ACCEPT>
|
||||
Comma-separated list of accepted status codes for valid links
|
||||
|
||||
--include-fragments
|
||||
Enable the checking of fragments in links
|
||||
|
||||
-t, --timeout <TIMEOUT>
|
||||
Website timeout in seconds from connect to response finished
|
||||
|
||||
|
|
|
|||
2
fixtures/TEST.md
vendored
2
fixtures/TEST.md
vendored
|
|
@ -1,7 +1,7 @@
|
|||
Check file link
|
||||

|
||||
|
||||

|
||||

|
||||
|
||||
Normal link, which should work as expected.
|
||||
[Wikipedia](https://en.wikipedia.org/wiki/Static_program_analysis)
|
||||
|
|
|
|||
0
fixtures/fragments/empty_file
vendored
Normal file
0
fixtures/fragments/empty_file
vendored
Normal file
42
fixtures/fragments/file1.md
vendored
Normal file
42
fixtures/fragments/file1.md
vendored
Normal file
|
|
@ -0,0 +1,42 @@
|
|||
# Fragment Test File 1
|
||||
|
||||
This is a test file for the fragment loader.
|
||||
|
||||
## Fragment 1
|
||||
|
||||
[Link to fragment 2](#fragment-2)
|
||||
|
||||
## Fragment 2
|
||||
|
||||
[Link to fragment 1 in file2](file2.md#fragment-1)
|
||||
|
||||
## Fragment 3
|
||||
|
||||
[Link to missing fragment](#missing-fragment)
|
||||
|
||||
[Link to missing fragment in file2](file2.md#missing-fragment)
|
||||
|
||||
## HTML Fragments
|
||||
|
||||
Explicit fragment links are currently not supported.
|
||||
Therefore we put the test into a code block for now to prevent false positives.
|
||||
|
||||
```
|
||||
<a name="explicit-fragment"></a>
|
||||
|
||||
[Link to explicit fragment](#explicit-fragment)
|
||||
```
|
||||
|
||||
## Custom Fragments
|
||||
|
||||
[Custom fragment id in file2](file2.md#custom-id)
|
||||
|
||||
# Kebab Case Fragment
|
||||
|
||||
[Link to kebab-case fragment](#kebab-case-fragment)
|
||||
|
||||
[Link to second kebab-case fragment](#kebab-case-fragment-1)
|
||||
|
||||
# Kebab Case Fragment
|
||||
|
||||
[Link to another file type](empty_file#fragment)
|
||||
7
fixtures/fragments/file2.md
vendored
Normal file
7
fixtures/fragments/file2.md
vendored
Normal file
|
|
@ -0,0 +1,7 @@
|
|||
# Fragment Test File 2
|
||||
|
||||
This is a test file for the fragment loader.
|
||||
|
||||
### Some other heading with custom id {#custom-id}
|
||||
|
||||
#### Fragment 1
|
||||
|
|
@ -77,6 +77,7 @@ pub(crate) fn create(cfg: &Config, cookie_jar: Option<&Arc<CookieStoreMutex>>) -
|
|||
.accepted(accepted)
|
||||
.require_https(cfg.require_https)
|
||||
.cookie_jar(cookie_jar.cloned())
|
||||
.include_fragments(cfg.include_fragments)
|
||||
.build()
|
||||
.client()
|
||||
.context("Failed to create request client")
|
||||
|
|
|
|||
|
|
@ -307,6 +307,11 @@ pub(crate) struct Config {
|
|||
#[serde(default)]
|
||||
pub(crate) accept: Option<HashSet<u16>>,
|
||||
|
||||
/// Enable the checking of fragments in links.
|
||||
#[arg(long)]
|
||||
#[serde(default)]
|
||||
pub(crate) include_fragments: bool,
|
||||
|
||||
/// Website timeout in seconds from connect to response finished
|
||||
#[arg(short, long, default_value = &TIMEOUT_STR)]
|
||||
#[serde(default = "timeout")]
|
||||
|
|
@ -426,6 +431,7 @@ impl Config {
|
|||
output: None;
|
||||
require_https: false;
|
||||
cookie_jar: None;
|
||||
include_fragments: false;
|
||||
}
|
||||
|
||||
if self
|
||||
|
|
|
|||
|
|
@ -222,8 +222,8 @@ mod cli {
|
|||
.env_clear()
|
||||
.assert()
|
||||
.success()
|
||||
.stdout(contains("3 Total"))
|
||||
.stdout(contains("3 OK"));
|
||||
.stdout(contains("4 Total"))
|
||||
.stdout(contains("4 OK"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
|
@ -489,8 +489,8 @@ mod cli {
|
|||
test_json_output!(
|
||||
"TEST.md",
|
||||
MockResponseStats {
|
||||
total: 11,
|
||||
successful: 9,
|
||||
total: 12,
|
||||
successful: 10,
|
||||
excludes: 2,
|
||||
..MockResponseStats::default()
|
||||
}
|
||||
|
|
@ -518,7 +518,7 @@ mod cli {
|
|||
// Running the command from the command line will print 9 links,
|
||||
// because the actual `--dump` command filters out the two
|
||||
// http(s)://example.com links
|
||||
assert_eq!(output.lines().count(), 11);
|
||||
assert_eq!(output.lines().count(), 12);
|
||||
fs::remove_file(outfile)?;
|
||||
Ok(())
|
||||
}
|
||||
|
|
@ -534,7 +534,7 @@ mod cli {
|
|||
.arg(".*")
|
||||
.assert()
|
||||
.success()
|
||||
.stdout(contains("11 Excluded"));
|
||||
.stdout(contains("12 Excluded"));
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
|
@ -1399,4 +1399,30 @@ mod cli {
|
|||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_fragments() {
|
||||
let mut cmd = main_command();
|
||||
let input = fixtures_path().join("fragments");
|
||||
|
||||
cmd.arg("--verbose")
|
||||
.arg("--include-fragments")
|
||||
.arg(input)
|
||||
.assert()
|
||||
.failure()
|
||||
.stderr(contains("fixtures/fragments/file1.md#fragment-2"))
|
||||
.stderr(contains("fixtures/fragments/file2.md#custom-id"))
|
||||
.stderr(contains("fixtures/fragments/file1.md#missing-fragment"))
|
||||
.stderr(contains("fixtures/fragments/file2.md#fragment-1"))
|
||||
.stderr(contains("fixtures/fragments/file1.md#kebab-case-fragment"))
|
||||
.stderr(contains("fixtures/fragments/file2.md#missing-fragment"))
|
||||
.stderr(contains("fixtures/fragments/empty_file#fragment"))
|
||||
.stderr(contains(
|
||||
"fixtures/fragments/file1.md#kebab-case-fragment-1",
|
||||
))
|
||||
.stdout(contains("8 Total"))
|
||||
.stdout(contains("6 OK"))
|
||||
// 2 failures because of missing fragments
|
||||
.stdout(contains("2 Errors"));
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -13,7 +13,7 @@
|
|||
clippy::default_trait_access,
|
||||
clippy::used_underscore_binding
|
||||
)]
|
||||
use std::{collections::HashSet, sync::Arc, time::Duration};
|
||||
use std::{collections::HashSet, path::Path, sync::Arc, time::Duration};
|
||||
|
||||
#[cfg(all(feature = "email-check", feature = "native-tls"))]
|
||||
use check_if_email_exists::{check_email, CheckEmailInput, Reachable};
|
||||
|
|
@ -22,7 +22,7 @@ use http::{
|
|||
header::{HeaderMap, HeaderValue, AUTHORIZATION},
|
||||
StatusCode,
|
||||
};
|
||||
use log::debug;
|
||||
use log::{debug, warn};
|
||||
use octocrab::Octocrab;
|
||||
use regex::RegexSet;
|
||||
use reqwest::{header, redirect, Url};
|
||||
|
|
@ -36,6 +36,7 @@ use crate::{
|
|||
remap::Remaps,
|
||||
retry::RetryExt,
|
||||
types::uri::github::GithubUri,
|
||||
utils::fragment_checker::FragmentChecker,
|
||||
BasicAuthCredentials, ErrorKind, Request, Response, Result, Status, Uri,
|
||||
};
|
||||
|
||||
|
|
@ -270,6 +271,9 @@ pub struct ClientBuilder {
|
|||
///
|
||||
/// See https://docs.rs/reqwest/latest/reqwest/struct.ClientBuilder.html#method.cookie_store
|
||||
cookie_jar: Option<Arc<CookieStoreMutex>>,
|
||||
|
||||
/// Enable the checking of fragments in links.
|
||||
include_fragments: bool,
|
||||
}
|
||||
|
||||
impl Default for ClientBuilder {
|
||||
|
|
@ -383,6 +387,8 @@ impl ClientBuilder {
|
|||
accepted: self.accepted,
|
||||
require_https: self.require_https,
|
||||
quirks,
|
||||
include_fragments: self.include_fragments,
|
||||
fragment_checker: FragmentChecker::new(),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
|
@ -429,6 +435,12 @@ pub struct Client {
|
|||
|
||||
/// Override behaviors for certain known issues with special URIs.
|
||||
quirks: Quirks,
|
||||
|
||||
/// Enable the checking of fragments in links.
|
||||
include_fragments: bool,
|
||||
|
||||
/// Caches Fragments
|
||||
fragment_checker: FragmentChecker,
|
||||
}
|
||||
|
||||
impl Client {
|
||||
|
|
@ -472,7 +484,7 @@ impl Client {
|
|||
}
|
||||
|
||||
let status = match uri.scheme() {
|
||||
_ if uri.is_file() => self.check_file(uri),
|
||||
_ if uri.is_file() => self.check_file(uri).await,
|
||||
_ if uri.is_mail() => self.check_mail(uri).await,
|
||||
_ => self.check_website(uri, credentials).await?,
|
||||
};
|
||||
|
|
@ -659,13 +671,30 @@ impl Client {
|
|||
}
|
||||
|
||||
/// Check a `file` URI.
|
||||
pub fn check_file(&self, uri: &Uri) -> Status {
|
||||
if let Ok(path) = uri.url.to_file_path() {
|
||||
if path.exists() {
|
||||
return Status::Ok(StatusCode::OK);
|
||||
pub async fn check_file(&self, uri: &Uri) -> Status {
|
||||
let Ok(path) = uri.url.to_file_path() else {
|
||||
return ErrorKind::InvalidFilePath(uri.clone()).into();
|
||||
};
|
||||
if !path.exists() {
|
||||
return ErrorKind::InvalidFilePath(uri.clone()).into();
|
||||
}
|
||||
if self.include_fragments {
|
||||
self.check_fragment(&path, uri).await
|
||||
} else {
|
||||
Status::Ok(StatusCode::OK)
|
||||
}
|
||||
}
|
||||
|
||||
/// Checks a `file` URI's fragment.
|
||||
pub async fn check_fragment(&self, path: &Path, uri: &Uri) -> Status {
|
||||
match self.fragment_checker.check(path, &uri.url).await {
|
||||
Ok(true) => Status::Ok(StatusCode::OK),
|
||||
Ok(false) => ErrorKind::InvalidFragment(uri.clone()).into(),
|
||||
Err(err) => {
|
||||
warn!("Skipping fragment check due to the following error: {err}");
|
||||
Status::Ok(StatusCode::OK)
|
||||
}
|
||||
}
|
||||
ErrorKind::InvalidFilePath(uri.clone()).into()
|
||||
}
|
||||
|
||||
/// Check a mail address, or equivalently a `mailto` URI.
|
||||
|
|
|
|||
|
|
@ -1,4 +1,7 @@
|
|||
use pulldown_cmark::{Event, Parser, Tag};
|
||||
//! Extract things from markdown documents
|
||||
use std::collections::{HashMap, HashSet};
|
||||
|
||||
use pulldown_cmark::{Event, Options, Parser, Tag};
|
||||
|
||||
use crate::{extract::plaintext::extract_plaintext, types::uri::raw::RawUri};
|
||||
|
||||
|
|
@ -77,6 +80,79 @@ pub(crate) fn extract_markdown(input: &str, include_verbatim: bool) -> Vec<RawUr
|
|||
.collect()
|
||||
}
|
||||
|
||||
/// Extract unparsed URL strings from a Markdown string.
|
||||
pub(crate) fn extract_markdown_fragments(input: &str) -> HashSet<String> {
|
||||
let mut in_heading = false;
|
||||
let mut heading = String::new();
|
||||
let mut id_generator = HeadingIdGenerator::default();
|
||||
|
||||
let mut out = HashSet::new();
|
||||
|
||||
for event in Parser::new_ext(input, Options::ENABLE_HEADING_ATTRIBUTES) {
|
||||
match event {
|
||||
Event::Start(Tag::Heading(..)) => {
|
||||
in_heading = true;
|
||||
}
|
||||
Event::End(Tag::Heading(_level, id, _classes)) => {
|
||||
if let Some(frag) = id {
|
||||
out.insert(frag.to_string());
|
||||
}
|
||||
|
||||
if !heading.is_empty() {
|
||||
let id = id_generator.generate(&heading);
|
||||
out.insert(id);
|
||||
heading.clear();
|
||||
}
|
||||
|
||||
in_heading = false;
|
||||
}
|
||||
Event::Text(text) => {
|
||||
if in_heading {
|
||||
heading.push_str(&text);
|
||||
};
|
||||
}
|
||||
|
||||
// Silently skip over other events
|
||||
_ => (),
|
||||
}
|
||||
}
|
||||
out
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
struct HeadingIdGenerator {
|
||||
counter: HashMap<String, usize>,
|
||||
}
|
||||
|
||||
impl HeadingIdGenerator {
|
||||
fn generate(&mut self, heading: &str) -> String {
|
||||
let mut id = Self::into_kebab_case(heading);
|
||||
let count = self.counter.entry(id.clone()).or_insert(0);
|
||||
if *count != 0 {
|
||||
id = format!("{}-{}", id, *count);
|
||||
}
|
||||
*count += 1;
|
||||
|
||||
id
|
||||
}
|
||||
|
||||
/// Converts text into kebab case
|
||||
#[must_use]
|
||||
fn into_kebab_case(text: &str) -> String {
|
||||
text.chars()
|
||||
.filter_map(|ch| {
|
||||
if ch.is_alphanumeric() || ch == '_' || ch == '-' {
|
||||
Some(ch.to_ascii_lowercase())
|
||||
} else if ch.is_whitespace() {
|
||||
Some('-')
|
||||
} else {
|
||||
None
|
||||
}
|
||||
})
|
||||
.collect::<String>()
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
|
@ -148,12 +224,12 @@ or inline like `https://bar.org` for instance.
|
|||
#[test]
|
||||
#[ignore]
|
||||
fn test_skip_verbatim_html() {
|
||||
let input = "
|
||||
let input = "
|
||||
<code>
|
||||
http://link.com
|
||||
</code>
|
||||
<pre>
|
||||
Some pre-formatted http://pre.com
|
||||
Some pre-formatted http://pre.com
|
||||
</pre>";
|
||||
|
||||
let expected = vec![];
|
||||
|
|
@ -161,4 +237,26 @@ Some pre-formatted http://pre.com
|
|||
let uris = extract_markdown(input, false);
|
||||
assert_eq!(uris, expected);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_kebab_case() {
|
||||
let check = |input, expected| {
|
||||
let actual = HeadingIdGenerator::into_kebab_case(input);
|
||||
assert_eq!(actual, expected);
|
||||
};
|
||||
check("A Heading", "a-heading");
|
||||
check(
|
||||
"This header has a :thumbsup: in it",
|
||||
"this-header-has-a-thumbsup-in-it",
|
||||
);
|
||||
check(
|
||||
"Header with 한글 characters (using unicode)",
|
||||
"header-with-한글-characters-using-unicode",
|
||||
);
|
||||
check(
|
||||
"Underscores foo_bar_, dots . and numbers 1.7e-3",
|
||||
"underscores-foo_bar_-dots--and-numbers-17e-3",
|
||||
);
|
||||
check("Many spaces", "many----------spaces");
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
use crate::types::{uri::raw::RawUri, FileType, InputContent};
|
||||
|
||||
mod html;
|
||||
mod markdown;
|
||||
pub mod markdown;
|
||||
mod plaintext;
|
||||
|
||||
use markdown::extract_markdown;
|
||||
|
|
|
|||
|
|
@ -63,6 +63,10 @@ pub enum ErrorKind {
|
|||
#[error("Cannot find file")]
|
||||
InvalidFilePath(Uri),
|
||||
|
||||
/// The given URI cannot be converted to a file path
|
||||
#[error("Cannot find fragment")]
|
||||
InvalidFragment(Uri),
|
||||
|
||||
/// The given path cannot be converted to a URI
|
||||
#[error("Invalid path to URL conversion: {0}")]
|
||||
InvalidUrlFromPath(PathBuf),
|
||||
|
|
@ -87,7 +91,7 @@ pub enum ErrorKind {
|
|||
|
||||
/// The given path does not resolve to a valid file
|
||||
#[error("Cannot find local file {0}")]
|
||||
FileNotFound(PathBuf),
|
||||
InvalidFile(PathBuf),
|
||||
|
||||
/// Error while traversing an input directory
|
||||
#[error("Cannot traverse input directory: {0}")]
|
||||
|
|
@ -132,6 +136,7 @@ pub enum ErrorKind {
|
|||
/// Basic auth extractor error
|
||||
#[error("Basic auth extractor error")]
|
||||
BasicAuthExtractorError(#[from] BasicAuthExtractorError),
|
||||
|
||||
/// Cannot load cookies
|
||||
#[error("Cannot load cookies")]
|
||||
Cookies(String),
|
||||
|
|
@ -225,6 +230,14 @@ impl PartialEq for ErrorKind {
|
|||
(Self::Regex(e1), Self::Regex(e2)) => e1.to_string() == e2.to_string(),
|
||||
(Self::DirTraversal(e1), Self::DirTraversal(e2)) => e1.to_string() == e2.to_string(),
|
||||
(Self::Channel(_), Self::Channel(_)) => true,
|
||||
(Self::TooManyRedirects(e1), Self::TooManyRedirects(e2)) => {
|
||||
e1.to_string() == e2.to_string()
|
||||
}
|
||||
(Self::BasicAuthExtractorError(e1), Self::BasicAuthExtractorError(e2)) => {
|
||||
e1.to_string() == e2.to_string()
|
||||
}
|
||||
(Self::Cookies(e1), Self::Cookies(e2)) => e1 == e2,
|
||||
(Self::InvalidFile(p1), Self::InvalidFile(p2)) => p1 == p2,
|
||||
_ => false,
|
||||
}
|
||||
}
|
||||
|
|
@ -249,13 +262,14 @@ impl Hash for ErrorKind {
|
|||
Self::GithubRequest(e) => e.to_string().hash(state),
|
||||
Self::InvalidGithubUrl(s) => s.hash(state),
|
||||
Self::DirTraversal(e) => e.to_string().hash(state),
|
||||
Self::FileNotFound(e) => e.to_string_lossy().hash(state),
|
||||
Self::InvalidFile(e) => e.to_string_lossy().hash(state),
|
||||
Self::EmptyUrl => "Empty URL".hash(state),
|
||||
Self::ParseUrl(e, s) => (e.to_string(), s).hash(state),
|
||||
Self::InvalidURI(u) => u.hash(state),
|
||||
Self::InvalidUrlFromPath(p) => p.hash(state),
|
||||
Self::Utf8(e) => e.to_string().hash(state),
|
||||
Self::InvalidFilePath(u) => u.hash(state),
|
||||
Self::InvalidFragment(u) => u.hash(state),
|
||||
Self::UnreachableEmailAddress(u, ..) => u.hash(state),
|
||||
Self::InsecureURL(u, ..) => u.hash(state),
|
||||
Self::InvalidBase(base, e) => (base, e).hash(state),
|
||||
|
|
|
|||
|
|
@ -153,7 +153,7 @@ impl Input {
|
|||
// and exit early if it does
|
||||
// This check might not be sufficient to cover all cases
|
||||
// but it catches the most common ones
|
||||
return Err(ErrorKind::FileNotFound(path));
|
||||
return Err(ErrorKind::InvalidFile(path));
|
||||
} else {
|
||||
// Invalid path; check if a valid URL can be constructed from the input
|
||||
// by prefixing it with a `http://` scheme.
|
||||
|
|
@ -442,10 +442,7 @@ mod tests {
|
|||
|
||||
let input = Input::new(test_file, None, false, None);
|
||||
assert!(input.is_err());
|
||||
assert!(matches!(
|
||||
input,
|
||||
Err(ErrorKind::FileNotFound(PathBuf { .. }))
|
||||
));
|
||||
assert!(matches!(input, Err(ErrorKind::InvalidFile(PathBuf { .. }))));
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
|
|
|||
75
lychee-lib/src/utils/fragment_checker.rs
Normal file
75
lychee-lib/src/utils/fragment_checker.rs
Normal file
|
|
@ -0,0 +1,75 @@
|
|||
use std::{
|
||||
collections::{hash_map::Entry, HashMap, HashSet},
|
||||
path::Path,
|
||||
sync::Arc,
|
||||
};
|
||||
|
||||
use crate::{extract::markdown::extract_markdown_fragments, types::FileType, Result};
|
||||
use tokio::{fs, sync::Mutex};
|
||||
use url::Url;
|
||||
|
||||
/// Holds a cache of fragments for a given URL.
|
||||
///
|
||||
/// Fragments, also known as anchors, are used to link to a specific
|
||||
/// part of a page. For example, the URL `https://example.com#foo`
|
||||
/// will link to the element with the `id` of `foo`.
|
||||
///
|
||||
/// This cache is used to avoid having to re-parse the same file
|
||||
/// multiple times when checking if a given URL contains a fragment.
|
||||
///
|
||||
/// The cache is stored in a `HashMap` with the URL as the key and
|
||||
/// a `HashSet` of fragments as the value.
|
||||
#[derive(Default, Clone, Debug)]
|
||||
pub(crate) struct FragmentChecker {
|
||||
cache: Arc<Mutex<HashMap<String, HashSet<String>>>>,
|
||||
}
|
||||
|
||||
impl FragmentChecker {
|
||||
/// Creates a new `FragmentChecker`.
|
||||
pub(crate) fn new() -> Self {
|
||||
Self {
|
||||
cache: Arc::default(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Checks the given path contains the given fragment.
|
||||
///
|
||||
/// Returns false, if there is a fragment in the link and the path is to a markdown file which
|
||||
/// doesn't contain the given fragment.
|
||||
///
|
||||
/// In all other cases, returns true.
|
||||
pub(crate) async fn check(&self, path: &Path, url: &Url) -> Result<bool> {
|
||||
match (FileType::from(path), url.fragment()) {
|
||||
(FileType::Markdown, Some(fragment)) => {
|
||||
let url_without_frag = Self::remove_fragment(url.clone());
|
||||
self.populate_cache_if_vacant(url_without_frag, path, fragment)
|
||||
.await
|
||||
}
|
||||
_ => Ok(true),
|
||||
}
|
||||
}
|
||||
|
||||
fn remove_fragment(mut url: Url) -> String {
|
||||
url.set_fragment(None);
|
||||
url.into()
|
||||
}
|
||||
|
||||
/// Populates the fragment cache with the given URL if it
|
||||
/// is not already in the cache.
|
||||
async fn populate_cache_if_vacant(
|
||||
&self,
|
||||
url_without_frag: String,
|
||||
path: &Path,
|
||||
fragment: &str,
|
||||
) -> Result<bool> {
|
||||
let mut fragment_cache = self.cache.lock().await;
|
||||
match fragment_cache.entry(url_without_frag.clone()) {
|
||||
Entry::Vacant(entry) => {
|
||||
let content = fs::read_to_string(path).await?;
|
||||
let file_frags = extract_markdown_fragments(&content);
|
||||
Ok(entry.insert(file_frags).contains(fragment))
|
||||
}
|
||||
Entry::Occupied(entry) => Ok(entry.get().contains(fragment)),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -1,3 +1,4 @@
|
|||
pub(crate) mod fragment_checker;
|
||||
pub(crate) mod path;
|
||||
pub(crate) mod request;
|
||||
pub(crate) mod reqwest;
|
||||
|
|
|
|||
|
|
@ -45,7 +45,7 @@ pub(crate) fn resolve(src: &Path, dst: &Path, base: &Option<Base>) -> Result<Opt
|
|||
relative if dst.is_relative() => {
|
||||
// Find `dst` in the parent directory of `src`
|
||||
let Some(parent) = src.parent() else {
|
||||
return Err(ErrorKind::FileNotFound(relative.to_path_buf()))
|
||||
return Err(ErrorKind::InvalidFile(relative.to_path_buf()))
|
||||
};
|
||||
parent.join(relative)
|
||||
}
|
||||
|
|
@ -62,7 +62,7 @@ pub(crate) fn resolve(src: &Path, dst: &Path, base: &Option<Base>) -> Result<Opt
|
|||
};
|
||||
join(dir.to_path_buf(), absolute)
|
||||
}
|
||||
_ => return Err(ErrorKind::FileNotFound(dst.to_path_buf())),
|
||||
_ => return Err(ErrorKind::InvalidFile(dst.to_path_buf())),
|
||||
};
|
||||
Ok(Some(absolute_path(resolved)))
|
||||
}
|
||||
|
|
|
|||
|
|
@ -70,10 +70,19 @@ pub(crate) fn create(
|
|||
credentials,
|
||||
)))
|
||||
} else if let InputSource::FsPath(root) = &input_content.source {
|
||||
if is_anchor {
|
||||
// Silently ignore anchor links for now
|
||||
Ok(None)
|
||||
} else if let Some(url) = create_uri_from_path(root, &text, base)? {
|
||||
let path = if is_anchor {
|
||||
match root.file_name() {
|
||||
Some(file_name) => match file_name.to_str() {
|
||||
Some(valid_str) => valid_str.to_string() + &text,
|
||||
None => return Err(ErrorKind::InvalidFile(root.clone())),
|
||||
},
|
||||
None => return Err(ErrorKind::InvalidFile(root.clone())),
|
||||
}
|
||||
} else {
|
||||
text
|
||||
};
|
||||
|
||||
if let Some(url) = create_uri_from_path(root, &path, base)? {
|
||||
let uri = Uri { url };
|
||||
let credentials = credentials(extractor, &uri);
|
||||
|
||||
|
|
@ -122,7 +131,7 @@ fn construct_url(base: &Option<Url>, text: &str) -> Option<Result<Url>> {
|
|||
}
|
||||
|
||||
fn create_uri_from_path(src: &Path, dst: &str, base: &Option<Base>) -> Result<Option<Url>> {
|
||||
let dst = url::remove_get_params_and_fragment(dst);
|
||||
let (dst, frag) = url::remove_get_params_and_seperate_fragment(dst);
|
||||
// Avoid double-encoding already encoded destination paths by removing any
|
||||
// potential encoding (e.g. `web%20site` becomes `web site`).
|
||||
// That's because Url::from_file_path will encode the full URL in the end.
|
||||
|
|
@ -136,6 +145,10 @@ fn create_uri_from_path(src: &Path, dst: &str, base: &Option<Base>) -> Result<Op
|
|||
let resolved = path::resolve(src, &PathBuf::from(&*decoded), base)?;
|
||||
match resolved {
|
||||
Some(path) => Url::from_file_path(&path)
|
||||
.map(|mut url| {
|
||||
url.set_fragment(frag);
|
||||
url
|
||||
})
|
||||
.map(Some)
|
||||
.map_err(|_e| ErrorKind::InvalidUrlFromPath(path)),
|
||||
None => Ok(None),
|
||||
|
|
|
|||
|
|
@ -4,18 +4,18 @@ use once_cell::sync::Lazy;
|
|||
|
||||
static LINK_FINDER: Lazy<LinkFinder> = Lazy::new(LinkFinder::new);
|
||||
|
||||
/// Remove all GET parameters from a URL.
|
||||
/// Remove all GET parameters from a URL and seperates out the fragment.
|
||||
/// The link is not a URL but a String as it may not have a base domain.
|
||||
pub(crate) fn remove_get_params_and_fragment(url: &str) -> &str {
|
||||
let path = match url.split_once('#') {
|
||||
Some((path_without_fragment, _fragment)) => path_without_fragment,
|
||||
None => url,
|
||||
pub(crate) fn remove_get_params_and_seperate_fragment(url: &str) -> (&str, Option<&str>) {
|
||||
let (path, frag) = match url.split_once('#') {
|
||||
Some((path, fragment)) => (path, Some(fragment)),
|
||||
None => (url, None),
|
||||
};
|
||||
let path = match path.split_once('?') {
|
||||
Some((path_without_params, _params)) => path_without_params,
|
||||
None => path,
|
||||
};
|
||||
path
|
||||
(path, frag)
|
||||
}
|
||||
|
||||
// Use `LinkFinder` to offload the raw link searching in plaintext
|
||||
|
|
@ -29,47 +29,49 @@ mod test_fs_tree {
|
|||
|
||||
#[test]
|
||||
fn test_remove_get_params_and_fragment() {
|
||||
assert_eq!(remove_get_params_and_fragment("/"), "/");
|
||||
assert_eq!(remove_get_params_and_seperate_fragment("/"), ("/", None));
|
||||
assert_eq!(
|
||||
remove_get_params_and_fragment("index.html?foo=bar"),
|
||||
"index.html"
|
||||
remove_get_params_and_seperate_fragment("index.html?foo=bar"),
|
||||
("index.html", None)
|
||||
);
|
||||
assert_eq!(
|
||||
remove_get_params_and_fragment("/index.html?foo=bar"),
|
||||
"/index.html"
|
||||
remove_get_params_and_seperate_fragment("/index.html?foo=bar"),
|
||||
("/index.html", None)
|
||||
);
|
||||
assert_eq!(
|
||||
remove_get_params_and_fragment("/index.html?foo=bar&baz=zorx?bla=blub"),
|
||||
"/index.html"
|
||||
remove_get_params_and_seperate_fragment("/index.html?foo=bar&baz=zorx?bla=blub"),
|
||||
("/index.html", None)
|
||||
);
|
||||
assert_eq!(
|
||||
remove_get_params_and_fragment("https://example.com/index.html?foo=bar"),
|
||||
"https://example.com/index.html"
|
||||
remove_get_params_and_seperate_fragment("https://example.com/index.html?foo=bar"),
|
||||
("https://example.com/index.html", None)
|
||||
);
|
||||
assert_eq!(
|
||||
remove_get_params_and_fragment("test.png?foo=bar"),
|
||||
"test.png"
|
||||
remove_get_params_and_seperate_fragment("test.png?foo=bar"),
|
||||
("test.png", None)
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
remove_get_params_and_fragment("https://example.com/index.html#anchor"),
|
||||
"https://example.com/index.html"
|
||||
remove_get_params_and_seperate_fragment("https://example.com/index.html#anchor"),
|
||||
("https://example.com/index.html", Some("anchor"))
|
||||
);
|
||||
assert_eq!(
|
||||
remove_get_params_and_fragment("https://example.com/index.html?foo=bar#anchor"),
|
||||
"https://example.com/index.html"
|
||||
remove_get_params_and_seperate_fragment(
|
||||
"https://example.com/index.html?foo=bar#anchor"
|
||||
),
|
||||
("https://example.com/index.html", Some("anchor"))
|
||||
);
|
||||
assert_eq!(
|
||||
remove_get_params_and_fragment("test.png?foo=bar#anchor"),
|
||||
"test.png"
|
||||
remove_get_params_and_seperate_fragment("test.png?foo=bar#anchor"),
|
||||
("test.png", Some("anchor"))
|
||||
);
|
||||
assert_eq!(
|
||||
remove_get_params_and_fragment("test.png#anchor?anchor!?"),
|
||||
"test.png"
|
||||
remove_get_params_and_seperate_fragment("test.png#anchor?anchor!?"),
|
||||
("test.png", Some("anchor?anchor!?"))
|
||||
);
|
||||
assert_eq!(
|
||||
remove_get_params_and_fragment("test.png?foo=bar#anchor?anchor!"),
|
||||
"test.png"
|
||||
remove_get_params_and_seperate_fragment("test.png?foo=bar#anchor?anchor!"),
|
||||
("test.png", Some("anchor?anchor!"))
|
||||
);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in a new issue