mirror of
https://github.com/Hopiu/lychee.git
synced 2026-04-17 03:40:59 +00:00
118 lines
4.4 KiB
Rust
118 lines
4.4 KiB
Rust
use log::info;
|
|
use percent_encoding::percent_decode_str;
|
|
use reqwest::Url;
|
|
use std::{
|
|
collections::HashSet,
|
|
path::{Path, PathBuf},
|
|
};
|
|
|
|
use crate::{
|
|
helpers::{path, url},
|
|
types::{uri::raw::RawUri, InputContent, InputSource},
|
|
Base, ErrorKind, Request, Result, Uri,
|
|
};
|
|
|
|
/// Maximum number of characters kept from a string input when that input is
/// recorded as the source of a request.
const MAX_TRUNCATED_STR_LEN: usize = 100;
|
|
|
|
/// Create requests out of the collected URLs.
|
|
/// Only keeps "valid" URLs. This filters out anchors for example.
|
|
pub(crate) fn create(
|
|
uris: Vec<RawUri>,
|
|
input_content: &InputContent,
|
|
base: &Option<Base>,
|
|
) -> Result<HashSet<Request>> {
|
|
let base_url = Base::from_source(&input_content.source);
|
|
|
|
let requests: Result<Vec<Option<Request>>> = uris
|
|
.into_iter()
|
|
.map(|raw_uri| {
|
|
let is_anchor = raw_uri.is_anchor();
|
|
let text = raw_uri.text.clone();
|
|
let element = raw_uri.element.clone();
|
|
let attribute = raw_uri.attribute.clone();
|
|
|
|
// Truncate the source in case it gets too long Ideally we should
|
|
// avoid the initial String allocation for `source` altogether
|
|
let source = match &input_content.source {
|
|
InputSource::String(s) => {
|
|
InputSource::String(s.chars().take(MAX_TRUNCATED_STR_LEN).collect())
|
|
}
|
|
// Cloning is cheap here
|
|
c => c.clone(),
|
|
};
|
|
|
|
if let Ok(uri) = Uri::try_from(raw_uri) {
|
|
Ok(Some(Request::new(uri, source, element, attribute)))
|
|
} else if let Some(url) = base.as_ref().and_then(|u| u.join(&text)) {
|
|
Ok(Some(Request::new(Uri { url }, source, element, attribute)))
|
|
} else if let InputSource::FsPath(root) = &input_content.source {
|
|
if is_anchor {
|
|
// Silently ignore anchor links for now
|
|
Ok(None)
|
|
} else if let Some(url) = create_uri_from_path(root, &text, base)? {
|
|
Ok(Some(Request::new(Uri { url }, source, element, attribute)))
|
|
} else {
|
|
// In case we cannot create a URI from a path but we didn't receive an error,
|
|
// it means that some preconditions were not met, e.g. the `base_url` wasn't set.
|
|
Ok(None)
|
|
}
|
|
} else if let Some(url) = construct_url(&base_url, &text) {
|
|
if base.is_some() {
|
|
Ok(None)
|
|
} else {
|
|
Ok(Some(Request::new(
|
|
Uri { url: url? },
|
|
source,
|
|
element,
|
|
attribute,
|
|
)))
|
|
}
|
|
} else {
|
|
info!("Handling of `{}` not implemented yet", text);
|
|
Ok(None)
|
|
}
|
|
})
|
|
.collect();
|
|
let requests: Vec<Request> = requests?.into_iter().flatten().collect();
|
|
Ok(HashSet::from_iter(requests))
|
|
}
|
|
|
|
fn construct_url(base: &Option<Url>, text: &str) -> Option<Result<Url>> {
|
|
base.as_ref().map(|base| {
|
|
base.join(text)
|
|
.map_err(|e| ErrorKind::ParseUrl(e, format!("{base}{text}")))
|
|
})
|
|
}
|
|
|
|
fn create_uri_from_path(src: &Path, dst: &str, base: &Option<Base>) -> Result<Option<Url>> {
|
|
let dst = url::remove_get_params_and_fragment(dst);
|
|
// Avoid double-encoding already encoded destination paths by removing any
|
|
// potential encoding (e.g. `web%20site` becomes `web site`).
|
|
// That's because Url::from_file_path will encode the full URL in the end.
|
|
// This behavior cannot be configured.
|
|
// See https://github.com/lycheeverse/lychee/pull/262#issuecomment-915245411
|
|
// TODO: This is not a perfect solution.
|
|
// Ideally, only `src` and `base` should be URL encoded (as is done by
|
|
// `from_file_path` at the moment) while `dst` gets left untouched and simply
|
|
// appended to the end.
|
|
let decoded = percent_decode_str(dst).decode_utf8()?;
|
|
let resolved = path::resolve(src, &PathBuf::from(&*decoded), base)?;
|
|
match resolved {
|
|
Some(path) => Url::from_file_path(&path)
|
|
.map(Some)
|
|
.map_err(|_e| ErrorKind::InvalidUrlFromPath(path)),
|
|
None => Ok(None),
|
|
}
|
|
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;

    /// A literal `+` in the destination must end up unchanged in the file URL.
    #[test]
    fn test_create_uri_from_path() {
        let source = PathBuf::from("/README.md");
        let url = create_uri_from_path(&source, "test+encoding", &None)
            .unwrap()
            .unwrap();
        assert_eq!(url.as_str(), "file:///test+encoding");
    }
}
|