mirror of
https://github.com/Hopiu/lychee.git
synced 2026-04-01 04:10:24 +00:00
Extract base from the source URL if --base is empty (#358)
When running lychee against a remote URL, all relative links are ignored by default because `--base` is normally not set. A good default in this case is to automatically use the base domain (scheme and host) from the source URL. Setting `--base` overrides this automatic extraction of the base from the source URL (the same behaviour as we currently have).
This commit is contained in:
parent
2be3b3b896
commit
174331d983
2 changed files with 58 additions and 1 deletions
|
|
@ -50,6 +50,17 @@ impl Extractor {
|
|||
/// Only keeps legit URLs. For example this filters out anchors.
|
||||
fn create_requests(&self, input_content: &InputContent) -> Result<HashSet<Request>> {
|
||||
let mut requests: HashSet<Request> = HashSet::with_capacity(self.urls.len());
|
||||
|
||||
let base_input = match &input_content.input {
|
||||
Input::RemoteUrl(url) => Some(Url::parse(&format!(
|
||||
"{}://{}",
|
||||
url.scheme(),
|
||||
url.host_str().ok_or(ErrorKind::InvalidUrlHost)?
|
||||
))?),
|
||||
_ => None,
|
||||
// other inputs do not have a URL to extract a base
|
||||
};
|
||||
|
||||
for url in &self.urls {
|
||||
let req = if let Ok(uri) = Uri::try_from(url.as_ref()) {
|
||||
Request::new(uri, input_content.input.clone())
|
||||
|
|
@ -68,6 +79,11 @@ impl Extractor {
|
|||
continue;
|
||||
}
|
||||
}
|
||||
} else if let Some(url) = base_input.as_ref().map(|u| u.join(url)) {
|
||||
if self.base.is_some() {
|
||||
continue;
|
||||
}
|
||||
Request::new(Uri { url: url? }, input_content.input.clone())
|
||||
} else {
|
||||
info!("Handling of {} not implemented yet", &url);
|
||||
continue;
|
||||
|
|
@ -400,6 +416,42 @@ mod test {
|
|||
assert_eq!(links, expected_links);
|
||||
}
|
||||
|
||||
// Checks that a relative link found in content fetched from a remote URL is
// resolved against that URL's scheme and host, while an absolute link in the
// same document is kept as-is.
#[test]
|
||||
fn test_relative_url_with_base_extracted_from_input() {
|
||||
// The input is a remote URL; its path ("/some-post") does not appear in
// the expected URLs below.
let input = Input::RemoteUrl(Box::new(
|
||||
Url::parse("https://example.org/some-post").unwrap(),
|
||||
));
|
||||
|
||||
// HTML fixture with one absolute link and one root-relative link ("/about").
let contents = r#"<html>
|
||||
<div class="row">
|
||||
<a href="https://github.com/lycheeverse/lychee/">Github</a>
|
||||
<a href="/about">About</a>
|
||||
</div>
|
||||
</html>"#;
|
||||
|
||||
let input_content = &InputContent {
|
||||
input,
|
||||
file_type: FileType::Html,
|
||||
content: contents.to_string(),
|
||||
};
|
||||
|
||||
// NOTE(review): the `None` argument is presumably the explicit base
// (`--base`), i.e. no base is configured here; confirm against `Extractor::new`.
let mut extractor = Extractor::new(None);
|
||||
let links = extractor.extract(input_content);
|
||||
// Reduce the extracted requests to their URL strings; a set is used so the
// comparison does not depend on extraction order.
let urls = links
|
||||
.unwrap()
|
||||
.iter()
|
||||
.map(|x| x.uri.url.as_str().to_string())
|
||||
.collect::<HashSet<_>>();
|
||||
|
||||
// "/about" is expected to come back as "https://example.org/about".
let expected_urls = array::IntoIter::new([
|
||||
String::from("https://github.com/lycheeverse/lychee/"),
|
||||
String::from("https://example.org/about"),
|
||||
])
|
||||
.collect::<HashSet<_>>();
|
||||
|
||||
assert_eq!(urls, expected_urls);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_extract_html5_lowercase_doctype() {
|
||||
// this has been problematic with previous XML based parser
|
||||
|
|
|
|||
|
|
@ -42,6 +42,8 @@ pub enum ErrorKind {
|
|||
MissingGitHubToken,
|
||||
/// The website is available through HTTPS, but HTTP scheme is used.
|
||||
InsecureURL(Uri),
|
||||
/// An URL with an invalid host was found
|
||||
InvalidUrlHost,
|
||||
/// Invalid URI
|
||||
InvalidURI(Uri),
|
||||
}
|
||||
|
|
@ -87,7 +89,9 @@ impl Hash for ErrorKind {
|
|||
Self::InvalidBase(base, e) => (base, e).hash(state),
|
||||
Self::InvalidHeader(e) => e.to_string().hash(state),
|
||||
Self::InvalidGlobPattern(e) => e.to_string().hash(state),
|
||||
Self::MissingGitHubToken => std::mem::discriminant(self).hash(state),
|
||||
Self::MissingGitHubToken | Self::InvalidUrlHost => {
|
||||
std::mem::discriminant(self).hash(state);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -134,6 +138,7 @@ impl Display for ErrorKind {
|
|||
),
|
||||
Self::InvalidBase(base, e) => write!(f, "Error with base dir `{}` : {}", base, e),
|
||||
Self::Utf8Error(e) => e.fmt(f),
|
||||
Self::InvalidUrlHost => write!(f, "URL is missing a host"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in a new issue