Extract base from the source URL if --base is empty (#358)

When running lychee against a remote URL all relative links are ignored
by default because `--base` is normally not set. A good default in this
case is to automatically use the base domain from the source URL.
Setting `--base` explicitly overrides the automatic base extraction from
the source URL (i.e. the existing behaviour is preserved in that case).
This commit is contained in:
Jorge Luis Betancourt 2021-10-10 02:42:01 +02:00 committed by GitHub
parent 2be3b3b896
commit 174331d983
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 58 additions and 1 deletions

View file

@ -50,6 +50,17 @@ impl Extractor {
/// Only keeps legit URLs. For example this filters out anchors.
fn create_requests(&self, input_content: &InputContent) -> Result<HashSet<Request>> {
let mut requests: HashSet<Request> = HashSet::with_capacity(self.urls.len());
let base_input = match &input_content.input {
Input::RemoteUrl(url) => Some(Url::parse(&format!(
"{}://{}",
url.scheme(),
url.host_str().ok_or(ErrorKind::InvalidUrlHost)?
))?),
_ => None,
// other inputs do not have a URL to extract a base
};
for url in &self.urls {
let req = if let Ok(uri) = Uri::try_from(url.as_ref()) {
Request::new(uri, input_content.input.clone())
@ -68,6 +79,11 @@ impl Extractor {
continue;
}
}
} else if let Some(url) = base_input.as_ref().map(|u| u.join(url)) {
if self.base.is_some() {
continue;
}
Request::new(Uri { url: url? }, input_content.input.clone())
} else {
info!("Handling of {} not implemented yet", &url);
continue;
@ -400,6 +416,42 @@ mod test {
assert_eq!(links, expected_links);
}
#[test]
fn test_relative_url_with_base_extracted_from_input() {
    // A remote input provides a URL whose scheme+host should serve as the
    // implicit base for resolving relative links found in the document.
    let input = Input::RemoteUrl(Box::new(
        Url::parse("https://example.org/some-post").unwrap(),
    ));

    // One absolute link and one root-relative link.
    let contents = r#"<html>
<div class="row">
<a href="https://github.com/lycheeverse/lychee/">Github</a>
<a href="/about">About</a>
</div>
</html>"#;

    let input_content = &InputContent {
        input,
        file_type: FileType::Html,
        content: contents.to_string(),
    };

    // No explicit --base: the extractor must fall back to the input's host.
    let mut extractor = Extractor::new(None);
    let requests = extractor.extract(input_content).unwrap();
    let actual: HashSet<String> = requests
        .iter()
        .map(|request| request.uri.url.as_str().to_owned())
        .collect();

    let mut expected: HashSet<String> = HashSet::new();
    expected.insert("https://github.com/lycheeverse/lychee/".to_string());
    expected.insert("https://example.org/about".to_string());

    assert_eq!(actual, expected);
}
#[test]
fn test_extract_html5_lowercase_doctype() {
// this has been problematic with previous XML based parser

View file

@ -42,6 +42,8 @@ pub enum ErrorKind {
MissingGitHubToken,
/// The website is available through HTTPS, but HTTP scheme is used.
InsecureURL(Uri),
/// A URL with an invalid host was found
InvalidUrlHost,
/// Invalid URI
InvalidURI(Uri),
}
@ -87,7 +89,9 @@ impl Hash for ErrorKind {
Self::InvalidBase(base, e) => (base, e).hash(state),
Self::InvalidHeader(e) => e.to_string().hash(state),
Self::InvalidGlobPattern(e) => e.to_string().hash(state),
Self::MissingGitHubToken => std::mem::discriminant(self).hash(state),
Self::MissingGitHubToken | Self::InvalidUrlHost => {
std::mem::discriminant(self).hash(state);
}
}
}
}
@ -134,6 +138,7 @@ impl Display for ErrorKind {
),
Self::InvalidBase(base, e) => write!(f, "Error with base dir `{}` : {}", base, e),
Self::Utf8Error(e) => e.fmt(f),
Self::InvalidUrlHost => write!(f, "URL is missing a host"),
}
}
}