diff --git a/lychee-lib/src/extract.rs b/lychee-lib/src/extract.rs index 1e4e92c..af307c0 100644 --- a/lychee-lib/src/extract.rs +++ b/lychee-lib/src/extract.rs @@ -50,6 +50,17 @@ impl Extractor { /// Only keeps legit URLs. For example this filters out anchors. fn create_requests(&self, input_content: &InputContent) -> Result> { let mut requests: HashSet = HashSet::with_capacity(self.urls.len()); + + let base_input = match &input_content.input { + Input::RemoteUrl(url) => Some(Url::parse(&format!( + "{}://{}", + url.scheme(), + url.host_str().ok_or(ErrorKind::InvalidUrlHost)? + ))?), + _ => None, + // other inputs do not have a URL to extract a base + }; + for url in &self.urls { let req = if let Ok(uri) = Uri::try_from(url.as_ref()) { Request::new(uri, input_content.input.clone()) @@ -68,6 +79,11 @@ impl Extractor { continue; } } + } else if let Some(url) = base_input.as_ref().map(|u| u.join(url)) { + if self.base.is_some() { + continue; + } + Request::new(Uri { url: url? }, input_content.input.clone()) } else { info!("Handling of {} not implemented yet", &url); continue; @@ -400,6 +416,42 @@ mod test { assert_eq!(links, expected_links); } + #[test] + fn test_relative_url_with_base_extracted_from_input() { + let input = Input::RemoteUrl(Box::new( + Url::parse("https://example.org/some-post").unwrap(), + )); + + let contents = r#" +
+ <a href="https://github.com/lycheeverse/lychee/">Github</a> + <a href="/about">About</a> +
+ "#; + + let input_content = &InputContent { + input, + file_type: FileType::Html, + content: contents.to_string(), + }; + + let mut extractor = Extractor::new(None); + let links = extractor.extract(input_content); + let urls = links + .unwrap() + .iter() + .map(|x| x.uri.url.as_str().to_string()) + .collect::>(); + + let expected_urls = array::IntoIter::new([ + String::from("https://github.com/lycheeverse/lychee/"), + String::from("https://example.org/about"), + ]) + .collect::>(); + + assert_eq!(urls, expected_urls); + } + #[test] fn test_extract_html5_lowercase_doctype() { // this has been problematic with previous XML based parser diff --git a/lychee-lib/src/types/error.rs b/lychee-lib/src/types/error.rs index 84e9840..01f7234 100644 --- a/lychee-lib/src/types/error.rs +++ b/lychee-lib/src/types/error.rs @@ -42,6 +42,8 @@ pub enum ErrorKind { MissingGitHubToken, /// The website is available through HTTPS, but HTTP scheme is used. InsecureURL(Uri), + /// An URL with an invalid host was found + InvalidUrlHost, /// Invalid URI InvalidURI(Uri), } @@ -87,7 +89,9 @@ impl Hash for ErrorKind { Self::InvalidBase(base, e) => (base, e).hash(state), Self::InvalidHeader(e) => e.to_string().hash(state), Self::InvalidGlobPattern(e) => e.to_string().hash(state), - Self::MissingGitHubToken => std::mem::discriminant(self).hash(state), + Self::MissingGitHubToken | Self::InvalidUrlHost => { + std::mem::discriminant(self).hash(state); + } } } } @@ -134,6 +138,7 @@ impl Display for ErrorKind { ), Self::InvalidBase(base, e) => write!(f, "Error with base dir `{}` : {}", base, e), Self::Utf8Error(e) => e.fmt(f), + Self::InvalidUrlHost => write!(f, "URL is missing a host"), } } }