diff --git a/lychee-bin/src/parse.rs b/lychee-bin/src/parse.rs index 58b16df..0e08a36 100644 --- a/lychee-bin/src/parse.rs +++ b/lychee-bin/src/parse.rs @@ -63,7 +63,7 @@ pub(crate) fn parse_statuscodes>(accept: T) -> Result } #[cfg(test)] -mod test { +mod tests { use std::collections::HashSet; use headers::{HeaderMap, HeaderMapExt}; diff --git a/lychee-bin/src/stats.rs b/lychee-bin/src/stats.rs index cc9fd26..fdd9579 100644 --- a/lychee-bin/src/stats.rs +++ b/lychee-bin/src/stats.rs @@ -76,7 +76,7 @@ impl ResponseStats { } #[cfg(test)] -mod test { +mod tests { use std::collections::{HashMap, HashSet}; use http::StatusCode; diff --git a/lychee-lib/src/client.rs b/lychee-lib/src/client.rs index 9c32325..2fedf97 100644 --- a/lychee-lib/src/client.rs +++ b/lychee-lib/src/client.rs @@ -31,7 +31,7 @@ use crate::{ filter::{Excludes, Filter, Includes}, quirks::Quirks, remap::Remaps, - types::{mail, GithubUri}, + types::{mail, uri::github::GithubUri}, ErrorKind, Request, Response, Result, Status, Uri, }; @@ -470,7 +470,7 @@ impl Client { // Pull out the heavy machinery in case of a failed normal request. // This could be a GitHub URL and we ran into the rate limiter. - if let Some(github_uri) = uri.gh_org_and_repo() { + if let Ok(github_uri) = GithubUri::try_from(uri) { let status = self.check_github(github_uri).await; // Only return Github status in case of success // Otherwise return the original error, which has more information @@ -590,7 +590,7 @@ where } #[cfg(test)] -mod test { +mod tests { use std::{ fs::File, time::{Duration, Instant}, diff --git a/lychee-lib/src/collector.rs b/lychee-lib/src/collector.rs index b531bbb..117d746 100644 --- a/lychee-lib/src/collector.rs +++ b/lychee-lib/src/collector.rs @@ -1,5 +1,5 @@ use crate::{ - extract::Extractor, helpers::request, types::raw_uri::RawUri, Base, Input, Request, Result, + extract::Extractor, helpers::request, types::uri::raw::RawUri, Base, Input, Request, Result, }; use futures::{ stream::{self, Stream}, @@ -83,7 +83,7 @@ impl Collector { } #[cfg(test)] -mod test { +mod tests { use std::{collections::HashSet, convert::TryFrom, fs::File, io::Write}; use http::StatusCode; diff --git a/lychee-lib/src/extract/html5ever.rs b/lychee-lib/src/extract/html5ever.rs index b6afcbe..7674618 100644 --- a/lychee-lib/src/extract/html5ever.rs +++ b/lychee-lib/src/extract/html5ever.rs @@ -5,7 +5,7 @@ use html5ever::{ }; use super::{is_verbatim_elem, plaintext::extract_plaintext}; -use crate::types::raw_uri::RawUri; +use crate::types::uri::raw::RawUri; #[derive(Clone, Default)] struct LinkExtractor { diff --git a/lychee-lib/src/extract/html5gum.rs b/lychee-lib/src/extract/html5gum.rs index 92e13ec..8b5278c 100644 --- a/lychee-lib/src/extract/html5gum.rs +++ b/lychee-lib/src/extract/html5gum.rs @@ -2,7 +2,7 @@ use html5gum::{Emitter, Error, Tokenizer}; use super::is_verbatim_elem; use super::plaintext::extract_plaintext; -use crate::types::raw_uri::RawUri; +use crate::types::uri::raw::RawUri; #[derive(Clone)] struct LinkExtractor { diff --git a/lychee-lib/src/extract/markdown.rs b/lychee-lib/src/extract/markdown.rs index 57b9e40..431ef76 100644 --- a/lychee-lib/src/extract/markdown.rs +++ b/lychee-lib/src/extract/markdown.rs @@ -1,6 +1,6 @@ use pulldown_cmark::{Event, Parser, Tag}; -use crate::{extract::plaintext::extract_plaintext, types::raw_uri::RawUri}; +use crate::{extract::plaintext::extract_plaintext, types::uri::raw::RawUri}; use super::html5gum::extract_html; diff --git a/lychee-lib/src/extract/mod.rs b/lychee-lib/src/extract/mod.rs index 31000b5..b051cf6 100644 --- a/lychee-lib/src/extract/mod.rs +++ b/lychee-lib/src/extract/mod.rs @@ -1,6 +1,6 @@ use std::collections::HashSet; -use crate::types::{raw_uri::RawUri, FileType, InputContent}; +use crate::types::{uri::raw::RawUri, FileType, InputContent}; mod html5ever; mod html5gum; @@ -79,9 +79,9 @@ impl Extractor { } #[cfg(test)] -mod test { +mod tests { use reqwest::Url; - use std::{collections::HashSet, convert::TryFrom, path::Path}; + use std::{collections::HashSet, path::Path}; use super::*; use crate::{ diff --git a/lychee-lib/src/extract/plaintext.rs b/lychee-lib/src/extract/plaintext.rs index 7830d2e..7032a60 100644 --- a/lychee-lib/src/extract/plaintext.rs +++ b/lychee-lib/src/extract/plaintext.rs @@ -1,4 +1,4 @@ -use crate::{helpers::url, types::raw_uri::RawUri}; +use crate::{helpers::url, types::uri::raw::RawUri}; /// Extract unparsed URL strings from plaintext pub(crate) fn extract_plaintext(input: &str) -> Vec { diff --git a/lychee-lib/src/filter/mod.rs b/lychee-lib/src/filter/mod.rs index 6114288..7955f02 100644 --- a/lychee-lib/src/filter/mod.rs +++ b/lychee-lib/src/filter/mod.rs @@ -202,7 +202,7 @@ impl Filter { } #[cfg(test)] -mod test { +mod tests { use regex::RegexSet; use reqwest::Url; use url::Host; diff --git a/lychee-lib/src/helpers/request.rs b/lychee-lib/src/helpers/request.rs index b580d8b..2d2ff00 100644 --- a/lychee-lib/src/helpers/request.rs +++ b/lychee-lib/src/helpers/request.rs @@ -8,7 +8,7 @@ use std::{ use crate::{ helpers::{path, url}, - types::{raw_uri::RawUri, InputContent, InputSource}, + types::{uri::raw::RawUri, InputContent, InputSource}, Base, ErrorKind, Request, Result, Uri, }; diff --git a/lychee-lib/src/lib.rs b/lychee-lib/src/lib.rs index bda3b67..e68a874 100644 --- a/lychee-lib/src/lib.rs +++ b/lychee-lib/src/lib.rs @@ -88,7 +88,7 @@ pub use crate::{ collector::Collector, filter::{Excludes, Filter, Includes}, types::{ - Base, CacheStatus, ErrorKind, FileType, Input, InputContent, InputSource, Request, - Response, ResponseBody, Result, Status, Uri, + uri::valid::Uri, Base, CacheStatus, ErrorKind, FileType, Input, InputContent, InputSource, + Request, Response, ResponseBody, Result, Status, }, }; diff --git a/lychee-lib/src/types/mod.rs b/lychee-lib/src/types/mod.rs index 4b4cd86..0dcbf88 100644 --- a/lychee-lib/src/types/mod.rs +++ b/lychee-lib/src/types/mod.rs @@ -6,11 +6,10 @@ mod error; mod file; mod input; pub(crate) mod mail; -pub(crate) mod raw_uri; mod request; mod response; mod status; -mod uri; +pub(crate) mod uri; pub use base::Base; pub use cache::CacheStatus; @@ -20,7 +19,6 @@ pub use input::{Input, InputContent, InputSource}; pub use request::Request; pub use response::{Response, ResponseBody}; pub use status::Status; -pub use uri::{GithubUri, Uri}; /// The lychee `Result` type pub type Result = std::result::Result; diff --git a/lychee-lib/src/types/uri/github.rs b/lychee-lib/src/types/uri/github.rs new file mode 100644 index 0000000..a04f2c2 --- /dev/null +++ b/lychee-lib/src/types/uri/github.rs @@ -0,0 +1,204 @@ +use lazy_static::lazy_static; +use std::collections::HashSet; + +use crate::{ErrorKind, Result, Uri}; + +lazy_static! { + static ref GITHUB_API_EXCLUDED_ENDPOINTS: HashSet<&'static str> = HashSet::from_iter([ + "about", + "collections", + "events", + "explore", + "features", + "issues", + "marketplace", + "new", + "notifications", + "pricing", + "pulls", + "sponsors", + "topics", + "watching", + ]); +} + +/// Uri path segments extracted from a Github URL +#[derive(PartialEq, Eq, PartialOrd, Ord, Debug)] +pub struct GithubUri { + /// Organization name + pub owner: String, + /// Repository name + pub repo: String, + /// e.g. `issues` in `/org/repo/issues` + pub endpoint: Option, +} + +impl GithubUri { + /// Create a new Github URI without an endpoint + #[cfg(test)] + fn new>(owner: T, repo: T) -> Self { + GithubUri { + owner: owner.into(), + repo: repo.into(), + endpoint: None, + } + } + + #[cfg(test)] + fn with_endpoint>(owner: T, repo: T, endpoint: T) -> Self { + GithubUri { + owner: owner.into(), + repo: repo.into(), + endpoint: Some(endpoint.into()), + } + } + + // TODO: Support GitLab etc. + fn gh_org_and_repo(uri: &Uri) -> Result { + fn remove_suffix<'a>(input: &'a str, suffix: &str) -> &'a str { + if let Some(stripped) = input.strip_suffix(suffix) { + return stripped; + } + input + } + + debug_assert!(!uri.is_mail(), "Should only be called on a Website type!"); + + let domain = match uri.domain() { + Some(domain) => domain, + None => return Err(ErrorKind::InvalidGithubUrl(uri.to_string())), + }; + + if !matches!( + domain, + "github.com" | "www.github.com" | "raw.githubusercontent.com" + ) { + return Err(ErrorKind::InvalidGithubUrl(uri.to_string())); + }; + + let parts: Vec<_> = match uri.path_segments() { + Some(parts) => parts.collect(), + None => return Err(ErrorKind::InvalidGithubUrl(uri.to_string())), + }; + + if parts.len() < 2 { + // Not a valid org/repo pair. + // Note: We don't check for exactly 2 here, because the Github + // API doesn't handle checking individual files inside repos or + // paths like , so we are more + // permissive and only check for repo existence. This is the + // only way to get a basic check for private repos. Public repos + // are not affected and should work with a normal check. + return Err(ErrorKind::InvalidGithubUrl(uri.to_string())); + } + + let owner = parts[0]; + if GITHUB_API_EXCLUDED_ENDPOINTS.contains(owner) { + return Err(ErrorKind::InvalidGithubUrl(uri.to_string())); + } + + let repo = parts[1]; + // If the URL ends with `.git`, assume this is an SSH URL and strip + // the suffix. See https://github.com/lycheeverse/lychee/issues/384 + let repo = remove_suffix(repo, ".git"); + + let endpoint = if parts.len() > 2 && !parts[2].is_empty() { + Some(parts[2..].join("/")) + } else { + None + }; + + Ok(GithubUri { + owner: owner.to_string(), + repo: repo.to_string(), + endpoint, + }) + } +} + +impl TryFrom for GithubUri { + type Error = ErrorKind; + + fn try_from(uri: Uri) -> Result { + GithubUri::gh_org_and_repo(&uri) + } +} + +impl TryFrom<&Uri> for GithubUri { + type Error = ErrorKind; + + fn try_from(uri: &Uri) -> Result { + GithubUri::gh_org_and_repo(uri) + } +} + +#[cfg(test)] +mod tests { + use crate::test_utils::website; + + use super::*; + + #[test] + fn test_github() { + assert_eq!( + GithubUri::try_from(website("http://github.com/lycheeverse/lychee")).unwrap(), + GithubUri::new("lycheeverse", "lychee") + ); + + assert_eq!( + GithubUri::try_from(website("http://www.github.com/lycheeverse/lychee")).unwrap(), + GithubUri::new("lycheeverse", "lychee") + ); + + assert_eq!( + GithubUri::try_from(website("https://github.com/lycheeverse/lychee")).unwrap(), + GithubUri::new("lycheeverse", "lychee") + ); + + assert_eq!( + GithubUri::try_from(website("https://github.com/lycheeverse/lychee/")).unwrap(), + GithubUri::new("lycheeverse", "lychee") + ); + + assert_eq!( + GithubUri::try_from(website("https://github.com/lycheeverse/lychee/foo/bar")).unwrap(), + GithubUri::with_endpoint("lycheeverse", "lychee", "foo/bar") + ); + + assert_eq!( + GithubUri::try_from(website( + "https://github.com/Microsoft/python-language-server.git" + )) + .unwrap(), + GithubUri::new("Microsoft", "python-language-server") + ); + + assert_eq!( + GithubUri::try_from(website( + "https://github.com/lycheeverse/lychee/blob/master/NON_EXISTENT_FILE.md" + )) + .unwrap(), + GithubUri::with_endpoint("lycheeverse", "lychee", "blob/master/NON_EXISTENT_FILE.md") + ); + } + + #[test] + fn test_github_false_positives() { + assert!( + GithubUri::try_from(website("https://github.com/sponsors/analysis-tools-dev ")) + .is_err() + ); + + assert!(GithubUri::try_from(website( + "https://github.com/marketplace/actions/lychee-broken-link-checker" + )) + .is_err()); + + assert!(GithubUri::try_from(website("https://github.com/features/actions")).is_err()); + + assert!(GithubUri::try_from(website( + "https://pkg.go.dev/github.com/Debian/pkg-go-tools/cmd/pgt-gopath" + )) + .is_err()); + } +} diff --git a/lychee-lib/src/types/uri/mod.rs b/lychee-lib/src/types/uri/mod.rs new file mode 100644 index 0000000..e800df2 --- /dev/null +++ b/lychee-lib/src/types/uri/mod.rs @@ -0,0 +1,3 @@ +pub(crate) mod github; +pub(crate) mod raw; +pub(crate) mod valid; diff --git a/lychee-lib/src/types/raw_uri.rs b/lychee-lib/src/types/uri/raw.rs similarity index 99% rename from lychee-lib/src/types/raw_uri.rs rename to lychee-lib/src/types/uri/raw.rs index a10e0b7..f3aa890 100644 --- a/lychee-lib/src/types/raw_uri.rs +++ b/lychee-lib/src/types/uri/raw.rs @@ -42,7 +42,7 @@ impl From<&str> for RawUri { } #[cfg(test)] -mod test { +mod tests { use super::*; #[test] diff --git a/lychee-lib/src/types/uri.rs b/lychee-lib/src/types/uri/valid.rs similarity index 59% rename from lychee-lib/src/types/uri.rs rename to lychee-lib/src/types/uri/valid.rs index 25165d7..61e4112 100644 --- a/lychee-lib/src/types/uri.rs +++ b/lychee-lib/src/types/uri/valid.rs @@ -1,65 +1,13 @@ -use std::{collections::HashSet, convert::TryFrom, fmt::Display, net::IpAddr}; +use std::{convert::TryFrom, fmt::Display, net::IpAddr}; use fast_chemail::parse_email; use ip_network::Ipv6Network; -use lazy_static::lazy_static; use serde::{Deserialize, Serialize}; use url::Url; use crate::{ErrorKind, Result}; -use super::raw_uri::RawUri; - -lazy_static! { - static ref GITHUB_API_EXCLUDED_ENDPOINTS: HashSet<&'static str> = HashSet::from_iter([ - "about", - "collections", - "events", - "explore", - "features", - "issues", - "marketplace", - "new", - "notifications", - "pricing", - "pulls", - "sponsors", - "topics", - "watching", - ]); -} - -/// Uri path segments extracted from a Github URL -#[derive(PartialEq, Eq, PartialOrd, Ord, Debug)] -pub struct GithubUri { - /// Organization name - pub owner: String, - /// Repository name - pub repo: String, - /// e.g. `issues` in `/org/repo/issues` - pub endpoint: Option, -} - -impl GithubUri { - /// Create a new Github URI without an endpoint - #[cfg(test)] - fn new>(owner: T, repo: T) -> Self { - GithubUri { - owner: owner.into(), - repo: repo.into(), - endpoint: None, - } - } - - #[cfg(test)] - fn with_endpoint>(owner: T, repo: T, endpoint: T) -> Self { - GithubUri { - owner: owner.into(), - repo: repo.into(), - endpoint: Some(endpoint.into()), - } - } -} +use super::raw::RawUri; /// Lychee's own representation of a URI, which encapsulates all supported /// formats. @@ -125,59 +73,6 @@ impl Uri { } } - // TODO: Support GitLab etc. - pub(crate) fn gh_org_and_repo(&self) -> Option { - fn remove_suffix<'a>(input: &'a str, suffix: &str) -> &'a str { - if let Some(stripped) = input.strip_suffix(suffix) { - return stripped; - } - input - } - - debug_assert!(!self.is_mail(), "Should only be called on a Website type!"); - - if matches!( - self.domain()?, - "github.com" | "www.github.com" | "raw.githubusercontent.com" - ) { - let parts: Vec<_> = self.path_segments()?.collect(); - if parts.len() < 2 { - // Not a valid org/repo pair. - // Note: We don't check for exactly 2 here, because the Github - // API doesn't handle checking individual files inside repos or - // paths like `github.com/org/repo/issues`, so we are more - // permissive and only check for repo existence. This is the - // only way to get a basic check for private repos. Public repos - // are not affected and should work with a normal check. - return None; - } - - let owner = parts[0]; - if GITHUB_API_EXCLUDED_ENDPOINTS.contains(owner) { - return None; - } - - let repo = parts[1]; - // If the URL ends with `.git`, assume this is an SSH URL and strip - // the suffix. See https://github.com/lycheeverse/lychee/issues/384 - let repo = remove_suffix(repo, ".git"); - - let endpoint = if parts.len() > 2 && !parts[2].is_empty() { - Some(parts[2..].join("/")) - } else { - None - }; - - return Some(GithubUri { - owner: owner.to_string(), - repo: repo.to_string(), - endpoint, - }); - } - - None - } - #[inline] #[must_use] /// Check if the URI is a valid mail address @@ -326,18 +221,14 @@ impl Display for Uri { } #[cfg(test)] -mod test { +mod tests { + use super::*; + use crate::test_utils::{mail, website}; use std::{ convert::TryFrom, net::{IpAddr, Ipv4Addr, Ipv6Addr}, }; - use super::Uri; - use crate::{ - test_utils::{mail, website}, - types::uri::GithubUri, - }; - #[test] fn test_ipv4_uri_is_loopback() { let uri = Uri::try_from("http://127.0.0.0").unwrap(); @@ -399,70 +290,4 @@ mod test { Some(IpAddr::V4(Ipv4Addr::new(127, 0, 0, 1))) ); } - - #[test] - fn test_github() { - assert_eq!( - website("http://github.com/lycheeverse/lychee").gh_org_and_repo(), - Some(GithubUri::new("lycheeverse", "lychee")) - ); - - assert_eq!( - website("http://www.github.com/lycheeverse/lychee").gh_org_and_repo(), - Some(GithubUri::new("lycheeverse", "lychee")) - ); - - assert_eq!( - website("https://github.com/lycheeverse/lychee").gh_org_and_repo(), - Some(GithubUri::new("lycheeverse", "lychee")) - ); - - assert_eq!( - website("https://github.com/lycheeverse/lychee/").gh_org_and_repo(), - Some(GithubUri::new("lycheeverse", "lychee")) - ); - - assert_eq!( - website("https://github.com/Microsoft/python-language-server.git").gh_org_and_repo(), - Some(GithubUri::new("Microsoft", "python-language-server")) - ); - - assert_eq!( - website("https://github.com/lycheeverse/lychee/foo/bar").gh_org_and_repo(), - Some(GithubUri::with_endpoint("lycheeverse", "lychee", "foo/bar")) - ); - - assert_eq!( - website("https://github.com/lycheeverse/lychee/blob/master/NON_EXISTENT_FILE.md") - .gh_org_and_repo(), - Some(GithubUri::with_endpoint( - "lycheeverse", - "lychee", - "blob/master/NON_EXISTENT_FILE.md" - )) - ); - } - - #[test] - fn test_github_false_positives() { - assert!(website("https://github.com/sponsors/analysis-tools-dev ") - .gh_org_and_repo() - .is_none()); - - assert!( - website("https://github.com/marketplace/actions/lychee-broken-link-checker") - .gh_org_and_repo() - .is_none() - ); - - assert!(website("https://github.com/features/actions") - .gh_org_and_repo() - .is_none()); - - assert!( - website("https://pkg.go.dev/github.com/Debian/pkg-go-tools/cmd/pgt-gopath") - .gh_org_and_repo() - .is_none() - ); - } }