Only accept two path segments (org/repo) for Github API check

This commit is contained in:
Matthias 2022-01-10 21:57:05 +01:00
parent 7667842bb6
commit e91c0c60f0
2 changed files with 44 additions and 23 deletions

View file

@ -262,7 +262,7 @@ impl Client {
}
// Pull out the heavy weapons in case of a failed normal request.
// This could be a Github URL and we run into the rate limiter.
if let Some((owner, repo)) = uri.extract_github() {
if let Some((owner, repo)) = uri.gh_org_and_repo() {
return self.check_github(owner, repo).await;
}

View file

@ -84,17 +84,12 @@ impl Uri {
}
// TODO: Support GitLab etc.
pub(crate) fn extract_github(&self) -> Option<(&str, &str)> {
// Custom function to return a suffix.
// We could use [str::strip_suffix], but it returns an `Option` and is
// rather tedious to use in our case. There is also [str::trim_end_matches],
// but we only want to remove a single occurrence at the end.
pub(crate) fn gh_org_and_repo(&self) -> Option<(&str, &str)> {
fn remove_suffix<'a>(input: &'a str, suffix: &str) -> &'a str {
if input.ends_with(suffix) {
&input[..input.len() - suffix.len()]
} else {
input
if let Some(stripped) = input.strip_suffix(suffix) {
return stripped;
}
input
}
debug_assert!(!self.is_mail(), "Should only be called on a Website type!");
@ -103,13 +98,22 @@ impl Uri {
self.domain()?,
"github.com" | "www.github.com" | "raw.githubusercontent.com"
) {
let mut segments = self.path_segments()?;
let owner = segments.next()?;
let parts: Vec<_> = self.path_segments()?.collect();
if parts.len() != 2 {
// Accept additional _empty_ path segment after last slash
if !(parts.len() == 3 && parts[2].is_empty()) {
// Not a valid org/repo pair.
// Skip this as the API doesn't handle files etc.
return None;
}
}
let owner = parts[0];
if GITHUB_EXCLUDED_ENDPOINTS.contains(owner) {
return None;
}
let repo = segments.next()?;
let repo = parts[1];
// If the URL ends with `.git`, assume this is an SSH URL and strip
// the suffix. See https://github.com/lycheeverse/lychee/issues/384
let repo = remove_suffix(repo, ".git");
@ -263,45 +267,62 @@ mod test {
}
#[test]
fn test_is_github() {
fn test_github() {
assert_eq!(
website("http://github.com/lycheeverse/lychee").extract_github(),
website("http://github.com/lycheeverse/lychee").gh_org_and_repo(),
Some(("lycheeverse", "lychee"))
);
assert_eq!(
website("http://www.github.com/lycheeverse/lychee").extract_github(),
website("http://www.github.com/lycheeverse/lychee").gh_org_and_repo(),
Some(("lycheeverse", "lychee"))
);
assert_eq!(
website("https://github.com/lycheeverse/lychee").extract_github(),
website("https://github.com/lycheeverse/lychee").gh_org_and_repo(),
Some(("lycheeverse", "lychee"))
);
assert_eq!(
website("https://github.com/Microsoft/python-language-server.git").extract_github(),
website("https://github.com/lycheeverse/lychee/").gh_org_and_repo(),
Some(("lycheeverse", "lychee"))
);
assert_eq!(
website("https://github.com/Microsoft/python-language-server.git").gh_org_and_repo(),
Some(("Microsoft", "python-language-server"))
);
}
#[test]
fn test_github_false_positives() {
assert!(website("https://github.com/lycheeverse/lychee/foo/bar")
.gh_org_and_repo()
.is_none());
assert!(
website("https://github.com/lycheeverse/lychee/blob/master/NON_EXISTENT_FILE.md")
.gh_org_and_repo()
.is_none()
);
// Check known false positives
assert!(website("https://github.com/sponsors/analysis-tools-dev ")
.extract_github()
.gh_org_and_repo()
.is_none());
assert!(
website("https://github.com/marketplace/actions/lychee-broken-link-checker")
.extract_github()
.gh_org_and_repo()
.is_none()
);
assert!(website("https://github.com/features/actions")
.extract_github()
.gh_org_and_repo()
.is_none());
assert!(
website("https://pkg.go.dev/github.com/Debian/pkg-go-tools/cmd/pgt-gopath")
.extract_github()
.gh_org_and_repo()
.is_none()
);
}