Refactor request types (#637)

This commit is contained in:
Matthias 2022-06-03 20:13:07 +02:00 committed by GitHub
parent 96da3d64c0
commit 84de43c554
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
17 changed files with 232 additions and 202 deletions

View file

@ -63,7 +63,7 @@ pub(crate) fn parse_statuscodes<T: AsRef<str>>(accept: T) -> Result<HashSet<u16>
}
#[cfg(test)]
mod test {
mod tests {
use std::collections::HashSet;
use headers::{HeaderMap, HeaderMapExt};

View file

@ -76,7 +76,7 @@ impl ResponseStats {
}
#[cfg(test)]
mod test {
mod tests {
use std::collections::{HashMap, HashSet};
use http::StatusCode;

View file

@ -31,7 +31,7 @@ use crate::{
filter::{Excludes, Filter, Includes},
quirks::Quirks,
remap::Remaps,
types::{mail, GithubUri},
types::{mail, uri::github::GithubUri},
ErrorKind, Request, Response, Result, Status, Uri,
};
@ -470,7 +470,7 @@ impl Client {
// Pull out the heavy machinery in case of a failed normal request.
// This could be a GitHub URL and we ran into the rate limiter.
if let Some(github_uri) = uri.gh_org_and_repo() {
if let Ok(github_uri) = GithubUri::try_from(uri) {
let status = self.check_github(github_uri).await;
// Only return Github status in case of success
// Otherwise return the original error, which has more information
@ -590,7 +590,7 @@ where
}
#[cfg(test)]
mod test {
mod tests {
use std::{
fs::File,
time::{Duration, Instant},

View file

@ -1,5 +1,5 @@
use crate::{
extract::Extractor, helpers::request, types::raw_uri::RawUri, Base, Input, Request, Result,
extract::Extractor, helpers::request, types::uri::raw::RawUri, Base, Input, Request, Result,
};
use futures::{
stream::{self, Stream},
@ -83,7 +83,7 @@ impl Collector {
}
#[cfg(test)]
mod test {
mod tests {
use std::{collections::HashSet, convert::TryFrom, fs::File, io::Write};
use http::StatusCode;

View file

@ -5,7 +5,7 @@ use html5ever::{
};
use super::{is_verbatim_elem, plaintext::extract_plaintext};
use crate::types::raw_uri::RawUri;
use crate::types::uri::raw::RawUri;
#[derive(Clone, Default)]
struct LinkExtractor {

View file

@ -2,7 +2,7 @@ use html5gum::{Emitter, Error, Tokenizer};
use super::is_verbatim_elem;
use super::plaintext::extract_plaintext;
use crate::types::raw_uri::RawUri;
use crate::types::uri::raw::RawUri;
#[derive(Clone)]
struct LinkExtractor {

View file

@ -1,6 +1,6 @@
use pulldown_cmark::{Event, Parser, Tag};
use crate::{extract::plaintext::extract_plaintext, types::raw_uri::RawUri};
use crate::{extract::plaintext::extract_plaintext, types::uri::raw::RawUri};
use super::html5gum::extract_html;

View file

@ -1,6 +1,6 @@
use std::collections::HashSet;
use crate::types::{raw_uri::RawUri, FileType, InputContent};
use crate::types::{uri::raw::RawUri, FileType, InputContent};
mod html5ever;
mod html5gum;
@ -79,9 +79,9 @@ impl Extractor {
}
#[cfg(test)]
mod test {
mod tests {
use reqwest::Url;
use std::{collections::HashSet, convert::TryFrom, path::Path};
use std::{collections::HashSet, path::Path};
use super::*;
use crate::{

View file

@ -1,4 +1,4 @@
use crate::{helpers::url, types::raw_uri::RawUri};
use crate::{helpers::url, types::uri::raw::RawUri};
/// Extract unparsed URL strings from plaintext
pub(crate) fn extract_plaintext(input: &str) -> Vec<RawUri> {

View file

@ -202,7 +202,7 @@ impl Filter {
}
#[cfg(test)]
mod test {
mod tests {
use regex::RegexSet;
use reqwest::Url;
use url::Host;

View file

@ -8,7 +8,7 @@ use std::{
use crate::{
helpers::{path, url},
types::{raw_uri::RawUri, InputContent, InputSource},
types::{uri::raw::RawUri, InputContent, InputSource},
Base, ErrorKind, Request, Result, Uri,
};

View file

@ -88,7 +88,7 @@ pub use crate::{
collector::Collector,
filter::{Excludes, Filter, Includes},
types::{
Base, CacheStatus, ErrorKind, FileType, Input, InputContent, InputSource, Request,
Response, ResponseBody, Result, Status, Uri,
uri::valid::Uri, Base, CacheStatus, ErrorKind, FileType, Input, InputContent, InputSource,
Request, Response, ResponseBody, Result, Status,
},
};

View file

@ -6,11 +6,10 @@ mod error;
mod file;
mod input;
pub(crate) mod mail;
pub(crate) mod raw_uri;
mod request;
mod response;
mod status;
mod uri;
pub(crate) mod uri;
pub use base::Base;
pub use cache::CacheStatus;
@ -20,7 +19,6 @@ pub use input::{Input, InputContent, InputSource};
pub use request::Request;
pub use response::{Response, ResponseBody};
pub use status::Status;
pub use uri::{GithubUri, Uri};
/// The lychee `Result` type
pub type Result<T> = std::result::Result<T, crate::ErrorKind>;

View file

@ -0,0 +1,204 @@
use lazy_static::lazy_static;
use std::collections::HashSet;
use crate::{ErrorKind, Result, Uri};
lazy_static! {
static ref GITHUB_API_EXCLUDED_ENDPOINTS: HashSet<&'static str> = HashSet::from_iter([
"about",
"collections",
"events",
"explore",
"features",
"issues",
"marketplace",
"new",
"notifications",
"pricing",
"pulls",
"sponsors",
"topics",
"watching",
]);
}
/// Uri path segments extracted from a Github URL
#[derive(PartialEq, Eq, PartialOrd, Ord, Debug)]
pub struct GithubUri {
/// Organization name
pub owner: String,
/// Repository name
pub repo: String,
/// e.g. `issues` in `/org/repo/issues`
pub endpoint: Option<String>,
}
impl GithubUri {
/// Create a new Github URI without an endpoint
#[cfg(test)]
fn new<T: Into<String>>(owner: T, repo: T) -> Self {
GithubUri {
owner: owner.into(),
repo: repo.into(),
endpoint: None,
}
}
#[cfg(test)]
fn with_endpoint<T: Into<String>>(owner: T, repo: T, endpoint: T) -> Self {
GithubUri {
owner: owner.into(),
repo: repo.into(),
endpoint: Some(endpoint.into()),
}
}
// TODO: Support GitLab etc.
fn gh_org_and_repo(uri: &Uri) -> Result<GithubUri> {
fn remove_suffix<'a>(input: &'a str, suffix: &str) -> &'a str {
if let Some(stripped) = input.strip_suffix(suffix) {
return stripped;
}
input
}
debug_assert!(!uri.is_mail(), "Should only be called on a Website type!");
let domain = match uri.domain() {
Some(domain) => domain,
None => return Err(ErrorKind::InvalidGithubUrl(uri.to_string())),
};
if !matches!(
domain,
"github.com" | "www.github.com" | "raw.githubusercontent.com"
) {
return Err(ErrorKind::InvalidGithubUrl(uri.to_string()));
};
let parts: Vec<_> = match uri.path_segments() {
Some(parts) => parts.collect(),
None => return Err(ErrorKind::InvalidGithubUrl(uri.to_string())),
};
if parts.len() < 2 {
// Not a valid org/repo pair.
// Note: We don't check for exactly 2 here, because the Github
// API doesn't handle checking individual files inside repos or
// paths like <github.com/org/repo/issues>, so we are more
// permissive and only check for repo existence. This is the
// only way to get a basic check for private repos. Public repos
// are not affected and should work with a normal check.
return Err(ErrorKind::InvalidGithubUrl(uri.to_string()));
}
let owner = parts[0];
if GITHUB_API_EXCLUDED_ENDPOINTS.contains(owner) {
return Err(ErrorKind::InvalidGithubUrl(uri.to_string()));
}
let repo = parts[1];
// If the URL ends with `.git`, assume this is an SSH URL and strip
// the suffix. See https://github.com/lycheeverse/lychee/issues/384
let repo = remove_suffix(repo, ".git");
let endpoint = if parts.len() > 2 && !parts[2].is_empty() {
Some(parts[2..].join("/"))
} else {
None
};
Ok(GithubUri {
owner: owner.to_string(),
repo: repo.to_string(),
endpoint,
})
}
}
impl TryFrom<Uri> for GithubUri {
type Error = ErrorKind;
fn try_from(uri: Uri) -> Result<Self> {
GithubUri::gh_org_and_repo(&uri)
}
}
impl TryFrom<&Uri> for GithubUri {
type Error = ErrorKind;
fn try_from(uri: &Uri) -> Result<Self> {
GithubUri::gh_org_and_repo(uri)
}
}
#[cfg(test)]
mod tests {
use crate::test_utils::website;
use super::*;
#[test]
fn test_github() {
assert_eq!(
GithubUri::try_from(website("http://github.com/lycheeverse/lychee")).unwrap(),
GithubUri::new("lycheeverse", "lychee")
);
assert_eq!(
GithubUri::try_from(website("http://www.github.com/lycheeverse/lychee")).unwrap(),
GithubUri::new("lycheeverse", "lychee")
);
assert_eq!(
GithubUri::try_from(website("https://github.com/lycheeverse/lychee")).unwrap(),
GithubUri::new("lycheeverse", "lychee")
);
assert_eq!(
GithubUri::try_from(website("https://github.com/lycheeverse/lychee/")).unwrap(),
GithubUri::new("lycheeverse", "lychee")
);
assert_eq!(
GithubUri::try_from(website("https://github.com/lycheeverse/lychee/foo/bar")).unwrap(),
GithubUri::with_endpoint("lycheeverse", "lychee", "foo/bar")
);
assert_eq!(
GithubUri::try_from(website(
"https://github.com/Microsoft/python-language-server.git"
))
.unwrap(),
GithubUri::new("Microsoft", "python-language-server")
);
assert_eq!(
GithubUri::try_from(website(
"https://github.com/lycheeverse/lychee/blob/master/NON_EXISTENT_FILE.md"
))
.unwrap(),
GithubUri::with_endpoint("lycheeverse", "lychee", "blob/master/NON_EXISTENT_FILE.md")
);
}
#[test]
fn test_github_false_positives() {
assert!(
GithubUri::try_from(website("https://github.com/sponsors/analysis-tools-dev "))
.is_err()
);
assert!(GithubUri::try_from(website(
"https://github.com/marketplace/actions/lychee-broken-link-checker"
))
.is_err());
assert!(GithubUri::try_from(website("https://github.com/features/actions")).is_err());
assert!(GithubUri::try_from(website(
"https://pkg.go.dev/github.com/Debian/pkg-go-tools/cmd/pgt-gopath"
))
.is_err());
}
}

View file

@ -0,0 +1,3 @@
pub(crate) mod github;
pub(crate) mod raw;
pub(crate) mod valid;

View file

@ -42,7 +42,7 @@ impl From<&str> for RawUri {
}
#[cfg(test)]
mod test {
mod tests {
use super::*;
#[test]

View file

@ -1,65 +1,13 @@
use std::{collections::HashSet, convert::TryFrom, fmt::Display, net::IpAddr};
use std::{convert::TryFrom, fmt::Display, net::IpAddr};
use fast_chemail::parse_email;
use ip_network::Ipv6Network;
use lazy_static::lazy_static;
use serde::{Deserialize, Serialize};
use url::Url;
use crate::{ErrorKind, Result};
use super::raw_uri::RawUri;
lazy_static! {
static ref GITHUB_API_EXCLUDED_ENDPOINTS: HashSet<&'static str> = HashSet::from_iter([
"about",
"collections",
"events",
"explore",
"features",
"issues",
"marketplace",
"new",
"notifications",
"pricing",
"pulls",
"sponsors",
"topics",
"watching",
]);
}
/// Uri path segments extracted from a Github URL
#[derive(PartialEq, Eq, PartialOrd, Ord, Debug)]
pub struct GithubUri {
/// Organization name
pub owner: String,
/// Repository name
pub repo: String,
/// e.g. `issues` in `/org/repo/issues`
pub endpoint: Option<String>,
}
impl GithubUri {
/// Create a new Github URI without an endpoint
#[cfg(test)]
fn new<T: Into<String>>(owner: T, repo: T) -> Self {
GithubUri {
owner: owner.into(),
repo: repo.into(),
endpoint: None,
}
}
#[cfg(test)]
fn with_endpoint<T: Into<String>>(owner: T, repo: T, endpoint: T) -> Self {
GithubUri {
owner: owner.into(),
repo: repo.into(),
endpoint: Some(endpoint.into()),
}
}
}
use super::raw::RawUri;
/// Lychee's own representation of a URI, which encapsulates all supported
/// formats.
@ -125,59 +73,6 @@ impl Uri {
}
}
// TODO: Support GitLab etc.
pub(crate) fn gh_org_and_repo(&self) -> Option<GithubUri> {
fn remove_suffix<'a>(input: &'a str, suffix: &str) -> &'a str {
if let Some(stripped) = input.strip_suffix(suffix) {
return stripped;
}
input
}
debug_assert!(!self.is_mail(), "Should only be called on a Website type!");
if matches!(
self.domain()?,
"github.com" | "www.github.com" | "raw.githubusercontent.com"
) {
let parts: Vec<_> = self.path_segments()?.collect();
if parts.len() < 2 {
// Not a valid org/repo pair.
// Note: We don't check for exactly 2 here, because the Github
// API doesn't handle checking individual files inside repos or
// paths like `github.com/org/repo/issues`, so we are more
// permissive and only check for repo existence. This is the
// only way to get a basic check for private repos. Public repos
// are not affected and should work with a normal check.
return None;
}
let owner = parts[0];
if GITHUB_API_EXCLUDED_ENDPOINTS.contains(owner) {
return None;
}
let repo = parts[1];
// If the URL ends with `.git`, assume this is an SSH URL and strip
// the suffix. See https://github.com/lycheeverse/lychee/issues/384
let repo = remove_suffix(repo, ".git");
let endpoint = if parts.len() > 2 && !parts[2].is_empty() {
Some(parts[2..].join("/"))
} else {
None
};
return Some(GithubUri {
owner: owner.to_string(),
repo: repo.to_string(),
endpoint,
});
}
None
}
#[inline]
#[must_use]
/// Check if the URI is a valid mail address
@ -326,18 +221,14 @@ impl Display for Uri {
}
#[cfg(test)]
mod test {
mod tests {
use super::*;
use crate::test_utils::{mail, website};
use std::{
convert::TryFrom,
net::{IpAddr, Ipv4Addr, Ipv6Addr},
};
use super::Uri;
use crate::{
test_utils::{mail, website},
types::uri::GithubUri,
};
#[test]
fn test_ipv4_uri_is_loopback() {
let uri = Uri::try_from("http://127.0.0.0").unwrap();
@ -399,70 +290,4 @@ mod test {
Some(IpAddr::V4(Ipv4Addr::new(127, 0, 0, 1)))
);
}
#[test]
fn test_github() {
assert_eq!(
website("http://github.com/lycheeverse/lychee").gh_org_and_repo(),
Some(GithubUri::new("lycheeverse", "lychee"))
);
assert_eq!(
website("http://www.github.com/lycheeverse/lychee").gh_org_and_repo(),
Some(GithubUri::new("lycheeverse", "lychee"))
);
assert_eq!(
website("https://github.com/lycheeverse/lychee").gh_org_and_repo(),
Some(GithubUri::new("lycheeverse", "lychee"))
);
assert_eq!(
website("https://github.com/lycheeverse/lychee/").gh_org_and_repo(),
Some(GithubUri::new("lycheeverse", "lychee"))
);
assert_eq!(
website("https://github.com/Microsoft/python-language-server.git").gh_org_and_repo(),
Some(GithubUri::new("Microsoft", "python-language-server"))
);
assert_eq!(
website("https://github.com/lycheeverse/lychee/foo/bar").gh_org_and_repo(),
Some(GithubUri::with_endpoint("lycheeverse", "lychee", "foo/bar"))
);
assert_eq!(
website("https://github.com/lycheeverse/lychee/blob/master/NON_EXISTENT_FILE.md")
.gh_org_and_repo(),
Some(GithubUri::with_endpoint(
"lycheeverse",
"lychee",
"blob/master/NON_EXISTENT_FILE.md"
))
);
}
#[test]
fn test_github_false_positives() {
assert!(website("https://github.com/sponsors/analysis-tools-dev ")
.gh_org_and_repo()
.is_none());
assert!(
website("https://github.com/marketplace/actions/lychee-broken-link-checker")
.gh_org_and_repo()
.is_none()
);
assert!(website("https://github.com/features/actions")
.gh_org_and_repo()
.is_none());
assert!(
website("https://pkg.go.dev/github.com/Debian/pkg-go-tools/cmd/pgt-gopath")
.gh_org_and_repo()
.is_none()
);
}
}