Merge pull request #7 from pawroman/exclude-private-urls

Add exclude private URLs feature
This commit is contained in:
Matthias 2020-10-18 00:28:30 +02:00 committed by GitHub
commit dd73d6e145
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
8 changed files with 383 additions and 45 deletions

86
Cargo.lock generated
View file

@ -68,6 +68,19 @@ version = "0.9.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "71938f30533e4d95a6d17aa530939da3842c2ab6f4f84b9dae68447e4129f74a"
[[package]]
name = "assert_cmd"
version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c88b9ca26f9c16ec830350d309397e74ee9abdfd8eb1f71cb6ecc71a3fc818da"
dependencies = [
"doc-comment",
"predicates",
"predicates-core",
"predicates-tree",
"wait-timeout",
]
[[package]]
name = "async-channel"
version = "1.4.0"
@ -577,6 +590,18 @@ version = "2.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d4d0e2d24e5ee3b23a01de38eefdcd978907890701f08ffffd4cb457ca4ee8d6"
[[package]]
name = "difference"
version = "2.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "524cbf6897b527295dff137cec09ecf3a05f4fddffd7dfcd1585403449e74198"
[[package]]
name = "doc-comment"
version = "0.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fea41bba32d969b513997752735605054bc0dfa92b4c56bf1189f2e174be7a10"
[[package]]
name = "dtoa"
version = "0.4.6"
@ -675,6 +700,15 @@ dependencies = [
"miniz_oxide",
]
[[package]]
name = "float-cmp"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e1267f4ac4f343772758f7b1bdcbe767c218bbab93bb432acbf5162bbf85a6c4"
dependencies = [
"num-traits",
]
[[package]]
name = "fnv"
version = "1.0.7"
@ -1317,6 +1351,7 @@ name = "lychee"
version = "0.3.0"
dependencies = [
"anyhow",
"assert_cmd",
"check-if-email-exists",
"futures",
"glob",
@ -1326,6 +1361,7 @@ dependencies = [
"indicatif",
"linkify",
"log",
"predicates",
"pretty_env_logger",
"regex",
"reqwest",
@ -1525,6 +1561,12 @@ dependencies = [
"version_check",
]
[[package]]
name = "normalize-line-endings"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "61807f77802ff30975e01f4f071c8ba10c022052f98b3294119f3e615d13e5be"
[[package]]
name = "nuclei"
version = "0.1.1"
@ -1753,6 +1795,35 @@ version = "0.2.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c36fa947111f5c62a733b652544dd0016a43ce89619538a8ef92724a6f501a20"
[[package]]
name = "predicates"
version = "1.0.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "96bfead12e90dccead362d62bb2c90a5f6fc4584963645bc7f71a735e0b0735a"
dependencies = [
"difference",
"float-cmp",
"normalize-line-endings",
"predicates-core",
"regex",
]
[[package]]
name = "predicates-core"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "06075c3a3e92559ff8929e7a280684489ea27fe44805174c3ebd9328dcb37178"
[[package]]
name = "predicates-tree"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8e63c4859013b38a76eca2414c64911fba30def9e3202ac461a2d22831220124"
dependencies = [
"predicates-core",
"treeline",
]
[[package]]
name = "pretty_env_logger"
version = "0.4.0"
@ -2374,6 +2445,12 @@ dependencies = [
"tracing-serde",
]
[[package]]
name = "treeline"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a7f741b240f1a48843f9b8e0444fb55fb2a4ff67293b50a9179dfd5ea67f8d41"
[[package]]
name = "trust-dns-proto"
version = "0.19.5"
@ -2508,6 +2585,15 @@ version = "0.9.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b5a972e5669d67ba988ce3dc826706fb0a8b01471c088cb0b6110b805cc36aed"
[[package]]
name = "wait-timeout"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9f200f5b12eb75f8c1ed65abd4b2db8a6e1b138a20de009dacee265a2498f3f6"
dependencies = [
"libc",
]
[[package]]
name = "waker-fn"
version = "1.0.0"

View file

@ -34,3 +34,5 @@ version = "0.2"
[dev-dependencies]
wiremock = "0.2.4"
assert_cmd = "1.0"
predicates = "1.0"

View file

@ -0,0 +1,12 @@
Test file: "private" URLs (should all be excluded when using `-E` flag).
- Loopback: http://127.0.0.1
- Link-local 1: http://169.254.0.1
- Link-local 2: https://169.254.10.1:8080
- Private class A: http://10.0.1.1
- Private class B: http://172.16.42.42
- Private class C: http://192.168.10.1
IPv6:
- Loopback: http://[::1]

View file

@ -1,4 +1,5 @@
use crate::extract::{self, Uri};
use crate::options::LycheeOptions;
use anyhow::anyhow;
use anyhow::{Context, Result};
use check_if_email_exists::{check_email, CheckEmailInput};
@ -6,6 +7,7 @@ use hubcaps::{Credentials, Github};
use indicatif::ProgressBar;
use regex::{Regex, RegexSet};
use reqwest::header::{self, HeaderMap, HeaderValue};
use std::net::IpAddr;
use std::{collections::HashSet, convert::TryFrom, time::Duration};
use url::Url;
@ -68,12 +70,46 @@ impl From<reqwest::Error> for Status {
}
}
/// Exclude configuration for the link checker.
///
/// Bundles the user-supplied regex excludes with the switches that control
/// exclusion of URLs whose host is a literal IP address.
pub(crate) struct Excludes {
/// Patterns matched against the URI string; `None` means no regex excludes.
regex: Option<RegexSet>,
/// Exclude hosts that are IPv4 addresses in a private range.
private_ips: bool,
/// Exclude hosts that are IPv4 link-local addresses.
link_local_ips: bool,
/// Exclude hosts that are loopback addresses (IPv4 or IPv6).
loopback_ips: bool,
}
impl Excludes {
    /// Builds the exclude configuration from the parsed command-line options.
    ///
    /// `exclude_all_private` turns on all "private" excludes (private IPs,
    /// link-local IPs and loopback IPs) regardless of the individual flags.
    ///
    /// If the patterns in `options.exclude` cannot be compiled, regex
    /// excludes are disabled and a warning is written to stderr, so that a
    /// mistyped pattern does not silently cause every link to be checked.
    pub fn from_options(options: &LycheeOptions) -> Self {
        // Each individual switch is on when set itself or via the master switch.
        let enable_exclude = |opt: bool| opt || options.exclude_all_private;

        let regex = match RegexSet::new(&options.exclude) {
            Ok(set) => Some(set),
            Err(err) => {
                // Keep going (best effort), but make the misconfiguration visible.
                eprintln!(
                    "Warning: ignoring exclude patterns, failed to compile: {}",
                    err
                );
                None
            }
        };

        Self {
            regex,
            private_ips: enable_exclude(options.exclude_private),
            link_local_ips: enable_exclude(options.exclude_link_local),
            loopback_ips: enable_exclude(options.exclude_loopback),
        }
    }
}
impl Default for Excludes {
fn default() -> Self {
Self {
regex: None,
private_ips: false,
link_local_ips: false,
loopback_ips: false,
}
}
}
/// A link checker using an API token for Github links
/// otherwise a normal HTTP client.
pub(crate) struct Checker<'a> {
reqwest_client: reqwest::Client,
github: Github,
excludes: Option<RegexSet>,
excludes: Excludes,
scheme: Option<String>,
method: RequestMethod,
accepted: Option<HashSet<reqwest::StatusCode>>,
@ -85,7 +121,7 @@ impl<'a> Checker<'a> {
/// Creates a new link checker
pub fn try_new(
token: String,
excludes: Option<RegexSet>,
excludes: Excludes,
max_redirects: usize,
user_agent: String,
allow_insecure: bool,
@ -195,8 +231,8 @@ impl<'a> Checker<'a> {
}
}
fn in_excludes(&self, input: &str) -> bool {
if let Some(excludes) = &self.excludes {
fn in_regex_excludes(&self, input: &str) -> bool {
if let Some(excludes) = &self.excludes.regex {
if excludes.is_match(input) {
return true;
}
@ -204,8 +240,35 @@ impl<'a> Checker<'a> {
false
}
/// Returns `true` when the URI's host is a literal IP address that falls
/// into one of the enabled IP exclude categories.
fn in_ip_excludes(&self, uri: &Uri) -> bool {
    let rules = &self.excludes;
    match uri.host_ip() {
        // Host is a domain name (or the URI has no host): nothing to exclude.
        None => false,
        // Loopback applies to both IPv4 and IPv6.
        Some(addr) if rules.loopback_ips && addr.is_loopback() => true,
        Some(IpAddr::V4(v4addr)) => {
            (rules.private_ips && v4addr.is_private())
                || (rules.link_local_ips && v4addr.is_link_local())
        }
        // Note: in a pathological case, an IPv6 address can be IPv4-mapped
        // (an IPv4 address embedded in an IPv6 one). We purposefully don't
        // deal with it here, and assume that if an address is IPv6, we
        // shouldn't attempt to map it to IPv4.
        // See: https://tools.ietf.org/html/rfc4291#section-2.5.5.2
        Some(IpAddr::V6(_)) => false,
    }
}
pub fn excluded(&self, uri: &Uri) -> bool {
if self.in_excludes(uri.as_str()) {
if self.in_regex_excludes(uri.as_str()) {
return true;
}
if self.in_ip_excludes(&uri) {
return true;
}
if self.scheme.is_none() {
@ -290,10 +353,29 @@ mod test {
use wiremock::matchers::method;
use wiremock::{Mock, MockServer, ResponseTemplate};
// Note: the standard library as of Rust stable 1.47.0 does not expose
// "link-local" or "private" IPv6 checks. However, one might argue
// that these concepts do exist in IPv6, albeit the naming is different.
// See: https://en.wikipedia.org/wiki/Link-local_address#IPv6
// See: https://en.wikipedia.org/wiki/Private_network#IPv6
// See: https://doc.rust-lang.org/stable/std/net/struct.Ipv6Addr.html#method.is_unicast_link_local
const V4_PRIVATE_CLASS_A: &str = "http://10.0.0.1";
const V4_PRIVATE_CLASS_B: &str = "http://172.16.0.1";
const V4_PRIVATE_CLASS_C: &str = "http://192.168.0.1";
const V4_LOOPBACK: &str = "http://127.0.0.1";
const V6_LOOPBACK: &str = "http://[::1]";
const V4_LINK_LOCAL: &str = "http://169.254.0.1";
// IPv4-Mapped IPv6 addresses (IPv4 embedded in IPv6)
const V6_MAPPED_V4_PRIVATE_CLASS_A: &str = "http://[::ffff:10.0.0.1]";
const V6_MAPPED_V4_LINK_LOCAL: &str = "http://[::ffff:169.254.0.1]";
fn get_checker(allow_insecure: bool, custom_headers: HeaderMap) -> Checker<'static> {
let checker = Checker::try_new(
"DUMMY_GITHUB_TOKEN".to_string(),
None,
Excludes::default(),
5,
"curl/7.71.1".to_string(),
allow_insecure,
@ -309,12 +391,14 @@ mod test {
checker
}
/// Test helper: parses `s` into a `Uri::Website`, panicking on an invalid URL.
fn website_url(s: &str) -> Uri {
    let parsed = Url::parse(s).expect("Expected valid Website Uri");
    Uri::Website(parsed)
}
#[tokio::test]
async fn test_nonexistent() {
let res = get_checker(false, HeaderMap::new())
.check(&Uri::Website(
Url::parse("https://endler.dev/abcd").unwrap(),
))
.check(&website_url("https://endler.dev/abcd"))
.await;
assert!(matches!(res, Status::Failed(_)));
}
@ -332,9 +416,7 @@ mod test {
async fn test_github() {
assert!(matches!(
get_checker(false, HeaderMap::new())
.check(&Uri::Website(
Url::parse("https://github.com/mre/idiomatic-rust").unwrap()
))
.check(&website_url("https://github.com/mre/idiomatic-rust"))
.await,
Status::Ok(_)
));
@ -343,8 +425,8 @@ mod test {
#[tokio::test]
async fn test_github_nonexistent() {
let res = get_checker(false, HeaderMap::new())
.check(&Uri::Website(
Url::parse("https://github.com/mre/idiomatic-rust-doesnt-exist-man").unwrap(),
.check(&website_url(
"https://github.com/mre/idiomatic-rust-doesnt-exist-man",
))
.await;
assert!(matches!(res, Status::Error(_)));
@ -353,7 +435,7 @@ mod test {
#[tokio::test]
async fn test_non_github() {
let res = get_checker(false, HeaderMap::new())
.check(&Uri::Website(Url::parse("https://endler.dev").unwrap()))
.check(&website_url("https://endler.dev"))
.await;
assert!(matches!(res, Status::Ok(_)));
}
@ -361,17 +443,13 @@ mod test {
#[tokio::test]
async fn test_invalid_ssl() {
let res = get_checker(false, HeaderMap::new())
.check(&Uri::Website(
Url::parse("https://expired.badssl.com/").unwrap(),
))
.check(&website_url("https://expired.badssl.com/"))
.await;
assert!(matches!(res, Status::Error(_)));
// Same, but ignore certificate error
let res = get_checker(true, HeaderMap::new())
.check(&Uri::Website(
Url::parse("https://expired.badssl.com/").unwrap(),
))
.check(&website_url("https://expired.badssl.com/"))
.await;
assert!(matches!(res, Status::Ok(_)));
}
@ -379,9 +457,7 @@ mod test {
#[tokio::test]
async fn test_custom_headers() {
let res = get_checker(false, HeaderMap::new())
.check(&Uri::Website(
Url::parse("https://crates.io/keywords/cassandra").unwrap(),
))
.check(&website_url("https://crates.io/keywords/cassandra"))
.await;
assert!(matches!(res, Status::Failed(StatusCode::NOT_FOUND)));
@ -391,9 +467,7 @@ mod test {
let mut custom = HeaderMap::new();
custom.insert(header::ACCEPT, "text/html".parse().unwrap());
let res = get_checker(true, custom)
.check(&Uri::Website(
Url::parse("https://crates.io/keywords/cassandra").unwrap(),
))
.check(&website_url("https://crates.io/keywords/cassandra"))
.await;
assert!(matches!(res, Status::Ok(_)));
}
@ -411,20 +485,21 @@ mod test {
.await;
let res = get_checker(false, HeaderMap::new())
.check(&Uri::Website(Url::parse(&mock_server.uri()).unwrap()))
.check(&website_url(&mock_server.uri()))
.await;
println!("{:?}", res);
assert!(matches!(res, Status::Timeout));
}
#[tokio::test]
async fn test_exclude() {
let excludes =
RegexSet::new(&[r"github.com", r"[a-z]+\.(org|net)", r"@example.com"]).unwrap();
async fn test_exclude_regex() {
let mut excludes = Excludes::default();
excludes.regex =
Some(RegexSet::new(&[r"github.com", r"[a-z]+\.(org|net)", r"@example.com"]).unwrap());
let checker = Checker::try_new(
"DUMMY_GITHUB_TOKEN".to_string(),
Some(excludes),
excludes,
5,
"curl/7.71.1".to_string(),
true,
@ -437,14 +512,8 @@ mod test {
None,
)
.unwrap();
assert_eq!(
checker.excluded(&Uri::Website(Url::parse("http://github.com").unwrap())),
true
);
assert_eq!(
checker.excluded(&Uri::Website(Url::parse("http://exclude.org").unwrap())),
true
);
assert_eq!(checker.excluded(&website_url("http://github.com")), true);
assert_eq!(checker.excluded(&website_url("http://exclude.org")), true);
assert_eq!(
checker.excluded(&Uri::Mail("mail@example.com".to_string())),
true
@ -454,4 +523,89 @@ mod test {
false
);
}
#[test]
fn test_const_sanity() {
    // Parses `raw` and returns an owned copy of its host component.
    fn host_of(raw: &str) -> url::Host<String> {
        Url::parse(raw)
            .expect("Expected valid URL")
            .host()
            .expect("Expected host address")
            .to_owned()
    }

    // Extracts the IPv4 address from the host of `raw`.
    fn v4_of(raw: &str) -> std::net::Ipv4Addr {
        if let url::Host::Ipv4(addr) = host_of(raw) {
            addr
        } else {
            panic!("Not IPv4")
        }
    }

    // Extracts the IPv6 address from the host of `raw`.
    fn v6_of(raw: &str) -> std::net::Ipv6Addr {
        if let url::Host::Ipv6(addr) = host_of(raw) {
            addr
        } else {
            panic!("Not IPv6")
        }
    }

    // The test constants must really belong to the categories their names claim.
    assert!(v4_of(V4_PRIVATE_CLASS_A).is_private());
    assert!(v4_of(V4_PRIVATE_CLASS_B).is_private());
    assert!(v4_of(V4_PRIVATE_CLASS_C).is_private());

    assert!(v4_of(V4_LOOPBACK).is_loopback());
    assert!(v6_of(V6_LOOPBACK).is_loopback());

    assert!(v4_of(V4_LINK_LOCAL).is_link_local());
}
#[test]
fn test_excludes_no_private_ips_by_default() {
    // With no exclude switch flipped on the checker, none of the "private"
    // addresses may be reported as excluded.
    let checker = get_checker(false, HeaderMap::new());
    let candidates = [
        V4_PRIVATE_CLASS_A,
        V4_PRIVATE_CLASS_B,
        V4_PRIVATE_CLASS_C,
        V4_LINK_LOCAL,
        V4_LOOPBACK,
        V6_LOOPBACK,
    ];

    for candidate in candidates.iter() {
        assert!(!checker.excluded(&website_url(candidate)));
    }
}
#[test]
fn test_exclude_private() {
    // Enable only the private-range exclude and probe one address per class.
    let mut checker = get_checker(false, HeaderMap::new());
    checker.excludes.private_ips = true;

    let private_urls = [V4_PRIVATE_CLASS_A, V4_PRIVATE_CLASS_B, V4_PRIVATE_CLASS_C];
    for private_url in private_urls.iter() {
        assert!(checker.excluded(&website_url(private_url)));
    }
}
#[test]
fn test_exclude_link_local() {
    // Enable only the link-local exclude.
    let mut checker = get_checker(false, HeaderMap::new());
    checker.excludes.link_local_ips = true;

    let link_local = website_url(V4_LINK_LOCAL);
    assert!(checker.excluded(&link_local));
}
#[test]
fn test_exclude_loopback() {
    // Enable only the loopback exclude; it must cover IPv4 and IPv6 alike.
    let mut checker = get_checker(false, HeaderMap::new());
    checker.excludes.loopback_ips = true;

    for loopback in [V4_LOOPBACK, V6_LOOPBACK].iter() {
        assert!(checker.excluded(&website_url(loopback)));
    }
}
#[test]
fn test_exclude_ip_v4_mapped_ip_v6_not_supported() {
    let mut checker = get_checker(false, HeaderMap::new());
    checker.excludes.private_ips = true;
    checker.excludes.link_local_ips = true;

    // If these were pure IPv4 addresses they would be excluded; embedded in
    // an IPv6 address they are deliberately left alone.
    let mapped_private = website_url(V6_MAPPED_V4_PRIVATE_CLASS_A);
    let mapped_link_local = website_url(V6_MAPPED_V4_LINK_LOCAL);

    assert!(!checker.excluded(&mapped_private));
    assert!(!checker.excluded(&mapped_link_local));
}
}

View file

@ -1,5 +1,6 @@
use linkify::LinkFinder;
use std::net::IpAddr;
use std::{collections::HashSet, fmt::Display};
use url::Url;
@ -23,6 +24,17 @@ impl Uri {
Uri::Mail(_address) => None,
}
}
/// Returns the host of this URI as an IP address, if the host is a literal
/// IPv4 or IPv6 address.
///
/// Yields `None` for mail addresses, for websites without a host, and for
/// websites whose host is a domain name.
pub fn host_ip(&self) -> Option<IpAddr> {
    let url = match self {
        Self::Website(url) => url,
        Self::Mail(_) => return None,
    };

    match url.host() {
        Some(url::Host::Ipv4(v4_addr)) => Some(IpAddr::V4(v4_addr)),
        Some(url::Host::Ipv6(v6_addr)) => Some(IpAddr::V6(v6_addr)),
        _ => None,
    }
}
}
impl Display for Uri {
@ -56,6 +68,7 @@ pub(crate) fn extract_links(input: &str) -> HashSet<Uri> {
mod test {
use super::*;
use std::iter::FromIterator;
use std::net::{Ipv4Addr, Ipv6Addr};
#[test]
fn test_extract_markdown_links() {
@ -113,4 +126,30 @@ mod test {
assert!(links.len() == 1);
assert_eq!(links[0].as_str(), expected);
}
#[test]
fn test_uri_host_ip_v4() {
    // A literal IPv4 host must come back as the matching `IpAddr::V4`.
    let url = Url::parse("http://127.0.0.1").expect("Expected URI with valid IPv4");
    let ip = Uri::Website(url)
        .host_ip()
        .expect("Expected a valid IPv4");

    let expected: IpAddr = Ipv4Addr::new(127, 0, 0, 1).into();
    assert_eq!(ip, expected);
}
#[test]
fn test_uri_host_ip_v6() {
    // A bracketed IPv6 host must come back as the matching `IpAddr::V6`.
    let url = Url::parse("https://[2020::0010]").expect("Expected URI with valid IPv6");
    let ip = Uri::Website(url)
        .host_ip()
        .expect("Expected a valid IPv6");

    let expected = IpAddr::V6(Ipv6Addr::from([0x2020, 0, 0, 0, 0, 0, 0, 0x10]));
    assert_eq!(ip, expected);
}
#[test]
fn test_uri_host_ip_no_ip() {
    // A domain-name host is not an IP address, so no IP may be reported.
    let url = Url::parse("https://some.cryptic/url").expect("Expected valid URI");
    let uri = Uri::Website(url);
    assert_eq!(uri.host_ip(), None);
}
}

View file

@ -6,7 +6,6 @@ use anyhow::Result;
use futures::future::join_all;
use gumdrop::Options;
use indicatif::{ProgressBar, ProgressStyle};
use regex::RegexSet;
use reqwest::header::{HeaderMap, HeaderName};
use std::{collections::HashSet, convert::TryInto, env, time::Duration};
@ -15,7 +14,7 @@ mod collector;
mod extract;
mod options;
use checker::{Checker, Status};
use checker::{Checker, Excludes, Status};
use extract::Uri;
use options::LycheeOptions;
@ -31,7 +30,7 @@ fn print_summary(found: &HashSet<Uri>, results: &Vec<Status>) {
.count();
let errors: usize = found - excluded - success;
println!("");
println!();
println!("📝Summary");
println!("-------------------");
println!("🔍Found: {}", found);
@ -60,7 +59,7 @@ fn main() -> Result<()> {
}
async fn run(opts: LycheeOptions) -> Result<i32> {
let excludes = RegexSet::new(opts.exclude).unwrap();
let excludes = Excludes::from_options(&opts);
let headers = parse_headers(opts.headers)?;
let accepted = match opts.accept {
Some(accept) => parse_statuscodes(accept)?,
@ -82,7 +81,7 @@ async fn run(opts: LycheeOptions) -> Result<i32> {
};
let checker = Checker::try_new(
env::var("GITHUB_TOKEN")?,
Some(excludes),
excludes,
opts.max_redirects,
opts.user_agent,
opts.insecure,

View file

@ -38,6 +38,21 @@ pub(crate) struct LycheeOptions {
#[options(help = "Exclude URLs from checking (supports regex)")]
pub exclude: Vec<String>,
// Master switch: enables the private, link-local and loopback excludes at once.
#[options(
    help = "Exclude all private IPs from checking, equivalent to `--exclude-private --exclude-link-local --exclude-loopback`",
    short = "E"
)]
pub exclude_all_private: bool,
#[options(help = "Exclude private IP address ranges from checking", no_short)]
pub exclude_private: bool,
#[options(help = "Exclude link-local IP address range from checking", no_short)]
pub exclude_link_local: bool,
#[options(help = "Exclude loopback IP address range from checking", no_short)]
pub exclude_loopback: bool,
// Accumulate all headers in a vector
#[options(help = "Custom request headers")]
pub headers: Vec<String>,

31
tests/cli.rs Normal file
View file

@ -0,0 +1,31 @@
#[cfg(test)]
mod cli {
use assert_cmd::Command;
use predicates::str::contains;
use std::path::Path;
/// Runs the binary with `--exclude-all-private` against the private-URL
/// fixture and checks that every link found is reported as excluded.
#[test]
fn test_exclude_all_private() {
    // this gets the "main" binary name (e.g. `lychee`)
    let mut cmd =
        Command::cargo_bin(env!("CARGO_PKG_NAME")).expect("Couldn't get cargo package name");

    // Anchor the fixture on the crate root. `module_path!()` expands to a
    // Rust module path (e.g. `cli::cli`), not a filesystem location, so
    // deriving the directory from it only works when the current working
    // directory happens to be the crate root.
    let test_all_private_path = Path::new(env!("CARGO_MANIFEST_DIR"))
        .join("fixtures")
        .join("TEST_ALL_PRIVATE.md");

    // assert that the command runs OK, and that it excluded all the links
    cmd.env("GITHUB_TOKEN", "invalid-token")
        .arg("--exclude-all-private")
        .arg("--verbose")
        .arg(test_all_private_path)
        .assert()
        .success()
        .stdout(contains("Found: 7"))
        .stdout(contains("Excluded: 7"))
        .stdout(contains("Successful: 0"))
        .stdout(contains("Errors: 0"));
}
}