mirror of
https://github.com/Hopiu/lychee.git
synced 2026-04-18 20:21:04 +00:00
Don't check Twitter URLs (#1147)
Twitter is now completely locked down and requires a login to read tweets. (Temporarily) exclude all Twitter URLs from checking to avoid false positives. For context, see https://github.com/zedeus/nitter/issues/919, https://news.ycombinator.com/item?id=36540957 and https://techcrunch.com/2023/06/30/twitter-now-requires-an-account-to-view-tweets/. Fixes https://github.com/lycheeverse/lychee/issues/1108
This commit is contained in:
parent
8f4907c42c
commit
40ba18794d
3 changed files with 28 additions and 40 deletions
|
|
@ -241,8 +241,10 @@ mod cli {
|
|||
}
|
||||
|
||||
#[test]
|
||||
#[ignore = "Twitter quirk works locally but is flaky on Github (timeout)"]
|
||||
fn test_twitter_quirk() {
|
||||
// Exclude Twitter links because they require login to view tweets.
|
||||
// https://techcrunch.com/2023/06/30/twitter-now-requires-an-account-to-view-tweets/
|
||||
// https://github.com/zedeus/nitter/issues/919
|
||||
fn test_ignored_hosts() {
|
||||
let url = "https://twitter.com/zarfeblong/status/1339742840142872577";
|
||||
|
||||
main_command()
|
||||
|
|
@ -253,7 +255,7 @@ mod cli {
|
|||
.assert()
|
||||
.success()
|
||||
.stdout(contains("1 Total"))
|
||||
.stdout(contains("1 OK"));
|
||||
.stdout(contains("1 Excluded"));
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
|
|
|
|||
|
|
@ -21,6 +21,14 @@ static EXAMPLE_DOMAINS: Lazy<HashSet<&'static str>> =
|
|||
#[cfg(any(test, feature = "check_example_domains"))]
|
||||
static EXAMPLE_DOMAINS: Lazy<HashSet<&'static str>> = Lazy::new(HashSet::new);
|
||||
|
||||
/// Domains that are known to be uncheckable and are therefore skipped.
///
/// The set is built lazily on first access.
static UNSUPPORTED_DOMAINS: Lazy<HashSet<&'static str>> = Lazy::new(|| {
    // Twitter requires an account to view tweets
    // https://news.ycombinator.com/item?id=36540957
    HashSet::from(["twitter.com"])
});
|
||||
|
||||
/// Pre-defined exclusions for known false-positives
|
||||
const FALSE_POSITIVE_PAT: &[&str] = &[
|
||||
r"^https?://schemas.openxmlformats.org",
|
||||
|
|
@ -70,6 +78,20 @@ pub fn is_example_domain(uri: &Uri) -> bool {
|
|||
res
|
||||
}
|
||||
|
||||
#[inline]
|
||||
#[must_use]
|
||||
/// Check if the host belongs to a known unsupported domain
|
||||
pub fn is_unsupported_domain(uri: &Uri) -> bool {
|
||||
if let Some(domain) = uri.domain() {
|
||||
// It is not enough to use `UNSUPPORTED_DOMAINS.contains(domain)` here
|
||||
// as this would not include checks for subdomains, such as
|
||||
// `foo.example.com`
|
||||
UNSUPPORTED_DOMAINS.iter().any(|tld| domain.ends_with(tld))
|
||||
} else {
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
/// A generic URI filter
|
||||
/// Used to decide if a given URI should be checked or skipped
|
||||
#[allow(clippy::struct_excessive_bools)]
|
||||
|
|
@ -179,6 +201,7 @@ impl Filter {
|
|||
|| self.is_host_excluded(uri)
|
||||
|| self.is_scheme_excluded(uri)
|
||||
|| is_example_domain(uri)
|
||||
|| is_unsupported_domain(uri)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -5,8 +5,6 @@ use regex::Regex;
|
|||
use reqwest::{Request, Url};
|
||||
use std::collections::HashMap;
|
||||
|
||||
static TWITTER_PATTERN: Lazy<Regex> =
|
||||
Lazy::new(|| Regex::new(r"^(https?://)?(www\.)?twitter.com").unwrap());
|
||||
static CRATES_PATTERN: Lazy<Regex> =
|
||||
Lazy::new(|| Regex::new(r"^(https?://)?(www\.)?crates.io").unwrap());
|
||||
static YOUTUBE_PATTERN: Lazy<Regex> =
|
||||
|
|
@ -33,13 +31,6 @@ pub(crate) struct Quirks {
|
|||
impl Default for Quirks {
|
||||
fn default() -> Self {
|
||||
let quirks = vec![
|
||||
Quirk {
|
||||
pattern: &TWITTER_PATTERN,
|
||||
rewrite: |mut request| {
|
||||
request.url_mut().set_host(Some("nitter.net")).unwrap();
|
||||
request
|
||||
},
|
||||
},
|
||||
Quirk {
|
||||
pattern: &CRATES_PATTERN,
|
||||
rewrite: |mut request| {
|
||||
|
|
@ -118,34 +109,6 @@ mod tests {
|
|||
}
|
||||
}
|
||||
|
||||
/// Applying the default quirks to a GET request for a twitter.com URL must
/// yield a request equal to a GET request for the paired nitter.net URL.
#[test]
fn test_twitter_request() {
    // (URL as given, URL expected after the quirk has been applied)
    let cases = [
        (
            "https://twitter.com/search?q=rustlang",
            "https://nitter.net/search?q=rustlang",
        ),
        ("http://twitter.com/jack", "http://nitter.net/jack"),
        (
            "https://twitter.com/notifications",
            "https://nitter.net/notifications",
        ),
    ];

    for (original, rewritten) in cases {
        let source_url = Url::parse(original).unwrap();
        let target_url = Url::parse(rewritten).unwrap();

        let rewritten_request = Quirks::default().apply(Request::new(Method::GET, source_url));
        let expected_request = MockRequest::new(Method::GET, target_url);

        assert_eq!(MockRequest(rewritten_request), expected_request);
    }
}
|
||||
|
||||
#[test]
|
||||
fn test_cratesio_request() {
|
||||
let url = Url::parse("https://crates.io/crates/lychee").unwrap();
|
||||
|
|
|
|||
Loading…
Reference in a new issue