Cookie Support (#1146)

This is a very conservative and limited implementation of cookie support.

The goal is to ship an MVP, which covers 80% of the use-cases.
When you run lychee with --cookie-jar cookies.json, all cookies will be stored in cookies.json, one cookie per line.
This makes cookies easy to edit by hand if needed, although this is an advanced use-case and the file format is not guaranteed to be stable.

Fixes: #645, #715
Partially fixes: #1108
This commit is contained in:
Matthias Endler 2023-07-13 17:32:41 +02:00 committed by GitHub
parent 40ba18794d
commit 14e748793e
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
16 changed files with 297 additions and 21 deletions

3
.gitignore vendored
View file

@ -21,3 +21,6 @@ Cargo.lock
# Config smoketest report file
.config.dummy.report.md
# Other
cookies.json

88
Cargo.lock generated
View file

@ -730,6 +730,51 @@ dependencies = [
"unicode-xid",
]
[[package]]
name = "cookie"
version = "0.16.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e859cd57d0710d9e06c381b550c06e76992472a8c6d527aecd2fc673dcc231fb"
dependencies = [
"percent-encoding",
"time",
"version_check",
]
[[package]]
name = "cookie_store"
version = "0.16.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d606d0fba62e13cf04db20536c05cb7f13673c161cb47a47a82b9b9e7d3f1daa"
dependencies = [
"cookie",
"idna 0.2.3",
"log",
"publicsuffix",
"serde",
"serde_derive",
"serde_json",
"time",
"url",
]
[[package]]
name = "cookie_store"
version = "0.19.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d5a18f35792056f8c7c2de9c002e7e4fe44c7b5f66e7d99f46468dbb730a7ea7"
dependencies = [
"cookie",
"idna 0.3.0",
"log",
"publicsuffix",
"serde",
"serde_derive",
"serde_json",
"time",
"url",
]
[[package]]
name = "core-foundation"
version = "0.9.3"
@ -1763,6 +1808,16 @@ dependencies = [
"unicode-normalization",
]
[[package]]
name = "idna"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e14ddfc70884202db2244c223200c204c2bda1bc6e0998d11b5e024d657209e6"
dependencies = [
"unicode-bidi",
"unicode-normalization",
]
[[package]]
name = "idna"
version = "0.4.0"
@ -2027,6 +2082,7 @@ dependencies = [
"pretty_assertions",
"regex",
"reqwest",
"reqwest_cookie_store",
"ring",
"secrecy",
"serde",
@ -2072,6 +2128,7 @@ dependencies = [
"pulldown-cmark",
"regex",
"reqwest",
"reqwest_cookie_store",
"ring",
"secrecy",
"serde",
@ -2707,6 +2764,22 @@ dependencies = [
"unicode-ident",
]
[[package]]
name = "psl-types"
version = "2.0.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "33cb294fe86a74cbcf50d4445b37da762029549ebeea341421c7c70370f86cac"
[[package]]
name = "publicsuffix"
version = "2.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "96a8c1bda5ae1af7f99a2962e49df150414a43d62404644d98dd5c3a93d07457"
dependencies = [
"idna 0.3.0",
"psl-types",
]
[[package]]
name = "pulldown-cmark"
version = "0.9.3"
@ -2922,6 +2995,8 @@ dependencies = [
"async-compression",
"base64 0.21.2",
"bytes",
"cookie",
"cookie_store 0.16.2",
"encoding_rs",
"futures-core",
"futures-util",
@ -2959,6 +3034,19 @@ dependencies = [
"winreg 0.10.1",
]
[[package]]
name = "reqwest_cookie_store"
version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "06b407c05de7a0f7e4cc2a56af5e9bd6468e509124e81078ce1f8bc2ed3536bf"
dependencies = [
"bytes",
"cookie",
"cookie_store 0.19.1",
"reqwest",
"url",
]
[[package]]
name = "resolv-conf"
version = "0.7.0"

View file

@ -18,6 +18,10 @@ docker-run: ## Run Docker image
docker-push: ## Push image to Docker Hub
docker push $(IMAGE_NAME)
.PHONY: clean
clean: ## Clean up build artifacts
cargo clean
.PHONY: build
build: ## Build Rust code locally
cargo build

View file

@ -136,6 +136,7 @@ outdated information.
| [Use as library] | ![yes] | ![yes] | ![no] | ![yes] | ![yes] | ![no] | ![yes] | ![no] |
| Quiet mode | ![yes] | ![no] | ![no] | ![no] | ![yes] | ![yes] | ![yes] | ![yes] |
| [Config file] | ![yes] | ![no] | ![no] | ![no] | ![yes] | ![yes] | ![yes] | ![no] |
| Cookies | ![yes] | ![no] | ![yes] | ![no] | ![no] | ![yes] | ![no] | ![yes] |
| Recursion | ![no] | ![no] | ![yes] | ![yes] | ![yes] | ![yes] | ![yes] | ![no] |
| Amazing lychee logo | ![yes] | ![no] | ![no] | ![no] | ![no] | ![no] | ![no] | ![no] |
@ -407,6 +408,9 @@ Options:
--require-https
When HTTPS is available, treat HTTP links as errors
--cookie-jar <COOKIE_JAR>
Tell lychee to read cookies from the given file. Cookies will be stored in the cookie jar and sent with requests. New cookies will be stored in the cookie jar and existing cookies will be updated
-h, --help
Print help (see a summary with '-h')

View file

@ -39,6 +39,7 @@ openssl-sys = { version = "0.9.90", optional = true }
pad = "0.1.6"
regex = "1.9.1"
reqwest = { version = "0.11.18", default-features = false, features = ["gzip", "json"] }
reqwest_cookie_store = "0.5.0"
# Make build work on Apple Silicon.
# See https://github.com/briansmith/ring/issues/1163
# This is necessary for the homebrew build

View file

@ -4,12 +4,13 @@ use anyhow::{Context, Result};
use http::StatusCode;
use lychee_lib::{Client, ClientBuilder};
use regex::RegexSet;
use reqwest_cookie_store::CookieStoreMutex;
use std::sync::Arc;
use std::{collections::HashSet, str::FromStr};
/// Creates a client according to the command-line config
pub(crate) fn create(cfg: &Config) -> Result<Client> {
pub(crate) fn create(cfg: &Config, cookie_jar: Option<&Arc<CookieStoreMutex>>) -> Result<Client> {
let headers = parse_headers(&cfg.header)?;
let timeout = parse_duration_secs(cfg.timeout);
let retry_wait_time = parse_duration_secs(cfg.retry_wait_time);
let method: reqwest::Method = reqwest::Method::from_str(&cfg.method.to_uppercase())?;
@ -56,6 +57,7 @@ pub(crate) fn create(cfg: &Config) -> Result<Client> {
.schemes(HashSet::from_iter(schemes))
.accepted(accepted)
.require_https(cfg.require_https)
.cookie_jar(cookie_jar.cloned())
.build()
.client()
.context("Failed to create request client")

View file

@ -46,7 +46,7 @@ where
let cache = params.cache;
let accept = params.cfg.accept;
let pb = if params.cfg.no_progress {
let pb = if params.cfg.no_progress || params.cfg.verbose.log_level() >= log::Level::Info {
None
} else {
Some(init_progress_bar("Extracting links"))

View file

@ -73,10 +73,13 @@ use log::{error, info, warn};
#[cfg(feature = "native-tls")]
use openssl_sys as _; // required for vendored-openssl feature
use openssl_sys as _;
use options::LYCHEE_CONFIG_FILE;
use ring as _; // required for apple silicon
use lychee_lib::{BasicAuthExtractor, Collector};
use lychee_lib::BasicAuthExtractor;
use lychee_lib::Collector;
use lychee_lib::CookieJar;
mod archive;
mod cache;
@ -188,6 +191,14 @@ fn load_config() -> Result<LycheeOptions> {
Ok(opts)
}
/// Load the cookie jar configured via `cfg.cookie_jar`, if any.
///
/// Returns `Ok(None)` when no cookie jar path is configured. Any error
/// reported by `CookieJar::load` is propagated to the caller.
fn load_cookie_jar(cfg: &Config) -> Result<Option<CookieJar>> {
    let jar = cfg
        .cookie_jar
        .as_ref()
        .map(|path| CookieJar::load(path.clone()))
        .transpose()?;
    Ok(jar)
}
#[must_use]
/// Load cache (if exists and is still valid)
/// This returns an `Option` as starting without a cache is a common scenario
@ -290,13 +301,24 @@ async fn run(opts: &LycheeOptions) -> Result<i32> {
let requests = collector.collect_links(inputs).await;
let client = client::create(&opts.config)?;
let cache = load_cache(&opts.config).unwrap_or_default();
let cache = Arc::new(cache);
let cookie_jar = load_cookie_jar(&opts.config).with_context(|| {
format!(
"Cannot load cookie jar from path `{}`",
opts.config
.cookie_jar
.as_ref()
.map_or_else(|| "<none>".to_string(), |p| p.display().to_string())
)
})?;
let response_formatter: Box<dyn ResponseFormatter> =
formatters::get_formatter(&opts.config.format);
let client = client::create(&opts.config, cookie_jar.as_deref())?;
let params = CommandParams {
client,
cache,
@ -348,6 +370,12 @@ async fn run(opts: &LycheeOptions) -> Result<i32> {
if opts.config.cache {
cache.store(LYCHEE_CACHE_FILE)?;
}
if let Some(cookie_jar) = cookie_jar.as_ref() {
info!("Saving cookie jar");
cookie_jar.save().context("Cannot save cookie jar")?;
}
exit_code
};

View file

@ -356,6 +356,13 @@ pub(crate) struct Config {
#[arg(long)]
#[serde(default)]
pub(crate) require_https: bool,
/// Tell lychee to read cookies from the given file.
/// Cookies will be stored in the cookie jar and sent with requests.
/// New cookies will be stored in the cookie jar and existing cookies will be updated.
#[arg(long)]
#[serde(default)]
pub(crate) cookie_jar: Option<PathBuf>,
}
impl Config {
@ -406,6 +413,7 @@ impl Config {
glob_ignore_case: false;
output: None;
require_https: false;
cookie_jar: None;
}
if self

View file

@ -16,6 +16,7 @@ mod cli {
use pretty_assertions::assert_eq;
use serde::Serialize;
use serde_json::Value;
use tempfile::NamedTempFile;
use uuid::Uuid;
use wiremock::{matchers::basic_auth, Mock, ResponseTemplate};
@ -886,8 +887,12 @@ mod cli {
/// and even if they are invalid, we don't know if they will be valid in the
/// future.
///
/// Since we cannot test this with our mock server (because hyper panics on invalid status codes)
/// we use LinkedIn as a test target.
/// Since we cannot test this with our mock server (because hyper panics on
/// invalid status codes) we use LinkedIn as a test target.
///
/// Unfortunately, LinkedIn does not always return 999, so this is a flaky
/// test. We only check that the cache file doesn't contain any invalid
/// status codes.
#[tokio::test]
async fn test_skip_cache_unknown_status_code() -> Result<()> {
let base_path = fixtures_path().join("cache");
@ -910,13 +915,20 @@ mod cli {
.arg("--")
.arg("-")
.assert()
.stderr(contains(format!("[999] {unknown_url} | Unknown status")));
// LinkedIn does not always return 999, so we cannot check for that
// .stderr(contains(format!("[999] {unknown_url} | Unknown status")))
;
// The cache file should be empty, because the only checked URL is
// unsupported and we don't want to cache that. It might be supported in
// future versions.
// If the status code was 999, the cache file should be empty
// because we do not want to cache unknown status codes
let buf = fs::read(&cache_file).unwrap();
assert!(buf.is_empty());
if !buf.is_empty() {
let data = String::from_utf8(buf)?;
// The cache file should not contain any invalid status codes
// In that case, we expect a single entry with status code 200
assert!(!data.contains("999"));
assert!(data.contains("200"));
}
// clear the cache file
fs::remove_file(&cache_file)?;
@ -1309,4 +1321,30 @@ mod cli {
Ok(())
}
/// Checks that `--cookie-jar` writes the cookies received during a run to
/// the given file.
///
/// The target is the real `https://google.com`, so this test needs network
/// access.
#[tokio::test]
async fn test_cookie_jar() -> Result<()> {
    // Create a random cookie jar file
    let cookie_jar = NamedTempFile::new()?;

    let mut cmd = main_command();
    cmd.arg("--cookie-jar")
        // Hand the path over as-is instead of going through
        // `to_str().unwrap()`, which panics on non-UTF-8 temp paths
        .arg(cookie_jar.path())
        .arg("-")
        // Using Google as a test target because I couldn't
        // get the mock server to work with the cookie jar
        .write_stdin("https://google.com")
        .assert()
        .success();

    // check that the cookie jar file contains the expected cookies
    let file = std::fs::File::open(cookie_jar.path()).map(std::io::BufReader::new)?;
    let cookie_store = reqwest_cookie_store::CookieStore::load_json(file).unwrap();
    let all_cookies = cookie_store.iter_any().collect::<Vec<_>>();

    assert!(!all_cookies.is_empty());
    assert!(all_cookies.iter().all(|c| c.domain() == Some("google.com")));

    Ok(())
}
}

View file

@ -41,7 +41,8 @@ pulldown-cmark = "0.9.3"
regex = "1.9.1"
# Use trust-dns to avoid lookup failures on high concurrency
# https://github.com/seanmonstar/reqwest/issues/296
reqwest = { version = "0.11.18", default-features = false, features = ["gzip", "trust-dns"] }
reqwest = { version = "0.11.18", features = ["gzip", "trust-dns", "cookies"] }
reqwest_cookie_store = "0.5.0"
# Make build work on Apple Silicon.
# See https://github.com/briansmith/ring/issues/1163
# This is necessary for the homebrew build

View file

@ -13,7 +13,7 @@
clippy::default_trait_access,
clippy::used_underscore_binding
)]
use std::{collections::HashSet, time::Duration};
use std::{collections::HashSet, sync::Arc, time::Duration};
#[cfg(all(feature = "email-check", feature = "native-tls"))]
use check_if_email_exists::{check_email, CheckEmailInput, Reachable};
@ -26,6 +26,7 @@ use log::debug;
use octocrab::Octocrab;
use regex::RegexSet;
use reqwest::{header, redirect, Url};
use reqwest_cookie_store::CookieStoreMutex;
use secrecy::{ExposeSecret, SecretString};
use typed_builder::TypedBuilder;
@ -264,6 +265,11 @@ pub struct ClientBuilder {
/// It has no effect on non-HTTP schemes or if the URL doesn't support
/// HTTPS.
require_https: bool,
/// Cookie store used for requests.
///
/// See https://docs.rs/reqwest/latest/reqwest/struct.ClientBuilder.html#method.cookie_store
cookie_jar: Option<Arc<CookieStoreMutex>>,
}
impl Default for ClientBuilder {
@ -321,7 +327,7 @@ impl ClientBuilder {
}
});
let builder = reqwest::ClientBuilder::new()
let mut builder = reqwest::ClientBuilder::new()
.gzip(true)
.default_headers(headers)
.danger_accept_invalid_certs(self.allow_insecure)
@ -329,10 +335,14 @@ impl ClientBuilder {
.tcp_keepalive(Duration::from_secs(TCP_KEEPALIVE))
.redirect(redirect_policy);
let reqwest_client = (match self.timeout {
if let Some(cookie_jar) = self.cookie_jar {
builder = builder.cookie_provider(cookie_jar);
}
let reqwest_client = match self.timeout {
Some(t) => builder.timeout(t),
None => builder,
})
}
.build()
.map_err(ErrorKind::NetworkRequest)?;
@ -477,7 +487,6 @@ impl Client {
/// Returns an `Err` if the final, remapped `uri` is not a valid URI.
pub fn remap(&self, uri: &mut Uri) -> Result<()> {
if let Some(ref remaps) = self.remaps {
debug!("Remapping URI: {}", uri.url);
uri.url = remaps.remap(&uri.url)?;
}
Ok(())

View file

@ -91,8 +91,8 @@ pub use crate::{
collector::Collector,
filter::{Excludes, Filter, Includes},
types::{
uri::valid::Uri, Base, BasicAuthCredentials, BasicAuthSelector, CacheStatus, ErrorKind,
FileType, Input, InputContent, InputSource, Request, Response, ResponseBody, Result,
Status,
uri::valid::Uri, Base, BasicAuthCredentials, BasicAuthSelector, CacheStatus, CookieJar,
ErrorKind, FileType, Input, InputContent, InputSource, Request, Response, ResponseBody,
Result, Status,
},
};

View file

@ -0,0 +1,84 @@
use std::io::ErrorKind as IoErrorKind;
use std::{path::PathBuf, sync::Arc};
use crate::{ErrorKind, Result};
use log::info;
use reqwest_cookie_store::{CookieStore as ReqwestCookieStore, CookieStoreMutex};
/// A cookie store together with the path of the file it is persisted to.
///
/// The store itself is a shared `Arc<CookieStoreMutex>` from
/// `reqwest_cookie_store`, so cloning a `CookieJar` yields another handle to
/// the same underlying store rather than a copy of the cookies.
///
/// We keep track of the file path of the cookie store and
/// implement `PartialEq` to compare cookie jars by their path
#[derive(Debug, Clone)]
pub struct CookieJar {
// File the cookies are loaded from and saved to
pub(crate) path: PathBuf,
// Shared, lockable cookie store
pub(crate) inner: Arc<CookieStoreMutex>,
}
impl CookieJar {
    /// Load a cookie store from a file
    ///
    /// Currently only JSON files are supported.
    ///
    /// If the file does not exist yet, an empty cookie store bound to `path`
    /// is returned, so that a later call to [`CookieJar::save`] creates it.
    ///
    /// # Errors
    ///
    /// This function will return an error if
    /// - the file exists but cannot be opened (e.g. permission denied) or
    /// - the file contents cannot be loaded as a cookie store
    pub fn load(path: PathBuf) -> Result<Self> {
        match std::fs::File::open(&path).map(std::io::BufReader::new) {
            Ok(reader) => {
                info!("Loading cookies from {}", path.display());
                let inner = Arc::new(CookieStoreMutex::new(
                    ReqwestCookieStore::load_json(reader)
                        .map_err(|e| ErrorKind::Cookies(format!("Failed to load cookies: {e}")))?,
                ));
                Ok(Self { path, inner })
            }
            // Create a new cookie store if the file does not exist
            Err(e) if e.kind() == IoErrorKind::NotFound => Ok(Self {
                path,
                inner: Arc::new(CookieStoreMutex::new(ReqwestCookieStore::default())),
            }),
            // Propagate other IO errors (like permission denied) to the caller
            Err(e) => Err(e.into()),
        }
    }

    /// Save the cookie store to file as JSON
    /// This will overwrite the file, which was loaded if any
    ///
    /// # Errors
    ///
    /// This function will return an error if
    /// - the cookie store is locked or
    /// - the file cannot be opened or
    /// - if the file cannot be written to or
    /// - if the file cannot be serialized to JSON
    pub fn save(&self) -> Result<()> {
        // Take the lock *before* creating the file: `File::create` truncates
        // an existing file, so a failed lock must not wipe the stored cookies.
        let store = self
            .inner
            .lock()
            .map_err(|e| ErrorKind::Cookies(format!("Failed to lock cookie store: {e}")))?;

        // Buffer the writes instead of writing straight to the file.
        let mut writer = std::io::BufWriter::new(std::fs::File::create(&self.path)?);
        store
            .save_json(&mut writer)
            .map_err(|e| ErrorKind::Cookies(format!("Failed to save cookies: {e}")))?;

        // Flush explicitly; dropping a `BufWriter` silently discards write errors.
        std::io::Write::flush(&mut writer)?;
        Ok(())
    }
}
// Expose the shared inner store, so a `CookieJar` can be used where an
// `Arc<CookieStoreMutex>` is expected
impl std::ops::Deref for CookieJar {
    type Target = Arc<CookieStoreMutex>;

    fn deref(&self) -> &Self::Target {
        let Self { inner, .. } = self;
        inner
    }
}
impl PartialEq for CookieJar {
    fn eq(&self, other: &Self) -> bool {
        // Two jars count as equal when they point at the same file.
        // The stores themselves are not compared, because the
        // `CookieStore` struct does not implement `Eq`
        let (ours, theirs) = (&self.path, &other.path);
        ours == theirs
    }
}

View file

@ -132,6 +132,9 @@ pub enum ErrorKind {
/// Basic auth extractor error
#[error("Basic auth extractor error")]
BasicAuthExtractorError(#[from] BasicAuthExtractorError),
/// Cannot load cookies
#[error("Cannot load cookies")]
Cookies(String),
}
impl ErrorKind {
@ -267,6 +270,7 @@ impl Hash for ErrorKind {
Self::Regex(e) => e.to_string().hash(state),
Self::TooManyRedirects(e) => e.to_string().hash(state),
Self::BasicAuthExtractorError(e) => e.to_string().hash(state),
Self::Cookies(e) => e.to_string().hash(state),
}
}
}

View file

@ -3,6 +3,7 @@
mod base;
mod basic_auth;
mod cache;
mod cookies;
mod error;
mod file;
mod input;
@ -15,6 +16,7 @@ pub(crate) mod uri;
pub use base::Base;
pub use basic_auth::{BasicAuthCredentials, BasicAuthSelector};
pub use cache::CacheStatus;
pub use cookies::CookieJar;
pub use error::ErrorKind;
pub use file::FileType;
pub use input::{Input, InputContent, InputSource};