diff --git a/src/checker.rs b/src/checker.rs new file mode 100644 index 0000000..dd9b032 --- /dev/null +++ b/src/checker.rs @@ -0,0 +1,87 @@ +use anyhow::{Context, Result}; +use github_rs::client::{Executor, Github}; +use github_rs::StatusCode; +use regex::Regex; +use reqwest::header::{self, HeaderValue}; +use serde_json::Value; +use url::Url; + +/// A link checker using an API token for Github links +/// otherwise a normal HTTP client. +pub(crate) struct Checker { + reqwest_client: reqwest::blocking::Client, + gh_client: Github, +} + +impl Checker { + /// Creates a new link checker + pub fn try_new(token: String) -> Result { + let mut headers = header::HeaderMap::new(); + // Faking the user agent is necessary for some websites, unfortunately. + // Otherwise we get a 403 from the firewall (e.g. Sucuri/Cloudproxy on ldra.com). + headers.insert(header::USER_AGENT, HeaderValue::from_str("curl/7.71.1")?); + headers.insert(header::TRANSFER_ENCODING, HeaderValue::from_str("chunked")?); + + let reqwest_client = reqwest::blocking::ClientBuilder::new() + .gzip(true) + .default_headers(headers) + .build()?; + + let gh_client = Github::new(token).unwrap(); + Ok(Checker { + reqwest_client, + gh_client, + }) + } + + fn check_github(&self, owner: String, repo: String) -> bool { + let (_headers, status, _json) = self + .gh_client + .get() + .repos() + .owner(&owner) + .repo(&repo) + .execute::() + .expect("Get failed"); + status == StatusCode::OK + } + + fn check_normal(&self, url: &Url) -> bool { + let res = self.reqwest_client.get(url.as_str()).send(); + if res.is_err() { + warn!("Cannot send request: {:?}", res); + return false; + } + if let Ok(res) = res { + if res.status().is_success() { + true + } else { + warn!("Request with non-ok status code: {:?}", res); + false + } + } else { + warn!("Invalid response: {:?}", res); + false + } + } + + fn extract_github(&self, url: &str) -> Result<(String, String)> { + let re = Regex::new(r"github\.com/([^/]*)/([^/]*)")?; + let caps = re.captures(&url).context("Invalid capture")?; + let owner = caps.get(1).context("Cannot capture owner")?; + let repo = caps.get(2).context("Cannot capture repo")?; + Ok((owner.as_str().into(), repo.as_str().into())) + } + + pub fn check(&self, url: &Url) -> bool { + if self.check_normal(&url) { + return true; + } + // Pull out the heavy weapons in case of a failed normal request. + // This could be a Github URL and we run into the rate limiter. + if let Ok((owner, repo)) = self.extract_github(url.as_str()) { + return self.check_github(owner, repo); + } + false + } +} diff --git a/src/extract.rs b/src/extract.rs new file mode 100644 index 0000000..b389728 --- /dev/null +++ b/src/extract.rs @@ -0,0 +1,20 @@ +use pulldown_cmark::{Event, Parser, Tag}; +use std::collections::HashSet; +use url::Url; + +pub(crate) fn extract_links(md: &str) -> HashSet { + let mut links: Vec = Vec::new(); + Parser::new(md).for_each(|event| match event { + Event::Start(Tag::Link(_, link, _)) => links.push(link.into_string()), + Event::Start(Tag::Image(_, link, _)) => links.push(link.into_string()), + _ => (), + }); + + // Only keep legit URLs. This sorts out things like anchors. + // Silently ignore the parse failures for now. + // TODO: Log errors in verbose mode + let links: HashSet = links.iter().flat_map(|l| Url::parse(&l)).collect(); + debug!("Testing links: {:#?}", links); + + links +} diff --git a/src/main.rs b/src/main.rs index e419b4c..22dfd69 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,111 +1,16 @@ #[macro_use] extern crate log; -use anyhow::{Context, Result}; -use github_rs::client::{Executor, Github}; -use github_rs::StatusCode; -use pulldown_cmark::{Event, Parser, Tag}; -use regex::Regex; -use reqwest::header::{self, HeaderValue}; -use serde_json::Value; +mod checker; +mod extract; + +use checker::Checker; +use extract::extract_links; + +use anyhow::Result; use std::env; -use std::{collections::HashSet, fs}; -use url::Url; -struct Checker { - reqwest_client: reqwest::blocking::Client, - gh_client: Github, -} - -impl Checker { - /// Creates a new link checker - pub fn try_new(token: String) -> Result { - let mut headers = header::HeaderMap::new(); - // Faking the user agent is necessary for some websites, unfortunately. - // Otherwise we get a 403 from the firewall (e.g. Sucuri/Cloudproxy on ldra.com). - headers.insert(header::USER_AGENT, HeaderValue::from_str("curl/7.71.1")?); - headers.insert(header::TRANSFER_ENCODING, HeaderValue::from_str("chunked")?); - - let reqwest_client = reqwest::blocking::ClientBuilder::new() - .gzip(true) - .default_headers(headers) - .build()?; - - let gh_client = Github::new(token).unwrap(); - Ok(Checker { - reqwest_client, - gh_client, - }) - } - - fn check_github(&self, owner: String, repo: String) -> bool { - let (_headers, status, _json) = self - .gh_client - .get() - .repos() - .owner(&owner) - .repo(&repo) - .execute::() - .expect("Get failed"); - status == StatusCode::OK - } - - fn check_normal(&self, url: &Url) -> bool { - let res = self.reqwest_client.get(url.as_str()).send(); - if res.is_err() { - warn!("Cannot send request: {:?}", res); - return false; - } - if let Ok(res) = res { - if res.status().is_success() { - true - } else { - warn!("Request with non-ok status code: {:?}", res); - false - } - } else { - warn!("Invalid response: {:?}", res); - false - } - } - - fn extract_github(&self, url: &str) -> Result<(String, String)> { - let re = Regex::new(r"github\.com/([^/]*)/([^/]*)")?; - let caps = re.captures(&url).context("Invalid capture")?; - let owner = caps.get(1).context("Cannot capture owner")?; - let repo = caps.get(2).context("Cannot capture repo")?; - Ok((owner.as_str().into(), repo.as_str().into())) - } - - pub fn check(&self, url: &Url) -> bool { - if self.check_normal(&url) { - return true; - } - // Pull out the heavy weapons in case of a failed normal request. - // This could be a Github URL and we run into the rate limiter. - if let Ok((owner, repo)) = self.extract_github(url.as_str()) { - return self.check_github(owner, repo); - } - false - } -} - -fn extract_links(md: &str) -> HashSet { - let mut links: Vec = Vec::new(); - Parser::new(md).for_each(|event| match event { - Event::Start(Tag::Link(_, link, _)) => links.push(link.into_string()), - Event::Start(Tag::Image(_, link, _)) => links.push(link.into_string()), - _ => (), - }); - - // Only keep legit URLs. This sorts out things like anchors. - // Silently ignore the parse failures for now. - // TODO: Log errors in verbose mode - let links: HashSet = links.iter().flat_map(|l| Url::parse(&l)).collect(); - debug!("Testing links: {:#?}", links); - - links -} +use std::fs; struct Args { verbose: bool,