Split up code into modules

This commit is contained in:
Matthias Endler 2020-08-09 22:47:39 +02:00
parent 7c51a24c44
commit bc615c9bfb
3 changed files with 115 additions and 103 deletions

87
src/checker.rs Normal file
View file

@ -0,0 +1,87 @@
use anyhow::{Context, Result};
use github_rs::client::{Executor, Github};
use github_rs::StatusCode;
use regex::Regex;
use reqwest::header::{self, HeaderValue};
use serde_json::Value;
use url::Url;
/// A link checker using an API token for Github links
/// otherwise a normal HTTP client.
pub(crate) struct Checker {
reqwest_client: reqwest::blocking::Client,
gh_client: Github,
}
impl Checker {
/// Creates a new link checker
pub fn try_new(token: String) -> Result<Self> {
let mut headers = header::HeaderMap::new();
// Faking the user agent is necessary for some websites, unfortunately.
// Otherwise we get a 403 from the firewall (e.g. Sucuri/Cloudproxy on ldra.com).
headers.insert(header::USER_AGENT, HeaderValue::from_str("curl/7.71.1")?);
headers.insert(header::TRANSFER_ENCODING, HeaderValue::from_str("chunked")?);
let reqwest_client = reqwest::blocking::ClientBuilder::new()
.gzip(true)
.default_headers(headers)
.build()?;
let gh_client = Github::new(token).unwrap();
Ok(Checker {
reqwest_client,
gh_client,
})
}
fn check_github(&self, owner: String, repo: String) -> bool {
let (_headers, status, _json) = self
.gh_client
.get()
.repos()
.owner(&owner)
.repo(&repo)
.execute::<Value>()
.expect("Get failed");
status == StatusCode::OK
}
fn check_normal(&self, url: &Url) -> bool {
let res = self.reqwest_client.get(url.as_str()).send();
if res.is_err() {
warn!("Cannot send request: {:?}", res);
return false;
}
if let Ok(res) = res {
if res.status().is_success() {
true
} else {
warn!("Request with non-ok status code: {:?}", res);
false
}
} else {
warn!("Invalid response: {:?}", res);
false
}
}
fn extract_github(&self, url: &str) -> Result<(String, String)> {
let re = Regex::new(r"github\.com/([^/]*)/([^/]*)")?;
let caps = re.captures(&url).context("Invalid capture")?;
let owner = caps.get(1).context("Cannot capture owner")?;
let repo = caps.get(2).context("Cannot capture repo")?;
Ok((owner.as_str().into(), repo.as_str().into()))
}
pub fn check(&self, url: &Url) -> bool {
if self.check_normal(&url) {
return true;
}
// Pull out the heavy weapons in case of a failed normal request.
// This could be a Github URL and we run into the rate limiter.
if let Ok((owner, repo)) = self.extract_github(url.as_str()) {
return self.check_github(owner, repo);
}
false
}
}

20
src/extract.rs Normal file
View file

@ -0,0 +1,20 @@
use pulldown_cmark::{Event, Parser, Tag};
use std::collections::HashSet;
use url::Url;
pub(crate) fn extract_links(md: &str) -> HashSet<Url> {
let mut links: Vec<String> = Vec::new();
Parser::new(md).for_each(|event| match event {
Event::Start(Tag::Link(_, link, _)) => links.push(link.into_string()),
Event::Start(Tag::Image(_, link, _)) => links.push(link.into_string()),
_ => (),
});
// Only keep legit URLs. This sorts out things like anchors.
// Silently ignore the parse failures for now.
// TODO: Log errors in verbose mode
let links: HashSet<Url> = links.iter().flat_map(|l| Url::parse(&l)).collect();
debug!("Testing links: {:#?}", links);
links
}

View file

@ -1,111 +1,16 @@
#[macro_use]
extern crate log;
use anyhow::{Context, Result};
use github_rs::client::{Executor, Github};
use github_rs::StatusCode;
use pulldown_cmark::{Event, Parser, Tag};
use regex::Regex;
use reqwest::header::{self, HeaderValue};
use serde_json::Value;
mod checker;
mod extract;
use checker::Checker;
use extract::extract_links;
use anyhow::Result;
use std::env;
use std::{collections::HashSet, fs};
use url::Url;
struct Checker {
reqwest_client: reqwest::blocking::Client,
gh_client: Github,
}
impl Checker {
/// Creates a new link checker
pub fn try_new(token: String) -> Result<Self> {
let mut headers = header::HeaderMap::new();
// Faking the user agent is necessary for some websites, unfortunately.
// Otherwise we get a 403 from the firewall (e.g. Sucuri/Cloudproxy on ldra.com).
headers.insert(header::USER_AGENT, HeaderValue::from_str("curl/7.71.1")?);
headers.insert(header::TRANSFER_ENCODING, HeaderValue::from_str("chunked")?);
let reqwest_client = reqwest::blocking::ClientBuilder::new()
.gzip(true)
.default_headers(headers)
.build()?;
let gh_client = Github::new(token).unwrap();
Ok(Checker {
reqwest_client,
gh_client,
})
}
fn check_github(&self, owner: String, repo: String) -> bool {
let (_headers, status, _json) = self
.gh_client
.get()
.repos()
.owner(&owner)
.repo(&repo)
.execute::<Value>()
.expect("Get failed");
status == StatusCode::OK
}
fn check_normal(&self, url: &Url) -> bool {
let res = self.reqwest_client.get(url.as_str()).send();
if res.is_err() {
warn!("Cannot send request: {:?}", res);
return false;
}
if let Ok(res) = res {
if res.status().is_success() {
true
} else {
warn!("Request with non-ok status code: {:?}", res);
false
}
} else {
warn!("Invalid response: {:?}", res);
false
}
}
fn extract_github(&self, url: &str) -> Result<(String, String)> {
let re = Regex::new(r"github\.com/([^/]*)/([^/]*)")?;
let caps = re.captures(&url).context("Invalid capture")?;
let owner = caps.get(1).context("Cannot capture owner")?;
let repo = caps.get(2).context("Cannot capture repo")?;
Ok((owner.as_str().into(), repo.as_str().into()))
}
pub fn check(&self, url: &Url) -> bool {
if self.check_normal(&url) {
return true;
}
// Pull out the heavy weapons in case of a failed normal request.
// This could be a Github URL and we run into the rate limiter.
if let Ok((owner, repo)) = self.extract_github(url.as_str()) {
return self.check_github(owner, repo);
}
false
}
}
fn extract_links(md: &str) -> HashSet<Url> {
let mut links: Vec<String> = Vec::new();
Parser::new(md).for_each(|event| match event {
Event::Start(Tag::Link(_, link, _)) => links.push(link.into_string()),
Event::Start(Tag::Image(_, link, _)) => links.push(link.into_string()),
_ => (),
});
// Only keep legit URLs. This sorts out things like anchors.
// Silently ignore the parse failures for now.
// TODO: Log errors in verbose mode
let links: HashSet<Url> = links.iter().flat_map(|l| Url::parse(&l)).collect();
debug!("Testing links: {:#?}", links);
links
}
use std::fs;
struct Args {
verbose: bool,