mirror of
https://github.com/Hopiu/lychee.git
synced 2026-05-26 22:23:58 +00:00
Split up code into modules
This commit is contained in:
parent
7c51a24c44
commit
bc615c9bfb
3 changed files with 115 additions and 103 deletions
87
src/checker.rs
Normal file
87
src/checker.rs
Normal file
|
|
@ -0,0 +1,87 @@
|
||||||
|
use anyhow::{Context, Result};
|
||||||
|
use github_rs::client::{Executor, Github};
|
||||||
|
use github_rs::StatusCode;
|
||||||
|
use regex::Regex;
|
||||||
|
use reqwest::header::{self, HeaderValue};
|
||||||
|
use serde_json::Value;
|
||||||
|
use url::Url;
|
||||||
|
|
||||||
|
/// A link checker using an API token for Github links
|
||||||
|
/// otherwise a normal HTTP client.
|
||||||
|
pub(crate) struct Checker {
|
||||||
|
reqwest_client: reqwest::blocking::Client,
|
||||||
|
gh_client: Github,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Checker {
|
||||||
|
/// Creates a new link checker
|
||||||
|
pub fn try_new(token: String) -> Result<Self> {
|
||||||
|
let mut headers = header::HeaderMap::new();
|
||||||
|
// Faking the user agent is necessary for some websites, unfortunately.
|
||||||
|
// Otherwise we get a 403 from the firewall (e.g. Sucuri/Cloudproxy on ldra.com).
|
||||||
|
headers.insert(header::USER_AGENT, HeaderValue::from_str("curl/7.71.1")?);
|
||||||
|
headers.insert(header::TRANSFER_ENCODING, HeaderValue::from_str("chunked")?);
|
||||||
|
|
||||||
|
let reqwest_client = reqwest::blocking::ClientBuilder::new()
|
||||||
|
.gzip(true)
|
||||||
|
.default_headers(headers)
|
||||||
|
.build()?;
|
||||||
|
|
||||||
|
let gh_client = Github::new(token).unwrap();
|
||||||
|
Ok(Checker {
|
||||||
|
reqwest_client,
|
||||||
|
gh_client,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
fn check_github(&self, owner: String, repo: String) -> bool {
|
||||||
|
let (_headers, status, _json) = self
|
||||||
|
.gh_client
|
||||||
|
.get()
|
||||||
|
.repos()
|
||||||
|
.owner(&owner)
|
||||||
|
.repo(&repo)
|
||||||
|
.execute::<Value>()
|
||||||
|
.expect("Get failed");
|
||||||
|
status == StatusCode::OK
|
||||||
|
}
|
||||||
|
|
||||||
|
fn check_normal(&self, url: &Url) -> bool {
|
||||||
|
let res = self.reqwest_client.get(url.as_str()).send();
|
||||||
|
if res.is_err() {
|
||||||
|
warn!("Cannot send request: {:?}", res);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
if let Ok(res) = res {
|
||||||
|
if res.status().is_success() {
|
||||||
|
true
|
||||||
|
} else {
|
||||||
|
warn!("Request with non-ok status code: {:?}", res);
|
||||||
|
false
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
warn!("Invalid response: {:?}", res);
|
||||||
|
false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn extract_github(&self, url: &str) -> Result<(String, String)> {
|
||||||
|
let re = Regex::new(r"github\.com/([^/]*)/([^/]*)")?;
|
||||||
|
let caps = re.captures(&url).context("Invalid capture")?;
|
||||||
|
let owner = caps.get(1).context("Cannot capture owner")?;
|
||||||
|
let repo = caps.get(2).context("Cannot capture repo")?;
|
||||||
|
Ok((owner.as_str().into(), repo.as_str().into()))
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn check(&self, url: &Url) -> bool {
|
||||||
|
if self.check_normal(&url) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
// Pull out the heavy weapons in case of a failed normal request.
|
||||||
|
// This could be a Github URL and we run into the rate limiter.
|
||||||
|
if let Ok((owner, repo)) = self.extract_github(url.as_str()) {
|
||||||
|
return self.check_github(owner, repo);
|
||||||
|
}
|
||||||
|
false
|
||||||
|
}
|
||||||
|
}
|
||||||
20
src/extract.rs
Normal file
20
src/extract.rs
Normal file
|
|
@ -0,0 +1,20 @@
|
||||||
|
use pulldown_cmark::{Event, Parser, Tag};
|
||||||
|
use std::collections::HashSet;
|
||||||
|
use url::Url;
|
||||||
|
|
||||||
|
pub(crate) fn extract_links(md: &str) -> HashSet<Url> {
|
||||||
|
let mut links: Vec<String> = Vec::new();
|
||||||
|
Parser::new(md).for_each(|event| match event {
|
||||||
|
Event::Start(Tag::Link(_, link, _)) => links.push(link.into_string()),
|
||||||
|
Event::Start(Tag::Image(_, link, _)) => links.push(link.into_string()),
|
||||||
|
_ => (),
|
||||||
|
});
|
||||||
|
|
||||||
|
// Only keep legit URLs. This sorts out things like anchors.
|
||||||
|
// Silently ignore the parse failures for now.
|
||||||
|
// TODO: Log errors in verbose mode
|
||||||
|
let links: HashSet<Url> = links.iter().flat_map(|l| Url::parse(&l)).collect();
|
||||||
|
debug!("Testing links: {:#?}", links);
|
||||||
|
|
||||||
|
links
|
||||||
|
}
|
||||||
111
src/main.rs
111
src/main.rs
|
|
@ -1,111 +1,16 @@
|
||||||
#[macro_use]
|
#[macro_use]
|
||||||
extern crate log;
|
extern crate log;
|
||||||
|
|
||||||
use anyhow::{Context, Result};
|
mod checker;
|
||||||
use github_rs::client::{Executor, Github};
|
mod extract;
|
||||||
use github_rs::StatusCode;
|
|
||||||
use pulldown_cmark::{Event, Parser, Tag};
|
use checker::Checker;
|
||||||
use regex::Regex;
|
use extract::extract_links;
|
||||||
use reqwest::header::{self, HeaderValue};
|
|
||||||
use serde_json::Value;
|
use anyhow::Result;
|
||||||
use std::env;
|
use std::env;
|
||||||
use std::{collections::HashSet, fs};
|
|
||||||
use url::Url;
|
|
||||||
|
|
||||||
struct Checker {
|
use std::fs;
|
||||||
reqwest_client: reqwest::blocking::Client,
|
|
||||||
gh_client: Github,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Checker {
|
|
||||||
/// Creates a new link checker
|
|
||||||
pub fn try_new(token: String) -> Result<Self> {
|
|
||||||
let mut headers = header::HeaderMap::new();
|
|
||||||
// Faking the user agent is necessary for some websites, unfortunately.
|
|
||||||
// Otherwise we get a 403 from the firewall (e.g. Sucuri/Cloudproxy on ldra.com).
|
|
||||||
headers.insert(header::USER_AGENT, HeaderValue::from_str("curl/7.71.1")?);
|
|
||||||
headers.insert(header::TRANSFER_ENCODING, HeaderValue::from_str("chunked")?);
|
|
||||||
|
|
||||||
let reqwest_client = reqwest::blocking::ClientBuilder::new()
|
|
||||||
.gzip(true)
|
|
||||||
.default_headers(headers)
|
|
||||||
.build()?;
|
|
||||||
|
|
||||||
let gh_client = Github::new(token).unwrap();
|
|
||||||
Ok(Checker {
|
|
||||||
reqwest_client,
|
|
||||||
gh_client,
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
fn check_github(&self, owner: String, repo: String) -> bool {
|
|
||||||
let (_headers, status, _json) = self
|
|
||||||
.gh_client
|
|
||||||
.get()
|
|
||||||
.repos()
|
|
||||||
.owner(&owner)
|
|
||||||
.repo(&repo)
|
|
||||||
.execute::<Value>()
|
|
||||||
.expect("Get failed");
|
|
||||||
status == StatusCode::OK
|
|
||||||
}
|
|
||||||
|
|
||||||
fn check_normal(&self, url: &Url) -> bool {
|
|
||||||
let res = self.reqwest_client.get(url.as_str()).send();
|
|
||||||
if res.is_err() {
|
|
||||||
warn!("Cannot send request: {:?}", res);
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
if let Ok(res) = res {
|
|
||||||
if res.status().is_success() {
|
|
||||||
true
|
|
||||||
} else {
|
|
||||||
warn!("Request with non-ok status code: {:?}", res);
|
|
||||||
false
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
warn!("Invalid response: {:?}", res);
|
|
||||||
false
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn extract_github(&self, url: &str) -> Result<(String, String)> {
|
|
||||||
let re = Regex::new(r"github\.com/([^/]*)/([^/]*)")?;
|
|
||||||
let caps = re.captures(&url).context("Invalid capture")?;
|
|
||||||
let owner = caps.get(1).context("Cannot capture owner")?;
|
|
||||||
let repo = caps.get(2).context("Cannot capture repo")?;
|
|
||||||
Ok((owner.as_str().into(), repo.as_str().into()))
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn check(&self, url: &Url) -> bool {
|
|
||||||
if self.check_normal(&url) {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
// Pull out the heavy weapons in case of a failed normal request.
|
|
||||||
// This could be a Github URL and we run into the rate limiter.
|
|
||||||
if let Ok((owner, repo)) = self.extract_github(url.as_str()) {
|
|
||||||
return self.check_github(owner, repo);
|
|
||||||
}
|
|
||||||
false
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn extract_links(md: &str) -> HashSet<Url> {
|
|
||||||
let mut links: Vec<String> = Vec::new();
|
|
||||||
Parser::new(md).for_each(|event| match event {
|
|
||||||
Event::Start(Tag::Link(_, link, _)) => links.push(link.into_string()),
|
|
||||||
Event::Start(Tag::Image(_, link, _)) => links.push(link.into_string()),
|
|
||||||
_ => (),
|
|
||||||
});
|
|
||||||
|
|
||||||
// Only keep legit URLs. This sorts out things like anchors.
|
|
||||||
// Silently ignore the parse failures for now.
|
|
||||||
// TODO: Log errors in verbose mode
|
|
||||||
let links: HashSet<Url> = links.iter().flat_map(|l| Url::parse(&l)).collect();
|
|
||||||
debug!("Testing links: {:#?}", links);
|
|
||||||
|
|
||||||
links
|
|
||||||
}
|
|
||||||
|
|
||||||
struct Args {
|
struct Args {
|
||||||
verbose: bool,
|
verbose: bool,
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue