mirror of
https://github.com/Hopiu/lychee.git
synced 2026-03-16 20:50:25 +00:00
Split up code into modules
This commit is contained in:
parent
7c51a24c44
commit
bc615c9bfb
3 changed files with 115 additions and 103 deletions
87
src/checker.rs
Normal file
87
src/checker.rs
Normal file
|
|
@@ -0,0 +1,87 @@
|
|||
use anyhow::{Context, Result};
|
||||
use github_rs::client::{Executor, Github};
|
||||
use github_rs::StatusCode;
|
||||
use regex::Regex;
|
||||
use reqwest::header::{self, HeaderValue};
|
||||
use serde_json::Value;
|
||||
use url::Url;
|
||||
|
||||
/// A link checker using an API token for Github links
|
||||
/// otherwise a normal HTTP client.
|
||||
pub(crate) struct Checker {
|
||||
reqwest_client: reqwest::blocking::Client,
|
||||
gh_client: Github,
|
||||
}
|
||||
|
||||
impl Checker {
|
||||
/// Creates a new link checker
|
||||
pub fn try_new(token: String) -> Result<Self> {
|
||||
let mut headers = header::HeaderMap::new();
|
||||
// Faking the user agent is necessary for some websites, unfortunately.
|
||||
// Otherwise we get a 403 from the firewall (e.g. Sucuri/Cloudproxy on ldra.com).
|
||||
headers.insert(header::USER_AGENT, HeaderValue::from_str("curl/7.71.1")?);
|
||||
headers.insert(header::TRANSFER_ENCODING, HeaderValue::from_str("chunked")?);
|
||||
|
||||
let reqwest_client = reqwest::blocking::ClientBuilder::new()
|
||||
.gzip(true)
|
||||
.default_headers(headers)
|
||||
.build()?;
|
||||
|
||||
let gh_client = Github::new(token).unwrap();
|
||||
Ok(Checker {
|
||||
reqwest_client,
|
||||
gh_client,
|
||||
})
|
||||
}
|
||||
|
||||
fn check_github(&self, owner: String, repo: String) -> bool {
|
||||
let (_headers, status, _json) = self
|
||||
.gh_client
|
||||
.get()
|
||||
.repos()
|
||||
.owner(&owner)
|
||||
.repo(&repo)
|
||||
.execute::<Value>()
|
||||
.expect("Get failed");
|
||||
status == StatusCode::OK
|
||||
}
|
||||
|
||||
fn check_normal(&self, url: &Url) -> bool {
|
||||
let res = self.reqwest_client.get(url.as_str()).send();
|
||||
if res.is_err() {
|
||||
warn!("Cannot send request: {:?}", res);
|
||||
return false;
|
||||
}
|
||||
if let Ok(res) = res {
|
||||
if res.status().is_success() {
|
||||
true
|
||||
} else {
|
||||
warn!("Request with non-ok status code: {:?}", res);
|
||||
false
|
||||
}
|
||||
} else {
|
||||
warn!("Invalid response: {:?}", res);
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
fn extract_github(&self, url: &str) -> Result<(String, String)> {
|
||||
let re = Regex::new(r"github\.com/([^/]*)/([^/]*)")?;
|
||||
let caps = re.captures(&url).context("Invalid capture")?;
|
||||
let owner = caps.get(1).context("Cannot capture owner")?;
|
||||
let repo = caps.get(2).context("Cannot capture repo")?;
|
||||
Ok((owner.as_str().into(), repo.as_str().into()))
|
||||
}
|
||||
|
||||
pub fn check(&self, url: &Url) -> bool {
|
||||
if self.check_normal(&url) {
|
||||
return true;
|
||||
}
|
||||
// Pull out the heavy weapons in case of a failed normal request.
|
||||
// This could be a Github URL and we run into the rate limiter.
|
||||
if let Ok((owner, repo)) = self.extract_github(url.as_str()) {
|
||||
return self.check_github(owner, repo);
|
||||
}
|
||||
false
|
||||
}
|
||||
}
|
||||
20
src/extract.rs
Normal file
20
src/extract.rs
Normal file
|
|
@@ -0,0 +1,20 @@
|
|||
use pulldown_cmark::{Event, Parser, Tag};
|
||||
use std::collections::HashSet;
|
||||
use url::Url;
|
||||
|
||||
pub(crate) fn extract_links(md: &str) -> HashSet<Url> {
|
||||
let mut links: Vec<String> = Vec::new();
|
||||
Parser::new(md).for_each(|event| match event {
|
||||
Event::Start(Tag::Link(_, link, _)) => links.push(link.into_string()),
|
||||
Event::Start(Tag::Image(_, link, _)) => links.push(link.into_string()),
|
||||
_ => (),
|
||||
});
|
||||
|
||||
// Only keep legit URLs. This sorts out things like anchors.
|
||||
// Silently ignore the parse failures for now.
|
||||
// TODO: Log errors in verbose mode
|
||||
let links: HashSet<Url> = links.iter().flat_map(|l| Url::parse(&l)).collect();
|
||||
debug!("Testing links: {:#?}", links);
|
||||
|
||||
links
|
||||
}
|
||||
111
src/main.rs
111
src/main.rs
|
|
@@ -1,111 +1,16 @@
|
|||
#[macro_use]
|
||||
extern crate log;
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use github_rs::client::{Executor, Github};
|
||||
use github_rs::StatusCode;
|
||||
use pulldown_cmark::{Event, Parser, Tag};
|
||||
use regex::Regex;
|
||||
use reqwest::header::{self, HeaderValue};
|
||||
use serde_json::Value;
|
||||
mod checker;
|
||||
mod extract;
|
||||
|
||||
use checker::Checker;
|
||||
use extract::extract_links;
|
||||
|
||||
use anyhow::Result;
|
||||
use std::env;
|
||||
use std::{collections::HashSet, fs};
|
||||
use url::Url;
|
||||
|
||||
struct Checker {
|
||||
reqwest_client: reqwest::blocking::Client,
|
||||
gh_client: Github,
|
||||
}
|
||||
|
||||
impl Checker {
|
||||
/// Creates a new link checker
|
||||
pub fn try_new(token: String) -> Result<Self> {
|
||||
let mut headers = header::HeaderMap::new();
|
||||
// Faking the user agent is necessary for some websites, unfortunately.
|
||||
// Otherwise we get a 403 from the firewall (e.g. Sucuri/Cloudproxy on ldra.com).
|
||||
headers.insert(header::USER_AGENT, HeaderValue::from_str("curl/7.71.1")?);
|
||||
headers.insert(header::TRANSFER_ENCODING, HeaderValue::from_str("chunked")?);
|
||||
|
||||
let reqwest_client = reqwest::blocking::ClientBuilder::new()
|
||||
.gzip(true)
|
||||
.default_headers(headers)
|
||||
.build()?;
|
||||
|
||||
let gh_client = Github::new(token).unwrap();
|
||||
Ok(Checker {
|
||||
reqwest_client,
|
||||
gh_client,
|
||||
})
|
||||
}
|
||||
|
||||
fn check_github(&self, owner: String, repo: String) -> bool {
|
||||
let (_headers, status, _json) = self
|
||||
.gh_client
|
||||
.get()
|
||||
.repos()
|
||||
.owner(&owner)
|
||||
.repo(&repo)
|
||||
.execute::<Value>()
|
||||
.expect("Get failed");
|
||||
status == StatusCode::OK
|
||||
}
|
||||
|
||||
fn check_normal(&self, url: &Url) -> bool {
|
||||
let res = self.reqwest_client.get(url.as_str()).send();
|
||||
if res.is_err() {
|
||||
warn!("Cannot send request: {:?}", res);
|
||||
return false;
|
||||
}
|
||||
if let Ok(res) = res {
|
||||
if res.status().is_success() {
|
||||
true
|
||||
} else {
|
||||
warn!("Request with non-ok status code: {:?}", res);
|
||||
false
|
||||
}
|
||||
} else {
|
||||
warn!("Invalid response: {:?}", res);
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
fn extract_github(&self, url: &str) -> Result<(String, String)> {
|
||||
let re = Regex::new(r"github\.com/([^/]*)/([^/]*)")?;
|
||||
let caps = re.captures(&url).context("Invalid capture")?;
|
||||
let owner = caps.get(1).context("Cannot capture owner")?;
|
||||
let repo = caps.get(2).context("Cannot capture repo")?;
|
||||
Ok((owner.as_str().into(), repo.as_str().into()))
|
||||
}
|
||||
|
||||
pub fn check(&self, url: &Url) -> bool {
|
||||
if self.check_normal(&url) {
|
||||
return true;
|
||||
}
|
||||
// Pull out the heavy weapons in case of a failed normal request.
|
||||
// This could be a Github URL and we run into the rate limiter.
|
||||
if let Ok((owner, repo)) = self.extract_github(url.as_str()) {
|
||||
return self.check_github(owner, repo);
|
||||
}
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
fn extract_links(md: &str) -> HashSet<Url> {
|
||||
let mut links: Vec<String> = Vec::new();
|
||||
Parser::new(md).for_each(|event| match event {
|
||||
Event::Start(Tag::Link(_, link, _)) => links.push(link.into_string()),
|
||||
Event::Start(Tag::Image(_, link, _)) => links.push(link.into_string()),
|
||||
_ => (),
|
||||
});
|
||||
|
||||
// Only keep legit URLs. This sorts out things like anchors.
|
||||
// Silently ignore the parse failures for now.
|
||||
// TODO: Log errors in verbose mode
|
||||
let links: HashSet<Url> = links.iter().flat_map(|l| Url::parse(&l)).collect();
|
||||
debug!("Testing links: {:#?}", links);
|
||||
|
||||
links
|
||||
}
|
||||
use std::fs;
|
||||
|
||||
struct Args {
|
||||
verbose: bool,
|
||||
|
|
|
|||
Loading…
Reference in a new issue