lychee/examples/collect_links/collect_links.rs
Markus Unterwaditzer 68d09f7e5b
Add html5gum as alternative link extractor (#480)
html5gum is an HTML parser that offers lower-level control over which tokens actually get created and are tracked. As such, the extractor doesn't allocate any tokens it doesn't care about. On some benchmarks it provides a substantial performance boost. The old parser, html5ever, is still available by setting the `LYCHEE_USE_HTML5EVER=1` env var.
2022-02-07 22:54:47 +01:00

34 lines
995 B
Rust

use lychee_lib::{Collector, Input, InputSource, Result};
use reqwest::Url;
use std::path::PathBuf;
use tokio_stream::StreamExt;
/// Example: collect every link found in a remote web page and a local file.
///
/// Two inputs are built (the lychee GitHub page as a remote URL and
/// `fixtures/TEST.md` as a filesystem path), handed to a [`Collector`], and the
/// resulting stream is gathered into a `Vec` that is printed with `dbg!`.
///
/// # Errors
///
/// Returns the error if any item yielded by the link stream is an `Err`.
///
/// # Panics
///
/// Panics only if the hard-coded URL literal fails to parse.
#[tokio::main]
async fn main() -> Result<()> {
    // Collect all links from the following inputs
    let inputs = vec![
        Input {
            source: InputSource::RemoteUrl(Box::new(
                Url::parse("https://github.com/lycheeverse/lychee")
                    .expect("hard-coded URL literal must parse"),
            )),
            file_type_hint: None,
        },
        Input {
            source: InputSource::FsPath(PathBuf::from("fixtures/TEST.md")),
            file_type_hint: None,
        },
    ];

    let links = Collector::new(None) // base url or directory (none given here)
        .skip_missing_inputs(false) // don't skip missing inputs? (default=false)
        .use_html5ever(false) // use html5ever for parsing? (default=false)
        .collect_links(inputs) // the inputs to extract links from
        .await
        .collect::<Result<Vec<_>>>()
        .await?;

    // Dump the collected links (with file/line info) to stderr.
    dbg!(links);
    Ok(())
}