From 6e0f559b25c915e4961484e9f33f03fffa8c4138 Mon Sep 17 00:00:00 2001 From: Matthias Endler Date: Sun, 9 Aug 2020 23:12:25 +0200 Subject: [PATCH] Switch to linkify to cover non MD links --- Cargo.lock | 10 ++++++++++ Cargo.toml | 1 + src/extract.rs | 15 ++++++--------- 3 files changed, 17 insertions(+), 9 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index cce2f4a..7bf02e4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -717,6 +717,15 @@ version = "0.2.71" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9457b06509d27052635f90d6466700c65095fdf75409b3fbdd903e988b886f49" +[[package]] +name = "linkify" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "03951527dd24d2c59f407502e7d88e0948ef06fac23335b556a4c2bc03c22096" +dependencies = [ + "memchr", +] + [[package]] name = "lock_api" version = "0.3.4" @@ -753,6 +762,7 @@ dependencies = [ "any", "anyhow", "github-rs", + "linkify", "log", "pico-args", "pretty_env_logger", diff --git a/Cargo.toml b/Cargo.toml index de52605..542a7d4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,6 +12,7 @@ version = "0.1.0" any = "*" anyhow = "*" github-rs = "0.7.0" +linkify = "*" log = "0.4" pico-args = "0.3.3" pretty_env_logger = "0.4" diff --git a/src/extract.rs b/src/extract.rs index 0244082..08ed5d9 100644 --- a/src/extract.rs +++ b/src/extract.rs @@ -1,19 +1,16 @@ -use pulldown_cmark::{Event, Parser, Tag}; +use linkify::{LinkFinder, LinkKind}; + use std::collections::HashSet; use url::Url; -pub(crate) fn extract_links(md: &str) -> HashSet { - let mut links: Vec = Vec::new(); - Parser::new(md).for_each(|event| match event { - Event::Start(Tag::Link(_, link, _)) => links.push(link.into_string()), - Event::Start(Tag::Image(_, link, _)) => links.push(link.into_string()), - _ => (), - }); +pub(crate) fn extract_links(input: &str) -> HashSet { + let finder = LinkFinder::new(); + let links: Vec<_> = finder.links(input).collect(); // Only keep legit URLs. This sorts out things like anchors. // Silently ignore the parse failures for now. // TODO: Log errors in verbose mode - let links: HashSet = links.iter().flat_map(|l| Url::parse(&l)).collect(); + let links: HashSet = links.iter().flat_map(|l| Url::parse(l.as_str())).collect(); debug!("Testing links: {:#?}", links); links