From 77b172488153e7f44eba4c1363fb966ee9a33ab7 Mon Sep 17 00:00:00 2001 From: Matthias Date: Wed, 23 Mar 2022 23:06:49 +0100 Subject: [PATCH] Optimize plaintext extractor for small strings (#565) Immediately return for very small strings which cannot be valid URIs. The shortest valid URI without a scheme might be g.cn (Google China). At least I am not aware of a shorter one. We set this as a lower threshold for parsing URIs from plaintext to avoid false-positives and as a slight performance optimization, which could add up for big files. This threshold might be adjusted in the future. --- lychee-lib/src/extract/plaintext.rs | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/lychee-lib/src/extract/plaintext.rs b/lychee-lib/src/extract/plaintext.rs index 7830d2e..46a6ede 100644 --- a/lychee-lib/src/extract/plaintext.rs +++ b/lychee-lib/src/extract/plaintext.rs @@ -1,7 +1,21 @@ use crate::{helpers::url, types::raw_uri::RawUri}; +/// Length of the shortest valid URI that lychee extracts from plaintext. +/// +/// The shortest valid URI without a scheme might be g.cn (Google China). +/// At least I am not aware of a shorter one. We set this as a lower threshold +/// for parsing URIs from plaintext to avoid false-positives and as a slight +/// performance optimization, which could add up for big files. +/// This threshold might be adjusted in the future. +const MIN_URI_LENGTH: usize = 4; + /// Extract unparsed URL strings from plaintext pub(crate) fn extract_plaintext(input: &str) -> Vec<RawUri> { + if input.len() < MIN_URI_LENGTH { + // Immediately return for very small strings which cannot be valid URIs + return vec![]; + } + url::find_links(input) .map(|uri| RawUri::from(uri.as_str())) .collect()