mirror of
https://github.com/Hopiu/lychee.git
synced 2026-05-14 00:33:10 +00:00
Optimize plaintext extractor for small strings (#565)
Immediately return for very small strings which cannot be valid URIs. The shortest valid URI without a scheme might be g.cn (Google China); at least I am not aware of a shorter one. We set this as a lower threshold for parsing URIs from plaintext to avoid false positives and as a slight performance optimization, which could add up for big files. This threshold might be adjusted in the future.
This commit is contained in:
parent
9ece4f9552
commit
77b1724881
1 changed files with 14 additions and 0 deletions
|
|
@ -1,7 +1,21 @@
|
|||
use crate::{helpers::url, types::raw_uri::RawUri};
|
||||
|
||||
/// Shortest valid URI that lychee extracts from plaintext.
|
||||
///
|
||||
/// The shortest valid URI without a scheme might be g.cn (Google China).
|
||||
/// At least I am not aware of a shorter one. We set this as a lower threshold
|
||||
/// for parsing URIs from plaintext to avoid false positives and as a slight
|
||||
/// performance optimization, which could add up for big files.
|
||||
/// This threshold might be adjusted in the future.
|
||||
// Unit is bytes, not characters: `extract_plaintext` compares this against
// `input.len()` on a `&str`, which is the UTF-8 byte length.
const MIN_URI_LENGTH: usize = 4;
|
||||
|
||||
/// Extract unparsed URL strings from plaintext
|
||||
pub(crate) fn extract_plaintext(input: &str) -> Vec<RawUri> {
|
||||
if input.len() < MIN_URI_LENGTH {
|
||||
// Immediately return for very small strings which cannot be valid URIs
|
||||
return vec![];
|
||||
}
|
||||
|
||||
url::find_links(input)
|
||||
.map(|uri| RawUri::from(uri.as_str()))
|
||||
.collect()
|
||||
|
|
|
|||
Loading…
Reference in a new issue