mirror of
https://github.com/Hopiu/lychee.git
synced 2026-05-01 10:24:47 +00:00
Don't check preconnect links (#1187)
Preconnect links are used to establish a server connection without loading a specific resource yet. These links do not always point to a URL that should return a 200, and they are not user-facing, i.e. they don't show up in the final rendered version of a page. Therefore, we should not check them at all, not even in `--include-verbatim` mode, as they might not point to a valid resource. This turned out to require a significant overhaul of the html5gum extractor to handle arbitrary attribute ordering correctly. Changes to the html5gum extractor: * Refactor HTML link extractor for improved performance and maintainability - Replace Vec<u8> with String for better readability and manipulation - Introduce Element struct to encapsulate element-related data - Use `HashMap<String, String>` for current_attributes for efficient lookups - Add verbatim_stack to properly handle nested verbatim elements - Remove unsafe code where possible, using String::from_utf8_lossy - Improve attribute handling with `HashMap` entry API and prioritize `srcset` - Simplify logic and consolidate verbatim element handling - Enhance encapsulation in `LinkExtractor` struct - Improve overall performance with more efficient data structures - Increase flexibility for future feature additions or modifications Fixes #897
This commit is contained in:
parent
8026684d64
commit
11adc09725
7 changed files with 307 additions and 208 deletions
1
Cargo.lock
generated
1
Cargo.lock
generated
|
|
@ -2676,6 +2676,7 @@ dependencies = [
|
|||
"par-stream",
|
||||
"path-clean",
|
||||
"percent-encoding",
|
||||
"pretty_assertions",
|
||||
"pulldown-cmark",
|
||||
"regex",
|
||||
"reqwest 0.12.8",
|
||||
|
|
|
|||
|
|
@ -69,6 +69,7 @@ wiremock = "0.6.2"
|
|||
serde_json = "1.0.128"
|
||||
rstest = "0.23.0"
|
||||
toml = "0.8.19"
|
||||
pretty_assertions = "1.4.0"
|
||||
|
||||
[features]
|
||||
|
||||
|
|
@ -91,4 +92,4 @@ vendored-openssl = ["openssl-sys/vendored"]
|
|||
# See https://users.rust-lang.org/t/36630
|
||||
check_example_domains = []
|
||||
|
||||
default = ["native-tls", "email-check"]
|
||||
default = ["native-tls", "email-check"]
|
||||
|
|
@ -6,7 +6,9 @@ use html5ever::{
|
|||
tokenizer::{Tag, TagKind, Token, TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts},
|
||||
};
|
||||
|
||||
use super::{super::plaintext::extract_plaintext, is_email_link, is_verbatim_elem, srcset};
|
||||
use super::{
|
||||
super::plaintext::extract_raw_uri_from_plaintext, is_email_link, is_verbatim_elem, srcset,
|
||||
};
|
||||
use crate::types::uri::raw::RawUri;
|
||||
|
||||
#[derive(Clone, Default)]
|
||||
|
|
@ -26,7 +28,9 @@ impl TokenSink for LinkExtractor {
|
|||
if self.current_verbatim_element_name.borrow().is_some() {
|
||||
return TokenSinkResult::Continue;
|
||||
}
|
||||
self.links.borrow_mut().extend(extract_plaintext(&raw));
|
||||
self.links
|
||||
.borrow_mut()
|
||||
.extend(extract_raw_uri_from_plaintext(&raw));
|
||||
}
|
||||
Token::TagToken(tag) => {
|
||||
let Tag {
|
||||
|
|
@ -72,6 +76,14 @@ impl TokenSink for LinkExtractor {
|
|||
}
|
||||
}
|
||||
|
||||
// Check and exclude rel=preconnect. Unlike prefetch and preload,
|
||||
// preconnect only establishes a server connection and might not be a link to a resource
|
||||
if let Some(rel) = attrs.iter().find(|attr| &attr.name.local == "rel") {
|
||||
if rel.value.contains("preconnect") {
|
||||
return TokenSinkResult::Continue;
|
||||
}
|
||||
}
|
||||
|
||||
for attr in attrs {
|
||||
let urls = LinkExtractor::extract_urls_from_elem_attr(
|
||||
&attr.name.local,
|
||||
|
|
@ -80,7 +92,7 @@ impl TokenSink for LinkExtractor {
|
|||
);
|
||||
|
||||
let new_urls = match urls {
|
||||
None => extract_plaintext(&attr.value),
|
||||
None => extract_raw_uri_from_plaintext(&attr.value),
|
||||
Some(urls) => urls
|
||||
.into_iter()
|
||||
.filter(|url| {
|
||||
|
|
@ -140,6 +152,7 @@ impl LinkExtractor {
|
|||
// and https://html.spec.whatwg.org/multipage/indices.html#attributes-1
|
||||
|
||||
match (elem_name, attr_name) {
|
||||
|
||||
// Common element/attribute combinations for links
|
||||
(_, "href" | "src" | "cite" | "usemap")
|
||||
// Less common (but still valid!) combinations
|
||||
|
|
@ -380,4 +393,24 @@ mod tests {
|
|||
let uris = extract_html(input, false);
|
||||
assert_eq!(uris, expected);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_skip_preconnect() {
|
||||
let input = r#"
|
||||
<link rel="preconnect" href="https://example.com">
|
||||
"#;
|
||||
|
||||
let uris = extract_html(input, false);
|
||||
assert!(uris.is_empty());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_skip_preconnect_reverse_order() {
|
||||
let input = r#"
|
||||
<link href="https://example.com" rel="preconnect">
|
||||
"#;
|
||||
|
||||
let uris = extract_html(input, false);
|
||||
assert!(uris.is_empty());
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,199 +1,211 @@
|
|||
use std::collections::HashSet;
|
||||
|
||||
use html5gum::{Emitter, Error, State, Tokenizer};
|
||||
use std::collections::{HashMap, HashSet};
|
||||
|
||||
use super::{is_email_link, is_verbatim_elem, srcset};
|
||||
use crate::{extract::plaintext::extract_plaintext, types::uri::raw::RawUri};
|
||||
use crate::{extract::plaintext::extract_raw_uri_from_plaintext, types::uri::raw::RawUri};
|
||||
|
||||
#[derive(Clone)]
|
||||
/// Extract links from HTML documents.
|
||||
///
|
||||
/// This is the main driver for the html5gum tokenizer.
|
||||
/// It implements the `Emitter` trait, which is used by the tokenizer to
|
||||
/// communicate with the caller.
|
||||
///
|
||||
/// The `LinkExtractor` keeps track of the current element being processed,
|
||||
/// the current attribute being processed, and a bunch of plain characters
|
||||
/// currently being processed.
|
||||
///
|
||||
/// The `links` vector contains all links extracted from the HTML document and
|
||||
/// the `fragments` set contains all fragments extracted from the HTML document.
|
||||
#[derive(Clone, Default)]
|
||||
struct LinkExtractor {
|
||||
// note: what html5gum calls a tag, lychee calls an element
|
||||
/// Links extracted from the HTML document.
|
||||
links: Vec<RawUri>,
|
||||
/// Fragments extracted from the HTML document.
|
||||
fragments: HashSet<String>,
|
||||
current_string: Vec<u8>,
|
||||
current_element_name: Vec<u8>,
|
||||
current_element_is_closing: bool,
|
||||
current_element_nofollow: bool,
|
||||
current_attribute_name: Vec<u8>,
|
||||
current_attribute_value: Vec<u8>,
|
||||
last_start_element: Vec<u8>,
|
||||
/// Whether to include verbatim elements in the output.
|
||||
include_verbatim: bool,
|
||||
current_verbatim_element_name: Option<Vec<u8>>,
|
||||
/// Current element being processed.
|
||||
current_element: Element,
|
||||
/// Current attributes being processed.
|
||||
/// This is a map of key-value pairs (iteration order is unspecified), where the key is the attribute name
|
||||
/// and the value is the attribute value.
|
||||
current_attributes: HashMap<String, String>,
|
||||
/// Current attribute name being processed.
|
||||
current_attribute_name: String,
|
||||
/// A bunch of plain characters currently being processed.
|
||||
current_raw_string: String,
|
||||
/// Stack of element names of the currently open verbatim blocks.
|
||||
/// Used to keep track of nested verbatim blocks.
|
||||
verbatim_stack: Vec<String>,
|
||||
}
|
||||
|
||||
/// this is the same as `std::str::from_utf8_unchecked`, but with extra debug assertions for ease
|
||||
/// of debugging
|
||||
unsafe fn from_utf8_unchecked(s: &[u8]) -> &str {
|
||||
debug_assert!(std::str::from_utf8(s).is_ok());
|
||||
std::str::from_utf8_unchecked(s)
|
||||
#[derive(Clone, Default)]
|
||||
struct Element {
|
||||
/// Current element name being processed.
|
||||
/// This is called a tag in html5gum.
|
||||
name: String,
|
||||
/// Whether the current element is a closing tag.
|
||||
is_closing: bool,
|
||||
}
|
||||
|
||||
impl LinkExtractor {
|
||||
pub(crate) fn new(include_verbatim: bool) -> Self {
|
||||
LinkExtractor {
|
||||
links: Vec::new(),
|
||||
fragments: HashSet::new(),
|
||||
current_string: Vec::new(),
|
||||
current_element_name: Vec::new(),
|
||||
current_element_is_closing: false,
|
||||
current_element_nofollow: false,
|
||||
current_attribute_name: Vec::new(),
|
||||
current_attribute_value: Vec::new(),
|
||||
last_start_element: Vec::new(),
|
||||
/// Create a new `LinkExtractor`.
|
||||
///
|
||||
/// Set `include_verbatim` to `true` if you want to include verbatim
|
||||
/// elements in the output.
|
||||
fn new(include_verbatim: bool) -> Self {
|
||||
Self {
|
||||
include_verbatim,
|
||||
current_verbatim_element_name: None,
|
||||
..Default::default()
|
||||
}
|
||||
}
|
||||
|
||||
/// Extract all semantically known links from a given HTML attribute.
|
||||
#[allow(clippy::unnested_or_patterns)]
|
||||
pub(crate) fn extract_urls_from_elem_attr<'a>(
|
||||
attr_name: &str,
|
||||
elem_name: &str,
|
||||
attr_value: &'a str,
|
||||
) -> Option<impl Iterator<Item = &'a str>> {
|
||||
// For a comprehensive list of elements that might contain URLs/URIs
|
||||
// see https://www.w3.org/TR/REC-html40/index/attributes.html
|
||||
// and https://html.spec.whatwg.org/multipage/indices.html#attributes-1
|
||||
// For a comprehensive list of elements that might contain URLs/URIs
|
||||
// see https://www.w3.org/TR/REC-html40/index/attributes.html
|
||||
// and https://html.spec.whatwg.org/multipage/indices.html#attributes-1
|
||||
fn extract_urls_from_elem_attr(&self) -> Vec<RawUri> {
|
||||
let mut urls = Vec::new();
|
||||
|
||||
match (elem_name, attr_name) {
|
||||
// Common element/attribute combinations for links
|
||||
(_, "href" | "src" | "cite" | "usemap")
|
||||
// Less common (but still valid!) combinations
|
||||
| ("applet", "codebase")
|
||||
| ("body", "background")
|
||||
| ("button", "formaction")
|
||||
| ("command", "icon")
|
||||
| ("form", "action")
|
||||
| ("frame", "longdesc")
|
||||
| ("head", "profile")
|
||||
| ("html", "manifest")
|
||||
| ("iframe", "longdesc")
|
||||
| ("img", "longdesc")
|
||||
| ("input", "formaction")
|
||||
| ("object", "classid")
|
||||
| ("object", "codebase")
|
||||
| ("object", "data")
|
||||
| ("video", "poster") => {
|
||||
Some(vec![attr_value].into_iter())
|
||||
}
|
||||
(_, "srcset") => {
|
||||
Some(srcset::parse(attr_value).into_iter())
|
||||
}
|
||||
_ => None,
|
||||
// Process 'srcset' attribute first
|
||||
if let Some(srcset) = self.current_attributes.get("srcset") {
|
||||
urls.extend(srcset::parse(srcset).into_iter().map(|url| RawUri {
|
||||
text: url.to_string(),
|
||||
element: Some(self.current_element.name.clone()),
|
||||
attribute: Some("srcset".to_string()),
|
||||
}));
|
||||
}
|
||||
|
||||
// Process other attributes
|
||||
for (attr_name, attr_value) in &self.current_attributes {
|
||||
#[allow(clippy::unnested_or_patterns)]
|
||||
match (self.current_element.name.as_str(), attr_name.as_str()) {
|
||||
// Common element/attribute combinations for links
|
||||
(_, "href" | "src" | "cite" | "usemap") |
|
||||
// Less common (but still valid!) combinations
|
||||
("applet", "codebase") |
|
||||
("body", "background") |
|
||||
("button", "formaction") |
|
||||
("command", "icon") |
|
||||
("form", "action") |
|
||||
("frame", "longdesc") |
|
||||
("head", "profile") |
|
||||
("html", "manifest") |
|
||||
("iframe", "longdesc") |
|
||||
("img", "longdesc") |
|
||||
("input", "formaction") |
|
||||
("object", "classid" | "codebase" | "data") |
|
||||
("video", "poster") => {
|
||||
urls.push(RawUri {
|
||||
text: attr_value.to_string(),
|
||||
element: Some(self.current_element.name.clone()),
|
||||
attribute: Some(attr_name.to_string()),
|
||||
});
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
|
||||
urls
|
||||
}
|
||||
|
||||
/// Extract links from the current string and add them to the links vector.
|
||||
fn flush_current_characters(&mut self) {
|
||||
// safety: since we feed html5gum tokenizer with a &str, this must be a &str as well.
|
||||
let name = unsafe { from_utf8_unchecked(&self.current_element_name) };
|
||||
if !self.include_verbatim && (is_verbatim_elem(name) || self.inside_verbatim_block()) {
|
||||
self.update_verbatim_element_name();
|
||||
// Early return if we don't want to extract links from preformatted text
|
||||
self.current_string.clear();
|
||||
if !self.include_verbatim
|
||||
&& (is_verbatim_elem(&self.current_element.name) || !self.verbatim_stack.is_empty())
|
||||
{
|
||||
self.update_verbatim_element();
|
||||
// Early return since we don't want to extract links from verbatim
|
||||
// blocks according to the configuration.
|
||||
self.current_raw_string.clear();
|
||||
return;
|
||||
}
|
||||
|
||||
let raw = unsafe { from_utf8_unchecked(&self.current_string) };
|
||||
self.links.extend(extract_plaintext(raw));
|
||||
self.current_string.clear();
|
||||
}
|
||||
|
||||
/// Check if we are currently inside a verbatim element.
|
||||
const fn inside_verbatim_block(&self) -> bool {
|
||||
self.current_verbatim_element_name.is_some()
|
||||
self.links
|
||||
.extend(extract_raw_uri_from_plaintext(&self.current_raw_string));
|
||||
self.current_raw_string.clear();
|
||||
}
|
||||
|
||||
/// Update the current verbatim element name.
|
||||
///
|
||||
/// Keeps track of the last verbatim element name, so that we can
|
||||
/// properly handle nested verbatim blocks.
|
||||
fn update_verbatim_element_name(&mut self) {
|
||||
if self.current_element_is_closing {
|
||||
if self.inside_verbatim_block() {
|
||||
// If we are closing a verbatim element, we need to check if it is the
|
||||
// top-level verbatim element. If it is, we need to reset the verbatim block.
|
||||
if Some(&self.current_element_name) == self.current_verbatim_element_name.as_ref() {
|
||||
self.current_verbatim_element_name = None;
|
||||
self.current_attribute_name.clear();
|
||||
self.current_attribute_value.clear();
|
||||
fn update_verbatim_element(&mut self) {
|
||||
if self.current_element.is_closing {
|
||||
if let Some(last_verbatim) = self.verbatim_stack.last() {
|
||||
if last_verbatim == &self.current_element.name {
|
||||
self.verbatim_stack.pop();
|
||||
}
|
||||
}
|
||||
} else if !self.include_verbatim
|
||||
&& is_verbatim_elem(unsafe { from_utf8_unchecked(&self.current_element_name) })
|
||||
{
|
||||
// If we are opening a verbatim element, we need to check if we are already
|
||||
// inside a verbatim element. If so, we need to ignore this element.
|
||||
if !self.inside_verbatim_block() {
|
||||
self.current_verbatim_element_name = Some(self.current_element_name.clone());
|
||||
}
|
||||
} else if !self.include_verbatim && is_verbatim_elem(&self.current_element.name) {
|
||||
self.verbatim_stack.push(self.current_element.name.clone());
|
||||
}
|
||||
}
|
||||
|
||||
fn flush_old_attribute(&mut self) {
|
||||
/// Flush the current element and attribute values to the links vector.
|
||||
///
|
||||
/// This function is called whenever a new element is encountered or when the
|
||||
/// current element is closing. It extracts URLs from the current attribute value
|
||||
/// and adds them to the links vector.
|
||||
///
|
||||
/// Here are the rules for extracting links:
|
||||
/// - If the current element has a `rel=nofollow` attribute, the current attribute
|
||||
/// value is ignored.
|
||||
/// - If the current element has a `rel=preconnect` attribute, the current attribute
|
||||
/// value is ignored.
|
||||
/// - If the current attribute value is not a URL, it is treated as plain text and
|
||||
/// added to the links vector.
|
||||
/// - If the current attribute name is `id`, the current attribute value is added
|
||||
/// to the fragments set.
|
||||
///
|
||||
/// The current attribute name and value are cleared after processing.
|
||||
fn flush_links(&mut self) {
|
||||
self.update_verbatim_element();
|
||||
|
||||
if !self.include_verbatim
|
||||
&& (!self.verbatim_stack.is_empty() || is_verbatim_elem(&self.current_element.name))
|
||||
{
|
||||
// safety: since we feed html5gum tokenizer with a &str, this must be a &str as well.
|
||||
let name = unsafe { from_utf8_unchecked(&self.current_element_name) };
|
||||
|
||||
// Early return if we don't want to extract links from verbatim
|
||||
// blocks (e.g. preformatted text)
|
||||
if !self.include_verbatim && (is_verbatim_elem(name) || self.inside_verbatim_block()) {
|
||||
self.update_verbatim_element_name();
|
||||
return;
|
||||
}
|
||||
|
||||
let attr = unsafe { from_utf8_unchecked(&self.current_attribute_name) };
|
||||
let value = unsafe { from_utf8_unchecked(&self.current_attribute_value) };
|
||||
|
||||
// Ignore links with rel=nofollow
|
||||
// This may be set on a different iteration on the same element/tag before,
|
||||
// so we check the boolean separately right after
|
||||
if attr == "rel" && value.contains("nofollow") {
|
||||
self.current_element_nofollow = true;
|
||||
}
|
||||
if self.current_element_nofollow {
|
||||
self.current_attribute_name.clear();
|
||||
self.current_attribute_value.clear();
|
||||
return;
|
||||
}
|
||||
|
||||
let urls = LinkExtractor::extract_urls_from_elem_attr(attr, name, value);
|
||||
|
||||
let new_urls = match urls {
|
||||
None => extract_plaintext(value),
|
||||
Some(urls) => urls
|
||||
.into_iter()
|
||||
.filter(|url| {
|
||||
// Only accept email addresses, which occur in `href` attributes
|
||||
// and start with `mailto:`. Technically, email addresses could
|
||||
// also occur in plain text, but we don't want to extract those
|
||||
// because of the high false positive rate.
|
||||
//
|
||||
// This ignores links like `<img srcset="v2@1.5x.png">`
|
||||
let is_email = is_email_link(url);
|
||||
let is_mailto = url.starts_with("mailto:");
|
||||
let is_phone = url.starts_with("tel:");
|
||||
let is_href = attr == "href";
|
||||
|
||||
!is_email || (is_mailto && is_href) || (is_phone && is_href)
|
||||
})
|
||||
.map(|url| RawUri {
|
||||
text: url.to_string(),
|
||||
element: Some(name.to_string()),
|
||||
attribute: Some(attr.to_string()),
|
||||
})
|
||||
.collect::<Vec<_>>(),
|
||||
};
|
||||
|
||||
self.links.extend(new_urls);
|
||||
|
||||
if attr == "id" {
|
||||
self.fragments.insert(value.to_string());
|
||||
}
|
||||
self.current_attributes.clear();
|
||||
return;
|
||||
}
|
||||
|
||||
self.current_attribute_name.clear();
|
||||
self.current_attribute_value.clear();
|
||||
if self.current_attributes.get("rel").map_or(false, |rel| {
|
||||
rel.split(',')
|
||||
.any(|r| r.trim() == "nofollow" || r.trim() == "preconnect")
|
||||
}) {
|
||||
self.current_attributes.clear();
|
||||
return;
|
||||
}
|
||||
|
||||
let new_urls = self
|
||||
.extract_urls_from_elem_attr()
|
||||
.into_iter()
|
||||
.filter(|url| {
|
||||
// Only accept email addresses or phone numbers, which
|
||||
// occur in `href` attributes and start with `mailto:`
|
||||
// or `tel:`, respectively
|
||||
//
|
||||
// Technically, email addresses could also occur in
|
||||
// plain text, but we don't want to extract those
|
||||
// because of the high false-positive rate.
|
||||
//
|
||||
// This skips links like `<img srcset="v2@1.5x.png">`
|
||||
let is_email = is_email_link(&url.text);
|
||||
let is_mailto = url.text.starts_with("mailto:");
|
||||
let is_phone = url.text.starts_with("tel:");
|
||||
let is_href = url.attribute.as_deref() == Some("href");
|
||||
|
||||
!is_email || (is_mailto && is_href) || (is_phone && is_href)
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
self.links.extend(new_urls);
|
||||
|
||||
if let Some(id) = self.current_attributes.get("id") {
|
||||
self.fragments.insert(id.to_string());
|
||||
}
|
||||
|
||||
self.current_attributes.clear();
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -201,38 +213,41 @@ impl Emitter for &mut LinkExtractor {
|
|||
type Token = ();
|
||||
|
||||
fn set_last_start_tag(&mut self, last_start_tag: Option<&[u8]>) {
|
||||
self.last_start_element.clear();
|
||||
self.last_start_element
|
||||
.extend(last_start_tag.unwrap_or_default());
|
||||
self.current_element.name =
|
||||
String::from_utf8_lossy(last_start_tag.unwrap_or_default()).into_owned();
|
||||
}
|
||||
|
||||
fn emit_eof(&mut self) {
|
||||
self.flush_current_characters();
|
||||
}
|
||||
|
||||
fn emit_error(&mut self, _: Error) {}
|
||||
|
||||
#[inline]
|
||||
fn should_emit_errors(&mut self) -> bool {
|
||||
false
|
||||
}
|
||||
|
||||
fn pop_token(&mut self) -> Option<()> {
|
||||
None
|
||||
}
|
||||
|
||||
/// Emit a bunch of plain characters as character tokens.
|
||||
fn emit_string(&mut self, c: &[u8]) {
|
||||
self.current_string.extend(c);
|
||||
self.current_raw_string
|
||||
.push_str(&String::from_utf8_lossy(c));
|
||||
}
|
||||
|
||||
fn init_start_tag(&mut self) {
|
||||
self.flush_current_characters();
|
||||
self.current_element_name.clear();
|
||||
self.current_element_nofollow = false;
|
||||
self.current_element_is_closing = false;
|
||||
self.current_element = Element::default();
|
||||
}
|
||||
|
||||
fn init_end_tag(&mut self) {
|
||||
self.init_start_tag();
|
||||
self.current_element_is_closing = true;
|
||||
self.flush_current_characters();
|
||||
self.current_element = Element {
|
||||
name: String::new(),
|
||||
is_closing: true,
|
||||
};
|
||||
}
|
||||
|
||||
fn init_comment(&mut self) {
|
||||
|
|
@ -240,51 +255,66 @@ impl Emitter for &mut LinkExtractor {
|
|||
}
|
||||
|
||||
fn emit_current_tag(&mut self) -> Option<State> {
|
||||
let next_state = if self.current_element_is_closing {
|
||||
self.flush_links();
|
||||
|
||||
let next_state = if self.current_element.is_closing {
|
||||
None
|
||||
} else {
|
||||
self.last_start_element.clear();
|
||||
self.last_start_element.extend(&self.current_element_name);
|
||||
html5gum::naive_next_state(&self.current_element_name)
|
||||
html5gum::naive_next_state(self.current_element.name.as_bytes())
|
||||
};
|
||||
|
||||
self.flush_old_attribute();
|
||||
next_state
|
||||
}
|
||||
|
||||
fn emit_current_doctype(&mut self) {}
|
||||
|
||||
fn set_self_closing(&mut self) {
|
||||
self.current_element_is_closing = true;
|
||||
self.current_element.is_closing = true;
|
||||
}
|
||||
|
||||
fn set_force_quirks(&mut self) {}
|
||||
|
||||
fn push_tag_name(&mut self, s: &[u8]) {
|
||||
self.current_element_name.extend(s);
|
||||
self.current_element
|
||||
.name
|
||||
.push_str(&String::from_utf8_lossy(s));
|
||||
}
|
||||
|
||||
fn push_comment(&mut self, _: &[u8]) {}
|
||||
|
||||
fn push_doctype_name(&mut self, _: &[u8]) {}
|
||||
|
||||
fn init_doctype(&mut self) {
|
||||
self.flush_current_characters();
|
||||
}
|
||||
|
||||
fn init_attribute(&mut self) {
|
||||
self.flush_old_attribute();
|
||||
self.current_attribute_name.clear();
|
||||
}
|
||||
|
||||
fn push_attribute_name(&mut self, s: &[u8]) {
|
||||
self.current_attribute_name.extend(s);
|
||||
self.current_attribute_name
|
||||
.push_str(&String::from_utf8_lossy(s));
|
||||
}
|
||||
|
||||
fn push_attribute_value(&mut self, s: &[u8]) {
|
||||
self.current_attribute_value.extend(s);
|
||||
let value = String::from_utf8_lossy(s);
|
||||
self.current_attributes
|
||||
.entry(self.current_attribute_name.clone())
|
||||
.and_modify(|v| v.push_str(&value))
|
||||
.or_insert_with(|| value.into_owned());
|
||||
}
|
||||
|
||||
fn set_doctype_public_identifier(&mut self, _: &[u8]) {}
|
||||
|
||||
fn set_doctype_system_identifier(&mut self, _: &[u8]) {}
|
||||
|
||||
fn push_doctype_public_identifier(&mut self, _: &[u8]) {}
|
||||
|
||||
fn push_doctype_system_identifier(&mut self, _: &[u8]) {}
|
||||
|
||||
fn current_is_appropriate_end_tag_token(&mut self) -> bool {
|
||||
self.current_element_is_closing
|
||||
&& !self.current_element_name.is_empty()
|
||||
&& self.current_element_name == self.last_start_element
|
||||
self.current_element.is_closing && !self.current_element.name.is_empty()
|
||||
}
|
||||
|
||||
fn emit_current_comment(&mut self) {}
|
||||
|
|
@ -422,7 +452,7 @@ mod tests {
|
|||
}
|
||||
|
||||
#[test]
|
||||
fn test_include_nofollow() {
|
||||
fn test_exclude_nofollow() {
|
||||
let input = r#"
|
||||
<a rel="nofollow" href="https://foo.com">do not follow me</a>
|
||||
<a rel="canonical,nofollow,dns-prefetch" href="https://example.com">do not follow me</a>
|
||||
|
|
@ -437,6 +467,15 @@ mod tests {
|
|||
assert_eq!(uris, expected);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_exclude_nofollow_change_order() {
|
||||
let input = r#"
|
||||
<a href="https://foo.com" rel="nofollow">do not follow me</a>
|
||||
"#;
|
||||
let uris = extract_html(input, false);
|
||||
assert!(uris.is_empty());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_exclude_script_tags() {
|
||||
let input = r#"
|
||||
|
|
@ -517,17 +556,7 @@ mod tests {
|
|||
|
||||
#[test]
|
||||
fn test_email_false_positive() {
|
||||
let input = r#"<!DOCTYPE html>
|
||||
<html lang="en-US">
|
||||
<head>
|
||||
<meta charset="utf-8">
|
||||
<title>Test</title>
|
||||
</head>
|
||||
<body>
|
||||
<img srcset="v2@1.5x.png" alt="Wikipedia" width="200" height="183">
|
||||
</body>
|
||||
</html>"#;
|
||||
|
||||
let input = r#"<img srcset="v2@1.5x.png" alt="Wikipedia" width="200" height="183">"#;
|
||||
let uris = extract_html(input, false);
|
||||
assert!(uris.is_empty());
|
||||
}
|
||||
|
|
@ -558,4 +587,24 @@ mod tests {
|
|||
let uris = extract_html(input, false);
|
||||
assert_eq!(uris, expected);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_skip_preconnect() {
|
||||
let input = r#"
|
||||
<link rel="preconnect" href="https://example.com">
|
||||
"#;
|
||||
|
||||
let uris = extract_html(input, false);
|
||||
assert!(uris.is_empty());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_skip_preconnect_reverse_order() {
|
||||
let input = r#"
|
||||
<link href="https://example.com" rel="preconnect">
|
||||
"#;
|
||||
|
||||
let uris = extract_html(input, false);
|
||||
assert!(uris.is_empty());
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -3,7 +3,7 @@ use std::collections::{HashMap, HashSet};
|
|||
|
||||
use pulldown_cmark::{CowStr, Event, LinkType, Options, Parser, Tag, TagEnd};
|
||||
|
||||
use crate::{extract::plaintext::extract_plaintext, types::uri::raw::RawUri};
|
||||
use crate::{extract::plaintext::extract_raw_uri_from_plaintext, types::uri::raw::RawUri};
|
||||
|
||||
use super::html::html5gum::{extract_html, extract_html_fragments};
|
||||
|
||||
|
|
@ -59,7 +59,7 @@ pub(crate) fn extract_markdown(input: &str, include_verbatim: bool) -> Vec<RawUr
|
|||
LinkType::Autolink |
|
||||
// Email address in autolink like `<john@example.org>`
|
||||
LinkType::Email =>
|
||||
Some(extract_plaintext(&dest_url)),
|
||||
Some(extract_raw_uri_from_plaintext(&dest_url)),
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -91,7 +91,7 @@ pub(crate) fn extract_markdown(input: &str, include_verbatim: bool) -> Vec<RawUr
|
|||
if inside_code_block && !include_verbatim {
|
||||
None
|
||||
} else {
|
||||
Some(extract_plaintext(&txt))
|
||||
Some(extract_raw_uri_from_plaintext(&txt))
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -105,7 +105,7 @@ pub(crate) fn extract_markdown(input: &str, include_verbatim: bool) -> Vec<RawUr
|
|||
// An inline code node.
|
||||
Event::Code(code) => {
|
||||
if include_verbatim {
|
||||
Some(extract_plaintext(&code))
|
||||
Some(extract_raw_uri_from_plaintext(&code))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
|
|
|
|||
|
|
@ -5,7 +5,7 @@ pub mod markdown;
|
|||
mod plaintext;
|
||||
|
||||
use markdown::extract_markdown;
|
||||
use plaintext::extract_plaintext;
|
||||
use plaintext::extract_raw_uri_from_plaintext;
|
||||
|
||||
/// A handler for extracting links from various input formats like Markdown and
|
||||
/// HTML. Allocations should be avoided if possible as this is a
|
||||
|
|
@ -50,13 +50,14 @@ impl Extractor {
|
|||
html::html5gum::extract_html(&input_content.content, self.include_verbatim)
|
||||
}
|
||||
}
|
||||
FileType::Plaintext => extract_plaintext(&input_content.content),
|
||||
FileType::Plaintext => extract_raw_uri_from_plaintext(&input_content.content),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use pretty_assertions::assert_eq;
|
||||
use reqwest::Url;
|
||||
use std::{collections::HashSet, path::Path};
|
||||
|
||||
|
|
@ -72,20 +73,33 @@ mod tests {
|
|||
let input_content = InputContent::from_string(input, file_type);
|
||||
|
||||
let extractor = Extractor::new(false, false);
|
||||
let uris_html5gum = extractor
|
||||
let uris_html5gum: HashSet<Uri> = extractor
|
||||
.extract(&input_content)
|
||||
.into_iter()
|
||||
.filter_map(|raw_uri| Uri::try_from(raw_uri).ok())
|
||||
.collect();
|
||||
let uris_html5gum_sorted: Vec<Uri> = {
|
||||
let mut uris = uris_html5gum.clone().into_iter().collect::<Vec<_>>();
|
||||
uris.sort();
|
||||
uris
|
||||
};
|
||||
|
||||
let extractor = Extractor::new(true, false);
|
||||
let uris_html5ever = extractor
|
||||
let uris_html5ever: HashSet<Uri> = extractor
|
||||
.extract(&input_content)
|
||||
.into_iter()
|
||||
.filter_map(|raw_uri| Uri::try_from(raw_uri).ok())
|
||||
.collect();
|
||||
let uris_html5ever_sorted: Vec<Uri> = {
|
||||
let mut uris = uris_html5ever.into_iter().collect::<Vec<_>>();
|
||||
uris.sort();
|
||||
uris
|
||||
};
|
||||
|
||||
assert_eq!(uris_html5gum, uris_html5ever);
|
||||
assert_eq!(
|
||||
uris_html5gum_sorted, uris_html5ever_sorted,
|
||||
"Mismatch between html5gum and html5ever"
|
||||
);
|
||||
uris_html5gum
|
||||
}
|
||||
|
||||
|
|
@ -241,7 +255,8 @@ mod tests {
|
|||
let expected_links = IntoIterator::into_iter([
|
||||
website("https://example.com/"),
|
||||
website("https://example.com/favicon.ico"),
|
||||
website("https://fonts.externalsite.com"),
|
||||
// Note that we exclude `preconnect` links:
|
||||
// website("https://fonts.externalsite.com"),
|
||||
website("https://example.com/docs/"),
|
||||
website("https://example.com/forum"),
|
||||
])
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
use crate::{types::uri::raw::RawUri, utils::url};
|
||||
|
||||
/// Extract unparsed URL strings from plaintext
|
||||
pub(crate) fn extract_plaintext(input: &str) -> Vec<RawUri> {
|
||||
pub(crate) fn extract_raw_uri_from_plaintext(input: &str) -> Vec<RawUri> {
|
||||
url::find_links(input)
|
||||
.map(|uri| RawUri::from(uri.as_str()))
|
||||
.collect()
|
||||
|
|
@ -14,7 +14,7 @@ mod tests {
|
|||
#[test]
|
||||
fn test_extract_local_links() {
|
||||
let input = "http://127.0.0.1/ and http://127.0.0.1:8888/ are local links.";
|
||||
let links: Vec<RawUri> = extract_plaintext(input);
|
||||
let links: Vec<RawUri> = extract_raw_uri_from_plaintext(input);
|
||||
assert_eq!(
|
||||
links,
|
||||
[
|
||||
|
|
@ -29,7 +29,7 @@ mod tests {
|
|||
let input = "https://www.apache.org/licenses/LICENSE-2.0\n";
|
||||
let uri = RawUri::from(input.trim_end());
|
||||
|
||||
let uris: Vec<RawUri> = extract_plaintext(input);
|
||||
let uris: Vec<RawUri> = extract_raw_uri_from_plaintext(input);
|
||||
assert_eq!(vec![uri], uris);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in a new issue