Merge branch 'BlackDex-optimize-icon-html-parsing' into main

This commit is contained in:
Daniel García 2022-06-26 21:54:10 +02:00
commit 54729f3c1e
No known key found for this signature in database
GPG key ID: FC8A7D14C3CD543A
3 changed files with 102 additions and 106 deletions

155
Cargo.lock generated
View file

@ -378,7 +378,7 @@ dependencies = [
"rand",
"sha2",
"subtle",
"time 0.3.9",
"time 0.3.11",
"version_check",
]
@ -394,7 +394,7 @@ dependencies = [
"publicsuffix",
"serde",
"serde_json",
"time 0.3.9",
"time 0.3.11",
"url 2.2.2",
]
@ -445,12 +445,12 @@ dependencies = [
[[package]]
name = "crossbeam-utils"
version = "0.8.8"
version = "0.8.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0bf124c720b7686e3c2663cf54062ab0f68a88af2fb6a030e87e30bf721fcb38"
checksum = "7d82ee10ce34d7bc12c2122495e7593a9c41347ecdd64185af4ecf72cb1a7f83"
dependencies = [
"cfg-if",
"lazy_static",
"once_cell",
]
[[package]]
@ -526,7 +526,7 @@ dependencies = [
"cfg-if",
"hashbrown 0.12.1",
"lock_api",
"parking_lot_core 0.9.3",
"parking_lot_core",
]
[[package]]
@ -967,7 +967,7 @@ dependencies = [
"futures-timer",
"no-std-compat",
"nonzero_ext",
"parking_lot 0.12.1",
"parking_lot",
"quanta",
"rand",
"smallvec",
@ -1071,9 +1071,9 @@ dependencies = [
[[package]]
name = "html5gum"
version = "0.4.0"
version = "0.5.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2dad48b66db55322add2819ae1d7bda0c32f3415269a08330679dbc8b0afeb30"
checksum = "3404cc217cc3e11d09c8ac9ccf8b1e540f64477c253d6dc70b5a5074782d934d"
dependencies = [
"jetscii",
]
@ -1179,12 +1179,12 @@ dependencies = [
[[package]]
name = "indexmap"
version = "1.8.2"
version = "1.9.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e6012d540c5baa3589337a98ce73408de9b5a25ec9fc2c6fd6be8f0d39e0ca5a"
checksum = "10a35a97730320ffe8e2d410b5d3b69279b98d2c14bdb8b70ea89ecf7888d41e"
dependencies = [
"autocfg",
"hashbrown 0.11.2",
"hashbrown 0.12.1",
"serde",
]
@ -1246,18 +1246,18 @@ dependencies = [
[[package]]
name = "js-sys"
version = "0.3.57"
version = "0.3.58"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "671a26f820db17c2a2750743f1dd03bafd15b98c9f30c7c2628c024c05d73397"
checksum = "c3fac17f7123a73ca62df411b1bf727ccc805daa070338fda671c86dac1bdc27"
dependencies = [
"wasm-bindgen",
]
[[package]]
name = "jsonwebtoken"
version = "8.1.0"
version = "8.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cc9051c17f81bae79440afa041b3a278e1de71bfb96d32454b477fd4703ccb6f"
checksum = "1aa4b4af834c6cfd35d8763d359661b90f2e45d8f750a0849156c7f4671af09c"
dependencies = [
"base64",
"pem",
@ -1466,9 +1466,9 @@ dependencies = [
[[package]]
name = "mio"
version = "0.8.3"
version = "0.8.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "713d550d9b44d89174e066b7a6217ae06234c10cb47819a88290d2b353c31799"
checksum = "57ee1c23c7c63b0c9250c339ffdc69255f110b298b901b9f6c82547b7b87caaf"
dependencies = [
"libc",
"log",
@ -1678,9 +1678,9 @@ checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf"
[[package]]
name = "openssl-src"
version = "111.20.0+1.1.1o"
version = "111.21.0+1.1.1p"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "92892c4f87d56e376e469ace79f1128fdaded07646ddf73aa0be4706ff712dec"
checksum = "6d0a8313729211913936f1b95ca47a5fc7f2e04cd658c115388287f8a8361008"
dependencies = [
"cc",
]
@ -1699,17 +1699,6 @@ dependencies = [
"vcpkg",
]
[[package]]
name = "parking_lot"
version = "0.11.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7d17b78036a60663b797adeaee46f5c9dfebb86948d1255007a1d6be0271ff99"
dependencies = [
"instant",
"lock_api",
"parking_lot_core 0.8.5",
]
[[package]]
name = "parking_lot"
version = "0.12.1"
@ -1717,21 +1706,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f"
dependencies = [
"lock_api",
"parking_lot_core 0.9.3",
]
[[package]]
name = "parking_lot_core"
version = "0.8.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d76e8e1493bcac0d2766c42737f34458f1c8c50c0d23bcb24ea953affb273216"
dependencies = [
"cfg-if",
"instant",
"libc",
"redox_syscall",
"smallvec",
"winapi",
"parking_lot_core",
]
[[package]]
@ -1947,9 +1922,9 @@ checksum = "dbf0c48bc1d91375ae5c3cd81e3722dff1abcf81a30960240640d223f59fe0e5"
[[package]]
name = "proc-macro2"
version = "1.0.39"
version = "1.0.40"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c54b25569025b7fc9651de43004ae593a75ad88543b17178aa5e1b9c4f15f56f"
checksum = "dd96a1e8ed2596c337f8eae5f24924ec83f5ad5ab21ea8e455d3566c69fbcaf7"
dependencies = [
"unicode-ident",
]
@ -2009,9 +1984,9 @@ checksum = "a1d01941d82fa2ab50be1e79e6714289dd7cde78eba4c074bc5a4374f650dfe0"
[[package]]
name = "quote"
version = "1.0.18"
version = "1.0.20"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a1feb54ed693b93a84e14094943b84b7c4eae204c512b7ccb95ab0c66d278ad1"
checksum = "3bcdf212e9776fbcb2d23ab029360416bb1706b1aea2d1a5ba002727cbcab804"
dependencies = [
"proc-macro2",
]
@ -2024,12 +1999,12 @@ checksum = "3fee2dce59f7a43418e3382c766554c614e06a552d53a8f07ef499ea4b332c0f"
[[package]]
name = "r2d2"
version = "0.8.9"
version = "0.8.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "545c5bc2b880973c9c10e4067418407a0ccaa3091781d1671d46eb35107cb26f"
checksum = "51de85fb3fb6524929c8a2eb85e6b6d363de4e8c48f9e2c2eac4944abc181c93"
dependencies = [
"log",
"parking_lot 0.11.2",
"parking_lot",
"scheduled-thread-pool",
]
@ -2257,7 +2232,7 @@ dependencies = [
"memchr",
"multer",
"num_cpus",
"parking_lot 0.12.1",
"parking_lot",
"pin-project-lite",
"rand",
"ref-cast",
@ -2267,7 +2242,7 @@ dependencies = [
"serde_json",
"state",
"tempfile",
"time 0.3.9",
"time 0.3.11",
"tokio",
"tokio-stream",
"tokio-util 0.7.3",
@ -2316,7 +2291,7 @@ dependencies = [
"smallvec",
"stable-pattern",
"state",
"time 0.3.9",
"time 0.3.11",
"tokio",
"tokio-rustls",
"uncased",
@ -2351,9 +2326,9 @@ dependencies = [
[[package]]
name = "rustversion"
version = "1.0.6"
version = "1.0.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f2cc38e8fa666e2de3c4aba7edeb5ffc5246c1c2ed0e3d17e560aeeba736b23f"
checksum = "a0a5f7c728f5d284929a1cccb5bc19884422bfe6ef4d6c409da2c41838983fcf"
[[package]]
name = "ryu"
@ -2386,7 +2361,7 @@ version = "0.2.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "977a7519bff143a44f842fd07e80ad1329295bd71686457f18e496736f4bf9bf"
dependencies = [
"parking_lot 0.12.1",
"parking_lot",
]
[[package]]
@ -2559,7 +2534,7 @@ dependencies = [
"num-bigint",
"num-traits",
"thiserror",
"time 0.3.9",
"time 0.3.11",
]
[[package]]
@ -2576,9 +2551,9 @@ checksum = "eb703cfe953bccee95685111adeedb76fabe4e97549a58d16f03ea7b9367bb32"
[[package]]
name = "smallvec"
version = "1.8.0"
version = "1.8.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f2dd574626839106c320a323308629dcb1acfc96e32a8cba364ddc61ac23ee83"
checksum = "cc88c725d61fc6c3132893370cac4a0200e3fedf5da8331c570664b1987f5ca2"
[[package]]
name = "socket2"
@ -2634,9 +2609,9 @@ checksum = "6bdef32e8150c2a081110b42772ffe7d7c9032b606bc226c8260fd97e0976601"
[[package]]
name = "syn"
version = "1.0.96"
version = "1.0.98"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0748dd251e24453cb8717f0354206b91557e4ec8703673a4b30208f2abaf1ebf"
checksum = "c50aef8a904de4c23c788f104b7dddc7d6f79c647c7c8ce4cc8f73eb0ca773dd"
dependencies = [
"proc-macro2",
"quote",
@ -2653,7 +2628,7 @@ dependencies = [
"hostname",
"libc",
"log",
"time 0.3.9",
"time 0.3.11",
]
[[package]]
@ -2720,9 +2695,9 @@ dependencies = [
[[package]]
name = "time"
version = "0.3.9"
version = "0.3.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c2702e08a7a860f005826c6815dcac101b19b5eb330c27fe4a5928fec1d20ddd"
checksum = "72c91f41dcb2f096c05f0873d667dceec1087ce5bcf984ec8ffb19acddbb3217"
dependencies = [
"itoa",
"libc",
@ -2763,7 +2738,7 @@ dependencies = [
"mio",
"num_cpus",
"once_cell",
"parking_lot 0.12.1",
"parking_lot",
"pin-project-lite",
"signal-hook-registry",
"socket2",
@ -2889,9 +2864,9 @@ dependencies = [
[[package]]
name = "tower-service"
version = "0.3.1"
version = "0.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "360dfd1d6d30e05fda32ace2c8c70e9c0a9da713275777f5a4dbb8a1893930c6"
checksum = "b6bc1c9ce2b5135ac7f93c72918fc37feb872bdc6a5533a8b85eb4b86bfdae52"
[[package]]
name = "tracing"
@ -2919,9 +2894,9 @@ dependencies = [
[[package]]
name = "tracing-core"
version = "0.1.27"
version = "0.1.28"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7709595b8878a4965ce5e87ebf880a7d39c9afc6837721b21a5a816a8117d921"
checksum = "7b7358be39f2f274f322d2aaed611acc57f382e8eb1e5b48cb9ae30933495ce7"
dependencies = [
"once_cell",
"valuable",
@ -2993,7 +2968,7 @@ dependencies = [
"lazy_static",
"log",
"lru-cache",
"parking_lot 0.12.1",
"parking_lot",
"resolv-conf",
"smallvec",
"thiserror",
@ -3071,9 +3046,9 @@ checksum = "5bd2fe26506023ed7b5e1e315add59d6f584c621d037f9368fea9cfb988f368c"
[[package]]
name = "unicode-normalization"
version = "0.1.19"
version = "0.1.20"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d54590932941a9e9266f0832deed84ebe1bf2e4c9e4a3554d393d18f5e854bf9"
checksum = "81dee68f85cab8cf68dec42158baf3a79a1cdc065a8b103025965d6ccb7f6cbd"
dependencies = [
"tinyvec",
]
@ -3190,7 +3165,7 @@ dependencies = [
"serde",
"serde_json",
"syslog",
"time 0.3.9",
"time 0.3.11",
"tokio",
"tokio-tungstenite",
"totp-lite",
@ -3248,9 +3223,9 @@ checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"
[[package]]
name = "wasm-bindgen"
version = "0.2.80"
version = "0.2.81"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "27370197c907c55e3f1a9fbe26f44e937fe6451368324e009cba39e139dc08ad"
checksum = "7c53b543413a17a202f4be280a7e5c62a1c69345f5de525ee64f8cfdbc954994"
dependencies = [
"cfg-if",
"wasm-bindgen-macro",
@ -3258,9 +3233,9 @@ dependencies = [
[[package]]
name = "wasm-bindgen-backend"
version = "0.2.80"
version = "0.2.81"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "53e04185bfa3a779273da532f5025e33398409573f348985af9a1cbf3774d3f4"
checksum = "5491a68ab4500fa6b4d726bd67408630c3dbe9c4fe7bda16d5c82a1fd8c7340a"
dependencies = [
"bumpalo",
"lazy_static",
@ -3273,9 +3248,9 @@ dependencies = [
[[package]]
name = "wasm-bindgen-futures"
version = "0.4.30"
version = "0.4.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6f741de44b75e14c35df886aff5f1eb73aa114fa5d4d00dcd37b5e01259bf3b2"
checksum = "de9a9cec1733468a8c657e57fa2413d2ae2c0129b95e87c5b72b8ace4d13f31f"
dependencies = [
"cfg-if",
"js-sys",
@ -3285,9 +3260,9 @@ dependencies = [
[[package]]
name = "wasm-bindgen-macro"
version = "0.2.80"
version = "0.2.81"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "17cae7ff784d7e83a2fe7611cfe766ecf034111b49deb850a3dc7699c08251f5"
checksum = "c441e177922bc58f1e12c022624b6216378e5febc2f0533e41ba443d505b80aa"
dependencies = [
"quote",
"wasm-bindgen-macro-support",
@ -3295,9 +3270,9 @@ dependencies = [
[[package]]
name = "wasm-bindgen-macro-support"
version = "0.2.80"
version = "0.2.81"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "99ec0dc7a4756fffc231aab1b9f2f578d23cd391390ab27f952ae0c9b3ece20b"
checksum = "7d94ac45fcf608c1f45ef53e748d35660f168490c10b23704c7779ab8f5c3048"
dependencies = [
"proc-macro2",
"quote",
@ -3308,15 +3283,15 @@ dependencies = [
[[package]]
name = "wasm-bindgen-shared"
version = "0.2.80"
version = "0.2.81"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d554b7f530dee5964d9a9468d95c1f8b8acae4f282807e7d27d4b03099a46744"
checksum = "6a89911bd99e5f3659ec4acf9c4d93b0a90fe4a2a11f15328472058edc5261be"
[[package]]
name = "web-sys"
version = "0.3.57"
version = "0.3.58"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7b17e741662c70c8bd24ac5c5b18de314a2c26c32bf8346ee1e6f53de919c283"
checksum = "2fed94beee57daf8dd7d51f2b15dc2bcde92d7a72304cdf662a4371008b71b90"
dependencies = [
"js-sys",
"wasm-bindgen",

View file

@ -84,7 +84,7 @@ uuid = { version = "1.1.2", features = ["v4"] }
# Date and time libraries
chrono = { version = "0.4.19", features = ["clock", "serde"], default-features = false }
chrono-tz = "0.6.1"
time = "0.3.9"
time = "0.3.11"
# Job scheduler
job_scheduler_ng = "2.0.1"
@ -93,7 +93,7 @@ job_scheduler_ng = "2.0.1"
data-encoding = "2.3.2"
# JWT library
jsonwebtoken = "8.1.0"
jsonwebtoken = "8.1.1"
# TOTP library
totp-lite = "2.0.0"
@ -118,7 +118,7 @@ handlebars = { version = "4.3.1", features = ["dir_source"] }
reqwest = { version = "0.11.11", features = ["stream", "json", "gzip", "brotli", "socks", "cookies", "trust-dns"] }
# For favicon extraction from main website
html5gum = "0.4.0"
html5gum = "0.5.2"
regex = { version = "1.5.6", features = ["std", "perf", "unicode-perl"], default-features = false }
data-url = "0.1.1"
bytes = "1.1.0"

View file

@ -19,7 +19,7 @@ use tokio::{
net::lookup_host,
};
use html5gum::{Emitter, EndTag, InfallibleTokenizer, Readable, StartTag, StringReader, Tokenizer};
use html5gum::{Emitter, EndTag, HtmlString, InfallibleTokenizer, Readable, StartTag, StringReader, Tokenizer};
use crate::{
error::Error,
@ -433,7 +433,7 @@ async fn get_favicons_node(
for token in dom {
match token {
FaviconToken::StartTag(tag) => {
if tag.name == TAG_LINK
if *tag.name == TAG_LINK
&& tag.attributes.contains_key(ATTR_REL)
&& tag.attributes.contains_key(ATTR_HREF)
{
@ -443,7 +443,7 @@ async fn get_favicons_node(
if rel_value.contains("icon") && !rel_value.contains("mask-icon") {
icon_tags.push(tag);
}
} else if tag.name == TAG_BASE && tag.attributes.contains_key(ATTR_HREF) {
} else if *tag.name == TAG_BASE && tag.attributes.contains_key(ATTR_HREF) {
let href = std::str::from_utf8(tag.attributes.get(ATTR_HREF).unwrap()).unwrap_or_default();
debug!("Found base href: {href}");
base_url = match base_url.join(href) {
@ -453,7 +453,7 @@ async fn get_favicons_node(
}
}
FaviconToken::EndTag(tag) => {
if tag.name == TAG_HEAD {
if *tag.name == TAG_HEAD {
break;
}
}
@ -830,17 +830,18 @@ impl reqwest::cookie::CookieStore for Jar {
/// Therefor parsing the HTML content is faster.
use std::collections::{BTreeSet, VecDeque};
#[derive(Debug)]
enum FaviconToken {
StartTag(StartTag),
EndTag(EndTag),
}
#[derive(Default)]
#[derive(Default, Debug)]
struct FaviconEmitter {
current_token: Option<FaviconToken>,
last_start_tag: Vec<u8>,
current_attribute: Option<(Vec<u8>, Vec<u8>)>,
seen_attributes: BTreeSet<Vec<u8>>,
last_start_tag: HtmlString,
current_attribute: Option<(HtmlString, HtmlString)>,
seen_attributes: BTreeSet<HtmlString>,
emitted_tokens: VecDeque<FaviconToken>,
}
@ -887,18 +888,38 @@ impl Emitter for FaviconEmitter {
self.seen_attributes.clear();
}
fn emit_current_tag(&mut self) {
fn emit_current_tag(&mut self) -> Option<html5gum::State> {
self.flush_current_attribute();
let mut token = self.current_token.take().unwrap();
let mut emit = false;
match token {
FaviconToken::EndTag(_) => {
FaviconToken::EndTag(ref mut tag) => {
// Always clean seen attributes
self.seen_attributes.clear();
// Only trigger an emit for the </head> tag.
// This is matched, and will break the for-loop.
if *tag.name == b"head" {
emit = true;
}
}
FaviconToken::StartTag(ref mut tag) => {
self.set_last_start_tag(Some(&tag.name));
// Only trriger an emit for <link> and <base> tags.
// These are the only tags we want to parse.
if *tag.name == b"link" || *tag.name == b"base" {
self.set_last_start_tag(Some(&tag.name));
emit = true;
} else {
self.set_last_start_tag(None);
}
}
}
self.emit_token(token);
// Only emit the tags we want to parse.
if emit {
self.emit_token(token);
}
None
}
fn push_tag_name(&mut self, s: &[u8]) {
@ -921,7 +942,7 @@ impl Emitter for FaviconEmitter {
fn init_attribute(&mut self) {
self.flush_current_attribute();
self.current_attribute = Some((Vec::new(), Vec::new()));
self.current_attribute = Some(Default::default());
}
fn push_attribute_name(&mut self, s: &[u8]) {