Remove soup and use a newer html5ever directly

This commit is contained in:
Daniel García 2021-02-07 22:28:02 +01:00
parent 8b660ae090
commit c836f88ff2
No known key found for this signature in database
GPG Key ID: FC8A7D14C3CD543A
3 changed files with 107 additions and 176 deletions

209
Cargo.lock generated
View File

@ -35,12 +35,6 @@ dependencies = [
"winapi 0.3.9",
]
[[package]]
name = "autocfg"
version = "0.1.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1d49d90015b3c36167a20fe2810c5cd875ad504b39cff3d4eae7977e6b7c1cb2"
[[package]]
name = "autocfg"
version = "1.0.1"
@ -134,11 +128,13 @@ dependencies = [
"dotenv",
"fern",
"handlebars",
"html5ever",
"idna 0.2.1",
"jsonwebtoken",
"lettre",
"libsqlite3-sys",
"log 0.4.14",
"markup5ever_rcdom",
"multipart",
"newline-converter",
"num-derive",
@ -159,7 +155,6 @@ dependencies = [
"rocket_contrib",
"serde",
"serde_json",
"soup",
"syslog",
"time 0.2.25",
"u2f",
@ -298,15 +293,6 @@ dependencies = [
"parse-zoneinfo",
]
[[package]]
name = "cloudabi"
version = "0.0.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ddfc5b9aa5d4507acaf872de71051dfd0e309860e88966e1051e462a077aac4f"
dependencies = [
"bitflags",
]
[[package]]
name = "const_fn"
version = "0.4.5"
@ -847,16 +833,16 @@ dependencies = [
[[package]]
name = "html5ever"
version = "0.22.5"
version = "0.25.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c213fa6a618dc1da552f54f85cba74b05d8e883c92ec4e89067736938084c26e"
checksum = "aafcf38a1a36118242d29b92e1b08ef84e67e4a5ed06e0a80be20e6a32bfed6b"
dependencies = [
"log 0.4.14",
"mac",
"markup5ever",
"proc-macro2 0.4.30",
"quote 0.6.13",
"syn 0.15.44",
"proc-macro2 1.0.24",
"quote 1.0.8",
"syn 1.0.60",
]
[[package]]
@ -1005,7 +991,7 @@ version = "1.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4fb1fa934250de4de8aef298d81c729a7d33d8c239daa3a7575e6b92bfc7313b"
dependencies = [
"autocfg 1.0.1",
"autocfg",
"hashbrown",
]
@ -1170,10 +1156,11 @@ checksum = "3e2e65a1a2e43cfcb47a895c4c8b10d1f4a61097f9f254f183aee60cad9c651d"
[[package]]
name = "markup5ever"
version = "0.7.5"
version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "897636f9850c3eef4905a5540683ed53dc9393860f0846cab2c2ddf9939862ff"
checksum = "aae38d669396ca9b707bfc3db254bc382ddb94f57cc5c235f34623a669a01dab"
dependencies = [
"log 0.4.14",
"phf",
"phf_codegen",
"serde",
@ -1184,6 +1171,18 @@ dependencies = [
"tendril",
]
[[package]]
name = "markup5ever_rcdom"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f015da43bcd8d4f144559a3423f4591d69b8ce0652c905374da7205df336ae2b"
dependencies = [
"html5ever",
"markup5ever",
"tendril",
"xml5ever",
]
[[package]]
name = "match_cfg"
version = "0.1.0"
@ -1261,7 +1260,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0f2d26ec3309788e423cfbf68ad1800f061638098d76a83681af979dc4eda19d"
dependencies = [
"adler",
"autocfg 1.0.1",
"autocfg",
]
[[package]]
@ -1431,7 +1430,7 @@ version = "0.2.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "090c7f9998ee0ff65aa5b723e4009f7b217707f1fb5ea551329cc4d6231fb304"
dependencies = [
"autocfg 1.0.1",
"autocfg",
"num-integer",
"num-traits",
]
@ -1453,7 +1452,7 @@ version = "0.1.44"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d2cc698a63b549a70bc047073d2949cce27cd1c7b0a4a862d08a8031bc2801db"
dependencies = [
"autocfg 1.0.1",
"autocfg",
"num-traits",
]
@ -1463,7 +1462,7 @@ version = "0.2.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9a64b1ec5cda2586e284722486d802acf1f7dbdc623e2bfc57e65ca1cd099290"
dependencies = [
"autocfg 1.0.1",
"autocfg",
]
[[package]]
@ -1548,7 +1547,7 @@ version = "0.9.60"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "921fc71883267538946025deffb622905ecad223c28efbfdef9bb59a0175f3e6"
dependencies = [
"autocfg 1.0.1",
"autocfg",
"cc",
"libc",
"openssl-src",
@ -1735,18 +1734,18 @@ dependencies = [
[[package]]
name = "phf"
version = "0.7.24"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b3da44b85f8e8dfaec21adae67f95d93244b2ecf6ad2a692320598dcc8e6dd18"
checksum = "3dfb61232e34fcb633f43d12c58f83c1df82962dcdfa565a4e866ffc17dafe12"
dependencies = [
"phf_shared",
]
[[package]]
name = "phf_codegen"
version = "0.7.24"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b03e85129e324ad4166b06b2c7491ae27fe3ec353af72e72cd1654c7225d517e"
checksum = "cbffee61585b0411840d3ece935cce9cb6321f01c45477d30066498cd5e1a815"
dependencies = [
"phf_generator",
"phf_shared",
@ -1754,19 +1753,19 @@ dependencies = [
[[package]]
name = "phf_generator"
version = "0.7.24"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "09364cc93c159b8b06b1f4dd8a4398984503483891b0c26b867cf431fb132662"
checksum = "17367f0cc86f2d25802b2c26ee58a7b23faeccf78a396094c13dced0d0182526"
dependencies = [
"phf_shared",
"rand 0.6.5",
"rand 0.7.3",
]
[[package]]
name = "phf_shared"
version = "0.7.24"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "234f71a15de2288bcb7e3b6515828d22af7ec8598ee6d24c3b526fa0a80b67a0"
checksum = "c00cf8b9eafe68dde5e9eaa2cef8ee84a9336a47d566ec55ca16589633b65af7"
dependencies = [
"siphasher",
]
@ -1952,25 +1951,6 @@ dependencies = [
"winapi 0.3.9",
]
[[package]]
name = "rand"
version = "0.6.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6d71dacdc3c88c1fde3885a3be3fbab9f35724e6ce99467f7d9c5026132184ca"
dependencies = [
"autocfg 0.1.7",
"libc",
"rand_chacha 0.1.1",
"rand_core 0.4.2",
"rand_hc 0.1.0",
"rand_isaac",
"rand_jitter",
"rand_os",
"rand_pcg",
"rand_xorshift",
"winapi 0.3.9",
]
[[package]]
name = "rand"
version = "0.7.3"
@ -1982,6 +1962,7 @@ dependencies = [
"rand_chacha 0.2.2",
"rand_core 0.5.1",
"rand_hc 0.2.0",
"rand_pcg",
]
[[package]]
@ -1996,16 +1977,6 @@ dependencies = [
"rand_hc 0.3.0",
]
[[package]]
name = "rand_chacha"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "556d3a1ca6600bfcbab7c7c91ccb085ac7fbbcd70e008a98742e7847f4f7bcef"
dependencies = [
"autocfg 0.1.7",
"rand_core 0.3.1",
]
[[package]]
name = "rand_chacha"
version = "0.2.2"
@ -2059,15 +2030,6 @@ dependencies = [
"getrandom 0.2.2",
]
[[package]]
name = "rand_hc"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7b40677c7be09ae76218dc623efbf7b18e34bced3f38883af07bb75630a21bc4"
dependencies = [
"rand_core 0.3.1",
]
[[package]]
name = "rand_hc"
version = "0.2.0"
@ -2086,57 +2048,13 @@ dependencies = [
"rand_core 0.6.1",
]
[[package]]
name = "rand_isaac"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ded997c9d5f13925be2a6fd7e66bf1872597f759fd9dd93513dd7e92e5a5ee08"
dependencies = [
"rand_core 0.3.1",
]
[[package]]
name = "rand_jitter"
version = "0.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1166d5c91dc97b88d1decc3285bb0a99ed84b05cfd0bc2341bdf2d43fc41e39b"
dependencies = [
"libc",
"rand_core 0.4.2",
"winapi 0.3.9",
]
[[package]]
name = "rand_os"
version = "0.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7b75f676a1e053fc562eafbb47838d67c84801e38fc1ba459e8f180deabd5071"
dependencies = [
"cloudabi",
"fuchsia-cprng",
"libc",
"rand_core 0.4.2",
"rdrand",
"winapi 0.3.9",
]
[[package]]
name = "rand_pcg"
version = "0.1.2"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "abf9b09b01790cfe0364f52bf32995ea3c39f4d2dd011eac241d2914146d0b44"
checksum = "16abd0c1b639e9eb4d7c50c0b8100b0d0f849be2349829c740fe8e6eb4816429"
dependencies = [
"autocfg 0.1.7",
"rand_core 0.4.2",
]
[[package]]
name = "rand_xorshift"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cbf7e9e623549b0e21f6e97cf8ecf247c1a8fd2e8a992ae265314300b2455d5c"
dependencies = [
"rand_core 0.3.1",
"rand_core 0.5.1",
]
[[package]]
@ -2564,9 +2482,9 @@ dependencies = [
[[package]]
name = "siphasher"
version = "0.2.3"
version = "0.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0b8de496cf83d4ed58b6be86c3a275b8602f6ffe98d3024a869e124147a9a3ac"
checksum = "fa8f3741c7372e75519bd9346068370c9cdaabcc1f9599cbcf2a2719352286b7"
[[package]]
name = "slab"
@ -2600,16 +2518,6 @@ dependencies = [
"winapi 0.3.9",
]
[[package]]
name = "soup"
version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ee42b8c117ede655c8ffe18dafcd239b23eb3bb7a2c71b1f01237587736f139f"
dependencies = [
"html5ever",
"regex",
]
[[package]]
name = "spin"
version = "0.5.2"
@ -2688,38 +2596,29 @@ checksum = "213701ba3370744dcd1a12960caa4843b3d68b4d1c0a5d575e0d65b2ee9d16c0"
[[package]]
name = "string_cache"
version = "0.7.5"
version = "0.8.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "89c058a82f9fd69b1becf8c274f412281038877c553182f1d02eb027045a2d67"
checksum = "8ddb1139b5353f96e429e1a5e19fbaf663bddedaa06d1dbd49f82e352601209a"
dependencies = [
"lazy_static",
"new_debug_unreachable",
"phf_shared",
"precomputed-hash",
"serde",
"string_cache_codegen",
"string_cache_shared",
]
[[package]]
name = "string_cache_codegen"
version = "0.4.4"
version = "0.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f0f45ed1b65bf9a4bf2f7b7dc59212d1926e9eaf00fa998988e420fd124467c6"
checksum = "f24c8e5e19d22a726626f1a5e16fe15b132dcf21d10177fa5a45ce7962996b97"
dependencies = [
"phf_generator",
"phf_shared",
"proc-macro2 1.0.24",
"quote 1.0.8",
"string_cache_shared",
]
[[package]]
name = "string_cache_shared"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b1884d1bc09741d466d9b14e6d37ac89d6909cbcac41dd9ae982d4d063bbedfc"
[[package]]
name = "subtle"
version = "2.4.0"
@ -2879,7 +2778,7 @@ version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e8190d04c665ea9e6b6a0dc45523ade572c088d2e6566244c1122671dbf4ae3a"
dependencies = [
"autocfg 1.0.1",
"autocfg",
"bytes 1.0.1",
"libc",
"memchr",
@ -3321,6 +3220,18 @@ version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "85e60b0d1b5f99db2556934e21937020776a5d31520bf169e851ac44e6420214"
[[package]]
name = "xml5ever"
version = "0.16.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0b1b52e6e8614d4a58b8e70cf51ec0cc21b256ad8206708bcff8139b5bbd6a59"
dependencies = [
"log 0.4.14",
"mac",
"markup5ever",
"time 0.1.44",
]
[[package]]
name = "yansi"
version = "0.5.0"

View File

@ -106,7 +106,8 @@ newline-converter = "0.1.0"
handlebars = { version = "3.5.2", features = ["dir_source"] }
# For favicon extraction from main website
soup = "0.5.0"
html5ever = "0.25.1"
markup5ever_rcdom = "0.1.0"
regex = { version = "1.4.3", features = ["std", "perf"], default-features = false }
data-url = "0.1.0"

View File

@ -11,7 +11,6 @@ use once_cell::sync::Lazy;
use regex::Regex;
use reqwest::{blocking::Client, blocking::Response, header, Url};
use rocket::{http::ContentType, http::Cookie, response::Content, Route};
use soup::prelude::*;
use crate::{error::Error, util::Cached, CONFIG};
@ -332,6 +331,42 @@ impl Icon {
}
}
/// Walks the DOM subtree rooted at `node` and appends an [`Icon`] to `icons`
/// for every `<link>` element that looks like a favicon reference.
///
/// A `<link>` element is accepted when it has a `rel` attribute matching
/// `ICON_REL_REGEX` and an `href` attribute that can be joined onto `url`.
/// The optional `sizes` attribute is forwarded to `get_icon_priority` to
/// rank the icon.
///
/// # Arguments
/// * `node`  - Root of the (sub)tree to scan.
/// * `icons` - Output list; matching icons are pushed in document order.
/// * `url`   - Base URL used to resolve relative `href` values.
///
/// The traversal uses an explicit work stack instead of recursion: the parsed
/// HTML comes from a remote site, and a document consisting of very deeply
/// nested elements would otherwise be able to overflow the thread's stack.
fn get_favicons_node(node: &std::rc::Rc<markup5ever_rcdom::Node>, icons: &mut Vec<Icon>, url: &Url) {
    // Nodes still to be visited; the last entry is visited next.
    let mut pending = vec![std::rc::Rc::clone(node)];

    while let Some(current) = pending.pop() {
        if let markup5ever_rcdom::NodeData::Element { name, attrs, .. } = &current.data {
            if name.local.as_ref() == "link" {
                let mut has_rel = false;
                let mut href = None;
                let mut sizes = None;

                let attrs = attrs.borrow();
                for attr in attrs.iter() {
                    let attr_name = attr.name.local.as_ref();
                    let attr_value = attr.value.as_ref();

                    if attr_name == "rel" && ICON_REL_REGEX.is_match(attr_value) {
                        has_rel = true;
                    } else if attr_name == "href" {
                        href = Some(attr_value);
                    } else if attr_name == "sizes" {
                        sizes = Some(attr_value);
                    }
                }

                // Only icon rels that actually carry a href are usable.
                if let (true, Some(href)) = (has_rel, href) {
                    // Hrefs that cannot be resolved against the base URL are skipped.
                    if let Ok(full_href) = url.join(href).map(|h| h.into_string()) {
                        let priority = get_icon_priority(&full_href, sizes);
                        icons.push(Icon::new(priority, full_href));
                    }
                }
            }
        }

        // Push the children in reverse so the first child is popped first,
        // which keeps the same pre-order (document order) as a recursive walk.
        pending.extend(current.children.borrow().iter().rev().cloned());
    }
}
struct IconUrlResult {
iconlist: Vec<Icon>,
cookies: String,
@ -431,30 +466,14 @@ fn get_icon_url(domain: &str) -> Result<IconUrlResult, Error> {
// 512KB should be more than enough for the HTML, though as we only really need
// the HTML header, it could potentially be reduced even further
let limited_reader = content.take(512 * 1024);
let mut limited_reader = content.take(512 * 1024);
let soup = Soup::from_reader(limited_reader)?;
// Search for and filter
let favicons = soup
.tag("link")
.attr("rel", ICON_REL_REGEX.clone()) // Only use icon rels
.attr_name("href") // Make sure there is a href
.find_all();
// Loop through all the found icons and determine their priority
for favicon in favicons {
let sizes = favicon.get("sizes");
let href = favicon.get("href").unwrap();
// Skip invalid URLs
let full_href = match url.join(&href) {
Ok(h) => h.into_string(),
_ => continue,
};
let priority = get_icon_priority(&full_href, sizes);
iconlist.push(Icon::new(priority, full_href))
}
use html5ever::tendril::TendrilSink;
let dom = html5ever::parse_document(markup5ever_rcdom::RcDom::default(), Default::default())
.from_utf8()
.read_from(&mut limited_reader)?;
get_favicons_node(&dom.document, &mut iconlist, &url);
} else {
// Add the default favicon.ico to the list with just the given domain
iconlist.push(Icon::new(35, format!("{}/favicon.ico", ssldomain)));
@ -506,7 +525,7 @@ fn get_page_with_cookies(url: &str, cookie_str: &str, referer: &str) -> Result<R
/// priority1 = get_icon_priority("http://example.com/path/to/a/favicon.png", "32x32");
/// priority2 = get_icon_priority("https://example.com/path/to/a/favicon.ico", "");
/// ```
fn get_icon_priority(href: &str, sizes: Option<String>) -> u8 {
fn get_icon_priority(href: &str, sizes: Option<&str>) -> u8 {
// Check if there is a dimension set
let (width, height) = parse_sizes(sizes);
@ -554,7 +573,7 @@ fn get_icon_priority(href: &str, sizes: Option<String>) -> u8 {
/// let (width, height) = parse_sizes("x128x128"); // (128, 128)
/// let (width, height) = parse_sizes("32"); // (0, 0)
/// ```
fn parse_sizes(sizes: Option<String>) -> (u16, u16) {
fn parse_sizes(sizes: Option<&str>) -> (u16, u16) {
let mut width: u16 = 0;
let mut height: u16 = 0;