Updated icon fetching and crates.

- Updated some crates
- Updated icon fetching code:
  + Use a cookie jar and set Max-Age to 2 minutes for all cookies
  + Locate the base href tag to fix some locations
  + Changed User-Agent (Helps on some sites to get HTML instead of JS)
  + Reduced HTML code limit from 512KB to 384KB
  + Allow some large icons higher up in the sort
  + Allow GIF images
  + Ignore cookie_store and hyper::client debug messages
This commit is contained in:
BlackDex 2021-05-16 15:29:13 +02:00
parent aba5b234af
commit f270f2ed65
4 changed files with 272 additions and 112 deletions

198
Cargo.lock generated
View File

@ -321,6 +321,49 @@ dependencies = [
"version_check 0.9.3",
]
[[package]]
name = "cookie"
version = "0.15.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ffdf8865bac3d9a3bde5bde9088ca431b11f5d37c7a578b8086af77248b76627"
dependencies = [
"percent-encoding 2.1.0",
"time 0.2.26",
"version_check 0.9.3",
]
[[package]]
name = "cookie_store"
version = "0.12.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3818dfca4b0cb5211a659bbcbb94225b7127407b2b135e650d717bfb78ab10d3"
dependencies = [
"cookie 0.14.4",
"idna 0.2.3",
"log 0.4.14",
"publicsuffix 1.5.6",
"serde",
"serde_json",
"time 0.2.26",
"url 2.2.2",
]
[[package]]
name = "cookie_store"
version = "0.15.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "55b4ac5559dd39f7bdc516f769cb412b151585d8886d216871a8435ed7f862cd"
dependencies = [
"cookie 0.15.0",
"idna 0.2.3",
"log 0.4.14",
"publicsuffix 2.1.0",
"serde",
"serde_json",
"time 0.2.26",
"url 2.2.2",
]
[[package]]
name = "core-foundation"
version = "0.9.1"
@ -339,9 +382,9 @@ checksum = "ea221b5284a47e40033bf9b66f35f984ec0ea2931eb03505246cd27a963f981b"
[[package]]
name = "cpufeatures"
version = "0.1.1"
version = "0.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dec1028182c380cc45a2e2c5ec841134f2dfd0f8f5f0a5bcd68004f81b5efdf4"
checksum = "ed00c67cb5d0a7d64a44f6ad2668db7e7530311dd53ea79bcd4fb022c64911c8"
dependencies = [
"libc",
]
@ -395,7 +438,7 @@ checksum = "3ee2393c4a91429dffb4bedf19f4d6abf27d8a732c8ce4980305d782e5426d57"
[[package]]
name = "data-url"
version = "0.1.0"
source = "git+https://github.com/servo/rust-url?rev=540ede02d0771824c0c80ff9f57fe8eff38b1291#540ede02d0771824c0c80ff9f57fe8eff38b1291"
source = "git+https://github.com/servo/rust-url?rev=eb7330b5296c0d43816d1346211b74182bb4ae37#eb7330b5296c0d43816d1346211b74182bb4ae37"
dependencies = [
"matches",
]
@ -648,9 +691,9 @@ dependencies = [
[[package]]
name = "futures"
version = "0.3.14"
version = "0.3.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a9d5813545e459ad3ca1bff9915e9ad7f1a47dc6a91b627ce321d5863b7dd253"
checksum = "0e7e43a803dae2fa37c1f6a8fe121e1f7bf9548b4dfc0522a42f34145dadfc27"
dependencies = [
"futures-channel",
"futures-core",
@ -663,9 +706,9 @@ dependencies = [
[[package]]
name = "futures-channel"
version = "0.3.14"
version = "0.3.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ce79c6a52a299137a6013061e0cf0e688fce5d7f1bc60125f520912fdb29ec25"
checksum = "e682a68b29a882df0545c143dc3646daefe80ba479bcdede94d5a703de2871e2"
dependencies = [
"futures-core",
"futures-sink",
@ -673,15 +716,15 @@ dependencies = [
[[package]]
name = "futures-core"
version = "0.3.14"
version = "0.3.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "098cd1c6dda6ca01650f1a37a794245eb73181d0d4d4e955e2f3c37db7af1815"
checksum = "0402f765d8a89a26043b889b26ce3c4679d268fa6bb22cd7c6aad98340e179d1"
[[package]]
name = "futures-executor"
version = "0.3.14"
version = "0.3.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "10f6cb7042eda00f0049b1d2080aa4b93442997ee507eb3828e8bd7577f94c9d"
checksum = "badaa6a909fac9e7236d0620a2f57f7664640c56575b71a7552fbd68deafab79"
dependencies = [
"futures-core",
"futures-task",
@ -690,16 +733,17 @@ dependencies = [
[[package]]
name = "futures-io"
version = "0.3.14"
version = "0.3.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "365a1a1fb30ea1c03a830fdb2158f5236833ac81fa0ad12fe35b29cddc35cb04"
checksum = "acc499defb3b348f8d8f3f66415835a9131856ff7714bf10dadfc4ec4bdb29a1"
[[package]]
name = "futures-macro"
version = "0.3.14"
version = "0.3.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "668c6733a182cd7deb4f1de7ba3bf2120823835b3bcfbeacf7d2c4a773c1bb8b"
checksum = "a4c40298486cdf52cc00cd6d6987892ba502c7656a16a4192a9992b1ccedd121"
dependencies = [
"autocfg",
"proc-macro-hack",
"proc-macro2 1.0.26",
"quote 1.0.9",
@ -708,22 +752,23 @@ dependencies = [
[[package]]
name = "futures-sink"
version = "0.3.14"
version = "0.3.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5c5629433c555de3d82861a7a4e3794a4c40040390907cfbfd7143a92a426c23"
checksum = "a57bead0ceff0d6dde8f465ecd96c9338121bb7717d3e7b108059531870c4282"
[[package]]
name = "futures-task"
version = "0.3.14"
version = "0.3.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ba7aa51095076f3ba6d9a1f702f74bd05ec65f555d70d2033d55ba8d69f581bc"
checksum = "8a16bef9fc1a4dddb5bee51c989e3fbba26569cbb0e31f5b303c184e3dd33dae"
[[package]]
name = "futures-util"
version = "0.3.14"
version = "0.3.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3c144ad54d60f23927f0a6b6d816e4271278b64f005ad65e4e35291d2de9c025"
checksum = "feb5c238d27e2bf94ffdfd27b2c29e3df4a68c4193bb6427384259e2bf191967"
dependencies = [
"autocfg",
"futures-channel",
"futures-core",
"futures-io",
@ -841,6 +886,12 @@ version = "0.9.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d7afe4a420e3fe79967a00898cc1f4db7c8a49a9333a29f8a4bd76a253d5cd04"
[[package]]
name = "hashbrown"
version = "0.11.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ab5ef0d4909ef3724cc8cce6ccc8572c5c817592e9285f5464f8e86f8bd3726e"
[[package]]
name = "hermit-abi"
version = "0.1.18"
@ -920,9 +971,9 @@ dependencies = [
[[package]]
name = "httparse"
version = "1.4.0"
version = "1.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4a1ce40d6fc9764887c2fdc7305c3dcc429ba11ff981c1509416afd5697e4437"
checksum = "f3a87b616e37e93c22fb19bcd386f02f3af5ea98a25670ad0fce773de23c5e68"
[[package]]
name = "httpdate"
@ -1027,7 +1078,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "824845a0bf897a9042383849b02c1bc219c2383772efcd5c6f9766fa4b81aef3"
dependencies = [
"autocfg",
"hashbrown",
"hashbrown 0.9.1",
]
[[package]]
@ -1072,9 +1123,9 @@ dependencies = [
[[package]]
name = "js-sys"
version = "0.3.50"
version = "0.3.51"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2d99f9e3e84b8f67f846ef5b4cbbc3b1c29f6c759fcbce6f01aa0e73d932a24c"
checksum = "83bdfbace3a0e81a4253f73b49e960b053e396a11012cbd49b9b74d6a2b67062"
dependencies = [
"wasm-bindgen",
]
@ -1123,9 +1174,9 @@ checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55"
[[package]]
name = "lettre"
version = "0.10.0-beta.4"
version = "0.10.0-rc.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2b7fd0c394e97e38d87bd2dfdf91983ab406b044a0bfd4e5b5c82bdfa0324526"
checksum = "4be4ff7e8bcb0e0c6902815554a286889b0e99b4ea6e898afb7b9f53174b1929"
dependencies = [
"base64 0.13.0",
"fastrand",
@ -1572,9 +1623,9 @@ dependencies = [
[[package]]
name = "openssl-probe"
version = "0.1.2"
version = "0.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "77af24da69f9d9341038eba93a073b1fdaaa1b788221b00a69bce9e762cb32de"
checksum = "28988d872ab76095a6e6ac88d99b54fd267702734fd7ffe610ca27f533ddb95a"
[[package]]
name = "openssl-src"
@ -1909,6 +1960,34 @@ dependencies = [
"unicode-xid 0.2.2",
]
[[package]]
name = "psl-types"
version = "2.0.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "66b398073e7cdd6f05934389a8f5961e3aabfa66675b6f440df4e2c793d51a4f"
[[package]]
name = "publicsuffix"
version = "1.5.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "95b4ce31ff0a27d93c8de1849cf58162283752f065a90d508f1105fa6c9a213f"
dependencies = [
"idna 0.2.3",
"url 2.2.2",
]
[[package]]
name = "publicsuffix"
version = "2.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c3ac055aef7cc7a1caefbc65144be879e862467dcd9b8a8d57b64a13e7dce15d"
dependencies = [
"byteorder",
"hashbrown 0.11.2",
"idna 0.2.3",
"psl-types",
]
[[package]]
name = "quick-error"
version = "1.2.3"
@ -2134,6 +2213,8 @@ dependencies = [
"async-compression",
"base64 0.13.0",
"bytes 1.0.1",
"cookie 0.14.4",
"cookie_store 0.12.0",
"encoding_rs",
"futures-core",
"futures-util",
@ -2152,6 +2233,7 @@ dependencies = [
"serde",
"serde_json",
"serde_urlencoded",
"time 0.2.26",
"tokio",
"tokio-native-tls",
"tokio-socks",
@ -2248,7 +2330,7 @@ name = "rocket_http"
version = "0.5.0-dev"
source = "git+https://github.com/SergioBenitez/Rocket?rev=263e39b5b429de1913ce7e3036575a7b4d88b6d7#263e39b5b429de1913ce7e3036575a7b4d88b6d7"
dependencies = [
"cookie",
"cookie 0.14.4",
"hyper 0.10.16",
"hyper-sync-rustls",
"indexmap",
@ -2391,18 +2473,18 @@ checksum = "388a1df253eca08550bef6c72392cfe7c30914bf41df5269b68cbd6ff8f570a3"
[[package]]
name = "serde"
version = "1.0.125"
version = "1.0.126"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "558dc50e1a5a5fa7112ca2ce4effcb321b0300c0d4ccf0776a9f60cd89031171"
checksum = "ec7505abeacaec74ae4778d9d9328fe5a5d04253220a85c4ee022239fc996d03"
dependencies = [
"serde_derive",
]
[[package]]
name = "serde_derive"
version = "1.0.125"
version = "1.0.126"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b093b7a2bb58203b5da3056c05b4ec1fed827dcfdb37347a8841695263b3d06d"
checksum = "963a7dbc9895aeac7ac90e74f34a5d5261828f79df35cbed41e10189d3804d43"
dependencies = [
"proc-macro2 1.0.26",
"quote 1.0.9",
@ -2459,9 +2541,9 @@ dependencies = [
[[package]]
name = "sha-1"
version = "0.9.5"
version = "0.9.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b659df5fc3ce22274daac600ffb845300bd2125bcfaec047823075afdab81c00"
checksum = "8c4cfa741c5832d0ef7fab46cabed29c2aae926db0b11bb2069edd8db5e64e16"
dependencies = [
"block-buffer 0.9.0",
"cfg-if 1.0.0",
@ -2804,9 +2886,9 @@ checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c"
[[package]]
name = "tokio"
version = "1.5.0"
version = "1.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "83f0c8e7c0addab50b663055baf787d0af7f413a46e6e7fb9559a4e4db7137a5"
checksum = "bd3076b5c8cc18138b8f8814895c11eb4de37114a5d127bafdc5e55798ceef37"
dependencies = [
"autocfg",
"bytes 1.0.1",
@ -2841,9 +2923,9 @@ dependencies = [
[[package]]
name = "tokio-util"
version = "0.6.6"
version = "0.6.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "940a12c99365c31ea8dd9ba04ec1be183ffe4920102bb7122c2f515437601e8e"
checksum = "1caa0b0c8d94a049db56b5acf8cba99dc0623aab1b26d5b5f5e2d945846b3592"
dependencies = [
"bytes 1.0.1",
"futures-core",
@ -3054,9 +3136,12 @@ name = "vaultwarden"
version = "1.0.0"
dependencies = [
"backtrace",
"bytes 1.0.1",
"chashmap",
"chrono",
"chrono-tz",
"cookie 0.15.0",
"cookie_store 0.15.0",
"data-encoding",
"data-url",
"diesel",
@ -3095,6 +3180,7 @@ dependencies = [
"time 0.2.26",
"tracing",
"u2f",
"url 2.2.2",
"uuid",
"yubico",
]
@ -3152,9 +3238,9 @@ checksum = "1a143597ca7c7793eff794def352d41792a93c481eb1042423ff7ff72ba2c31f"
[[package]]
name = "wasm-bindgen"
version = "0.2.73"
version = "0.2.74"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "83240549659d187488f91f33c0f8547cbfef0b2088bc470c116d1d260ef623d9"
checksum = "d54ee1d4ed486f78874278e63e4069fc1ab9f6a18ca492076ffb90c5eb2997fd"
dependencies = [
"cfg-if 1.0.0",
"serde",
@ -3164,9 +3250,9 @@ dependencies = [
[[package]]
name = "wasm-bindgen-backend"
version = "0.2.73"
version = "0.2.74"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ae70622411ca953215ca6d06d3ebeb1e915f0f6613e3b495122878d7ebec7dae"
checksum = "3b33f6a0694ccfea53d94db8b2ed1c3a8a4c86dd936b13b9f0a15ec4a451b900"
dependencies = [
"bumpalo",
"lazy_static",
@ -3179,9 +3265,9 @@ dependencies = [
[[package]]
name = "wasm-bindgen-futures"
version = "0.4.23"
version = "0.4.24"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "81b8b767af23de6ac18bf2168b690bed2902743ddf0fb39252e36f9e2bfc63ea"
checksum = "5fba7978c679d53ce2d0ac80c8c175840feb849a161664365d1287b41f2e67f1"
dependencies = [
"cfg-if 1.0.0",
"js-sys",
@ -3191,9 +3277,9 @@ dependencies = [
[[package]]
name = "wasm-bindgen-macro"
version = "0.2.73"
version = "0.2.74"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3e734d91443f177bfdb41969de821e15c516931c3c3db3d318fa1b68975d0f6f"
checksum = "088169ca61430fe1e58b8096c24975251700e7b1f6fd91cc9d59b04fb9b18bd4"
dependencies = [
"quote 1.0.9",
"wasm-bindgen-macro-support",
@ -3201,9 +3287,9 @@ dependencies = [
[[package]]
name = "wasm-bindgen-macro-support"
version = "0.2.73"
version = "0.2.74"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d53739ff08c8a68b0fdbcd54c372b8ab800b1449ab3c9d706503bc7dd1621b2c"
checksum = "be2241542ff3d9f241f5e2cb6dd09b37efe786df8851c54957683a49f0987a97"
dependencies = [
"proc-macro2 1.0.26",
"quote 1.0.9",
@ -3214,15 +3300,15 @@ dependencies = [
[[package]]
name = "wasm-bindgen-shared"
version = "0.2.73"
version = "0.2.74"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d9a543ae66aa233d14bb765ed9af4a33e81b8b58d1584cf1b47ff8cd0b9e4489"
checksum = "d7cff876b8f18eed75a66cf49b65e7f967cb354a7aa16003fb55dbfd25b44b4f"
[[package]]
name = "web-sys"
version = "0.3.50"
version = "0.3.51"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a905d57e488fec8861446d3393670fb50d27a262344013181c2cdf9fff5481be"
checksum = "e828417b379f3df7111d3a2a9e5753706cae29c41f7c4029ee9fd77f3e09e582"
dependencies = [
"js-sys",
"wasm-bindgen",
@ -3345,7 +3431,7 @@ dependencies = [
"hmac 0.10.1",
"rand 0.8.3",
"reqwest",
"sha-1 0.9.5",
"sha-1 0.9.6",
"threadpool",
"url 1.7.2",
]

View File

@ -32,7 +32,13 @@ rocket = { version = "0.5.0-dev", features = ["tls"], default-features = false }
rocket_contrib = "0.5.0-dev"
# HTTP client
reqwest = { version = "0.11.3", features = ["blocking", "json", "gzip", "brotli", "socks"] }
reqwest = { version = "0.11.3", features = ["blocking", "json", "gzip", "brotli", "socks", "cookies"] }
# Used for custom short lived cookie jar
cookie = "0.15.0"
cookie_store = "0.15.0"
bytes = "1.0.1"
url = "2.2.2"
# multipart/form-data support
multipart = { version = "0.17.1", features = ["server"], default-features = false }
@ -47,7 +53,7 @@ rmpv = "0.4.7"
chashmap = "2.2.2"
# A generic serialization/deserialization framework
serde = { version = "1.0.125", features = ["derive"] }
serde = { version = "1.0.126", features = ["derive"] }
serde_json = "1.0.64"
# Logging
@ -103,7 +109,7 @@ num-derive = "0.3.3"
# Email libraries
tracing = { version = "0.1.26", features = ["log"] } # Needed to have lettre trace logging used when SMTP_DEBUG is enabled.
lettre = { version = "0.10.0-beta.4", features = ["smtp-transport", "builder", "serde", "native-tls", "hostname", "tracing"], default-features = false }
lettre = { version = "0.10.0-rc.1", features = ["smtp-transport", "builder", "serde", "native-tls", "hostname", "tracing"], default-features = false }
# Template library
handlebars = { version = "3.5.5", features = ["dir_source"] }
@ -137,7 +143,7 @@ rocket = { git = 'https://github.com/SergioBenitez/Rocket', rev = '263e39b5b429d
rocket_contrib = { git = 'https://github.com/SergioBenitez/Rocket', rev = '263e39b5b429de1913ce7e3036575a7b4d88b6d7' }
# For favicon extraction from main website
data-url = { git = 'https://github.com/servo/rust-url', package="data-url", rev = '540ede02d0771824c0c80ff9f57fe8eff38b1291' }
data-url = { git = 'https://github.com/servo/rust-url', package="data-url", rev = 'eb7330b5296c0d43816d1346211b74182bb4ae37' }
# The maintainer of the `job_scheduler` crate doesn't seem to have responded
# to any issues or PRs for almost a year (as of April 2021). This hopefully

View File

@ -3,14 +3,14 @@ use std::{
fs::{create_dir_all, remove_file, symlink_metadata, File},
io::prelude::*,
net::{IpAddr, ToSocketAddrs},
sync::RwLock,
sync::{Arc, RwLock},
time::{Duration, SystemTime},
};
use once_cell::sync::Lazy;
use regex::Regex;
use reqwest::{blocking::Client, blocking::Response, header, Url};
use rocket::{http::ContentType, http::Cookie, response::Content, Route};
use reqwest::{blocking::Client, blocking::Response, header};
use rocket::{http::ContentType, response::Content, Route};
use crate::{
error::Error,
@ -25,19 +25,17 @@ pub fn routes() -> Vec<Route> {
static CLIENT: Lazy<Client> = Lazy::new(|| {
// Generate the default headers
let mut default_headers = header::HeaderMap::new();
default_headers.insert(header::USER_AGENT, header::HeaderValue::from_static("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1.1 Safari/605.1.15"));
default_headers.insert(header::ACCEPT_LANGUAGE, header::HeaderValue::from_static("en-US,en;q=0.8"));
default_headers
.insert(header::USER_AGENT, header::HeaderValue::from_static("Links (2.22; Linux X86_64; GNU C; text)"));
default_headers
.insert(header::ACCEPT, header::HeaderValue::from_static("text/html, text/*;q=0.5, image/*, */*;q=0.1"));
default_headers.insert(header::ACCEPT_LANGUAGE, header::HeaderValue::from_static("en,*;q=0.1"));
default_headers.insert(header::CACHE_CONTROL, header::HeaderValue::from_static("no-cache"));
default_headers.insert(header::PRAGMA, header::HeaderValue::from_static("no-cache"));
default_headers.insert(
header::ACCEPT,
header::HeaderValue::from_static(
"text/html,application/xhtml+xml,application/xml; q=0.9,image/webp,image/apng,*/*;q=0.8",
),
);
// Reuse the client between requests
get_reqwest_client_builder()
.cookie_provider(Arc::new(Jar::default()))
.timeout(Duration::from_secs(CONFIG.icon_download_timeout()))
.default_headers(default_headers)
.build()
@ -80,7 +78,7 @@ fn is_valid_domain(domain: &str) -> bool {
const ALLOWED_CHARS: &str = "_-.";
// If parsing the domain fails using Url, it will not work with reqwest.
if let Err(parse_error) = Url::parse(format!("https://{}", domain).as_str()) {
if let Err(parse_error) = url::Url::parse(format!("https://{}", domain).as_str()) {
debug!("Domain parse error: '{}' - {:?}", domain, parse_error);
return false;
} else if domain.is_empty()
@ -360,7 +358,51 @@ impl Icon {
}
}
fn get_favicons_node(node: &std::rc::Rc<markup5ever_rcdom::Node>, icons: &mut Vec<Icon>, url: &Url) {
/// Searches the HTML document for the first `<base>` element and applies its `href` to `base_href`.
///
/// The tree is walked in document order with an explicit stack instead of recursion, so the
/// nesting depth of the parsed HTML can not exhaust the call stack.
/// The search stops at the first `<base>` element, whether or not it carries an `href` attribute.
///
/// # Arguments
/// * `node` - A Parsed HTML document via html5ever::parse_document()
/// * `base_href` - a mutable url::Url which will be overwritten when a base href tag has been found.
///   It is left untouched when the href value can not be joined onto it.
///
/// # Returns
/// `true` when a `<base>` element was found, `false` when the whole tree was searched without finding one.
fn get_base_href(node: &std::rc::Rc<markup5ever_rcdom::Node>, base_href: &mut url::Url) -> bool {
    // Nodes still to visit; the top of the stack is the next node in document order.
    let mut pending = vec![std::rc::Rc::clone(node)];

    while let Some(current) = pending.pop() {
        if let markup5ever_rcdom::NodeData::Element {
            name,
            attrs,
            ..
        } = &current.data
        {
            if name.local.as_ref() == "base" {
                let attrs = attrs.borrow();
                if let Some(attr) = attrs.iter().find(|attr| attr.name.local.as_ref() == "href") {
                    let attr_value = attr.value.as_ref();
                    debug!("Found base href: {}", attr_value);
                    // Only overwrite on success, an invalid href keeps the current base url.
                    if let Ok(href) = base_href.join(attr_value) {
                        *base_href = href;
                    }
                }
                // A <base> tag was found, no need to process the rest of the html.
                return true;
            }
        }

        // Push the children in reverse so the first child is popped (and thus visited) first.
        pending.extend(current.children.borrow().iter().rev().cloned());
    }

    false
}
fn get_favicons_node(node: &std::rc::Rc<markup5ever_rcdom::Node>, icons: &mut Vec<Icon>, url: &url::Url) {
if let markup5ever_rcdom::NodeData::Element {
name,
attrs,
@ -406,12 +448,11 @@ fn get_favicons_node(node: &std::rc::Rc<markup5ever_rcdom::Node>, icons: &mut Ve
struct IconUrlResult {
iconlist: Vec<Icon>,
cookies: String,
referer: String,
}
/// Returns a Result/Tuple which holds a Vector IconList and a string which holds the cookies from the last response.
/// There will always be a result with a string which will contain https://example.com/favicon.ico and an empty string for the cookies.
/// Returns an IconUrlResult which holds a Vector IconList and a string which holds the referer.
/// There will always be two items within the iconlist which hold http(s)://domain.tld/favicon.ico.
/// This does not mean that the location exists, but it is the default location browsers use.
///
/// # Argument
@ -419,8 +460,8 @@ struct IconUrlResult {
///
/// # Example
/// ```
/// let (mut iconlist, cookie_str) = get_icon_url("github.com")?;
/// let (mut iconlist, cookie_str) = get_icon_url("gitlab.com")?;
/// let icon_result = get_icon_url("github.com")?;
/// let icon_result = get_icon_url("vaultwarden.discourse.group")?;
/// ```
fn get_icon_url(domain: &str) -> Result<IconUrlResult, Error> {
// Default URL with secure and insecure schemes
@ -468,32 +509,12 @@ fn get_icon_url(domain: &str) -> Result<IconUrlResult, Error> {
// Create the iconlist
let mut iconlist: Vec<Icon> = Vec::new();
// Create the cookie_str to fill it all the cookies from the response
// These cookies can be used to request/download the favicon image.
// Some sites have extra security in place with for example XSRF Tokens.
let mut cookie_str = "".to_string();
let mut referer = "".to_string();
let mut referer = String::from("");
if let Ok(content) = resp {
// Extract the URL from the response in case redirects occurred (like @ gitlab.com)
let url = content.url().clone();
// Get all the cookies and pass it on to the next function.
// Needed for XSRF Cookies for example (like @ mijn.ing.nl)
let raw_cookies = content.headers().get_all("set-cookie");
cookie_str = raw_cookies
.iter()
.filter_map(|raw_cookie| raw_cookie.to_str().ok())
.map(|cookie_str| {
if let Ok(cookie) = Cookie::parse(cookie_str) {
format!("{}={}; ", cookie.name(), cookie.value())
} else {
String::new()
}
})
.collect::<String>();
// Set the referer to be used on the final request, some sites check this.
// Mostly used to prevent direct linking and other security reasons.
referer = url.as_str().to_string();
@ -501,16 +522,17 @@ fn get_icon_url(domain: &str) -> Result<IconUrlResult, Error> {
// Add the default favicon.ico to the list with the domain the content responded from.
iconlist.push(Icon::new(35, String::from(url.join("/favicon.ico").unwrap())));
// 512KB should be more than enough for the HTML, though as we only really need
// the HTML header, it could potentially be reduced even further
let mut limited_reader = content.take(512 * 1024);
// 384KB should be more than enough for the HTML, as we only really need the HTML header.
let mut limited_reader = content.take(384 * 1024);
use html5ever::tendril::TendrilSink;
let dom = html5ever::parse_document(markup5ever_rcdom::RcDom::default(), Default::default())
.from_utf8()
.read_from(&mut limited_reader)?;
get_favicons_node(&dom.document, &mut iconlist, &url);
let mut base_url: url::Url = url;
get_base_href(&dom.document, &mut base_url);
get_favicons_node(&dom.document, &mut iconlist, &base_url);
} else {
// Add the default favicon.ico to the list with just the given domain
iconlist.push(Icon::new(35, format!("{}/favicon.ico", ssldomain)));
@ -523,24 +545,20 @@ fn get_icon_url(domain: &str) -> Result<IconUrlResult, Error> {
// There always is an icon in the list, so no need to check if it exists, and just return the first one
Ok(IconUrlResult {
iconlist,
cookies: cookie_str,
referer,
})
}
fn get_page(url: &str) -> Result<Response, Error> {
get_page_with_cookies(url, "", "")
get_page_with_referer(url, "")
}
fn get_page_with_cookies(url: &str, cookie_str: &str, referer: &str) -> Result<Response, Error> {
if is_domain_blacklisted(Url::parse(url).unwrap().host_str().unwrap_or_default()) {
fn get_page_with_referer(url: &str, referer: &str) -> Result<Response, Error> {
if is_domain_blacklisted(url::Url::parse(url).unwrap().host_str().unwrap_or_default()) {
err!("Favicon rel linked to a blacklisted domain!");
}
let mut client = CLIENT.get(url);
if !cookie_str.is_empty() {
client = client.header("Cookie", cookie_str)
}
if !referer.is_empty() {
client = client.header("Referer", referer)
}
@ -573,7 +591,7 @@ fn get_icon_priority(href: &str, sizes: Option<&str>) -> u8 {
1
} else if width == 64 {
2
} else if (24..=128).contains(&width) {
} else if (24..=192).contains(&width) {
3
} else if width == 16 {
4
@ -661,7 +679,7 @@ fn download_icon(domain: &str) -> Result<(Vec<u8>, Option<&str>), Error> {
_ => warn!("Extracted icon from data:image uri is invalid"),
};
} else {
match get_page_with_cookies(&icon.href, &icon_result.cookies, &icon_result.referer) {
match get_page_with_referer(&icon.href, &icon_result.referer) {
Ok(mut res) => {
res.copy_to(&mut buffer)?;
// Check if the icon type is allowed, else try an icon from the list.
@ -706,7 +724,54 @@ fn get_icon_type(bytes: &[u8]) -> Option<&'static str> {
[0, 0, 1, 0, ..] => Some("x-icon"),
[82, 73, 70, 70, ..] => Some("webp"),
[255, 216, 255, ..] => Some("jpeg"),
[71, 73, 70, 56, ..] => Some("gif"),
[66, 77, ..] => Some("bmp"),
_ => None,
}
}
/// This is an implementation of the default Cookie Jar from Reqwest and reqwest_cookie_store built by pfernie.
/// The default cookie jar used by Reqwest keeps all the cookies based upon the Max-Age or Expires which could be a long time.
/// That could be used for tracking, to prevent this we force the lifespan of the cookies to always be max two minutes.
/// A Cookie Jar is needed because some sites force a redirect with cookies to verify if a request uses cookies or not.
use cookie_store::CookieStore;
/// Cookie jar holding a `cookie_store::CookieStore` behind an `RwLock`.
/// `Default` creates the jar from the default value of the wrapped store.
#[derive(Default)]
pub struct Jar(RwLock<CookieStore>);
impl reqwest::cookie::CookieStore for Jar {
    /// Stores the cookies from the given `Set-Cookie` header values for `url`.
    /// Headers which are not valid UTF-8 or can not be parsed as a cookie are skipped.
    /// Every stored cookie gets its Expires removed and its Max-Age set to two minutes.
    fn set_cookies(&self, cookie_headers: &mut dyn Iterator<Item = &header::HeaderValue>, url: &url::Url) {
        use cookie::Cookie as RawCookie;
        use time::Duration;

        let mut store = self.0.write().unwrap();
        let short_lived = cookie_headers.filter_map(|header_value| {
            let raw = std::str::from_utf8(header_value.as_bytes()).ok()?;
            let mut parsed = RawCookie::parse(raw).ok()?;
            // Override whatever lifetime the site asked for.
            parsed.set_expires(None);
            parsed.set_max_age(Some(Duration::minutes(2)));
            Some(parsed.into_owned())
        });
        store.store_response_cookies(short_lived, url);
    }

    /// Builds the `Cookie` request header value for `url` as `name=value` pairs joined by `; `.
    /// Returns `None` when there is nothing to send or the result is not a valid header value.
    fn cookies(&self, url: &url::Url) -> Option<header::HeaderValue> {
        use bytes::Bytes;

        let store = self.0.read().unwrap();
        let mut joined = String::new();
        for (name, value) in store.get_request_values(url) {
            // Every pair contains at least the '=', so a non-empty buffer means a pair was already added.
            if !joined.is_empty() {
                joined.push_str("; ");
            }
            joined.push_str(&format!("{}={}", name, value));
        }

        if joined.is_empty() {
            None
        } else {
            header::HeaderValue::from_maybe_shared(Bytes::from(joined)).ok()
        }
    }
}

View File

@ -122,6 +122,9 @@ fn init_logging(level: log::LevelFilter) -> Result<(), fern::InitError> {
// Never show html5ever and hyper::proto logs, too noisy
.level_for("html5ever", log::LevelFilter::Off)
.level_for("hyper::proto", log::LevelFilter::Off)
.level_for("hyper::client", log::LevelFilter::Off)
// Prevent cookie_store logs
.level_for("cookie_store", log::LevelFilter::Off)
.chain(std::io::stdout());
// Enable smtp debug logging only specifically for smtp when needed.