mirror of
https://github.com/LemmyNet/lemmy
synced 2024-11-10 06:54:12 +00:00
Adding clearurls crate to clean tracking params from links and markdown. (#5018)
* Adding clearurls crate to clean tracking params from links and markdown.
  - Thanks to @jenrdikw for creating this
  - Fixes #4905
* Upgrading to new version of clearurls
* Fix clippy
This commit is contained in:
parent
ff939e04fd
commit
5febf2b8fb
5 changed files with 59 additions and 23 deletions
15
Cargo.lock
generated
15
Cargo.lock
generated
|
@ -877,6 +877,20 @@ version = "0.7.2"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "1462739cb27611015575c0c11df5df7601141071f07518d56fcc1be504cbec97"
|
checksum = "1462739cb27611015575c0c11df5df7601141071f07518d56fcc1be504cbec97"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "clearurls"
|
||||||
|
version = "0.0.4"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "e291c00af89ac0a5b400d9ba46a682e38015ae3cd8926dbbe85b3b864d550be3"
|
||||||
|
dependencies = [
|
||||||
|
"linkify",
|
||||||
|
"percent-encoding",
|
||||||
|
"regex",
|
||||||
|
"serde",
|
||||||
|
"serde_json",
|
||||||
|
"url",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "clokwerk"
|
name = "clokwerk"
|
||||||
version = "0.4.0"
|
version = "0.4.0"
|
||||||
|
@ -2781,6 +2795,7 @@ dependencies = [
|
||||||
"actix-web",
|
"actix-web",
|
||||||
"anyhow",
|
"anyhow",
|
||||||
"cfg-if",
|
"cfg-if",
|
||||||
|
"clearurls",
|
||||||
"deser-hjson",
|
"deser-hjson",
|
||||||
"diesel",
|
"diesel",
|
||||||
"doku",
|
"doku",
|
||||||
|
|
|
@ -49,6 +49,7 @@ use lemmy_utils::{
|
||||||
utils::{
|
utils::{
|
||||||
markdown::{markdown_check_for_blocked_urls, markdown_rewrite_image_links},
|
markdown::{markdown_check_for_blocked_urls, markdown_rewrite_image_links},
|
||||||
slurs::{build_slur_regex, remove_slurs},
|
slurs::{build_slur_regex, remove_slurs},
|
||||||
|
validation::clean_urls_in_text,
|
||||||
},
|
},
|
||||||
CACHE_DURATION_FEDERATION,
|
CACHE_DURATION_FEDERATION,
|
||||||
};
|
};
|
||||||
|
@ -947,6 +948,7 @@ pub async fn process_markdown(
|
||||||
context: &LemmyContext,
|
context: &LemmyContext,
|
||||||
) -> LemmyResult<String> {
|
) -> LemmyResult<String> {
|
||||||
let text = remove_slurs(text, slur_regex);
|
let text = remove_slurs(text, slur_regex);
|
||||||
|
let text = clean_urls_in_text(&text);
|
||||||
|
|
||||||
markdown_check_for_blocked_urls(&text, url_blocklist)?;
|
markdown_check_for_blocked_urls(&text, url_blocklist)?;
|
||||||
|
|
||||||
|
|
|
@ -30,7 +30,7 @@ use i_love_jesus::CursorKey;
|
||||||
use lemmy_utils::{
|
use lemmy_utils::{
|
||||||
error::{LemmyErrorExt, LemmyErrorType, LemmyResult},
|
error::{LemmyErrorExt, LemmyErrorType, LemmyResult},
|
||||||
settings::SETTINGS,
|
settings::SETTINGS,
|
||||||
utils::validation::clean_url_params,
|
utils::validation::clean_url,
|
||||||
};
|
};
|
||||||
use regex::Regex;
|
use regex::Regex;
|
||||||
use rustls::{
|
use rustls::{
|
||||||
|
@ -305,7 +305,7 @@ pub fn diesel_url_update(opt: Option<&str>) -> LemmyResult<Option<Option<DbUrl>>
|
||||||
// An empty string is an erase
|
// An empty string is an erase
|
||||||
Some("") => Ok(Some(None)),
|
Some("") => Ok(Some(None)),
|
||||||
Some(str_url) => Url::parse(str_url)
|
Some(str_url) => Url::parse(str_url)
|
||||||
.map(|u| Some(Some(clean_url_params(&u).into())))
|
.map(|u| Some(Some(clean_url(&u).into())))
|
||||||
.with_lemmy_type(LemmyErrorType::InvalidUrl),
|
.with_lemmy_type(LemmyErrorType::InvalidUrl),
|
||||||
None => Ok(None),
|
None => Ok(None),
|
||||||
}
|
}
|
||||||
|
@ -316,7 +316,7 @@ pub fn diesel_url_update(opt: Option<&str>) -> LemmyResult<Option<Option<DbUrl>>
|
||||||
pub fn diesel_url_create(opt: Option<&str>) -> LemmyResult<Option<DbUrl>> {
|
pub fn diesel_url_create(opt: Option<&str>) -> LemmyResult<Option<DbUrl>> {
|
||||||
match opt {
|
match opt {
|
||||||
Some(str_url) => Url::parse(str_url)
|
Some(str_url) => Url::parse(str_url)
|
||||||
.map(|u| Some(clean_url_params(&u).into()))
|
.map(|u| Some(clean_url(&u).into()))
|
||||||
.with_lemmy_type(LemmyErrorType::InvalidUrl),
|
.with_lemmy_type(LemmyErrorType::InvalidUrl),
|
||||||
None => Ok(None),
|
None => Ok(None),
|
||||||
}
|
}
|
||||||
|
|
|
@ -81,6 +81,7 @@ markdown-it = { version = "0.6.1", optional = true }
|
||||||
ts-rs = { workspace = true, optional = true }
|
ts-rs = { workspace = true, optional = true }
|
||||||
enum-map = { workspace = true, optional = true }
|
enum-map = { workspace = true, optional = true }
|
||||||
cfg-if = "1"
|
cfg-if = "1"
|
||||||
|
clearurls = { version = "0.0.4", features = ["linkify"] }
|
||||||
|
|
||||||
[dev-dependencies]
|
[dev-dependencies]
|
||||||
reqwest = { workspace = true }
|
reqwest = { workspace = true }
|
||||||
|
|
|
@ -1,4 +1,5 @@
|
||||||
use crate::error::{LemmyErrorExt, LemmyErrorType, LemmyResult};
|
use crate::error::{LemmyErrorExt, LemmyErrorType, LemmyResult};
|
||||||
|
use clearurls::UrlCleaner;
|
||||||
use itertools::Itertools;
|
use itertools::Itertools;
|
||||||
use regex::{Regex, RegexBuilder, RegexSet};
|
use regex::{Regex, RegexBuilder, RegexSet};
|
||||||
use std::sync::LazyLock;
|
use std::sync::LazyLock;
|
||||||
|
@ -10,12 +11,8 @@ static VALID_MATRIX_ID_REGEX: LazyLock<Regex> = LazyLock::new(|| {
|
||||||
.expect("compile regex")
|
.expect("compile regex")
|
||||||
});
|
});
|
||||||
// taken from https://en.wikipedia.org/wiki/UTM_parameters
|
// taken from https://en.wikipedia.org/wiki/UTM_parameters
|
||||||
static CLEAN_URL_PARAMS_REGEX: LazyLock<Regex> = LazyLock::new(|| {
|
static URL_CLEANER: LazyLock<UrlCleaner> =
|
||||||
Regex::new(
|
LazyLock::new(|| UrlCleaner::from_embedded_rules().expect("compile clearurls"));
|
||||||
r"^(utm_source|utm_medium|utm_campaign|utm_term|utm_content|gclid|gclsrc|dclid|fbclid)=",
|
|
||||||
)
|
|
||||||
.expect("compile regex")
|
|
||||||
});
|
|
||||||
const ALLOWED_POST_URL_SCHEMES: [&str; 3] = ["http", "https", "magnet"];
|
const ALLOWED_POST_URL_SCHEMES: [&str; 3] = ["http", "https", "magnet"];
|
||||||
|
|
||||||
const BODY_MAX_LENGTH: usize = 10000;
|
const BODY_MAX_LENGTH: usize = 10000;
|
||||||
|
@ -257,16 +254,22 @@ pub fn build_and_check_regex(regex_str_opt: &Option<&str>) -> LemmyResult<Option
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn clean_url_params(url: &Url) -> Url {
|
/// Cleans a url of tracking parameters.
|
||||||
let mut url_out = url.clone();
|
pub fn clean_url(url: &Url) -> Url {
|
||||||
if let Some(query) = url.query() {
|
match URL_CLEANER.clear_single_url(url) {
|
||||||
let new_query = query
|
Ok(res) => res.into_owned(),
|
||||||
.split_inclusive('&')
|
// If there are any errors, just return the original url
|
||||||
.filter(|q| !CLEAN_URL_PARAMS_REGEX.is_match(q))
|
Err(_) => url.clone(),
|
||||||
.collect::<String>();
|
}
|
||||||
url_out.set_query(Some(&new_query));
|
}
|
||||||
|
|
||||||
|
/// Cleans all the links in a string of tracking parameters.
|
||||||
|
pub fn clean_urls_in_text(text: &str) -> String {
|
||||||
|
match URL_CLEANER.clear_text(text) {
|
||||||
|
Ok(res) => res.into_owned(),
|
||||||
|
// If there are any errors, just return the original text
|
||||||
|
Err(_) => text.to_owned(),
|
||||||
}
|
}
|
||||||
url_out
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn check_site_visibility_valid(
|
pub fn check_site_visibility_valid(
|
||||||
|
@ -357,7 +360,8 @@ mod tests {
|
||||||
build_and_check_regex,
|
build_and_check_regex,
|
||||||
check_site_visibility_valid,
|
check_site_visibility_valid,
|
||||||
check_urls_are_valid,
|
check_urls_are_valid,
|
||||||
clean_url_params,
|
clean_url,
|
||||||
|
clean_urls_in_text,
|
||||||
is_url_blocked,
|
is_url_blocked,
|
||||||
is_valid_actor_name,
|
is_valid_actor_name,
|
||||||
is_valid_bio_field,
|
is_valid_bio_field,
|
||||||
|
@ -378,18 +382,32 @@ mod tests {
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_clean_url_params() -> LemmyResult<()> {
|
fn test_clean_url_params() -> LemmyResult<()> {
|
||||||
let url = Url::parse("https://example.com/path/123?utm_content=buffercf3b2&utm_medium=social&user+name=random+user%20&id=123")?;
|
let url = Url::parse("https://example.com/path/123?utm_content=buffercf3b2&utm_medium=social&user+name=random+user&id=123")?;
|
||||||
let cleaned = clean_url_params(&url);
|
let cleaned = clean_url(&url);
|
||||||
let expected = Url::parse("https://example.com/path/123?user+name=random+user%20&id=123")?;
|
let expected = Url::parse("https://example.com/path/123?user+name=random+user&id=123")?;
|
||||||
assert_eq!(expected.to_string(), cleaned.to_string());
|
assert_eq!(expected.to_string(), cleaned.to_string());
|
||||||
|
|
||||||
let url = Url::parse("https://example.com/path/123")?;
|
let url = Url::parse("https://example.com/path/123")?;
|
||||||
let cleaned = clean_url_params(&url);
|
let cleaned = clean_url(&url);
|
||||||
assert_eq!(url.to_string(), cleaned.to_string());
|
assert_eq!(url.to_string(), cleaned.to_string());
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_clean_body() -> LemmyResult<()> {
|
||||||
|
let text = "[a link](https://example.com/path/123?utm_content=buffercf3b2&utm_medium=social&user+name=random+user&id=123)";
|
||||||
|
let cleaned = clean_urls_in_text(text);
|
||||||
|
let expected = "[a link](https://example.com/path/123?user+name=random+user&id=123)";
|
||||||
|
assert_eq!(expected.to_string(), cleaned.to_string());
|
||||||
|
|
||||||
|
let text = "[a link](https://example.com/path/123)";
|
||||||
|
let cleaned = clean_urls_in_text(text);
|
||||||
|
assert_eq!(text.to_string(), cleaned);
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn regex_checks() {
|
fn regex_checks() {
|
||||||
assert!(is_valid_post_title("hi").is_err());
|
assert!(is_valid_post_title("hi").is_err());
|
||||||
|
|
Loading…
Reference in a new issue