Merge branch 'rework-pronoun-field-matching'

2024-11-21 19:13:03 +00:00 · 2023-07-14 18:56:40 +02:00 · 2023-07-14 18:56:40 +02:00 · 28921ce46c
commit 28921ce46c
parent 36f8f1e4c2 777724c9eb
10 changed files with 497 additions and 73 deletions
--- a/.github/workflows/codequality.yml
+++ b/.github/workflows/codequality.yml
@ -33,3 +33,12 @@ jobs:
        run: npm ci
      - name: Run eslint
        run: npm run lint
+
+  test:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+      - name: Install dependencies
+        run: npm ci
+      - name: Run tests
+        run: npm test
--- a/jsconfig.json
+++ b/jsconfig.json
@ -2,7 +2,8 @@
 	"compilerOptions": {
 		"module": "ESNext",
 		"target": "ESNext",
-		"checkJs": true
+		"checkJs": true,
+		"moduleResolution": "nodenext"
 	},
-	"include": ["src/**/*.js"]
+	"include": ["src/**/*.js", "tests/**/*.js"]
 }
--- a/package-lock.json
+++ b/package-lock.json
@ -6,6 +6,7 @@
 		"": {
 			"name": "protoots",
 			"dependencies": {
+				"sanitize-html": "^2.11.0",
 				"webextension-polyfill": "^0.10.0"
 			},
 			"devDependencies": {
@ -15,6 +16,7 @@
 				"eslint-config-prettier": "^8.8.0",
 				"npm-run-all": "^4.1.5",
 				"prettier": "^2.8.8",
+				"uvu": "^0.5.6",
 				"web-ext": "^7.6.2"
 			}
 		},
@ -2013,7 +2015,6 @@
 			"version": "4.3.1",
 			"resolved": "https://registry.npmjs.org/deepmerge/-/deepmerge-4.3.1.tgz",
 			"integrity": "sha512-3sUqbMEc77XqpdNO7FRyRog+eW3ph+GYCbj+rK+uYyRMuwsVy0rMiVtPn+QJlKFvWP/1PYpapqYn0Me2knFn+A==",
-			"dev": true,
 			"engines": {
 				"node": ">=0.10.0"
 			}
@ -2073,6 +2074,24 @@
 				"node": ">=0.4.0"
 			}
 		},
+		"node_modules/dequal": {
+			"version": "2.0.3",
+			"resolved": "https://registry.npmjs.org/dequal/-/dequal-2.0.3.tgz",
+			"integrity": "sha512-0je+qPKHEMohvfRTCEo3CrPG6cAzAYgmzKyxRiYSSDkS6eGJdyVJm7WaYA5ECaAD9wLB2T4EEeymA5aFVcYXCA==",
+			"dev": true,
+			"engines": {
+				"node": ">=6"
+			}
+		},
+		"node_modules/diff": {
+			"version": "5.1.0",
+			"resolved": "https://registry.npmjs.org/diff/-/diff-5.1.0.tgz",
+			"integrity": "sha512-D+mk+qE8VC/PAUrlAU34N+VfXev0ghe5ywmpqrawphmVZc1bEfn56uo9qpyGp1p4xpzOHkSW4ztBd6L7Xx4ACw==",
+			"dev": true,
+			"engines": {
+				"node": ">=0.3.1"
+			}
+		},
 		"node_modules/doctrine": {
 			"version": "3.0.0",
 			"resolved": "https://registry.npmjs.org/doctrine/-/doctrine-3.0.0.tgz",
@ -2089,7 +2108,6 @@
 			"version": "2.0.0",
 			"resolved": "https://registry.npmjs.org/dom-serializer/-/dom-serializer-2.0.0.tgz",
 			"integrity": "sha512-wIkAryiqt/nV5EQKqQpo3SToSOV9J0DnbJqwK7Wv/Trc92zIAYZ4FlMu+JPFW1DfGFt81ZTCGgDEabffXeLyJg==",
-			"dev": true,
 			"dependencies": {
 				"domelementtype": "^2.3.0",
 				"domhandler": "^5.0.2",
@ -2103,7 +2121,6 @@
 			"version": "2.3.0",
 			"resolved": "https://registry.npmjs.org/domelementtype/-/domelementtype-2.3.0.tgz",
 			"integrity": "sha512-OLETBj6w0OsagBwdXnPdN0cnMfF9opN69co+7ZrbfPGrdpPVNBUj02spi6B1N7wChLQiPn4CSH/zJvXw56gmHw==",
-			"dev": true,
 			"funding": [
 				{
 					"type": "github",
@ -2115,7 +2132,6 @@
 			"version": "5.0.3",
 			"resolved": "https://registry.npmjs.org/domhandler/-/domhandler-5.0.3.tgz",
 			"integrity": "sha512-cgwlv/1iFQiFnU96XXgROh8xTeetsnJiDsTc7TYCLFd9+/WNkIqPTxiM/8pSd8VIrhXGTf1Ny1q1hquVqDJB5w==",
-			"dev": true,
 			"dependencies": {
 				"domelementtype": "^2.3.0"
 			},
@ -2130,7 +2146,6 @@
 			"version": "3.1.0",
 			"resolved": "https://registry.npmjs.org/domutils/-/domutils-3.1.0.tgz",
 			"integrity": "sha512-H78uMmQtI2AhgDJjWeQmHwJJ2bLPD3GMmO7Zja/ZZh84wkm+4ut+IUnUdRa8uCGX88DiVx1j6FRe1XfxEgjEZA==",
-			"dev": true,
 			"dependencies": {
 				"dom-serializer": "^2.0.0",
 				"domelementtype": "^2.3.0",
@ -2213,7 +2228,6 @@
 			"version": "4.5.0",
 			"resolved": "https://registry.npmjs.org/entities/-/entities-4.5.0.tgz",
 			"integrity": "sha512-V0hjH4dGPh9Ao5p0MoRY6BVqtwCjhz6vI5LT8AJ55H+4g9/4vbHx1I54fS0XuclLhDHArPQCiMjDxjaL8fPxhw==",
-			"dev": true,
 			"engines": {
 				"node": ">=0.12"
 			},
@ -2386,7 +2400,6 @@
 			"version": "4.0.0",
 			"resolved": "https://registry.npmjs.org/escape-string-regexp/-/escape-string-regexp-4.0.0.tgz",
 			"integrity": "sha512-TtpcNJ3XAzx3Gq8sWRzJaVajRs0uVxA2YAkdb1jm2YkPz4G6egUFAyA3n5vtEIZefPk5Wa4UXbKuS5fKkJWdgA==",
-			"dev": true,
 			"engines": {
 				"node": ">=10"
 			},
@ -3407,7 +3420,6 @@
 			"version": "8.0.2",
 			"resolved": "https://registry.npmjs.org/htmlparser2/-/htmlparser2-8.0.2.tgz",
 			"integrity": "sha512-GYdjWKDkbRLkZ5geuHs5NY1puJ+PXwP7+fHPRz06Eirsb9ugf6d8kkXav6ADhcODhFFPMIXyxkxSuMf3D6NCFA==",
-			"dev": true,
 			"funding": [
 				"https://github.com/fb55/htmlparser2?sponsor=1",
 				{
@ -3835,6 +3847,14 @@
 				"node": ">=8"
 			}
 		},
+		"node_modules/is-plain-object": {
+			"version": "5.0.0",
+			"resolved": "https://registry.npmjs.org/is-plain-object/-/is-plain-object-5.0.0.tgz",
+			"integrity": "sha512-VRSzKkbMm5jMDoKLbltAkFQ5Qr7VDiTFGXxYFXXowVj387GeGNOCsOH6Msy00SGZ3Fp84b1Naa1psqgcCIEP5Q==",
+			"engines": {
+				"node": ">=0.10.0"
+			}
+		},
 		"node_modules/is-regex": {
 			"version": "1.1.4",
 			"resolved": "https://registry.npmjs.org/is-regex/-/is-regex-1.1.4.tgz",
@ -4202,6 +4222,15 @@
 				"json-buffer": "3.0.1"
 			}
 		},
+		"node_modules/kleur": {
+			"version": "4.1.5",
+			"resolved": "https://registry.npmjs.org/kleur/-/kleur-4.1.5.tgz",
+			"integrity": "sha512-o+NO+8WrRiQEE4/7nwRJhN1HWpVmJm511pBHUxPLtp0BUISzlBplORYSmTclCnJvQq2tKu/sgl3xVpkc7ZWuQQ==",
+			"dev": true,
+			"engines": {
+				"node": ">=6"
+			}
+		},
 		"node_modules/latest-version": {
 			"version": "7.0.0",
 			"resolved": "https://registry.npmjs.org/latest-version/-/latest-version-7.0.0.tgz",
@ -4517,6 +4546,15 @@
 				"node": "*"
 			}
 		},
+		"node_modules/mri": {
+			"version": "1.2.0",
+			"resolved": "https://registry.npmjs.org/mri/-/mri-1.2.0.tgz",
+			"integrity": "sha512-tzzskb3bG8LvYGFF/mDTpq3jpI6Q9wc3LEmBaghu+DdCssd1FakN7Bc0hVNmEyGq1bq3RgfkCb3cmQLpNPOroA==",
+			"dev": true,
+			"engines": {
+				"node": ">=4"
+			}
+		},
 		"node_modules/ms": {
 			"version": "2.1.2",
 			"resolved": "https://registry.npmjs.org/ms/-/ms-2.1.2.tgz",
@ -4621,7 +4659,6 @@
 			"version": "3.3.6",
 			"resolved": "https://registry.npmjs.org/nanoid/-/nanoid-3.3.6.tgz",
 			"integrity": "sha512-BGcqMMJuToF7i1rt+2PWSNVnWIkGCU78jBG3RxO/bZlnZPK2Cmi2QaffxGO/2RvWi9sL+FAiRiXMgsyxQ1DIDA==",
-			"dev": true,
 			"funding": [
 				{
 					"type": "github",
@ -5192,6 +5229,11 @@
 				"url": "https://github.com/sponsors/sindresorhus"
 			}
 		},
+		"node_modules/parse-srcset": {
+			"version": "1.0.2",
+			"resolved": "https://registry.npmjs.org/parse-srcset/-/parse-srcset-1.0.2.tgz",
+			"integrity": "sha512-/2qh0lav6CmI15FzA3i/2Bzk2zCgQhGMkvhOhKNcBVQ1ldgpbfiNTVslmooUmWJcADi1f1kIeynbDRVzNlfR6Q=="
+		},
 		"node_modules/parse5": {
 			"version": "7.1.2",
 			"resolved": "https://registry.npmjs.org/parse5/-/parse5-7.1.2.tgz",
@ -5302,8 +5344,7 @@
 		"node_modules/picocolors": {
 			"version": "1.0.0",
 			"resolved": "https://registry.npmjs.org/picocolors/-/picocolors-1.0.0.tgz",
-			"integrity": "sha512-1fygroTLlHu66zi26VoTDv8yRgm0Fccecssto+MhsZ0D/DGW2sm8E8AjW7NU5VVTRt5GxbeZ5qBuJr+HyLYkjQ==",
-			"dev": true
+			"integrity": "sha512-1fygroTLlHu66zi26VoTDv8yRgm0Fccecssto+MhsZ0D/DGW2sm8E8AjW7NU5VVTRt5GxbeZ5qBuJr+HyLYkjQ=="
 		},
 		"node_modules/pidtree": {
 			"version": "0.3.1",
@ -5368,7 +5409,6 @@
 			"version": "8.4.21",
 			"resolved": "https://registry.npmjs.org/postcss/-/postcss-8.4.21.tgz",
 			"integrity": "sha512-tP7u/Sn/dVxK2NnruI4H9BG+x+Wxz6oeZ1cJ8P6G/PZY0IKk4k/63TDsQf2kQq3+qoJeLm2kIBUNlZe3zgb4Zg==",
-			"dev": true,
 			"funding": [
 				{
 					"type": "opencollective",
@ -5933,6 +5973,18 @@
 				"queue-microtask": "^1.2.2"
 			}
 		},
+		"node_modules/sade": {
+			"version": "1.8.1",
+			"resolved": "https://registry.npmjs.org/sade/-/sade-1.8.1.tgz",
+			"integrity": "sha512-xal3CZX1Xlo/k4ApwCFrHVACi9fBqJ7V+mwhBsuf/1IOKbBy098Fex+Wa/5QMubw09pSZ/u8EY8PWgevJsXp1A==",
+			"dev": true,
+			"dependencies": {
+				"mri": "^1.1.0"
+			},
+			"engines": {
+				"node": ">=6"
+			}
+		},
 		"node_modules/safe-buffer": {
 			"version": "5.2.1",
 			"resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.2.1.tgz",
@ -5989,6 +6041,19 @@
 			"integrity": "sha512-YZo3K82SD7Riyi0E1EQPojLz7kpepnSQI9IyPbHHg1XXXevb5dJI7tpyN2ADxGcQbHG7vcyRHk0cbwqcQriUtg==",
 			"dev": true
 		},
+		"node_modules/sanitize-html": {
+			"version": "2.11.0",
+			"resolved": "https://registry.npmjs.org/sanitize-html/-/sanitize-html-2.11.0.tgz",
+			"integrity": "sha512-BG68EDHRaGKqlsNjJ2xUB7gpInPA8gVx/mvjO743hZaeMCZ2DwzW7xvsqZ+KNU4QKwj86HJ3uu2liISf2qBBUA==",
+			"dependencies": {
+				"deepmerge": "^4.2.2",
+				"escape-string-regexp": "^4.0.0",
+				"htmlparser2": "^8.0.0",
+				"is-plain-object": "^5.0.0",
+				"parse-srcset": "^1.0.2",
+				"postcss": "^8.3.11"
+			}
+		},
 		"node_modules/sax": {
 			"version": "1.2.4",
 			"resolved": "https://registry.npmjs.org/sax/-/sax-1.2.4.tgz",
@ -6149,7 +6214,6 @@
 			"version": "1.0.2",
 			"resolved": "https://registry.npmjs.org/source-map-js/-/source-map-js-1.0.2.tgz",
 			"integrity": "sha512-R0XvVJ9WusLiqTCEiGCmICCMplcCkIwwR11mOSD9CR5u+IXYdiseeEuXCVAjS54zqwkLcPNnmU4OeJ6tUrWhDw==",
-			"dev": true,
 			"engines": {
 				"node": ">=0.10.0"
 			}
@ -6768,6 +6832,24 @@
 				"uuid": "dist/bin/uuid"
 			}
 		},
+		"node_modules/uvu": {
+			"version": "0.5.6",
+			"resolved": "https://registry.npmjs.org/uvu/-/uvu-0.5.6.tgz",
+			"integrity": "sha512-+g8ENReyr8YsOc6fv/NVJs2vFdHBnBNdfE49rshrTzDWOlUx4Gq7KOS2GD8eqhy2j+Ejq29+SbKH8yjkAqXqoA==",
+			"dev": true,
+			"dependencies": {
+				"dequal": "^2.0.0",
+				"diff": "^5.0.0",
+				"kleur": "^4.0.3",
+				"sade": "^1.7.3"
+			},
+			"bin": {
+				"uvu": "bin.js"
+			},
+			"engines": {
+				"node": ">=8"
+			}
+		},
 		"node_modules/validate-npm-package-license": {
 			"version": "3.0.4",
 			"resolved": "https://registry.npmjs.org/validate-npm-package-license/-/validate-npm-package-license-3.0.4.tgz",
--- a/package.json
+++ b/package.json
@ -8,7 +8,8 @@
 		"watch:webext": "web-ext run --keep-profile-changes --profile-create-if-missing --firefox-profile=.firefox-profile/",
 		"format": "prettier --write --ignore-path .gitignore .",
 		"package": "run-s -l build:**",
-		"lint": "eslint src/"
+		"lint": "eslint src/",
+		"test": "uvu tests/"
 	},
 	"devDependencies": {
 		"@sprout2000/esbuild-copy-plugin": "1.1.8",
@ -17,6 +18,7 @@
 		"eslint-config-prettier": "^8.8.0",
 		"npm-run-all": "^4.1.5",
 		"prettier": "^2.8.8",
+		"uvu": "^0.5.6",
 		"web-ext": "^7.6.2"
 	},
 	"prettier": {
@ -25,10 +27,12 @@
 		"useTabs": true,
 		"trailingComma": "all"
 	},
+	"type": "module",
 	"webExt": {
 		"sourceDir": "dist/"
 	},
 	"dependencies": {
+		"sanitize-html": "^2.11.0",
 		"webextension-polyfill": "^0.10.0"
 	}
 }
--- a/src/content_scripts/protoots.js
+++ b/src/content_scripts/protoots.js
@ -23,7 +23,6 @@ import {
 	accountNameFromURL,
 	addTypeAttribute,
 	normaliseAccountName,
-	sanitizePronouns,
 } from "../libs/protootshelpers";

 //before anything else, check whether we're on a Mastodon page
@ -208,7 +207,7 @@ async function addProplate(element) {
 		if (pronouns == "null" && !isLogging()) {
 			return;
 		}
-		proplate.innerHTML = sanitizePronouns(pronouns);
+		proplate.innerText = pronouns;
 		//TODO?: alt text
 		proplate.classList.add("protoots-proplate");
 		if (accountName == "jasmin@queer.group" || accountName == "vivien@queer.group") {
--- a/src/libs/fetchPronouns.js
+++ b/src/libs/fetchPronouns.js
@ -1,10 +1,10 @@
 import { debug, error, info, log, warn } from "./logging";
 import { cachePronouns, getPronouns } from "./caching";
 import { normaliseAccountName } from "./protootshelpers";
+import { extractFromStatus } from "./pronouns";

 const cacheMaxAge = 24 * 60 * 60 * 1000; // time after which cached pronouns should be checked again: 24h
 let conversationsCache;
-const fieldNames = ["pronouns", "pronoun", "professional nouns", "pronomen"];

 /**
 * Fetches pronouns associated with account name.
@ -13,6 +13,7 @@ const fieldNames = ["pronouns", "pronoun", "professional nouns", "pronomen"];
 * @param {string | undefined} dataID ID of the object being requested, in case cache misses.
 * @param {string} accountName The account name, used for caching. Should have the "@" prefix.
 * @param {string} type Type of data-id
+ * @returns {Promise<string>} The pronouns if we have any, otherwise "null".
 */
 export async function fetchPronouns(dataID, accountName, type) {
 	// log(`searching for ${account_name}`);
@ -63,12 +64,13 @@ export async function fetchPronouns(dataID, accountName, type) {
 	log(`Fetching ${type} failed, trying notification instead.`);
 	if (!status) status = await fetchNotification(dataID); //fallback for glitch-soc notifications

-	const PronounField = getPronounField(status, accountName);
-	if (PronounField == "null") {
-		//TODO: if no field check bio
+	let pronouns = await extractFromStatus(status);
+	if (!pronouns) {
+		pronouns = "null";
 		info(`no pronouns found for ${accountName}, cached null`);
 	}
-	return PronounField;
+	await cachePronouns(accountName, pronouns);
+	return pronouns;
 }

 /**
@ -166,37 +168,6 @@ async function fetchConversations() {
 	return conversations;
 }

-/**
- * Searches for fields labelled "pronouns" in the statuses' author.
- * If found returns the value of said field.
- *
- * @param {any} status
- * @param {string} accountName
- * @returns {string} Author pronouns if found. Otherwise returns "null"
- */
-function getPronounField(status, accountName) {
-	// get account from status and pull out fields
-	const account = status.account;
-	const fields = account.fields;
-
-	for (const field of fields) {
-		//match fields against fieldNames
-		for (const searchTerm of fieldNames) {
-			if (field.name.toLowerCase().includes(searchTerm)) {
-				debug(`${account.acct}: ${field.value}`);
-
-				cachePronouns(accountName, field.value);
-				return field.value;
-			}
-		}
-	}
-
-	//if not returned by this point no field with pronouns was found
-
-	cachePronouns(accountName, "null");
-	return "null";
-}
-
 /**
 * Fetches the current access token for the user.
 * @returns {Promise<string>} The accessToken for the current user if we are logged in.
--- a/src/libs/pronouns.js
+++ b/src/libs/pronouns.js
@ -0,0 +1,276 @@
+import sanitizeHtml from "sanitize-html";
+
+const fieldMatchers = [/pro.*nouns?/i, "pronomen"];
+const knownPronounUrls = [
+	/pronouns\.page\/:?([\w/@]+)/,
+	/pronouns\.within\.lgbt\/([\w/]+)/,
+	/pronouns\.cc\/pronouns\/([\w/]+)/,
+];
+
+/**
+ * Tries to extract the pronouns for the author of the given status.
+ *
+ * The profile fields are checked first, in order, against the {@link fieldMatchers};
+ * the first field that yields a value wins. If no field yields pronouns, the account's
+ * bio (`note`) is searched instead. Whatever was found is passed through
+ * {@link sanitizePronouns} before it is returned.
+ *
+ * @param {any} status The status whose `account` is inspected. A missing status or account yields null.
+ * @returns {Promise<string|null>} Author pronouns if found. Otherwise returns null.
+ */
+export async function extractFromStatus(status) {
+	// Without an account there is nothing to inspect, so bail out instead of throwing.
+	const account = status?.account;
+	if (!account) return null;
+
+	const { fields, note } = account;
+	let pronouns;
+
+	if (fields) {
+		for (const f of fields) {
+			pronouns = await extractFromField(f);
+			if (pronouns) break;
+		}
+	}
+
+	// The bio is only a fallback for accounts without a usable pronoun field.
+	if (!pronouns && note) {
+		pronouns = extractFromBio(note);
+	}
+
+	return sanitizePronouns(pronouns);
+}
+
+/**
+ * Extracts the pronouns from a single profile field, if its name matches one of the field matchers.
+ *
+ * The field value is stripped of all HTML. If the raw value contains a known pronoun page URL,
+ * the pronouns are taken from that URL instead; a pronouns.page username ("@name") is resolved
+ * through the pronouns.page API.
+ *
+ * @param {{name: string, value: string}} field The profile field to inspect.
+ * @returns {Promise<string|null>} The pronouns or null.
+ */
+async function extractFromField(field) {
+	const isPronounField = fieldMatchers.some((matcher) =>
+		typeof matcher === "string"
+			? field.name.toLowerCase().includes(matcher)
+			: field.name.match(matcher),
+	);
+	const pronounsRaw = isPronounField ? field.value : undefined;
+	if (!pronounsRaw) return null;
+
+	let text = sanitizeHtml(pronounsRaw, { allowedTags: [], allowedAttributes: {} });
+
+	// If one of pronoun URLs matches, overwrite the current known value (the last matching URL wins).
+	for (const knownUrlRe of knownPronounUrls) {
+		const urlMatch = pronounsRaw.match(knownUrlRe);
+		if (urlMatch) text = urlMatch[1];
+	}
+
+	// Right now, only the pronoun.page regex matches the @usernames.
+	if (text.startsWith("@")) {
+		text = await queryPronounsFromPronounsPage(text.substring(1));
+	}
+
+	return text || null;
+}
+
+/**
+ * Queries the pronouns from the pronouns.page API.
+ *
+ * The profile is picked in the following order: the Mastodon interface language, the browser
+ * languages of the user, English, and finally whichever profile is listed first.
+ *
+ * @param {string} username The username of the person.
+ * @returns {Promise<string|null>} The first pronoun with a "yes" or "meh" opinion, or null if the
+ * API answers with an error status, the profile does not exist or no such pronoun is set.
+ */
+async function queryPronounsFromPronounsPage(username) {
+	// Example page: https://en.pronouns.page/api/profile/get/andrea?version=2
+	const resp = await fetch(`https://en.pronouns.page/api/profile/get/${username}?version=2`);
+	if (resp.status >= 400) {
+		return null;
+	}
+
+	const { profiles } = await resp.json();
+	if (!profiles) return null;
+
+	// Unfortunately, pronouns.page does not return a 404 if a profile does not exist, but an empty profiles object. :clown_face:
+	const availableProfiles = Object.values(profiles);
+	if (!availableProfiles.length) return null;
+
+	let pronouns;
+	// Query the pronouns in the following language order:
+	// 1. The mastodon interface language
+	// 2. The spoken languages according to the user
+	// 3. The english language.
+	const languages = [document.documentElement.lang, ...window.navigator.languages, "en"];
+	for (const lang of languages) {
+		if (lang in profiles) {
+			pronouns = profiles[lang].pronouns;
+			break;
+		}
+	}
+
+	// If we don't have a value yet, just take the first profile.
+	// `profiles` is an object keyed by language code, so it has no index 0; use the first value instead.
+	if (!pronouns) pronouns = availableProfiles[0].pronouns;
+	if (!Array.isArray(pronouns)) return null;
+
+	// A profile may contain no pronoun with a positive opinion; that counts as "no pronouns found".
+	const preferred = pronouns.find((x) => x.opinion === "yes" || x.opinion === "meh");
+	if (!preferred) return null;
+
+	return sanitizePronounPageValue(preferred.value);
+}
+
+/**
+ * Turns a pronoun value from the pronouns.page API into displayable text.
+ *
+ * Plain values are returned untouched. Values given as a link to pronouns.page (with or without
+ * a language subdomain, http or https) are reduced to the part after the host, without the ":"
+ * marker for custom pronouns. The special "no-pronouns" page is rendered as "no pronouns".
+ *
+ * @param {string} val The raw value from the API.
+ * @returns {string} The cleaned-up pronouns.
+ */
+function sanitizePronounPageValue(val) {
+	if (!/^https?:\/\//.test(val)) return val;
+
+	const stripped = val.replace(/^https?:\/\/(?:[\w-]+\.)*pronouns\.page\/:?/, "");
+
+	return stripped === "no-pronouns" ? "no pronouns" : stripped;
+}
+
+/**
+ * Sanitizes the pronoun text by removing parts that should not be displayed:
+ * custom emoji shortcodes (`:shortcode:`), space-separated words that parse as
+ * http(s) URLs, and trailing separator characters.
+ * If the passed string is empty or not defined, or nothing is left after
+ * sanitizing, null is returned.
+ *
+ * @param {string|null|undefined} str The input string.
+ * @returns {string|null} The sanitized string, or null if there is nothing to show.
+ */
+function sanitizePronouns(str) {
+	if (!str) return null;
+
+	// Remove all custom emojis with the :shortcode: format.
+	str = str.replace(/:[\w_]+:/gi, "");
+
+	// We still might have URLs in our text, for example, if people redirect some domain to pronouns.page.
+	// We filter them out, because they would not be clickable anyways and provide no benefit.
+	str = str
+		.split(" ")
+		.filter((x) => {
+			// Try to parse the word as a URL; words that parse with an http(s) protocol are dropped, everything else is kept.
+			try {
+				const u = new URL(x);
+				return !u.protocol.startsWith("http");
+			} catch {
+				return true;
+			}
+		})
+		.join(" ");
+
+	// Remove trailing characters that are used as separators.
+	str = str.replace(/[-| /]+$/, "");
+
+	// Finally, remove leading and trailing whitespace.
+	str = str.trim();
+
+	// If the result is empty, return null, otherwise the sanitized string.
+	return str === "" ? null : str;
+}
+
+const knownPronouns = [
+	"ae",
+	"aer",
+	"aers",
+	"aerself",
+	"co",
+	"co's",
+	"cos",
+	"coself",
+	"e",
+	"eir",
+	"eirs",
+	"em",
+	"ems",
+	"emself",
+	"es",
+	"ey",
+	"fae",
+	"faer",
+	"faers",
+	"faerself",
+	"he",
+	"her",
+	"hers",
+	"herself",
+	"him",
+	"himself",
+	"hir",
+	"hirs",
+	"hirself",
+	"his",
+	"hu",
+	"hum",
+	"hus",
+	"huself",
+	"it",
+	"its",
+	"itself",
+	"ne",
+	"nem",
+	"nemself",
+	"nir",
+	"nirs",
+	"nirself",
+	"one",
+	"one's",
+	"oneself",
+	"per",
+	"pers",
+	"perself",
+	"s/he",
+	"she",
+	"their",
+	"theirs",
+	"them",
+	"themself",
+	"themselves",
+	"they",
+	"thon",
+	"thon's",
+	"thons",
+	"thonself",
+	"ve",
+	"ver",
+	"vers",
+	"verself",
+	"vi",
+	"vim",
+	"vims",
+	"vimself",
+	"vir",
+	"virs",
+	"virself",
+	"vis",
+	"xe",
+	"xem",
+	"xemself",
+	"xyr",
+	"xyrs",
+	"ze",
+	"zhe",
+	"zher",
+	"zhers",
+	"zherself",
+	"zir",
+	"zirs",
+	"zirself",
+];
+
+/**
+ * Tries to extract pronouns from the bio/note. Three patterns are tried, in this order:
+ *
+ * 1. Two words separated by a slash ("she/her", "she / they") where both words are "known"
+ *    pronouns. The allowlist is a compromise for the pattern matching; at no point we want
+ *    to limit the pronouns used by persons.
+ * 2. Whatever follows "pronoun:" / "pronouns:".
+ * 3. The phrases "any pronouns" / "all pronouns".
+ *
+ * @param {string} bio The bio
+ * @returns {string|null} The result or null
+ */
+function extractFromBio(bio) {
+	// The allowlist is all lowercase, so compare case-insensitively: "She/Her" is as valid as "she/her".
+	const isKnown = (word) => knownPronouns.includes(word.toLowerCase());
+
+	const exactMatches = bio.matchAll(/(\w+) ?\/ ?(\w+)/g);
+	for (const [match, subjective, objective] of exactMatches) {
+		if (isKnown(subjective) && isKnown(objective)) {
+			return match.replaceAll(" ", "");
+		}
+	}
+
+	const followedByColon = /pronouns?:\W+([\w/+]+)/i.exec(bio);
+	if (followedByColon) return followedByColon[1];
+
+	const anyAllPronouns = /(any|all) +pronouns/i.exec(bio);
+	if (anyAllPronouns) return anyAllPronouns[0];
+
+	return null;
+}
--- a/src/libs/protootshelpers.js
+++ b/src/libs/protootshelpers.js
@ -32,24 +32,6 @@ export function accountNameFromURL(url) {
 	return username;
 }

-/**
- * Sanitizes the pronoun field by removing various long information parts.
- * As of today, this just removes custom emojis from the field.
- * If the passed string is not defined, an empty string is returned.
- *
- * @param {string} str The input string.
- * @returns The sanitized string.
- */
-export function sanitizePronouns(str) {
-	if (!str) return "";
-
-	// Remove all custom emojis with the :shortcode: format.
-	str = str.replace(/:[\w_]+:/gi, "");
-
-	// Finally, remove leading and trailing whitespace.
-	return str.trim();
-}
-
 /**
 * Checks which type an element is and adds the according protoots-type attribute
 * @param {HTMLElement} ActionElement
--- a/src/manifest.json
+++ b/src/manifest.json
@ -10,7 +10,7 @@

 	"description": "puts pronouns next to usernames on mastodon",
 	"homepage_url": "https://github.com/ItsVipra/ProToots",
-	"permissions": ["storage"],
+	"permissions": ["storage", "https://en.pronouns.page/api/profile/get/*"],

 	"browser_action": {
 		"default_icon": "icons/icon small_size/icon small_size.png",
--- a/tests/extractPronouns.spec.js
+++ b/tests/extractPronouns.spec.js
@ -0,0 +1,100 @@
+import { suite } from "uvu";
+import * as assert from "uvu/assert";
+import * as pronouns from "../src/libs/pronouns.js";
+
+// Every field name below must be recognised as a pronoun field.
+const extract = suite("field extraction");
+const validFields = [
+	"pronoun",
+	"pronouns",
+	"PRONOUNS",
+	"professional nouns",
+	"pronomen",
+	"Pronouns / Pronomen",
+];
+
+for (const field of validFields) {
+	extract(`${field} is extracted`, async () => {
+		const result = await pronouns.extractFromStatus({
+			account: {
+				fields: [{ name: field, value: "pro/nouns" }],
+			},
+		});
+		// uvu expects (actual, expected); the other order yields misleading failure diffs.
+		assert.equal(result, "pro/nouns");
+	});
+}
+
+extract.run();
+
+const valueExtractionSuite = suite("value extraction");
+valueExtractionSuite.before(() => {
+	global.window = {
+		// @ts-ignore
+		navigator: {
+			languages: ["en"],
+		},
+	};
+	global.document = {
+		// @ts-ignore
+		documentElement: {
+			lang: "de",
+		},
+	};
+});
+valueExtractionSuite.after(() => {
+	global.window = undefined;
+	global.document = undefined;
+});
+const valueExtractionTests = [
+	["she/her", "she/her"], // exact match
+	["they and them", "they and them"], // exact match with multiple words
+	["they/them (https://pronouns.page/they/them)", "they/them"], // plain-text "URL" with additional text
+	["https://en.pronouns.page/they/them", "they/them"], // plain-text "URLs"
+	["pronouns.page/they/them", "they/them"], // plain-text "URLs" without scheme
+	[`<a href="https://en.pronouns.page/they/them"></a>`, "they/them"], // HTML-formatted URLs
+	[`<a href="https://en.pronouns.page/@Vipra"></a>`, "she/her"], // pronoun pages with usernames
+	[
+		`<a href="https://en.pronouns.page/@definitely_not_existing_username_on_pronouns_page"></a>`,
+		null,
+	], // 404 errors
+	[`<a href="https://de.pronouns.page/:Katze"></a>`, "Katze"], // custom pronouns
+	[`<a href="https://de.pronouns.page/@benaryorg"></a>`, "Katze"], // custom pronouns in profile
+	[`:theythem:`, null], // emojis shortcodes used for pronouns
+	[
+		// This is an actual example from a Mastodon field, with example.com redirecting to pronouns.page.
+		`dey/denen, es/ihm - <a href="https://example.com" rel="nofollow noopener noreferrer" target="_blank"><span class="invisible">https://</span><span class="">example.com</span><span class="invisible"></span></a>`,
+		"dey/denen, es/ihm",
+	],
+];
+for (const [input, expects] of valueExtractionTests) {
+	valueExtractionSuite(input, async () => {
+		const result = await pronouns.extractFromStatus({
+			account: {
+				fields: [{ name: "pronouns", value: input }],
+			},
+		});
+		assert.equal(result, expects);
+	});
+}
+
+valueExtractionSuite.run();
+
+const bioExtractSuite = suite("bio extraction");
+const bioExtractTests = [
+	["I'm cute and my pronouns are she/her", "she/her"], // exact match
+	["my pronouns are helicopter/joke", null], // not on allowlist
+	["pronouns: uwu/owo", "uwu/owo"], // followed by pronoun pattern
+	["pronouns: any", "any"], // followed by pronoun pattern
+	["I'm cute af (she / they)", "she/they"], // with whitespace between pronouns
+	["pronouns: any/all", "any/all"], // any pronouns
+	["any pronouns", "any pronouns"], // any pronouns
+];
+for (const [input, expects] of bioExtractTests) {
+	bioExtractSuite(input, async () => {
+		const result = await pronouns.extractFromStatus({
+			account: { note: input },
+		});
+		assert.equal(result, expects);
+	});
+}
+
+bioExtractSuite.run();