Improve pronoun extraction for multiple definitions

2024-11-10 14:14:19 +00:00 · 2023-07-26 12:33:13 +02:00 · 2023-07-26 12:33:13 +02:00 · af53085681
commit af53085681
parent 48acb49695
2 changed files with 45 additions and 8 deletions
--- a/src/libs/pronouns.js
+++ b/src/libs/pronouns.js
@ -319,13 +319,30 @@ const knownPronouns = [
 function searchForKnownPronouns(text) {
 	if (!text) return null;

-	const exactMatches = text.matchAll(/(\w+) ?\/ ?(\w+)/gi);
-	for (const [match, subjective, objective] of exactMatches) {
-		if (
-			knownPronouns.includes(subjective.toLowerCase()) &&
-			knownPronouns.includes(objective.toLowerCase())
-		) {
-			return match.replaceAll(" ", "");
+	// This is a rather complex regular expression to search for pronouns. Therefore, here's the explanation
+	// in plain English: We search for a word that is followed by a slash (/) or comma (,),
+	// which in turn is followed by at least one other word (each optionally trailed by a separator).
+	//
+	// Why not just two of them? Well, for combinations of multiple subjective pronouns, like "sie/she/elle",
+	// we want to display the whole set of pronouns if possible.
+	const exactMatches = text.matchAll(/(\w+) ?[/,] ?((\w+)[ /,]{0,2}){1,}/gi);
+	for (const [match] of exactMatches) {
+		// Once we have our match, split it by the known separators and check sequentially
+		// whether we know one of the pronouns. If that's the case, return everything in the match
+		// starting from that pronoun (i.e. the pronoun itself and whatever follows it).
+		//
+		// Unfortunately, in the above case ("sie/she/elle"), it would return just "she/elle", because
+		// we don't know about common localized pronouns yet. And we can't return the whole match,
+		// because pronoun URLs like pronouns.page/they/them would then yield something like "page/they/them",
+		// which obviously is wrong.
+		const parts = match.split(/[/,]/).map((x) => x.trim());
+		for (const p of parts) {
+			if (knownPronouns.includes(p.toLowerCase())) {
+				let res = match.substring(match.indexOf(p));
+				res = res.replaceAll(" ", "");
+				res = res.trim();
+				return res;
+			}
 		}
 	}

--- a/tests/extractPronouns.spec.js
+++ b/tests/extractPronouns.spec.js
@ -62,6 +62,7 @@ valueExtractionSuite.after(() => {
 });
 const valueExtractionTests = [
 	["she/her", "she/her"], // exact match
+	["es,ihr / they, them", "es,ihr / they, them"], // exact match with multiple values, comma-separated
 	["they and them", "they and them"], // exact match with multiple words
 	["they/them (https://pronouns.page/they/them)", "they/them"], // plain-text "URL" with additional text
 	["https://en.pronouns.page/they/them", "they/them"], // plain-text "URLs"
@ -134,12 +135,31 @@ const endToEndTests = [
 		note: "https://en.pronouns.page/they/them",
 		expect: "they/them",
 	},
-
 	{
 		name: "find pronouns.page link in unknown field name",
 		fields: [{ name: "gender: not found", value: "https://en.pronouns.page/they/them" }],
 		expect: "they/them",
 	},
+	{
+		name: "multiple languages and one emoji",
+		fields: [{ name: "Pronomina/Pronouns", value: ":hehim: er, ihm / he, him" }],
+		expect: "er, ihm / he, him",
+	},
+	{
+		name: "not just pronouns in field",
+		fields: [{ name: "RL stats :loading_indicator:", value: "30 | :heart: | She/her" }],
+		expect: "She/her",
+	},
+	{
+		name: "multiple subjects in field name",
+		fields: [{ name: "She/sie/zij/elle", value: "etc" }],
+		expect: "She/sie/zij/elle",
+	},
+	{
+		name: "more complete pronoun definition in bio",
+		note: ":speech_bubble: e/em/eir",
+		expect: "e/em/eir",
+	},
 ];
 const endToEndTestSuite = suite("end to end tests");
 for (const { name, fields, expect, note } of endToEndTests) {