Improve pronoun extraction for multiple definitions

This commit is contained in:
nachtjasmin 2023-07-26 12:33:13 +02:00
parent 48acb49695
commit af53085681
No known key found for this signature in database
2 changed files with 45 additions and 8 deletions

View file

@ -319,13 +319,30 @@ const knownPronouns = [
function searchForKnownPronouns(text) {
if (!text) return null;
const exactMatches = text.matchAll(/(\w+) ?\/ ?(\w+)/gi);
for (const [match, subjective, objective] of exactMatches) {
if (
knownPronouns.includes(subjective.toLowerCase()) &&
knownPronouns.includes(objective.toLowerCase())
) {
return match.replaceAll(" ", "");
// This is a rather complex regular expression to search for pronouns. Therefore, here's the explanation
// in plain English: We search for all words that are followed by a slash (/) or comma (,),
// which are followed by at least one other word that matches this pattern.
//
// Why not just two of them? Well, for combinations of multiple subjective pronouns, like "sie/she/elle",
// we want to display the whole set of pronouns if possible.
const exactMatches = text.matchAll(/(\w+) ?[/,] ?((\w+)[ /,]{0,2}){1,}/gi);
for (const [match] of exactMatches) {
// Once we have our match, split it by the known separators and check sequentially
// whether we know one of the pronouns. If that's the case, return the rest of the match,
// starting at this pronoun.
//
// Unfortunately, in the above case ("sie/she/elle"), it would return just "she/elle", because
// we don't know about common localized pronouns yet. And we can't return the whole set,
// because pronoun URLs like pronouns.page/they/them would return something like "page/they/them",
// which obviously is wrong.
const parts = match.split(/[/,]/).map((x) => x.trim());
for (const p of parts) {
if (knownPronouns.includes(p.toLowerCase())) {
let res = match.substring(match.indexOf(p));
res = res.replaceAll(" ", "");
res = res.trim();
return res;
}
}
}

View file

@ -62,6 +62,7 @@ valueExtractionSuite.after(() => {
});
const valueExtractionTests = [
["she/her", "she/her"], // exact match
["es,ihr / they, them", "es,ihr / they, them"], // exact match with multiple values, comma-separated
["they and them", "they and them"], // exact match with multiple words
["they/them (https://pronouns.page/they/them)", "they/them"], // plain-text "URL" with additional text
["https://en.pronouns.page/they/them", "they/them"], // plain-text "URLs"
@ -134,12 +135,31 @@ const endToEndTests = [
note: "https://en.pronouns.page/they/them",
expect: "they/them",
},
{
name: "find pronouns.page link in unknown field name",
fields: [{ name: "gender: not found", value: "https://en.pronouns.page/they/them" }],
expect: "they/them",
},
{
name: "multiple languages and one emoji",
fields: [{ name: "Pronomina/Pronouns", value: ":hehim: er, ihm / he, him" }],
expect: "er, ihm / he, him",
},
{
name: "not just pronouns in field",
fields: [{ name: "RL stats :loading_indicator:", value: "30 | :heart: | She/her" }],
expect: "She/her",
},
{
name: "multiple subjects in field name",
fields: [{ name: "She/sie/zij/elle", value: "etc" }],
expect: "She/sie/zij/elle",
},
{
name: "more complete pronoun definition in bio",
note: ":speech_bubble: e/em/eir",
expect: "e/em/eir",
},
];
const endToEndTestSuite = suite("end to end tests");
for (const { name, fields, expect, note } of endToEndTests) {