mirror of
https://github.com/ItsVipra/ProToots
synced 2024-11-10 14:14:19 +00:00
Improve pronoun extraction for multiple definitions
This commit is contained in:
parent
48acb49695
commit
af53085681
2 changed files with 45 additions and 8 deletions
|
@ -319,13 +319,30 @@ const knownPronouns = [
|
|||
function searchForKnownPronouns(text) {
|
||||
if (!text) return null;
|
||||
|
||||
const exactMatches = text.matchAll(/(\w+) ?\/ ?(\w+)/gi);
|
||||
for (const [match, subjective, objective] of exactMatches) {
|
||||
if (
|
||||
knownPronouns.includes(subjective.toLowerCase()) &&
|
||||
knownPronouns.includes(objective.toLowerCase())
|
||||
) {
|
||||
return match.replaceAll(" ", "");
|
||||
// This is a rather complex regular expression to search for pronouns. Therefore, here's the explanation
|
||||
// in plain English: We search for all words that are followed by a slash (/) or comma (,),
|
||||
// which are followed by at least one other word that matches this pattern.
|
||||
//
|
||||
// Why not just two of them? Well, for combinations of multiple subjective pronouns, like "sie/she/elle",
|
||||
// we want to display the whole set of pronouns if possible.
|
||||
const exactMatches = text.matchAll(/(\w+) ?[/,] ?((\w+)[ /,]{0,2}){1,}/gi);
|
||||
for (const [match] of exactMatches) {
|
||||
// Once we have our match, split it by the known separators and check sequentially
|
||||
// whether we know one of the pronouns. If that's the case, return everything in the match
|
||||
// starting from this pronoun onwards.
|
||||
//
|
||||
// Unfortunately, in the above case ("sie/she/elle"), it would return just "she/elle", because
|
||||
// we don't know about common localized pronouns yet. And we can't return the whole set,
|
||||
// because pronoun URLs like pronoun.page/they/them would return something like "page/they/them",
|
||||
// which obviously is wrong.
|
||||
const parts = match.split(/[/,]/).map((x) => x.trim());
|
||||
for (const p of parts) {
|
||||
if (knownPronouns.includes(p.toLowerCase())) {
|
||||
let res = match.substring(match.indexOf(p));
|
||||
res = res.replaceAll(" ", "");
|
||||
res = res.trim();
|
||||
return res;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -62,6 +62,7 @@ valueExtractionSuite.after(() => {
|
|||
});
|
||||
const valueExtractionTests = [
|
||||
["she/her", "she/her"], // exact match
|
||||
["es,ihr / they, them", "es,ihr / they, them"], // exact match with multiple values, comma-separated
|
||||
["they and them", "they and them"], // exact match with multiple words
|
||||
["they/them (https://pronouns.page/they/them)", "they/them"], // plain-text "URL" with additional text
|
||||
["https://en.pronouns.page/they/them", "they/them"], // plain-text "URLs"
|
||||
|
@ -134,12 +135,31 @@ const endToEndTests = [
|
|||
note: "https://en.pronouns.page/they/them",
|
||||
expect: "they/them",
|
||||
},
|
||||
|
||||
{
|
||||
name: "find pronouns.page link in unknown field name",
|
||||
fields: [{ name: "gender: not found", value: "https://en.pronouns.page/they/them" }],
|
||||
expect: "they/them",
|
||||
},
|
||||
{
|
||||
name: "multiple languages and one emoji",
|
||||
fields: [{ name: "Pronomina/Pronouns", value: ":hehim: er, ihm / he, him" }],
|
||||
expect: "er, ihm / he, him",
|
||||
},
|
||||
{
|
||||
name: "not just pronouns in field",
|
||||
fields: [{ name: "RL stats :loading_indicator:", value: "30 | :heart: | She/her" }],
|
||||
expect: "She/her",
|
||||
},
|
||||
{
|
||||
name: "multiple subjects in field name",
|
||||
fields: [{ name: "She/sie/zij/elle", value: "etc" }],
|
||||
expect: "She/sie/zij/elle",
|
||||
},
|
||||
{
|
||||
name: "more complete pronoun definition in bio",
|
||||
note: ":speech_bubble: e/em/eir",
|
||||
expect: "e/em/eir",
|
||||
},
|
||||
];
|
||||
const endToEndTestSuite = suite("end to end tests");
|
||||
for (const { name, fields, expect, note } of endToEndTests) {
|
||||
|
|
Loading…
Reference in a new issue