From 854a597e568115bc2e65371aff5e16e7205af304 Mon Sep 17 00:00:00 2001
From: nachtjasmin
Date: Tue, 11 Jul 2023 00:06:30 +0200
Subject: [PATCH] Allow extraction of pronouns from bio/note

Closes #1
---
 src/libs/fetchPronouns.js     |   1 -
 src/libs/pronouns.js          | 153 +++++++++++++++++++++++++++++++---
 tests/extractPronouns.spec.js |  18 ++++
 3 files changed, 159 insertions(+), 13 deletions(-)

diff --git a/src/libs/fetchPronouns.js b/src/libs/fetchPronouns.js
index 003816d..1137d78 100644
--- a/src/libs/fetchPronouns.js
+++ b/src/libs/fetchPronouns.js
@@ -63,7 +63,6 @@ export async function fetchPronouns(dataID, accountName, type) {
 	let pronouns = await extractFromStatus(status);
 	if (!pronouns) {
 		pronouns = "null";
-		//TODO: if no field check bio
 		info(`no pronouns found for ${accountName}, cached null`);
 	}
 	await cachePronouns(accountName, pronouns);
diff --git a/src/libs/pronouns.js b/src/libs/pronouns.js
index dad45f2..e6947f2 100644
--- a/src/libs/pronouns.js
+++ b/src/libs/pronouns.js
@@ -19,21 +19,39 @@ const knownPronounUrls = [
 export async function extractFromStatus(status) {
 	// get account from status and pull out fields
 	const account = status.account;
-	const fields = account.fields;
+	const { fields, note } = account;
+	let pronouns;
 
-	let pronounsRaw;
-	for (const field of fields) {
-		// TODO: add ranking of fields
-		if (pronounsRaw) break;
-
-		for (const matcher of fieldMatchers) {
-			if (typeof matcher === "string" && field.name.toLowerCase().includes(matcher)) {
-				pronounsRaw = field.value;
-			} else if (field.name.match(matcher)) {
-				pronounsRaw = field.value;
-			}
+	if (fields) {
+		for (const f of fields) {
+			pronouns = await extractFromField(f);
+			if (pronouns) break;
 		}
 	}
+
+	if (!pronouns && note) {
+		pronouns = extractFromBio(note);
+	}
+
+	return pronouns;
+}
+
+/**
+ * @param {{name: string, value: string}} field The field value
+ * @returns {Promise} The pronouns or null.
+ */
+async function extractFromField(field) {
+	let pronounsRaw;
+	for (const matcher of fieldMatchers) {
+		if (typeof matcher === "string" && field.name.toLowerCase().includes(matcher)) {
+			pronounsRaw = field.value;
+			break;
+		} else if (field.name.match(matcher)) {
+			pronounsRaw = field.value;
+			break;
+		}
+	}
+
 	if (!pronounsRaw) return null;
 	let text = sanitizeHtml(pronounsRaw, { allowedTags: [], allowedAttributes: {} });
 	// If one of pronoun URLs matches, overwrite the current known value.
@@ -101,3 +119,114 @@ function sanitizePronounPageValue(val) {
 	if (val === "no-pronouns") val = "no pronouns";
 	return val;
 }
+
+const knownPronouns = [
+	"ae",
+	"aer",
+	"aers",
+	"aerself",
+	"co",
+	"co's",
+	"cos",
+	"coself",
+	"e",
+	"eir",
+	"eirs",
+	"em",
+	"ems",
+	"emself",
+	"es",
+	"ey",
+	"fae",
+	"faer",
+	"faers",
+	"faerself",
+	"he",
+	"her",
+	"hers",
+	"herself",
+	"him",
+	"himself",
+	"hir",
+	"hirs",
+	"hirself",
+	"his",
+	"hu",
+	"hum",
+	"hus",
+	"huself",
+	"it",
+	"its",
+	"itself",
+	"ne",
+	"nem",
+	"nemself",
+	"nir",
+	"nirs",
+	"nirself",
+	"one",
+	"one's",
+	"oneself",
+	"per",
+	"pers",
+	"perself",
+	"s/he",
+	"she",
+	"their",
+	"theirs",
+	"them",
+	"themself",
+	"themselves",
+	"they",
+	"thon",
+	"thon's",
+	"thons",
+	"thonself",
+	"ve",
+	"ver",
+	"vers",
+	"verself",
+	"vi",
+	"vim",
+	"vims",
+	"vimself",
+	"vir",
+	"virs",
+	"virself",
+	"vis",
+	"xe",
+	"xem",
+	"xemself",
+	"xyr",
+	"xyrs",
+	"ze",
+	"zhe",
+	"zher",
+	"zhers",
+	"zherself",
+	"zir",
+	"zirs",
+	"zirself",
+];
+
+/**
+ * Tries to extract pronouns from the bio/note. Only "known" pronouns are returned, which is
+ * a compromise for the pattern matching. At no point do we want to limit the pronouns used by persons.
+ * @param {string} bio The bio; letter case is ignored when checking the allowlist
+ * @returns {string|null} The result as written in the bio, or null
+ */
+function extractFromBio(bio) {
+	const exactMatches = bio.matchAll(/(\w+)\/(\w+)/gi);
+	for (const [match, ...parts] of exactMatches) {
+		if (parts.every((part) => knownPronouns.includes(part.toLowerCase()))) {
+			return match;
+		}
+	}
+
+	const followedByColon = bio.matchAll(/pronouns?:\W+([\w/+]+)/gi);
+	for (const match of followedByColon) {
+		return match.pop(); // first group is last entry in array
+	}
+
+	return null;
+}
diff --git a/tests/extractPronouns.spec.js b/tests/extractPronouns.spec.js
index 0930064..3299c59 100644
--- a/tests/extractPronouns.spec.js
+++ b/tests/extractPronouns.spec.js
@@ -71,3 +71,21 @@ for (const [input, expects] of valueExtractionTests) {
 }
 
 valueExtractionSuite.run();
+
+const bioExtractSuite = suite("bio extraction");
+const bioExtractTests = [
+	["I'm cute and my pronouns are she/her", "she/her"], // exact match
+	["my pronouns are helicopter/joke", null], // not on allowlist
+	["pronouns: uwu/owo", "uwu/owo"], // followed by pronoun pattern
+	["pronouns: any", "any"], // followed by pronoun pattern,
+];
+for (const [input, expects] of bioExtractTests) {
+	bioExtractSuite(input, async () => {
+		const result = await pronouns.extractFromStatus({
+			account: { note: input },
+		});
+		assert.equal(result, expects);
+	});
+}
+
+bioExtractSuite.run();