From 536d9e482d4ebc012855372b9fcfa4f022d1618a Mon Sep 17 00:00:00 2001 From: tobi <31960611+tsmethurst@users.noreply.github.com> Date: Fri, 29 Sep 2023 10:39:56 +0200 Subject: [PATCH] [chore/bugfix] Deinterface text.Formatter, allow underscores in hashtags (#2233) --- internal/processing/account/account.go | 2 +- internal/processing/status/create.go | 2 +- internal/processing/status/status.go | 2 +- internal/text/emojionly.go | 70 ---- internal/text/formatter.go | 34 +- internal/text/formatter_test.go | 32 +- internal/text/goldmark_custom_renderer.go | 423 ++++++++++++++++++++++ internal/text/goldmark_extension.go | 313 ---------------- internal/text/goldmark_parsers.go | 281 ++++++++++++++ internal/text/goldmark_plaintext.go | 4 +- internal/text/markdown.go | 53 ++- internal/text/markdown_test.go | 27 +- internal/text/normalize.go | 34 +- internal/text/plain.go | 176 ++++++--- internal/text/plain_test.go | 51 +-- internal/text/replace.go | 161 -------- internal/text/util.go | 51 +++ internal/util/statustools.go | 37 -- 18 files changed, 1040 insertions(+), 713 deletions(-) delete mode 100644 internal/text/emojionly.go create mode 100644 internal/text/goldmark_custom_renderer.go delete mode 100644 internal/text/goldmark_extension.go create mode 100644 internal/text/goldmark_parsers.go delete mode 100644 internal/text/replace.go create mode 100644 internal/text/util.go delete mode 100644 internal/util/statustools.go diff --git a/internal/processing/account/account.go b/internal/processing/account/account.go index 4432fd5f3..06caffaec 100644 --- a/internal/processing/account/account.go +++ b/internal/processing/account/account.go @@ -41,7 +41,7 @@ type Processor struct { mediaManager *media.Manager oauthServer oauth.Server filter *visibility.Filter - formatter text.Formatter + formatter *text.Formatter federator federation.Federator parseMention gtsmodel.ParseMentionFunc } diff --git a/internal/processing/status/create.go b/internal/processing/status/create.go index d671ea8c4..4d4f7c574 100644 --- a/internal/processing/status/create.go +++ b/internal/processing/status/create.go @@ -277,7 +277,7 @@ func processLanguage(ctx context.Context, form *apimodel.AdvancedStatusCreateFor return nil } -func processContent(ctx context.Context, dbService db.DB, formatter text.Formatter, parseMention gtsmodel.ParseMentionFunc, form *apimodel.AdvancedStatusCreateForm, accountID string, status *gtsmodel.Status) error { +func processContent(ctx context.Context, dbService db.DB, formatter *text.Formatter, parseMention gtsmodel.ParseMentionFunc, form *apimodel.AdvancedStatusCreateForm, accountID string, status *gtsmodel.Status) error { // if there's nothing in the status at all we can just return early if form.Status == "" { status.Content = "" diff --git a/internal/processing/status/status.go b/internal/processing/status/status.go index bd8457eb8..432f945fc 100644 --- a/internal/processing/status/status.go +++ b/internal/processing/status/status.go @@ -31,7 +31,7 @@ type Processor struct { federator federation.Federator converter *typeutils.Converter filter *visibility.Filter - formatter text.Formatter + formatter *text.Formatter parseMention gtsmodel.ParseMentionFunc } diff --git a/internal/text/emojionly.go b/internal/text/emojionly.go deleted file mode 100644 index f4f200b21..000000000 --- a/internal/text/emojionly.go +++ /dev/null @@ -1,70 +0,0 @@ -// GoToSocial -// Copyright (C) GoToSocial Authors admin@gotosocial.org -// SPDX-License-Identifier: AGPL-3.0-or-later -// -// This program is free software: you can redistribute it and/or modify -// it under the terms of the GNU Affero General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// This program is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Affero General Public License for more details. -// -// You should have received a copy of the GNU Affero General Public License -// along with this program. If not, see . - -package text - -import ( - "bytes" - "context" - - "github.com/superseriousbusiness/gotosocial/internal/gtsmodel" - "github.com/superseriousbusiness/gotosocial/internal/log" - "github.com/yuin/goldmark" - "github.com/yuin/goldmark/parser" - "github.com/yuin/goldmark/renderer/html" - "github.com/yuin/goldmark/util" -) - -func (f *formatter) FromPlainEmojiOnly(ctx context.Context, pmf gtsmodel.ParseMentionFunc, authorID string, statusID string, plain string) *FormatResult { - result := &FormatResult{ - Mentions: []*gtsmodel.Mention{}, - Tags: []*gtsmodel.Tag{}, - Emojis: []*gtsmodel.Emoji{}, - } - // parse markdown text into html, using custom renderer to add hashtag/mention links - md := goldmark.New( - goldmark.WithRendererOptions( - html.WithXHTML(), - html.WithHardWraps(), - ), - goldmark.WithParser( - parser.NewParser( - parser.WithBlockParsers( - util.Prioritized(newPlaintextParser(), 500), - ), - ), - ), - goldmark.WithExtensions( - &customRenderer{f, ctx, pmf, authorID, statusID, true, result}, - ), - ) - - var htmlContentBytes bytes.Buffer - err := md.Convert([]byte(plain), &htmlContentBytes) - if err != nil { - log.Errorf(ctx, "error formatting plaintext to HTML: %s", err) - } - result.HTML = htmlContentBytes.String() - - // clean anything dangerous out of the HTML - result.HTML = SanitizeToHTML(result.HTML) - - // shrink ray - result.HTML = MinifyHTML(result.HTML) - - return result -} diff --git a/internal/text/formatter.go b/internal/text/formatter.go index 0e5e0b554..8f7e6e1f6 100644 --- a/internal/text/formatter.go +++ b/internal/text/formatter.go @@ -24,29 +24,25 @@ import ( "github.com/superseriousbusiness/gotosocial/internal/gtsmodel" ) -// Formatter wraps some logic and functions for parsing statuses and other text input into nice html. -// Each of the member functions returns a struct containing the formatted HTML and any tags, mentions, and -// emoji that were found in the text. -type Formatter interface { - // FromPlain parses an HTML text from a plaintext. - FromPlain(ctx context.Context, pmf gtsmodel.ParseMentionFunc, authorID string, statusID string, plain string) *FormatResult - // FromPlainNoParagraph parses an HTML text from a plaintext, without wrapping the resulting text in

tags. - FromPlainNoParagraph(ctx context.Context, pmf gtsmodel.ParseMentionFunc, authorID string, statusID string, plain string) *FormatResult - // FromMarkdown parses an HTML text from a markdown-formatted text. - FromMarkdown(ctx context.Context, pmf gtsmodel.ParseMentionFunc, authorID string, statusID string, md string) *FormatResult - // FromPlainEmojiOnly parses an HTML text from a plaintext, only parsing emojis and not mentions etc. - FromPlainEmojiOnly(ctx context.Context, pmf gtsmodel.ParseMentionFunc, authorID string, statusID string, plain string) *FormatResult -} +// FormatFunc is fulfilled by FromPlain, +// FromPlainNoParagraph, and FromMarkdown. +type FormatFunc func( + ctx context.Context, + parseMention gtsmodel.ParseMentionFunc, + authorID string, + statusID string, + text string, +) *FormatResult -type FormatFunc func(ctx context.Context, pmf gtsmodel.ParseMentionFunc, authorID string, statusID string, text string) *FormatResult - -type formatter struct { +// Formatter wraps logic and functions for parsing +// statuses and other text input into nice html. +type Formatter struct { db db.DB } -// NewFormatter returns a new Formatter interface for parsing statuses and other text input into nice html. -func NewFormatter(db db.DB) Formatter { - return &formatter{ +// NewFormatter returns a new Formatter. +func NewFormatter(db db.DB) *Formatter { + return &Formatter{ db: db, } } diff --git a/internal/text/formatter_test.go b/internal/text/formatter_test.go index 403ba8e8e..cce9970b2 100644 --- a/internal/text/formatter_test.go +++ b/internal/text/formatter_test.go @@ -48,7 +48,7 @@ type TextStandardTestSuite struct { testEmojis map[string]*gtsmodel.Emoji // module being tested - formatter text.Formatter + formatter *text.Formatter } func (suite *TextStandardTestSuite) SetupSuite() { @@ -85,14 +85,32 @@ func (suite *TextStandardTestSuite) TearDownTest() { testrig.StandardDBTeardown(suite.db) } -func (suite *TextStandardTestSuite) FromMarkdown(text string) *text.FormatResult { - return suite.formatter.FromMarkdown(context.Background(), suite.parseMention, suite.testAccounts["local_account_1"].ID, "status_ID", text) +func (suite *TextStandardTestSuite) FromMarkdown(input string) *text.FormatResult { + return suite.formatter.FromMarkdown( + context.Background(), + suite.parseMention, + suite.testAccounts["local_account_1"].ID, + "dummy_status_ID", + input, + ) } -func (suite *TextStandardTestSuite) FromPlain(text string) *text.FormatResult { - return suite.formatter.FromPlain(context.Background(), suite.parseMention, suite.testAccounts["local_account_1"].ID, "status_ID", text) +func (suite *TextStandardTestSuite) FromPlain(input string) *text.FormatResult { + return suite.formatter.FromPlain( + context.Background(), + suite.parseMention, + suite.testAccounts["local_account_1"].ID, + "dummy_status_ID", + input, + ) } -func (suite *TextStandardTestSuite) FromPlainNoParagraph(text string) *text.FormatResult { - return suite.formatter.FromPlainNoParagraph(context.Background(), suite.parseMention, suite.testAccounts["local_account_1"].ID, "status_ID", text) +func (suite *TextStandardTestSuite) FromPlainNoParagraph(input string) *text.FormatResult { + return suite.formatter.FromPlainNoParagraph( + context.Background(), + suite.parseMention, + suite.testAccounts["local_account_1"].ID, + "dummmy_status_ID", + input, + ) } diff --git a/internal/text/goldmark_custom_renderer.go b/internal/text/goldmark_custom_renderer.go new file mode 100644 index 000000000..438692577 --- /dev/null +++ b/internal/text/goldmark_custom_renderer.go @@ -0,0 +1,423 @@ +// GoToSocial +// Copyright (C) GoToSocial Authors admin@gotosocial.org +// SPDX-License-Identifier: AGPL-3.0-or-later +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU Affero General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Affero General Public License for more details. +// +// You should have received a copy of the GNU Affero General Public License +// along with this program. If not, see . + +package text + +import ( + "context" + "errors" + "strings" + + "github.com/superseriousbusiness/gotosocial/internal/db" + "github.com/superseriousbusiness/gotosocial/internal/gtscontext" + "github.com/superseriousbusiness/gotosocial/internal/gtserror" + "github.com/superseriousbusiness/gotosocial/internal/gtsmodel" + "github.com/superseriousbusiness/gotosocial/internal/id" + "github.com/superseriousbusiness/gotosocial/internal/log" + "github.com/superseriousbusiness/gotosocial/internal/uris" + "github.com/yuin/goldmark" + "github.com/yuin/goldmark/ast" + "github.com/yuin/goldmark/parser" + "github.com/yuin/goldmark/renderer" + mdutil "github.com/yuin/goldmark/util" +) + +// customRenderer fulfils the following goldmark interfaces: +// +// - renderer.NodeRenderer +// - goldmark.Extender. +// +// It is used as a goldmark extension by FromMarkdown and +// (variants of) FromPlain. +// +// The custom renderer extracts and re-renders mentions, hashtags, +// and emojis that are encountered during parsing, writing out valid +// HTML representations of these elements. +// +// The customRenderer has the following side effects: +// +// - May use its db connection to retrieve existing and/or +// store new mentions, hashtags, and emojis. +// - May update its *FormatResult to append discovered +// mentions, hashtags, and emojis to it. +type customRenderer struct { + ctx context.Context + db db.DB + parseMention gtsmodel.ParseMentionFunc + accountID string + statusID string + emojiOnly bool + result *FormatResult +} + +func (cr *customRenderer) RegisterFuncs(reg renderer.NodeRendererFuncRegisterer) { + reg.Register(kindMention, cr.renderMention) + reg.Register(kindHashtag, cr.renderHashtag) + reg.Register(kindEmoji, cr.renderEmoji) +} + +func (cr *customRenderer) Extend(markdown goldmark.Markdown) { + // 1000 is set as the lowest + // priority, but it's arbitrary. + const prio = 1000 + + if cr.emojiOnly { + // Parse + render only emojis. + markdown.Parser().AddOptions( + parser.WithInlineParsers( + mdutil.Prioritized(new(emojiParser), prio), + ), + ) + } else { + // Parse + render emojis, mentions, hashtags. + markdown.Parser().AddOptions(parser.WithInlineParsers( + mdutil.Prioritized(new(emojiParser), prio), + mdutil.Prioritized(new(mentionParser), prio), + mdutil.Prioritized(new(hashtagParser), prio), + )) + } + + // Add this custom renderer. + markdown.Renderer().AddOptions( + renderer.WithNodeRenderers( + mdutil.Prioritized(cr, prio), + ), + ) +} + +/* + MENTION RENDERING STUFF +*/ + +// renderMention takes a mention +// ast.Node and renders it as HTML. +func (cr *customRenderer) renderMention( + w mdutil.BufWriter, + source []byte, + node ast.Node, + entering bool, +) (ast.WalkStatus, error) { + if !entering { + return ast.WalkSkipChildren, nil + } + + // This function is registered + // only for kindMention, and + // should not be called for + // any other node type. + n, ok := node.(*mention) + if !ok { + log.Panic(cr.ctx, "type assertion failed") + } + + // Get raw mention string eg., '@someone@domain.org'. + text := string(n.Segment.Value(source)) + + // Handle mention and get text to render. + text = cr.handleMention(text) + + // Write returned text into HTML. + if _, err := w.WriteString(text); err != nil { + // We don't have much recourse if this fails. + log.Errorf(cr.ctx, "error writing HTML: %s", err) + } + + return ast.WalkSkipChildren, nil +} + +// handleMention takes a string in the form '@username@domain.com' +// or '@localusername', and does the following: +// +// - Parse the mention string into a *gtsmodel.Mention. +// - Insert mention into database if necessary. +// - Add mention to cr.results.Mentions slice. +// - Return mention rendered as nice HTML. +// +// If the mention is invalid or cannot be created, +// the unaltered input text will be returned instead. +func (cr *customRenderer) handleMention(text string) string { + mention, err := cr.parseMention(cr.ctx, text, cr.accountID, cr.statusID) + if err != nil { + log.Errorf(cr.ctx, "error parsing mention %s from status: %s", text, err) + return text + } + + if cr.statusID != "" { + if err := cr.db.PutMention(cr.ctx, mention); err != nil { + log.Errorf(cr.ctx, "error putting mention in db: %s", err) + return text + } + } + + // Append mention to result if not done already. + // + // This prevents multiple occurences of mention + // in the same status generating multiple + // entries for the same mention in result. + func() { + for _, m := range cr.result.Mentions { + if mention.TargetAccountID == m.TargetAccountID { + // Already appended. + return + } + } + + // Not appended yet. + cr.result.Mentions = append(cr.result.Mentions, mention) + }() + + if mention.TargetAccount == nil { + // Fetch mention target account if not yet populated. + mention.TargetAccount, err = cr.db.GetAccountByID( + gtscontext.SetBarebones(cr.ctx), + mention.TargetAccountID, + ) + if err != nil { + log.Errorf(cr.ctx, "error populating mention target account: %v", err) + return text + } + } + + // Replace the mention with the formatted mention content, + // eg. `@someone@domain.org` becomes: + // `@someone` + var b strings.Builder + b.WriteString(`@`) + b.WriteString(mention.TargetAccount.Username) + b.WriteString(``) + return b.String() +} + +/* + HASHTAG RENDERING STUFF +*/ + +// renderHashtag takes a hashtag +// ast.Node and renders it as HTML. +func (cr *customRenderer) renderHashtag( + w mdutil.BufWriter, + source []byte, + node ast.Node, + entering bool, +) (ast.WalkStatus, error) { + if !entering { + return ast.WalkSkipChildren, nil + } + + // This function is registered + // only for kindHashtag, and + // should not be called for + // any other node type. + n, ok := node.(*hashtag) + if !ok { + log.Panic(cr.ctx, "type assertion failed") + } + + // Get raw hashtag string eg., '#SomeHashtag'. + text := string(n.Segment.Value(source)) + + // Handle hashtag and get text to render. + text = cr.handleHashtag(text) + + // Write returned text into HTML. + if _, err := w.WriteString(text); err != nil { + // We don't have much recourse if this fails. + log.Errorf(cr.ctx, "error writing HTML: %s", err) + } + + return ast.WalkSkipChildren, nil +} + +// handleHashtag takes a string in the form '#SomeHashtag', +// and does the following: +// +// - Normalize + validate the hashtag. +// - Get or create hashtag in the db. +// - Add hashtag to cr.results.Tags slice. +// - Return hashtag rendered as nice HTML. +// +// If the hashtag is invalid or cannot be retrieved, +// the unaltered input text will be returned instead. +func (cr *customRenderer) handleHashtag(text string) string { + normalized, ok := NormalizeHashtag(text) + if !ok { + // Not a valid hashtag. + return text + } + + getOrCreateHashtag := func(name string) (*gtsmodel.Tag, error) { + var ( + tag *gtsmodel.Tag + err error + ) + + // Check if we have a tag with this name already. + tag, err = cr.db.GetTagByName(cr.ctx, name) + if err != nil && !errors.Is(err, db.ErrNoEntries) { + return nil, gtserror.Newf("db error getting tag %s: %w", name, err) + } + + if tag != nil { + // We had it! + return tag, nil + } + + // We didn't have a tag with + // this name, create one. + tag = >smodel.Tag{ + ID: id.NewULID(), + Name: name, + } + + if err = cr.db.PutTag(cr.ctx, tag); err != nil { + return nil, gtserror.Newf("db error putting new tag %s: %w", name, err) + } + + return tag, nil + } + + tag, err := getOrCreateHashtag(normalized) + if err != nil { + log.Errorf(cr.ctx, "error generating hashtags from status: %s", err) + return text + } + + // Append tag to result if not done already. + // + // This prevents multiple uses of a tag in + // the same status generating multiple + // entries for the same tag in result. + func() { + for _, t := range cr.result.Tags { + if tag.ID == t.ID { + // Already appended. + return + } + } + + // Not appended yet. + cr.result.Tags = append(cr.result.Tags, tag) + }() + + // Replace tag with the formatted tag content, eg. `#SomeHashtag` becomes: + // `` + var b strings.Builder + b.WriteString(``) + + return b.String() +} + +/* + EMOJI RENDERING STUFF +*/ + +// renderEmoji doesn't actually turn an emoji +// ast.Node into HTML, but instead only adds it to +// the custom renderer results for later processing. +func (cr *customRenderer) renderEmoji( + w mdutil.BufWriter, + source []byte, + node ast.Node, + entering bool, +) (ast.WalkStatus, error) { + if !entering { + return ast.WalkSkipChildren, nil + } + + // This function is registered + // only for kindEmoji, and + // should not be called for + // any other node type. + n, ok := node.(*emoji) + if !ok { + log.Panic(cr.ctx, "type assertion failed") + } + + // Get raw emoji string eg., ':boobs:'. + text := string(n.Segment.Value(source)) + + // Handle emoji and get text to render. + text = cr.handleEmoji(text) + + // Write returned text into HTML. + if _, err := w.WriteString(text); err != nil { + // We don't have much recourse if this fails. + log.Errorf(cr.ctx, "error writing HTML: %s", err) + } + + return ast.WalkSkipChildren, nil +} + +// handleEmoji takes a string in the form ':some_emoji:', +// and does the following: +// +// - Try to get emoji from the db. +// - Add emoji to cr.results.Emojis slice if found and useable. +// +// This function will always return the unaltered input +// text, since emojification is handled elsewhere. +func (cr *customRenderer) handleEmoji(text string) string { + // Check if text points to a valid + // local emoji by using its shortcode. + // + // The shortcode is the text + // between enclosing ':' chars. + shortcode := strings.Trim(text, ":") + + // Try to fetch emoji as a locally stored emoji. + emoji, err := cr.db.GetEmojiByShortcodeDomain(cr.ctx, shortcode, "") + if err != nil && !errors.Is(err, db.ErrNoEntries) { + log.Errorf(nil, "db error getting local emoji with shortcode %s: %s", shortcode, err) + } + + if emoji == nil { + // No emoji found for this + // shortcode, oh well! + return text + } + + if *emoji.Disabled || !*emoji.VisibleInPicker { + // Emoji was found but not useable. + return text + } + + // Emoji was found and useable. + // Append to result if not done already. + // + // This prevents multiple uses of an emoji + // in the same status generating multiple + // entries for the same emoji in result. + func() { + for _, e := range cr.result.Emojis { + if emoji.Shortcode == e.Shortcode { + // Already appended. + return + } + } + + // Not appended yet. + cr.result.Emojis = append(cr.result.Emojis, emoji) + }() + + return text +} diff --git a/internal/text/goldmark_extension.go b/internal/text/goldmark_extension.go deleted file mode 100644 index a12c618dc..000000000 --- a/internal/text/goldmark_extension.go +++ /dev/null @@ -1,313 +0,0 @@ -// GoToSocial -// Copyright (C) GoToSocial Authors admin@gotosocial.org -// SPDX-License-Identifier: AGPL-3.0-or-later -// -// This program is free software: you can redistribute it and/or modify -// it under the terms of the GNU Affero General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// This program is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Affero General Public License for more details. -// -// You should have received a copy of the GNU Affero General Public License -// along with this program. If not, see . - -package text - -import ( - "context" - "fmt" - "strings" - - "github.com/superseriousbusiness/gotosocial/internal/db" - "github.com/superseriousbusiness/gotosocial/internal/gtsmodel" - "github.com/superseriousbusiness/gotosocial/internal/log" - "github.com/superseriousbusiness/gotosocial/internal/regexes" - "github.com/superseriousbusiness/gotosocial/internal/util" - "github.com/yuin/goldmark" - "github.com/yuin/goldmark/ast" - "github.com/yuin/goldmark/parser" - "github.com/yuin/goldmark/renderer" - "github.com/yuin/goldmark/text" - mdutil "github.com/yuin/goldmark/util" -) - -// A goldmark extension that parses potential mentions and hashtags separately from regular -// text, so that they stay as one contiguous text fragment in the AST, and then renders -// them separately too, to avoid scanning normal text for mentions and tags. - -// mention and hashtag fulfil the goldmark ast.Node interface. -type mention struct { - ast.BaseInline - Segment text.Segment -} - -type hashtag struct { - ast.BaseInline - Segment text.Segment -} - -type emoji struct { - ast.BaseInline - Segment text.Segment -} - -var ( - kindMention = ast.NewNodeKind("Mention") - kindHashtag = ast.NewNodeKind("Hashtag") - kindEmoji = ast.NewNodeKind("Emoji") -) - -func (n *mention) Kind() ast.NodeKind { - return kindMention -} - -func (n *hashtag) Kind() ast.NodeKind { - return kindHashtag -} - -func (n *emoji) Kind() ast.NodeKind { - return kindEmoji -} - -// Dump can be used for debugging. -func (n *mention) Dump(source []byte, level int) { - fmt.Printf("%sMention: %s\n", strings.Repeat(" ", level), string(n.Segment.Value(source))) -} - -func (n *hashtag) Dump(source []byte, level int) { - fmt.Printf("%sHashtag: %s\n", strings.Repeat(" ", level), string(n.Segment.Value(source))) -} - -func (n *emoji) Dump(source []byte, level int) { - fmt.Printf("%sEmoji: %s\n", strings.Repeat(" ", level), string(n.Segment.Value(source))) -} - -// newMention and newHashtag create a goldmark ast.Node from a goldmark text.Segment. -// The contained segment is used in rendering. -func newMention(s text.Segment) *mention { - return &mention{ - BaseInline: ast.BaseInline{}, - Segment: s, - } -} - -func newHashtag(s text.Segment) *hashtag { - return &hashtag{ - BaseInline: ast.BaseInline{}, - Segment: s, - } -} - -func newEmoji(s text.Segment) *emoji { - return &emoji{ - BaseInline: ast.BaseInline{}, - Segment: s, - } -} - -// mentionParser and hashtagParser fulfil the goldmark parser.InlineParser interface. -type mentionParser struct{} - -type hashtagParser struct{} - -type emojiParser struct{} - -func (p *mentionParser) Trigger() []byte { - return []byte{'@'} -} - -func (p *hashtagParser) Trigger() []byte { - return []byte{'#'} -} - -func (p *emojiParser) Trigger() []byte { - return []byte{':'} -} - -func (p *mentionParser) Parse(parent ast.Node, block text.Reader, pc parser.Context) ast.Node { - before := block.PrecendingCharacter() - line, segment := block.PeekLine() - - if !util.IsMentionOrHashtagBoundary(before) { - return nil - } - - // unideal for performance but makes use of existing regex - loc := regexes.MentionFinder.FindIndex(line) - switch { - case loc == nil: - fallthrough - case loc[0] != 0: // fail if not found at start - return nil - default: - block.Advance(loc[1]) - return newMention(segment.WithStop(segment.Start + loc[1])) - } -} - -func (p *hashtagParser) Parse(parent ast.Node, block text.Reader, pc parser.Context) ast.Node { - before := block.PrecendingCharacter() - line, segment := block.PeekLine() - s := string(line) - - if !util.IsMentionOrHashtagBoundary(before) || len(s) == 1 { - return nil - } - - for i, r := range s { - switch { - case r == '#' && i == 0: - // ignore initial # - continue - case !util.IsPlausiblyInHashtag(r) && !util.IsMentionOrHashtagBoundary(r): - // Fake hashtag, don't trust it - return nil - case util.IsMentionOrHashtagBoundary(r): - if i <= 1 { - // empty - return nil - } - // End of hashtag - block.Advance(i) - return newHashtag(segment.WithStop(segment.Start + i)) - } - } - // If we don't find invalid characters before the end of the line then it's all hashtag, babey - block.Advance(segment.Len()) - return newHashtag(segment) -} - -func (p *emojiParser) Parse(parent ast.Node, block text.Reader, pc parser.Context) ast.Node { - line, segment := block.PeekLine() - - // unideal for performance but makes use of existing regex - loc := regexes.EmojiFinder.FindIndex(line) - switch { - case loc == nil: - fallthrough - case loc[0] != 0: // fail if not found at start - return nil - default: - block.Advance(loc[1]) - return newEmoji(segment.WithStop(segment.Start + loc[1])) - } -} - -// customRenderer fulfils both the renderer.NodeRenderer and goldmark.Extender interfaces. -// It is created in FromMarkdown and FromPlain to be used as a goldmark extension, and the -// fields are used to report tags and mentions to the caller for use as metadata. -type customRenderer struct { - f *formatter - ctx context.Context - parseMention gtsmodel.ParseMentionFunc - accountID string - statusID string - emojiOnly bool - result *FormatResult -} - -func (r *customRenderer) RegisterFuncs(reg renderer.NodeRendererFuncRegisterer) { - reg.Register(kindMention, r.renderMention) - reg.Register(kindHashtag, r.renderHashtag) - reg.Register(kindEmoji, r.renderEmoji) -} - -func (r *customRenderer) Extend(m goldmark.Markdown) { - // 1000 is set as the lowest priority, but it's arbitrary - m.Parser().AddOptions(parser.WithInlineParsers( - mdutil.Prioritized(&emojiParser{}, 1000), - )) - if !r.emojiOnly { - m.Parser().AddOptions(parser.WithInlineParsers( - mdutil.Prioritized(&mentionParser{}, 1000), - mdutil.Prioritized(&hashtagParser{}, 1000), - )) - } - m.Renderer().AddOptions(renderer.WithNodeRenderers( - mdutil.Prioritized(r, 1000), - )) -} - -// renderMention and renderHashtag take a mention or a hashtag ast.Node and render it as HTML. -func (r *customRenderer) renderMention(w mdutil.BufWriter, source []byte, node ast.Node, entering bool) (ast.WalkStatus, error) { - if !entering { - return ast.WalkSkipChildren, nil - } - - n, ok := node.(*mention) // this function is only registered for kindMention - if !ok { - log.Panic(r.ctx, "type assertion failed") - } - text := string(n.Segment.Value(source)) - - html := r.replaceMention(text) - - // we don't have much recourse if this fails - if _, err := w.WriteString(html); err != nil { - log.Errorf(r.ctx, "error writing HTML: %s", err) - } - return ast.WalkSkipChildren, nil -} - -func (r *customRenderer) renderHashtag(w mdutil.BufWriter, source []byte, node ast.Node, entering bool) (ast.WalkStatus, error) { - if !entering { - return ast.WalkSkipChildren, nil - } - - n, ok := node.(*hashtag) // this function is only registered for kindHashtag - if !ok { - log.Panic(r.ctx, "type assertion failed") - } - text := string(n.Segment.Value(source)) - - html := r.replaceHashtag(text) - - _, err := w.WriteString(html) - // we don't have much recourse if this fails - if err != nil { - log.Errorf(r.ctx, "error writing HTML: %s", err) - } - return ast.WalkSkipChildren, nil -} - -// renderEmoji doesn't turn an emoji into HTML, but adds it to the metadata. -func (r *customRenderer) renderEmoji(w mdutil.BufWriter, source []byte, node ast.Node, entering bool) (ast.WalkStatus, error) { - if !entering { - return ast.WalkSkipChildren, nil - } - - n, ok := node.(*emoji) // this function is only registered for kindEmoji - if !ok { - log.Panic(r.ctx, "type assertion failed") - } - text := string(n.Segment.Value(source)) - shortcode := text[1 : len(text)-1] - - emoji, err := r.f.db.GetEmojiByShortcodeDomain(r.ctx, shortcode, "") - if err != nil { - if err != db.ErrNoEntries { - log.Errorf(nil, "error getting local emoji with shortcode %s: %s", shortcode, err) - } - } else if *emoji.VisibleInPicker && !*emoji.Disabled { - listed := false - for _, e := range r.result.Emojis { - if e.Shortcode == emoji.Shortcode { - listed = true - break - } - } - if !listed { - r.result.Emojis = append(r.result.Emojis, emoji) - } - } - - // we don't have much recourse if this fails - if _, err := w.WriteString(text); err != nil { - log.Errorf(r.ctx, "error writing HTML: %s", err) - } - return ast.WalkSkipChildren, nil -} diff --git a/internal/text/goldmark_parsers.go b/internal/text/goldmark_parsers.go new file mode 100644 index 000000000..b7cf4f9e9 --- /dev/null +++ b/internal/text/goldmark_parsers.go @@ -0,0 +1,281 @@ +// GoToSocial +// Copyright (C) GoToSocial Authors admin@gotosocial.org +// SPDX-License-Identifier: AGPL-3.0-or-later +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU Affero General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Affero General Public License for more details. +// +// You should have received a copy of the GNU Affero General Public License +// along with this program. If not, see . + +package text + +import ( + "fmt" + "strings" + + "github.com/superseriousbusiness/gotosocial/internal/regexes" + "github.com/yuin/goldmark/ast" + "github.com/yuin/goldmark/parser" + "github.com/yuin/goldmark/text" +) + +/* + MENTION PARSER STUFF +*/ + +// mention fulfils the goldmark +// ast.Node interface. +type mention struct { + ast.BaseInline + Segment text.Segment +} + +var kindMention = ast.NewNodeKind("Mention") + +func (n *mention) Kind() ast.NodeKind { + return kindMention +} + +func (n *mention) Dump(source []byte, level int) { + fmt.Printf("%sMention: %s\n", strings.Repeat(" ", level), string(n.Segment.Value(source))) +} + +// newMention creates a goldmark ast.Node +// from a text.Segment. The contained segment +// is used in rendering. +func newMention(s text.Segment) *mention { + return &mention{ + BaseInline: ast.BaseInline{}, + Segment: s, + } +} + +// mentionParser fulfils the goldmark +// parser.InlineParser interface. +type mentionParser struct{} + +// Mention parsing is triggered by the `@` symbol +// which appears at the beginning of a mention. +func (p *mentionParser) Trigger() []byte { + return []byte{'@'} +} + +func (p *mentionParser) Parse( + _ ast.Node, + block text.Reader, + _ parser.Context, +) ast.Node { + // If preceding character is not a valid boundary + // character, then this cannot be a valid mention. + if !isMentionBoundary(block.PrecendingCharacter()) { + return nil + } + + line, segment := block.PeekLine() + + // Ascertain location of mention in the line + // that starts with the trigger character. + loc := regexes.MentionFinder.FindIndex(line) + if loc == nil || loc[0] != 0 { + // Noop if not found or + // not found at start. + return nil + } + + // Advance the block to + // the end of the mention. + block.Advance(loc[1]) + + // mention ast.Node spans from the + // beginning of this segment up to + // the last character of the mention. + return newMention( + segment.WithStop( + segment.Start + loc[1], + ), + ) +} + +/* + HASHTAG PARSER STUFF +*/ + +// hashtag fulfils the goldmark +// ast.Node interface. +type hashtag struct { + ast.BaseInline + Segment text.Segment +} + +var kindHashtag = ast.NewNodeKind("Hashtag") + +func (n *hashtag) Kind() ast.NodeKind { + return kindHashtag +} + +func (n *hashtag) Dump(source []byte, level int) { + fmt.Printf("%sHashtag: %s\n", strings.Repeat(" ", level), string(n.Segment.Value(source))) +} + +// newHashtag creates a goldmark ast.Node +// from a text.Segment. The contained segment +// is used in rendering. +func newHashtag(s text.Segment) *hashtag { + return &hashtag{ + BaseInline: ast.BaseInline{}, + Segment: s, + } +} + +type hashtagParser struct{} + +// Hashtag parsing is triggered by a '#' symbol +// which appears at the beginning of a hashtag. +func (p *hashtagParser) Trigger() []byte { + return []byte{'#'} +} + +func (p *hashtagParser) Parse( + _ ast.Node, + block text.Reader, + _ parser.Context, +) ast.Node { + // If preceding character is not a valid boundary + // character, then this cannot be a valid hashtag. + if !isHashtagBoundary(block.PrecendingCharacter()) { + return nil + } + + var ( + line, segment = block.PeekLine() + lineStr = string(line) + lineStrLen = len(lineStr) + ) + + if lineStrLen <= 1 { + // This is probably just + // a lonely '#' char. + return nil + } + + // Iterate through the runes in the detected + // hashtag string until we reach either: + // - A weird character (bad). + // - The end of the hashtag (ok). + // - The end of the string (also ok). + for i, r := range lineStr { + switch { + case r == '#' && i == 0: + // Ignore initial '#'. + continue + + case !isPlausiblyInHashtag(r) && + !isHashtagBoundary(r): + // Weird non-boundary character + // in the hashtag. Don't trust it. + return nil + + case isHashtagBoundary(r): + // Reached closing hashtag + // boundary. Advance block + // to the end of the hashtag. + block.Advance(i) + + // hashtag ast.Node spans from + // the beginning of this segment + // up to the boundary character. + return newHashtag( + segment.WithStop( + segment.Start + i, + ), + ) + } + } + + // No invalid or boundary characters before the + // end of the line: it's all hashtag, baby 😎 + // + // Advance block to the end of the segment. + block.Advance(segment.Len()) + + // hashtag ast.Node spans + // the entire segment. + return newHashtag(segment) +} + +/* + EMOJI PARSER STUFF +*/ + +// emoji fulfils the goldmark +// ast.Node interface. +type emoji struct { + ast.BaseInline + Segment text.Segment +} + +var kindEmoji = ast.NewNodeKind("Emoji") + +func (n *emoji) Kind() ast.NodeKind { + return kindEmoji +} + +func (n *emoji) Dump(source []byte, level int) { + fmt.Printf("%sEmoji: %s\n", strings.Repeat(" ", level), string(n.Segment.Value(source))) +} + +// newEmoji creates a goldmark ast.Node +// from a text.Segment. The contained +// segment is used in rendering. +func newEmoji(s text.Segment) *emoji { + return &emoji{ + BaseInline: ast.BaseInline{}, + Segment: s, + } +} + +type emojiParser struct{} + +// Emoji parsing is triggered by a ':' char +// which appears at the start of the emoji. +func (p *emojiParser) Trigger() []byte { + return []byte{':'} +} + +func (p *emojiParser) Parse( + _ ast.Node, + block text.Reader, + _ parser.Context, +) ast.Node { + line, segment := block.PeekLine() + + // Ascertain location of emoji in the line + // that starts with the trigger character. + loc := regexes.EmojiFinder.FindIndex(line) + if loc == nil || loc[0] != 0 { + // Noop if not found or + // not found at start. + return nil + } + + // Advance the block to + // the end of the emoji. + block.Advance(loc[1]) + + // emoji ast.Node spans from the + // beginning of this segment up to + // the last character of the emoji. + return newEmoji( + segment.WithStop( + segment.Start + loc[1], + ), + ) +} diff --git a/internal/text/goldmark_plaintext.go b/internal/text/goldmark_plaintext.go index 635fdfc33..a27328317 100644 --- a/internal/text/goldmark_plaintext.go +++ b/internal/text/goldmark_plaintext.go @@ -26,7 +26,7 @@ import ( // plaintextParser implements goldmark.parser.BlockParser type plaintextParser struct{} -var defaultPlaintextParser = &plaintextParser{} +var defaultPlaintextParser = new(plaintextParser) func newPlaintextParser() parser.BlockParser { return defaultPlaintextParser @@ -64,7 +64,7 @@ func (b *plaintextParser) CanAcceptIndentedLine() bool { // plaintextParserNoParagraph implements goldmark.parser.BlockParser type plaintextParserNoParagraph struct{} -var defaultPlaintextParserNoParagraph = &plaintextParserNoParagraph{} +var defaultPlaintextParserNoParagraph = new(plaintextParserNoParagraph) func newPlaintextParserNoParagraph() parser.BlockParser { return defaultPlaintextParserNoParagraph diff --git a/internal/text/markdown.go b/internal/text/markdown.go index ecc49673b..6fc1bd2f0 100644 --- a/internal/text/markdown.go +++ b/internal/text/markdown.go @@ -28,38 +28,55 @@ import ( "github.com/yuin/goldmark/renderer/html" ) -func (f *formatter) FromMarkdown(ctx context.Context, pmf gtsmodel.ParseMentionFunc, authorID string, statusID string, markdownText string) *FormatResult { - result := &FormatResult{ - Mentions: []*gtsmodel.Mention{}, - Tags: []*gtsmodel.Tag{}, - Emojis: []*gtsmodel.Emoji{}, - } +// FromMarkdown fulfils FormatFunc by parsing +// the given markdown input into a FormatResult. +func (f *Formatter) FromMarkdown( + ctx context.Context, + parseMention gtsmodel.ParseMentionFunc, + authorID string, + statusID string, + input string, +) *FormatResult { + result := new(FormatResult) - // parse markdown text into html, using custom renderer to add hashtag/mention links + // Instantiate goldmark parser for + // markdown, using custom renderer + // to add hashtag/mention links. md := goldmark.New( goldmark.WithRendererOptions( html.WithXHTML(), html.WithHardWraps(), - html.WithUnsafe(), // allows raw HTML + // Allows raw HTML. We sanitize + // at the end so this is OK. + html.WithUnsafe(), ), goldmark.WithExtensions( - &customRenderer{f, ctx, pmf, authorID, statusID, false, result}, - extension.Linkify, // turns URLs into links + &customRenderer{ + ctx, + f.db, + parseMention, + authorID, + statusID, + false, // emojiOnly = false. + result, + }, + extension.Linkify, // Turns URLs into links. extension.Strikethrough, ), ) - var htmlContentBytes bytes.Buffer - err := md.Convert([]byte(markdownText), &htmlContentBytes) - if err != nil { - log.Errorf(ctx, "error formatting markdown to HTML: %s", err) + // Parse input into HTML. + var htmlBytes bytes.Buffer + if err := md.Convert( + []byte(input), + &htmlBytes, + ); err != nil { + log.Errorf(ctx, "error formatting markdown input to HTML: %s", err) } - result.HTML = htmlContentBytes.String() - // clean anything dangerous out of the HTML + // Clean and shrink HTML. + result.HTML = htmlBytes.String() result.HTML = SanitizeToHTML(result.HTML) - - // shrink ray result.HTML = MinifyHTML(result.HTML) return result diff --git a/internal/text/markdown_test.go b/internal/text/markdown_test.go index cc466df6c..98ed3a96b 100644 --- a/internal/text/markdown_test.go +++ b/internal/text/markdown_test.go @@ -76,10 +76,16 @@ const ( mdWithLinkExpected = "

Check out this code, i heard it was written by a sloth https://github.com/superseriousbusiness/gotosocial

" mdObjectInCodeBlock = "@foss_satan@fossbros-anonymous.io this is how to mention a user\n```\n@the_mighty_zork hey bud! nice #ObjectOrientedProgramming software you've been writing lately! :rainbow:\n```\nhope that helps" mdObjectInCodeBlockExpected = "

@foss_satan this is how to mention a user

@the_mighty_zork hey bud! nice #ObjectOrientedProgramming software you've been writing lately! :rainbow:\n

hope that helps

" - mdItalicHashtag = "_#hashtag_" - mdItalicHashtagExpected = "

#hashtag

" - mdItalicHashtags = "_#hashtag #hashtag #hashtag_" - mdItalicHashtagsExpected = "

#hashtag #hashtag #hashtag

" + // Hashtags can be italicized but only with *, not _. + mdItalicHashtag = "*#hashtag*" + mdItalicHashtagExpected = "

#hashtag

" + mdItalicHashtags = "*#hashtag #hashtag #hashtag*" + mdItalicHashtagsExpected = "

#hashtag #hashtag #hashtag

" + // Hashtags can end with or contain _ but not start with it. + mdUnderscorePrefixHashtag = "_#hashtag" + mdUnderscorePrefixHashtagExpected = "

_#hashtag

" + mdUnderscoreSuffixHashtag = "#hashtag_" + mdUnderscoreSuffixHashtagExpected = "

#hashtag_

" // BEWARE: sneaky unicode business going on. // the first ö is one rune, the second ö is an o with a combining diacritic. mdUnnormalizedHashtag = "#hellöthere #hellöthere" @@ -194,6 +200,19 @@ func (suite *MarkdownTestSuite) TestParseItalicHashtags() { suite.Equal(mdItalicHashtagsExpected, formatted.HTML) } +func (suite *MarkdownTestSuite) TestParseHashtagUnderscorePrefix() { + formatted := suite.FromMarkdown(mdUnderscorePrefixHashtag) + suite.Equal(mdUnderscorePrefixHashtagExpected, formatted.HTML) + suite.Empty(formatted.Tags) +} + +func (suite *MarkdownTestSuite) TestParseHashtagUnderscoreSuffix() { + formatted := suite.FromMarkdown(mdUnderscoreSuffixHashtag) + suite.Equal(mdUnderscoreSuffixHashtagExpected, formatted.HTML) + suite.NotEmpty(formatted.Tags) + suite.Equal("hashtag_", formatted.Tags[0].Name) +} + func (suite *MarkdownTestSuite) TestParseUnnormalizedHashtag() { formatted := suite.FromMarkdown(mdUnnormalizedHashtag) suite.Equal(mdUnnormalizedHashtagExpected, formatted.HTML) diff --git a/internal/text/normalize.go b/internal/text/normalize.go index 14caf6311..d2e633d1e 100644 --- a/internal/text/normalize.go +++ b/internal/text/normalize.go @@ -20,7 +20,6 @@ package text import ( "strings" - "github.com/superseriousbusiness/gotosocial/internal/util" "golang.org/x/text/unicode/norm" ) @@ -36,8 +35,10 @@ const ( // // Finally, it will do a check on the normalized string to // ensure that it's below maximumHashtagLength chars, and -// contains only unicode letters and numbers. If this passes, -// returned bool will be true. +// contains only letters, numbers, and underscores (and not +// *JUST* underscores). +// +// If all this passes, returned bool will be true. func NormalizeHashtag(text string) (string, bool) { // This normalization is specifically to avoid cases // where visually-identical hashtags are stored with @@ -47,14 +48,31 @@ func NormalizeHashtag(text string) (string, bool) { // with parent characters to form regular letter symbols. normalized := norm.NFC.String(strings.TrimPrefix(text, "#")) - // Validate normalized. - ok := true + // Validate normalized result. + var ( + notJustUnderscores = false + onlyPermittedChars = true + lengthOK = true + ) + for i, r := range normalized { - if i >= maximumHashtagLength || !util.IsPermittedInHashtag(r) { - ok = false + if r != '_' { + // This isn't an underscore, + // so the whole hashtag isn't + // just underscores. + notJustUnderscores = true + } + + if i >= maximumHashtagLength { + lengthOK = false + break + } + + if !isPermittedInHashtag(r) { + onlyPermittedChars = false break } } - return normalized, ok + return normalized, (lengthOK && onlyPermittedChars && notJustUnderscores) } diff --git a/internal/text/plain.go b/internal/text/plain.go index 330ebfb15..1456fd016 100644 --- a/internal/text/plain.go +++ b/internal/text/plain.go @@ -30,66 +30,150 @@ import ( "github.com/yuin/goldmark/util" ) -func (f *formatter) fromPlain( +// FromPlain fulfils FormatFunc by parsing +// the given plaintext input into a FormatResult. +func (f *Formatter) FromPlain( ctx context.Context, - ptParser parser.Parser, - pmf gtsmodel.ParseMentionFunc, + parseMention gtsmodel.ParseMentionFunc, authorID string, statusID string, - plain string, + input string, ) *FormatResult { - result := &FormatResult{ - Mentions: []*gtsmodel.Mention{}, - Tags: []*gtsmodel.Tag{}, - Emojis: []*gtsmodel.Emoji{}, - } - - // Parse markdown into html, using custom renderer - // to add hashtag/mention links and emoji images. - md := goldmark.New( - goldmark.WithRendererOptions( - html.WithXHTML(), - html.WithHardWraps(), - ), - goldmark.WithParser(ptParser), // use parser we were passed - goldmark.WithExtensions( - &customRenderer{f, ctx, pmf, authorID, statusID, false, result}, - extension.Linkify, // turns URLs into links - ), - ) - - var htmlContentBytes bytes.Buffer - if err := md.Convert([]byte(plain), &htmlContentBytes); err != nil { - log.Errorf(ctx, "error formatting plaintext to HTML: %s", err) - } - result.HTML = htmlContentBytes.String() - - // Clean anything dangerous out of resulting HTML. - result.HTML = SanitizeToHTML(result.HTML) - - // Shrink ray! - result.HTML = MinifyHTML(result.HTML) - - return result -} - -func (f *formatter) FromPlain(ctx context.Context, pmf gtsmodel.ParseMentionFunc, authorID string, statusID string, plain string) *FormatResult { - ptParser := parser.NewParser( + // Initialize standard block parser + // that wraps result in

tags. + plainTextParser := parser.NewParser( parser.WithBlockParsers( util.Prioritized(newPlaintextParser(), 500), ), ) - return f.fromPlain(ctx, ptParser, pmf, authorID, statusID, plain) + return f.fromPlain( + ctx, + plainTextParser, + false, // emojiOnly = false + parseMention, + authorID, + statusID, + input, + ) } -func (f *formatter) FromPlainNoParagraph(ctx context.Context, pmf gtsmodel.ParseMentionFunc, authorID string, statusID string, plain string) *FormatResult { - ptParser := parser.NewParser( +// FromPlainNoParagraph fulfils FormatFunc by parsing +// the given plaintext input into a FormatResult. +// +// Unlike FromPlain, it will not wrap the resulting +// HTML in

tags, making it useful for parsing +// short fragments of text that oughtn't be formally +// wrapped as a paragraph. +func (f *Formatter) FromPlainNoParagraph( + ctx context.Context, + parseMention gtsmodel.ParseMentionFunc, + authorID string, + statusID string, + input string, +) *FormatResult { + // Initialize block parser that + // doesn't wrap result in

tags. + plainTextParser := parser.NewParser( parser.WithBlockParsers( - // Initialize block parser that doesn't wrap in

tags. util.Prioritized(newPlaintextParserNoParagraph(), 500), ), ) - return f.fromPlain(ctx, ptParser, pmf, authorID, statusID, plain) + return f.fromPlain( + ctx, + plainTextParser, + false, // emojiOnly = false + parseMention, + authorID, + statusID, + input, + ) +} + +// FromPlainEmojiOnly fulfils FormatFunc by parsing +// the given plaintext input into a FormatResult. +// +// Unlike FromPlain, it will only parse emojis with +// the custom renderer, leaving aside mentions and tags. +func (f *Formatter) FromPlainEmojiOnly( + ctx context.Context, + parseMention gtsmodel.ParseMentionFunc, + authorID string, + statusID string, + input string, +) *FormatResult { + // Initialize standard block parser + // that wraps result in

tags. + plainTextParser := parser.NewParser( + parser.WithBlockParsers( + util.Prioritized(newPlaintextParser(), 500), + ), + ) + + return f.fromPlain( + ctx, + plainTextParser, + true, // emojiOnly = true + parseMention, + authorID, + statusID, + input, + ) +} + +// fromPlain parses the given input text +// using the given plainTextParser, and +// returns the result. +func (f *Formatter) fromPlain( + ctx context.Context, + plainTextParser parser.Parser, + emojiOnly bool, + parseMention gtsmodel.ParseMentionFunc, + authorID string, + statusID string, + input string, +) *FormatResult { + result := new(FormatResult) + + // Instantiate goldmark parser for + // plaintext, using custom renderer + // to add hashtag/mention links. + md := goldmark.New( + goldmark.WithRendererOptions( + html.WithXHTML(), + html.WithHardWraps(), + ), + // Use whichever plaintext + // parser we were passed. + goldmark.WithParser(plainTextParser), + goldmark.WithExtensions( + &customRenderer{ + ctx, + f.db, + parseMention, + authorID, + statusID, + emojiOnly, + result, + }, + extension.Linkify, // Turns URLs into links. + ), + ) + + // Parse input into HTML. + var htmlBytes bytes.Buffer + if err := md.Convert( + []byte(input), + &htmlBytes, + ); err != nil { + log.Errorf(ctx, "error formatting plaintext input to HTML: %s", err) + } + + // Clean and shrink HTML. + result.HTML = htmlBytes.String() + result.HTML = SanitizeToHTML(result.HTML) + result.HTML = MinifyHTML(result.HTML) + + return result } diff --git a/internal/text/plain_test.go b/internal/text/plain_test.go index dfcf8b953..43cc588c5 100644 --- a/internal/text/plain_test.go +++ b/internal/text/plain_test.go @@ -20,7 +20,6 @@ package text_test import ( "testing" - "github.com/stretchr/testify/assert" "github.com/stretchr/testify/suite" ) @@ -85,7 +84,7 @@ that link shouldn't come out formatted as a mention!` func (suite *PlainTestSuite) TestDeriveMentionsEmpty() { statusText := `` menchies := suite.FromPlain(statusText).Mentions - assert.Len(suite.T(), menchies, 0) + suite.Len(menchies, 0) } func (suite *PlainTestSuite) TestDeriveHashtagsOK() { @@ -98,7 +97,9 @@ func (suite *PlainTestSuite) TestDeriveHashtagsOK() { here's a link with a fragment: https://example.org/whatever#ahhh here's another link with a fragment: https://example.org/whatever/#ahhh -(#ThisShouldAlsoWork) #this_should_be_split +(#ThisShouldAlsoWork) #this_should_not_be_split + +#__ <- just underscores, shouldn't work #111111 thisalsoshouldn'twork#### ## @@ -108,24 +109,24 @@ func (suite *PlainTestSuite) TestDeriveHashtagsOK() { ` tags := suite.FromPlain(statusText).Tags - assert.Len(suite.T(), tags, 13) - assert.Equal(suite.T(), "testing123", tags[0].Name) - assert.Equal(suite.T(), "also", tags[1].Name) - assert.Equal(suite.T(), "thisshouldwork", tags[2].Name) - assert.Equal(suite.T(), "dupe", tags[3].Name) - assert.Equal(suite.T(), "ThisShouldAlsoWork", tags[4].Name) - assert.Equal(suite.T(), "this", tags[5].Name) - assert.Equal(suite.T(), "111111", tags[6].Name) - assert.Equal(suite.T(), "alimentación", tags[7].Name) - assert.Equal(suite.T(), "saúde", tags[8].Name) - assert.Equal(suite.T(), "lävistää", tags[9].Name) - assert.Equal(suite.T(), "ö", tags[10].Name) - assert.Equal(suite.T(), "네", tags[11].Name) - assert.Equal(suite.T(), "ThisOneIsThirteyCharactersLong", tags[12].Name) + suite.Len(tags, 13) + suite.Equal("testing123", tags[0].Name) + suite.Equal("also", tags[1].Name) + suite.Equal("thisshouldwork", tags[2].Name) + suite.Equal("dupe", tags[3].Name) + suite.Equal("ThisShouldAlsoWork", tags[4].Name) + suite.Equal("this_should_not_be_split", tags[5].Name) + suite.Equal("111111", tags[6].Name) + suite.Equal("alimentación", tags[7].Name) + suite.Equal("saúde", tags[8].Name) + suite.Equal("lävistää", tags[9].Name) + suite.Equal("ö", tags[10].Name) + suite.Equal("네", tags[11].Name) + suite.Equal("ThisOneIsThirteyCharactersLong", tags[12].Name) statusText = `#올빼미 hej` tags = suite.FromPlain(statusText).Tags - assert.Equal(suite.T(), "올빼미", tags[0].Name) + suite.Equal("올빼미", tags[0].Name) } func (suite *PlainTestSuite) TestDeriveMultiple() { @@ -137,20 +138,20 @@ func (suite *PlainTestSuite) TestDeriveMultiple() { f := suite.FromPlain(statusText) - assert.Len(suite.T(), f.Mentions, 1) - assert.Equal(suite.T(), "@foss_satan@fossbros-anonymous.io", f.Mentions[0].NameString) + suite.Len(f.Mentions, 1) + suite.Equal("@foss_satan@fossbros-anonymous.io", f.Mentions[0].NameString) - assert.Len(suite.T(), f.Tags, 1) - assert.Equal(suite.T(), "hashtag", f.Tags[0].Name) + suite.Len(f.Tags, 1) + suite.Equal("hashtag", f.Tags[0].Name) - assert.Len(suite.T(), f.Emojis, 0) + suite.Len(f.Emojis, 0) } func (suite *PlainTestSuite) TestZalgoHashtag() { statusText := `yo who else loves #praying to #z̸͉̅a̸͚͋l̵͈̊g̸̫͌ỏ̷̪?` f := suite.FromPlain(statusText) - assert.Len(suite.T(), f.Tags, 1) - assert.Equal(suite.T(), "praying", f.Tags[0].Name) + suite.Len(f.Tags, 1) + suite.Equal("praying", f.Tags[0].Name) } func TestPlainTestSuite(t *testing.T) { diff --git a/internal/text/replace.go b/internal/text/replace.go deleted file mode 100644 index db72aaf1d..000000000 --- a/internal/text/replace.go +++ /dev/null @@ -1,161 +0,0 @@ -// GoToSocial -// Copyright (C) GoToSocial Authors admin@gotosocial.org -// SPDX-License-Identifier: AGPL-3.0-or-later -// -// This program is free software: you can redistribute it and/or modify -// it under the terms of the GNU Affero General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// This program is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Affero General Public License for more details. -// -// You should have received a copy of the GNU Affero General Public License -// along with this program. If not, see . - -package text - -import ( - "errors" - "strings" - - "github.com/superseriousbusiness/gotosocial/internal/db" - "github.com/superseriousbusiness/gotosocial/internal/gtscontext" - "github.com/superseriousbusiness/gotosocial/internal/gtserror" - "github.com/superseriousbusiness/gotosocial/internal/gtsmodel" - "github.com/superseriousbusiness/gotosocial/internal/id" - "github.com/superseriousbusiness/gotosocial/internal/log" - "github.com/superseriousbusiness/gotosocial/internal/uris" -) - -// replaceMention takes a string in the form @username@domain.com or @localusername -func (r *customRenderer) replaceMention(text string) string { - mention, err := r.parseMention(r.ctx, text, r.accountID, r.statusID) - if err != nil { - log.Errorf(r.ctx, "error parsing mention %s from status: %s", text, err) - return text - } - - if r.statusID != "" { - if err := r.f.db.PutMention(r.ctx, mention); err != nil { - log.Errorf(r.ctx, "error putting mention in db: %s", err) - return text - } - } - - // only append if it's not been listed yet - listed := false - for _, m := range r.result.Mentions { - if mention.ID == m.ID { - listed = true - break - } - } - if !listed { - r.result.Mentions = append(r.result.Mentions, mention) - } - - if mention.TargetAccount == nil { - // Fetch mention target account if not yet populated. - mention.TargetAccount, err = r.f.db.GetAccountByID( - gtscontext.SetBarebones(r.ctx), - mention.TargetAccountID, - ) - if err != nil { - log.Errorf(r.ctx, "error populating mention target account: %v", err) - return text - } - } - - // The mention's target is our target - targetAccount := mention.TargetAccount - - var b strings.Builder - - // replace the mention with the formatted mention content - // @targetAccount.Username - b.WriteString(`@`) - b.WriteString(targetAccount.Username) - b.WriteString(``) - return b.String() -} - -// replaceHashtag takes a string in the form #SomeHashtag, and will normalize -// it before adding it to the db (or just getting it from the db if it already -// exists) and turning it into HTML. -func (r *customRenderer) replaceHashtag(text string) string { - normalized, ok := NormalizeHashtag(text) - if !ok { - // Not a valid hashtag. - return text - } - - tag, err := r.getOrCreateHashtag(normalized) - if err != nil { - log.Errorf(r.ctx, "error generating hashtags from status: %s", err) - return text - } - - // Append tag to result if not done already. - // - // This prevents multiple uses of a tag in - // the same status generating multiple - // entries for the same tag in result. - func() { - for _, t := range r.result.Tags { - if tag.ID == t.ID { - // Already appended. - return - } - } - - // Not appended yet. - r.result.Tags = append(r.result.Tags, tag) - }() - - // Replace tag with the formatted tag content, eg. `#SomeHashtag` becomes: - // `` - var b strings.Builder - b.WriteString(``) - - return b.String() -} - -func (r *customRenderer) getOrCreateHashtag(name string) (*gtsmodel.Tag, error) { - var ( - tag *gtsmodel.Tag - err error - ) - - // Check if we have a tag with this name already. - tag, err = r.f.db.GetTagByName(r.ctx, name) - if err != nil && !errors.Is(err, db.ErrNoEntries) { - return nil, gtserror.Newf("db error getting tag %s: %w", name, err) - } - - if tag != nil { - // We had it! - return tag, nil - } - - // We didn't have a tag with - // this name, create one. - tag = >smodel.Tag{ - ID: id.NewULID(), - Name: name, - } - - if err = r.f.db.PutTag(r.ctx, tag); err != nil { - return nil, gtserror.Newf("db error putting new tag %s: %w", name, err) - } - - return tag, nil -} diff --git a/internal/text/util.go b/internal/text/util.go new file mode 100644 index 000000000..204c64838 --- /dev/null +++ b/internal/text/util.go @@ -0,0 +1,51 @@ +// GoToSocial +// Copyright (C) GoToSocial Authors admin@gotosocial.org +// SPDX-License-Identifier: AGPL-3.0-or-later +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU Affero General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Affero General Public License for more details. +// +// You should have received a copy of the GNU Affero General Public License +// along with this program. If not, see . + +package text + +import "unicode" + +func isPlausiblyInHashtag(r rune) bool { + // Marks are allowed during parsing + // prior to normalization, but not after, + // since they may be combined into letters + // during normalization. + return unicode.IsMark(r) || + isPermittedInHashtag(r) +} + +func isPermittedInHashtag(r rune) bool { + return unicode.IsLetter(r) || + unicode.IsNumber(r) || + r == '_' +} + +// isHashtagBoundary returns true if rune r +// is a recognized break character for before +// or after a #hashtag. +func isHashtagBoundary(r rune) bool { + return unicode.IsSpace(r) || + (unicode.IsPunct(r) && r != '_') +} + +// isMentionBoundary returns true if rune r +// is a recognized break character for before +// or after a @mention. +func isMentionBoundary(r rune) bool { + return unicode.IsSpace(r) || + unicode.IsPunct(r) +} diff --git a/internal/util/statustools.go b/internal/util/statustools.go deleted file mode 100644 index c56cf84ce..000000000 --- a/internal/util/statustools.go +++ /dev/null @@ -1,37 +0,0 @@ -// GoToSocial -// Copyright (C) GoToSocial Authors admin@gotosocial.org -// SPDX-License-Identifier: AGPL-3.0-or-later -// -// This program is free software: you can redistribute it and/or modify -// it under the terms of the GNU Affero General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// This program is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Affero General Public License for more details. -// -// You should have received a copy of the GNU Affero General Public License -// along with this program. If not, see . - -package util - -import ( - "unicode" -) - -func IsPlausiblyInHashtag(r rune) bool { - // Marks are allowed during parsing, prior to normalization, but not after, - // since they may be combined into letters during normalization. - return unicode.IsLetter(r) || unicode.IsNumber(r) || unicode.IsMark(r) -} - -func IsPermittedInHashtag(r rune) bool { - return unicode.IsLetter(r) || unicode.IsNumber(r) -} - -// Decides where to break before or after a #hashtag or @mention -func IsMentionOrHashtagBoundary(r rune) bool { - return unicode.IsSpace(r) || unicode.IsPunct(r) -}