[bugfix] Use better plaintext representation of status for filtering (#3301)

* [bugfix] Use better plaintext representation of status for filtering

* add new deps to readme

* lint

* update tests

* update regexes

* address review comments

* remove now unused xxhash

* whoops, wrong logger

* Merge branch 'main' into status_filtering_bugfix

* put cache in caches struct

* pain
This commit is contained in:
tobi 2024-09-16 14:00:23 +02:00 committed by GitHub
parent 6dd936fbe1
commit efd1a4f717
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
15 changed files with 2685 additions and 64 deletions

View file

@ -273,6 +273,7 @@ The following open source libraries, frameworks, and tools are used by GoToSocia
- [jackc/pgconn](https://github.com/jackc/pgconn); Postgres driver. [MIT License](https://spdx.org/licenses/MIT.html).
- [jackc/pgx](https://github.com/jackc/pgx); Postgres driver and toolkit. [MIT License](https://spdx.org/licenses/MIT.html).
- [KimMachineGun/automemlimit](https://github.com/KimMachineGun/automemlimit); cgroups memory limit checking. [MIT License](https://spdx.org/licenses/MIT.html).
- [k3a/html2text](https://github.com/k3a/html2text); HTML-to-text conversion. [MIT License](https://spdx.org/licenses/MIT.html).
- [mcuadros/go-syslog](https://github.com/mcuadros/go-syslog); Syslog server library. [MIT License](https://spdx.org/licenses/MIT.html).
- [microcosm-cc/bluemonday](https://github.com/microcosm-cc/bluemonday); HTML user-input sanitization. [BSD-3-Clause License](https://spdx.org/licenses/BSD-3-Clause.html).
- [miekg/dns](https://github.com/miekg/dns); DNS utilities. [Go License](https://go.dev/LICENSE).

1
go.mod
View file

@ -40,6 +40,7 @@ require (
github.com/gorilla/feeds v1.2.0
github.com/gorilla/websocket v1.5.2
github.com/jackc/pgx/v5 v5.7.1
github.com/k3a/html2text v1.2.1
github.com/microcosm-cc/bluemonday v1.0.27
github.com/miekg/dns v1.1.62
github.com/minio/minio-go/v7 v7.0.76

2
go.sum
View file

@ -384,6 +384,8 @@ github.com/jstemmer/go-junit-report v0.9.1/go.mod h1:Brl9GWCQeLvo8nXZwPNNblvFj/X
github.com/jtolds/gls v4.20.0+incompatible h1:xdiiI2gbIgH/gLH7ADydsJ1uDOEzR8yvV7C0MuV77Wo=
github.com/jtolds/gls v4.20.0+incompatible/go.mod h1:QJZ7F/aHp+rZTRtaJ1ow/lLfFfVYBRgL+9YlvaHOwJU=
github.com/k0kubun/colorstring v0.0.0-20150214042306-9440f1994b88/go.mod h1:3w7q1U84EfirKl04SVQ/s7nPm1ZPhiXd34z40TNz36k=
github.com/k3a/html2text v1.2.1 h1:nvnKgBvBR/myqrwfLuiqecUtaK1lB9hGziIJKatNFVY=
github.com/k3a/html2text v1.2.1/go.mod h1:ieEXykM67iT8lTvEWBh6fhpH4B23kB9OMKPdIBmgUqA=
github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck=
github.com/klauspost/compress v1.10.4/go.mod h1:aoV0uJVorq1K+umq18yTdKaF57EivdYsUV+/s2qKfXs=
github.com/klauspost/compress v1.10.10/go.mod h1:aoV0uJVorq1K+umq18yTdKaF57EivdYsUV+/s2qKfXs=

View file

@ -47,6 +47,11 @@ type Caches struct {
// Webfinger provides access to the webfinger URL cache.
Webfinger *ttl.Cache[string, string] // TTL=24hr, sweep=5min
// TTL cache of statuses -> filterable text fields.
// To ensure up-to-date fields, cache is keyed as:
// `[status.ID][status.UpdatedAt.Unix()]`
StatusesFilterableFields *ttl.Cache[string, []string]
// prevent pass-by-value.
_ nocopy
}
@ -109,6 +114,7 @@ func (c *Caches) Init() {
c.initUserMuteIDs()
c.initWebfinger()
c.initVisibility()
c.initStatusesFilterableFields()
}
// Start will start any caches that require a background
@ -119,6 +125,10 @@ func (c *Caches) Start() {
tryUntil("starting webfinger cache", 5, func() bool {
return c.Webfinger.Start(5 * time.Minute)
})
tryUntil("starting statusesFilterableFields cache", 5, func() bool {
return c.StatusesFilterableFields.Start(5 * time.Minute)
})
}
// Stop will stop any caches that require a background
@ -127,6 +137,7 @@ func (c *Caches) Stop() {
log.Infof(nil, "stop: %p", c)
tryUntil("stopping webfinger cache", 5, c.Webfinger.Stop)
tryUntil("stopping statusesFilterableFields cache", 5, c.StatusesFilterableFields.Stop)
}
// Sweep will sweep all the available caches to ensure none
@ -204,3 +215,12 @@ func (c *Caches) initWebfinger() {
24*time.Hour,
)
}
func (c *Caches) initStatusesFilterableFields() {
c.StatusesFilterableFields = new(ttl.Cache[string, []string])
c.StatusesFilterableFields.Init(
0,
512,
1*time.Hour,
)
}

View file

@ -20,6 +20,8 @@ package gtsmodel
import (
"regexp"
"time"
"github.com/superseriousbusiness/gotosocial/internal/util"
)
// Filter stores a filter created by a local account.
@ -61,14 +63,23 @@ type FilterKeyword struct {
// Compile will compile this FilterKeyword as a prepared regular expression.
func (k *FilterKeyword) Compile() (err error) {
var wordBreak string
if k.WholeWord != nil && *k.WholeWord {
wordBreak = `\b`
var (
wordBreakStart string
wordBreakEnd string
)
if util.PtrOrZero(k.WholeWord) {
// Either word boundary or
// whitespace or start of line.
wordBreakStart = `(?:\b|\s|^)`
// Either word boundary or
// whitespace or end of line.
wordBreakEnd = `(?:\b|\s|$)`
}
// Compile keyword filter regexp.
quoted := regexp.QuoteMeta(k.Keyword)
k.Regexp, err = regexp.Compile(`(?i)` + wordBreak + quoted + wordBreak)
k.Regexp, err = regexp.Compile(`(?i)` + wordBreakStart + quoted + wordBreakEnd)
return // caller is expected to wrap this error
}

View file

@ -21,6 +21,8 @@ import (
"context"
"errors"
"fmt"
"slices"
"strconv"
"strings"
"time"
@ -35,7 +37,6 @@ import (
"github.com/superseriousbusiness/gotosocial/internal/language"
"github.com/superseriousbusiness/gotosocial/internal/log"
"github.com/superseriousbusiness/gotosocial/internal/media"
"github.com/superseriousbusiness/gotosocial/internal/text"
"github.com/superseriousbusiness/gotosocial/internal/uris"
"github.com/superseriousbusiness/gotosocial/internal/util"
)
@ -939,32 +940,48 @@ func (c *Converter) statusToAPIFilterResults(
return nil, nil
}
// Extract text fields from the status that we will match filters against.
fields := filterableTextFields(s)
// Key this status based on ID + last updated time,
// to ensure we always filter on latest version.
statusKey := s.ID + strconv.FormatInt(s.UpdatedAt.Unix(), 10)
// Check if we have filterable fields cached for this status.
cache := c.state.Caches.StatusesFilterableFields
fields, stored := cache.Get(statusKey)
if !stored {
// We don't have filterable fields
// cached, calculate + cache now.
fields = filterableFields(s)
cache.Set(statusKey, fields)
}
// Record all matching warn filters and the reasons they matched.
filterResults := make([]apimodel.FilterResult, 0, len(filters))
for _, filter := range filters {
if !filterAppliesInContext(filter, filterContext) {
// Filter doesn't apply to this context.
continue
}
if filter.Expired(now) {
// Filter doesn't apply
// to this context.
continue
}
// List all matching keywords.
if filter.Expired(now) {
// Filter doesn't
// apply anymore.
continue
}
// Assemble matching keywords (if any) from this filter.
keywordMatches := make([]string, 0, len(filter.Keywords))
for _, filterKeyword := range filter.Keywords {
var isMatch bool
for _, field := range fields {
if filterKeyword.Regexp.MatchString(field) {
isMatch = true
break
}
}
if isMatch {
keywordMatches = append(keywordMatches, filterKeyword.Keyword)
for _, keyword := range filter.Keywords {
// Check if at least one filterable field
// in the status matches on this filter.
if slices.ContainsFunc(
fields,
func(field string) bool {
return keyword.Regexp.MatchString(field)
},
) {
// At least one field matched on this filter.
keywordMatches = append(keywordMatches, keyword.Keyword)
}
}
@ -1001,40 +1018,6 @@ func (c *Converter) statusToAPIFilterResults(
return filterResults, nil
}
// filterableTextFields returns all text from a status that we might want to filter on:
// - content
// - content warning
// - media descriptions
// - poll options
func filterableTextFields(s *gtsmodel.Status) []string {
fieldCount := 2 + len(s.Attachments)
if s.Poll != nil {
fieldCount += len(s.Poll.Options)
}
fields := make([]string, 0, fieldCount)
if s.Content != "" {
fields = append(fields, text.SanitizeToPlaintext(s.Content))
}
if s.ContentWarning != "" {
fields = append(fields, s.ContentWarning)
}
for _, attachment := range s.Attachments {
if attachment.Description != "" {
fields = append(fields, attachment.Description)
}
}
if s.Poll != nil {
for _, option := range s.Poll.Options {
if option != "" {
fields = append(fields, option)
}
}
}
return fields
}
// filterAppliesInContext returns whether a given filter applies in a given context.
func filterAppliesInContext(filter *gtsmodel.Filter, filterContext statusfilter.FilterContext) bool {
switch filterContext {

View file

@ -1063,15 +1063,21 @@ func (suite *InternalToFrontendTestSuite) TestHideFilteredBoostToFrontend() {
// Test that a hashtag filter for a hashtag in Mastodon HTML content works the way most users would expect.
func (suite *InternalToFrontendTestSuite) testHashtagFilteredStatusToFrontend(wholeWord bool, boost bool) {
testStatus := suite.testStatuses["admin_account_status_1"]
testStatus := new(gtsmodel.Status)
*testStatus = *suite.testStatuses["admin_account_status_1"]
testStatus.Content = `<p>doggo doggin' it</p><p><a href="https://example.test/tags/dogsofmastodon" class="mention hashtag" rel="tag nofollow noreferrer noopener" target="_blank">#<span>dogsofmastodon</span></a></p>`
if boost {
// Modify a fixture boost into a boost of the above status.
boostStatus := suite.testStatuses["admin_account_status_4"]
boostStatus.BoostOf = testStatus
boostStatus.BoostOfID = testStatus.ID
testStatus = boostStatus
boost, err := suite.typeconverter.StatusToBoost(
context.Background(),
testStatus,
suite.testAccounts["admin_account"],
"",
)
if err != nil {
suite.FailNow(err.Error())
}
testStatus = boost
}
requestingAccount := suite.testAccounts["local_account_1"]
@ -1103,9 +1109,11 @@ func (suite *InternalToFrontendTestSuite) testHashtagFilteredStatusToFrontend(wh
[]*gtsmodel.Filter{filter},
nil,
)
if suite.NoError(err) {
suite.NotEmpty(apiStatus.Filtered)
if err != nil {
suite.FailNow(err.Error())
}
suite.NotEmpty(apiStatus.Filtered)
}
func (suite *InternalToFrontendTestSuite) TestHashtagWholeWordFilteredStatusToFrontend() {

View file

@ -27,6 +27,7 @@ import (
"strconv"
"strings"
"github.com/k3a/html2text"
apimodel "github.com/superseriousbusiness/gotosocial/internal/api/model"
"github.com/superseriousbusiness/gotosocial/internal/config"
"github.com/superseriousbusiness/gotosocial/internal/gtsmodel"
@ -284,3 +285,64 @@ func ContentToContentLanguage(
return contentStr, langTagStr
}
// filterableFields returns text fields from
// a status that we might want to filter on:
//
// - content warning
// - content (converted to plaintext from HTML)
// - media descriptions
// - poll options
//
// Each field should be filtered separately.
// This avoids scenarios where false-positive
// multiple-word matches can be made by matching
// the last word of one field + the first word
// of the next field together.
func filterableFields(s *gtsmodel.Status) []string {
// Estimate length of fields.
fieldCount := 2 + len(s.Attachments)
if s.Poll != nil {
fieldCount += len(s.Poll.Options)
}
fields := make([]string, 0, fieldCount)
// Content warning / title.
if s.ContentWarning != "" {
fields = append(fields, s.ContentWarning)
}
// Status content. Though we have raw text
// available for statuses created on our
// instance, use the html2text version to
// remove markdown-formatting characters
// and ensure more consistent filtering.
if s.Content != "" {
text := html2text.HTML2TextWithOptions(
s.Content,
html2text.WithLinksInnerText(),
html2text.WithUnixLineBreaks(),
)
if text != "" {
fields = append(fields, text)
}
}
// Media descriptions.
for _, attachment := range s.Attachments {
if attachment.Description != "" {
fields = append(fields, attachment.Description)
}
}
// Poll options.
if s.Poll != nil {
for _, opt := range s.Poll.Options {
if opt != "" {
fields = append(fields, opt)
}
}
}
return fields
}

View file

@ -21,6 +21,7 @@ import (
"context"
"testing"
"github.com/stretchr/testify/assert"
"github.com/superseriousbusiness/gotosocial/internal/config"
"github.com/superseriousbusiness/gotosocial/internal/gtsmodel"
"github.com/superseriousbusiness/gotosocial/internal/language"
@ -158,3 +159,62 @@ func TestContentToContentLanguage(t *testing.T) {
}
}
}
func TestFilterableText(t *testing.T) {
type testcase struct {
status *gtsmodel.Status
expectedFields []string
}
for _, testcase := range []testcase{
{
status: &gtsmodel.Status{
ContentWarning: "This is a test status",
Content: `<p>Import / export of account data via CSV files will be coming in 0.17.0 :) No more having to run scripts + CLI tools to import a list of accounts you follow, after doing a migration to a <a href="https://gts.superseriousbusiness.org/tags/gotosocial" class="mention hashtag" rel="tag nofollow noreferrer noopener" target="_blank">#<span>GoToSocial</span></a> instance.</p>`,
},
expectedFields: []string{
"This is a test status",
"Import / export of account data via CSV files will be coming in 0.17.0 :) No more having to run scripts + CLI tools to import a list of accounts you follow, after doing a migration to a #GoToSocial <https://gts.superseriousbusiness.org/tags/gotosocial> instance.",
},
},
{
status: &gtsmodel.Status{
Content: `<p><span class="h-card"><a href="https://example.org/@zlatko" class="u-url mention" rel="nofollow noreferrer noopener" target="_blank">@<span>zlatko</span></a></span> currently we used modernc/sqlite3 for our sqlite driver, but we've been experimenting with wasm sqlite, and will likely move to that permanently in future; in the meantime, both options are available (the latter with a build tag)</p><p><a href="https://github.com/superseriousbusiness/gotosocial/pull/2863" rel="nofollow noreferrer noopener" target="_blank">https://github.com/superseriousbusiness/gotosocial/pull/2863</a></p>`,
},
expectedFields: []string{
"@zlatko <https://example.org/@zlatko> currently we used modernc/sqlite3 for our sqlite driver, but we've been experimenting with wasm sqlite, and will likely move to that permanently in future; in the meantime, both options are available (the latter with a build tag)\n\nhttps://github.com/superseriousbusiness/gotosocial/pull/2863 <https://github.com/superseriousbusiness/gotosocial/pull/2863>",
},
},
{
status: &gtsmodel.Status{
ContentWarning: "Nerd stuff",
Content: `<p>Latest graphs for <a href="https://gts.superseriousbusiness.org/tags/gotosocial" class="mention hashtag" rel="tag nofollow noreferrer noopener" target="_blank">#<span>GoToSocial</span></a> on <a href="https://github.com/ncruces/go-sqlite3" rel="nofollow noreferrer noopener" target="_blank">Wasm sqlite3</a> with <a href="https://codeberg.org/gruf/go-ffmpreg" rel="nofollow noreferrer noopener" target="_blank">embedded Wasm ffmpeg</a>, both running on <a href="https://wazero.io/" rel="nofollow noreferrer noopener" target="_blank">Wazero</a>, and configured with a <a href="https://github.com/superseriousbusiness/gotosocial/blob/20fe430ef9ff3012a7a4dc2d01b68020c20e13bb/example/config.yaml#L259-L266" rel="nofollow noreferrer noopener" target="_blank">50MiB db cache target</a>. This is the version we'll be releasing soonish, now we're happy with how we've tamed everything.</p>`,
Attachments: []*gtsmodel.MediaAttachment{
{
Description: `Graph showing GtS using between 150-300 MiB of memory, steadily, over a few days.`,
},
{
Description: `Another media attachment`,
},
},
Poll: &gtsmodel.Poll{
Options: []string{
"Poll option 1",
"Poll option 2",
},
},
},
expectedFields: []string{
"Nerd stuff",
"Latest graphs for #GoToSocial <https://gts.superseriousbusiness.org/tags/gotosocial> on Wasm sqlite3 <https://github.com/ncruces/go-sqlite3> with embedded Wasm ffmpeg <https://codeberg.org/gruf/go-ffmpreg>, both running on Wazero <https://wazero.io/>, and configured with a 50MiB db cache target <https://github.com/superseriousbusiness/gotosocial/blob/20fe430ef9ff3012a7a4dc2d01b68020c20e13bb/example/config.yaml#L259-L266>. This is the version we'll be releasing soonish, now we're happy with how we've tamed everything.",
"Graph showing GtS using between 150-300 MiB of memory, steadily, over a few days.",
"Another media attachment",
"Poll option 1",
"Poll option 2",
},
},
} {
fields := filterableFields(testcase.status)
assert.Equal(t, testcase.expectedFields, fields)
}
}

10
vendor/github.com/k3a/html2text/.travis.yml generated vendored Normal file
View file

@ -0,0 +1,10 @@
language: go
go:
- master
before_install:
- go get github.com/axw/gocov/gocov
- go get github.com/mattn/goveralls
- if ! go get github.com/golang/tools/cmd/cover; then go get golang.org/x/tools/cmd/cover; fi
script:
- $HOME/gopath/bin/goveralls -service=travis-ci

21
vendor/github.com/k3a/html2text/LICENSE generated vendored Normal file
View file

@ -0,0 +1,21 @@
MIT License
Copyright (c) 2017 Mario K3A Hros (www.k3a.me)
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

60
vendor/github.com/k3a/html2text/README.md generated vendored Normal file
View file

@ -0,0 +1,60 @@
[![GoDoc](https://godoc.org/github.com/k3a/html2text?status.svg)](https://godoc.org/github.com/k3a/html2text)
[![Build Status](https://travis-ci.org/k3a/html2text.svg?branch=master)](https://travis-ci.org/k3a/html2text)
[![Coverage Status](https://coveralls.io/repos/github/k3a/html2text/badge.svg?branch=master)](https://coveralls.io/github/k3a/html2text?branch=master)
[![Report Card](https://goreportcard.com/badge/github.com/k3a/html2text)](https://goreportcard.com/report/github.com/k3a/html2text)
# html2text
A simple Golang package to convert HTML to plain text (without non-standard dependencies).
It converts HTML tags to text and also parses HTML entities into characters they represent.
A `<head>` section of the HTML document, as well as most other tags are stripped out but
links are properly converted into their href attribute.
It can be used for converting HTML emails into text.
Some tests are installed as well.
Uses semantic versioning and no breaking changes are planned.
Fell free to publish a pull request if you have suggestions for improvement but please note that the library can now be considered feature-complete and API stable. If you need more than this basic conversion, please use an alternative mentioned at the bottom.
## Install
```bash
go get github.com/k3a/html2text
```
## Usage
```go
package main
import (
"fmt"
"github.com/k3a/html2text"
)
func main() {
html := `<html><head><title>Good</title></head><body><strong>clean</strong> text</body>`
plain := html2text.HTML2Text(html)
fmt.Println(plain)
}
/* Outputs:
clean text
*/
```
To see all features, please look info `html2text_test.go`.
## Alternatives
- https://github.com/jaytaylor/html2text (heavier, with more features)
- https://git.alexwennerberg.com/nanohtml2text (rewrite of this module in Rust)
## License
MIT

2046
vendor/github.com/k3a/html2text/entity.go generated vendored Normal file

File diff suppressed because it is too large Load diff

333
vendor/github.com/k3a/html2text/html2text.go generated vendored Normal file
View file

@ -0,0 +1,333 @@
package html2text
import (
"bytes"
"regexp"
"strconv"
"strings"
)
// Line break constants
// Deprecated: Please use HTML2TextWithOptions(text, WithUnixLineBreak())
const (
WIN_LBR = "\r\n"
UNIX_LBR = "\n"
)
var legacyLBR = WIN_LBR
var badTagnamesRE = regexp.MustCompile(`^(head|script|style|a)($|\s+)`)
var linkTagRE = regexp.MustCompile(`^(?i:a)(?:$|\s).*(?i:href)\s*=\s*('([^']*?)'|"([^"]*?)"|([^\s"'` + "`" + `=<>]+))`)
var badLinkHrefRE = regexp.MustCompile(`javascript:`)
var headersRE = regexp.MustCompile(`^(\/)?h[1-6]`)
var numericEntityRE = regexp.MustCompile(`(?i)^#(x?[a-f0-9]+)$`)
type options struct {
lbr string
linksInnerText bool
listPrefix string
}
func newOptions() *options {
// apply defaults
return &options{
lbr: WIN_LBR,
}
}
// Option is a functional option
type Option func(*options)
// WithUnixLineBreaks instructs the converter to use unix line breaks ("\n" instead of "\r\n" default)
func WithUnixLineBreaks() Option {
return func(o *options) {
o.lbr = UNIX_LBR
}
}
// WithLinksInnerText instructs the converter to retain link tag inner text and append href URLs in angle brackets after the text
// Example: click news <http://bit.ly/2n4wXRs>
func WithLinksInnerText() Option {
return func(o *options) {
o.linksInnerText = true
}
}
// WithListSupportPrefix formats <ul> and <li> lists with the specified prefix
func WithListSupportPrefix(prefix string) Option {
return func(o *options) {
o.listPrefix = prefix
}
}
// WithListSupport formats <ul> and <li> lists with " - " prefix
func WithListSupport() Option {
return WithListSupportPrefix(" - ")
}
func parseHTMLEntity(entName string) (string, bool) {
if r, ok := entity[entName]; ok {
return string(r), true
}
if match := numericEntityRE.FindStringSubmatch(entName); len(match) == 2 {
var (
err error
n int64
digits = match[1]
)
if digits != "" && (digits[0] == 'x' || digits[0] == 'X') {
n, err = strconv.ParseInt(digits[1:], 16, 64)
} else {
n, err = strconv.ParseInt(digits, 10, 64)
}
if err == nil && (n == 9 || n == 10 || n == 13 || n > 31) {
return string(rune(n)), true
}
}
return "", false
}
// SetUnixLbr with argument true sets Unix-style line-breaks in output ("\n")
// with argument false sets Windows-style line-breaks in output ("\r\n", the default)
// Deprecated: Please use HTML2TextWithOptions(text, WithUnixLineBreak())
func SetUnixLbr(b bool) {
if b {
legacyLBR = UNIX_LBR
} else {
legacyLBR = WIN_LBR
}
}
// HTMLEntitiesToText decodes HTML entities inside a provided
// string and returns decoded text
func HTMLEntitiesToText(htmlEntsText string) string {
outBuf := bytes.NewBufferString("")
inEnt := false
for i, r := range htmlEntsText {
switch {
case r == ';' && inEnt:
inEnt = false
continue
case r == '&': //possible html entity
entName := ""
isEnt := false
// parse the entity name - max 10 chars
chars := 0
for _, er := range htmlEntsText[i+1:] {
if er == ';' {
isEnt = true
break
} else {
entName += string(er)
}
chars++
if chars == 10 {
break
}
}
if isEnt {
if ent, isEnt := parseHTMLEntity(entName); isEnt {
outBuf.WriteString(ent)
inEnt = true
continue
}
}
}
if !inEnt {
outBuf.WriteRune(r)
}
}
return outBuf.String()
}
func writeSpace(outBuf *bytes.Buffer) {
bts := outBuf.Bytes()
if len(bts) > 0 && bts[len(bts)-1] != ' ' {
outBuf.WriteString(" ")
}
}
// HTML2Text converts html into a text form
func HTML2Text(html string) string {
var opts []Option
if legacyLBR == UNIX_LBR {
opts = append(opts, WithUnixLineBreaks())
}
return HTML2TextWithOptions(html, opts...)
}
// HTML2TextWithOptions converts html into a text form with additional options
func HTML2TextWithOptions(html string, reqOpts ...Option) string {
opts := newOptions()
for _, opt := range reqOpts {
opt(opts)
}
inLen := len(html)
tagStart := 0
inEnt := false
badTagStackDepth := 0 // if == 1 it means we are inside <head>...</head>
shouldOutput := true
// maintain a stack of <a> tag href links and output it after the tag's inner text (for opts.linksInnerText only)
hrefs := []string{}
// new line cannot be printed at the beginning or
// for <p> after a new line created by previous <p></p>
canPrintNewline := false
outBuf := bytes.NewBufferString("")
for i, r := range html {
if inLen > 0 && i == inLen-1 {
// prevent new line at the end of the document
canPrintNewline = false
}
switch {
// skip new lines and spaces adding a single space if not there yet
case r <= 0xD, r == 0x85, r == 0x2028, r == 0x2029, // new lines
r == ' ', r >= 0x2008 && r <= 0x200B: // spaces
if shouldOutput && badTagStackDepth == 0 && !inEnt {
//outBuf.WriteString(fmt.Sprintf("{DBG r:%c, inEnt:%t, tag:%s}", r, inEnt, html[tagStart:i]))
writeSpace(outBuf)
}
continue
case r == ';' && inEnt: // end of html entity
inEnt = false
continue
case r == '&' && shouldOutput: // possible html entity
entName := ""
isEnt := false
// parse the entity name - max 10 chars
chars := 0
for _, er := range html[i+1:] {
if er == ';' {
isEnt = true
break
} else {
entName += string(er)
}
chars++
if chars == 10 {
break
}
}
if isEnt {
if ent, isEnt := parseHTMLEntity(entName); isEnt {
outBuf.WriteString(ent)
inEnt = true
continue
}
}
case r == '<': // start of a tag
tagStart = i + 1
shouldOutput = false
continue
case r == '>': // end of a tag
shouldOutput = true
tag := html[tagStart:i]
tagNameLowercase := strings.ToLower(tag)
if tagNameLowercase == "/ul" || tagNameLowercase == "/ol" {
outBuf.WriteString(opts.lbr)
} else if tagNameLowercase == "li" || tagNameLowercase == "li/" {
if opts.listPrefix != "" {
outBuf.WriteString(opts.lbr + opts.listPrefix)
} else {
outBuf.WriteString(opts.lbr)
}
} else if headersRE.MatchString(tagNameLowercase) {
if canPrintNewline {
outBuf.WriteString(opts.lbr + opts.lbr)
}
canPrintNewline = false
} else if tagNameLowercase == "br" || tagNameLowercase == "br/" {
// new line
outBuf.WriteString(opts.lbr)
} else if tagNameLowercase == "p" || tagNameLowercase == "/p" {
if canPrintNewline {
outBuf.WriteString(opts.lbr + opts.lbr)
}
canPrintNewline = false
} else if opts.linksInnerText && tagNameLowercase == "/a" {
// end of link
// links can be empty can happen if the link matches the badLinkHrefRE
if len(hrefs) > 0 {
outBuf.WriteString(" <")
outBuf.WriteString(HTMLEntitiesToText(hrefs[0]))
outBuf.WriteString(">")
hrefs = hrefs[1:]
}
} else if opts.linksInnerText && linkTagRE.MatchString(tagNameLowercase) {
// parse link href
// add special handling for a tags
m := linkTagRE.FindStringSubmatch(tag)
if len(m) == 5 {
link := m[2]
if len(link) == 0 {
link = m[3]
if len(link) == 0 {
link = m[4]
}
}
if opts.linksInnerText && !badLinkHrefRE.MatchString(link) {
hrefs = append(hrefs, link)
}
}
} else if badTagnamesRE.MatchString(tagNameLowercase) {
// unwanted block
badTagStackDepth++
// if link inner text preservation is not enabled
// and the current tag is a link tag, parse its href and output that
if !opts.linksInnerText {
// parse link href
m := linkTagRE.FindStringSubmatch(tag)
if len(m) == 5 {
link := m[2]
if len(link) == 0 {
link = m[3]
if len(link) == 0 {
link = m[4]
}
}
if !badLinkHrefRE.MatchString(link) {
outBuf.WriteString(HTMLEntitiesToText(link))
}
}
}
} else if len(tagNameLowercase) > 0 && tagNameLowercase[0] == '/' &&
badTagnamesRE.MatchString(tagNameLowercase[1:]) {
// end of unwanted block
badTagStackDepth--
}
continue
} // switch end
if shouldOutput && badTagStackDepth == 0 && !inEnt {
canPrintNewline = true
outBuf.WriteRune(r)
}
}
return outBuf.String()
}

3
vendor/modules.txt vendored
View file

@ -446,6 +446,9 @@ github.com/josharian/intern
# github.com/json-iterator/go v1.1.12
## explicit; go 1.12
github.com/json-iterator/go
# github.com/k3a/html2text v1.2.1
## explicit; go 1.16
github.com/k3a/html2text
# github.com/klauspost/compress v1.17.9
## explicit; go 1.20
github.com/klauspost/compress