matrix-prometheus/vendor/maunium.net/go/mautrix/format/htmlparser.go

302 lines
9.4 KiB
Go

// Copyright (c) 2020 Tulir Asokan
//
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at http://mozilla.org/MPL/2.0/.
package format
import (
"fmt"
"math"
"strconv"
"strings"
"golang.org/x/net/html"
"maunium.net/go/mautrix/id"
)
type Context map[string]interface{}
type TextConverter func(string, Context) string
type CodeBlockConverter func(code, language string, ctx Context) string
type PillConverter func(displayname, mxid, eventID string, ctx Context) string
func DefaultPillConverter(displayname, mxid, eventID string, _ Context) string {
switch {
case len(mxid) == 0, mxid[0] == '@':
// User link, always just show the displayname
return displayname
case len(eventID) > 0:
// Event ID link, always just show the link
return fmt.Sprintf("https://matrix.to/#/%s/%s", mxid, eventID)
case mxid[0] == '!' && displayname == mxid:
// Room ID link with no separate display text, just show the link
return fmt.Sprintf("https://matrix.to/#/%s", mxid)
case mxid[0] == '#':
// Room alias link, just show the alias
return mxid
default:
// Other link (e.g. room ID link with display text), show text and link
return fmt.Sprintf("%s (https://matrix.to/#/%s)", displayname, mxid)
}
}
// HTMLParser is a somewhat customizable Matrix HTML parser.
type HTMLParser struct {
PillConverter PillConverter
TabsToSpaces int
Newline string
HorizontalLine string
BoldConverter TextConverter
ItalicConverter TextConverter
StrikethroughConverter TextConverter
UnderlineConverter TextConverter
MonospaceBlockConverter CodeBlockConverter
MonospaceConverter TextConverter
}
// TaggedString is a string that also contains a HTML tag.
type TaggedString struct {
string
tag string
}
func (parser *HTMLParser) getAttribute(node *html.Node, attribute string) string {
for _, attr := range node.Attr {
if attr.Key == attribute {
return attr.Val
}
}
return ""
}
// Digits counts the number of digits in a non-negative integer.
func Digits(num int) int {
return int(math.Floor(math.Log10(float64(num))) + 1)
}
func (parser *HTMLParser) listToString(node *html.Node, stripLinebreak bool, ctx Context) string {
ordered := node.Data == "ol"
taggedChildren := parser.nodeToTaggedStrings(node.FirstChild, stripLinebreak, ctx)
counter := 1
indentLength := 0
if ordered {
start := parser.getAttribute(node, "start")
if len(start) > 0 {
counter, _ = strconv.Atoi(start)
}
longestIndex := (counter - 1) + len(taggedChildren)
indentLength = Digits(longestIndex)
}
indent := strings.Repeat(" ", indentLength+2)
var children []string
for _, child := range taggedChildren {
if child.tag != "li" {
continue
}
var prefix string
// TODO make bullets and numbering configurable
if ordered {
indexPadding := indentLength - Digits(counter)
prefix = fmt.Sprintf("%d. %s", counter, strings.Repeat(" ", indexPadding))
} else {
prefix = "* "
}
str := prefix + child.string
counter++
parts := strings.Split(str, "\n")
for i, part := range parts[1:] {
parts[i+1] = indent + part
}
str = strings.Join(parts, "\n")
children = append(children, str)
}
return strings.Join(children, "\n")
}
func (parser *HTMLParser) basicFormatToString(node *html.Node, stripLinebreak bool, ctx Context) string {
str := parser.nodeToTagAwareString(node.FirstChild, stripLinebreak, ctx)
switch node.Data {
case "b", "strong":
if parser.BoldConverter != nil {
return parser.BoldConverter(str, ctx)
}
return fmt.Sprintf("**%s**", str)
case "i", "em":
if parser.ItalicConverter != nil {
return parser.ItalicConverter(str, ctx)
}
return fmt.Sprintf("_%s_", str)
case "s", "del", "strike":
if parser.StrikethroughConverter != nil {
return parser.StrikethroughConverter(str, ctx)
}
return fmt.Sprintf("~~%s~~", str)
case "u", "ins":
if parser.UnderlineConverter != nil {
return parser.UnderlineConverter(str, ctx)
}
case "tt", "code":
if parser.MonospaceConverter != nil {
return parser.MonospaceConverter(str, ctx)
}
return fmt.Sprintf("`%s`", str)
}
return str
}
func (parser *HTMLParser) headerToString(node *html.Node, stripLinebreak bool, ctx Context) string {
children := parser.nodeToStrings(node.FirstChild, stripLinebreak, ctx)
length := int(node.Data[1] - '0')
prefix := strings.Repeat("#", length) + " "
return prefix + strings.Join(children, "")
}
func (parser *HTMLParser) blockquoteToString(node *html.Node, stripLinebreak bool, ctx Context) string {
str := parser.nodeToTagAwareString(node.FirstChild, stripLinebreak, ctx)
childrenArr := strings.Split(strings.TrimSpace(str), "\n")
// TODO make blockquote prefix configurable
for index, child := range childrenArr {
childrenArr[index] = "> " + child
}
return strings.Join(childrenArr, "\n")
}
func (parser *HTMLParser) linkToString(node *html.Node, stripLinebreak bool, ctx Context) string {
str := parser.nodeToTagAwareString(node.FirstChild, stripLinebreak, ctx)
href := parser.getAttribute(node, "href")
if len(href) == 0 {
return str
}
if parser.PillConverter != nil {
parsedMatrix, err := id.ParseMatrixURIOrMatrixToURL(href)
if err == nil && parsedMatrix != nil {
return parser.PillConverter(str, parsedMatrix.PrimaryIdentifier(), parsedMatrix.SecondaryIdentifier(), ctx)
}
}
if str == href {
return str
}
return fmt.Sprintf("%s (%s)", str, href)
}
func (parser *HTMLParser) tagToString(node *html.Node, stripLinebreak bool, ctx Context) string {
switch node.Data {
case "blockquote":
return parser.blockquoteToString(node, stripLinebreak, ctx)
case "ol", "ul":
return parser.listToString(node, stripLinebreak, ctx)
case "h1", "h2", "h3", "h4", "h5", "h6":
return parser.headerToString(node, stripLinebreak, ctx)
case "br":
return parser.Newline
case "b", "strong", "i", "em", "s", "strike", "del", "u", "ins", "tt", "code":
return parser.basicFormatToString(node, stripLinebreak, ctx)
case "a":
return parser.linkToString(node, stripLinebreak, ctx)
case "p":
return parser.nodeToTagAwareString(node.FirstChild, stripLinebreak, ctx)
case "hr":
return parser.HorizontalLine
case "pre":
var preStr, language string
if node.FirstChild != nil && node.FirstChild.Type == html.ElementNode && node.FirstChild.Data == "code" {
class := parser.getAttribute(node.FirstChild, "class")
if strings.HasPrefix(class, "language-") {
language = class[len("language-"):]
}
preStr = parser.nodeToString(node.FirstChild.FirstChild, false, ctx)
} else {
preStr = parser.nodeToString(node.FirstChild, false, ctx)
}
if parser.MonospaceBlockConverter != nil {
return parser.MonospaceBlockConverter(preStr, language, ctx)
}
if len(preStr) == 0 || preStr[len(preStr)-1] != '\n' {
preStr += "\n"
}
return fmt.Sprintf("```%s\n%s```", language, preStr)
default:
return parser.nodeToTagAwareString(node.FirstChild, stripLinebreak, ctx)
}
}
func (parser *HTMLParser) singleNodeToString(node *html.Node, stripLinebreak bool, ctx Context) TaggedString {
switch node.Type {
case html.TextNode:
if stripLinebreak {
node.Data = strings.Replace(node.Data, "\n", "", -1)
}
return TaggedString{node.Data, "text"}
case html.ElementNode:
return TaggedString{parser.tagToString(node, stripLinebreak, ctx), node.Data}
case html.DocumentNode:
return TaggedString{parser.nodeToTagAwareString(node.FirstChild, stripLinebreak, ctx), "html"}
default:
return TaggedString{"", "unknown"}
}
}
func (parser *HTMLParser) nodeToTaggedStrings(node *html.Node, stripLinebreak bool, ctx Context) (strs []TaggedString) {
for ; node != nil; node = node.NextSibling {
strs = append(strs, parser.singleNodeToString(node, stripLinebreak, ctx))
}
return
}
var BlockTags = []string{"p", "h1", "h2", "h3", "h4", "h5", "h6", "ol", "ul", "pre", "blockquote", "div", "hr", "table"}
func (parser *HTMLParser) isBlockTag(tag string) bool {
for _, blockTag := range BlockTags {
if tag == blockTag {
return true
}
}
return false
}
func (parser *HTMLParser) nodeToTagAwareString(node *html.Node, stripLinebreak bool, ctx Context) string {
strs := parser.nodeToTaggedStrings(node, stripLinebreak, ctx)
var output strings.Builder
for _, str := range strs {
tstr := str.string
if parser.isBlockTag(str.tag) {
tstr = fmt.Sprintf("\n%s\n", tstr)
}
output.WriteString(tstr)
}
return strings.TrimSpace(output.String())
}
func (parser *HTMLParser) nodeToStrings(node *html.Node, stripLinebreak bool, ctx Context) (strs []string) {
for ; node != nil; node = node.NextSibling {
strs = append(strs, parser.singleNodeToString(node, stripLinebreak, ctx).string)
}
return
}
func (parser *HTMLParser) nodeToString(node *html.Node, stripLinebreak bool, ctx Context) string {
return strings.Join(parser.nodeToStrings(node, stripLinebreak, ctx), "")
}
// Parse converts Matrix HTML into text using the settings in this parser.
func (parser *HTMLParser) Parse(htmlData string, ctx Context) string {
if parser.TabsToSpaces >= 0 {
htmlData = strings.Replace(htmlData, "\t", strings.Repeat(" ", parser.TabsToSpaces), -1)
}
node, _ := html.Parse(strings.NewReader(htmlData))
return parser.nodeToTagAwareString(node, true, ctx)
}
// HTMLToText converts Matrix HTML into text with the default settings.
func HTMLToText(html string) string {
return (&HTMLParser{
TabsToSpaces: 4,
Newline: "\n",
HorizontalLine: "\n---\n",
PillConverter: DefaultPillConverter,
}).Parse(html, make(Context))
}