// telegabber/telegram/formatter/formatter.go
// 2023-11-16 08:06:21 -05:00
//
// 445 lines
// 12 KiB
// Go

package formatter
import (
"sort"
"unicode"
log "github.com/sirupsen/logrus"
"github.com/zelenin/go-tdlib/client"
)
// insertionType tells which role an insertion plays relative to the span
// it decorates; Format uses it to order insertions sharing one offset.
type insertionType int
const (
// insertionOpening is the left brace of a paired markup (see markupBraces)
insertionOpening insertionType = iota
// insertionClosing is the right brace of a paired markup; it is also used
// for the trailing insertion of a block quote
insertionClosing
// insertionUnpaired is a standalone mark with no counterpart, e.g. the
// quote prefix put at the start of each quoted line
insertionUnpaired
)
// MarkupModeType selects the markup dialect that Format emits.
type MarkupModeType int
const (
// MarkupModeXEP0393 selects XEP-0393 directives (single "*" for bold,
// single "~" for strikethrough)
MarkupModeXEP0393 MarkupModeType = iota
// MarkupModeMarkdown selects Markdown directives ("**" for bold,
// "~~" for strikethrough)
MarkupModeMarkdown
)
// insertion is a piece of markup text to be injected at a given position
// of the source text
type insertion struct {
// Offset is the index in the doubled-rune text (see textToDoubledRunes)
// before which Runes are emitted
Offset int32
// Runes is the markup to emit; may be empty
Runes []rune
// Type is the role of this insertion (opening, closing or unpaired)
Type insertionType
}
// insertionStack contains the sequence of insertions, growing either
// from the start or from the end; Format keeps one stack of each kind
// and moves elements between them with rebalance
type insertionStack []*insertion
// Markup fragments emitted around (or inside) formatted spans.
var (
	// bold and strikethrough differ between the two dialects
	boldRunesMarkdown   = []rune("**")
	boldRunesXEP0393    = []rune("*")
	strikeRunesMarkdown = []rune("~~")
	strikeRunesXEP0393  = []rune("~")

	// directives shared by both dialects
	italicRunes  = []rune("_")
	codeRunes    = []rune("`")
	preRuneStart = []rune("```\n")
	preRuneEnd   = []rune("\n```")
	quoteRunes   = []rune("> ")

	// separators put after a block quote
	newlineRunes       = []rune("\n")
	doubleNewlineRunes = []rune("\n\n")
)

// Special code points the formatter compares against.
var (
	// newlineCode is the line feed character
	newlineCode rune = '\n'
	// bmpCeil is the last code point of the Basic Multilingual Plane
	bmpCeil rune = 0xffff
)
// rebalance moves insertions from the top of s2 (a stack growing from the
// end) onto s (a stack growing from the start) for as long as the top of
// s2 sits at or before the given offset. It must be called before anything
// is pushed to s at that offset. Both updated stacks are returned.
func (s insertionStack) rebalance(s2 insertionStack, offset int32) (insertionStack, insertionStack) {
	for {
		top := len(s2) - 1
		if top < 0 || s2[top].Offset > offset {
			return s, s2
		}
		s = append(s, s2[top])
		s2 = s2[:top]
	}
}
// NewIterator returns a closure that yields the stack elements one by one,
// starting from the first; once the elements are exhausted every further
// call yields nil.
func (s insertionStack) NewIterator() func() *insertion {
	next := 0
	return func() *insertion {
		if next >= len(s) {
			return nil
		}
		current := s[next]
		next++
		return current
	}
}
// SortEntities returns a copy of the entity list in traversal-ready order:
// ascending by offset, and for equal offsets the longer entity first, so
// that an enclosing entity precedes the ones nested in it. The input slice
// itself is not reordered.
func SortEntities(entities []*client.TextEntity) []*client.TextEntity {
	ordered := make([]*client.TextEntity, len(entities))
	copy(ordered, entities)

	byPosition := func(a, b int) bool {
		left, right := ordered[a], ordered[b]
		if left.Offset != right.Offset {
			return left.Offset < right.Offset
		}
		return left.Length > right.Length
	}
	sort.Slice(ordered, byPosition)

	return ordered
}
// MergeAdjacentEntities merges entities of the same kind that directly
// follow each other (the next one starts exactly where the previous one
// ends) into a single, longer entity.
//
// A merge is skipped when the widened span would partially overlap any
// other entity, because that would produce improperly nested markup; in
// that case all involved entities are kept as they are. Entities that are
// nil or have no type are dropped from the result.
//
// The input entities are never modified: a merged entity is a widened copy
// of the first entity of the run. This matters because the absorbed
// continuations are only removed from the returned slice, so widening the
// caller's entity in place would make a second pass over the same input
// see both the widened entity and its former continuation.
func MergeAdjacentEntities(entities []*client.TextEntity) []*client.TextEntity {
	mergedEntities := make([]*client.TextEntity, 0, len(entities))
	excludedIndices := make(map[int]bool)

	for i, entity := range entities {
		if excludedIndices[i] || entity == nil || entity.Type == nil {
			continue
		}

		typ := entity.Type.TextEntityTypeType()
		start := entity.Offset
		end := start + entity.Length

		// collect continuations: later entities of the same type that
		// begin right at the current end of the run
		var continuations []int
		for j := i + 1; j < len(entities); j++ {
			entity2 := entities[j]
			if entity2 == nil || entity2.Type == nil {
				continue
			}
			if entity2.Type.TextEntityTypeType() == typ && entity2.Offset == end {
				end += entity2.Length
				continuations = append(continuations, j)
			}
		}

		if len(continuations) > 0 {
			// check for intersections of the widened span with other entities
			var isIntersecting bool
			for _, entity2 := range entities {
				if entity2 == nil {
					continue
				}
				entity2End := entity2.Offset + entity2.Length
				if (entity2.Offset < start && entity2End > start && entity2End < end) ||
					(entity2.Offset > start && entity2.Offset < end && entity2End > end) {
					isIntersecting = true
					break
				}
			}

			if !isIntersecting {
				// widen a copy, leaving the caller's entity untouched
				merged := *entity
				merged.Length = end - start
				entity = &merged

				for _, j := range continuations {
					excludedIndices[j] = true
				}
			}
		}

		mergedEntities = append(mergedEntities, entity)
	}

	return mergedEntities
}
// ClaspDirectives shrinks every entity so that it neither starts nor ends
// with whitespace, making the styling directives clasp to the span they
// decorate, as required by XEP-0393.
//
// doubledRunes is the text the entity offsets index into (see
// textToDoubledRunes). The returned slice has the same length and order as
// entities. The input entities are never modified: an entity that needs
// trimming is replaced in the result by an adjusted copy, all others are
// passed through as is.
//
// Entities are passed through untouched when they are nil, have a
// non-positive length, point outside of doubledRunes, or consist of
// whitespace only (there is nothing to clasp to).
func ClaspDirectives(doubledRunes []rune, entities []*client.TextEntity) []*client.TextEntity {
	alignedEntities := make([]*client.TextEntity, len(entities))
	copy(alignedEntities, entities)

	for i, entity := range alignedEntities {
		if entity == nil || entity.Offset < 0 || entity.Length <= 0 {
			continue
		}

		// computed as int to stay clear of int32 overflow on bogus input
		start := int(entity.Offset)
		end := start + int(entity.Length)
		if end > len(doubledRunes) {
			continue
		}

		// skip the leading whitespace
		trimmedStart := start
		for trimmedStart < end && unicode.IsSpace(doubledRunes[trimmedStart]) {
			trimmedStart++
		}
		if trimmedStart == end {
			// whitespace only
			continue
		}

		// skip the trailing whitespace; the rune at trimmedStart is known
		// to be a non-space, so this loop stops there at the latest
		trimmedEnd := end
		for unicode.IsSpace(doubledRunes[trimmedEnd-1]) {
			trimmedEnd--
		}

		if trimmedStart == start && trimmedEnd == end {
			continue
		}

		// adjust a copy, leaving the caller's entity untouched
		trimmed := *entity
		trimmed.Offset = int32(trimmedStart)
		trimmed.Length = int32(trimmedEnd - trimmedStart)
		alignedEntities[i] = &trimmed
	}

	return alignedEntities
}
// markupBraces builds the pair of insertions that wrap an entity:
// lbrace goes to the entity start, rbrace goes right after its end.
func markupBraces(entity *client.TextEntity, lbrace, rbrace []rune) []*insertion {
	opening := &insertion{
		Offset: entity.Offset,
		Runes:  lbrace,
		Type:   insertionOpening,
	}
	closing := &insertion{
		Offset: entity.Offset + entity.Length,
		Runes:  rbrace,
		Type:   insertionClosing,
	}
	return []*insertion{opening, closing}
}
// quotePrependNewlines builds the insertions that turn an entity into a
// block quote: a quote mark at its start (preceded by a newline unless the
// quote already begins a line), quote marks inside it, and a final closing
// insertion that separates the quote from the text following it.
//
// In XEP-0393 mode a quote mark follows every newline inside the quote; in
// Markdown mode only a newline that directly follows another newline gets
// one. An empty doubledRunes yields no insertions at all.
func quotePrependNewlines(entity *client.TextEntity, doubledRunes []rune, markupMode MarkupModeType) []*insertion {
	total := len(doubledRunes)
	if total == 0 {
		return []*insertion{}
	}

	first := entity.Offset
	stop := entity.Offset + entity.Length

	// the quote has to start on a fresh line
	opener := quoteRunes
	if first != 0 && doubledRunes[first-1] != newlineCode {
		opener = []rune("\n> ")
	}
	result := []*insertion{
		{Offset: first, Runes: opener, Type: insertionUnpaired},
	}

	// the last rune is left out: no quote mark is needed after the quote
	prevNewline := false
	for pos := first; pos < stop-1; pos++ {
		curNewline := doubledRunes[pos] == newlineCode

		var needsMark bool
		switch markupMode {
		case MarkupModeXEP0393:
			needsMark = curNewline
		case MarkupModeMarkdown:
			needsMark = curNewline && prevNewline
		}
		if needsMark {
			result = append(result, &insertion{
				Offset: pos + 1,
				Runes:  quoteRunes,
				Type:   insertionUnpaired,
			})
		}

		prevNewline = curNewline
	}

	// pick the separator between the quote and whatever follows it;
	// nothing is needed when the quote ends the text
	var closer []rune
	if stopIdx := int(stop); stopIdx < total {
		followedByNewline := doubledRunes[stop] == newlineCode
		switch {
		case !followedByNewline && markupMode == MarkupModeMarkdown:
			closer = doubleNewlineRunes
		case !followedByNewline:
			closer = newlineRunes
		case markupMode == MarkupModeMarkdown && stopIdx+1 < total && doubledRunes[stopIdx+1] != newlineCode:
			// a single newline is already there, Markdown wants two
			closer = newlineRunes
		}
	}

	return append(result, &insertion{
		Offset: stop,
		Runes:  closer,
		Type:   insertionClosing,
	})
}
// entityToMarkdown generates the Markdown insertions that wrap the given
// entity. The last element of the result is the closing insertion, all
// preceding ones belong to the opening side.
//
// doubledRunes is only consulted for block quotes. The markupMode argument
// is ignored (Markdown is always used); it exists so that the signature
// matches entityToXEP0393. A nil entity, an entity without a type and any
// unsupported entity type yield an empty result.
func entityToMarkdown(entity *client.TextEntity, doubledRunes []rune, markupMode MarkupModeType) []*insertion {
	if entity == nil || entity.Type == nil {
		return []*insertion{}
	}

	switch entity.Type.TextEntityTypeType() {
	case client.TypeTextEntityTypeBold:
		return markupBraces(entity, boldRunesMarkdown, boldRunesMarkdown)
	case client.TypeTextEntityTypeItalic:
		return markupBraces(entity, italicRunes, italicRunes)
	case client.TypeTextEntityTypeStrikethrough:
		return markupBraces(entity, strikeRunesMarkdown, strikeRunesMarkdown)
	case client.TypeTextEntityTypeCode:
		return markupBraces(entity, codeRunes, codeRunes)
	case client.TypeTextEntityTypePre:
		return markupBraces(entity, preRuneStart, preRuneEnd)
	case client.TypeTextEntityTypePreCode:
		preCode, ok := entity.Type.(*client.TextEntityTypePreCode)
		if !ok || preCode == nil {
			// the type tag disagrees with the concrete type: no language
			// is available, fall back to a plain preformatted block
			return markupBraces(entity, preRuneStart, preRuneEnd)
		}
		// the block is opened with a full fence, so it has to be closed
		// with a full fence on its own line, not with an inline backtick
		return markupBraces(entity, []rune("\n```"+preCode.Language+"\n"), preRuneEnd)
	case client.TypeTextEntityTypeBlockQuote:
		return quotePrependNewlines(entity, doubledRunes, MarkupModeMarkdown)
	case client.TypeTextEntityTypeTextUrl:
		textURL, ok := entity.Type.(*client.TextEntityTypeTextUrl)
		if !ok || textURL == nil {
			// no URL to link to, leave the text undecorated
			return []*insertion{}
		}
		return markupBraces(entity, []rune("["), []rune("]("+textURL.Url+")"))
	}

	return []*insertion{}
}
// entityToXEP0393 generates the XEP-0393 insertions that wrap the given
// entity. The last element of the result is the closing insertion, all
// preceding ones belong to the opening side.
//
// doubledRunes is only consulted for block quotes. The markupMode argument
// is ignored (XEP-0393 is always used); it exists so that the signature
// matches entityToMarkdown. A nil entity, an entity without a type and any
// unsupported entity type yield an empty result.
func entityToXEP0393(entity *client.TextEntity, doubledRunes []rune, markupMode MarkupModeType) []*insertion {
	if entity == nil || entity.Type == nil {
		return []*insertion{}
	}

	switch entity.Type.TextEntityTypeType() {
	case client.TypeTextEntityTypeBold:
		return markupBraces(entity, boldRunesXEP0393, boldRunesXEP0393)
	case client.TypeTextEntityTypeItalic:
		return markupBraces(entity, italicRunes, italicRunes)
	case client.TypeTextEntityTypeStrikethrough:
		return markupBraces(entity, strikeRunesXEP0393, strikeRunesXEP0393)
	case client.TypeTextEntityTypeCode:
		return markupBraces(entity, codeRunes, codeRunes)
	case client.TypeTextEntityTypePre:
		return markupBraces(entity, preRuneStart, preRuneEnd)
	case client.TypeTextEntityTypePreCode:
		preCode, ok := entity.Type.(*client.TextEntityTypePreCode)
		if !ok || preCode == nil {
			// the type tag disagrees with the concrete type: no language
			// is available, fall back to a plain preformatted block
			return markupBraces(entity, preRuneStart, preRuneEnd)
		}
		// the block is opened with a full fence, so it has to be closed
		// with a full fence on its own line, not with an inline backtick
		return markupBraces(entity, []rune("\n```"+preCode.Language+"\n"), preRuneEnd)
	case client.TypeTextEntityTypeBlockQuote:
		return quotePrependNewlines(entity, doubledRunes, MarkupModeXEP0393)
	case client.TypeTextEntityTypeTextUrl:
		textURL, ok := entity.Type.(*client.TextEntityTypeTextUrl)
		if !ok || textURL == nil {
			// no URL to show, leave the text undecorated
			return []*insertion{}
		}
		// non-standard, Pidgin-specific
		return markupBraces(entity, []rune{}, []rune(" <"+textURL.Url+">"))
	}

	return []*insertion{}
}
// textToDoubledRunes converts the text to a rune slice in which every code
// point beyond the Basic Multilingual Plane is stored twice in a row, while
// all other code points are stored once.
func textToDoubledRunes(text string) []rune {
	out := make([]rune, 0, len(text)*2)
	for _, r := range text {
		out = append(out, r)
		if r > bmpCeil {
			// one more slot for a code point above the BMP
			out = append(out, r)
		}
	}
	return out
}
// Format wraps sourceText in the markup described by entities and returns
// the resulting string.
//
// The entities do not have to be sorted by the caller: they are sorted,
// merged (MergeAdjacentEntities), trimmed (ClaspDirectives) and sorted again
// here. markupMode selects the dialect: MarkupModeXEP0393 produces XEP-0393
// directives, any other value produces Markdown. When there are no entities,
// sourceText is returned unchanged.
//
// Entity offsets are applied to the doubled-rune form of the text (see
// textToDoubledRunes), where a code point above the BMP takes two positions
// (presumably to match offsets counted in UTF-16 code units; confirm against
// the entity producer).
func Format(
sourceText string,
entities []*client.TextEntity,
markupMode MarkupModeType,
) string {
if len(entities) == 0 {
return sourceText
}
// pick the per-entity converter for the requested dialect
var entityToMarkup func(*client.TextEntity, []rune, MarkupModeType) []*insertion
if markupMode == MarkupModeXEP0393 {
entityToMarkup = entityToXEP0393
} else {
entityToMarkup = entityToMarkdown
}
doubledRunes := textToDoubledRunes(sourceText)
// sort -> merge adjacent -> trim whitespace -> sort again, as trimming
// changes offsets and lengths
mergedEntities := SortEntities(ClaspDirectives(doubledRunes, MergeAdjacentEntities(SortEntities(entities))))
// startStack accumulates the insertions in output order; endStack holds
// the closing insertions that are not due yet
startStack := make(insertionStack, 0, len(sourceText))
endStack := make(insertionStack, 0, len(sourceText))
// convert entities to a stack of brackets
var maxEndOffset int32
for _, entity := range mergedEntities {
log.Debugf("%#v", entity)
// entities without any extent produce no markup
if entity.Length <= 0 {
continue
}
endOffset := entity.Offset + entity.Length
if endOffset > maxEndOffset {
maxEndOffset = endOffset
}
// emit the pending closings located at or before this entity's start
startStack, endStack = startStack.rebalance(endStack, entity.Offset)
insertions := entityToMarkup(entity, doubledRunes, markupMode)
// the last insertion is the closing one and has to wait in endStack;
// everything before it is emitted right away
if len(insertions) > 1 {
startStack = append(startStack, insertions[0:len(insertions)-1]...)
}
if len(insertions) > 0 {
endStack = append(endStack, insertions[len(insertions)-1])
}
}
// flush the closing brackets that still remain in endStack
startStack, endStack = startStack.rebalance(endStack, maxEndOffset)
// sort unpaired insertions: they were pushed together with their entity's
// opening and have to be moved to their own offsets. The comparator only
// ever reports an order for pairs involving an unpaired insertion; two
// paired insertions always compare as "not less" (the final return false)
sort.SliceStable(startStack, func(i int, j int) bool {
ins1 := startStack[i]
ins2 := startStack[j]
if ins1.Type == insertionUnpaired && ins2.Type == insertionUnpaired {
return ins1.Offset < ins2.Offset
}
// at the same offset an unpaired insertion goes before an opening
// and after a closing one
if ins1.Type == insertionUnpaired {
if ins1.Offset == ins2.Offset {
if ins2.Type == insertionOpening { // > **
return true
} else if ins2.Type == insertionClosing { // **>
return false
}
} else {
return ins1.Offset < ins2.Offset
}
}
// the mirrored case: the second insertion is the unpaired one
if ins2.Type == insertionUnpaired {
if ins1.Offset == ins2.Offset {
if ins1.Type == insertionOpening { // > **
return false
} else if ins1.Type == insertionClosing { // **>
return true
}
} else {
return ins1.Offset < ins2.Offset
}
}
return false
})
// merge brackets into text
markupRunes := make([]rune, 0, len(sourceText))
nextInsertion := startStack.NewIterator()
insertion := nextInsertion()
// skipNext drops the duplicate of a doubled rune from the output
var skipNext bool
for i, cp := range doubledRunes {
if skipNext {
skipNext = false
continue
}
// emit every insertion due at or before the current position
for insertion != nil && int(insertion.Offset) <= i {
markupRunes = append(markupRunes, insertion.Runes...)
insertion = nextInsertion()
}
markupRunes = append(markupRunes, cp)
// skip two UTF-16 code units (not points actually!) if needed
if cp > bmpCeil {
skipNext = true
}
}
// emit the insertions located at or past the end of the text
for insertion != nil {
markupRunes = append(markupRunes, insertion.Runes...)
insertion = nextInsertion()
}
return string(markupRunes)
}