From 70383bee128b11451211e514200b23f47bf272db Mon Sep 17 00:00:00 2001
From: bodqhrohro
Date: Thu, 9 Jan 2020 23:16:40 +0200
Subject: [PATCH] Convert formatting entities to Markdown

---
 Makefile                             |   2 +-
 telegram/formatter/formatter.go      | 176 +++++++++++++++++++++
 telegram/formatter/formatter_test.go | 222 +++++++++++++++++++++++++++
 telegram/utils.go                    |  44 +++++-
 4 files changed, 436 insertions(+), 8 deletions(-)
 create mode 100644 telegram/formatter/formatter.go
 create mode 100644 telegram/formatter/formatter_test.go

diff --git a/Makefile b/Makefile
index 6ea87f9..837be39 100644
--- a/Makefile
+++ b/Makefile
@@ -4,7 +4,7 @@ all:
 	go build -o telegabber
 
 test:
-	go test -v ./config ./ ./telegram ./xmpp/gateway ./persistence
+	go test -v ./config ./ ./telegram ./xmpp/gateway ./persistence ./telegram/formatter
 
 lint:
 	$(GOPATH)/bin/golint ./...
diff --git a/telegram/formatter/formatter.go b/telegram/formatter/formatter.go
new file mode 100644
index 0000000..4b26f83
--- /dev/null
+++ b/telegram/formatter/formatter.go
@@ -0,0 +1,176 @@
+package formatter
+
+import (
+	"sort"
+
+	log "github.com/sirupsen/logrus"
+	"github.com/zelenin/go-tdlib/client"
+)
+
+// Insertion is a piece of text to be inserted at the given position;
+// Offset is measured in UTF-16 code units (see Format)
+type Insertion struct {
+	Offset int32
+	Runes  []rune
+}
+
+// InsertionStack contains the sequence of insertions
+// from the start or from the end
+type InsertionStack []*Insertion
+
+// Markdown delimiters emitted around the formatted spans
+var boldRunes = []rune("**")
+var italicRunes = []rune("_")
+var codeRunes = []rune("\n```\n")
+var urlRuneL = []rune("[")
+
+// rebalance pumps all the values at given offset to current stack (growing
+// from start) from given stack (growing from end); should be called
+// before any insertions to the current stack at the given offset
+func (s InsertionStack) rebalance(s2 InsertionStack, offset int32) (InsertionStack, InsertionStack) {
+	for len(s2) > 0 && s2[len(s2)-1].Offset <= offset {
+		s = append(s, s2[len(s2)-1])
+		s2 = s2[:len(s2)-1]
+	}
+
+	return s, s2
+}
+
+// NewIterator is a higher-order function that returns a closure sequentially
+// yielding the stack elements; the closure returns nil once they are exhausted
+func (s InsertionStack) NewIterator() func() *Insertion {
+	i := -1
+
+	return func() *Insertion {
+		i++
+		if i < len(s) {
+			return s[i]
+		}
+		return nil
+	}
+}
+
+// SortEntities returns a copy of the entities arranged in traversal-ready
+// order: by ascending offset, and for equal offsets the longer (outer)
+// entity goes first; the passed slice is left untouched
+func SortEntities(entities []*client.TextEntity) []*client.TextEntity {
+	sortedEntities := make([]*client.TextEntity, len(entities))
+	copy(sortedEntities, entities)
+
+	// the indices passed by sort.Slice refer to the slice being sorted,
+	// so the comparator must read sortedEntities, not the source slice
+	sort.Slice(sortedEntities, func(i int, j int) bool {
+		entity1 := sortedEntities[i]
+		entity2 := sortedEntities[j]
+		if entity1.Offset != entity2.Offset {
+			return entity1.Offset < entity2.Offset
+		}
+		return entity1.Length > entity2.Length
+	})
+	return sortedEntities
+}
+
+// markupBraces wraps the entity into a pair of insertions: lbrace at the
+// entity start and rbrace right after its end
+func markupBraces(entity *client.TextEntity, lbrace, rbrace []rune) (*Insertion, *Insertion) {
+	return &Insertion{
+			Offset: entity.Offset,
+			Runes:  lbrace,
+		}, &Insertion{
+			Offset: entity.Offset + entity.Length,
+			Runes:  rbrace,
+		}
+}
+
+// EntityToMarkdown generates the wrapping Markdown tags;
+// returns a pair of nils for the entity types it does not handle
+func EntityToMarkdown(entity *client.TextEntity) (*Insertion, *Insertion) {
+	switch entity.Type.TextEntityTypeType() {
+	case client.TypeTextEntityTypeBold:
+		return markupBraces(entity, boldRunes, boldRunes)
+	case client.TypeTextEntityTypeItalic:
+		return markupBraces(entity, italicRunes, italicRunes)
+	case client.TypeTextEntityTypeCode, client.TypeTextEntityTypePre:
+		return markupBraces(entity, codeRunes, codeRunes)
+	case client.TypeTextEntityTypePreCode:
+		preCode, _ := entity.Type.(*client.TextEntityTypePreCode)
+		return markupBraces(entity, []rune("\n```"+preCode.Language+"\n"), codeRunes)
+	case client.TypeTextEntityTypeTextUrl:
+		textURL, _ := entity.Type.(*client.TextEntityTypeTextUrl)
+		return markupBraces(entity, urlRuneL, []rune("]("+textURL.Url+")"))
+	}
+
+	return nil, nil
+}
+
+// Format traverses an already sorted list of entities (see SortEntities)
+// and wraps the text in the markup produced by entityToMarkup; the entity
+// offsets and lengths are treated as UTF-16 code units
+func Format(
+	sourceText string,
+	entities []*client.TextEntity,
+	entityToMarkup func(*client.TextEntity) (*Insertion, *Insertion),
+) string {
+	if len(entities) == 0 {
+		return sourceText
+	}
+
+	// every entity yields at most one opening and one closing insertion,
+	// and all the closing ones are eventually moved to startStack
+	startStack := make(InsertionStack, 0, 2*len(entities))
+	endStack := make(InsertionStack, 0, len(entities))
+
+	// convert entities to a stack of brackets
+	var maxEndOffset int32
+	for _, entity := range entities {
+		log.Debugf("%#v", entity)
+		if entity.Length <= 0 {
+			continue
+		}
+
+		endOffset := entity.Offset + entity.Length
+		if endOffset > maxEndOffset {
+			maxEndOffset = endOffset
+		}
+
+		startStack, endStack = startStack.rebalance(endStack, entity.Offset)
+
+		startInsertion, endInsertion := entityToMarkup(entity)
+		if startInsertion != nil {
+			startStack = append(startStack, startInsertion)
+		}
+		if endInsertion != nil {
+			endStack = append(endStack, endInsertion)
+		}
+	}
+	// flush the closing brackets that still remain in endStack
+	startStack, _ = startStack.rebalance(endStack, maxEndOffset)
+
+	// merge brackets into text
+	markupRunes := make([]rune, 0, len(sourceText))
+
+	nextInsertion := startStack.NewIterator()
+	insertion := nextInsertion()
+	var runeI int32
+
+	for _, cp := range sourceText {
+		for insertion != nil && insertion.Offset <= runeI {
+			markupRunes = append(markupRunes, insertion.Runes...)
+			insertion = nextInsertion()
+		}
+
+		markupRunes = append(markupRunes, cp)
+		// skip two UTF-16 code units (not points actually!) if needed
+		if cp > 0x0000ffff {
+			runeI += 2
+		} else {
+			runeI++
+		}
+	}
+	for insertion != nil {
+		markupRunes = append(markupRunes, insertion.Runes...)
+		insertion = nextInsertion()
+	}
+
+	return string(markupRunes)
+}
diff --git a/telegram/formatter/formatter_test.go b/telegram/formatter/formatter_test.go
new file mode 100644
index 0000000..63337d6
--- /dev/null
+++ b/telegram/formatter/formatter_test.go
@@ -0,0 +1,222 @@
+package formatter
+
+import (
+	"testing"
+
+	"github.com/zelenin/go-tdlib/client"
+)
+
+func TestNoFormatting(t *testing.T) {
+	markup := Format("abc\ndef", []*client.TextEntity{}, EntityToMarkdown)
+	if markup != "abc\ndef" {
+		t.Errorf("No formatting expected, but: %v", markup)
+	}
+}
+
+func TestFormattingSimple(t *testing.T) {
+	markup := Format("👙🐧🐖", []*client.TextEntity{
+		&client.TextEntity{
+			Offset: 2,
+			Length: 4,
+			Type:   &client.TextEntityTypeBold{},
+		},
+	}, EntityToMarkdown)
+	if markup != "👙**🐧🐖**" {
+		t.Errorf("Wrong simple formatting: %v", markup)
+	}
+}
+
+func TestFormattingAdjacent(t *testing.T) {
+	markup := Format("a👙🐧🐖", []*client.TextEntity{
+		&client.TextEntity{
+			Offset: 3,
+			Length: 2,
+			Type:   &client.TextEntityTypeItalic{},
+		},
+		&client.TextEntity{
+			Offset: 5,
+			Length: 2,
+			Type: &client.TextEntityTypeTextUrl{
+				Url: "https://narayana.im/",
+			},
+		},
+	}, EntityToMarkdown)
+	if markup != "a👙_🐧_[🐖](https://narayana.im/)" {
+		t.Errorf("Wrong adjacent formatting: %v", markup)
+	}
+}
+
+func TestFormattingAdjacentAndNested(t *testing.T) {
+	markup := Format("👙🐧🐖", []*client.TextEntity{
+		&client.TextEntity{
+			Offset: 0,
+			Length: 4,
+			Type:   &client.TextEntityTypePre{},
+		},
+		&client.TextEntity{
+			Offset: 0,
+			Length: 2,
+			Type:   &client.TextEntityTypeBold{},
+		},
+		&client.TextEntity{
+			Offset: 4,
+			Length: 2,
+			Type:   &client.TextEntityTypeItalic{},
+		},
+	}, EntityToMarkdown)
+	if markup != "\n```\n**👙**🐧\n```\n_🐖_" {
+		t.Errorf("Wrong adjacent&nested formatting: %v", markup)
+	}
+}
+
+func TestRebalanceTwoZero(t *testing.T) {
+	s1 := InsertionStack{
+		&Insertion{Offset: 7},
+		&Insertion{Offset: 8},
+	}
+	s2 := InsertionStack{}
+	s1, s2 = s1.rebalance(s2, 7)
+	if !(len(s1) == 2 && len(s2) == 0 && s1[0].Offset == 7 && s1[1].Offset == 8) {
+		t.Errorf("Wrong rebalance 2–0: %#v %#v", s1, s2)
+	}
+}
+
+func TestRebalanceNeeded(t *testing.T) {
+	s1 := InsertionStack{
+		&Insertion{Offset: 7},
+		&Insertion{Offset: 8},
+	}
+	s2 := InsertionStack{
+		&Insertion{Offset: 10},
+		&Insertion{Offset: 9},
+	}
+	s1, s2 = s1.rebalance(s2, 9)
+	if !(len(s1) == 3 && len(s2) == 1 &&
+		s1[0].Offset == 7 && s1[1].Offset == 8 && s1[2].Offset == 9 &&
+		s2[0].Offset == 10) {
+		t.Errorf("Wrong rebalance when needed: %#v %#v", s1, s2)
+	}
+}
+
+func TestRebalanceNotNeeded(t *testing.T) {
+	s1 := InsertionStack{
+		&Insertion{Offset: 7},
+		&Insertion{Offset: 8},
+	}
+	s2 := InsertionStack{
+		&Insertion{Offset: 10},
+		&Insertion{Offset: 9},
+	}
+	s1, s2 = s1.rebalance(s2, 8)
+	if !(len(s1) == 2 && len(s2) == 2 &&
+		s1[0].Offset == 7 && s1[1].Offset == 8 &&
+		s2[0].Offset == 10 && s2[1].Offset == 9) {
+		t.Errorf("Wrong rebalance when not needed: %#v %#v", s1, s2)
+	}
+}
+
+func TestRebalanceLate(t *testing.T) {
+	s1 := InsertionStack{
+		&Insertion{Offset: 7},
+		&Insertion{Offset: 8},
+	}
+	s2 := InsertionStack{
+		&Insertion{Offset: 10},
+		&Insertion{Offset: 9},
+	}
+	s1, s2 = s1.rebalance(s2, 10)
+	if !(len(s1) == 4 && len(s2) == 0 &&
+		s1[0].Offset == 7 && s1[1].Offset == 8 &&
+		s1[2].Offset == 9 && s1[3].Offset == 10) {
+		t.Errorf("Wrong rebalance when late: %#v %#v", s1, s2)
+	}
+}
+
+func TestIteratorEmpty(t *testing.T) {
+	s := InsertionStack{}
+	g := s.NewIterator()
+	v := g()
+	if v != nil {
+		t.Errorf("Empty iterator should return nil but returned %#v", v)
+	}
+}
+
+func TestIterator(t *testing.T) {
+	s := InsertionStack{
+		&Insertion{Offset: 7},
+		&Insertion{Offset: 8},
+	}
+	g := s.NewIterator()
+	v := g()
+	if v == nil || v.Offset != 7 {
+		t.Errorf("Wrong insertion instead of 7: %#v", v)
+	}
+	v = g()
+	if v == nil || v.Offset != 8 {
+		t.Errorf("Wrong insertion instead of 8: %#v", v)
+	}
+	v = g()
+	if v != nil {
+		t.Errorf("nil should be returned after end, %#v instead", v)
+	}
+	v = g()
+	if v != nil {
+		t.Errorf("Further attempts should return nil too, %#v instead", v)
+	}
+}
+
+func TestSortEntities(t *testing.T) {
+	entities := []*client.TextEntity{
+		&client.TextEntity{
+			Offset: 3,
+			Length: 2,
+		},
+		&client.TextEntity{
+			Offset: 5,
+			Length: 2,
+		},
+		&client.TextEntity{
+			Offset: 7,
+			Length: 2,
+		},
+		&client.TextEntity{
+			Offset: 6,
+			Length: 1,
+		},
+		&client.TextEntity{
+			Offset: 5,
+			Length: 1,
+		},
+	}
+	entities = SortEntities(entities)
+	if !(len(entities) == 5 &&
+		entities[0].Offset == 3 && entities[0].Length == 2 &&
+		entities[1].Offset == 5 && entities[1].Length == 2 &&
+		entities[2].Offset == 5 && entities[2].Length == 1 &&
+		entities[3].Offset == 6 && entities[3].Length == 1 &&
+		entities[4].Offset == 7 && entities[4].Length == 2) {
+		t.Errorf("Wrong sorting order: %#v", entities)
+	}
+}
+
+func TestSortEntitiesUnordered(t *testing.T) {
+	entities := SortEntities([]*client.TextEntity{
+		&client.TextEntity{Offset: 5, Length: 1},
+		&client.TextEntity{Offset: 3, Length: 1},
+		&client.TextEntity{Offset: 4, Length: 1},
+	})
+	if !(len(entities) == 3 &&
+		entities[0].Offset == 3 &&
+		entities[1].Offset == 4 &&
+		entities[2].Offset == 5) {
+		t.Errorf("Wrong sorting order for unordered input: %#v", entities)
+	}
+}
+
+func TestSortEmpty(t *testing.T) {
+	entities := []*client.TextEntity{}
+	entities = SortEntities(entities)
+	if len(entities) != 0 {
+		t.Errorf("Empty entities set sorting error: %#v", entities)
+	}
+}
diff --git a/telegram/utils.go b/telegram/utils.go
index 8de1f5f..f7e7a28 100644
--- a/telegram/utils.go
+++ b/telegram/utils.go
@@ -15,6 +15,7 @@ import (
 	"time"
 
 	"dev.narayana.im/narayana/telegabber/telegram/cache"
+	"dev.narayana.im/narayana/telegabber/telegram/formatter"
 	"dev.narayana.im/narayana/telegabber/xmpp/gateway"
 
 	log "github.com/sirupsen/logrus"
@@ -281,6 +282,7 @@ func (c *Client) formatContent(file *client.File, filename string) string {
 }
 
 func (c *Client) messageToText(message *client.Message) string {
+	markupFunction := formatter.EntityToMarkdown
 	switch message.Content.MessageContentType() {
 	case client.TypeMessageSticker:
 		sticker, _ := message.Content.(*client.MessageSticker)
@@ -318,27 +320,55 @@ func (c *Client) messageToText(message *client.Message) string {
 		)
 	case client.TypeMessagePhoto:
 		photo, _ := message.Content.(*client.MessagePhoto)
-		return photo.Caption.Text
+		return formatter.Format(
+			photo.Caption.Text,
+			formatter.SortEntities(photo.Caption.Entities),
+			markupFunction,
+		)
 	case client.TypeMessageAudio:
 		audio, _ := message.Content.(*client.MessageAudio)
-		return audio.Caption.Text
+		return formatter.Format(
+			audio.Caption.Text,
+			formatter.SortEntities(audio.Caption.Entities),
+			markupFunction,
+		)
 	case client.TypeMessageVideo:
 		video, _ := message.Content.(*client.MessageVideo)
-		return video.Caption.Text
+		return formatter.Format(
+			video.Caption.Text,
+			formatter.SortEntities(video.Caption.Entities),
+			markupFunction,
+		)
 	case client.TypeMessageDocument:
 		document, _ := message.Content.(*client.MessageDocument)
-		return document.Caption.Text
+		return formatter.Format(
+			document.Caption.Text,
+			formatter.SortEntities(document.Caption.Entities),
+			markupFunction,
+		)
 	case client.TypeMessageText:
 		text, _ := message.Content.(*client.MessageText)
-		return text.Text.Text
+		return formatter.Format(
+			text.Text.Text,
+			formatter.SortEntities(text.Text.Entities),
+			markupFunction,
+		)
 	case client.TypeMessageVoiceNote:
 		voice, _ := message.Content.(*client.MessageVoiceNote)
-		return voice.Caption.Text
+		return formatter.Format(
+			voice.Caption.Text,
+			formatter.SortEntities(voice.Caption.Entities),
+			markupFunction,
+		)
 	case client.TypeMessageVideoNote:
 		return ""
 	case client.TypeMessageAnimation:
 		animation, _ := message.Content.(*client.MessageAnimation)
-		return animation.Caption.Text
+		return formatter.Format(
+			animation.Caption.Text,
+			formatter.SortEntities(animation.Caption.Entities),
+			markupFunction,
+		)
 	}
 
 	return fmt.Sprintf("unknown message (%s)", message.Content.MessageContentType())