Skip to content

Commit a6c1c02

Browse files
committed
Do not insert a space in front of certain punctuation marks such as '.', '?' or ')'.
An opening '(' still gets a space in front of it, but no space is inserted after it.
1 parent 7d64848 commit a6c1c02

2 files changed

Lines changed: 32 additions & 5 deletions

File tree

html2text.go

Lines changed: 24 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -72,8 +72,8 @@ func FromHTMLNode(doc *html.Node, o ...Options) (string, error) {
7272
}
7373

7474
ctx := textifyTraverseContext{
75-
buf: bytes.Buffer{},
76-
options: options,
75+
buf: bytes.Buffer{},
76+
options: options,
7777
citationMap: map[string]int{},
7878
}
7979
if err := ctx.traverse(doc); err != nil {
@@ -431,6 +431,25 @@ func (ctx *textifyTraverseContext) traverseChildren(node *html.Node) error {
431431
return nil
432432
}
433433

434+
// punctNoSpaceBefore reports whether r is a punctuation character that should
// not have a space inserted in front of it: one of . , ; ! ? ) ] >.
// emit treats a line that starts with such a rune like one that starts with
// whitespace, so no separating space is written before it.
435+
func punctNoSpaceBefore(r rune) bool {
436+
switch r {
437+
case '.', ',', ';', '!', '?', ')', ']', '>':
438+
return true
439+
default:
440+
return false
441+
}
442+
}
443+
444+
// punctNoSpaceAfter reports whether r is an opening bracket character that
// should not have a space inserted after it: one of ( [ <.
// emit records a line that ends with such a rune as if it ended with
// whitespace, so the next emitted text is not preceded by a separating space.
445+
func punctNoSpaceAfter(r rune) bool {
446+
switch r {
447+
case '(', '[', '<':
448+
return true
449+
default:
450+
return false
451+
}
452+
}
434453
func (ctx *textifyTraverseContext) emit(data string) error {
435454
if data == "" {
436455
return nil
@@ -441,14 +460,14 @@ func (ctx *textifyTraverseContext) emit(data string) error {
441460
)
442461
for _, line := range lines {
443462
runes := []rune(line)
444-
startsWithSpace := unicode.IsSpace(runes[0])
445-
if !startsWithSpace && !ctx.endsWithSpace && !strings.HasPrefix(data, ".") {
463+
startsWithSpace := unicode.IsSpace(runes[0]) || punctNoSpaceBefore(runes[0])
464+
if !startsWithSpace && !ctx.endsWithSpace {
446465
if err = ctx.buf.WriteByte(' '); err != nil {
447466
return err
448467
}
449468
ctx.lineLength++
450469
}
451-
ctx.endsWithSpace = unicode.IsSpace(runes[len(runes)-1])
470+
ctx.endsWithSpace = unicode.IsSpace(runes[len(runes)-1]) || punctNoSpaceAfter(runes[len(runes)-1])
452471
for _, c := range line {
453472
if _, err = ctx.buf.WriteString(string(c)); err != nil {
454473
return err

html2text_test.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -542,6 +542,14 @@ func TestCitationStyleLinks(t *testing.T) {
542542
`<a href="">Link</a>`,
543543
"Link",
544544
},
545+
{
546+
`<a href="http://example1.com/">Link1</a><a href="http://example2.com/">Link2</a>`,
547+
"Link1 [1] Link2 [2]\n\n[1] http://example1.com/\n[2] http://example2.com/",
548+
},
549+
{
550+
`<a href="http://example1.com/">Link1</a> (<a href="http://example2.com/">Link2</a>)`,
551+
"Link1 [1] (Link2 [2])\n\n[1] http://example1.com/\n[2] http://example2.com/",
552+
},
545553
{
546554
`<a href="http://example1.com/">Link1</a>? <a href="http://example2.com/">Link2</a>!`,
547555
"Link1 [1]? Link2 [2]!\n\n[1] http://example1.com/\n[2] http://example2.com/",

0 commit comments

Comments
 (0)