Skip to content

Commit c875978

Browse files
committed
Support citation style links
1 parent 3577fbd commit c875978

2 files changed

Lines changed: 100 additions & 1 deletion

File tree

html2text.go

Lines changed: 48 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ package html2text
22

33
import (
44
"bytes"
5+
"fmt"
56
"io"
67
"regexp"
78
"strings"
@@ -18,6 +19,7 @@ type Options struct {
1819
PrettyTables bool // Turns on pretty ASCII rendering for table elements.
1920
PrettyTablesOptions *PrettyTablesOptions // Configures pretty ASCII rendering for table elements.
2021
OmitLinks bool // Turns on omitting links
22+
CitationStyleLinks bool // Uses citation style links like [1]
2123
}
2224

2325
// PrettyTablesOptions overrides tablewriter behaviors
@@ -72,11 +74,16 @@ func FromHTMLNode(doc *html.Node, o ...Options) (string, error) {
7274
ctx := textifyTraverseContext{
7375
buf: bytes.Buffer{},
7476
options: options,
77+
citationMap: map[string]int{},
7578
}
7679
if err := ctx.traverse(doc); err != nil {
7780
return "", err
7881
}
7982

83+
if ctx.options.CitationStyleLinks && ctx.citationCount > 0 {
84+
ctx.emitCitations()
85+
}
86+
8087
text := strings.TrimSpace(newlineRe.ReplaceAllString(
8188
strings.Replace(ctx.buf.String(), "\n ", "\n", -1), "\n\n"),
8289
)
@@ -124,6 +131,8 @@ type textifyTraverseContext struct {
124131
blockquoteLevel int
125132
lineLength int
126133
isPre bool
134+
citationCount int
135+
citationMap map[string]int
127136
}
128137

129138
// tableTraverseContext holds table ASCII-form related context.
@@ -255,7 +264,11 @@ func (ctx *textifyTraverseContext) handleElement(node *html.Node) error {
255264
attrVal = ctx.normalizeHrefLink(attrVal)
256265
// Don't print link href if it matches link element content or if the link is empty.
257266
if !ctx.options.OmitLinks && attrVal != "" && linkText != attrVal {
258-
hrefLink = "( " + attrVal + " )"
267+
if ctx.options.CitationStyleLinks {
268+
hrefLink = ctx.addCitation(attrVal)
269+
} else {
270+
hrefLink = "( " + attrVal + " )"
271+
}
259272
}
260273
}
261274

@@ -503,6 +516,40 @@ func (ctx *textifyTraverseContext) normalizeHrefLink(link string) string {
503516
return link
504517
}
505518

519+
// formatCitation renders the citation index idx as a bracketed marker followed
// by a single trailing space, e.g. "[3] ". The same marker is used inline after
// link text and as the prefix of each entry in the citation list.
func formatCitation(idx int) string {
	marker := fmt.Sprintf("[%d]", idx)
	return marker + " "
}
522+
523+
func (ctx *textifyTraverseContext) addCitation(url string) string {
524+
idx, ok := ctx.citationMap[url]
525+
526+
if !ok {
527+
ctx.citationCount += 1
528+
idx = ctx.citationCount
529+
ctx.citationMap[url] = idx
530+
}
531+
532+
return formatCitation(idx)
533+
}
534+
535+
func (ctx *textifyTraverseContext) emitCitations() {
536+
// this method writes to the buffer directly instead of using `emit`, b/c we do not want to split long links
537+
ctx.buf.WriteString("\n\n")
538+
539+
// citations are ordered by link --> bring them into the correct order first
540+
links := make([]string, ctx.citationCount)
541+
542+
for k, v := range ctx.citationMap {
543+
links[v-1] = k // arrays are 0-based, our citations are 1-based
544+
}
545+
546+
for i, link := range links {
547+
ctx.buf.WriteString(formatCitation(i + 1))
548+
ctx.buf.WriteString(link)
549+
ctx.buf.WriteByte('\n')
550+
}
551+
}
552+
506553
// renderEachChild visits each direct child of a node and collects the sequence of
507554
// textual representations separated by a single newline.
508555
func (ctx *textifyTraverseContext) renderEachChild(node *html.Node) (string, error) {

html2text_test.go

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -521,6 +521,58 @@ func TestOmitLinks(t *testing.T) {
521521
}
522522
}
523523

524+
func TestCitationStyleLinks(t *testing.T) {
525+
testCases := []struct {
526+
input string
527+
output string
528+
}{
529+
{
530+
`<a></a>`,
531+
``,
532+
},
533+
{
534+
`<a href=""></a>`,
535+
``,
536+
},
537+
{
538+
`<a href="http://example.com/"></a>`,
539+
"[1] \n\n[1] http://example.com/",
540+
},
541+
{
542+
`<a href="">Link</a>`,
543+
"Link",
544+
},
545+
{
546+
`<a href="http://example1.com/">Link1</a><a href="http://example2.com/">Link2</a>`,
547+
"Link1 [1] Link2 [2] \n\n[1] http://example1.com/\n[2] http://example2.com/",
548+
},
549+
{
550+
`<a href="http://example1.com/">Link1</a><a href="http://example1.com/">Link1 again</a>`,
551+
"Link1 [1] Link1 again [1] \n\n[1] http://example1.com/",
552+
},
553+
{
554+
`<a href="http://example.com/"><span class="a">Link</span></a>`,
555+
"Link [1] \n\n[1] http://example.com/",
556+
},
557+
{
558+
"<a href='http://example.com/'>\n\t<span class='a'>Link</span>\n\t</a>",
559+
"Link [1] \n\n[1] http://example.com/",
560+
},
561+
{
562+
`<a href="http://example.com/"><img src="http://example.ru/hello.jpg" alt="Example"></a>`,
563+
"Example [1] \n\n[1] http://example.com/",
564+
},
565+
}
566+
567+
for _, testCase := range testCases {
568+
if msg, err := wantString(testCase.input, testCase.output, Options{CitationStyleLinks: true}); err != nil {
569+
t.Error(err)
570+
} else if len(msg) > 0 {
571+
t.Log(msg)
572+
}
573+
}
574+
}
575+
524576
func TestImageAltTags(t *testing.T) {
525577
testCases := []struct {
526578
input string

0 commit comments

Comments
 (0)