@@ -2,6 +2,7 @@ package html2text
22
33import (
44 "bytes"
5+ "fmt"
56 "io"
67 "regexp"
78 "strings"
@@ -18,6 +19,7 @@ type Options struct {
1819 PrettyTables bool // Turns on pretty ASCII rendering for table elements.
1920 PrettyTablesOptions * PrettyTablesOptions // Configures pretty ASCII rendering for table elements.
2021 OmitLinks bool // Turns on omitting links
22+ CitationStyleLinks bool // Uses citation style links like [1]
2123}
2224
2325// PrettyTablesOptions overrides tablewriter behaviors
@@ -72,11 +74,16 @@ func FromHTMLNode(doc *html.Node, o ...Options) (string, error) {
7274 ctx := textifyTraverseContext {
7375 buf : bytes.Buffer {},
7476 options : options ,
77+ citationMap : map [string ]int {},
7578 }
7679 if err := ctx .traverse (doc ); err != nil {
7780 return "" , err
7881 }
7982
83+ if ctx .options .CitationStyleLinks && ctx .citationCount > 0 {
84+ ctx .emitCitations ()
85+ }
86+
8087 text := strings .TrimSpace (newlineRe .ReplaceAllString (
8188 strings .Replace (ctx .buf .String (), "\n " , "\n " , - 1 ), "\n \n " ),
8289 )
@@ -124,6 +131,8 @@ type textifyTraverseContext struct {
124131 blockquoteLevel int
125132 lineLength int
126133 isPre bool
134+ citationCount int
135+ citationMap map [string ]int
127136}
128137
129138// tableTraverseContext holds table ASCII-form related context.
@@ -255,7 +264,11 @@ func (ctx *textifyTraverseContext) handleElement(node *html.Node) error {
255264 attrVal = ctx .normalizeHrefLink (attrVal )
256265 // Don't print link href if it matches link element content or if the link is empty.
257266 if ! ctx .options .OmitLinks && attrVal != "" && linkText != attrVal {
258- hrefLink = "( " + attrVal + " )"
267+ if ctx .options .CitationStyleLinks {
268+ hrefLink = ctx .addCitation (attrVal )
269+ } else {
270+ hrefLink = "( " + attrVal + " )"
271+ }
259272 }
260273 }
261274
@@ -503,6 +516,40 @@ func (ctx *textifyTraverseContext) normalizeHrefLink(link string) string {
503516 return link
504517}
505518
// formatCitation renders the citation marker for the given index: the decimal
// index wrapped in square brackets, followed by a single space.
func formatCitation(idx int) string {
	return "[" + fmt.Sprint(idx) + "] "
}
522+
523+ func (ctx * textifyTraverseContext ) addCitation (url string ) string {
524+ idx , ok := ctx .citationMap [url ]
525+
526+ if ! ok {
527+ ctx .citationCount += 1
528+ idx = ctx .citationCount
529+ ctx .citationMap [url ] = idx
530+ }
531+
532+ return formatCitation (idx )
533+ }
534+
535+ func (ctx * textifyTraverseContext ) emitCitations () {
536+ // this method writes to the buffer directly instead of using `emit`, b/c we do not want to split long links
537+ ctx .buf .WriteString ("\n \n " )
538+
539+ // citations are ordered by link --> bring them into the correct order first
540+ links := make ([]string , ctx .citationCount )
541+
542+ for k , v := range ctx .citationMap {
543+ links [v - 1 ] = k // arrays are 0-based, our citations are 1-based
544+ }
545+
546+ for i , link := range links {
547+ ctx .buf .WriteString (formatCitation (i + 1 ))
548+ ctx .buf .WriteString (link )
549+ ctx .buf .WriteByte ('\n' )
550+ }
551+ }
552+
506553// renderEachChild visits each direct child of a node and collects the sequence of
507554// textual representations separated by a single newline.
508555func (ctx * textifyTraverseContext ) renderEachChild (node * html.Node ) (string , error ) {
0 commit comments