Skip to content

Commit

Permalink
add functional options for altering behavior
Browse files Browse the repository at this point in the history
  • Loading branch information
Mario Hros committed Nov 7, 2022
1 parent 35493e6 commit 931105c
Show file tree
Hide file tree
Showing 2 changed files with 104 additions and 22 deletions.
90 changes: 77 additions & 13 deletions html2text.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,18 +7,50 @@ import (
"strings"
)

// Line break constants
//
// Deprecated: Please use HTML2TextWithOptions(text, WithUnixLineBreaks())
const (
// WIN_LBR is the Windows-style line break
WIN_LBR = "\r\n"
// UNIX_LBR is the Unix-style line break
UNIX_LBR = "\n"
)

// NOTE(review): this view appears to show both sides of a diff; lbr/legacyLBR and the two
// badTagnamesRE declarations look like before/after pairs — confirm which lines are current.
var lbr = WIN_LBR
// badTagnamesRE matches a tag string starting with head, script or style
// that is followed by end of string or whitespace
var badTagnamesRE = regexp.MustCompile(`^(head|script|style)($|\s+)`)
// legacyLBR is the line break chosen through SetUnixLbr; HTML2Text compares it to UNIX_LBR
var legacyLBR = WIN_LBR
// badTagnamesRE (variant with "a" in the alternation) additionally matches the link tag name
var badTagnamesRE = regexp.MustCompile(`^(head|script|style|a)($|\s+)`)
// linkTagRE finds an href attribute after an "a" and captures its value:
// group 2 for a single-quoted value, group 3 for a double-quoted one
var linkTagRE = regexp.MustCompile(`a.*href=('([^']*?)'|"([^"]*?)")`)
// badLinkHrefRE matches hrefs containing "javascript:"; such links are not output
var badLinkHrefRE = regexp.MustCompile(`javascript:`)
// headersRE matches a tag string starting with an optional slash followed by h1 to h6
var headersRE = regexp.MustCompile(`^(\/)?h[1-6]`)
// numericEntityRE matches, case-insensitively, "#" followed by an optional "x" and hex digits,
// capturing everything after the "#"
var numericEntityRE = regexp.MustCompile(`(?i)^#(x?[a-f0-9]+)$`)

// options collects the settings that functional Options can change for a single conversion
type options struct {
// lbr is the line break sequence written to the output
lbr string
// linksInnerText, when true, keeps the inner text of link tags and appends the href
// in angle brackets; when false the href is written in place of the link
linksInnerText bool
}

// newOptions builds the option set every conversion starts from.
// The line break defaults to WIN_LBR; linksInnerText keeps its zero value (false).
func newOptions() *options {
	defaults := new(options)
	defaults.lbr = WIN_LBR
	return defaults
}

// Option is a functional option: a function that modifies the conversion options.
// HTML2TextWithOptions applies the given Options, in order, on top of the defaults.
type Option func(*options)

// WithUnixLineBreaks returns an Option that makes the converter emit unix line breaks
// ("\n") in place of the default "\r\n"
func WithUnixLineBreaks() Option {
	var apply Option = func(cfg *options) {
		cfg.lbr = UNIX_LBR
	}
	return apply
}

// WithLinksInnerText returns an Option that makes the converter keep the inner text of link tags
// and append the href URL in angle brackets after that text
// Example: click news <http://bit.ly/2n4wXRs>
func WithLinksInnerText() Option {
	var apply Option = func(cfg *options) {
		cfg.linksInnerText = true
	}
	return apply
}

func parseHTMLEntity(entName string) (string, bool) {
if r, ok := entity[entName]; ok {
return string(r), true
Expand Down Expand Up @@ -47,11 +79,12 @@ func parseHTMLEntity(entName string) (string, bool) {

// SetUnixLbr with argument true sets Unix-style line-breaks in output ("\n")
// with argument false sets Windows-style line-breaks in output ("\r\n", the default)
// The choice is kept in package-level state that HTML2Text reads on each call.
//
// Deprecated: Please use HTML2TextWithOptions(text, WithUnixLineBreaks())
func SetUnixLbr(b bool) {
if b {
lbr = UNIX_LBR
legacyLBR = UNIX_LBR
} else {
lbr = WIN_LBR
legacyLBR = WIN_LBR
}
}

Expand Down Expand Up @@ -113,12 +146,26 @@ func writeSpace(outBuf *bytes.Buffer) {

// HTML2Text converts html into a text form
func HTML2Text(html string) string {
var opts []Option
if legacyLBR == UNIX_LBR {
opts = append(opts, WithUnixLineBreaks())
}
return HTML2TextWithOptions(html, opts...)
}

// HTML2TextWithOptions converts html into a text form with additional options
func HTML2TextWithOptions(html string, reqOpts ...Option) string {
opts := newOptions()
for _, opt := range reqOpts {
opt(opts)
}

inLen := len(html)
tagStart := 0
inEnt := false
badTagStackDepth := 0 // if == 1 it means we are inside <head>...</head>
shouldOutput := true
// maintain a stack of <a> tag href links and output it after the tag's inner text
// maintain a stack of <a> tag href links and output it after the tag's inner text (for opts.linksInnerText only)
hrefs := []string{}
// new line cannot be printed at the beginning or
// for <p> after a new line created by previous <p></p>
Expand Down Expand Up @@ -185,23 +232,23 @@ func HTML2Text(html string) string {
tagNameLowercase := strings.ToLower(tag)

if tagNameLowercase == "/ul" {
outBuf.WriteString(lbr)
outBuf.WriteString(opts.lbr)
} else if tagNameLowercase == "li" || tagNameLowercase == "li/" {
outBuf.WriteString(lbr)
outBuf.WriteString(opts.lbr)
} else if headersRE.MatchString(tagNameLowercase) {
if canPrintNewline {
outBuf.WriteString(lbr + lbr)
outBuf.WriteString(opts.lbr + opts.lbr)
}
canPrintNewline = false
} else if tagNameLowercase == "br" || tagNameLowercase == "br/" {
// new line
outBuf.WriteString(lbr)
outBuf.WriteString(opts.lbr)
} else if tagNameLowercase == "p" || tagNameLowercase == "/p" {
if canPrintNewline {
outBuf.WriteString(lbr + lbr)
outBuf.WriteString(opts.lbr + opts.lbr)
}
canPrintNewline = false
} else if tagNameLowercase == "/a" {
} else if opts.linksInnerText && tagNameLowercase == "/a" {
// end of link
// hrefs can be empty here, which happens when the link matched badLinkHrefRE
if len(hrefs) > 0 {
Expand All @@ -210,7 +257,7 @@ func HTML2Text(html string) string {
outBuf.WriteString(">")
hrefs = hrefs[1:]
}
} else if linkTagRE.MatchString(tagNameLowercase) {
} else if opts.linksInnerText && linkTagRE.MatchString(tagNameLowercase) {
// parse link href
// add special handling for a tags
m := linkTagRE.FindStringSubmatch(tag)
Expand All @@ -220,13 +267,30 @@ func HTML2Text(html string) string {
link = m[3]
}

if !badLinkHrefRE.MatchString(link) {
if opts.linksInnerText && !badLinkHrefRE.MatchString(link) {
hrefs = append(hrefs, link)
}
}
} else if badTagnamesRE.MatchString(tagNameLowercase) {
// unwanted block
badTagStackDepth++

// if link inner text preservation is not enabled
// and the current tag is a link tag, parse its href and output that
if !opts.linksInnerText {
// parse link href
m := linkTagRE.FindStringSubmatch(tag)
if len(m) == 4 {
link := m[2]
if len(link) == 0 {
link = m[3]
}

if !badLinkHrefRE.MatchString(link) {
outBuf.WriteString(HTMLEntitiesToText(link))
}
}
}
} else if len(tagNameLowercase) > 0 && tagNameLowercase[0] == '/' &&
badTagnamesRE.MatchString(tagNameLowercase[1:]) {
// end of unwanted block
Expand Down
36 changes: 27 additions & 9 deletions html2text_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,25 @@ func TestHTML2Text(t *testing.T) {
Convey("Links", func() {
So(HTML2Text(`<div></div>`), ShouldEqual, "")
So(HTML2Text(`<div>simple text</div>`), ShouldEqual, "simple text")
So(HTML2Text(`click <a href="test">here</a>`), ShouldEqual, "click here <test>")
So(HTML2Text(`click <a class="x" href="test">here</a>`), ShouldEqual, "click here <test>")
So(HTML2Text(`click <a href="ents/&apos;x&apos;">here</a>`), ShouldEqual, "click here <ents/'x'>")
So(HTML2Text(`click <a href="javascript:void(0)">here</a>`), ShouldEqual, "click here")
So(HTML2Text(`click <a href="test"><span>here</span> or here</a>`), ShouldEqual, "click here or here <test>")
So(HTML2Text(`click <a href="http://bit.ly/2n4wXRs">news</a>`), ShouldEqual, "click news <http://bit.ly/2n4wXRs>")
So(HTML2Text(`<a rel="mw:WikiLink" href="/wiki/yet#English" title="yet">yet</a>, <a rel="mw:WikiLink" href="/wiki/not_yet#English" title="not yet">not yet</a>`), ShouldEqual, "yet </wiki/yet#English>, not yet </wiki/not_yet#English>")
So(HTML2Text(`click <a href="one">here<a href="two"> or</a><span> here</span></a>`), ShouldEqual, "click here or <one> here <two>")

// the original behavior
So(HTML2Text(`click <a href="test">here</a>`), ShouldEqual, "click test")
So(HTML2Text(`click <a class="x" href="test">here</a>`), ShouldEqual, "click test")
So(HTML2Text(`click <a href="ents/&apos;x&apos;">here</a>`), ShouldEqual, "click ents/'x'")
So(HTML2Text(`click <a href="javascript:void(0)">here</a>`), ShouldEqual, "click ")
So(HTML2Text(`click <a href="test"><span>here</span> or here</a>`), ShouldEqual, "click test")
So(HTML2Text(`click <a href="http://bit.ly/2n4wXRs">news</a>`), ShouldEqual, "click http://bit.ly/2n4wXRs")
So(HTML2Text(`<a rel="mw:WikiLink" href="/wiki/yet#English" title="yet">yet</a>, <a rel="mw:WikiLink" href="/wiki/not_yet#English" title="not yet">not yet</a>`), ShouldEqual, "/wiki/yet#English, /wiki/not_yet#English")

// with inner text
So(HTML2TextWithOptions(`click <a href="test">here</a>`, WithLinksInnerText()), ShouldEqual, "click here <test>")
So(HTML2TextWithOptions(`click <a class="x" href="test">here</a>`, WithLinksInnerText()), ShouldEqual, "click here <test>")
So(HTML2TextWithOptions(`click <a href="ents/&apos;x&apos;">here</a>`, WithLinksInnerText()), ShouldEqual, "click here <ents/'x'>")
So(HTML2TextWithOptions(`click <a href="javascript:void(0)">here</a>`, WithLinksInnerText()), ShouldEqual, "click here")
So(HTML2TextWithOptions(`click <a href="test"><span>here</span> or here</a>`, WithLinksInnerText()), ShouldEqual, "click here or here <test>")
So(HTML2TextWithOptions(`click <a href="http://bit.ly/2n4wXRs">news</a>`, WithLinksInnerText()), ShouldEqual, "click news <http://bit.ly/2n4wXRs>")
So(HTML2TextWithOptions(`<a rel="mw:WikiLink" href="/wiki/yet#English" title="yet">yet</a>, <a rel="mw:WikiLink" href="/wiki/not_yet#English" title="not yet">not yet</a>`, WithLinksInnerText()), ShouldEqual, "yet </wiki/yet#English>, not yet </wiki/not_yet#English>")
So(HTML2TextWithOptions(`click <a href="one">here<a href="two"> or</a><span> here</span></a>`, WithLinksInnerText()), ShouldEqual, "click here or <one> here <two>")
})

Convey("Inlines", func() {
Expand Down Expand Up @@ -80,7 +91,7 @@ func TestHTML2Text(t *testing.T) {
ShouldEqual, "we are not interested in scripts")
})

Convey("Switching Unix and Windows line breaks", func() {
Convey("Switching Unix and Windows line breaks (original behavior)", func() {
SetUnixLbr(true)
So(HTML2Text(`two<br>line<br/>breaks`), ShouldEqual, "two\nline\nbreaks")
So(HTML2Text(`<p>two</p><p>paragraphs</p>`), ShouldEqual, "two\n\nparagraphs")
Expand All @@ -89,6 +100,13 @@ func TestHTML2Text(t *testing.T) {
So(HTML2Text(`<p>two</p><p>paragraphs</p>`), ShouldEqual, "two\r\n\r\nparagraphs")
})

Convey("Switching Unix and Windows line breaks (new options)", func() {
So(HTML2TextWithOptions(`two<br>line<br/>breaks`, WithUnixLineBreaks()), ShouldEqual, "two\nline\nbreaks")
So(HTML2TextWithOptions(`<p>two</p><p>paragraphs</p>`, WithUnixLineBreaks()), ShouldEqual, "two\n\nparagraphs")
So(HTML2TextWithOptions(`two<br>line<br/>breaks`), ShouldEqual, "two\r\nline\r\nbreaks")
So(HTML2TextWithOptions(`<p>two</p><p>paragraphs</p>`), ShouldEqual, "two\r\n\r\nparagraphs")
})

Convey("Custom HTML Tags", func() {
So(HTML2Text(`<aa>hello</aa>`), ShouldEqual, "hello")
So(HTML2Text(`<aa >hello</aa>`), ShouldEqual, "hello")
Expand Down

0 comments on commit 931105c

Please sign in to comment.