-
Notifications
You must be signed in to change notification settings - Fork 6
/
parser.go
61 lines (53 loc) · 1.92 KB
/
parser.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
package dom
import (
"bytes"
"io"
"io/ioutil"
"github.com/gogs/chardet"
"golang.org/x/net/html"
"golang.org/x/net/html/charset"
xunicode "golang.org/x/text/encoding/unicode"
"golang.org/x/text/runes"
"golang.org/x/text/transform"
"golang.org/x/text/unicode/norm"
)
// FastParse builds an html.Node tree from r by handing the reader straight to
// html.Parse. No charset detection or conversion is performed, so the input is
// treated as UTF-8. Use Parse instead when the encoding of the input is unknown.
func FastParse(r io.Reader) (*html.Node, error) {
	root, err := html.Parse(r)
	return root, err
}
// Parse builds an html.Node tree from r after converting the input to UTF-8.
// It is meant for pages that are not UTF-8 encoded, e.g. pages from Asian
// websites. The whole input is buffered in memory and run through a charset
// detector before parsing starts, which makes it noticeably more expensive
// than FastParse; prefer FastParse when the input is known to be valid UTF-8.
//
// An error is returned if reading r fails, if the charset detector reports an
// error, or if html.Parse fails.
func Parse(r io.Reader) (*html.Node, error) {
	// Buffer the complete input: the bytes are inspected by the detector
	// first and then replayed to the HTML parser.
	raw, err := ioutil.ReadAll(r)
	if err != nil {
		return nil, err
	}

	// Guess the charset of the page from its raw bytes.
	detected, err := chardet.NewHtmlDetector().DetectBest(raw)
	if err != nil {
		return nil, err
	}

	// Only the encoding itself is needed, so the second result of Lookup is
	// dropped. When Lookup yields no encoding for the detected charset name,
	// fall back to UTF-8.
	enc, _ := charset.Lookup(detected.Charset)
	if enc == nil {
		enc = xunicode.UTF8
	}

	// Decode the buffered bytes with the chosen encoding, normalize the
	// resulting text, and hand it to the HTML parser.
	decoded := transform.NewReader(bytes.NewReader(raw), enc.NewDecoder())
	return html.Parse(normalizeTextEncoding(decoded))
}
// normalizeTextEncoding wraps r in a reader that emits the text in NFC form
// with every soft hyphen (U+00AD) removed. The text is first decomposed to
// NFD, then stripped of soft hyphens, and finally recomposed to NFC.
// Soft hyphens are dropped because they are considered useless on the web.
// See: https://web.archive.org/web/19990117011731/http://www.hut.fi/~jkorpela/shy.html
func normalizeTextEncoding(r io.Reader) io.Reader {
	isSoftHyphen := runes.Predicate(func(c rune) bool {
		return c == '\u00AD'
	})

	// The steps run in the order listed.
	chain := transform.Chain(
		norm.NFD,
		runes.Remove(isSoftHyphen),
		norm.NFC,
	)
	return transform.NewReader(r, chain)
}