Skip to content

Commit

Permalink
add parser back
Browse files Browse the repository at this point in the history
  • Loading branch information
joshday committed May 10, 2023
1 parent f53f693 commit 9a81a2b
Show file tree
Hide file tree
Showing 2 changed files with 145 additions and 0 deletions.
2 changes: 2 additions & 0 deletions src/Cobweb.jl
Original file line number Diff line number Diff line change
Expand Up @@ -274,4 +274,6 @@ function iframe(x; height=250, width=750)
)
end

include("parser.jl")

end #module
143 changes: 143 additions & 0 deletions src/parser.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
#-----------------------------------------------------------------------------# HTMLTokenIterator
# Kinds of token the HTML tokenizer distinguishes. Values are assigned in
# declaration order starting at 0.
@enum TokenType begin
    UNKNOWNTOKEN            # placeholder; `next_token` never produces it
    DOCTYPETOKEN            # <!DOCTYPE ...>
    COMMENTTOKEN            # <!-- ... -->
    ELEMENTTOKEN            # <NAME attributes... >
    ELEMENTSELFCLOSEDTOKEN  # <NAME attributes... />
    ELEMENTCLOSETOKEN       # </NAME>
    TEXTTOKEN               # text between a '>' and a '<'
end


"""
    HTMLTokenIterator(io::IO)

Stateful tokenizer over `io`.

# Fields
- `io`: the stream tokens are read from.
- `start_pos`: the value of `position(io)` when the iterator was constructed;
  a fresh iteration seeks back to it.
- `buffer`: scratch buffer that accumulates the characters of the token
  currently being read.
"""
mutable struct HTMLTokenIterator{IOT <: IO}
    io::IOT
    start_pos::Int64
    buffer::IOBuffer
end

function HTMLTokenIterator(io::IO)
    # Go through the explicitly parameterised constructor: unlike the default
    # outer constructor it converts its arguments, so an IO type whose
    # `position` returns some other integer type still works.
    return HTMLTokenIterator{typeof(io)}(io, position(io), IOBuffer())
end

# Read one `Char` from the underlying stream, append it to the scratch buffer,
# and return it.
function readchar(o::HTMLTokenIterator)
    ch = Base.read(o.io, Char)
    write(o.buffer, ch)
    return ch
end
# Rewind the underlying stream to the position recorded at construction.
# The scratch buffer is left untouched.
function reset(o::HTMLTokenIterator)
    return seek(o.io, o.start_pos)
end

# Consume characters (via `readchar`, so they land in the scratch buffer) up to
# and including the first occurrence of `char`. At least one character is
# always read.
function readuntil(o::HTMLTokenIterator, char::Char)
    while readchar(o) != char
        # keep consuming; `readchar` has already buffered the character
    end
    return nothing
end
# Consume characters (via `readchar`, so they land in the scratch buffer) up to
# and including the first occurrence of `pattern`. An empty pattern consumes
# nothing.
function readuntil(o::HTMLTokenIterator, pattern::String)
    target = collect(pattern)
    n = length(target)
    # Sliding window over the most recently read characters. It starts empty
    # (rather than as uninitialised memory) so that only characters actually
    # read from the stream are ever compared against the pattern.
    window = Char[]
    sizehint!(window, n)
    while window != target
        length(window) == n && popfirst!(window)
        push!(window, readchar(o))
    end
    return nothing
end

# Iteration protocol. A state of 0 (the default) marks a fresh iteration and
# rewinds the stream to `start_pos`; any other state continues from the
# stream's current position. Each item is the pair returned by `next_token`.
function Base.iterate(o::HTMLTokenIterator, state=0)
    if state == 0
        seek(o.io, o.start_pos)
    end
    token = next_token(o)
    token === nothing && return nothing
    return (token, state + 1)
end

# Read the next token from `o.io`.
#
# Leading whitespace is skipped. Returns `nothing` at end of input, otherwise a
# `TokenType => String` pair. For markup tokens the string is the raw source
# text of the token (delimiters included); for text tokens it is the text up to
# (not including) the next '<', passed through `unescape`.
function next_token(o::HTMLTokenIterator)
    io = o.io
    buffer = o.buffer
    skipchars(isspace, io)
    eof(io) && return nothing

    # Peek at up to three characters to classify the token. The position is
    # recorded and restored with `seek` because characters may be wider than
    # one byte, and fewer than three characters may remain in the stream.
    start = position(io)
    for _ in 1:3
        eof(io) && break
        readchar(o)
    end
    prefix = String(take!(buffer))
    seek(io, start)

    if startswith(prefix, "<!D") || startswith(prefix, "<!d")
        readuntil(o, '>')
        return DOCTYPETOKEN => String(take!(buffer))
    elseif startswith(prefix, "<!-")
        readuntil(o, "-->")
        return COMMENTTOKEN => String(take!(buffer))
    elseif startswith(prefix, "</")
        readuntil(o, '>')
        return ELEMENTCLOSETOKEN => String(take!(buffer))
    elseif startswith(prefix, "<")
        readuntil(o, '>')
        s = String(take!(buffer))
        kind = endswith(s, "/>") ? ELEMENTSELFCLOSEDTOKEN : ELEMENTTOKEN
        return kind => s
    else
        # Text runs until the next '<' or the end of input, whichever comes
        # first; the '<' itself is left in the stream for the next token.
        while !eof(io)
            c = Base.read(io, Char)
            if c == '<'
                skip(io, -1)  # '<' is a single byte
                break
            end
            write(buffer, c)
        end
        return TEXTTOKEN => unescape(String(take!(buffer)))
    end
end


# Every item produced by the iterator is a `TokenType => String` pair.
function Base.eltype(::Type{<:HTMLTokenIterator})
    return Pair{TokenType, String}
end

# The number of tokens is not known without reading the whole stream.
function Base.IteratorSize(::Type{<:HTMLTokenIterator})
    return Base.SizeUnknown()
end

# Cheap "is the iterator exhausted?" check.
#
# Returns `true` only when that is certain, and `missing` otherwise:
# - with no state, or state 0, the next `iterate` rewinds the stream to
#   `start_pos`, so the stream's current position says nothing;
# - when the stream is not at EOF, only whitespace may remain, in which case
#   the next `iterate` still returns `nothing`.
function Base.isdone(itr::HTMLTokenIterator, state...)
    (isempty(state) || first(state) == 0) && return missing
    return eof(itr.io) ? true : missing
end

#-----------------------------------------------------------------------------# read
# Open the file at `path` for reading, parse its whole contents, and return the
# parsed top-level nodes. The file is closed when parsing finishes.
function read(path::AbstractString)
    return open(path, "r") do io
        read(HTMLTokenIterator(io))
    end
end

# Parse every token from `o` and return the vector of top-level items.
#
# Doctype, comment and element tokens each become one item; an opening element
# additionally has its children read (recursively) up to its closing tag.
# Any other token kind at the top level raises an error.
function read(o::HTMLTokenIterator)
    nodes = []
    for (kind, text) in o
        if kind == ELEMENTTOKEN || kind == ELEMENTSELFCLOSEDTOKEN
            element = make_node(text)
            # Only a non-self-closed element can have children.
            kind == ELEMENTTOKEN && add_children!(element, o, "</$(tag(element))>")
            push!(nodes, element)
        elseif kind == COMMENTTOKEN
            push!(nodes, make_comment(text))
        elseif kind == DOCTYPETOKEN
            push!(nodes, Doctype())
        else
            error("should be unreachable: T=$kind, s=$text")
        end
    end
    return nodes
end

# Read tokens from `o` and append the resulting children to `node` until the
# closing tag `until` (e.g. "</div>") is seen or the input ends.
#
# Comments, self-closed elements, nested elements (recursively) and text become
# children. Doctype tokens and closing tags that do not equal `until` are
# ignored.
function add_children!(node::Node, o::HTMLTokenIterator, until::String)
    kids = children(node)
    while true
        # A non-zero state keeps `iterate` from rewinding the stream.
        next = iterate(o, -1)
        isnothing(next) && break
        T, s = next[1]
        if T == ELEMENTCLOSETOKEN
            # Only a real closing-tag token may end this element. A text token
            # whose content happens to equal `until` must not terminate it.
            s == until && break
        elseif T == COMMENTTOKEN
            push!(kids, Comment(replace(s, "<!-- " => "", " -->" => "")))
        elseif T == ELEMENTSELFCLOSEDTOKEN
            push!(kids, make_node(s))
        elseif T == ELEMENTTOKEN
            child = make_node(s)
            add_children!(child, o, "</$(tag(child))>")
            push!(kids, child)
        elseif T == TEXTTOKEN
            push!(kids, s)
        end
    end
    return nothing
end

# Build a childless `Node` from the raw text of an opening (or self-closed)
# tag: the tag name and attributes are extracted from `s`.
function make_node(s)
    name = get_tag(s)
    attrs = get_attributes(s)
    return Node(name, attrs, [])
end

# Extract the tag name from raw tag text such as `<div class="a">` or `</div>`:
# the first ASCII letter and everything after it up to whitespace, '>' or '/'.
# Throws `ArgumentError` when `x` contains no such name.
function get_tag(x)
    # `A-Z`, not `A-z`: the latter also admits `[`, `\`, `]`, `^`, `_` and a backtick.
    rng = findfirst(r"[a-zA-Z][^\s>/]*", x)
    isnothing(rng) && throw(ArgumentError("no tag name found in: $x"))
    return x[rng]
end

# Extract the double-quoted attributes from raw tag text into a
# `Dict{String,String}`.
#
# An attribute name starts with an ASCII letter and may continue with letters,
# digits, '.', '_', ':' and '-'. Attributes without a double-quoted value
# (bare flags, unquoted or single-quoted values) are skipped.
function get_attributes(x)
    out = Dict{String,String}()
    # Match each name together with its own value, so a skipped attribute can
    # never shift later values onto the wrong name. The '-' sits last in the
    # character class so that it is a literal hyphen rather than a range.
    for m in eachmatch(r"([a-zA-Z][a-zA-Z0-9._:-]*)=\"([^\"]*)\"", x)
        out[String(m[1])] = String(m[2])
    end
    return out
end

2 comments on commit 9a81a2b

@joshday
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JuliaRegistrator
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Registration pull request created: JuliaRegistries/General/83298

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the github interface, or via:

git tag -a v0.3.3 -m "<description of version>" 9a81a2b8b31f0c04f516addcefbad64fd276ec5f
git push origin v0.3.3

Please sign in to comment.