commit 8dd89c49c41c4c7d90aacbaac65eb18aea00892f
parent d32cbf1dfe66e6c99f3508ca75227bee54b8b9a6
Author: Jake Bauer <jbauer@paritybit.ca>
Date: Tue, 21 Feb 2023 16:59:00 -0500
Implement Chapter 4 HTML Tree
Diffstat:
M | browser.py | | | 188 | ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------------------- |
1 file changed, 139 insertions(+), 49 deletions(-)
diff --git a/browser.py b/browser.py
@@ -25,6 +25,7 @@ class Browser:
self.window.bind("<Down>", self.scrolldown)
self.window.bind("<Up>", self.scrollup)
self.display_list = []
+ self.nodes = None
def scrolldown(self, e):
@@ -47,12 +48,12 @@ class Browser:
print('{0:.5f}'.format(end - start), "- Request to", url)
start = timer()
- tokens = lex(body)
+ self.nodes = HTMLParser(body).parse()
end = timer()
print('{0:.5f}'.format(end - start), "- Lexed response body")
start = timer()
- self.display_list = Layout(tokens).display_list
+ self.display_list = Layout(self.nodes).display_list
end = timer()
print('{0:.5f}'.format(end - start), "- Computed page layout")
@@ -73,7 +74,7 @@ class Browser:
class Layout:
- def __init__(self, tokens):
+ def __init__(self, nodes):
self.display_list = []
self.line = []
self.cursor_x = HSTEP
@@ -82,8 +83,7 @@ class Layout:
self.slant = "roman"
self.size = 10
self.display = True
- for tok in tokens:
- self.token(tok)
+ self.recurse(nodes)
self.flush()
@@ -100,7 +100,9 @@ class Layout:
max_descent = max([metric["descent"] for metric in metrics])
self.cursor_y = baseline + 1.25 * max_descent
+
def text(self, tok):
+ if not self.display: return
font = get_font(self.size, self.weight, self.slant)
for word in tok.text.split():
w = font.measure(word)
@@ -110,46 +112,148 @@ class Layout:
self.cursor_x += w + font.measure(" ")
- def token(self, tok):
- if isinstance(tok, Text):
- if self.display:
- self.text(tok)
- elif tok.tag == "i" or tok.tag == "em":
+ def open_tag(self, tag):
+ if tag == "i" or tag == "em":
self.slant = "italic"
- elif tok.tag == "/i" or tok.tag == "/em":
- self.slant = "roman"
- elif tok.tag == "b" or tok.tag == "strong":
+ elif tag == "b" or tag == "strong":
self.weight = "bold"
- elif tok.tag == "/b" or tok.tag == "/strong":
- self.weight = "normal"
- elif tok.tag == "small":
+ elif tag == "small":
self.size -= 2
- elif tok.tag == "/small":
- self.size += 2
- elif tok.tag == "big":
+ elif tag == "big":
self.size += 4
- elif tok.tag == "/big":
- self.size -= 4
- elif tok.tag == "style":
+ elif tag == "style":
self.display = False
- elif tok.tag == "/style":
- self.display = True
- elif tok.tag == "br":
+ elif tag == "br":
self.flush()
- elif tok.tag == "/p":
+
+
+ def close_tag(self, tag):
+ if tag == "i" or tag == "em":
+ self.slant = "roman"
+ elif tag == "b" or tag == "strong":
+ self.weight = "normal"
+ elif tag == "small":
+ self.size += 2
+ elif tag == "big":
+ self.size -= 4
+ elif tag == "style":
+ self.display = True
+ elif tag == "p":
self.flush()
self.cursor_y += VSTEP
+ def recurse(self, tree):
+     """Lay out `tree` and everything beneath it, in document order.
+
+     Text nodes are handed to `self.text`.  For any other node,
+     `self.open_tag` is called with the node's tag before its children
+     are visited and `self.close_tag` with the same tag afterwards.
+
+     The walk uses an explicit stack rather than Python recursion: the
+     parser only closes an element when it sees a closing tag, so a page
+     with many unclosed tags yields a very deep tree that would otherwise
+     exceed the interpreter's recursion limit.
+     """
+     # Each entry is (node, closing); closing=True marks the point at which
+     # all of the node's children have already been laid out.
+     pending = [(tree, False)]
+     while pending:
+         node, closing = pending.pop()
+         if closing:
+             self.close_tag(node.tag)
+         elif isinstance(node, Text):
+             self.text(node)
+         else:
+             self.open_tag(node.tag)
+             pending.append((node, True))
+             # Pushed in reverse so the first child is popped first.
+             for child in reversed(node.children):
+                 pending.append((child, False))
+
+
class Text:
- def __init__(self, text):
+ def __init__(self, text, parent):
self.text = text
+ self.children = []
+ self.parent = parent
+
+
+ def __repr__(self):
+ return repr(self.text)
-class Tag:
- def __init__(self, tag):
+class Element:
+ def __init__(self, tag, attributes, parent):
self.tag = tag
+ self.attributes = attributes
+ self.children = []
+ self.parent = parent
+
+
+ def __repr__(self):
+ return "<" + self.tag + ">"
+
+
+ class HTMLParser:
+     """Turn an HTML source string into a tree of Element and Text nodes.
+
+     Elements that have been opened but not yet closed live on the
+     `unfinished` stack; a node is appended to its parent's `children` once
+     it is complete.  Closing tags are not matched by name: any closing tag
+     completes the innermost open element.
+     """
+
+     def __init__(self, body):
+         """Store `body` (the HTML text) and start with no open elements."""
+         self.body = body
+         # Stack of open elements; the last entry is the innermost one.
+         self.unfinished = []
+         # Tags that are attached to their parent immediately instead of
+         # being pushed onto the stack (HTML's void elements).
+         self.SELF_CLOSING_TAGS = [
+             "area", "base", "br", "col", "embed", "hr", "img", "input",
+             "link", "meta", "param", "source", "track", "wbr",
+         ]
+
+     def _current_parent(self):
+         """Return the innermost open element, opening a root if none exists.
+
+         Text or a self-closing tag can show up before any element has been
+         opened (for example a body that is plain text).  Instead of failing
+         on an empty stack, open the same implicit `html` root that `finish`
+         falls back to.
+         """
+         if not self.unfinished:
+             self.unfinished.append(Element("html", {}, None))
+         return self.unfinished[-1]
+
+     def add_text(self, text):
+         """Attach `text` as a Text child of the innermost open element.
+
+         Runs consisting only of whitespace are dropped.
+         """
+         if text.isspace():
+             return
+         parent = self._current_parent()
+         parent.children.append(Text(text, parent))
+
+     def add_tag(self, tag):
+         """Process the text found between a `<` and the following `>`.
+
+         Empty tags and tags starting with `!` are ignored.  A tag starting
+         with `/` completes the innermost open element, a tag listed in
+         `SELF_CLOSING_TAGS` is attached right away, and anything else is
+         pushed onto the stack as a newly opened element.
+         """
+         tag, attributes = self.get_attributes(tag)
+         if not tag or tag.startswith("!"):
+             return
+         if tag.startswith("/"):
+             # The last remaining element is the root and is only completed
+             # by finish(); with nothing open at all the closing tag is
+             # stray and there is nothing to pop.
+             if len(self.unfinished) <= 1:
+                 return
+             node = self.unfinished.pop()
+             self.unfinished[-1].children.append(node)
+         elif tag in self.SELF_CLOSING_TAGS:
+             parent = self._current_parent()
+             parent.children.append(Element(tag, attributes, parent))
+         else:
+             parent = self.unfinished[-1] if self.unfinished else None
+             self.unfinished.append(Element(tag, attributes, parent))
+
+     def get_attributes(self, text):
+         """Split tag text into a lowercased tag name and an attribute dict.
+
+         `text` is split on whitespace, so an attribute value that itself
+         contains whitespace is not kept together.  Attribute names are
+         lowercased; an attribute without `=` maps to the empty string.
+         Returns `("", {})` when `text` is empty or only whitespace.
+         """
+         parts = text.split()
+         if not parts:
+             return "", {}
+         tag = parts[0].lower()
+         attributes = {}
+         for attrpair in parts[1:]:
+             if "=" in attrpair:
+                 key, value = attrpair.split("=", 1)
+                 # Strip exactly one pair of matching quotes.  This also
+                 # handles the empty value "" and leaves a value with an
+                 # unmatched opening quote intact instead of trimming a
+                 # real character off its end.
+                 quoted = (
+                     len(value) >= 2
+                     and value[0] in ("'", "\"")
+                     and value[-1] == value[0]
+                 )
+                 if quoted:
+                     value = value[1:-1]
+                 attributes[key.lower()] = value
+             else:
+                 attributes[attrpair.lower()] = ""
+         return tag, attributes
+
+     def finish(self):
+         """Complete every element still open and return the root node.
+
+         If nothing was ever opened, an empty `html` element is returned.
+         """
+         self._current_parent()  # guarantees there is a root to return
+         while len(self.unfinished) > 1:
+             node = self.unfinished.pop()
+             self.unfinished[-1].children.append(node)
+         return self.unfinished.pop()
+
+     def parse(self):
+         """Scan `self.body` character by character and return the tree root.
+
+         `<` ends the current run of text and starts a tag; `>` ends the
+         tag.  Text left over after the last tag is added as well, while an
+         unterminated tag at the end of the input is discarded.
+         """
+         buffer = []
+         in_tag = False
+         for c in self.body:
+             if c == "<":
+                 in_tag = True
+                 if buffer:
+                     self.add_text("".join(buffer))
+                 buffer.clear()
+             elif c == ">":
+                 in_tag = False
+                 self.add_tag("".join(buffer))
+                 buffer.clear()
+             else:
+                 buffer.append(c)
+         if not in_tag and buffer:
+             self.add_text("".join(buffer))
+         return self.finish()
def request(url):
@@ -202,26 +306,6 @@ def request(url):
return headers, body
-def lex(body):
- out = []
- text = ""
- in_tag = False
- for c in body:
- if c == "<":
- in_tag = True
- if text: out.append(Text(text))
- text = ""
- elif c == ">":
- in_tag = False
- out.append(Tag(text))
- text = ""
- else:
- text += c
- if not in_tag and text:
- out.append(Text(text))
- return out
-
-
def get_font(size, weight, slant):
key = (size, weight, slant)
if key not in FONTS:
@@ -230,6 +314,12 @@ def get_font(size, weight, slant):
return FONTS[key]
+ def print_tree(node, indent=0):
+     """Print `node` and its descendants, one per line, as an outline.
+
+     Every level of nesting is indented two columns more than its parent.
+     The padding and the node are passed to print() as separate arguments,
+     so print() puts its usual single space between them.
+     """
+     padding = " " * indent
+     print(padding, node)
+     child_indent = indent + 2
+     for descendant in node.children:
+         print_tree(descendant, child_indent)
+
+
if __name__ == "__main__":
EXEC_TIME = timer()
import sys