browser.py

Working through the exercises at browser.engineering
git clone https://git.sr.ht/~jbauer/browser.py
Log | Files | Refs | README | LICENSE

commit 8dd89c49c41c4c7d90aacbaac65eb18aea00892f
parent d32cbf1dfe66e6c99f3508ca75227bee54b8b9a6
Author: Jake Bauer <jbauer@paritybit.ca>
Date:   Tue, 21 Feb 2023 16:59:00 -0500

Implement Chapter 4 HTML Tree

Diffstat:
M browser.py | 188 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------------------
1 file changed, 139 insertions(+), 49 deletions(-)

diff --git a/browser.py b/browser.py
@@ -25,6 +25,7 @@ class Browser:
         self.window.bind("<Down>", self.scrolldown)
         self.window.bind("<Up>", self.scrollup)
         self.display_list = []
+        self.nodes = None
 
 
     def scrolldown(self, e):
@@ -47,12 +48,12 @@ class Browser:
         print('{0:.5f}'.format(end - start), "- Request to", url)
 
         start = timer()
-        tokens = lex(body)
+        self.nodes = HTMLParser(body).parse()
         end = timer()
         print('{0:.5f}'.format(end - start), "- Lexed response body")
 
         start = timer()
-        self.display_list = Layout(tokens).display_list
+        self.display_list = Layout(self.nodes).display_list
         end = timer()
         print('{0:.5f}'.format(end - start), "- Computed page layout")
 
@@ -73,7 +74,7 @@ class Browser:
 
 
 class Layout:
-    def __init__(self, tokens):
+    def __init__(self, nodes):
         self.display_list = []
         self.line = []
         self.cursor_x = HSTEP
@@ -82,8 +83,7 @@ class Layout:
         self.slant = "roman"
         self.size = 10
         self.display = True
-        for tok in tokens:
-            self.token(tok)
+        self.recurse(nodes)
         self.flush()
 
 
@@ -100,7 +100,9 @@ class Layout:
         max_descent = max([metric["descent"] for metric in metrics])
         self.cursor_y = baseline + 1.25 * max_descent
 
+
     def text(self, tok):
+        if not self.display: return
         font = get_font(self.size, self.weight, self.slant)
         for word in tok.text.split():
             w = font.measure(word)
@@ -110,46 +112,148 @@ class Layout:
             self.cursor_x += w + font.measure(" ")
 
 
-    def token(self, tok):
-        if isinstance(tok, Text):
-            if self.display:
-                self.text(tok)
-        elif tok.tag == "i" or tok.tag == "em":
+    def open_tag(self, tag):
+        if tag == "i" or tag == "em":
             self.slant = "italic"
-        elif tok.tag == "/i" or tok.tag == "/em":
-            self.slant = "roman"
-        elif tok.tag == "b" or tok.tag == "strong":
+        elif tag == "b" or tag == "strong":
             self.weight = "bold"
-        elif tok.tag == "/b" or tok.tag == "/strong":
-            self.weight = "normal"
-        elif tok.tag == "small":
+        elif tag == "small":
             self.size -= 2
-        elif tok.tag == "/small":
-            self.size += 2
-        elif tok.tag == "big":
+        elif tag == "big":
             self.size += 4
-        elif tok.tag == "/big":
-            self.size -= 4
-        elif tok.tag == "style":
+        elif tag == "style":
             self.display = False
-        elif tok.tag == "/style":
-            self.display = True
-        elif tok.tag == "br":
+        elif tag == "br":
             self.flush()
-        elif tok.tag == "/p":
+
+
+    def close_tag(self, tag):
+        if tag == "i" or tag == "em":
+            self.slant = "roman"
+        elif tag == "b" or tag == "strong":
+            self.weight = "normal"
+        elif tag == "small":
+            self.size += 2
+        elif tag == "big":
+            self.size -= 4
+        elif tag == "style":
+            self.display = True
+        elif tag == "p":
             self.flush()
             self.cursor_y += VSTEP
 
 
 
+    def recurse(self, tree):
+        if isinstance(tree, Text):
+            self.text(tree)
+        else:
+            self.open_tag(tree.tag)
+            for child in tree.children:
+                self.recurse(child)
+            self.close_tag(tree.tag)
+
+
 class Text:
-    def __init__(self, text):
+    def __init__(self, text, parent):
         self.text = text
+        self.children = []
+        self.parent = parent
+
+
+    def __repr__(self):
+        return repr(self.text)
 
 
-class Tag:
-    def __init__(self, tag):
+class Element:
+    def __init__(self, tag, attributes, parent):
         self.tag = tag
+        self.attributes = attributes
+        self.children = []
+        self.parent = parent
+
+
+    def __repr__(self):
+        return "<" + self.tag + ">"
+
+
+class HTMLParser:
+    def __init__(self, body):
+        self.body = body
+        self.unfinished = []
+        self.SELF_CLOSING_TAGS = [
+            "area", "base", "br", "col", "embed", "hr", "img", "input",
+            "link", "meta", "param", "source", "track", "wbr",
+        ]
+
+
+    def add_text(self, text):
+        if text.isspace(): return
+        parent = self.unfinished[-1]
+        node = Text(text, parent)
+        parent.children.append(node)
+
+
+    def add_tag(self, tag):
+        tag, attributes = self.get_attributes(tag)
+        if tag.startswith("!"): return
+        if tag.startswith("/"):
+            if len(self.unfinished) == 1: return
+            node = self.unfinished.pop()
+            parent = self.unfinished[-1]
+            parent.children.append(node)
+        elif tag in self.SELF_CLOSING_TAGS:
+            parent = self.unfinished[-1]
+            node = Element(tag, attributes, parent)
+            parent.children.append(node)
+        else:
+            parent = self.unfinished[-1] if self.unfinished else None
+            node = Element(tag, attributes, parent)
+            self.unfinished.append(node)
+
+
+    def get_attributes(self, text):
+        parts = text.split()
+        tag = parts[0].lower()
+        attributes = {}
+        for attrpair in parts[1:]:
+            if "=" in attrpair:
+                key, value = attrpair.split("=", 1)
+                if len(value) > 2 and value[0] in ["'", "\""]:
+                    value = value[1:-1]
+                attributes[key.lower()] = value
+            else:
+                attributes[attrpair.lower()] = ""
+        return tag, attributes
+
+
+    def finish(self):
+        if len(self.unfinished) == 0:
+            self.add_tag("html")
+        while len(self.unfinished) > 1:
+            node = self.unfinished.pop()
+            parent = self.unfinished[-1]
+            parent.children.append(node)
+        return self.unfinished.pop()
+
+
+    def parse(self):
+        text = ""
+        in_tag = False
+        for c in self.body:
+            if c == "<":
+                in_tag = True
+                if text: self.add_text(text)
+                text = ""
+            elif c == ">":
+                in_tag = False
+                self.add_tag(text)
+                text = ""
+            else:
+                text += c
+        if not in_tag and text:
+            self.add_text(text)
+        return self.finish()
 
 
 def request(url):
@@ -202,26 +306,6 @@ def request(url):
     return headers, body
 
 
-def lex(body):
-    out = []
-    text = ""
-    in_tag = False
-    for c in body:
-        if c == "<":
-            in_tag = True
-            if text: out.append(Text(text))
-            text = ""
-        elif c == ">":
-            in_tag = False
-            out.append(Tag(text))
-            text = ""
-        else:
-            text += c
-    if not in_tag and text:
-        out.append(Text(text))
-    return out
-
-
 def get_font(size, weight, slant):
     key = (size, weight, slant)
     if key not in FONTS:
@@ -230,6 +314,12 @@ def get_font(size, weight, slant):
     return FONTS[key]
 
 
+def print_tree(node, indent=0):
+    print(" " * indent, node)
+    for child in node.children:
+        print_tree(child, indent + 2)
+
+
 if __name__ == "__main__":
     EXEC_TIME = timer()
     import sys