Meta: Make import-wpt-test.py use html.parser, not BeautifulSoup
This change switches the Meta/import-wpt-test.py script to using the
standard html.parser module rather than BeautifulSoup.

Without this change, the script fails the first time a contributor
tries to run it if they don't have BeautifulSoup installed.
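
For reference, the html.parser approach the script now takes looks
roughly like this; a minimal self-contained sketch, with a made-up
HTML snippet for illustration:

    from html.parser import HTMLParser

    src_values = []

    class ScriptSrcValueFinder(HTMLParser):
        # handle_starttag receives attrs as a list of (name, value) tuples
        def handle_starttag(self, tag, attrs):
            if tag == "script":
                attr_dict = dict(attrs)
                if "src" in attr_dict:
                    src_values.append(attr_dict["src"])

    parser = ScriptSrcValueFinder()
    parser.feed('<script src="/resources/testharness.js"></script>')
    print(src_values)  # -> ['/resources/testharness.js']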

Note that this patch also includes a small unrelated change: it
switches to os.path.normpath, rather than Path.absolute(), to
"normalize" the destination names of the downloaded test files.
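
As a quick illustration of the difference (the path below is
hypothetical, not necessarily one the script produces):

    import os
    from pathlib import Path

    destination = 'Tests/LibWeb/Text/input/wpt-import/../wpt-import/dom/historical.html'

    # os.path.normpath cleans the name up lexically and keeps it relative:
    print(os.path.normpath(destination))
    # -> Tests/LibWeb/Text/input/wpt-import/dom/historical.html

    # Path.absolute() instead anchors it to the current working directory,
    # and does not collapse the '..' component (that would be resolve()):
    print(Path(destination).absolute())
    # -> /path/to/cwd/Tests/LibWeb/Text/input/wpt-import/../wpt-import/dom/historical.html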

(cherry picked from commit 286511c4cf9dcd985882c9bbb2de42b01e7494a9)
sideshowbarker authored and nico committed Nov 22, 2024
1 parent c5b9ce1 commit c4b4f29
Showing 1 changed file with 25 additions and 26 deletions.
--- a/Meta/import-wpt-test.py
+++ b/Meta/import-wpt-test.py
@@ -2,8 +2,9 @@
 
 import os
 import sys
+
+from html.parser import HTMLParser
 from pathlib import Path
-from bs4 import BeautifulSoup
 from urllib.parse import urljoin
 from urllib.request import urlopen
 from collections import namedtuple
@@ -13,18 +14,16 @@
 wpt_expected_path = 'Tests/LibWeb/Text/expected/wpt-import'
 PathMapping = namedtuple('PathMapping', ['source', 'destination'])
 
-def get_script_sources(page_source):
-    # Find all the <script> tags
-    scripts = [script for script in page_source.findAll('script')]
-
-    # Get the src attribute of each script tag
-    sources = list(map(lambda x: x.get('src'), scripts))
-
-    # Remove None values
-    sources = list(filter(lambda x: x is not None, sources))
-
-    return sources
+src_values = []
+
+
+class ScriptSrcValueFinder(HTMLParser):
+    def handle_starttag(self, tag, attrs):
+        if tag == "script":
+            attr_dict = dict(attrs)
+            if "src" in attr_dict:
+                src_values.append(attr_dict["src"])
 
 
 def map_to_path(sources, is_resource=True, resource_path=None):
@@ -65,25 +64,23 @@ def modify_sources(files):
         parent_folder_path = '../' * parent_folder_count
 
         with open(file, 'r') as f:
-            page_source = BeautifulSoup(f.read(), 'html.parser')
-
-        # Iterate all scripts and overwrite the src attribute
-        scripts = [script for script in page_source.findAll('script')]
-        for script in scripts:
-            if script.get('src') is not None:
-                if script['src'].startswith('/'):
-                    script['src'] = parent_folder_path + script['src'][1::]
+            page_source = f.read()
 
-        with open(file, 'w') as f:
-            f.write(str(page_source))
+        # Iterate all scripts and overwrite the src attribute
+        for i, src_value in enumerate(src_values):
+            if src_value.startswith('/'):
+                new_src_value = parent_folder_path + src_value[1::]
+                page_source = page_source.replace(src_value, new_src_value)
+
+        with open(file, 'w') as f:
+            f.write(str(page_source))
 
 
 def download_files(filepaths):
     downloaded_files = []
 
     for file in filepaths:
         source = urljoin(file.source, "/".join(file.source.split('/')[3:]))
-        destination = Path(file.destination).absolute()
+        destination = Path(os.path.normpath(file.destination))
 
         if destination.exists():
             print(f"Skipping {destination} as it already exists")
@@ -132,13 +129,15 @@ def main():
     main_paths = map_to_path(main_file, False)
     files_to_modify = download_files(main_paths)
     create_expectation_files(main_paths)
-    modify_sources(files_to_modify)
 
-    page = urlopen(url_to_import)
-    page_source = BeautifulSoup(page, 'html.parser')
+    with urlopen(url_to_import) as response:
+        page = response.read().decode("utf-8")
+
+    parser = ScriptSrcValueFinder()
+    parser.feed(page)
 
-    scripts = get_script_sources(page_source)
-    script_paths = map_to_path(scripts, True, resource_path)
+    modify_sources(files_to_modify)
+    script_paths = map_to_path(src_values, True, resource_path)
     download_files(script_paths)
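
For illustration, the rewriting step in the new modify_sources() can be
exercised on its own; the src value and folder depth below are made up:

    # Standalone rerun of the replace loop from modify_sources() above,
    # with hypothetical inputs.
    src_values = ['/resources/testharness.js']
    parent_folder_count = 2
    parent_folder_path = '../' * parent_folder_count

    page_source = '<script src="/resources/testharness.js"></script>'
    for src_value in src_values:
        if src_value.startswith('/'):
            new_src_value = parent_folder_path + src_value[1::]
            page_source = page_source.replace(src_value, new_src_value)

    print(page_source)
    # -> <script src="../../resources/testharness.js"></script>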
