Skip to content

用 BeautifulSoup 和 html5lib 提取英文单词

parse_html_as_words.py

python
from bs4 import BeautifulSoup
import re

# Read the saved page, collect every run of ASCII letters found inside the
# first <small> element of each entry under <div class="words">, and write
# the words one per line to the output file.
with open("a.html", "r", encoding="utf-8") as fp:
    content = fp.read()

soup = BeautifulSoup(content, features="html5lib")
div_words = soup.find("div", {"class": "words"})
if div_words is None:
    # find() returns None when nothing matches; fail with a clear message
    # instead of the opaque "'NoneType' object is not iterable".
    raise SystemExit('a.html: no <div class="words"> element found')

# Compiled once, outside the loop; matches runs of ASCII letters only.
word_pattern = re.compile(r"[a-zA-Z]+")

words = []
# Visit direct child *tags* only. Iterating the container itself also yields
# the text nodes between entries, on which .find() is str.find and returns
# an int rather than a tag.
for entry in div_words.find_all(True, recursive=False):
    small = entry.find("small")
    if small is not None:
        words.extend(word_pattern.findall(small.text))

# Words are written in document order, duplicates kept, with no trailing
# newline after the last word.
with open("4365_word.txt", "w", encoding="utf-8") as fp:
    fp.write("\n".join(words))