用 BeautifulSoup 和 html5lib 提取英文单词
parse_html_as_words.py
python
from bs4 import BeautifulSoup
import re
# Extract every English word found in the first <small> element of each
# direct child tag of <div class="words"> in a.html, and write the words,
# one per line, to 4365_word.txt.
with open("a.html", "r", encoding="utf-8") as fp:
    content = fp.read()

soup = BeautifulSoup(content, features="html5lib")
div_words = soup.find("div", {"class": "words"})
if div_words is None:
    # Without this guard the loop below would fail with an opaque
    # "'NoneType' object is not iterable" TypeError.
    raise SystemExit('a.html contains no <div class="words"> element')

# Compiled once so the pattern lookup is not repeated for every child.
word_pattern = re.compile(r"[a-zA-Z]+")
words = []

# Visit direct child *tags* only. Iterating the tag itself also yields text
# nodes, and calling find() on those resolves to str.find, which returns an
# int rather than a tag.
for child in div_words.find_all(True, recursive=False):
    small = child.find("small")
    if small is not None:
        words.extend(word_pattern.findall(small.text))

with open("4365_word.txt", "w", encoding="utf-8") as fp:
    fp.write("\n".join(words))