Skip to content

统计文本文件中单词的频率

dedup.py

python
import re, sys, os
from collections import Counter

def rank_words(content):
    """Return the distinct words of *content*, most frequent first.

    A word is a maximal run of ASCII letters; matching is case-insensitive
    and every word is returned lower-cased. Words with equal counts keep
    the order in which they first appear in the text.
    """
    counts = Counter(re.findall(r"[a-zA-Z]+", content.lower()))
    return [word for word, _ in counts.most_common()]


def word_list_path(filepath):
    """Return the output path for *filepath*: ``<stem>_word<ext>``.

    ``os.path.splitext`` keeps the leading dot in the extension, so the
    extension is appended as-is; inserting another "." would produce names
    such as ``notes_word..txt``.
    """
    stem, ext = os.path.splitext(filepath)
    return f"{stem}_word{ext}"


def main(argv=None):
    """Write the frequency-ranked word list of a UTF-8 text file.

    Args:
        argv: Command-line arguments without the program name; defaults to
            ``sys.argv[1:]``. The first item is the input file path.

    Returns:
        The path of the word-list file that was written, one word per line.

    Raises:
        SystemExit: If no input path was supplied.
    """
    args = sys.argv[1:] if argv is None else argv
    if not args:
        sys.exit(f"usage: {os.path.basename(sys.argv[0])} <text-file>")
    filepath = args[0]

    with open(filepath, "r", encoding="utf-8") as fp:
        words = rank_words(fp.read())

    out_path = word_list_path(filepath)
    with open(out_path, "w", encoding="utf-8") as fp:
        fp.write("\n".join(words))
    return out_path


if __name__ == "__main__":
    main()