统计文本文件中单词的频率，并按出现频率从高到低输出去重后的单词列表（每行一个单词，不含次数）
dedup.py
python
import re, sys, os
from collections import Counter
def _ranked_words(text):
    """Return the distinct words of *text*, most frequent first.

    A word is a maximal run of ASCII letters. The text is lower-cased
    before matching, so counting is case-insensitive and every returned
    word is lower-case. Words with the same count keep the order in which
    they first appear in *text*.
    """
    # After lower() no upper-case ASCII letter can remain, so [a-z] suffices.
    counts = Counter(re.findall(r"[a-z]+", text.lower()))
    return [word for word, _ in counts.most_common()]


def _word_output_path(filepath):
    """Return the output path for *filepath*: ``<root>_word<ext>``."""
    root, ext = os.path.splitext(filepath)
    # splitext keeps the leading dot in ext (".txt"), or returns "" when
    # there is no extension, so no separator dot may be added here.
    return f"{root}_word{ext}"


def main(argv=None):
    """Write the frequency-ranked distinct words of a text file.

    Reads the UTF-8 file named by ``argv[1]`` and writes its distinct
    words, one per line and most frequent first, to a sibling file whose
    name has ``_word`` inserted before the extension
    (``notes.txt`` -> ``notes_word.txt``).

    Args:
        argv: Argument vector including the program name; defaults to
            ``sys.argv``.

    Raises:
        SystemExit: If no input file path was supplied.
        OSError: If the input cannot be read or the output cannot be
            written.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        sys.exit("usage: dedup.py <text-file>")

    filepath = argv[1]
    with open(filepath, "r", encoding="utf-8") as fp:
        words = _ranked_words(fp.read())

    with open(_word_output_path(filepath), "w", encoding="utf-8") as fp:
        fp.write("\n".join(words))


if __name__ == "__main__":
    main()