mirror of
https://kkgithub.com/chinese-poetry/chinese-poetry.git
synced 2026-04-09 10:27:32 +00:00
Add: 更正脚本
This commit is contained in:
103
ci/UpdateCi.py
Normal file
103
ci/UpdateCi.py
Normal file
@@ -0,0 +1,103 @@
|
|||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
from difflib import SequenceMatcher
|
||||||
|
|
||||||
|
import requests
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
from bs4.element import NavigableString
|
||||||
|
|
||||||
|
|
||||||
|
def get_page_content(page: int) -> list:
    """Fetch one page of the index listing and resolve each entry's body text.

    Posts to the qsc.zww.cn search endpoint, parses the JavaScript
    ``filllist('·…');`` payload out of the response, and returns a list of
    dicts with keys ``rhythmic``, ``author`` and ``paragraphs``.

    :param page: 1-based index-page number to fetch.
    :return: list of ``{"rhythmic", "author", "paragraphs"}`` dicts.
    """
    content = []
    r = requests.post("http://qsc.zww.cn/getdata.asp", data={
        "seektype": 2,
        "seekvalue": "",
        "pageno": page
    }, timeout=30)  # fix: without a timeout a stalled server hangs the crawl forever
    r.encoding = "gbk"  # the site serves GBK-encoded text
    soup = BeautifulSoup(re.search(r"filllist\('·(.*?)'\);", r.text).group(1), features="lxml")
    # <a> tags alternate: even index = title link, odd index = author link
    for i, a in enumerate(soup.find_all(name="a")):
        if i % 2 == 0:
            content.append({
                # title looks like "rhythmic(…)": keep only the part before "("
                "rhythmic": a.string.split("(")[0],
                # onclick carries the doseek2(type, value) params needed to fetch the body
                "param": re.search(r"doseek2\((.*?)\);", a["onclick"]).group(1).split(",")
            })
        else:
            content[-1]["author"] = a.string
    # Second pass: resolve each entry's paragraphs and drop the internal params
    for c in content:
        c["paragraphs"] = get_paragraphs(int(c["param"][0]), int(c["param"][1]))
        del c["param"]
    return content
|
||||||
|
|
||||||
|
|
||||||
|
def get_paragraphs(seek_type: int, seek_value: int) -> list:
    """Fetch the paragraphs (body text) of a single ci poem.

    Posts the ``doseek2``-style parameters back to the site, parses the
    ``fillbody('…');`` payload, and collects the bare text nodes of the
    first un-aligned ``<p>`` element.

    :param seek_type: seek type extracted from the index page's onclick.
    :param seek_value: seek value extracted from the index page's onclick.
    :return: list of paragraph strings.
    """
    paragraphs = []
    r = requests.post("http://qsc.zww.cn/getdata.asp", data={
        "seektype": seek_type,
        "seekvalue": seek_value,
        "pageno": 1
    }, timeout=30)  # fix: without a timeout a stalled server hangs the crawl forever
    r.encoding = "gbk"  # the site serves GBK-encoded text
    soup = BeautifulSoup(re.search(r"fillbody\('(.*?)'\);", r.text).group(1), features="lxml")
    # Only raw text nodes are paragraphs; tags (e.g. <br>) are separators
    for child in soup.find(name="p", align=None).contents:
        if isinstance(child, NavigableString):
            paragraphs.append(child)
    return paragraphs
|
||||||
|
|
||||||
|
|
||||||
|
def get_all_page(temp_file: str):
    """Crawl every index page and dump the collected data to *temp_file*.

    :param temp_file: path of the JSON file to write the crawl result to.
    """
    # Fix: the original extended a global ``all_data`` that is only bound in
    # __main__ *after* this function runs, so a fresh run raised NameError.
    # Accumulate locally instead; callers read the result from the temp file.
    all_data = []
    for page in range(1, 1240):  # pages are 1-based; 1239 index pages total
        all_data.extend(get_page_content(page))
        logging.info("Success: save page {0}".format(page))
    with open(temp_file, "w", encoding="utf-8") as f:
        f.write(json.dumps(all_data, indent=2, ensure_ascii=False))
|
||||||
|
|
||||||
|
|
||||||
|
def only_text(text: str):
    """Return *text* with punctuation and whitespace stripped out."""
    punctuation = re.compile(r"[,。、《》…()·・\s]")
    return punctuation.sub("", text)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO,
                        format="%(asctime)s - %(levelname)-9s %(filename)-15s[:%(lineno)d]\t%(message)s")
    temp_file_name = "all.json"
    # Crawl first if the temp file does not exist yet
    if not os.path.exists(temp_file_name):
        get_all_page(temp_file_name)
    # Read the crawled reference data
    # (fix: use temp_file_name instead of repeating the "all.json" literal)
    with open(temp_file_name, "r", encoding="utf-8") as f:
        all_data = json.load(f)
    # Count of corrected items
    diff_num = 0
    # Walk the current directory for ci.song.<start>.json data files
    for file_name in os.listdir("./"):
        if re.match(r"ci\.song\.\d+\.json", file_name):
            # Index into all_data where this file's records begin
            # (taken from the number embedded in the file name)
            start = int(file_name.split(".")[2])
            with open(file_name, "r", encoding="utf-8") as f:
                file_data = json.load(f)
            for i in range(len(file_data)):
                old_text = only_text("".join(file_data[i]["paragraphs"]))
                new_text = only_text("".join(all_data[start + i]["paragraphs"]))
                # Similarity of the bare text with punctuation removed
                ratio = SequenceMatcher(a=old_text, b=new_text).quick_ratio()
                if 0.9 <= ratio < 1.0:
                    # Assume this range indicates missing characters: update
                    # from the freshly crawled version
                    diff_num += 1
                    file_data[i]["author"] = all_data[start + i]["author"]
                    file_data[i]["paragraphs"] = all_data[start + i]["paragraphs"]
                elif ratio < 0.9:
                    # Too different to auto-correct; warn for manual review,
                    # leave the record unchanged
                    logging.warning(old_text)
                    logging.warning(new_text)
            # Save: the original files have a space after each comma,
            # so keep that formatting consistent
            with open(file_name, "w", encoding="utf-8") as f:
                f.write(json.dumps(file_data, indent=2, ensure_ascii=False).replace(",", ", "))
            logging.info("Save " + file_name)
    logging.info("Change {0} items".format(diff_num))
|
||||||
Reference in New Issue
Block a user