与之前的版本校对

This commit is contained in:
chienmy
2022-05-02 13:54:14 +08:00
parent 476802ea96
commit 9da55b4d6f
24 changed files with 3413 additions and 3244 deletions

136
UpdateCi.py Normal file
View File

@@ -0,0 +1,136 @@
import json
import logging
import os
import re
from difflib import SequenceMatcher
import requests
from bs4 import BeautifulSoup
from bs4.element import NavigableString
def get_page_content(page: int) -> list:
    """Fetch one page of the index listing from qsc.zww.cn.

    Posts to the site's getdata.asp endpoint (seektype=2 selects the index
    listing) and parses the payload embedded in the JS callback of the
    response.

    :param page: 1-based page number of the index.
    :return: list of dicts with keys "rhythmic", "author", "paragraphs".
    """
    content = []
    r = requests.post("http://qsc.zww.cn/getdata.asp", data={
        "seektype": 2,
        "seekvalue": "",
        "pageno": page
    })
    # The site serves GBK-encoded text; force the decoding before parsing.
    r.encoding = "gbk"
    # The listing HTML is the string argument of a filllist('...') JS call.
    soup = BeautifulSoup(re.search(r"filllist\('·(.*?)'\);", r.text).group(1), features="lxml")
    # <a> tags alternate: even index = title link, odd index = author link.
    for i, a in enumerate(soup.find_all(name="a")):
        if i % 2 == 0:
            content.append({
                # NOTE(review): split("") raises ValueError in Python — the
                # separator character appears to have been lost in
                # transcription; confirm against the original source.
                "rhythmic": a.string.split("")[0],
                # doseek2(a,b) carries the seek parameters for the detail query.
                "param": re.search(r"doseek2\((.*?)\);", a["onclick"]).group(1).split(",")
            })
        else:
            content[-1]["author"] = a.string
    # Resolve each entry's body paragraphs, then drop the raw seek params.
    for c in content:
        c["paragraphs"] = get_paragraphs(int(c["param"][0]), int(c["param"][1]))
        del c["param"]
    return content
def get_paragraphs(seek_type: int, seek_value: int) -> list:
    """Fetch the body paragraphs of a single ci poem.

    :param seek_type: seek type parameter taken from the index listing.
    :param seek_value: seek value parameter taken from the index listing.
    :return: list of paragraph strings.
    """
    response = requests.post("http://qsc.zww.cn/getdata.asp", data={
        "seektype": seek_type,
        "seekvalue": seek_value,
        "pageno": 1
    })
    # The site serves GBK-encoded text; force the decoding before parsing.
    response.encoding = "gbk"
    # The body HTML is the string argument of a fillbody('...') JS call.
    payload = re.search(r"fillbody\('(.*?)'\);", response.text).group(1)
    soup = BeautifulSoup(payload, features="lxml")
    body = soup.find(name="p", align=None)
    # Keep only the bare text nodes; element children (e.g. <br>) are skipped.
    return [node for node in body.contents if isinstance(node, NavigableString)]
def get_all_page(temp_file: str):
    """Crawl every index page and save all entries to a temp JSON file.

    :param temp_file: path of the JSON file to write.
    """
    # Fix: accumulate into a local list instead of a global `all_data`.
    # This function runs only when the temp file is absent, i.e. before the
    # __main__ block assigns the global, so the original raised NameError.
    all_data = []
    for page in range(1, 1240):
        all_data.extend(get_page_content(page))
        logging.info("Success: save page {0}".format(page))
    with open(temp_file, "w", encoding="utf-8") as f:
        f.write(json.dumps(all_data, indent=2, ensure_ascii=False))
def only_text(text: str):
    """Strip punctuation and whitespace, keeping only the text characters."""
    pattern = re.compile(r"[,。、《》…()·・\s]")
    return pattern.sub("", text)
def update_file_data(old_data: list, new_data: list):
    """Patch entries of *old_data* in place from the freshly crawled *new_data*.

    NOTE(review): reads the module-level global ``start`` (assigned per file in
    the __main__ block) as the offset of this file's entries inside *new_data*
    — confirm it is set before calling from anywhere else.

    :param old_data: entries of one existing ci.song.*.json file (mutated).
    :param new_data: the full freshly crawled dataset.
    """
    for i in range(len(old_data)):
        old_text = only_text("".join(old_data[i]["paragraphs"]))
        new_text = only_text("".join(new_data[start + i]["paragraphs"]))
        # Similarity ratio of the punctuation-stripped texts.
        ratio = SequenceMatcher(a=old_text, b=new_text).quick_ratio()
        if 0.9 <= ratio < 1.0:
            # Within this range, assume characters are merely missing in the
            # old copy and take the newly crawled text.
            old_data[i]["author"] = new_data[start + i]["author"]
            old_data[i]["paragraphs"] = new_data[start + i]["paragraphs"]
        elif ratio < 0.9:
            # Anomaly: log both versions for manual review, leave untouched.
            logging.warning(old_text)
            logging.warning(new_text)
        else:
            # Exact textual match: still refresh the author field.
            old_data[i]["author"] = new_data[start + i]["author"]
# Traditional -> simplified replacement table used by correct().
# NOTE(review): most single-character keys/values were lost in transcription —
# they render here as empty strings, and duplicate "" keys collapse into a
# single useless entry. Restore the original characters from the source
# repository before running this script; only the multi-character pairs
# ("薄倖"->"薄幸", "鷫鸘"->"鹔鹴", "崑崙"->"昆仑") survived intact.
char_dict = {
    "": "",
    "": "",
    "": "",
    "": "",
    "": "",
    "": "",
    "": "",
    "": "",
    "薄倖": "薄幸",
    "": "",
    "鷫鸘": "鹔鹴",
    "": "",
    "": "",
    "": "",
    "": "",
    "崑崙": "昆仑",
    "": ""
}
def correct(old_data: list):
    """Convert selected traditional characters to simplified, in place.

    :param old_data: entries of one ci.song.*.json file (mutated).
    """
    for entry in old_data:
        paragraphs = entry["paragraphs"]
        for idx in range(len(paragraphs)):
            text = paragraphs[idx]
            for traditional, simplified in char_dict.items():
                # Membership check first to skip the common no-op case.
                if traditional in text:
                    text = text.replace(traditional, simplified)
            paragraphs[idx] = text
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO,
                        format="%(asctime)s - %(levelname)-9s %(filename)-15s[:%(lineno)d]\t%(message)s")
    temp_file_name = "all.json"
    # Crawl first if the temp file does not exist yet.
    if not os.path.exists(temp_file_name):
        get_all_page(temp_file_name)
    # Load the crawled data. Fix: reuse temp_file_name instead of repeating
    # the "all.json" literal, so the two paths cannot drift apart.
    with open(temp_file_name, "r", encoding="utf-8") as f:
        all_data = json.load(f)
    # Walk the ./ci/ directory for the per-chunk data files.
    for file_name in os.listdir("./ci/"):
        if re.match(r"ci\.song\.\d+\.json", file_name):
            # Index of this file's first entry within all_data.
            start = int(file_name.split(".")[2])
            with open("./ci/" + file_name, "r", encoding="utf-8") as f:
                file_data = json.load(f)
            # update_file_data(file_data, all_data)
            correct(file_data)
            # Save: the original files have a space after each comma, so keep
            # the output byte-identical in style.
            with open("./ci/" + file_name, "w", encoding="utf-8") as f:
                f.write(json.dumps(file_data, indent=2, ensure_ascii=False).replace(",", ", "))
            logging.info("Save " + file_name)

View File

@@ -63,6 +63,54 @@ def only_text(text: str):
return re.sub(r"[,。、《》…()·・\s]", "", text)
def update_file_data(old_data: list, new_data: list):
for i in range(len(old_data)):
old_text = only_text("".join(old_data[i]["paragraphs"]))
new_text = only_text("".join(new_data[start + i]["paragraphs"]))
# 计算纯文字的相似度
ratio = SequenceMatcher(a=old_text, b=new_text).quick_ratio()
if 0.9 <= ratio < 1.0:
# 假定此范围内说明缺字,需要更新
old_data[i]["author"] = new_data[start + i]["author"]
old_data[i]["paragraphs"] = new_data[start + i]["paragraphs"]
elif ratio < 0.9:
# 异常情况warning输出不更新
logging.warning(old_text)
logging.warning(new_text)
else:
old_data[i]["author"] = new_data[start + i]["author"]
char_dict = {
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"薄倖": "薄幸",
"": "",
"鷫鸘": "鹔鹴",
"": "",
"": "",
"": "",
"": "",
"崑崙": "昆仑",
"": ""
}
def correct(old_data: list):
""" 部分繁体转为简体 """
for i in range(len(old_data)):
for j in range(len(old_data[i]["paragraphs"])):
for k, v in char_dict.items():
if k in old_data[i]["paragraphs"][j]:
old_data[i]["paragraphs"][j] = old_data[i]["paragraphs"][j].replace(k, v)
if __name__ == '__main__':
logging.basicConfig(level=logging.INFO,
format="%(asctime)s - %(levelname)-9s %(filename)-15s[:%(lineno)d]\t%(message)s")
@@ -73,8 +121,6 @@ if __name__ == '__main__':
# 读取临时文件
with open("all.json", "r", encoding="utf-8") as f:
all_data = json.load(f)
# 统计更正的数目
diff_num = 0
# 遍历当前目录
for file_name in os.listdir("./"):
if re.match(r"ci\.song\.\d+\.json", file_name):
@@ -82,22 +128,9 @@ if __name__ == '__main__':
start = int(file_name.split(".")[2])
with open(file_name, "r", encoding="utf-8") as f:
file_data = json.load(f)
for i in range(len(file_data)):
old_text = only_text("".join(file_data[i]["paragraphs"]))
new_text = only_text("".join(all_data[start + i]["paragraphs"]))
# 计算纯文字的相似度
ratio = SequenceMatcher(a=old_text, b=new_text).quick_ratio()
if 0.9 <= ratio < 1.0:
# 假定此范围内说明缺字,需要更新
diff_num += 1
file_data[i]["author"] = all_data[start + i]["author"]
file_data[i]["paragraphs"] = all_data[start + i]["paragraphs"]
elif ratio < 0.9:
# 异常情况warning输出不更新
logging.warning(old_text)
logging.warning(new_text)
update_file_data(file_data, all_data)
correct(file_data)
# 保存数据,原文件中逗号后有空格,这里保持一致
with open(file_name, "w", encoding="utf-8") as f:
f.write(json.dumps(file_data, indent=2, ensure_ascii=False).replace(",", ", "))
logging.info("Save " + file_name)
logging.info("Change {0} items".format(diff_num))

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -146,7 +146,7 @@
"幕天席地。",
"瑞脑香浓笙歌沸。",
"白纻衣轻。",
"发霜髯照座明。",
"发霜髯照座明。",
"轻簪小珥。",
"却是人间真富贵。",
"好着丹青。",
@@ -214,7 +214,7 @@
"paragraphs": [
"香烟绕遍兰堂宴。",
"香鸭珠帘卷。",
"香风转送韶音。",
"香风转送韶音。",
"香酝佳筵今日、庆佳辰。",
"香山烧尽禽飞放。",
"香袖佳人唱。",
@@ -327,7 +327,7 @@
"昴宿呈祥,符应生公族。",
"盖世功名夸九牧。",
"黼衮褒扬,庆阀辉南北。",
"赐宫醪,分笃<EFBFBD><EFBFBD>",
"赐宫醪,分笃耨。",
"天与长生,谩把仙椿祝。",
"好继平阳腾茂躅。",
"富贵千秋,饮听瑶池曲。"
@@ -565,7 +565,7 @@
"龙角辉春,蛾春惊晓,梦阑金翠屏开。",
"异芬薰室,风送蕊仙来。",
"玉女擎香沐浴,人间世、洗彻凡埃。",
"梅开,留花酝染,清味俗难猜。",
"梅开,留花酝染,清味俗难猜。",
"东君,尤雅爱,传香芳畹,香发庭陔。",
"宁馨满尊前,喜奏瑶台。",
"便好纽为佩王,瀛洲路、同赏蓬莱。",
@@ -576,7 +576,7 @@
{
"author": "潘熊飞",
"paragraphs": [
"十日重阳。",
"十日重阳。",
"甘菊阶前满意黄。",
"生日无钱留贺客,何妨。",
"尚有儿曹理寿觞。",

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff