Merge pull request #302 from chienmy/master

提交经过初步校对的版本
This commit is contained in:
JG
2022-05-11 14:59:37 +08:00
committed by GitHub
24 changed files with 3282 additions and 3244 deletions

View File

@@ -70,3 +70,8 @@ sqlite> select * from ciauthor limit 1;
1||(1037-1101)...
```
## 已知问题
1. 尚存在一些繁体字,由于找不到对应的简化字,或是不能确定是否应当简化,仍保留在词中。
2. 部分异体字、通用字,不及一一对照原本考证用法,只得保留。例如:`搵`/`揾`,`酴醿`/`酴醾`,`溟濛`/`溟蒙`,`鸿濛`/`鸿蒙`等。

View File

@@ -63,6 +63,54 @@ def only_text(text: str):
return re.sub(r"[,。、《》…()·・\s]", "", text)
def update_file_data(old_data: list, new_data: list, offset=None):
    """Bring the poems in ``old_data`` up to date from ``new_data``, in place.

    Entry ``i`` of ``old_data`` is paired with entry ``offset + i`` of
    ``new_data``. The two are compared on their punctuation-stripped text
    (``only_text`` over the joined ``"paragraphs"``) and handled by similarity:

    * ratio < 0.9          -- treated as an anomaly: both texts are logged as
                              warnings and the old entry is left untouched;
    * 0.9 <= ratio < 1.0   -- assumed to be missing characters: ``"author"``
                              and ``"paragraphs"`` are both replaced;
    * ratio == 1.0         -- text already matches: only ``"author"`` is
                              replaced.

    Args:
        old_data: list of dicts with ``"author"`` and ``"paragraphs"`` keys;
            mutated in place.
        new_data: list of dicts with the same keys, indexed from ``offset``.
        offset: index in ``new_data`` that corresponds to ``old_data[0]``.
            When omitted, the module-level ``start`` assigned by the script's
            ``__main__`` section is used, which is how existing two-argument
            calls behave.

    Raises:
        NameError: if ``offset`` is omitted and no module-level ``start``
            exists (e.g. the function is called from an importing module).
        IndexError: if ``new_data`` has no entry at ``offset + i``.
    """
    if not old_data:
        return
    if offset is None:
        # Legacy path: fall back to the global set by the script entry point.
        offset = start

    for i, old_item in enumerate(old_data):
        new_item = new_data[offset + i]
        old_text = only_text("".join(old_item["paragraphs"]))
        new_text = only_text("".join(new_item["paragraphs"]))

        # Similarity of the bare text. NOTE: quick_ratio() is documented as an
        # upper bound on ratio(), so it can over-estimate the real similarity.
        ratio = SequenceMatcher(a=old_text, b=new_text).quick_ratio()

        if ratio < 0.9:
            # Anomaly: report both versions and do not update.
            logging.warning(old_text)
            logging.warning(new_text)
            continue

        old_item["author"] = new_item["author"]
        if ratio < 1.0:
            # Close but not identical: assume the old text is missing
            # characters and take the new paragraphs as well.
            old_item["paragraphs"] = new_item["paragraphs"]
# Traditional / variant forms mapped to the simplified forms that replace them.
# NOTE(review): the entries written as "": "" look like characters that were
# lost from this copy of the file (presumably rare code points) -- TODO restore
# them from the original script. As written they collapse into one
# empty-string key, which correct() skips.
char_dict = {
    "": "",
    "": "",
    "": "",
    "": "",
    "": "",
    "": "",
    "": "",
    "": "",
    "薄倖": "薄幸",
    "": "",
    "鷫鸘": "鹔鹴",
    "": "",
    "": "",
    "": "",
    "": "",
    "崑崙": "昆仑",
    "": "",
}


def correct(old_data: list):
    """Convert selected traditional forms to simplified ones, in place.

    Every string in each entry's ``"paragraphs"`` list has the ``char_dict``
    replacements applied in the dictionary's iteration order. The paragraph
    lists themselves are kept (items are reassigned by index), so other
    references to them observe the corrected text.

    Args:
        old_data: list of dicts, each with a ``"paragraphs"`` list of strings;
            mutated in place.
    """
    for entry in old_data:
        lines = entry["paragraphs"]
        for idx, text in enumerate(lines):
            for traditional, simplified in char_dict.items():
                if not traditional:
                    # An empty key matches everywhere; replacing it would
                    # insert text between every character, so never apply it.
                    continue
                if traditional in text:
                    text = text.replace(traditional, simplified)
                    lines[idx] = text
if __name__ == '__main__':
logging.basicConfig(level=logging.INFO,
format="%(asctime)s - %(levelname)-9s %(filename)-15s[:%(lineno)d]\t%(message)s")
@@ -73,8 +121,6 @@ if __name__ == '__main__':
# 读取临时文件
with open("all.json", "r", encoding="utf-8") as f:
all_data = json.load(f)
# 统计更正的数目
diff_num = 0
# 遍历当前目录
for file_name in os.listdir("./"):
if re.match(r"ci\.song\.\d+\.json", file_name):
@@ -82,22 +128,9 @@ if __name__ == '__main__':
start = int(file_name.split(".")[2])
with open(file_name, "r", encoding="utf-8") as f:
file_data = json.load(f)
for i in range(len(file_data)):
old_text = only_text("".join(file_data[i]["paragraphs"]))
new_text = only_text("".join(all_data[start + i]["paragraphs"]))
# 计算纯文字的相似度
ratio = SequenceMatcher(a=old_text, b=new_text).quick_ratio()
if 0.9 <= ratio < 1.0:
# 假定此范围内说明缺字,需要更新
diff_num += 1
file_data[i]["author"] = all_data[start + i]["author"]
file_data[i]["paragraphs"] = all_data[start + i]["paragraphs"]
elif ratio < 0.9:
# 异常情况warning输出不更新
logging.warning(old_text)
logging.warning(new_text)
update_file_data(file_data, all_data)
correct(file_data)
# 保存数据,原文件中逗号后有空格,这里保持一致
with open(file_name, "w", encoding="utf-8") as f:
f.write(json.dumps(file_data, indent=2, ensure_ascii=False).replace(",", ", "))
logging.info("Save " + file_name)
logging.info("Change {0} items".format(diff_num))

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -146,7 +146,7 @@
"幕天席地。",
"瑞脑香浓笙歌沸。",
"白纻衣轻。",
"发霜髯照座明。",
"发霜髯照座明。",
"轻簪小珥。",
"却是人间真富贵。",
"好着丹青。",
@@ -214,7 +214,7 @@
"paragraphs": [
"香烟绕遍兰堂宴。",
"香鸭珠帘卷。",
"香风转送韶音。",
"香风转送韶音。",
"香酝佳筵今日、庆佳辰。",
"香山烧尽禽飞放。",
"香袖佳人唱。",
@@ -327,7 +327,7 @@
"昴宿呈祥,符应生公族。",
"盖世功名夸九牧。",
"黼衮褒扬,庆阀辉南北。",
"赐宫醪,分笃<EFBFBD><EFBFBD>",
"赐宫醪,分笃耨。",
"天与长生,谩把仙椿祝。",
"好继平阳腾茂躅。",
"富贵千秋,饮听瑶池曲。"
@@ -565,7 +565,7 @@
"龙角辉春,蛾春惊晓,梦阑金翠屏开。",
"异芬薰室,风送蕊仙来。",
"玉女擎香沐浴,人间世、洗彻凡埃。",
"梅开,留花酝染,清味俗难猜。",
"梅开,留花酝染,清味俗难猜。",
"东君,尤雅爱,传香芳畹,香发庭陔。",
"宁馨满尊前,喜奏瑶台。",
"便好纽为佩王,瀛洲路、同赏蓬莱。",
@@ -576,7 +576,7 @@
{
"author": "潘熊飞",
"paragraphs": [
"十日重阳。",
"十日重阳。",
"甘菊阶前满意黄。",
"生日无钱留贺客,何妨。",
"尚有儿曹理寿觞。",

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff