mirror of
https://kkgithub.com/chinese-poetry/chinese-poetry.git
synced 2026-04-04 16:07:30 +00:00
与之前的版本校对
This commit is contained in:
136
UpdateCi.py
Normal file
136
UpdateCi.py
Normal file
@@ -0,0 +1,136 @@
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
from difflib import SequenceMatcher
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
from bs4.element import NavigableString
|
||||
|
||||
|
||||
def get_page_content(page: int) -> list:
    """Fetch one page of the index listing and return its entries.

    Each entry carries the tune title ("rhythmic"), the author, and the
    poem paragraphs fetched through get_paragraphs().
    """
    entries = []
    response = requests.post("http://qsc.zww.cn/getdata.asp", data={
        "seektype": 2,
        "seekvalue": "",
        "pageno": page
    })
    # The site serves GBK-encoded text.
    response.encoding = "gbk"
    listing = re.search(r"filllist\('·(.*?)'\);", response.text).group(1)
    soup = BeautifulSoup(listing, features="lxml")
    # Anchors come in pairs: the title link followed by the author link.
    for index, anchor in enumerate(soup.find_all(name="a")):
        if index % 2 == 1:
            entries[-1]["author"] = anchor.string
        else:
            entries.append({
                "rhythmic": anchor.string.split("(")[0],
                "param": re.search(r"doseek2\((.*?)\);", anchor["onclick"]).group(1).split(",")
            })
    # Resolve each entry's detail-page parameters into its paragraphs.
    for entry in entries:
        seek_type, seek_value = int(entry["param"][0]), int(entry["param"][1])
        entry["paragraphs"] = get_paragraphs(seek_type, seek_value)
        del entry["param"]
    return entries
|
||||
|
||||
|
||||
def get_paragraphs(seek_type: int, seek_value: int) -> list:
    """Fetch the text paragraphs of a single poem from the detail endpoint."""
    response = requests.post("http://qsc.zww.cn/getdata.asp", data={
        "seektype": seek_type,
        "seekvalue": seek_value,
        "pageno": 1
    })
    # The site serves GBK-encoded text.
    response.encoding = "gbk"
    body = re.search(r"fillbody\('(.*?)'\);", response.text).group(1)
    soup = BeautifulSoup(body, features="lxml")
    # Plain text nodes directly inside the unaligned <p> are the poem lines;
    # tag children (e.g. <br>) are skipped.
    return [child
            for child in soup.find(name="p", align=None).contents
            if isinstance(child, NavigableString)]
|
||||
|
||||
|
||||
def get_all_page(temp_file: str):
    """Scrape every index page and save the collected data to *temp_file*.

    Bug fix: the original extended a module-level ``all_data`` that is only
    bound later in ``__main__`` (after this function has already been
    called), so a fresh run raised NameError.  Collect into a local list
    instead; ``__main__`` re-reads the result from *temp_file* anyway.
    """
    all_data = []
    # The index has 1239 pages (1..1239).
    for page in range(1, 1240):
        all_data.extend(get_page_content(page))
        logging.info("Success: save page {0}".format(page))
    with open(temp_file, "w", encoding="utf-8") as f:
        f.write(json.dumps(all_data, indent=2, ensure_ascii=False))
|
||||
|
||||
|
||||
def only_text(text: str):
    """Strip punctuation and whitespace, keeping only the bare characters."""
    # Full-width punctuation used by the source site, plus any whitespace.
    punctuation = r"[,。、《》…()·・\s]"
    return re.sub(punctuation, "", text)
|
||||
|
||||
|
||||
def update_file_data(old_data: list, new_data: list, offset: int = None):
    """Update entries of *old_data* in place from freshly scraped *new_data*.

    old_data[i] is compared against new_data[offset + i].  *offset* defaults
    to the module-level ``start`` set per file by the ``__main__`` driver —
    the parameter removes that hidden-global dependency while keeping the
    existing two-argument call working.
    """
    if offset is None:
        offset = start  # module-level global set in __main__ for each file
    for i in range(len(old_data)):
        old_text = only_text("".join(old_data[i]["paragraphs"]))
        new_text = only_text("".join(new_data[offset + i]["paragraphs"]))
        # Similarity of the bare text with punctuation stripped.
        ratio = SequenceMatcher(a=old_text, b=new_text).quick_ratio()
        if 0.9 <= ratio < 1.0:
            # Assumed to indicate missing characters: take the new version.
            old_data[i]["author"] = new_data[offset + i]["author"]
            old_data[i]["paragraphs"] = new_data[offset + i]["paragraphs"]
        elif ratio < 0.9:
            # Too different — log a warning and leave the entry untouched.
            logging.warning(old_text)
            logging.warning(new_text)
        else:
            # Texts match exactly; still refresh the author field.
            old_data[i]["author"] = new_data[offset + i]["author"]
|
||||
|
||||
|
||||
# Replacement table mapping traditional/variant characters to their
# simplified equivalents; applied to every paragraph by correct() below.
# Multi-character keys (e.g. "崑崙") cover words whose conversion is not
# one-to-one per character.
char_dict = {
    "鵷": "鹓",
    "颭": "飐",
    "鷁": "鹢",
    "鴞": "鸮",
    "餖": "饾",
    "飣": "饤",
    "舃": "舄",
    "駸": "骎",
    "薄倖": "薄幸",
    "赬": "赪",
    "鷫鸘": "鹔鹴",
    "嶮": "崄",
    "後": "后",
    "纇": "颣",
    "颸": "飔",
    "崑崙": "昆仑",
    "曨": "昽"
}
|
||||
|
||||
|
||||
def correct(old_data: list, mapping: dict = None):
    """Convert selected traditional characters to simplified ones in place.

    Every paragraph string of every entry is run through the replacement
    table (*mapping*, defaulting to the module-level ``char_dict``); the new
    parameter keeps existing one-argument calls working.
    """
    if mapping is None:
        mapping = char_dict
    for entry in old_data:
        paragraphs = entry["paragraphs"]
        for j in range(len(paragraphs)):
            for old_char, new_char in mapping.items():
                # str.replace is a no-op when old_char is absent, so the
                # original's membership pre-check was redundant.
                paragraphs[j] = paragraphs[j].replace(old_char, new_char)
|
||||
|
||||
|
||||
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO,
                        format="%(asctime)s - %(levelname)-9s %(filename)-15s[:%(lineno)d]\t%(message)s")
    temp_file_name = "all.json"
    # Scrape first if the temporary file does not exist yet.
    if not os.path.exists(temp_file_name):
        get_all_page(temp_file_name)
    # Read the temporary file back in.
    # Fix: was hard-coded "all.json", bypassing temp_file_name.
    with open(temp_file_name, "r", encoding="utf-8") as f:
        all_data = json.load(f)
    # Walk the ci directory for the per-file poem collections.
    for file_name in os.listdir("./ci/"):
        if re.match(r"ci\.song\.\d+\.json", file_name):
            # Index of this file's first entry within all_data.
            start = int(file_name.split(".")[2])
            with open("./ci/" + file_name, "r", encoding="utf-8") as f:
                file_data = json.load(f)
            # update_file_data(file_data, all_data)
            correct(file_data)
            # Save; the original files put a space after commas, keep that.
            with open("./ci/" + file_name, "w", encoding="utf-8") as f:
                f.write(json.dumps(file_data, indent=2, ensure_ascii=False).replace(",", ", "))
            logging.info("Save " + file_name)
|
||||
@@ -63,6 +63,54 @@ def only_text(text: str):
|
||||
return re.sub(r"[,。、《》…()·・\s]", "", text)
|
||||
|
||||
|
||||
def update_file_data(old_data: list, new_data: list):
|
||||
for i in range(len(old_data)):
|
||||
old_text = only_text("".join(old_data[i]["paragraphs"]))
|
||||
new_text = only_text("".join(new_data[start + i]["paragraphs"]))
|
||||
# 计算纯文字的相似度
|
||||
ratio = SequenceMatcher(a=old_text, b=new_text).quick_ratio()
|
||||
if 0.9 <= ratio < 1.0:
|
||||
# 假定此范围内说明缺字,需要更新
|
||||
old_data[i]["author"] = new_data[start + i]["author"]
|
||||
old_data[i]["paragraphs"] = new_data[start + i]["paragraphs"]
|
||||
elif ratio < 0.9:
|
||||
# 异常情况warning输出,不更新
|
||||
logging.warning(old_text)
|
||||
logging.warning(new_text)
|
||||
else:
|
||||
old_data[i]["author"] = new_data[start + i]["author"]
|
||||
|
||||
|
||||
char_dict = {
|
||||
"鵷": "鹓",
|
||||
"颭": "飐",
|
||||
"鷁": "鹢",
|
||||
"鴞": "鸮",
|
||||
"餖": "饾",
|
||||
"飣": "饤",
|
||||
"舃": "舄",
|
||||
"駸": "骎",
|
||||
"薄倖": "薄幸",
|
||||
"赬": "赪",
|
||||
"鷫鸘": "鹔鹴",
|
||||
"嶮": "崄",
|
||||
"後": "后",
|
||||
"纇": "颣",
|
||||
"颸": "飔",
|
||||
"崑崙": "昆仑",
|
||||
"曨": "昽"
|
||||
}
|
||||
|
||||
|
||||
def correct(old_data: list):
|
||||
""" 部分繁体转为简体 """
|
||||
for i in range(len(old_data)):
|
||||
for j in range(len(old_data[i]["paragraphs"])):
|
||||
for k, v in char_dict.items():
|
||||
if k in old_data[i]["paragraphs"][j]:
|
||||
old_data[i]["paragraphs"][j] = old_data[i]["paragraphs"][j].replace(k, v)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
logging.basicConfig(level=logging.INFO,
|
||||
format="%(asctime)s - %(levelname)-9s %(filename)-15s[:%(lineno)d]\t%(message)s")
|
||||
@@ -73,8 +121,6 @@ if __name__ == '__main__':
|
||||
# 读取临时文件
|
||||
with open("all.json", "r", encoding="utf-8") as f:
|
||||
all_data = json.load(f)
|
||||
# 统计更正的数目
|
||||
diff_num = 0
|
||||
# 遍历当前目录
|
||||
for file_name in os.listdir("./"):
|
||||
if re.match(r"ci\.song\.\d+\.json", file_name):
|
||||
@@ -82,22 +128,9 @@ if __name__ == '__main__':
|
||||
start = int(file_name.split(".")[2])
|
||||
with open(file_name, "r", encoding="utf-8") as f:
|
||||
file_data = json.load(f)
|
||||
for i in range(len(file_data)):
|
||||
old_text = only_text("".join(file_data[i]["paragraphs"]))
|
||||
new_text = only_text("".join(all_data[start + i]["paragraphs"]))
|
||||
# 计算纯文字的相似度
|
||||
ratio = SequenceMatcher(a=old_text, b=new_text).quick_ratio()
|
||||
if 0.9 <= ratio < 1.0:
|
||||
# 假定此范围内说明缺字,需要更新
|
||||
diff_num += 1
|
||||
file_data[i]["author"] = all_data[start + i]["author"]
|
||||
file_data[i]["paragraphs"] = all_data[start + i]["paragraphs"]
|
||||
elif ratio < 0.9:
|
||||
# 异常情况warning输出,不更新
|
||||
logging.warning(old_text)
|
||||
logging.warning(new_text)
|
||||
update_file_data(file_data, all_data)
|
||||
correct(file_data)
|
||||
# 保存数据,原文件中逗号后有空格,这里保持一致
|
||||
with open(file_name, "w", encoding="utf-8") as f:
|
||||
f.write(json.dumps(file_data, indent=2, ensure_ascii=False).replace(",", ", "))
|
||||
logging.info("Save " + file_name)
|
||||
logging.info("Change {0} items".format(diff_num))
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -146,7 +146,7 @@
|
||||
"幕天席地。",
|
||||
"瑞脑香浓笙歌沸。",
|
||||
"白纻衣轻。",
|
||||
"鵷发霜髯照座明。",
|
||||
"鹓发霜髯照座明。",
|
||||
"轻簪小珥。",
|
||||
"却是人间真富贵。",
|
||||
"好着丹青。",
|
||||
@@ -214,7 +214,7 @@
|
||||
"paragraphs": [
|
||||
"香烟绕遍兰堂宴。",
|
||||
"香鸭珠帘卷。",
|
||||
"香风转後送韶音。",
|
||||
"香风转后送韶音。",
|
||||
"香酝佳筵今日、庆佳辰。",
|
||||
"香山烧尽禽飞放。",
|
||||
"香袖佳人唱。",
|
||||
@@ -327,7 +327,7 @@
|
||||
"昴宿呈祥,符应生公族。",
|
||||
"盖世功名夸九牧。",
|
||||
"黼衮褒扬,庆阀辉南北。",
|
||||
"赐宫醪,分笃<EFBFBD>酢<EFBFBD>",
|
||||
"赐宫醪,分笃耨。",
|
||||
"天与长生,谩把仙椿祝。",
|
||||
"好继平阳腾茂躅。",
|
||||
"富贵千秋,饮听瑶池曲。"
|
||||
@@ -565,7 +565,7 @@
|
||||
"龙角辉春,蛾春惊晓,梦阑金翠屏开。",
|
||||
"异芬薰室,风送蕊仙来。",
|
||||
"玉女擎香沐浴,人间世、洗彻凡埃。",
|
||||
"梅开後,留花酝染,清味俗难猜。",
|
||||
"梅开后,留花酝染,清味俗难猜。",
|
||||
"东君,尤雅爱,传香芳畹,香发庭陔。",
|
||||
"宁馨满尊前,喜奏瑶台。",
|
||||
"便好纽为佩王,瀛洲路、同赏蓬莱。",
|
||||
@@ -576,7 +576,7 @@
|
||||
{
|
||||
"author": "潘熊飞",
|
||||
"paragraphs": [
|
||||
"十日後重阳。",
|
||||
"十日后重阳。",
|
||||
"甘菊阶前满意黄。",
|
||||
"生日无钱留贺客,何妨。",
|
||||
"尚有儿曹理寿觞。",
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user