Merge pull request #288 from chienmy/master

宋词中部分生僻字缺失
This commit is contained in:
JG
2022-05-01 18:45:24 +08:00
committed by GitHub
23 changed files with 3532 additions and 3429 deletions

103
ci/UpdateCi.py Normal file
View File

@@ -0,0 +1,103 @@
import json
import logging
import os
import re
from difflib import SequenceMatcher
import requests
from bs4 import BeautifulSoup
from bs4.element import NavigableString
def get_page_content(page: int) -> list:
    """Fetch one index page and the full text of every entry listed on it.

    Posts a listing query (``seektype`` 2, empty ``seekvalue``) for ``page`` to
    ``http://qsc.zww.cn/getdata.asp``, decodes the reply as GBK and parses the
    HTML fragment passed to ``filllist('·...')``.  The anchors in that fragment
    are consumed in pairs: the even-indexed anchor supplies the title and the
    ``doseek2(type, value)`` arguments, the odd-indexed anchor the author.
    ``get_paragraphs`` is then called once per entry, so every entry costs one
    additional request.

    Args:
        page: Page number sent as ``pageno``.

    Returns:
        A list of dicts with the keys ``rhythmic``, ``author`` (present only
        when the paired anchor exists) and ``paragraphs``.

    Raises:
        requests.RequestException: On network failure, timeout or HTTP error.
        ValueError: If the reply lacks the expected ``filllist``/``doseek2``
            calls.
    """
    # NOTE(review): the delimiter literal is empty in this copy of the file,
    # and str.split("") always raises ValueError ("empty separator").  The
    # intended character could not be recovered from here (possibly a
    # non-printable character lost in transit) -- restore it.  While it is
    # empty, the whole link text is kept as the title instead of crashing.
    title_delimiter = ""

    response = requests.post(
        "http://qsc.zww.cn/getdata.asp",
        data={"seektype": 2, "seekvalue": "", "pageno": page},
        timeout=30,  # do not hang forever on a stalled connection
    )
    response.raise_for_status()
    response.encoding = "gbk"

    listing = re.search(r"filllist\('·(.*?)'\);", response.text)
    if listing is None:
        raise ValueError("page {0}: no filllist(...) call in the response".format(page))

    content = []
    soup = BeautifulSoup(listing.group(1), features="lxml")
    for i, a in enumerate(soup.find_all(name="a")):
        # get_text() returns a plain str, so the stored values do not keep the
        # parse tree alive the way a NavigableString would.
        text = a.get_text()
        if i % 2 == 0:
            call = re.search(r"doseek2\((.*?)\);", a.get("onclick", ""))
            if call is None:
                raise ValueError("page {0}: link {1} has no doseek2(...) handler".format(page, i))
            content.append({
                "rhythmic": text.split(title_delimiter)[0] if title_delimiter else text,
                "param": call.group(1).split(","),
            })
        else:
            content[-1]["author"] = text

    for c in content:
        c["paragraphs"] = get_paragraphs(int(c["param"][0]), int(c["param"][1]))
        # "param" is only needed to issue the follow-up request.
        del c["param"]
    return content
def get_paragraphs(seek_type: int, seek_value: int) -> list:
    """Fetch the text lines of a single entry.

    Posts ``seektype``/``seekvalue`` (with ``pageno`` 1) to
    ``http://qsc.zww.cn/getdata.asp``, decodes the reply as GBK and parses the
    HTML fragment passed to ``fillbody('...')``.

    Args:
        seek_type: Value sent as ``seektype``.
        seek_value: Value sent as ``seekvalue``.

    Returns:
        The bare text nodes that are direct children of the first ``<p>``
        element without an ``align`` attribute, in document order, as plain
        ``str`` objects.  Child tags are skipped.

    Raises:
        requests.RequestException: On network failure, timeout or HTTP error.
        ValueError: If the reply has no ``fillbody(...)`` call or the fragment
            has no matching ``<p>`` element.
    """
    response = requests.post(
        "http://qsc.zww.cn/getdata.asp",
        data={"seektype": seek_type, "seekvalue": seek_value, "pageno": 1},
        timeout=30,  # do not hang forever on a stalled connection
    )
    response.raise_for_status()
    response.encoding = "gbk"

    body = re.search(r"fillbody\('(.*?)'\);", response.text)
    if body is None:
        raise ValueError(
            "seektype={0} seekvalue={1}: no fillbody(...) call in the response".format(
                seek_type, seek_value))

    soup = BeautifulSoup(body.group(1), features="lxml")
    text_block = soup.find(name="p", align=None)
    if text_block is None:
        raise ValueError(
            "seektype={0} seekvalue={1}: no <p> without align in the body".format(
                seek_type, seek_value))

    # str() detaches each text node from its parse tree; a NavigableString kept
    # as-is holds a reference to the whole tree, which adds up over thousands
    # of requests.
    return [
        str(child)
        for child in text_block.contents
        if isinstance(child, NavigableString)
    ]
def get_all_page(temp_file: str, last_page: int = 1239):
    """Scrape index pages ``1..last_page`` and save every entry to ``temp_file``.

    The entries of all pages are collected in one list, in page order, and
    written as indented UTF-8 JSON once the last page has been fetched.

    Args:
        temp_file: Path of the JSON file to create or overwrite.
        last_page: Number of the final index page to fetch (inclusive).  The
            default covers the same pages as the previously hard-coded
            ``range(1, 1240)``.
    """
    # The accumulator has to be local: no module-level ``all_data`` exists yet
    # when this function runs, so relying on one raised NameError.
    all_data = []
    for page in range(1, last_page + 1):
        all_data.extend(get_page_content(page))
        logging.info("Success: save page {0}".format(page))
    with open(temp_file, "w", encoding="utf-8") as f:
        json.dump(all_data, f, indent=2, ensure_ascii=False)
def only_text(text: str):
    """Return ``text`` with the listed punctuation marks and all whitespace removed."""
    punctuation_or_space = re.compile(r"[,。、《》…()·・\s]")
    return punctuation_or_space.sub("", text)
def dump_ci_json(data) -> str:
    """Serialise ``data`` in the layout used by the ``ci.song.*.json`` files.

    Those files have a space after every separating comma, which puts a
    trailing space before each line break.  The item separator is set to
    ``", "`` to reproduce that; a textual ``replace`` over the finished dump
    would also rewrite commas that are part of the string values.
    """
    return json.dumps(data, indent=2, ensure_ascii=False, separators=(", ", ": "))


def merge_corrections(file_data: list, reference: list, start: int) -> int:
    """Update entries of ``file_data`` in place from the scraped ``reference``.

    Entry ``i`` of ``file_data`` is compared with ``reference[start + i]`` on
    punctuation-free text.  A similarity in ``[0.9, 1.0)`` is taken to mean the
    stored entry is missing characters, so its ``author`` and ``paragraphs``
    are replaced.  Below 0.9 both texts are logged as warnings and the entry is
    left alone; at 1.0 nothing happens.

    Returns:
        The number of entries that were replaced.
    """
    changed = 0
    for offset, entry in enumerate(file_data):
        index = start + offset
        if index >= len(reference):
            logging.warning("No scraped entry at index %d; remaining entries left unchanged", index)
            break
        scraped = reference[index]
        old_text = only_text("".join(entry["paragraphs"]))
        new_text = only_text("".join(scraped["paragraphs"]))
        # quick_ratio() is an upper bound on ratio(): it compares which
        # characters occur, not their order.
        ratio = SequenceMatcher(a=old_text, b=new_text).quick_ratio()
        if 0.9 <= ratio < 1.0:
            changed += 1
            entry["author"] = scraped["author"]
            entry["paragraphs"] = scraped["paragraphs"]
        elif ratio < 0.9:
            # Unexpected mismatch: report both texts, do not update.
            logging.warning(old_text)
            logging.warning(new_text)
    return changed


if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO,
                        format="%(asctime)s - %(levelname)-9s %(filename)-15s[:%(lineno)d]\t%(message)s")
    temp_file_name = "all.json"
    # Scrape first if the cache file does not exist yet.
    if not os.path.exists(temp_file_name):
        get_all_page(temp_file_name)
    # Load the scraped data from the cache file.
    with open(temp_file_name, "r", encoding="utf-8") as f:
        all_data = json.load(f)
    # Total number of corrected entries.
    diff_num = 0
    # Walk the current directory.
    for file_name in os.listdir("./"):
        # fullmatch: names with a trailing suffix (e.g. backups) are skipped.
        if not re.fullmatch(r"ci\.song\.\d+\.json", file_name):
            continue
        # The number in the file name is the index of its first entry.
        start = int(file_name.split(".")[2])
        with open(file_name, "r", encoding="utf-8") as f:
            file_data = json.load(f)
        diff_num += merge_corrections(file_data, all_data, start)
        with open(file_name, "w", encoding="utf-8") as f:
            f.write(dump_ci_json(file_data))
        logging.info("Save " + file_name)
    logging.info("Change {0} items".format(diff_num))

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -36,7 +36,7 @@
"交广出沉香,路遥难致。",
"何况卑人更不易。",
"寿星香帕,我又几曾识置。",
"有般祝寿底,忒戏。",
"有般祝寿底,忒戏。",
"剪下一张,池州表纸。",
"拈得轻圆更滑腻。",
"五双纸拈,管打十个喷嚏。",
@@ -83,7 +83,7 @@
{
"author": "胡于",
"paragraphs": [
"袅袅薰风响环。",
"袅袅薰风响环。",
"广寒仙子跨清鸾。",
"谁教瑞世仪周间,自赋多才继小山。",
"铃阁静,画堂闲。",
@@ -145,8 +145,8 @@
"paragraphs": [
"幕天席地。",
"瑞脑香浓笙歌沸。",
"白衣轻。",
"发霜髯照座明。",
"白衣轻。",
"发霜髯照座明。",
"轻簪小珥。",
"却是人间真富贵。",
"好着丹青。",
@@ -158,7 +158,7 @@
"author": "李夫人",
"paragraphs": [
"急鼓疏钟声报晓,楼上今朝,卷起重帘早。",
"环珊珊香袅袅。",
"环珊珊香袅袅。",
"尘埃不到如蓬岛。",
"何用珠玑相映照。",
"韵胜形清,自有天然好。",
@@ -181,7 +181,7 @@
{
"author": "张藻",
"paragraphs": [
"露零金井,尘清玉宇,双呈瑞新秋。",
"露零金井,尘清玉宇,双呈瑞新秋。",
"佳气郁葱,祥烟缭绕,玉门初诞风流。",
"宾客竞回眸。",
"庆虎头食肉,燕颔封侯。",
@@ -327,7 +327,7 @@
"昴宿呈祥,符应生公族。",
"盖世功名夸九牧。",
"黼衮褒扬,庆阀辉南北。",
"赐宫醪,分笃",
"赐宫醪,分笃耨。",
"天与长生,谩把仙椿祝。",
"好继平阳腾茂躅。",
"富贵千秋,饮听瑶池曲。"
@@ -363,7 +363,7 @@
"author": "徐去非",
"paragraphs": [
"祥景飞光衮绣。",
"流庆台,自是神仙胄。",
"流庆台,自是神仙胄。",
"谁遣阳和放春透。",
"化工重入丹青手。",
"云筝锦瑟争为寿。",
@@ -390,9 +390,9 @@
"paragraphs": [
"月寺星轺尘梦断,如今平地仙人。",
"烟霞卷起旧精神。",
"焚香金鸾,书奏玉麒麟。",
"焚香金鸾,书奏玉麒麟。",
"闻道枫宸求侍从,看庆命重新。",
"且将风月剩身。",
"且将风月剩身。",
"樽中长有酒,花下不辜春。"
],
"rhythmic": "临江仙"
@@ -446,8 +446,8 @@
"author": "霍安人",
"paragraphs": [
"正朱明时侯,院宇清和,庆逢佳节。",
"梦应熊罴,尧翻三叶,罗绮如云,寿杯争劝,竞起歌新阕。",
"瑞气氤氲,祥云缭绕,玉炉频。",
"梦应熊罴,尧翻三叶,罗绮如云,寿杯争劝,竞起歌新阕。",
"瑞气氤氲,祥云缭绕,玉炉频。",
"溪室封功,几多勋业,首冠今朝,一时英杰。",
"得配侯门,岂不惭疏拙。",
"彩凤和鸣,早膺荣擢。",
@@ -551,7 +551,7 @@
"paragraphs": [
"玉宇生凉秋恰半。",
"月到今霄,分外清光满。",
"兔魄呈祥冰烂。",
"兔魄呈祥冰<EFBFBD>昀谩<EFBFBD>",
"广寒仙子生华旦。",
"聪慧风流天与擅,淑质冰婆,本是飞琼伴。",
"□领彩衣椿祝劝。",
@@ -639,7 +639,7 @@
{
"author": "吴氏3",
"paragraphs": [
"一声初报晓。"
"鶗鴂一声初报晓。"
],
"rhythmic": "渔家傲"
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff