Merge pull request #288 from chienmy/master

宋词中部分生僻字缺失
2026-06-15 18:27:07 +00:00 · 2022-05-01 18:45:24 +08:00
parent 851c35d4a1 476802ea96
commit 08cce7b893
23 changed files with 3532 additions and 3429 deletions
--- a/ci/UpdateCi.py
+++ b/ci/UpdateCi.py
@@ -0,0 +1,103 @@
+import json
+import logging
+import os
+import re
+from difflib import SequenceMatcher
+
+import requests
+from bs4 import BeautifulSoup
+from bs4.element import NavigableString
+
+
+def get_page_content(page: int) -> list:
+    """ 获取目录页每一页的内容 """
+    content = []
+    r = requests.post("http://qsc.zww.cn/getdata.asp", data={
+        "seektype": 2,
+        "seekvalue": "",
+        "pageno": page
+    })
+    r.encoding = "gbk"
+    soup = BeautifulSoup(re.search(r"filllist\('·(.*?)'\);", r.text).group(1), features="lxml")
+    for i, a in enumerate(soup.find_all(name="a")):
+        if i % 2 == 0:
+            content.append({
+                "rhythmic": a.string.split("（")[0],
+                "param": re.search(r"doseek2\((.*?)\);", a["onclick"]).group(1).split(",")
+            })
+        else:
+            content[-1]["author"] = a.string
+    for c in content:
+        c["paragraphs"] = get_paragraphs(int(c["param"][0]), int(c["param"][1]))
+        del c["param"]
+    return content
+
+
+def get_paragraphs(seek_type: int, seek_value: int) -> list:
+    """ 获取词的内容段落 """
+    paragraphs = []
+    r = requests.post("http://qsc.zww.cn/getdata.asp", data={
+        "seektype": seek_type,
+        "seekvalue": seek_value,
+        "pageno": 1
+    })
+    r.encoding = "gbk"
+    soup = BeautifulSoup(re.search(r"fillbody\('(.*?)'\);", r.text).group(1), features="lxml")
+    for child in soup.find(name="p", align=None).contents:
+        if isinstance(child, NavigableString):
+            paragraphs.append(child)
+    return paragraphs
+
+
+def get_all_page(temp_file: str):
+    """ 爬取数据并保存至临时文件 """
+    for page in range(1, 1240):
+        all_data.extend(get_page_content(page))
+        logging.info("Success: save page {0}".format(page))
+    with open(temp_file, "w", encoding="utf-8") as f:
+        f.write(json.dumps(all_data, indent=2, ensure_ascii=False))
+
+
+def only_text(text: str):
+    """ 去除标点只保留文字 """
+    return re.sub(r"[，。、《》…（）·・\s]", "", text)
+
+
+if __name__ == '__main__':
+    logging.basicConfig(level=logging.INFO,
+                        format="%(asctime)s - %(levelname)-9s %(filename)-15s[:%(lineno)d]\t%(message)s")
+    temp_file_name = "all.json"
+    # 临时文件不存在则先爬取
+    if not os.path.exists(temp_file_name):
+        get_all_page(temp_file_name)
+    # 读取临时文件
+    with open("all.json", "r", encoding="utf-8") as f:
+        all_data = json.load(f)
+    # 统计更正的数目
+    diff_num = 0
+    # 遍历当前目录
+    for file_name in os.listdir("./"):
+        if re.match(r"ci\.song\.\d+\.json", file_name):
+            # 每个文件开始的数据索引
+            start = int(file_name.split(".")[2])
+            with open(file_name, "r", encoding="utf-8") as f:
+                file_data = json.load(f)
+            for i in range(len(file_data)):
+                old_text = only_text("".join(file_data[i]["paragraphs"]))
+                new_text = only_text("".join(all_data[start + i]["paragraphs"]))
+                # 计算纯文字的相似度
+                ratio = SequenceMatcher(a=old_text, b=new_text).quick_ratio()
+                if 0.9 <= ratio < 1.0:
+                    # 假定此范围内说明缺字，需要更新
+                    diff_num += 1
+                    file_data[i]["author"] = all_data[start + i]["author"]
+                    file_data[i]["paragraphs"] = all_data[start + i]["paragraphs"]
+                elif ratio < 0.9:
+                    # 异常情况warning输出，不更新
+                    logging.warning(old_text)
+                    logging.warning(new_text)
+            # 保存数据，原文件中逗号后有空格，这里保持一致
+            with open(file_name, "w", encoding="utf-8") as f:
+                f.write(json.dumps(file_data, indent=2, ensure_ascii=False).replace(",", ", "))
+                logging.info("Save " + file_name)
+    logging.info("Change {0} items".format(diff_num))
--- a/ci/ci.song.0.json
+++ b/ci/ci.song.0.json
--- a/ci/ci.song.1000.json
+++ b/ci/ci.song.1000.json
--- a/ci/ci.song.10000.json
+++ b/ci/ci.song.10000.json
--- a/ci/ci.song.11000.json
+++ b/ci/ci.song.11000.json
--- a/ci/ci.song.12000.json
+++ b/ci/ci.song.12000.json
--- a/ci/ci.song.13000.json
+++ b/ci/ci.song.13000.json
--- a/ci/ci.song.14000.json
+++ b/ci/ci.song.14000.json
--- a/ci/ci.song.15000.json
+++ b/ci/ci.song.15000.json
--- a/ci/ci.song.16000.json
+++ b/ci/ci.song.16000.json
--- a/ci/ci.song.17000.json
+++ b/ci/ci.song.17000.json
--- a/ci/ci.song.18000.json
+++ b/ci/ci.song.18000.json
--- a/ci/ci.song.19000.json
+++ b/ci/ci.song.19000.json
--- a/ci/ci.song.2000.json
+++ b/ci/ci.song.2000.json
--- a/ci/ci.song.20000.json
+++ b/ci/ci.song.20000.json
--- a/ci/ci.song.21000.json
+++ b/ci/ci.song.21000.json
@@ -36,7 +36,7 @@
      "交广出沉香，路遥难致。", 
      "何况卑人更不易。", 
      "寿星香帕，我又几曾识置。", 
-      "有般祝寿底，忒戏。", 
+      "有般祝寿底，忒忔戏。", 
      "剪下一张，池州表纸。", 
      "拈得轻圆更滑腻。", 
      "五双纸拈，管打十个喷嚏。", 
@@ -83,7 +83,7 @@
  {
    "author": "胡于", 
    "paragraphs": [
-      "袅袅薰风响佩环。", 
+      "袅袅薰风响珮环。", 
      "广寒仙子跨清鸾。", 
      "谁教瑞世仪周间，自赋多才继小山。", 
      "铃阁静，画堂闲。", 
@@ -145,8 +145,8 @@
    "paragraphs": [
      "幕天席地。", 
      "瑞脑香浓笙歌沸。", 
-      "白衣轻。", 
-      "发霜髯照座明。", 
+      "白纻衣轻。", 
+      "鵷发霜髯照座明。", 
      "轻簪小珥。", 
      "却是人间真富贵。", 
      "好着丹青。", 
@@ -158,7 +158,7 @@
    "author": "李夫人", 
    "paragraphs": [
      "急鼓疏钟声报晓，楼上今朝，卷起重帘早。", 
-      "环珊珊香袅袅。", 
+      "环珮珊珊香袅袅。", 
      "尘埃不到如蓬岛。", 
      "何用珠玑相映照。", 
      "韵胜形清，自有天然好。", 
@@ -181,7 +181,7 @@
  {
    "author": "张藻", 
    "paragraphs": [
-      "露零金井，尘清玉宇，双呈瑞新秋。", 
+      "露零金井，尘清玉宇，双蓂呈瑞新秋。", 
      "佳气郁葱，祥烟缭绕，玉门初诞风流。", 
      "宾客竞回眸。", 
      "庆虎头食肉，燕颔封侯。", 
@@ -327,7 +327,7 @@
      "昴宿呈祥，符应生公族。", 
      "盖世功名夸九牧。", 
      "黼衮褒扬，庆阀辉南北。", 
-      "赐宫醪，分笃。", 
+      "赐宫醪，分笃□。", 
      "天与长生，谩把仙椿祝。", 
      "好继平阳腾茂躅。", 
      "富贵千秋，饮听瑶池曲。"
@@ -363,7 +363,7 @@
    "author": "徐去非", 
    "paragraphs": [
      "祥景飞光衮绣。", 
-      "流庆台，自是神仙胄。", 
+      "流庆崑台，自是神仙胄。", 
      "谁遣阳和放春透。", 
      "化工重入丹青手。", 
      "云筝锦瑟争为寿。", 
@@ -390,9 +390,9 @@
    "paragraphs": [
      "月寺星轺尘梦断，如今平地仙人。", 
      "烟霞卷起旧精神。", 
-      "焚香金鸾，书奏玉麒麟。", 
+      "焚香金鸾鷟，书奏玉麒麟。", 
      "闻道枫宸求侍从，看庆命重新。", 
-      "且将风月剩身。", 
+      "且将风月剩酧身。", 
      "樽中长有酒，花下不辜春。"
    ], 
    "rhythmic": "临江仙"
@@ -446,8 +446,8 @@
    "author": "霍安人", 
    "paragraphs": [
      "正朱明时侯，院宇清和，庆逢佳节。", 
-      "梦应熊罴，尧翻三叶，罗绮如云，寿杯争劝，竞起歌新阕。", 
-      "瑞气氤氲，祥云缭绕，玉炉频。", 
+      "梦应熊罴，尧蓂翻三叶，罗绮如云，寿杯争劝，竞起歌新阕。", 
+      "瑞气氤氲，祥云缭绕，玉炉频爇。", 
      "溪室封功，几多勋业，首冠今朝，一时英杰。", 
      "得配侯门，岂不惭疏拙。", 
      "彩凤和鸣，早膺荣擢。", 
@@ -551,7 +551,7 @@
    "paragraphs": [
      "玉宇生凉秋恰半。", 
      "月到今霄，分外清光满。", 
-      "兔魄呈祥冰烂。", 
+      "兔魄呈祥冰□烂。", 
      "广寒仙子生华旦。", 
      "聪慧风流天与擅，淑质冰婆，本是飞琼伴。", 
      "□领彩衣椿祝劝。", 
@@ -639,7 +639,7 @@
  {
    "author": "吴氏3", 
    "paragraphs": [
-      "一声初报晓。"
+      "鶗鴂一声初报晓。"
    ], 
    "rhythmic": "渔家傲"
  }
--- a/ci/ci.song.3000.json
+++ b/ci/ci.song.3000.json
--- a/ci/ci.song.4000.json
+++ b/ci/ci.song.4000.json
--- a/ci/ci.song.5000.json
+++ b/ci/ci.song.5000.json
--- a/ci/ci.song.6000.json
+++ b/ci/ci.song.6000.json
--- a/ci/ci.song.7000.json
+++ b/ci/ci.song.7000.json
--- a/ci/ci.song.8000.json
+++ b/ci/ci.song.8000.json
--- a/ci/ci.song.9000.json
+++ b/ci/ci.song.9000.json