修改为中文目录

This commit is contained in:
JackeyGao
2023-04-17 14:13:23 +08:00
parent e6fc8971ac
commit fa1814a804
1616 changed files with 1005 additions and 25 deletions

77
宋词/README.md Normal file

File diff suppressed because one or more lines are too long

136
宋词/UpdateCi.py Normal file
View File

@@ -0,0 +1,136 @@
import json
import logging
import os
import re
from difflib import SequenceMatcher
import requests
from bs4 import BeautifulSoup
from bs4.element import NavigableString
def get_page_content(page: int) -> list:
    """Fetch one page of the ci index and return its entries.

    Each entry is a dict with "rhythmic" (tune title), "author" and —
    after one follow-up request per entry — "paragraphs".

    :param page: 1-based index-page number posted to the site.
    :return: list of entry dicts.
    """
    content = []
    r = requests.post("http://qsc.zww.cn/getdata.asp", data={
        "seektype": 2,
        "seekvalue": "",
        "pageno": page
    })
    # The site serves GBK-encoded text; decode accordingly.
    r.encoding = "gbk"
    # The listing HTML is embedded inside a JS call: filllist('·...');
    soup = BeautifulSoup(re.search(r"filllist\('·(.*?)'\);", r.text).group(1), features="lxml")
    # Anchors alternate: even index = title link, odd index = author link.
    for i, a in enumerate(soup.find_all(name="a")):
        if i % 2 == 0:
            content.append({
                # NOTE(review): split("") raises ValueError — the original
                # separator character appears to have been stripped when this
                # file was rendered; restore the real delimiter before running.
                "rhythmic": a.string.split("")[0],
                # onclick carries doseek2(type, value); keep both for the
                # detail request below.
                "param": re.search(r"doseek2\((.*?)\);", a["onclick"]).group(1).split(",")
            })
        else:
            content[-1]["author"] = a.string
    # Resolve every entry's detail page into its paragraph list.
    for c in content:
        c["paragraphs"] = get_paragraphs(int(c["param"][0]), int(c["param"][1]))
        del c["param"]
    return content
def get_paragraphs(seek_type: int, seek_value: int) -> list:
    """Fetch the paragraphs of a single ci poem from the detail endpoint.

    :param seek_type: first doseek2() argument taken from the index page.
    :param seek_value: second doseek2() argument identifying the poem.
    :return: list of text paragraphs (NavigableString nodes).
    """
    response = requests.post("http://qsc.zww.cn/getdata.asp", data={
        "seektype": seek_type,
        "seekvalue": seek_value,
        "pageno": 1
    })
    # The site serves GBK-encoded text.
    response.encoding = "gbk"
    # The poem body is embedded inside a JS call: fillbody('...');
    body = re.search(r"fillbody\('(.*?)'\);", response.text).group(1)
    soup = BeautifulSoup(body, features="lxml")
    # Keep only the bare text nodes of the un-aligned <p>; tags such as
    # <br> are skipped.
    return [
        child
        for child in soup.find(name="p", align=None).contents
        if isinstance(child, NavigableString)
    ]
def get_all_page(temp_file: str, last_page: int = 1239):
    """Scrape every index page and save the combined data to *temp_file*.

    Fix: the original extended a global ``all_data`` that is only bound in
    the ``__main__`` block *after* this function runs, so a fresh scrape
    crashed with NameError. The accumulator is now local; callers read the
    result back from *temp_file*, exactly as ``__main__`` already does.

    :param temp_file: path of the UTF-8 JSON file to write.
    :param last_page: last index page to fetch (pages run 1..last_page);
        default matches the original hard-coded range(1, 1240).
    """
    all_data = []
    for page in range(1, last_page + 1):
        all_data.extend(get_page_content(page))
        logging.info("Success: save page {0}".format(page))
    with open(temp_file, "w", encoding="utf-8") as f:
        f.write(json.dumps(all_data, indent=2, ensure_ascii=False))
def only_text(text: str):
    """Strip punctuation and whitespace, keeping only the bare text.

    Used before similarity comparison so that punctuation differences
    between the two sources do not affect the ratio.
    """
    punctuation = re.compile(r"[,。、《》…()·・\s]")
    return punctuation.sub("", text)
def update_file_data(old_data: list, new_data: list):
    """Reconcile the entries of *old_data* with freshly scraped *new_data*.

    Relies on the module-level global ``start`` (this file's base index
    into ``new_data``) being set by the caller before invocation.
    """
    for offset, old_entry in enumerate(old_data):
        new_entry = new_data[start + offset]
        old_text = only_text("".join(old_entry["paragraphs"]))
        new_text = only_text("".join(new_entry["paragraphs"]))
        # Similarity of the punctuation-free text decides the action.
        similarity = SequenceMatcher(a=old_text, b=new_text).quick_ratio()
        if similarity < 0.9:
            # Too different — log both versions and leave the entry alone.
            logging.warning(old_text)
            logging.warning(new_text)
        elif similarity < 1.0:
            # Near-identical: assume missing characters and take the new copy.
            old_entry["author"] = new_entry["author"]
            old_entry["paragraphs"] = new_entry["paragraphs"]
        else:
            # Exact textual match: only refresh the author field.
            old_entry["author"] = new_entry["author"]
# Traditional -> simplified replacement table applied by correct().
# NOTE(review): most single-character keys/values were stripped when this
# file was rendered and now appear as empty strings ("": ""), which collapse
# into one useless entry — the table must be restored from the original
# source before this script is run. Only the multi-character pairs survived.
char_dict = {
    "": "",
    "": "",
    "": "",
    "": "",
    "": "",
    "": "",
    "": "",
    "": "",
    "薄倖": "薄幸",
    "": "",
    "鷫鸘": "鹔鹴",
    "": "",
    "": "",
    "": "",
    "": "",
    "崑崙": "昆仑",
    "": ""
}
def correct(old_data: list):
    """Convert selected traditional characters to simplified, in place.

    Applies every mapping from the module-level ``char_dict`` to each
    paragraph of each entry in *old_data*.
    """
    for entry in old_data:
        paragraphs = entry["paragraphs"]
        for idx, paragraph in enumerate(paragraphs):
            for traditional, simplified in char_dict.items():
                if traditional in paragraph:
                    paragraph = paragraph.replace(traditional, simplified)
            paragraphs[idx] = paragraph
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO,
                        format="%(asctime)s - %(levelname)-9s %(filename)-15s[:%(lineno)d]\t%(message)s")
    temp_file_name = "all.json"
    # Scrape first when the temp file does not exist yet.
    if not os.path.exists(temp_file_name):
        get_all_page(temp_file_name)
    # Read the scraped data back. Fix: use temp_file_name consistently
    # instead of re-hard-coding "all.json" here.
    with open(temp_file_name, "r", encoding="utf-8") as f:
        all_data = json.load(f)
    # Compile once instead of re-parsing the pattern per directory entry.
    ci_file_pattern = re.compile(r"ci\.song\.\d+\.json")
    # Walk the current directory for the per-file shards.
    for file_name in os.listdir("./"):
        if ci_file_pattern.match(file_name):
            # Base index of this file's entries within all_data
            # (global read by update_file_data).
            start = int(file_name.split(".")[2])
            with open(file_name, "r", encoding="utf-8") as f:
                file_data = json.load(f)
            update_file_data(file_data, all_data)
            correct(file_data)
            # Save; the original files have a space after each comma,
            # so keep that formatting for a minimal diff.
            with open(file_name, "w", encoding="utf-8") as f:
                f.write(json.dumps(file_data, indent=2, ensure_ascii=False).replace(",", ", "))
            logging.info("Save " + file_name)

7817
宋词/author.song.json Normal file

File diff suppressed because one or more lines are too long

BIN
宋词/ci.db Normal file

Binary file not shown.

13907
宋词/ci.song.0.json Normal file

File diff suppressed because it is too large Load Diff

13157
宋词/ci.song.1000.json Normal file

File diff suppressed because it is too large Load Diff

13493
宋词/ci.song.10000.json Normal file

File diff suppressed because it is too large Load Diff

13614
宋词/ci.song.11000.json Normal file

File diff suppressed because it is too large Load Diff

13592
宋词/ci.song.12000.json Normal file

File diff suppressed because it is too large Load Diff

13836
宋词/ci.song.13000.json Normal file

File diff suppressed because it is too large Load Diff

14796
宋词/ci.song.14000.json Normal file

File diff suppressed because it is too large Load Diff

14838
宋词/ci.song.15000.json Normal file

File diff suppressed because it is too large Load Diff

14591
宋词/ci.song.16000.json Normal file

File diff suppressed because it is too large Load Diff

14637
宋词/ci.song.17000.json Normal file

File diff suppressed because it is too large Load Diff

14582
宋词/ci.song.18000.json Normal file

File diff suppressed because it is too large Load Diff

11975
宋词/ci.song.19000.json Normal file

File diff suppressed because it is too large Load Diff

13271
宋词/ci.song.2000.json Normal file

File diff suppressed because it is too large Load Diff

13503
宋词/ci.song.20000.json Normal file

File diff suppressed because it is too large Load Diff

78
宋词/ci.song.2019y.json Normal file
View File

@@ -0,0 +1,78 @@
[
{
"author": "李从周",
"paragraphs": [
"夢魂尋遍。",
"忽向尊前見。",
"好似烏衣春社燕。",
"軟語東風庭院。",
"叮咛记取儿家。",
"碧云隐映红霞。",
"直下小桥流水,门前一树桃花。"
],
"rhythmic": "清平乐"
},
{
"author": "无名氏",
"paragraphs": [
"一张机。",
"织梭光景去如飞。",
"兰房夜永愁无寐。",
"呕呕轧轧,织成春恨,留著待郎归。",
"两张机。",
"月明人静漏声稀。",
"千丝万缕相萦系。",
"织成一段,回纹锦字,将去寄呈伊。",
"三张机。",
"中心有朵耍花儿。",
"娇红嫩绿春明媚。",
"君须早折,一枝浓艳,莫待过芳菲。",
"四张机。",
"鴛鴦織就欲雙飛。",
"可憐未老頭先白。",
"春波碧草,曉寒深處,相對浴紅衣。",
"五张机。",
"芳心密与巧心期。",
"合欢树上枝连理。",
"双头花下,两同心处,一对化生儿。",
"六张机。",
"雕花铺锦半离披。",
"蘭房别有留春計。",
"爐添小篆,日長一線,相對繡工遲。",
"七张机。",
"春蚕吐尽一生丝。",
"莫教容易裁罗绮。",
"无端翦破,仙鸾彩凤,分作两般衣。",
"八张机。",
"纤纤玉手住无时。",
"蜀江濯尽春波媚。",
"香遗囊麝,花房绣被,归去意迟迟。",
"九张机。",
"一心长在百花枝。",
"百花共作紅堆被。",
"都將春色,藏頭裏面,不怕睡多時。",
"轻丝。",
"象床玉手出新奇。",
"千花万草光凝碧。",
"裁缝衣著,春天歌舞,飞蝶语黄鹂。",
"春衣。",
"素丝染就已堪悲。",
"尘世昏污无颜色。",
"應同秋扇,從兹永棄,無復奉君時。",
"歌聲飛落畫梁塵,舞罷香風捲繡茵。",
"更欲縷陳機上恨,尊前忽有斷腸人。"
],
"rhythmic": "九张机"
},
{
"author": "张孝祥",
"paragraphs": [
"只倚精忠不要兵。",
"卷旗直入蔡州城。",
"贼营半夜落妖星。",
"万旅云屯看整暇,十眉环坐却娉婷。",
"白麻早晚下天庭。"
],
"rhythmic": "浣溪沙"
}
]

646
宋词/ci.song.21000.json Normal file

File diff suppressed because it is too large Load Diff

13682
宋词/ci.song.3000.json Normal file

File diff suppressed because it is too large Load Diff

13532
宋词/ci.song.4000.json Normal file

File diff suppressed because it is too large Load Diff

13199
宋词/ci.song.5000.json Normal file

File diff suppressed because it is too large Load Diff

13180
宋词/ci.song.6000.json Normal file

File diff suppressed because it is too large Load Diff

13685
宋词/ci.song.7000.json Normal file

File diff suppressed because it is too large Load Diff

13184
宋词/ci.song.8000.json Normal file

File diff suppressed because it is too large Load Diff

13271
宋词/ci.song.9000.json Normal file

File diff suppressed because it is too large Load Diff

46
宋词/main.py Normal file
View File

@@ -0,0 +1,46 @@
# -*- coding: utf-8 -*-
"""Export the ci.db SQLite database into JSON files.

Writes author.song.json with all authors, then shards the poems into
ci.song.<N>.json files of 1000 entries each (N is the start index).
"""
import io
import json
import sqlite3
import sys
from collections import OrderedDict

try:  # Python 2: make implicit str<->unicode conversions use UTF-8
    reload(sys)
    sys.setdefaultencoding('utf-8')
except NameError:  # Python 3: strings are already Unicode, nothing to do
    pass

conn = sqlite3.connect('ci.db')
try:
    # --- authors ---
    cursor = conn.execute("SELECT name, long_desc, short_desc from ciauthor;")
    # Template dict fixes the key set; OrderedDict(sorted(...)) fixes key order
    # in the JSON output.
    template = {"name": None, "description": None, "short_description": None}
    authors = []
    for row in cursor:
        author = OrderedDict(sorted(template.items(), key=lambda t: t[0]))
        author["name"] = row[0]
        author["description"] = row[1]
        author["short_description"] = row[2]
        authors.append(author)
    # --- poems ---
    cursor = conn.execute("SELECT rhythmic, author, content from ci;")
    template = {"rhythmic": None, "author": None, "paragraphs": None}
    cis = []
    for row in cursor:
        ci = OrderedDict(sorted(template.items(), key=lambda t: t[0]))
        ci["rhythmic"] = row[0]
        ci["author"] = row[1]
        # Content is stored newline-separated; expose it as a paragraph list.
        ci["paragraphs"] = row[2].split('\n')
        cis.append(ci)
finally:
    conn.close()  # fix: the connection was never closed

# fix: the original leaked file handles via open(...).write(...); use context
# managers and io.open for explicit UTF-8 output on both Python 2 and 3.
with io.open('author.song.json', 'w', encoding='utf-8') as f:
    f.write(json.dumps(authors, indent=2, ensure_ascii=False))

for i in range(0, 21050, 1000):
    with io.open('ci.song.%s.json' % i, 'w', encoding='utf-8') as f:
        f.write(json.dumps(cis[i:i + 1000], indent=2, ensure_ascii=False))

5098
宋词/宋词三百首.json Normal file

File diff suppressed because it is too large Load Diff