修改为中文目录

This commit is contained in:
JackeyGao
2023-04-17 14:13:23 +08:00
parent e6fc8971ac
commit fa1814a804
1616 changed files with 1005 additions and 25 deletions

77
宋词/README.md Normal file

File diff suppressed because one or more lines are too long

136
宋词/UpdateCi.py Normal file
View File

@@ -0,0 +1,136 @@
import json
import logging
import os
import re
from difflib import SequenceMatcher
import requests
from bs4 import BeautifulSoup
from bs4.element import NavigableString
def get_page_content(page: int) -> list:
    """Fetch one page of the ci index and return its entries.

    Each entry is a dict with "rhythmic" (tune title), "author" and —
    after one follow-up request per entry — "paragraphs".

    :param page: 1-based index-page number posted to the site.
    :return: list of entry dicts.
    """
    content = []
    r = requests.post("http://qsc.zww.cn/getdata.asp", data={
        "seektype": 2,
        "seekvalue": "",
        "pageno": page
    })
    # The site serves GBK-encoded text; decode accordingly.
    r.encoding = "gbk"
    # The listing HTML is embedded inside a JS call: filllist('·...');
    soup = BeautifulSoup(re.search(r"filllist\('·(.*?)'\);", r.text).group(1), features="lxml")
    # Anchors alternate: even index = title link, odd index = author link.
    for i, a in enumerate(soup.find_all(name="a")):
        if i % 2 == 0:
            content.append({
                # NOTE(review): split("") raises ValueError — the original
                # separator character appears to have been stripped when this
                # file was rendered; restore the real delimiter before running.
                "rhythmic": a.string.split("")[0],
                # onclick carries doseek2(type, value); keep both for the
                # detail request below.
                "param": re.search(r"doseek2\((.*?)\);", a["onclick"]).group(1).split(",")
            })
        else:
            content[-1]["author"] = a.string
    # Resolve every entry's detail page into its paragraph list.
    for c in content:
        c["paragraphs"] = get_paragraphs(int(c["param"][0]), int(c["param"][1]))
        del c["param"]
    return content
def get_paragraphs(seek_type: int, seek_value: int) -> list:
    """Fetch the paragraphs of a single ci poem from the detail endpoint.

    :param seek_type: first doseek2() argument taken from the index page.
    :param seek_value: second doseek2() argument identifying the poem.
    :return: list of text paragraphs (NavigableString nodes).
    """
    response = requests.post("http://qsc.zww.cn/getdata.asp", data={
        "seektype": seek_type,
        "seekvalue": seek_value,
        "pageno": 1
    })
    # The site serves GBK-encoded text.
    response.encoding = "gbk"
    # The poem body is embedded inside a JS call: fillbody('...');
    body = re.search(r"fillbody\('(.*?)'\);", response.text).group(1)
    soup = BeautifulSoup(body, features="lxml")
    # Keep only the bare text nodes of the un-aligned <p>; tags such as
    # <br> are skipped.
    return [
        child
        for child in soup.find(name="p", align=None).contents
        if isinstance(child, NavigableString)
    ]
def get_all_page(temp_file: str, last_page: int = 1239):
    """Scrape every index page and save the combined data to *temp_file*.

    Fix: the original extended a global ``all_data`` that is only bound in
    the ``__main__`` block *after* this function runs, so a fresh scrape
    crashed with NameError. The accumulator is now local; callers read the
    result back from *temp_file*, exactly as ``__main__`` already does.

    :param temp_file: path of the UTF-8 JSON file to write.
    :param last_page: last index page to fetch (pages run 1..last_page);
        default matches the original hard-coded range(1, 1240).
    """
    all_data = []
    for page in range(1, last_page + 1):
        all_data.extend(get_page_content(page))
        logging.info("Success: save page {0}".format(page))
    with open(temp_file, "w", encoding="utf-8") as f:
        f.write(json.dumps(all_data, indent=2, ensure_ascii=False))
def only_text(text: str):
    """Strip punctuation and whitespace, keeping only the bare text.

    Used before similarity comparison so that punctuation differences
    between the two sources do not affect the ratio.
    """
    punctuation = re.compile(r"[,。、《》…()·・\s]")
    return punctuation.sub("", text)
def update_file_data(old_data: list, new_data: list):
    """Reconcile the entries of *old_data* with freshly scraped *new_data*.

    Relies on the module-level global ``start`` (this file's base index
    into ``new_data``) being set by the caller before invocation.
    """
    for offset, old_entry in enumerate(old_data):
        new_entry = new_data[start + offset]
        old_text = only_text("".join(old_entry["paragraphs"]))
        new_text = only_text("".join(new_entry["paragraphs"]))
        # Similarity of the punctuation-free text decides the action.
        similarity = SequenceMatcher(a=old_text, b=new_text).quick_ratio()
        if similarity < 0.9:
            # Too different — log both versions and leave the entry alone.
            logging.warning(old_text)
            logging.warning(new_text)
        elif similarity < 1.0:
            # Near-identical: assume missing characters and take the new copy.
            old_entry["author"] = new_entry["author"]
            old_entry["paragraphs"] = new_entry["paragraphs"]
        else:
            # Exact textual match: only refresh the author field.
            old_entry["author"] = new_entry["author"]
# Traditional -> simplified replacement table applied by correct().
# NOTE(review): most single-character keys/values were stripped when this
# file was rendered and now appear as empty strings ("": ""), which collapse
# into one useless entry — the table must be restored from the original
# source before this script is run. Only the multi-character pairs survived.
char_dict = {
    "": "",
    "": "",
    "": "",
    "": "",
    "": "",
    "": "",
    "": "",
    "": "",
    "薄倖": "薄幸",
    "": "",
    "鷫鸘": "鹔鹴",
    "": "",
    "": "",
    "": "",
    "": "",
    "崑崙": "昆仑",
    "": ""
}
def correct(old_data: list):
    """Convert selected traditional characters to simplified, in place.

    Applies every mapping from the module-level ``char_dict`` to each
    paragraph of each entry in *old_data*.
    """
    for entry in old_data:
        paragraphs = entry["paragraphs"]
        for idx, paragraph in enumerate(paragraphs):
            for traditional, simplified in char_dict.items():
                if traditional in paragraph:
                    paragraph = paragraph.replace(traditional, simplified)
            paragraphs[idx] = paragraph
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO,
                        format="%(asctime)s - %(levelname)-9s %(filename)-15s[:%(lineno)d]\t%(message)s")
    temp_file_name = "all.json"
    # Scrape first when the temp file does not exist yet.
    if not os.path.exists(temp_file_name):
        get_all_page(temp_file_name)
    # Read the scraped data back. Fix: use temp_file_name consistently
    # instead of re-hard-coding "all.json" here.
    with open(temp_file_name, "r", encoding="utf-8") as f:
        all_data = json.load(f)
    # Compile once instead of re-parsing the pattern per directory entry.
    ci_file_pattern = re.compile(r"ci\.song\.\d+\.json")
    # Walk the current directory for the per-file shards.
    for file_name in os.listdir("./"):
        if ci_file_pattern.match(file_name):
            # Base index of this file's entries within all_data
            # (global read by update_file_data).
            start = int(file_name.split(".")[2])
            with open(file_name, "r", encoding="utf-8") as f:
                file_data = json.load(f)
            update_file_data(file_data, all_data)
            correct(file_data)
            # Save; the original files have a space after each comma,
            # so keep that formatting for a minimal diff.
            with open(file_name, "w", encoding="utf-8") as f:
                f.write(json.dumps(file_data, indent=2, ensure_ascii=False).replace(",", ", "))
            logging.info("Save " + file_name)

7817
宋词/author.song.json Normal file

File diff suppressed because one or more lines are too long

BIN
宋词/ci.db Normal file

Binary file not shown.

13907
宋词/ci.song.0.json Normal file

File diff suppressed because it is too large Load Diff

13157
宋词/ci.song.1000.json Normal file

File diff suppressed because it is too large Load Diff

13493
宋词/ci.song.10000.json Normal file

File diff suppressed because it is too large Load Diff

13614
宋词/ci.song.11000.json Normal file

File diff suppressed because it is too large Load Diff

13592
宋词/ci.song.12000.json Normal file

File diff suppressed because it is too large Load Diff

13836
宋词/ci.song.13000.json Normal file

File diff suppressed because it is too large Load Diff

14796
宋词/ci.song.14000.json Normal file

File diff suppressed because it is too large Load Diff

14838
宋词/ci.song.15000.json Normal file

File diff suppressed because it is too large Load Diff

14591
宋词/ci.song.16000.json Normal file

File diff suppressed because it is too large Load Diff

14637
宋词/ci.song.17000.json Normal file

File diff suppressed because it is too large Load Diff

14582
宋词/ci.song.18000.json Normal file

File diff suppressed because it is too large Load Diff

11975
宋词/ci.song.19000.json Normal file

File diff suppressed because it is too large Load Diff

13271
宋词/ci.song.2000.json Normal file

File diff suppressed because it is too large Load Diff

13503
宋词/ci.song.20000.json Normal file

File diff suppressed because it is too large Load Diff

78
宋词/ci.song.2019y.json Normal file
View File

@@ -0,0 +1,78 @@
[
{
"author": "李从周",
"paragraphs": [
"夢魂尋遍。",
"忽向尊前見。",
"好似烏衣春社燕。",
"軟語東風庭院。",
"叮咛记取儿家。",
"碧云隐映红霞。",
"直下小桥流水,门前一树桃花。"
],
"rhythmic": "清平乐"
},
{
"author": "无名氏",
"paragraphs": [
"一张机。",
"织梭光景去如飞。",
"兰房夜永愁无寐。",
"呕呕轧轧,织成春恨,留著待郎归。",
"两张机。",
"月明人静漏声稀。",
"千丝万缕相萦系。",
"织成一段,回纹锦字,将去寄呈伊。",
"三张机。",
"中心有朵耍花儿。",
"娇红嫩绿春明媚。",
"君须早折,一枝浓艳,莫待过芳菲。",
"四张机。",
"鴛鴦織就欲雙飛。",
"可憐未老頭先白。",
"春波碧草,曉寒深處,相對浴紅衣。",
"五张机。",
"芳心密与巧心期。",
"合欢树上枝连理。",
"双头花下,两同心处,一对化生儿。",
"六张机。",
"雕花铺锦半离披。",
"蘭房别有留春計。",
"爐添小篆,日長一線,相對繡工遲。",
"七张机。",
"春蚕吐尽一生丝。",
"莫教容易裁罗绮。",
"无端翦破,仙鸾彩凤,分作两般衣。",
"八张机。",
"纤纤玉手住无时。",
"蜀江濯尽春波媚。",
"香遗囊麝,花房绣被,归去意迟迟。",
"九张机。",
"一心长在百花枝。",
"百花共作紅堆被。",
"都將春色,藏頭裏面,不怕睡多時。",
"轻丝。",
"象床玉手出新奇。",
"千花万草光凝碧。",
"裁缝衣著,春天歌舞,飞蝶语黄鹂。",
"春衣。",
"素丝染就已堪悲。",
"尘世昏污无颜色。",
"應同秋扇,從兹永棄,無復奉君時。",
"歌聲飛落畫梁塵,舞罷香風捲繡茵。",
"更欲縷陳機上恨,尊前忽有斷腸人。"
],
"rhythmic": "九张机"
},
{
"author": "张孝祥",
"paragraphs": [
"只倚精忠不要兵。",
"卷旗直入蔡州城。",
"贼营半夜落妖星。",
"万旅云屯看整暇,十眉环坐却娉婷。",
"白麻早晚下天庭。"
],
"rhythmic": "浣溪沙"
}
]

646
宋词/ci.song.21000.json Normal file

File diff suppressed because it is too large Load Diff

13682
宋词/ci.song.3000.json Normal file

File diff suppressed because it is too large Load Diff

13532
宋词/ci.song.4000.json Normal file

File diff suppressed because it is too large Load Diff

13199
宋词/ci.song.5000.json Normal file

File diff suppressed because it is too large Load Diff

13180
宋词/ci.song.6000.json Normal file

File diff suppressed because it is too large Load Diff

13685
宋词/ci.song.7000.json Normal file

File diff suppressed because it is too large Load Diff

13184
宋词/ci.song.8000.json Normal file

File diff suppressed because it is too large Load Diff

13271
宋词/ci.song.9000.json Normal file

File diff suppressed because it is too large Load Diff

46
宋词/main.py Normal file
View File

@@ -0,0 +1,46 @@
# -*- coding: utf-8 -*-
"""Export the ci.db SQLite database into JSON files.

Writes author.song.json with all authors, then shards the poems into
ci.song.<N>.json files of 1000 entries each (N is the start index).
"""
import io
import json
import sqlite3
import sys
from collections import OrderedDict

try:  # Python 2: make implicit str<->unicode conversions use UTF-8
    reload(sys)
    sys.setdefaultencoding('utf-8')
except NameError:  # Python 3: strings are already Unicode, nothing to do
    pass

conn = sqlite3.connect('ci.db')
try:
    # --- authors ---
    cursor = conn.execute("SELECT name, long_desc, short_desc from ciauthor;")
    # Template dict fixes the key set; OrderedDict(sorted(...)) fixes key order
    # in the JSON output.
    template = {"name": None, "description": None, "short_description": None}
    authors = []
    for row in cursor:
        author = OrderedDict(sorted(template.items(), key=lambda t: t[0]))
        author["name"] = row[0]
        author["description"] = row[1]
        author["short_description"] = row[2]
        authors.append(author)
    # --- poems ---
    cursor = conn.execute("SELECT rhythmic, author, content from ci;")
    template = {"rhythmic": None, "author": None, "paragraphs": None}
    cis = []
    for row in cursor:
        ci = OrderedDict(sorted(template.items(), key=lambda t: t[0]))
        ci["rhythmic"] = row[0]
        ci["author"] = row[1]
        # Content is stored newline-separated; expose it as a paragraph list.
        ci["paragraphs"] = row[2].split('\n')
        cis.append(ci)
finally:
    conn.close()  # fix: the connection was never closed

# fix: the original leaked file handles via open(...).write(...); use context
# managers and io.open for explicit UTF-8 output on both Python 2 and 3.
with io.open('author.song.json', 'w', encoding='utf-8') as f:
    f.write(json.dumps(authors, indent=2, ensure_ascii=False))

for i in range(0, 21050, 1000):
    with io.open('ci.song.%s.json' % i, 'w', encoding='utf-8') as f:
        f.write(json.dumps(cis[i:i + 1000], indent=2, ensure_ascii=False))

5098
宋词/宋词三百首.json Normal file

File diff suppressed because it is too large Load Diff