初步添加Mcbbsnews的Java版本资讯订阅

This commit is contained in:
Azide 2022-05-19 02:15:21 +08:00
parent f566ddf894
commit d147aa0e39
3 changed files with 142 additions and 75 deletions

60
poetry.lock generated
View File

@ -70,7 +70,7 @@ python-dateutil = ">=2.7.0"
[[package]]
name = "asgiref"
version = "3.5.1"
version = "3.5.2"
description = "ASGI specs, helper code, and adapters"
category = "main"
optional = false
@ -183,7 +183,6 @@ mypy-extensions = ">=0.4.3"
pathspec = ">=0.9.0"
platformdirs = ">=2"
tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""}
typing-extensions = {version = ">=3.10.0.0", markers = "python_version < \"3.10\""}
[package.extras]
colorama = ["colorama (>=0.4.3)"]
@ -460,22 +459,6 @@ category = "main"
optional = false
python-versions = ">=3.5"
[[package]]
name = "importlib-metadata"
version = "4.11.3"
description = "Read metadata from Python packages"
category = "main"
optional = false
python-versions = ">=3.7"
[package.dependencies]
zipp = ">=0.5"
[package.extras]
docs = ["sphinx", "jaraco.packaging (>=9)", "rst.linker (>=1.9)"]
perf = ["ipython"]
testing = ["pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-flake8", "pytest-cov", "pytest-enabler (>=1.0.1)", "packaging", "pyfakefs", "flufl.flake8", "pytest-perf (>=0.9.2)", "pytest-black (>=0.3.7)", "pytest-mypy (>=0.9.1)", "importlib-resources (>=1.3)"]
[[package]]
name = "iniconfig"
version = "1.1.1"
@ -610,9 +593,6 @@ category = "main"
optional = false
python-versions = ">=3.6"
[package.dependencies]
importlib-metadata = {version = ">=4.4", markers = "python_version < \"3.10\""}
[package.extras]
testing = ["coverage", "pyyaml"]
@ -1300,14 +1280,14 @@ python-versions = ">=3.6,<4.0"
[[package]]
name = "traitlets"
version = "5.2.0"
description = "Traitlets Python configuration system"
version = "5.2.1.post0"
description = ""
category = "dev"
optional = false
python-versions = ">=3.7"
[package.extras]
test = ["pytest", "pre-commit"]
test = ["pre-commit", "pytest"]
[[package]]
name = "typing-extensions"
@ -1458,22 +1438,10 @@ python-versions = ">=3.6"
idna = ">=2.0"
multidict = ">=4.0"
[[package]]
name = "zipp"
version = "3.8.0"
description = "Backport of pathlib-compatible object wrapper for zip files"
category = "main"
optional = false
python-versions = ">=3.7"
[package.extras]
docs = ["sphinx", "jaraco.packaging (>=9)", "rst.linker (>=1.9)"]
testing = ["pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-flake8", "pytest-cov", "pytest-enabler (>=1.0.1)", "jaraco.itertools", "func-timeout", "pytest-black (>=0.3.7)", "pytest-mypy (>=0.9.1)"]
[metadata]
lock-version = "1.1"
python-versions = "^3.9"
content-hash = "48207f450bd3f15faf69721a1b2daed6b15aa5e23ff94a6ab05036f37d844d73"
python-versions = ">=3.10,<4.0.0"
content-hash = "28457eb74ad24dd15a02b512fa3ae09a37be3ec85770465e6e6143c88d8c32fd"
[metadata.files]
aiofiles = [
@ -1497,8 +1465,8 @@ arrow = [
{file = "arrow-1.2.2.tar.gz", hash = "sha256:05caf1fd3d9a11a1135b2b6f09887421153b94558e5ef4d090b567b47173ac2b"},
]
asgiref = [
{file = "asgiref-3.5.1-py3-none-any.whl", hash = "sha256:45a429524fba18aba9d512498b19d220c4d628e75b40cf5c627524dbaebc5cc1"},
{file = "asgiref-3.5.1.tar.gz", hash = "sha256:fddeea3c53fa99d0cdb613c3941cc6e52d822491fc2753fba25768fb5bf4e865"},
{file = "asgiref-3.5.2-py3-none-any.whl", hash = "sha256:1d2880b792ae8757289136f1db2b7b99100ce959b2aa57fd69dab783d05afac4"},
{file = "asgiref-3.5.2.tar.gz", hash = "sha256:4a29362a6acebe09bf1d6640db38c1dc3d9217c68e6f9f6204d72667fc19a424"},
]
asttokens = [
{file = "asttokens-2.0.5-py2.py3-none-any.whl", hash = "sha256:0844691e88552595a6f4a4281a9f7f79b8dd45ca4ccea82e5e05b4bbdb76705c"},
@ -1774,10 +1742,6 @@ idna = [
{file = "idna-3.3-py3-none-any.whl", hash = "sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff"},
{file = "idna-3.3.tar.gz", hash = "sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d"},
]
importlib-metadata = [
{file = "importlib_metadata-4.11.3-py3-none-any.whl", hash = "sha256:1208431ca90a8cca1a6b8af391bb53c1a2db74e5d1cef6ddced95d4b2062edc6"},
{file = "importlib_metadata-4.11.3.tar.gz", hash = "sha256:ea4c597ebf37142f827b8f39299579e31685c31d3a438b59f469406afd0f2539"},
]
iniconfig = [
{file = "iniconfig-1.1.1-py2.py3-none-any.whl", hash = "sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3"},
{file = "iniconfig-1.1.1.tar.gz", hash = "sha256:bc3af051d7d14b2ee5ef9969666def0cd1a000e121eaea580d4a313df4b37f32"},
@ -2249,8 +2213,8 @@ tomlkit = [
{file = "tomlkit-0.9.2.tar.gz", hash = "sha256:ebd982d61446af95a1e082b103e250cb9e6d152eae2581d4a07d31a70b34ab0f"},
]
traitlets = [
{file = "traitlets-5.2.0-py3-none-any.whl", hash = "sha256:9dd4025123fbe018a2092b2ad6984792f53ea3362c698f37473258b1fa97b0bc"},
{file = "traitlets-5.2.0.tar.gz", hash = "sha256:60474f39bf1d39a11e0233090b99af3acee93bbc2281777e61dd8c87da8a0014"},
{file = "traitlets-5.2.1.post0-py3-none-any.whl", hash = "sha256:f44b708d33d98b0addb40c29d148a761f44af740603a8fd0e2f8b5b27cf0f087"},
{file = "traitlets-5.2.1.post0.tar.gz", hash = "sha256:70815ecb20ec619d1af28910ade523383be13754283aef90528eb3d47b77c5db"},
]
typing-extensions = [
{file = "typing_extensions-4.2.0-py3-none-any.whl", hash = "sha256:6657594ee297170d19f67d55c05852a874e7eb634f4f753dbd667855e07c1708"},
@ -2430,7 +2394,3 @@ yarl = [
{file = "yarl-1.7.2-cp39-cp39-win_amd64.whl", hash = "sha256:797c2c412b04403d2da075fb93c123df35239cd7b4cc4e0cd9e5839b73f52c58"},
{file = "yarl-1.7.2.tar.gz", hash = "sha256:45399b46d60c253327a460e99856752009fcee5f5d3c80b2f7c0cae1c38d56dd"},
]
zipp = [
{file = "zipp-3.8.0-py3-none-any.whl", hash = "sha256:c4f6e5bbf48e74f7a38e7cc5b0480ff42b0ae5178957d564d18932525d5cf099"},
{file = "zipp-3.8.0.tar.gz", hash = "sha256:56bf8aadb83c24db6c4b577e13de374ccfb67da2078beba1d037c17980bf43ad"},
]

View File

@ -23,7 +23,7 @@ classifiers = [
]
[tool.poetry.dependencies]
python = ">=3.10"
python = ">=3.10,<4.0.0"
nonebot2 = "^2.0.0-beta.2"
httpx = ">=0.16.1 <1.0.0"
bs4 = "^0.0.1"
@ -64,7 +64,7 @@ asyncio_mode = "auto"
[tool.black]
line-length = 88
target-version = ["py39", "py310"]
target-version = ["py310"]
include = '\.pyi?$'
extend-exclude = '''
'''

View File

@ -2,15 +2,22 @@ import re
import time
import httpx
from bs4 import BeautifulSoup
from bs4 import BeautifulSoup, NavigableString, Tag
from ..post import Post
from ..types import Category, RawPost, Target
from .platform import CategoryNotSupport, NewMessage
def _format_text(rawtext: str) -> str:
"""处理BeautifulSoup生成的string中奇怪的回车+连续空格"""
ftext = re.sub(r"\n\s*", " ", rawtext)
def _format_text(rawtext: str, mode: int) -> str:
"""处理BeautifulSoup生成的string中奇怪的回车+连续空格
mode 0:处理标题
mode 1:处理推文"""
match mode:
case 0:
ftext = re.sub(r"\n\s*", " ", rawtext)
case 1:
ftext = re.sub(r"[\n\s*]", "", rawtext)
return ftext
@ -47,24 +54,28 @@ class McbbsJavaNews(NewMessage):
raw_post_list = soup.find_all(
"tbody", id=re.compile(r"normalthread_[0-9]*")
)
post_list = []
for raw_post in raw_post_list:
post = {}
post["url"] = raw_post.find("a", class_="s xst")["href"]
post["title"] = _format_text(raw_post.find("a", class_="s xst").string)
post["category"] = raw_post.select("th em a")[0].string
post["author"] = raw_post.select("td:nth-of-type(2) cite a")[0].string
post["id"] = raw_post["id"]
rawdate = (
raw_post.select("td:nth-of-type(2) em span span")[0]["title"]
if raw_post.select("td:nth-of-type(2) em span span")
else raw_post.select("td:nth-of-type(2) em span")[0].string
)
post["date"] = _stamp_date(rawdate)
post_list.append(post)
post_list = self._gen_post_list(raw_post_list)
return post_list
def _gen_post_list(self, raw_post_list):
    """Turn thread-row tags into plain dictionaries, one per post.

    Each resulting dictionary carries the keys ``url``, ``title``,
    ``category``, ``author``, ``id`` and ``date``.

    Args:
        raw_post_list: Iterable of parsed thread rows; each must support
            ``find``, ``select`` and item access the way bs4 tags do.

    Returns:
        A list of post dictionaries in the same order as the input rows.
    """
    collected = []
    for row in raw_post_list:
        # The title anchor supplies both the link and the title text.
        title_link = row.find("a", class_="s xst")
        entry = {
            "url": title_link["href"],
            "title": _format_text(title_link.string, 0),
            "category": row.select("th em a")[0].string,
            "author": row.select("td:nth-of-type(2) cite a")[0].string,
            "id": row["id"],
        }
        # When a nested span exists its ``title`` attribute holds the date;
        # otherwise the outer span's own string is used.
        nested_spans = row.select("td:nth-of-type(2) em span span")
        if nested_spans:
            rawdate = nested_spans[0]["title"]
        else:
            rawdate = row.select("td:nth-of-type(2) em span")[0].string
        entry["date"] = _stamp_date(rawdate)
        collected.append(entry)
    return collected
def get_id(self, post: RawPost) -> str:
    """Return the identifier stored under the raw post's ``"id"`` key."""
    post_id = post["id"]
    return post_id
@ -72,7 +83,103 @@ class McbbsJavaNews(NewMessage):
return post["date"]
def get_category(self, post: RawPost) -> Category:
    """Map a raw post's forum category onto a subscription category.

    Args:
        post: Raw post dictionary; its ``"category"`` entry is inspected.

    Returns:
        ``Category(1)`` for the Java-edition news category.

    Raises:
        CategoryNotSupport: For every other category.
    """
    category_name = post["category"]
    if category_name == "Java版本资讯":
        return Category(1)
    # The exception must be raised, not returned: a returned exception
    # instance would be treated by callers as a valid category value.
    raise CategoryNotSupport("McbbsNews订阅暂不支持 `{}".format(category_name))
def _check_str_chinese(self, check_str: str) -> bool:
"""检测字符串是否含有中文(有一个就算)"""
for ch in check_str:
if "\u4e00" <= ch <= "\u9fff":
return True
return False
def _javanews_parser(self, rawtext: str):
    """Extract the push-message text and header image from a Java news page.

    Args:
        rawtext: HTML source of the thread page.

    Returns:
        A ``(post_text, pic_url)`` tuple. ``post_text`` is the text assembled
        from the post body; ``pic_url`` is a list holding the header image
        URL, or an empty list when no header image is found.

    Note:
        The page is expected to contain a post-message cell wrapping two
        nested ``blockquote`` elements; if it does not, the attribute access
        on the missing element fails.
    """

    def _direct_text(tag: Tag) -> str:
        # Join only the tag's own string children; nested tags are skipped.
        return "".join(
            child for child in tag.contents if isinstance(child, NavigableString)
        )

    # Strip the unwanted typesetting-credit tail before parsing.
    rawtext = re.sub(r"【本文排版借助了:[\s\S]*】", "", rawtext)
    rawsoup = BeautifulSoup(rawtext.replace("<br />", ""), "html.parser")

    # Header image. ``find`` returns None when nothing matches, so fall back
    # to an empty picture list instead of failing on ``None.get``.
    pic_tag = rawsoup.find(
        "img", file=re.compile(r"https://www.minecraft.net/\S*header.jpg")
    )
    pic_url: list[str] = (
        [pic_tag.get("src", pic_tag.get("file"))] if pic_tag is not None else []
    )

    # The article body is the inner blockquote of the post-message cell.
    soup = rawsoup.find(
        "td", id=re.compile(r"postmessage_[0-9]*")
    ).blockquote.blockquote

    # Discard div and span sections, which carry no wanted content.
    for del_tag in soup.find_all(["div", "span"]):
        del_tag.extract()

    # Remove the first strong element directly under a blockquote (leftover
    # tail content per the original note); skip quietly when there is none.
    trailing_strong = soup.select("blockquote > strong")
    if trailing_strong:
        trailing_strong[0].extract()

    # Flatten inline markup: a/strong/u are unwrapped outright, while ul and
    # font only have their nested font tags unwrapped.
    for unwrap_tag in soup.find_all(["a", "strong", "u", "ul", "font"]):
        if unwrap_tag.name in ("a", "strong", "u"):
            unwrap_tag.unwrap()
        elif unwrap_tag.name in ("ul", "font"):
            for font_tag in unwrap_tag.find_all("font"):
                font_tag.unwrap()

    # Collect the Chinese sentences, collapsing runs of blank lines into one.
    pieces: list[str] = []
    last_is_empty_line = True
    for element in soup.contents:
        if isinstance(element, Tag):
            if element.name == "font":
                holders = [element]
            elif element.name == "ul":
                holders = element.find_all("li")
            else:
                continue
            for holder in holders:
                text = _direct_text(holder)
                # Keep only lines that contain Chinese characters.
                if self._check_str_chinese(text):
                    pieces.append("\n{}".format(_format_text(text, 1)))
                    last_is_empty_line = False
        elif isinstance(element, NavigableString):
            if str(element) == "\n":
                if not last_is_empty_line:
                    pieces.append("\n")
                last_is_empty_line = True
            else:
                pieces.append("\n{}".format(_format_text(element, 1)))
                last_is_empty_line = False
    return "".join(pieces), pic_url
async def parse(self, raw_post: RawPost) -> Post:
    """Fetch a thread page and build the ``Post`` to be pushed.

    Args:
        raw_post: Raw post dictionary; its ``"url"`` and ``"category"``
            entries are read.

    Returns:
        A ``Post`` carrying the extracted text, the thread URL, the picture
        URLs and the category as target name.

    Raises:
        CategoryNotSupport: If the post's category has no parser.
    """
    post_url = "https://www.mcbbs.net/{}".format(raw_post["url"])
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/51.0.2704.63 Safari/537.36"
    }
    async with httpx.AsyncClient() as client:
        response = await client.get(post_url, headers=headers)

    category = raw_post["category"]
    if category != "Java版本资讯":
        raise CategoryNotSupport("McbbsNews订阅暂不支持 `{}".format(category))

    # The parser works on HTML text; handing it the Response object itself
    # would fail in its first ``re.sub`` call.
    text, pic_urls = self._javanews_parser(response.text)
    return Post(
        self.name,
        text=text,
        url=post_url,
        pics=pic_urls,
        target_name=category,
    )