diff --git a/poetry.lock b/poetry.lock index a48f2bb..1647c43 100644 --- a/poetry.lock +++ b/poetry.lock @@ -70,7 +70,7 @@ python-dateutil = ">=2.7.0" [[package]] name = "asgiref" -version = "3.5.1" +version = "3.5.2" description = "ASGI specs, helper code, and adapters" category = "main" optional = false @@ -183,7 +183,6 @@ mypy-extensions = ">=0.4.3" pathspec = ">=0.9.0" platformdirs = ">=2" tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""} -typing-extensions = {version = ">=3.10.0.0", markers = "python_version < \"3.10\""} [package.extras] colorama = ["colorama (>=0.4.3)"] @@ -460,22 +459,6 @@ category = "main" optional = false python-versions = ">=3.5" -[[package]] -name = "importlib-metadata" -version = "4.11.3" -description = "Read metadata from Python packages" -category = "main" -optional = false -python-versions = ">=3.7" - -[package.dependencies] -zipp = ">=0.5" - -[package.extras] -docs = ["sphinx", "jaraco.packaging (>=9)", "rst.linker (>=1.9)"] -perf = ["ipython"] -testing = ["pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-flake8", "pytest-cov", "pytest-enabler (>=1.0.1)", "packaging", "pyfakefs", "flufl.flake8", "pytest-perf (>=0.9.2)", "pytest-black (>=0.3.7)", "pytest-mypy (>=0.9.1)", "importlib-resources (>=1.3)"] - [[package]] name = "iniconfig" version = "1.1.1" @@ -610,9 +593,6 @@ category = "main" optional = false python-versions = ">=3.6" -[package.dependencies] -importlib-metadata = {version = ">=4.4", markers = "python_version < \"3.10\""} - [package.extras] testing = ["coverage", "pyyaml"] @@ -1300,14 +1280,14 @@ python-versions = ">=3.6,<4.0" [[package]] name = "traitlets" -version = "5.2.0" -description = "Traitlets Python configuration system" +version = "5.2.1.post0" +description = "" category = "dev" optional = false python-versions = ">=3.7" [package.extras] -test = ["pytest", "pre-commit"] +test = ["pre-commit", "pytest"] [[package]] name = "typing-extensions" @@ -1458,22 +1438,10 @@ python-versions = ">=3.6" 
idna = ">=2.0" multidict = ">=4.0" -[[package]] -name = "zipp" -version = "3.8.0" -description = "Backport of pathlib-compatible object wrapper for zip files" -category = "main" -optional = false -python-versions = ">=3.7" - -[package.extras] -docs = ["sphinx", "jaraco.packaging (>=9)", "rst.linker (>=1.9)"] -testing = ["pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-flake8", "pytest-cov", "pytest-enabler (>=1.0.1)", "jaraco.itertools", "func-timeout", "pytest-black (>=0.3.7)", "pytest-mypy (>=0.9.1)"] - [metadata] lock-version = "1.1" -python-versions = "^3.9" -content-hash = "48207f450bd3f15faf69721a1b2daed6b15aa5e23ff94a6ab05036f37d844d73" +python-versions = ">=3.10,<4.0.0" +content-hash = "28457eb74ad24dd15a02b512fa3ae09a37be3ec85770465e6e6143c88d8c32fd" [metadata.files] aiofiles = [ @@ -1497,8 +1465,8 @@ arrow = [ {file = "arrow-1.2.2.tar.gz", hash = "sha256:05caf1fd3d9a11a1135b2b6f09887421153b94558e5ef4d090b567b47173ac2b"}, ] asgiref = [ - {file = "asgiref-3.5.1-py3-none-any.whl", hash = "sha256:45a429524fba18aba9d512498b19d220c4d628e75b40cf5c627524dbaebc5cc1"}, - {file = "asgiref-3.5.1.tar.gz", hash = "sha256:fddeea3c53fa99d0cdb613c3941cc6e52d822491fc2753fba25768fb5bf4e865"}, + {file = "asgiref-3.5.2-py3-none-any.whl", hash = "sha256:1d2880b792ae8757289136f1db2b7b99100ce959b2aa57fd69dab783d05afac4"}, + {file = "asgiref-3.5.2.tar.gz", hash = "sha256:4a29362a6acebe09bf1d6640db38c1dc3d9217c68e6f9f6204d72667fc19a424"}, ] asttokens = [ {file = "asttokens-2.0.5-py2.py3-none-any.whl", hash = "sha256:0844691e88552595a6f4a4281a9f7f79b8dd45ca4ccea82e5e05b4bbdb76705c"}, @@ -1774,10 +1742,6 @@ idna = [ {file = "idna-3.3-py3-none-any.whl", hash = "sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff"}, {file = "idna-3.3.tar.gz", hash = "sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d"}, ] -importlib-metadata = [ - {file = "importlib_metadata-4.11.3-py3-none-any.whl", hash = 
"sha256:1208431ca90a8cca1a6b8af391bb53c1a2db74e5d1cef6ddced95d4b2062edc6"}, - {file = "importlib_metadata-4.11.3.tar.gz", hash = "sha256:ea4c597ebf37142f827b8f39299579e31685c31d3a438b59f469406afd0f2539"}, -] iniconfig = [ {file = "iniconfig-1.1.1-py2.py3-none-any.whl", hash = "sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3"}, {file = "iniconfig-1.1.1.tar.gz", hash = "sha256:bc3af051d7d14b2ee5ef9969666def0cd1a000e121eaea580d4a313df4b37f32"}, @@ -2249,8 +2213,8 @@ tomlkit = [ {file = "tomlkit-0.9.2.tar.gz", hash = "sha256:ebd982d61446af95a1e082b103e250cb9e6d152eae2581d4a07d31a70b34ab0f"}, ] traitlets = [ - {file = "traitlets-5.2.0-py3-none-any.whl", hash = "sha256:9dd4025123fbe018a2092b2ad6984792f53ea3362c698f37473258b1fa97b0bc"}, - {file = "traitlets-5.2.0.tar.gz", hash = "sha256:60474f39bf1d39a11e0233090b99af3acee93bbc2281777e61dd8c87da8a0014"}, + {file = "traitlets-5.2.1.post0-py3-none-any.whl", hash = "sha256:f44b708d33d98b0addb40c29d148a761f44af740603a8fd0e2f8b5b27cf0f087"}, + {file = "traitlets-5.2.1.post0.tar.gz", hash = "sha256:70815ecb20ec619d1af28910ade523383be13754283aef90528eb3d47b77c5db"}, ] typing-extensions = [ {file = "typing_extensions-4.2.0-py3-none-any.whl", hash = "sha256:6657594ee297170d19f67d55c05852a874e7eb634f4f753dbd667855e07c1708"}, @@ -2430,7 +2394,3 @@ yarl = [ {file = "yarl-1.7.2-cp39-cp39-win_amd64.whl", hash = "sha256:797c2c412b04403d2da075fb93c123df35239cd7b4cc4e0cd9e5839b73f52c58"}, {file = "yarl-1.7.2.tar.gz", hash = "sha256:45399b46d60c253327a460e99856752009fcee5f5d3c80b2f7c0cae1c38d56dd"}, ] -zipp = [ - {file = "zipp-3.8.0-py3-none-any.whl", hash = "sha256:c4f6e5bbf48e74f7a38e7cc5b0480ff42b0ae5178957d564d18932525d5cf099"}, - {file = "zipp-3.8.0.tar.gz", hash = "sha256:56bf8aadb83c24db6c4b577e13de374ccfb67da2078beba1d037c17980bf43ad"}, -] diff --git a/pyproject.toml b/pyproject.toml index 77e2ad6..ef78f45 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,7 +23,7 @@ classifiers = [ ] 
def _format_text(rawtext: str, mode: int = 0) -> str:
    """Clean up the odd newline + space-run sequences in BeautifulSoup strings.

    Args:
        rawtext: Text taken from a parsed HTML node.
        mode: ``0`` (titles) replaces every newline, together with the
            whitespace that follows it, by a single space. ``1`` (post text)
            removes all whitespace. Defaults to ``0`` so that callers written
            for the older single-argument signature keep working.

    Returns:
        The cleaned text.

    Raises:
        ValueError: If ``mode`` is neither ``0`` nor ``1``.
    """
    if mode == 0:
        return re.sub(r"\n\s*", " ", rawtext)
    if mode == 1:
        # The earlier pattern was the character class [\n\s*], which also
        # matched a literal "*" and therefore silently dropped asterisks from
        # the post text. Only whitespace is meant to be removed here.
        return re.sub(r"\s+", "", rawtext)
    # Previously an unknown mode fell through with ``ftext`` unassigned and
    # surfaced as an UnboundLocalError; fail with an explicit message instead.
    raise ValueError("unsupported _format_text mode: {!r}".format(mode))
# Methods of ``McbbsJavaNews`` that are fully visible in the reviewed hunks.
# The class header and its other members lie outside this view; indent these
# definitions one level when placing them inside the class body.


def _gen_post_list(self, raw_post_list):
    """Convert the thread rows of a forum listing into plain dicts.

    Args:
        raw_post_list: Iterable of BeautifulSoup tags, one per thread row.

    Returns:
        A list of dicts with the keys ``url``, ``title``, ``category``,
        ``author``, ``id`` and ``date``.
    """
    post_list = []
    for raw_post in raw_post_list:
        title_link = raw_post.find("a", class_="s xst")
        # Use the ``title`` attribute of the nested span when there is one,
        # otherwise fall back to the text of the outer span.
        nested_date = raw_post.select("td:nth-of-type(2) em span span")
        if nested_date:
            rawdate = nested_date[0]["title"]
        else:
            rawdate = raw_post.select("td:nth-of-type(2) em span")[0].string
        post_list.append(
            {
                "url": title_link["href"],
                "title": _format_text(title_link.string, 0),
                "category": raw_post.select("th em a")[0].string,
                "author": raw_post.select("td:nth-of-type(2) cite a")[0].string,
                "id": raw_post["id"],
                "date": _stamp_date(rawdate),
            }
        )
    return post_list


def get_id(self, post: RawPost) -> str:
    """Return the ``id`` entry of a raw post dict."""
    return post["id"]


def get_category(self, post: RawPost) -> Category:
    """Map the category label of a raw post to a ``Category``.

    Returns:
        ``Category(1)`` for the label "Java版本资讯".

    Raises:
        CategoryNotSupport: For every other label.
    """
    if post["category"] == "Java版本资讯":
        return Category(1)
    raise CategoryNotSupport("McbbsNews订阅暂不支持 `{}".format(post["category"]))


def _check_str_chinese(self, check_str: str) -> bool:
    """Return True if at least one character lies in U+4E00..U+9FFF."""
    return any("\u4e00" <= ch <= "\u9fff" for ch in check_str)


def _javanews_parser(self, rawtext: str):
    """Extract the message text and header image of a Java version news post.

    Args:
        rawtext: HTML source of the thread page.

    Returns:
        A ``(post_text, pic_url)`` tuple: the collected text and a list
        holding the header image URL (empty when the page has no header image).
    """
    # Drop the unwanted tail of the post before parsing.
    rawtext = re.sub(r"【本文排版借助了:[\s\S]*】", "", rawtext)
    # NOTE(review): the literal removed here is garbled in the reviewed copy
    # (it shows as a bare line break between the quotes); "\n" is the literal
    # reading of it - confirm against the repository before merging.
    rawsoup = BeautifulSoup(rawtext.replace("\n", ""), "html.parser")

    # Header image: ``find`` returns None when nothing matches, so guard it
    # instead of failing with AttributeError on posts without a header image.
    pic_tag = rawsoup.find(
        "img", file=re.compile(r"https://www.minecraft.net/\S*header.jpg")
    )
    pic_url: list[str] = []
    if pic_tag is not None:
        pic_url.append(pic_tag.get("src", pic_tag.get("file")))

    # The content sits in the inner of two nested blockquote tags.
    soup = rawsoup.find(
        "td", id=re.compile(r"postmessage_[0-9]*")
    ).blockquote.blockquote
    # Remove the div and span sections, which are not needed.
    for del_tag in soup.find_all(["div", "span"]):
        del_tag.extract()
    # Remove one more piece of unwanted tail.
    soup.select("blockquote > strong")[0].extract()
    # Unwrap every a/u/strong tag, and the font tags nested in ul/font tags.
    for unwrap_tag in soup.find_all(["a", "strong", "u", "ul", "font"]):
        if unwrap_tag.name in ("a", "strong", "u"):
            unwrap_tag.unwrap()
        elif unwrap_tag.name in ("ul", "font"):
            for font_tag in unwrap_tag.find_all("font"):
                font_tag.unwrap()

    # Collect the sentences that contain Chinese characters.
    pieces = []
    last_is_empty_line = True
    for element in soup.contents:
        if isinstance(element, Tag):
            if element.name == "font":
                holders = [element]
            elif element.name == "ul":
                holders = element.find_all("li")
            else:
                continue
            for holder in holders:
                # Only the direct string children count, not nested tags.
                text = "".join(
                    sub for sub in holder.contents if isinstance(sub, NavigableString)
                )
                if self._check_str_chinese(text):
                    pieces.append("\n{}".format(_format_text(text, 1)))
                    last_is_empty_line = False
        elif isinstance(element, NavigableString):
            if str(element) == "\n":
                # Collapse consecutive blank lines into a single one.
                if not last_is_empty_line:
                    pieces.append("\n")
                    last_is_empty_line = True
            else:
                pieces.append("\n{}".format(_format_text(element, 1)))
                last_is_empty_line = False
    return "".join(pieces), pic_url


async def parse(self, raw_post: RawPost) -> Post:
    """Download a thread page and build the ``Post`` for it.

    Args:
        raw_post: Raw post dict; the ``url`` and ``category`` entries are read.

    Returns:
        A ``Post`` carrying the extracted text, the thread URL and the
        header image list.

    Raises:
        CategoryNotSupport: If the category is not "Java版本资讯".
    """
    # Reject unsupported categories before spending a network request on them.
    if raw_post["category"] != "Java版本资讯":
        raise CategoryNotSupport(
            "McbbsNews订阅暂不支持 `{}".format(raw_post["category"])
        )
    post_url = "https://www.mcbbs.net/{}".format(raw_post["url"])
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/51.0.2704.63 Safari/537.36"
    }
    async with httpx.AsyncClient() as client:
        response = await client.get(post_url, headers=headers)
    # The parser works on the page source as a string; handing it the
    # Response object itself made re.sub raise TypeError.
    text, pic_urls = self._javanews_parser(response.text)
    return Post(
        self.name,
        text=text,
        url=post_url,
        pics=pic_urls,
        target_name=raw_post["category"],
    )