初步添加Mcbbsnews的Java版本资讯订阅

2026-06-23 14:16:52 +08:00 · 2022-05-19 02:15:21 +08:00
parent f566ddf894
commit d147aa0e39
3 changed files with 142 additions and 75 deletions
@@ -70,7 +70,7 @@ python-dateutil = ">=2.7.0"

 [[package]]
 name = "asgiref"
-version = "3.5.1"
+version = "3.5.2"
 description = "ASGI specs, helper code, and adapters"
 category = "main"
 optional = false
@@ -183,7 +183,6 @@ mypy-extensions = ">=0.4.3"
 pathspec = ">=0.9.0"
 platformdirs = ">=2"
 tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""}
-typing-extensions = {version = ">=3.10.0.0", markers = "python_version < \"3.10\""}

 [package.extras]
 colorama = ["colorama (>=0.4.3)"]
@@ -460,22 +459,6 @@ category = "main"
 optional = false
 python-versions = ">=3.5"

-[[package]]
-name = "importlib-metadata"
-version = "4.11.3"
-description = "Read metadata from Python packages"
-category = "main"
-optional = false
-python-versions = ">=3.7"
-
-[package.dependencies]
-zipp = ">=0.5"
-
-[package.extras]
-docs = ["sphinx", "jaraco.packaging (>=9)", "rst.linker (>=1.9)"]
-perf = ["ipython"]
-testing = ["pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-flake8", "pytest-cov", "pytest-enabler (>=1.0.1)", "packaging", "pyfakefs", "flufl.flake8", "pytest-perf (>=0.9.2)", "pytest-black (>=0.3.7)", "pytest-mypy (>=0.9.1)", "importlib-resources (>=1.3)"]
-
 [[package]]
 name = "iniconfig"
 version = "1.1.1"
@@ -610,9 +593,6 @@ category = "main"
 optional = false
 python-versions = ">=3.6"

-[package.dependencies]
-importlib-metadata = {version = ">=4.4", markers = "python_version < \"3.10\""}
-
 [package.extras]
 testing = ["coverage", "pyyaml"]

@@ -1300,14 +1280,14 @@ python-versions = ">=3.6,<4.0"

 [[package]]
 name = "traitlets"
-version = "5.2.0"
-description = "Traitlets Python configuration system"
+version = "5.2.1.post0"
+description = ""
 category = "dev"
 optional = false
 python-versions = ">=3.7"

 [package.extras]
-test = ["pytest", "pre-commit"]
+test = ["pre-commit", "pytest"]

 [[package]]
 name = "typing-extensions"
@@ -1458,22 +1438,10 @@ python-versions = ">=3.6"
 idna = ">=2.0"
 multidict = ">=4.0"

-[[package]]
-name = "zipp"
-version = "3.8.0"
-description = "Backport of pathlib-compatible object wrapper for zip files"
-category = "main"
-optional = false
-python-versions = ">=3.7"
-
-[package.extras]
-docs = ["sphinx", "jaraco.packaging (>=9)", "rst.linker (>=1.9)"]
-testing = ["pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-flake8", "pytest-cov", "pytest-enabler (>=1.0.1)", "jaraco.itertools", "func-timeout", "pytest-black (>=0.3.7)", "pytest-mypy (>=0.9.1)"]
-
 [metadata]
 lock-version = "1.1"
-python-versions = "^3.9"
-content-hash = "48207f450bd3f15faf69721a1b2daed6b15aa5e23ff94a6ab05036f37d844d73"
+python-versions = ">=3.10,<4.0.0"
+content-hash = "28457eb74ad24dd15a02b512fa3ae09a37be3ec85770465e6e6143c88d8c32fd"

 [metadata.files]
 aiofiles = [
@@ -1497,8 +1465,8 @@ arrow = [
    {file = "arrow-1.2.2.tar.gz", hash = "sha256:05caf1fd3d9a11a1135b2b6f09887421153b94558e5ef4d090b567b47173ac2b"},
 ]
 asgiref = [
-    {file = "asgiref-3.5.1-py3-none-any.whl", hash = "sha256:45a429524fba18aba9d512498b19d220c4d628e75b40cf5c627524dbaebc5cc1"},
-    {file = "asgiref-3.5.1.tar.gz", hash = "sha256:fddeea3c53fa99d0cdb613c3941cc6e52d822491fc2753fba25768fb5bf4e865"},
+    {file = "asgiref-3.5.2-py3-none-any.whl", hash = "sha256:1d2880b792ae8757289136f1db2b7b99100ce959b2aa57fd69dab783d05afac4"},
+    {file = "asgiref-3.5.2.tar.gz", hash = "sha256:4a29362a6acebe09bf1d6640db38c1dc3d9217c68e6f9f6204d72667fc19a424"},
 ]
 asttokens = [
    {file = "asttokens-2.0.5-py2.py3-none-any.whl", hash = "sha256:0844691e88552595a6f4a4281a9f7f79b8dd45ca4ccea82e5e05b4bbdb76705c"},
@@ -1774,10 +1742,6 @@ idna = [
    {file = "idna-3.3-py3-none-any.whl", hash = "sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff"},
    {file = "idna-3.3.tar.gz", hash = "sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d"},
 ]
-importlib-metadata = [
-    {file = "importlib_metadata-4.11.3-py3-none-any.whl", hash = "sha256:1208431ca90a8cca1a6b8af391bb53c1a2db74e5d1cef6ddced95d4b2062edc6"},
-    {file = "importlib_metadata-4.11.3.tar.gz", hash = "sha256:ea4c597ebf37142f827b8f39299579e31685c31d3a438b59f469406afd0f2539"},
-]
 iniconfig = [
    {file = "iniconfig-1.1.1-py2.py3-none-any.whl", hash = "sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3"},
    {file = "iniconfig-1.1.1.tar.gz", hash = "sha256:bc3af051d7d14b2ee5ef9969666def0cd1a000e121eaea580d4a313df4b37f32"},
@@ -2249,8 +2213,8 @@ tomlkit = [
    {file = "tomlkit-0.9.2.tar.gz", hash = "sha256:ebd982d61446af95a1e082b103e250cb9e6d152eae2581d4a07d31a70b34ab0f"},
 ]
 traitlets = [
-    {file = "traitlets-5.2.0-py3-none-any.whl", hash = "sha256:9dd4025123fbe018a2092b2ad6984792f53ea3362c698f37473258b1fa97b0bc"},
-    {file = "traitlets-5.2.0.tar.gz", hash = "sha256:60474f39bf1d39a11e0233090b99af3acee93bbc2281777e61dd8c87da8a0014"},
+    {file = "traitlets-5.2.1.post0-py3-none-any.whl", hash = "sha256:f44b708d33d98b0addb40c29d148a761f44af740603a8fd0e2f8b5b27cf0f087"},
+    {file = "traitlets-5.2.1.post0.tar.gz", hash = "sha256:70815ecb20ec619d1af28910ade523383be13754283aef90528eb3d47b77c5db"},
 ]
 typing-extensions = [
    {file = "typing_extensions-4.2.0-py3-none-any.whl", hash = "sha256:6657594ee297170d19f67d55c05852a874e7eb634f4f753dbd667855e07c1708"},
@@ -2430,7 +2394,3 @@ yarl = [
    {file = "yarl-1.7.2-cp39-cp39-win_amd64.whl", hash = "sha256:797c2c412b04403d2da075fb93c123df35239cd7b4cc4e0cd9e5839b73f52c58"},
    {file = "yarl-1.7.2.tar.gz", hash = "sha256:45399b46d60c253327a460e99856752009fcee5f5d3c80b2f7c0cae1c38d56dd"},
 ]
-zipp = [
-    {file = "zipp-3.8.0-py3-none-any.whl", hash = "sha256:c4f6e5bbf48e74f7a38e7cc5b0480ff42b0ae5178957d564d18932525d5cf099"},
-    {file = "zipp-3.8.0.tar.gz", hash = "sha256:56bf8aadb83c24db6c4b577e13de374ccfb67da2078beba1d037c17980bf43ad"},
-]
@@ -23,7 +23,7 @@ classifiers = [
 ]

 [tool.poetry.dependencies]
-python = ">=3.10"
+python = ">=3.10,<4.0.0"
 nonebot2 = "^2.0.0-beta.2"
 httpx = ">=0.16.1 <1.0.0"
 bs4 = "^0.0.1"
@@ -64,7 +64,7 @@ asyncio_mode = "auto"

 [tool.black]
 line-length = 88
-target-version = ["py39", "py310"]
+target-version = ["py310"]
 include = '\.pyi?$'
 extend-exclude = '''
 '''
@@ -2,15 +2,22 @@ import re
 import time

 import httpx
-from bs4 import BeautifulSoup
+from bs4 import BeautifulSoup, NavigableString, Tag

+from ..post import Post
 from ..types import Category, RawPost, Target
 from .platform import CategoryNotSupport, NewMessage


-def _format_text(rawtext: str) -> str:
-    """处理BeautifulSoup生成的string中奇怪的回车+连续空格"""
-    ftext = re.sub(r"\n\s*", " ", rawtext)
+def _format_text(rawtext: str, mode: int) -> str:
+    """Clean up the stray newlines and whitespace runs in BeautifulSoup strings.
+
+    mode 0 (titles): each newline plus the whitespace following it becomes
+    a single space.
+    mode 1 (post bodies): every whitespace character is removed.
+
+    Raises:
+        ValueError: if mode is neither 0 nor 1.
+    """
+    match mode:
+        case 0:
+            ftext = re.sub(r"\n\s*", " ", rawtext)
+        case 1:
+            # A bare \s already covers "\n"; keeping "*" out of the pattern
+            # stops literal asterisks in the text from being deleted.
+            ftext = re.sub(r"\s", "", rawtext)
+        case _:
+            # Fail loudly instead of hitting an unbound `ftext` below.
+            raise ValueError("unsupported mode: {!r}".format(mode))
    return ftext


@@ -47,24 +54,28 @@ class McbbsJavaNews(NewMessage):
            raw_post_list = soup.find_all(
                "tbody", id=re.compile(r"normalthread_[0-9]*")
            )
-            post_list = []
-            for raw_post in raw_post_list:
-                post = {}
-                post["url"] = raw_post.find("a", class_="s xst")["href"]
-                post["title"] = _format_text(raw_post.find("a", class_="s xst").string)
-                post["category"] = raw_post.select("th em a")[0].string
-                post["author"] = raw_post.select("td:nth-of-type(2) cite a")[0].string
-                post["id"] = raw_post["id"]
-                rawdate = (
-                    raw_post.select("td:nth-of-type(2) em span span")[0]["title"]
-                    if raw_post.select("td:nth-of-type(2) em span span")
-                    else raw_post.select("td:nth-of-type(2) em span")[0].string
-                )
-                post["date"] = _stamp_date(rawdate)
-                post_list.append(post)
+            post_list = self._gen_post_list(raw_post_list)

        return post_list

+    def _gen_post_list(self, raw_post_list):
+        """Turn the thread elements of a forum listing into post dicts.
+
+        Each dict carries the keys "url", "title", "category", "author",
+        "id" and "date". The date comes from the ``title`` attribute of the
+        nested span when one exists, otherwise from the text of the outer
+        span, and is passed through ``_stamp_date``.
+        """
+        post_list = []
+        for raw_post in raw_post_list:
+            title_link = raw_post.find("a", class_="s xst")
+            post = {
+                "url": title_link["href"],
+                "title": _format_text(title_link.string, 0),
+                "category": raw_post.select("th em a")[0].string,
+                "author": raw_post.select("td:nth-of-type(2) cite a")[0].string,
+                "id": raw_post["id"],
+            }
+            nested_spans = raw_post.select("td:nth-of-type(2) em span span")
+            if nested_spans:
+                rawdate = nested_spans[0]["title"]
+            else:
+                rawdate = raw_post.select("td:nth-of-type(2) em span")[0].string
+            post["date"] = _stamp_date(rawdate)
+            post_list.append(post)
+        return post_list
+
    def get_id(self, post: RawPost) -> str:
        return post["id"]

@@ -72,7 +83,103 @@ class McbbsJavaNews(NewMessage):
        return post["date"]

    def get_category(self, post: RawPost) -> Category:
-        if post["category"] == "Java版本资讯":
-            return Category(1)
-        else:
-            return CategoryNotSupport("McbbsNews订阅暂不支持 `{}".format(post["category"]))
+        """Map the post's forum category to a subscription Category.
+
+        Returns Category(1) for "Java版本资讯"; raises CategoryNotSupport
+        for every other category.
+        """
+        category_name = post["category"]
+        if category_name == "Java版本资讯":
+            return Category(1)
+        raise CategoryNotSupport("McbbsNews订阅暂不支持 `{}".format(category_name))
+
+    def _check_str_chinese(self, check_str: str) -> bool:
+        """Return True if any character of check_str lies in U+4E00..U+9FFF."""
+        return any("\u4e00" <= ch <= "\u9fff" for ch in check_str)
+
+    def _javanews_parser(self, rawtext: str):
+        """Extract the push message of a "Java版本资讯" post.
+
+        Args:
+            rawtext: HTML source of the post page.
+
+        Returns:
+            A ``(post_text, pic_url)`` tuple: the collected text, and a list
+            holding the header-image URL (empty when no header image is
+            found on the page).
+        """
+        # Drop the unwanted trailing credit before parsing.
+        rawtext = re.sub(r"【本文排版借助了：[\s\S]*】", "", rawtext)
+        rawsoup = BeautifulSoup(rawtext.replace("<br />", ""), "html.parser")
+        # Header image. find() returns None when nothing matches, so guard
+        # it instead of crashing on posts without a matching image.
+        pic_tag = rawsoup.find(
+            "img", file=re.compile(r"https://www.minecraft.net/\S*header.jpg")
+        )
+        pic_url: list[str] = (
+            [] if pic_tag is None else [pic_tag.get("src", pic_tag.get("file"))]
+        )
+        # Content lives in the blockquote nested inside the post body cell.
+        soup = rawsoup.find(
+            "td", id=re.compile(r"postmessage_[0-9]*")
+        ).blockquote.blockquote
+        # Remove the div and span sections, which are not part of the message.
+        for del_tag in soup.find_all(["div", "span"]):
+            del_tag.extract()
+        # Remove the remaining unwanted <strong> block.
+        soup.select("blockquote > strong")[0].extract()
+        # Unwrap every a/u/strong tag, and the font tags nested in ul/font.
+        for unwrap_tag in soup.find_all(["a", "strong", "u", "ul", "font"]):
+            match unwrap_tag.name:
+                case "a" | "strong" | "u":
+                    unwrap_tag.unwrap()
+                case "ul" | "font":
+                    for font_tag in unwrap_tag.find_all("font"):
+                        font_tag.unwrap()
+
+        def direct_text(tag: Tag) -> str:
+            """Join the strings that are direct children of ``tag``."""
+            return "".join(
+                sub for sub in tag.contents if isinstance(sub, NavigableString)
+            )
+
+        # Collect the sentences that contain Chinese text.
+        post_text = ""
+        last_is_empty_line = True
+        for element in soup.contents:
+            if isinstance(element, Tag):
+                match element.name:
+                    case "font":
+                        candidates = [direct_text(element)]
+                    case "ul":
+                        candidates = [direct_text(li) for li in element.find_all("li")]
+                    case _:
+                        continue
+                for text in candidates:
+                    if self._check_str_chinese(text):
+                        post_text += "\n{}".format(_format_text(text, 1))
+                        last_is_empty_line = False
+            elif isinstance(element, NavigableString):
+                if str(element) == "\n":
+                    # Collapse consecutive blank lines into a single one.
+                    if not last_is_empty_line:
+                        post_text += "\n"
+                    last_is_empty_line = True
+                else:
+                    post_text += "\n{}".format(_format_text(element, 1))
+                    last_is_empty_line = False
+        return post_text, pic_url
+
+    async def parse(self, raw_post: RawPost) -> Post:
+        """Fetch the post page and build the Post to push.
+
+        Args:
+            raw_post: post dict; its "url" and "category" keys are read.
+
+        Returns:
+            A Post with the parsed text, the post URL, the picture URLs and
+            the category as target name.
+
+        Raises:
+            CategoryNotSupport: if the category is not "Java版本资讯".
+        """
+        post_url = "https://www.mcbbs.net/{}".format(raw_post["url"])
+        headers = {
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) "
+            "Chrome/51.0.2704.63 Safari/537.36"
+        }
+        async with httpx.AsyncClient() as client:
+            response = await client.get(post_url, headers=headers)
+            match raw_post["category"]:
+                case "Java版本资讯":
+                    # The parser runs re.sub/str.replace on its argument, so
+                    # pass the decoded body text, not the Response object.
+                    text, pic_urls = self._javanews_parser(response.text)
+                case _:
+                    raise CategoryNotSupport(
+                        "McbbsNews订阅暂不支持 `{}".format(raw_post["category"])
+                    )
+        return Post(
+            self.name,
+            text=text,
+            url=post_url,
+            pics=pic_urls,
+            target_name=raw_post["category"],
+        )