mirror of
https://github.com/suyiiyii/nonebot-bison.git
synced 2025-06-04 02:26:11 +08:00
* 🐞 fix(text-similarity): 修复除0报错 没有考虑到bilibili的推送出现动态或者视频简介长度为零的情况,出现文本相似度除0Error * 🧪 test(bilibili): 添加视频动态内容为空的情况的测试 * 🧪 test(text-similarity): 增加文本相似度函数的测试 * Update test_rss.py * 📃 docs(text_similarity): 添加文本相似度函数的注释 * 🦄 refactor(text_similarity): 重构文本相似度的比较方法 * 🎈 perf(similar_text): 将比较函数的return改成raise * 🦄 refactor(text_similarity): 重构文本相似度比较方法 * Update nonebot_bison/platform/bilibili.py Co-authored-by: felinae98 <731499577@qq.com> * Update nonebot_bison/platform/rss.py Co-authored-by: felinae98 <731499577@qq.com> --------- Co-authored-by: felinae98 <731499577@qq.com>
81 lines
2.5 KiB
Python
81 lines
2.5 KiB
Python
import time
|
|
import calendar
|
|
from typing import Any
|
|
|
|
import feedparser
|
|
from httpx import AsyncClient
|
|
from bs4 import BeautifulSoup as bs
|
|
|
|
from ..post import Post
|
|
from .platform import NewMessage
|
|
from ..types import Target, RawPost
|
|
from ..utils import SchedulerConfig, text_similarity
|
|
|
|
|
|
class RssSchedConf(SchedulerConfig):
    """Scheduler settings used by the RSS platform.

    Declares an ``"interval"`` schedule with a period of 30 seconds,
    registered under the name ``"rss"``.
    """

    # How the job is triggered and with which parameters.
    schedule_type = "interval"
    schedule_setting = dict(seconds=30)

    # Identifier of this scheduler configuration.
    name = "rss"
|
|
|
|
|
|
class Rss(NewMessage):
    """RSS platform: fetches a feed URL and converts its entries into posts.

    The subscription target is the feed URL itself. Every entry returned by
    :meth:`get_sub_list` is tagged with the feed title under the
    ``"_target_name"`` key so that :meth:`parse` can build a ``Post`` without
    fetching the feed again.
    """

    categories = {}
    enable_tag = False
    platform_name = "rss"
    name = "Rss"
    enabled = True
    is_common = True
    scheduler = RssSchedConf
    has_target = True

    @classmethod
    async def get_target_name(cls, client: AsyncClient, target: Target) -> str | None:
        """Return the title of the feed located at ``target``.

        Args:
            client: HTTP client used to download the feed.
            target: URL of the feed.

        Returns:
            The feed's title.

        Raises:
            KeyError: if the parsed document carries no feed title.
        """
        res = await client.get(target, timeout=10.0)
        feed = feedparser.parse(res.text)
        return feed["feed"]["title"]

    def get_date(self, post: RawPost) -> int:
        """Return the entry's timestamp as seconds since the epoch (UTC).

        Prefers the publication date, then the update date. When neither is
        usable, the current time is returned.
        """
        for key in ("published_parsed", "updated_parsed"):
            # The attribute can exist but be empty (e.g. a date string the
            # parser could not interpret); feeding that to ``timegm`` would
            # raise, so treat it like a missing attribute.
            parsed = getattr(post, key, None)
            if parsed:
                return calendar.timegm(parsed)
        return calendar.timegm(time.gmtime())

    def get_id(self, post: RawPost) -> Any:
        """Return the identifier of the entry."""
        return post.id

    async def get_sub_list(self, target: Target) -> list[RawPost]:
        """Download the feed at ``target`` and return its entries.

        Each entry is annotated in place with the feed title under
        ``"_target_name"``.
        """
        res = await self.client.get(target, timeout=10.0)
        # NOTE(review): the response object itself is handed to feedparser
        # (get_target_name passes ``res.text`` instead) -- presumably this
        # relies on the response exposing ``read()``; confirm before changing.
        feed = feedparser.parse(res)
        entries = feed.entries
        for entry in entries:
            entry["_target_name"] = feed.feed.title
        return entries

    def _text_process(self, title: str, desc: str) -> str:
        """Merge title and description into the post text.

        When the two are near-duplicates (similarity above 0.8) or one of
        them is empty, only the longer one is kept; otherwise they are joined
        with a blank line between them.
        """
        # Skip the similarity computation for empty strings; an empty side
        # is handled as "fully similar" so that only the other side is kept.
        similarity = 1.0 if not title or not desc else text_similarity(title, desc)
        if similarity > 0.8:
            return title if len(title) > len(desc) else desc
        return title + "\n\n" + desc

    async def parse(self, raw_post: RawPost) -> Post:
        """Convert a feed entry into a ``Post``.

        The text is built from the entry title and the plain text of its HTML
        description. Pictures are collected from ``<img>`` tags in the
        description and from ``media_content`` items whose medium is
        ``"image"``.
        """
        title = raw_post.get("title", "")
        # Entries are not guaranteed to carry a description; fall back to an
        # empty document instead of failing on the attribute lookup.
        soup = bs(raw_post.get("description") or "", "html.parser")
        desc = soup.text.strip()
        text = self._text_process(title, desc)

        # Ignore <img> tags that have no (or an empty) src attribute.
        pics = [img.attrs["src"] for img in soup("img") if img.attrs.get("src")]
        for media in raw_post.get("media_content") or []:
            if media.get("medium") == "image" and media.get("url"):
                pics.append(media.get("url"))

        return Post(
            "rss",
            text=text,
            url=raw_post.link,
            pics=pics,
            target_name=raw_post["_target_name"],
        )
|