UKM 9838e25bad
🎈优化RSS推送的内容 ()
* 🧪 test(tests): 添加了RSS的单元测试

* 🎈 perf(rss and test): 优化了RSS部分源标题正文重复的问题

部分RSS源(RSSHub的Twitter)存在正文当标题用的情况,导致推送的时候呈现为两段重复的文字,现通过Jaccard相似系数来判断是否需要去重

* Update nonebot_bison/platform/rss.py

Co-authored-by: AzideCupric <57004769+AzideCupric@users.noreply.github.com>

* Update nonebot_bison/platform/rss.py

Co-authored-by: AzideCupric <57004769+AzideCupric@users.noreply.github.com>

* 🐞 fix(platform/rss): 修复了漏掉相似文本在后端位置的问题

* 🐞 fix(rss): 修正一些feed无法正确识别时间的bug

一些feed时间只有updated标签或者没有,原先的代码只能解析用published标签的时间

felinae98#275

* 🎈 perf(rss): 更改字符串相似度比较方法

从Jaccard相似系数比较相似度改为通过最长公共子序列来比较

* 🦄 refactor(rss): 重构实现字符串相似度比较的方法

使用标准库difflib代替原先手搓的LCS

* Update nonebot_bison/utils/__init__.py

Co-authored-by: felinae98 <731499577@qq.com>

* Update nonebot_bison/platform/rss.py

* Update nonebot_bison/platform/rss.py

---------

Co-authored-by: AzideCupric <57004769+AzideCupric@users.noreply.github.com>
Co-authored-by: felinae98 <731499577@qq.com>
2023-07-18 11:54:49 +08:00

119 lines
3.2 KiB
Python

import difflib
import re
import sys
from typing import Union
import nonebot
from bs4 import BeautifulSoup as bs
from nonebot.log import default_format, logger
from nonebot.plugin import require
from nonebot_plugin_saa import Image, MessageSegmentFactory, Text
from ..plugin_config import plugin_config
from .context import ProcessContext
from .http import http_client
from .scheduler_config import SchedulerConfig, scheduler
# Names exported by a star-import of this package.
# NOTE(review): `Filter`, `jaccard_text_similarity` and `text_similarity` are
# defined in this module but are not listed here — confirm that is intended.
__all__ = [
    "http_client",
    "Singleton",
    "parse_text",
    "ProcessContext",
    "html_to_text",
    "SchedulerConfig",
    "scheduler",
]
class Singleton(type):
    """Metaclass that gives every class using it exactly one instance.

    The first call to such a class constructs the instance normally and
    stores it; every later call returns the stored instance and ignores
    the arguments it was given.
    """

    # Maps each class to its single instance. The dict lives on the
    # metaclass, so it is shared by all classes that use this metaclass.
    _instances = {}

    def __call__(cls, *args, **kwargs):
        """Return the stored instance of ``cls``, creating it on first use.

        Args:
            *args: Positional arguments forwarded to the normal constructor
                on the first call only.
            **kwargs: Keyword arguments forwarded likewise.

        Returns:
            The one instance associated with ``cls``.
        """
        if cls not in cls._instances:
            cls._instances[cls] = super().__call__(*args, **kwargs)
        return cls._instances[cls]
async def parse_text(text: str) -> MessageSegmentFactory:
    """Wrap ``text`` in a message segment.

    When ``plugin_config.bison_use_pic`` is falsy the text is returned as a
    ``Text`` segment. Otherwise ``nonebot_plugin_htmlrender`` is required,
    the text is rendered with its ``text_to_pic`` coroutine, and the result
    is returned as an ``Image`` segment.
    """
    if not plugin_config.bison_use_pic:
        return Text(text)

    # Imported lazily, and only after require(), so the render plugin is
    # touched solely when picture output is enabled.
    require("nonebot_plugin_htmlrender")
    from nonebot_plugin_htmlrender import text_to_pic as render_to_pic

    rendered = await render_to_pic(text)
    return Image(rendered)
# Runs when this module is imported: unless the plugin config sets
# `bison_skip_browser_check`, call require() for nonebot_plugin_htmlrender.
if not plugin_config.bison_skip_browser_check:
    require("nonebot_plugin_htmlrender")
def html_to_text(html: str, query_dict: Union[dict, None] = None) -> str:
    """Convert an HTML fragment to plain text.

    A newline is inserted after every ``<br>`` variant and every ``</p>``
    so those boundaries survive as line breaks in the extracted text.

    Args:
        html: The HTML source to convert.
        query_dict: Optional keyword arguments passed to ``soup.find``.
            When given and non-empty, text is taken from the node that
            ``find`` returns; otherwise from the whole document.

    Returns:
        The text of the selected node with surrounding whitespace stripped.

    Raises:
        AssertionError: If ``query_dict`` is given and ``soup.find`` returns
            no node.
    """
    html = re.sub(r"<br\s*/?>", "<br>\n", html)
    html = html.replace("</p>", "</p>\n")
    soup = bs(html, "html.parser")
    # A None default avoids sharing one mutable dict object across calls;
    # None and an empty dict both mean "use the whole document".
    if query_dict:
        node = soup.find(**query_dict)
    else:
        node = soup
    assert node is not None, f"no node matches query {query_dict!r}"
    return node.text.strip()
class Filter:
    """Callable log filter with an adjustable minimum level.

    Calling an instance with a log record rewrites the record's ``"name"``
    to its top-level package and reports whether the record should be kept:
    records named ``"nonebot"`` are kept only at WARNING level or above,
    all other records are kept when they reach ``self.level``.
    """

    def __init__(self) -> None:
        # Minimum level for non-"nonebot" records: either a level name
        # or a numeric level.
        self.level: Union[int, str] = "DEBUG"

    def __call__(self, record):
        """Return True when ``record`` should be emitted.

        Side effect: ``record["name"]`` is replaced by the first dotted
        component of the (possibly overridden) module name.
        """
        name: str = record["name"]
        owner = sys.modules.get(name)
        if owner:
            # A loaded module may override its displayed name via the
            # ``__module_name__`` attribute.
            name = getattr(owner, "__module_name__", name)
        record["name"] = name.split(".")[0]

        if isinstance(self.level, str):
            threshold = logger.level(self.level).no
        else:
            threshold = self.level
        nonebot_threshold = logger.level("WARNING").no

        if record["name"] == "nonebot":
            return record["level"].no >= nonebot_threshold
        return record["level"].no >= threshold
# When log filtering is enabled in the plugin config, replace the existing
# logger handlers with a single stdout handler that is gated by a Filter.
if plugin_config.bison_filter_log:
    logger.remove()
    default_filter = Filter()
    logger.add(
        sys.stdout,
        format=default_format,
        filter=default_filter,
        colorize=True,
        diagnose=False,
    )
    config = nonebot.get_driver().config
    # This message is logged while the filter is still at its initial level,
    # i.e. before the configured level below is applied.
    logger.success("Muted info & success from nonebot")
    # An explicit log_level from the driver config wins; otherwise fall back
    # to DEBUG or INFO depending on the debug flag.
    if config.log_level is not None:
        default_filter.level = config.log_level
    elif config.debug:
        default_filter.level = "DEBUG"
    else:
        default_filter.level = "INFO"
def jaccard_text_similarity(str1: str, str2: str) -> float:
    """Return the character-level Jaccard similarity of two strings.

    Each string is reduced to its set of distinct characters; the result is
    the size of the intersection divided by the size of the union
    (see https://zh.wikipedia.org/wiki/雅卡尔指数).

    Args:
        str1: First string.
        str2: Second string.

    Returns:
        A value between 0.0 (no shared characters) and 1.0 (identical
        character sets). Two empty strings yield 1.0.
    """
    chars1 = set(str1)
    chars2 = set(str2)
    union = chars1 | chars2
    if not union:
        # Both inputs are empty: the ratio would be 0/0. Two empty sets are
        # identical, so report full similarity instead of dividing by zero.
        return 1.0
    return len(chars1 & chars2) / len(union)
def text_similarity(str1: str, str2: str) -> float:
    """Return how much of the shorter string is matched within the other.

    The total size of the matching blocks found by
    ``difflib.SequenceMatcher`` is divided by the length of the shorter
    input, so a short string fully contained in a long one scores 1.0.

    Args:
        str1: First string.
        str2: Second string.

    Returns:
        A value between 0.0 and 1.0. When exactly one input is empty the
        result is 0.0; when both are empty it is 1.0.
    """
    shorter = min(len(str1), len(str2))
    if shorter == 0:
        # Nothing to match against: avoid dividing by zero. Two empty
        # inputs are equal, anything else shares no content.
        return 1.0 if len(str1) == len(str2) else 0.0
    # NOTE(review): SequenceMatcher runs with its default autojunk heuristic,
    # which can ignore very frequent characters once the second sequence has
    # 200 or more items — long texts may score lower than expected.
    matcher = difflib.SequenceMatcher(None, str1, str2)
    matched = sum(block.size for block in matcher.get_matching_blocks())
    return matched / shorter