UKM cfa91df5e4
🐞 fix(text-similarity): 修复除0报错 (#302)
* 🐞 fix(text-similarity): 修复除0报错

没有考虑到bilibili的推送出现动态或者视频简介长度为零的情况,出现文本相似度除0Error

* 🧪 test(bilibili): 添加视频动态内容为空的情况的测试

* 🧪 test(text-similarity): 增加文本相似度函数的测试

* Update test_rss.py

* 📃 docs(text_similarity): 添加文本相似度函数的注释

* 🦄 refactor(text_similarity): 重构文本相似度的比较方法

* 🎈 perf(similar_text): 将比较函数的return改成raise

* 🦄 refactor(text_similarity): 重构文本相似度比较方法

* Update nonebot_bison/platform/bilibili.py

Co-authored-by: felinae98 <731499577@qq.com>

* Update nonebot_bison/platform/rss.py

Co-authored-by: felinae98 <731499577@qq.com>

---------

Co-authored-by: felinae98 <731499577@qq.com>
2023-08-27 16:28:26 +08:00

104 lines
3.0 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import re
import sys
import difflib
import nonebot
from nonebot.plugin import require
from bs4 import BeautifulSoup as bs
from nonebot.log import logger, default_format
from nonebot_plugin_saa import Text, Image, MessageSegmentFactory
from .http import http_client
from .context import ProcessContext
from ..plugin_config import plugin_config
from .scheduler_config import SchedulerConfig, scheduler
# Public names re-exported by this package for `from ... import *`.
# NOTE(review): text_similarity and Filter are defined in this module but are
# not listed here — confirm whether that omission is intentional.
__all__ = [
    "http_client",
    "Singleton",
    "parse_text",
    "ProcessContext",
    "html_to_text",
    "SchedulerConfig",
    "scheduler",
]
class Singleton(type):
    """Metaclass that hands out one shared instance per class.

    The first call to a class using this metaclass constructs the instance
    and stores it; every later call returns the stored instance and ignores
    whatever arguments were passed.
    """

    # Maps each class to the instance created for it.
    _instances = {}

    def __call__(cls, *args, **kwargs):
        """Return the stored instance of *cls*, constructing it on first use."""
        cache = cls._instances
        if cls in cache:
            return cache[cls]
        instance = super().__call__(*args, **kwargs)
        cache[cls] = instance
        return cache[cls]
async def parse_text(text: str) -> MessageSegmentFactory:
    """Wrap *text* in a message segment, as a picture when configured.

    When ``plugin_config.bison_use_pic`` is falsy the text is returned
    unchanged inside a ``Text`` segment. Otherwise the htmlrender plugin is
    required on demand, the text is passed to its ``text_to_pic`` and the
    awaited result is wrapped in an ``Image`` segment.
    """
    if not plugin_config.bison_use_pic:
        return Text(text)

    # Required and imported lazily: only reached when picture output is on.
    require("nonebot_plugin_htmlrender")
    from nonebot_plugin_htmlrender import text_to_pic as render_text

    rendered = await render_text(text)
    return Image(rendered)
# Runs at import time: unless the browser check is disabled in the plugin
# config, require the htmlrender plugin up front.
if not plugin_config.bison_skip_browser_check:
    require("nonebot_plugin_htmlrender")
def html_to_text(html: str, query_dict: "dict | None" = None) -> str:
    """Convert an HTML fragment to stripped plain text.

    A newline is inserted after every ``<br>`` variant and after every
    closing ``</p>`` before parsing, so line and paragraph breaks survive in
    the extracted text.

    Args:
        html: HTML markup to convert.
        query_dict: Optional keyword arguments forwarded to ``soup.find`` to
            pick a single node. When omitted or empty, the whole parsed
            document is used.

    Returns:
        The text of the selected node with surrounding whitespace stripped.

    Raises:
        AssertionError: If ``query_dict`` is given but no node matches it.
    """
    # Default is None rather than a shared mutable ``{}``; an empty or missing
    # query both mean "use the whole document".
    html = re.sub(r"<br\s*/?>", "<br>\n", html)
    html = html.replace("</p>", "</p>\n")
    soup = bs(html, "html.parser")
    node = soup.find(**query_dict) if query_dict else soup
    # Kept as an assertion so the exception type seen by callers is unchanged.
    assert node is not None
    return node.text.strip()
class Filter:
    """Callable log filter with a stricter threshold for nonebot's own records."""

    def __init__(self) -> None:
        # Minimum level for records not named "nonebot": either a level name
        # or a numeric level.
        self.level: int | str = "DEBUG"

    def __call__(self, record):
        """Return whether *record* passes the filter.

        The record's ``name`` is rewritten in place to its first dotted
        component, after substituting the owning module's
        ``__module_name__`` attribute when the module is loaded and has one.
        Records named ``nonebot`` must be at least WARNING; all other records
        must reach ``self.level``.
        """
        name: str = record["name"]
        loaded = sys.modules.get(name)
        if loaded:
            name = getattr(loaded, "__module_name__", name)
        record["name"] = name.split(".")[0]

        # A string level is resolved to its numeric value through the logger.
        if isinstance(self.level, str):
            threshold = logger.level(self.level).no
        else:
            threshold = self.level
        nonebot_threshold = logger.level("WARNING").no

        if record["name"] == "nonebot":
            return record["level"].no >= nonebot_threshold
        return record["level"].no >= threshold
# Runs at import time: when log filtering is enabled in the plugin config,
# register a stdout sink whose records are screened by a Filter instance.
if plugin_config.bison_filter_log:
    # NOTE(review): remove() with no arguments presumably drops every handler
    # registered so far (loguru semantics) — confirm.
    logger.remove()
    default_filter = Filter()
    logger.add(
        sys.stdout,
        colorize=True,
        diagnose=False,
        filter=default_filter,
        format=default_format,
    )
    config = nonebot.get_driver().config
    logger.success("Muted info & success from nonebot")
    # An explicit log_level wins; otherwise use DEBUG when config.debug is set
    # and INFO when it is not.
    default_filter.level = ("DEBUG" if config.debug else "INFO") if config.log_level is None else config.log_level
def text_similarity(str1, str2) -> float:
    """Score how similar two non-empty strings are, from 0 to 1.0.

    The strings are compared with ``difflib.SequenceMatcher``; the score is
    the total size of all matching blocks divided by the length of the
    shorter string.

    Raises:
        ValueError: If either string is empty.
    """
    # Reject empty input up front: the shorter length is the divisor below.
    if not (len(str1) and len(str2)):
        raise ValueError("The length of string can not be 0")

    blocks = difflib.SequenceMatcher(None, str1, str2).get_matching_blocks()
    matched = 0
    for block in blocks:
        matched += block.size
    shorter = min(len(str1), len(str2))
    return matched / shorter