From ff3c0ffe65c5818f0166bfcfab21ffb521078af6 Mon Sep 17 00:00:00 2001 From: UKM Date: Tue, 18 Jul 2023 15:06:46 +0800 Subject: [PATCH] =?UTF-8?q?:bug:=20=E4=BF=AE=E5=A4=8Dbilibili=E6=8E=A8?= =?UTF-8?q?=E9=80=81=E7=9A=84=E4=B8=80=E4=BA=9B=E6=A0=BC=E5=BC=8F=E9=94=99?= =?UTF-8?q?=E8=AF=AF=20(#263)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * 🎈 perf(platform/bilibili): 增加了相似度计算前文本的预处理 将动态和简介文本中较长的一段按照较短的一段进行截取(分了从前截和从后截的两种情况) * 🐞 fix(bilibili): 修复视频简介多余空格的bug * 🦄 refactor(bilibili): 更改文本相似度比较函数 --- .gitignore | 2 ++ nonebot_bison/platform/bilibili.py | 11 +++-------- nonebot_bison/utils/__init__.py | 11 ----------- 3 files changed, 5 insertions(+), 19 deletions(-) diff --git a/.gitignore b/.gitignore index 6973892..f07dd00 100644 --- a/.gitignore +++ b/.gitignore @@ -316,6 +316,8 @@ docs/.vuepress/.temp/ # and uncomment the following lines # .pnp.* +### macOS ### +.DS_Store # End of https://www.toptal.com/developers/gitignore/api/python,linux,vim data*/* diff --git a/nonebot_bison/platform/bilibili.py b/nonebot_bison/platform/bilibili.py index b4ed4ff..9769348 100644 --- a/nonebot_bison/platform/bilibili.py +++ b/nonebot_bison/platform/bilibili.py @@ -12,7 +12,7 @@ from typing_extensions import Self from ..post import Post from ..types import ApiError, Category, RawPost, Tag, Target -from ..utils import SchedulerConfig, jaccard_text_similarity +from ..utils import SchedulerConfig, text_similarity from .platform import CategoryNotRecognize, CategoryNotSupport, NewMessage, StatusChange @@ -151,7 +151,7 @@ class Bilibili(NewMessage): title = card["title"] desc = card.get("desc", "") - if jaccard_text_similarity(desc, dynamic) > 0.8: + if text_similarity(desc, dynamic) > 0.8: # 如果视频简介和动态内容相似,就只保留长的那个 if len(dynamic) > len(desc): text = f"{dynamic}\n=================\n{title}" @@ -159,12 +159,7 @@ class Bilibili(NewMessage): text = f"{title}\n\n{desc}" else: # 否则就把两个拼起来 - text = f""" - {dynamic} - \n=================\n - {title}\n\n - {desc} - """ + text = f"{dynamic}\n=================\n{title}\n\n{desc}" pic = [card["pic"]] elif post_type == 4: diff --git a/nonebot_bison/utils/__init__.py b/nonebot_bison/utils/__init__.py index 31082ad..60aeacc 100644 --- a/nonebot_bison/utils/__init__.py +++ b/nonebot_bison/utils/__init__.py @@ -101,17 +101,6 @@ if plugin_config.bison_filter_log: ) -def jaccard_text_similarity(str1: str, str2: str) -> float: - """ - 计算两个字符串(基于字符)的 - [Jaccard相似系数](https://zh.wikipedia.org/wiki/雅卡尔指数) - 是否达到阈值 - """ - set1 = set(str1) - set2 = set(str2) - return len(set1 & set2) / len(set1 | set2) - - def text_similarity(str1, str2) -> float: matcher = difflib.SequenceMatcher(None, str1, str2) t = sum(temp.size for temp in matcher.get_matching_blocks())