diff --git a/nonebot_bison/platform/bilibili.py b/nonebot_bison/platform/bilibili.py index 12a0d72..fabd9c2 100644 --- a/nonebot_bison/platform/bilibili.py +++ b/nonebot_bison/platform/bilibili.py @@ -125,6 +125,16 @@ class Bilibili(NewMessage): def get_tags(self, raw_post: RawPost) -> list[Tag]: return [*(tp["topic_name"] for tp in raw_post["display"]["topic_info"]["topic_details"])] + def _text_process(self, dynamic: str, desc: str, title: str) -> str: + similarity = 1.0 if len(dynamic) == 0 or len(desc) == 0 else text_similarity(dynamic, desc) + if len(dynamic) == 0 and len(desc) == 0: + text = title + elif similarity > 0.8: + text = title + "\n\n" + desc if len(dynamic) < len(desc) else dynamic + "\n=================\n" + title + else: + text = dynamic + "\n=================\n" + title + "\n\n" + desc + return text + def _get_info(self, post_type: Category, card) -> tuple[str, list]: if post_type == 1: # 一般动态 @@ -139,17 +149,7 @@ class Bilibili(NewMessage): dynamic = card.get("dynamic", "") title = card["title"] desc = card.get("desc", "") - - if text_similarity(desc, dynamic) > 0.8: - # 如果视频简介和动态内容相似,就只保留长的那个 - if len(dynamic) > len(desc): - text = f"{dynamic}\n=================\n{title}" - else: - text = f"{title}\n\n{desc}" - else: - # 否则就把两个拼起来 - text = f"{dynamic}\n=================\n{title}\n\n{desc}" - + text = self._text_process(dynamic, desc, title) pic = [card["pic"]] elif post_type == 4: # 纯文字 diff --git a/nonebot_bison/platform/rss.py b/nonebot_bison/platform/rss.py index 1a0df49..aa1a9fb 100644 --- a/nonebot_bison/platform/rss.py +++ b/nonebot_bison/platform/rss.py @@ -53,18 +53,19 @@ class Rss(NewMessage): entry["_target_name"] = feed.feed.title return feed.entries + def _text_process(self, title: str, desc: str) -> str: + similarity = 1.0 if len(title) == 0 or len(desc) == 0 else text_similarity(title, desc) + if similarity > 0.8: + text = title if len(title) > len(desc) else desc + else: + text = title + "\n\n" + desc + return text + async def 
def text_similarity(str1, str2) -> float:
    """Return how similar two non-empty strings are, as a ratio from 0 to 1.0.

    The score is the total size of the matching blocks reported by
    ``difflib.SequenceMatcher`` divided by the length of the shorter string,
    so a short string that appears verbatim inside a longer one scores 1.0.

    Args:
        str1: first string to compare.
        str2: second string to compare.

    Returns:
        The matched length divided by ``min(len(str1), len(str2))``.

    Raises:
        ValueError: if either string is empty, since the ratio would divide
            by zero.
    """
    if len(str1) == 0 or len(str2) == 0:
        raise ValueError("The length of string can not be 0")
    # autojunk=False: by default SequenceMatcher stops indexing characters it
    # deems "popular" once the second string reaches 200 characters, which can
    # make long, nearly identical texts score close to 0.
    matcher = difflib.SequenceMatcher(None, str1, str2, autojunk=False)
    matched = sum(block.size for block in matcher.get_matching_blocks())
    return matched / min(len(str1), len(str2))
\\n\\n本群视频为9.11组员慕夏直播录播,包含慕夏对新PV的个人解读,风笛厨力疯狂放出,CP言论输出,9.16轮换池预测视频分析和理智规划杂谈内容。\\n注意:内含大量个人性质对风笛的厨力观点,与多CP混乱发言,不适者请及时点击退出或跳到下一片段。\",\"dimension\":{\"height\":1080,\"rotate\":0,\"width\":1920},\"duration\":4318,\"dynamic\":\"昨天慕夏直播的录播剪辑版,关于新PV,慕夏对风笛的看法,新一期轮换池预测视频的分析以及理智规划。错过直播且有兴趣的朋友可以看啦。\",\"first_frame\":\"https:\\/\\/i1.hdslb.com\\/bfs\\/storyff\\/n210911a297vzlaeyhb8g26etg86gci5_firsti.jpg\",\"jump_url\":\"bilibili:\\/\\/video\\/975400699\\/?page=1&player_preload=null&player_width=1920&player_height=1080&player_rotate=0\",\"owner\":{\"face\":\"https:\\/\\/i0.hdslb.com\\/bfs\\/face\\/00776b6ddde4874af87b8bc2870da86ed39c2c80.jpg\",\"mid\":8412516,\"name\":\"罗德岛蜜饼工坊\"},\"pic\":\"https:\\/\\/i0.hdslb.com\\/bfs\\/archive\\/c8cb0073819a0c8171db5009002eec19a80c85f6.jpg\",\"player_info\":null,\"pubdate\":1631408446,\"rights\":{\"autoplay\":1,\"bp\":0,\"download\":0,\"elec\":0,\"hd5\":0,\"is_cooperation\":0,\"movie\":0,\"no_background\":0,\"no_reprint\":1,\"pay\":0,\"ugc_pay\":0,\"ugc_pay_preview\":0},\"short_link\":\"https:\\/\\/b23.tv\\/BV1K44y1h7Xg\",\"short_link_v2\":\"https:\\/\\/b23.tv\\/BV1K44y1h7Xg\",\"stat\":{\"aid\":975400699,\"coin\":46,\"danmaku\":156,\"dislike\":0,\"favorite\":45,\"his_rank\":0,\"like\":495,\"now_rank\":0,\"reply\":45,\"share\":6,\"view\":3293},\"state\":0,\"tid\":172,\"title\":\"阿消的罗德岛闲谈直播#01:《女人最喜欢的女人,就是在战场上熠熠生辉的女人》\",\"tname\":\"手机游戏\",\"up_from_v2\":35,\"videos\":1}", + "card": "{\"aid\":975400699,\"attribute\":0,\"cid\":406644689,\"copyright\":1,\"ctime\":1631408447,\"desc\":\"本系列视频为饼组成员的有趣直播录播,主要内容为方舟相关,未来可能系列其他视频会包含部分饼组团建日常等。仅为娱乐性视频,内容与常规饼学预测无关。视频仅为当期主播主观观点,不代表饼组观点。仅供娱乐。\\n\\n直播主播:@寒蝉慕夏 \\n后期剪辑:@Melodiesviel 
\\n\\n本群视频为9.11组员慕夏直播录播,包含慕夏对新PV的个人解读,风笛厨力疯狂放出,CP言论输出,9.16轮换池预测视频分析和理智规划杂谈内容。\\n注意:内含大量个人性质对风笛的厨力观点,与多CP混乱发言,不适者请及时点击退出或跳到下一片段。\",\"dimension\":{\"height\":1080,\"rotate\":0,\"width\":1920},\"duration\":4318,\"dynamic\":\"\",\"first_frame\":\"https:\\/\\/i1.hdslb.com\\/bfs\\/storyff\\/n210911a297vzlaeyhb8g26etg86gci5_firsti.jpg\",\"jump_url\":\"bilibili:\\/\\/video\\/975400699\\/?page=1&player_preload=null&player_width=1920&player_height=1080&player_rotate=0\",\"owner\":{\"face\":\"https:\\/\\/i0.hdslb.com\\/bfs\\/face\\/00776b6ddde4874af87b8bc2870da86ed39c2c80.jpg\",\"mid\":8412516,\"name\":\"罗德岛蜜饼工坊\"},\"pic\":\"https:\\/\\/i0.hdslb.com\\/bfs\\/archive\\/c8cb0073819a0c8171db5009002eec19a80c85f6.jpg\",\"player_info\":null,\"pubdate\":1631408446,\"rights\":{\"autoplay\":1,\"bp\":0,\"download\":0,\"elec\":0,\"hd5\":0,\"is_cooperation\":0,\"movie\":0,\"no_background\":0,\"no_reprint\":1,\"pay\":0,\"ugc_pay\":0,\"ugc_pay_preview\":0},\"short_link\":\"https:\\/\\/b23.tv\\/BV1K44y1h7Xg\",\"short_link_v2\":\"https:\\/\\/b23.tv\\/BV1K44y1h7Xg\",\"stat\":{\"aid\":975400699,\"coin\":46,\"danmaku\":156,\"dislike\":0,\"favorite\":45,\"his_rank\":0,\"like\":495,\"now_rank\":0,\"reply\":45,\"share\":6,\"view\":3293},\"state\":0,\"tid\":172,\"title\":\"阿消的罗德岛闲谈直播#01:《女人最喜欢的女人,就是在战场上熠熠生辉的女人》\",\"tname\":\"手机游戏\",\"up_from_v2\":35,\"videos\":1}", "extend_json": "{\"\":{\"ogv\":{\"ogv_id\":0}},\"dispute\":{\"content\":\"\"},\"from\":{\"from\":\"\",\"verify\":{}},\"like_icon\":{\"action\":\"\",\"action_url\":\"\",\"end\":\"\",\"end_url\":\"\",\"start\":\"\",\"start_url\":\"\"},\"topic\":{\"is_attach_topic\":1}}", "extra": { "is_space_top": 0 diff --git a/tests/platforms/test_bilibili.py b/tests/platforms/test_bilibili.py index 5ab2d94..453f775 100644 --- a/tests/platforms/test_bilibili.py +++ b/tests/platforms/test_bilibili.py @@ -52,6 +52,22 @@ async def test_video_forward(bilibili, bing_dy_list): ) +@pytest.mark.asyncio +async def test_video_forward_without_dynamic(bilibili, 
def test_similar_text_process():
    """Check text_similarity on empty input, a truncated summary and a reordering."""
    from nonebot_bison.utils import text_similarity

    # An empty string on either side must be rejected.
    empty_error = "The length of string can not be 0"
    for left, right in (("", "xxxx"), ("xxxx", "")):
        with pytest.raises(ValueError, match=empty_error):
            text_similarity(left, right)

    # A truncated summary of a longer text counts as similar.
    truncated = (
        "天使九局下被追平,米基-莫尼亚克(Mickey Moniak)超前安打拒绝剧本,天使7-6老虎;阿莱克-博姆(Alec"
        " Bohm)再见安打,费城人4-3金莺..."
    )
    complete = (
        "天使九局下被追平,米基-莫尼亚克(Mickey Moniak)超前安打拒绝剧本,天使7-6老虎;阿莱克-博姆(Alec"
        " Bohm)再见安打,费城人4-3金莺;布兰登-洛维(Brandon Lowe)阳春炮,光芒4-1马林鱼;皮特-阿隆索(Pete Alonso)、"
        "丹尼尔-沃格尔巴克(Daniel Vogelbach)背靠背本垒打,大都会9-3扬基;吉田正尚(Masataka Yoshida)3安2打点,"
        "红袜7-1勇士;凯尔-塔克(Kyle Tucker)阳春炮,太空人4-3连胜游骑兵。"
    )
    assert text_similarity(truncated, complete) > 0.8

    # The same characters in a different order do not count as similar.
    assert text_similarity("我爱你", "你爱我") <= 0.8