UKM 9838e25bad
🎈优化RSS推送的内容 (#259)
* 🧪 test(tests): 添加了RSS的单元测试

* 🎈 perf(rss and test): 优化了RSS部分源标题正文重复的问题

部分RSS源(RSSHub的Twitter)存在正文当标题用的情况,导致推送的时候呈现为两段重复的文字,现通过Jaccard相似系数来判断是否需要去重

* Update nonebot_bison/platform/rss.py

Co-authored-by: AzideCupric <57004769+AzideCupric@users.noreply.github.com>

* Update nonebot_bison/platform/rss.py

Co-authored-by: AzideCupric <57004769+AzideCupric@users.noreply.github.com>

* 🐞 fix(platform/rss): 修复了漏掉相似文本在后端位置的问题

* 🐞 fix(rss): 修正一些feed无法正确识别时间的bug

一些feed时间只有updated标签或者没有,原先的代码只能解析用published标签的时间

felinae98#275

* 🎈 perf(rss): 更改字符串相似度比较方法

从Jaccard相似系数比较相似度改为通过最长公共子序列来比较

* 🦄 refactor(rss): 重构实现字符串相似度比较的方法

使用标准库difflib代替原先手搓的LCS

* Update nonebot_bison/utils/__init__.py

Co-authored-by: felinae98 <731499577@qq.com>

* Update nonebot_bison/platform/rss.py

* Update nonebot_bison/platform/rss.py

---------

Co-authored-by: AzideCupric <57004769+AzideCupric@users.noreply.github.com>
Co-authored-by: felinae98 <731499577@qq.com>
2023-07-18 11:54:49 +08:00

170 lines
5.6 KiB
Python

import typing
import xml.etree.ElementTree as ET
from datetime import datetime
import pytest
import pytz
import respx
from httpx import AsyncClient, Response
from nonebug.app import App
from .utils import get_file
if typing.TYPE_CHECKING:
from nonebot_bison.platform.rss import Rss
@pytest.fixture
def dummy_user(app: App):
    """Return the ``User`` object built as ``User(123, "group")``.

    ``app`` is requested only as a fixture dependency; it is not used in the
    body. NOTE(review): presumably it guarantees the plugin is initialised
    before ``nonebot_bison`` is imported -- confirm against conftest.
    """
    # Imported lazily, inside the fixture, rather than at module import time.
    from nonebot_bison.types import User

    return User(123, "group")
@pytest.fixture
def user_info_factory(app: App, dummy_user):
    """Return a callable that builds ``UserSubInfo`` objects for ``dummy_user``.

    The returned callable takes ``(category_getter, tag_getter)`` and forwards
    them, together with the shared ``dummy_user``, to ``UserSubInfo``.
    """
    from nonebot_bison.types import UserSubInfo

    def make_user_sub_info(category_getter, tag_getter):
        # The user is fixed by the fixture; only the two getters vary per call.
        return UserSubInfo(dummy_user, category_getter, tag_getter)

    return make_user_sub_info
@pytest.fixture
def rss(app: App):
    """Return the object registered under ``"rss"`` in ``platform_manager``.

    The registered entry is called with a fresh ``ProcessContext`` and a fresh
    ``httpx.AsyncClient``; the result of that call is what the tests receive.
    """
    from nonebot_bison.platform import platform_manager
    from nonebot_bison.utils import ProcessContext

    rss_platform = platform_manager["rss"]
    context = ProcessContext()
    client = AsyncClient()
    return rss_platform(context, client)
@pytest.fixture
def update_time_feed_1():
    """Return ``rss-twitter-ArknightsStaff.xml`` with a refreshed ``pubDate``.

    The ``pubDate`` of the first ``channel/item`` is overwritten with the
    current GMT time and the whole document is serialised back to a string.
    NOTE(review): presumably this makes the item look newly published to the
    RSS platform -- confirm against the platform's date handling.
    """
    feed_root = ET.fromstring(get_file("rss-twitter-ArknightsStaff.xml"))

    # Same textual layout as the feed's date field, e.g. "Tue, 18 Jul 2023 03:54:49 GMT".
    gmt = pytz.timezone("GMT")
    now_text = datetime.now(gmt).strftime("%a, %d %b %Y %H:%M:%S %Z")

    first_item = feed_root.find("channel/item")
    first_item.find("pubDate").text = now_text

    return ET.tostring(feed_root, encoding="unicode")
@pytest.fixture
def update_time_feed_2():
    """Return ``rss-ruanyifeng.xml`` with a refreshed ``published`` timestamp.

    The first ``published`` element found anywhere in the document (any XML
    namespace) is overwritten with the current GMT time, and the document is
    serialised back to a string.
    """
    feed_root = ET.fromstring(get_file("rss-ruanyifeng.xml"))

    gmt = pytz.timezone("GMT")
    now_text = datetime.now(gmt).strftime("%a, %d %b %Y %H:%M:%S %Z")

    # "{*}" is the namespace wildcard, so the tag matches in any namespace.
    feed_root.find(".//{*}published").text = now_text

    return ET.tostring(feed_root, encoding="unicode")
@pytest.mark.asyncio
@respx.mock
async def test_fetch_new_1(
    rss,
    user_info_factory,
    update_time_feed_1,
):
    """Case: the feed item's title duplicates its body text.

    The first fetch is served the ``-0`` feed and must yield nothing; the
    second is served the time-refreshed feed and must yield exactly one post
    with the expected URL and text.
    """
    target = "https://rsshub.app/twitter/user/ArknightsStaff"
    route = respx.get(target)

    # First poll: served the "-0" feed; nothing is expected to be reported.
    route.mock(
        return_value=Response(200, text=get_file("rss-twitter-ArknightsStaff-0.xml"))
    )
    first_poll = await rss.fetch_new_post(target, [user_info_factory([], [])])
    assert len(first_poll) == 0

    # Second poll: the same route now serves the feed with a fresh pubDate.
    route.mock(return_value=Response(200, text=update_time_feed_1))
    second_poll = await rss.fetch_new_post(target, [user_info_factory([], [])])
    posts = second_poll[0][1]
    assert len(posts) == 1

    post = posts[0]
    assert post.url == "https://twitter.com/ArknightsStaff/status/1659091539023282178"
    assert (
        post.text
        == "【#統合戦略】 引き続き新テーマ「ミヅキと紺碧の樹」の新要素及びシステムの変更点を一部ご紹介します! 今回は「灯火」、「ダイス」、「記号認識」、「鍵」についてです。詳細は添付の画像をご確認ください。#アークナイツ https://t.co/ARmptV0Zvu"
    )
@pytest.mark.asyncio
@respx.mock
async def test_fetch_new_2(
    rss,
    user_info_factory,
    update_time_feed_2,
):
    """Case: the entry's title and body are different texts.

    The first fetch is served the ``-0`` feed and must yield nothing; the
    second is served the time-refreshed feed and must yield exactly one post
    whose text is the title, a blank line, then the body excerpt.
    """
    target = "https://www.ruanyifeng.com/blog/atom.xml"
    route = respx.get(target)

    # First poll: served the "-0" feed; nothing is expected to be reported.
    route.mock(return_value=Response(200, text=get_file("rss-ruanyifeng-0.xml")))
    first_poll = await rss.fetch_new_post(target, [user_info_factory([], [])])
    assert len(first_poll) == 0

    # Second poll: the same route now serves the feed with a fresh timestamp.
    route.mock(return_value=Response(200, text=update_time_feed_2))
    second_poll = await rss.fetch_new_post(target, [user_info_factory([], [])])
    posts = second_poll[0][1]
    assert len(posts) == 1

    post = posts[0]
    assert post.url == "http://www.ruanyifeng.com/blog/2023/05/weekly-issue-255.html"
    assert post.text == "科技爱好者周刊(第 255 期):对待 AI 的正确态度\n\n这里记录每周值得分享的科技内容,周五发布。..."
@pytest.fixture
def update_time_feed_3():
    """Return ``rss-github-atom.xml`` with a refreshed ``updated`` timestamp.

    The second ``updated`` element in document order (any XML namespace) is
    overwritten with the current GMT time, and the document is serialised back
    to a string. NOTE(review): presumably index 0 is the feed-level timestamp
    and index 1 belongs to the first entry -- confirm against the fixture file.
    """
    feed_root = ET.fromstring(get_file("rss-github-atom.xml"))

    gmt = pytz.timezone("GMT")
    now_text = datetime.now(gmt).strftime("%a, %d %b %Y %H:%M:%S %Z")

    # "{*}" is the namespace wildcard, so the tag matches in any namespace.
    updated_elements = feed_root.findall(".//{*}updated")
    updated_elements[1].text = now_text

    return ET.tostring(feed_root, encoding="unicode")
@pytest.mark.asyncio
@respx.mock
async def test_fetch_new_3(
    rss,
    user_info_factory,
    update_time_feed_3,
):
    """Case: entries carry only ``<updated>`` and no ``<published>``.

    The first fetch is served the ``-0`` feed and must yield nothing; the
    second is served the time-refreshed feed and must yield exactly one post
    with the expected URL and text.
    """
    target = "https://github.com/R3nzTheCodeGOD/R3nzSkin/releases.atom"
    route = respx.get(target)

    # First poll: served the "-0" feed; nothing is expected to be reported.
    route.mock(return_value=Response(200, text=get_file("rss-github-atom-0.xml")))
    first_poll = await rss.fetch_new_post(target, [user_info_factory([], [])])
    assert len(first_poll) == 0

    # Second poll: the same route now serves the feed with a fresh timestamp.
    route.mock(return_value=Response(200, text=update_time_feed_3))
    second_poll = await rss.fetch_new_post(target, [user_info_factory([], [])])
    posts = second_poll[0][1]
    assert len(posts) == 1

    post = posts[0]
    assert post.url == "https://github.com/R3nzTheCodeGOD/R3nzSkin/releases/tag/v3.0.9"
    assert post.text == "R3nzSkin\n\nNo content."
@pytest.mark.asyncio
@respx.mock
async def test_fetch_new_4(
    rss,
    user_info_factory,
):
    """Case: the feed carries no date information.

    The first fetch is served ``rss-top5-old.xml`` and must yield nothing; the
    second is served ``rss-top5-new.xml`` and must yield exactly one post with
    the expected URL and text.
    """
    target = "https://rsshub.app/wallhaven/hot?limit=5"
    route = respx.get(target)

    # First poll: served the "old" feed; nothing is expected to be reported.
    route.mock(return_value=Response(200, text=get_file("rss-top5-old.xml")))
    first_poll = await rss.fetch_new_post(target, [user_info_factory([], [])])
    assert len(first_poll) == 0

    # Second poll: the same route now serves the "new" feed.
    route.mock(return_value=Response(200, text=get_file("rss-top5-new.xml")))
    second_poll = await rss.fetch_new_post(target, [user_info_factory([], [])])
    posts = second_poll[0][1]
    assert len(posts) == 1

    post = posts[0]
    assert post.url == "https://wallhaven.cc/w/85rjej"
    assert post.text == "85rjej.jpg"