🐛 Rss 不再删除格式化字符

This commit is contained in:
suyiiyii 2024-08-19 14:40:16 +08:00 committed by felinae98
parent 7d80b44d2a
commit cf38500be7
2 changed files with 21 additions and 7 deletions

View File

@@ -9,7 +9,7 @@ from bs4 import BeautifulSoup as bs
from ..post import Post from ..post import Post
from .platform import NewMessage from .platform import NewMessage
from ..types import Target, RawPost from ..types import Target, RawPost
from ..utils import Site, text_fletten, text_similarity from ..utils import Site, text_similarity
class RssSite(Site): class RssSite(Site):
@@ -32,7 +32,7 @@ class RssPost(Post):
for p in soup.find_all("p"): for p in soup.find_all("p"):
p.insert_after("\n") p.insert_after("\n")
return text_fletten(soup.get_text()) return soup.get_text()
class Rss(NewMessage): class Rss(NewMessage):
@@ -82,7 +82,7 @@ class Rss(NewMessage):
async def parse(self, raw_post: RawPost) -> Post: async def parse(self, raw_post: RawPost) -> Post:
title = raw_post.get("title", "") title = raw_post.get("title", "")
soup = bs(raw_post.description, "html.parser") soup = bs(raw_post.description, "html.parser")
desc = soup.text.strip() desc = raw_post.description
title, desc = self._text_process(title, desc) title, desc = self._text_process(title, desc)
pics = [x.attrs["src"] for x in soup("img")] pics = [x.attrs["src"] for x in soup("img")]
if raw_post.get("media_content"): if raw_post.get("media_content"):

View File

@@ -88,9 +88,21 @@ async def test_fetch_new_1(
assert post1.title is None assert post1.title is None
assert ( assert (
post1.content post1.content
== "【#統合戦略】 引き続き新テーマ「ミヅキと紺碧の樹」の新要素及びシステムの変更点を一部ご紹介します!" == "【#統合戦略】 <br />引き続き新テーマ「ミヅキと紺碧の樹」の新要素及びシステムの変更点を一部ご紹介します! "
" 今回は「灯火」、「ダイス」、「記号認識」、「鍵」についてです。詳細は添付の画像をご確認ください。" "<br /><br />"
"#アークナイツ https://t.co/ARmptV0Zvu" "今回は「灯火」、「ダイス」、「記号認識」、「鍵」についてです。<br />詳細は添付の画像をご確認ください。"
"<br /><br />"
"#アークナイツ https://t.co/ARmptV0Zvu<br />"
'<img src="https://pbs.twimg.com/media/FwZG9YAacAIXDw2?format=jpg&amp;name=orig" />'
)
plain_content = await post1.get_plain_content()
assert (
plain_content == "【#統合戦略】 \n"
"引き続き新テーマ「ミヅキと紺碧の樹」の新要素及びシステムの変更点を一部ご紹介します! \n\n"
"今回は「灯火」、「ダイス」、「記号認識」、「鍵」についてです。\n"
"詳細は添付の画像をご確認ください。\n\n"
"#アークナイツ https://t.co/ARmptV0Zvu\n"
"[图片]"
) )
@@ -174,7 +186,9 @@ async def test_fetch_new_4(
assert len(res2[0][1]) == 1 assert len(res2[0][1]) == 1
post1 = res2[0][1][0] post1 = res2[0][1][0]
assert post1.url == "https://wallhaven.cc/w/85rjej" assert post1.url == "https://wallhaven.cc/w/85rjej"
assert post1.content == "85rjej.jpg" assert post1.content == '<img alt="loading" class="lazyload" src="https://th.wallhaven.cc/small/85/85rjej.jpg" />'
plain_content = await post1.get_plain_content()
assert plain_content == "[图片]"
def test_similar_text_process(): def test_similar_text_process():