From e43c4edea411fd2c63d7460d727410a7e90dde31 Mon Sep 17 00:00:00 2001
From: felinae98 <731499577@qq.com>
Date: Wed, 17 Feb 2021 22:55:57 +0800
Subject: [PATCH] optimise weibo post

---
Notes (ignored by git-am):
  * weibo.py: adds Weibo._get_text(), which turns '<br />' into '\n' before
    stripping the remaining HTML with BeautifulSoup, so line breaks survive.
  * weibo.py: Weibo.parse() now re-fetches the status from
    https://m.weibo.cn/detail/<mid> when the post is flagged isLongText or
    has more than 9 pictures, and parses the embedded "status" JSON from it.
  * post.py: Post.__str__() no longer truncates the text to 50 characters.
  * NOTE(review): re.search(...) result is used without a None check, and
    bs(text) is called without an explicit parser -- worth a follow-up.

 src/plugins/hk_reporter/platform/weibo.py | 12 +++++++++++-
 src/plugins/hk_reporter/post.py           |  2 +-
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/src/plugins/hk_reporter/platform/weibo.py b/src/plugins/hk_reporter/platform/weibo.py
index 676dd72..07a0392 100644
--- a/src/plugins/hk_reporter/platform/weibo.py
+++ b/src/plugins/hk_reporter/platform/weibo.py
@@ -1,6 +1,7 @@
 from collections import defaultdict
 from datetime import datetime
 import json
+import re
 import time
 from typing import Any, Optional
 
@@ -64,10 +65,19 @@ class Weibo(Platform):
             return Category(2)
         else:
             return Category(3)
+
+    def _get_text(self, raw_text: str) -> str:
+        text = raw_text.replace('<br />', '\n')
+        return bs(text).text
 
     async def parse(self, raw_post: RawPost) -> Post:
         info = raw_post['mblog']
-        parsed_text = bs(info['text'], 'html.parser').text
+        if info['isLongText'] or info['pic_num'] > 9:
+            async with httpx.AsyncClient() as client:
+                res = await client.get('https://m.weibo.cn/detail/{}'.format(info['mid']))
+                full_json_text = re.search(r'"status": ([\s\S]+),\s+"hotScheme"', res.text).group(1)
+                info = json.loads(full_json_text)
+        parsed_text = self._get_text(info['text'])
         pic_urls = [img['large']['url'] for img in info.get('pics', [])]
         detail_url = 'https://weibo.com/{}/{}'.format(info['user']['id'], info['bid'])
         # return parsed_text, detail_url, pic_urls
diff --git a/src/plugins/hk_reporter/post.py b/src/plugins/hk_reporter/post.py
index 63cc46c..ed74300 100644
--- a/src/plugins/hk_reporter/post.py
+++ b/src/plugins/hk_reporter/post.py
@@ -25,4 +25,4 @@ class Post:
         return res
 
     def __str__(self):
-        return 'type: {}\ntext: {}\nurl: {}\npic: {}'.format(self.target_type, self.text[:50], self.url, ','.join(map(lambda x: 'b64img' if x.startswith('base64') else x, self.pics)))
+        return 'type: {}\ntext: {}\nurl: {}\npic: {}'.format(self.target_type, self.text, self.url, ','.join(map(lambda x: 'b64img' if x.startswith('base64') else x, self.pics)))