Fix unreliable HTML parsing

2024-07-08 01:56:20 +01:00 · 2024-07-08 01:56:20 +01:00 · c618070718
commit c618070718
parent d856c10260
1 changed file with 3 additions and 2 deletions
--- a/src/cogs/onion_feed.py
+++ b/src/cogs/onion_feed.py
@@ -25,7 +25,7 @@ class RSSItem:

 class OnionFeed(commands.Cog):
    SOURCE = "https://www.theonion.com/rss"
-    EPOCH = datetime.datetime(2024, 7, 7, tzinfo=datetime.timezone.utc)
+    EPOCH = datetime.datetime(2024, 7, 1, tzinfo=datetime.timezone.utc)

    def __init__(self, bot):
        self.bot: commands.Bot = bot
@@ -38,6 +38,7 @@ class OnionFeed(commands.Cog):

    @staticmethod
    def parse_item(item: BeautifulSoup) -> RSSItem:
+        description = BeautifulSoup(item.description.get_text(), "html.parser").p.get_text(strip=True).strip()[:-1]
        kwargs = {
            "title": item.title.get_text(strip=True).strip(),
            "link": item.link.get_text(strip=True).strip(),
@@ -45,7 +46,7 @@ class OnionFeed(commands.Cog):
                item.pubDate.get_text(strip=True).strip(), "%a, %d %b %Y %H:%M:%S %Z"
            ),
            "guid": item.guid.get_text(strip=True).strip(),
-            "description": BeautifulSoup(item.description.get_text()).p.get_text(strip=True).strip()[:-1],
+            "description": description,
            "thumbnail": item.find("media:thumbnail")["url"],
        }
        return RSSItem(**kwargs)