diff options
author | Andronaco Marco <marco.andronaco@olivetti.com> | 2023-07-12 13:02:46 +0200 |
---|---|---|
committer | Andronaco Marco <marco.andronaco@olivetti.com> | 2023-07-12 13:02:46 +0200 |
commit | 585855a8728f87cc5383329bd227f6d6ba840aff (patch) | |
tree | 634aa7868a614e45eeab6772a2a5ddf367e74a91 /Overpost.py | |
parent | 733a0a23988fb074c93e6c398d8b9142ee180b29 (diff) | |
download | sunstroke-585855a8728f87cc5383329bd227f6d6ba840aff.tar.gz |
working version
Diffstat (limited to 'Overpost.py')
-rw-r--r-- | Overpost.py | 80 |
1 files changed, 80 insertions, 0 deletions
"""Parse the Overpost RSS feed and extract download links per newspaper.

Each feed entry is one day; its HTML body is a list of <a> links whose
text is "<name> - <something>". Links are grouped by newspaper name.
"""
from html.parser import HTMLParser
from datetime import datetime
import os
import re

import feedparser
from dotenv import load_dotenv

load_dotenv()

# Feed location: env override, else a local rss.xml next to the script.
RSS_URL = os.getenv("RSS_URL") or os.path.join(".", "rss.xml")
# How many leading links of each entry to discard (convert once, here,
# instead of at every parse_entry call; env vars are strings).
N_LINKS_TO_REMOVE = int(os.getenv("N_LINKS_TO_REMOVE") or 2)
# A dotted date inside parentheses in the entry title, e.g. "(12.07.2023)".
# Raw string: "\(" / "\d" in a plain literal are invalid escape sequences
# (SyntaxWarning on modern Python).
REGEX_DATE = re.compile(r"\(([\d.]*)\)")


def add_or_update(dictionary, key, value):
    """Append *value* to the list stored at *key*, creating it if absent."""
    dictionary.setdefault(key, []).append(value)


class PostParser(HTMLParser):
    """Collect hrefs of <a> tags, keyed by the link text's leading part.

    The key is the anchor text with "_" replaced by spaces, truncated at
    the first " - " separator; the value is a list of hrefs seen for it.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        self.links = {}          # key -> [href, ...]
        self.prev_tag = None     # tag seen before the current one
        self.current_tag = None  # tag we are currently inside
        self.current_link = None # href of the most recent <a>

    def handle_starttag(self, tag, attrs):
        # <br> is a void element: it never produces an endtag, so tracking
        # it would desynchronize prev_tag/current_tag.
        if tag == "br":
            return
        self.prev_tag = self.current_tag
        self.current_tag = tag
        if tag == "a":
            for at in attrs:
                if at[0] == "href":
                    self.current_link = at[1]

    def handle_endtag(self, tag):
        # Restore the enclosing tag (one-level "stack" — enough for the
        # flat markup this feed emits).
        self.current_tag = self.prev_tag

    def handle_data(self, data):
        if self.current_tag == "a":
            key = data.replace("_", " ").split(" - ")[0]
            value = self.current_link
            add_or_update(self.links, key, value)

    def get_links(self):
        """Return a shallow copy of the collected links."""
        return self.links.copy()


def parse_html(html):
    """Parse *html* and return its {name: [href, ...]} link mapping."""
    parser = PostParser()
    parser.feed(html)
    return parser.get_links()


def dict_pop(d):
    """Remove and return the first (key, value) pair of *d*."""
    return (k := next(iter(d)), d.pop(k))


def dict_pop_first_n(d, n):
    """Remove and return up to the first *n* (key, value) pairs of *d*.

    Bounded by len(d): popping from an exhausted dict would raise
    StopIteration, which a comprehension converts to RuntimeError.
    """
    return [dict_pop(d) for _ in range(min(n, len(d)))]


def parse_entry(entry):
    """Turn one feed entry (= one day) into (datetime, links dict).

    Raises IndexError if the title carries no "(dd.mm.yyyy)" date.
    """
    date = REGEX_DATE.findall(entry.title)[0]
    links = parse_html(entry.turbo_content)
    # Drop the first links of every post (non-newspaper boilerplate).
    dict_pop_first_n(links, N_LINKS_TO_REMOVE)
    return (datetime.strptime(date, "%d.%m.%Y"), links)


def get_links(rss_url):
    """Fetch *rss_url* and return a list of (date, links) per entry."""
    feed = feedparser.parse(rss_url)
    return [parse_entry(entry) for entry in feed.entries]


def get_newspaper(prefix="", index=0):
    """Return the links of entry *index* whose names start with *prefix*.

    Returns {} when the feed has fewer than index+1 entries.
    """
    links = get_links(RSS_URL)
    try:
        daily = links[index][1]
    except IndexError:
        return {}
    return {k: v for k, v in daily.items() if k.startswith(prefix)}


if __name__ == "__main__":
    print(get_newspaper("Il Sole"))