summary refs log tree commit diff stats
path: root/Sole.py
diff options
context:
space:
mode:
author Marco Andronaco <andronacomarco@gmail.com> 2023-07-12 09:14:36 +0200
committer Marco Andronaco <andronacomarco@gmail.com> 2023-07-12 09:14:36 +0200
commit 733a0a23988fb074c93e6c398d8b9142ee180b29 (patch)
tree 6a06a08378b0744474d45d8692ac1ce17bec7859 /Sole.py
download sunstroke-733a0a23988fb074c93e6c398d8b9142ee180b29.tar.gz
initial commit
Diffstat (limited to 'Sole.py')
-rw-r--r-- Sole.py 76
1 files changed, 76 insertions, 0 deletions
diff --git a/Sole.py b/Sole.py
new file mode 100644
index 0000000..717091c
--- /dev/null
+++ b/Sole.py
@@ -0,0 +1,76 @@
+import feedparser
+from html.parser import HTMLParser
+from datetime import datetime
+from re import compile
+
+# Number of leading link groups that parse_entry drops from every entry.
+N_LINKS_TO_REMOVE = 2
+# Captures the run of digits and dots found between parentheses; parse_entry
+# applies it to the entry title and parses the capture as "%d.%m.%Y".
+# Written as a raw string because backslash-paren, backslash-d and
+# backslash-dot are invalid escape sequences in a normal string literal.
+REGEX_DATE = compile(r"\(([\d\.]*)\)")
+# Remote address of the RSS feed.
+OVERPOST_URL = "https://overpost.biz/e-books/quotidiani/rss.xml"
+
+def add_or_update(dictionary, key, value):
+    """Append ``value`` to the list stored under ``key`` in ``dictionary``.
+
+    A key that is not present yet is first given a new list, so after the
+    call ``value`` is always the last element of ``dictionary[key]``.
+    """
+    dictionary.setdefault(key, []).append(value)
+
+class PostParser(HTMLParser):
+    """Collect the hyperlinks of an HTML fragment, grouped by their text.
+
+    Text found directly inside an ``<a>`` element is normalised (underscores
+    become spaces, everything from the first " - " onwards is dropped) and
+    used as a key of ``links``. The value is the list of ``href`` values
+    recorded for that key, in document order.
+    """
+
+    def __init__(self):
+        super().__init__()
+        self.links = {}
+        # Elements that are currently open, outermost first. A real stack is
+        # needed: remembering a single previous tag loses track as soon as an
+        # element is nested inside a link or a tag is never closed.
+        self._open_tags = []
+        self.prev_tag = None
+        self.current_tag = None
+        self.current_link = None
+
+    def _sync_tags(self):
+        """Mirror the top of the open-element stack into the public fields."""
+        depth = len(self._open_tags)
+        self.current_tag = self._open_tags[-1] if depth else None
+        self.prev_tag = self._open_tags[-2] if depth > 1 else None
+
+    def handle_starttag(self, tag, attrs):
+        """Track the opened element and remember the target of an ``<a>``."""
+        if tag == "br":
+            return
+        self._open_tags.append(tag)
+        self._sync_tags()
+        if tag == "a":
+            # Always take the href of *this* anchor (None when it has none)
+            # so a link never inherits the target of an earlier one.
+            self.current_link = dict(attrs).get("href")
+
+    def handle_endtag(self, tag):
+        """Close ``tag`` together with any unclosed elements nested in it."""
+        # End tags that match nothing open are ignored; this includes the one
+        # HTMLParser synthesises for a self-closing "<br/>".
+        if tag not in self._open_tags:
+            return
+        # Elements above the match were never closed (e.g. "<img>").
+        while self._open_tags.pop() != tag:
+            pass
+        self._sync_tags()
+
+    def handle_data(self, data):
+        """Record ``data`` as a link title when it sits directly in an ``<a>``."""
+        if self.current_tag == "a":
+            key = data.replace("_", " ").split(" - ")[0]
+            value = self.current_link
+            add_or_update(self.links, key, value)
+
+    def get_links(self):
+        """Return a shallow copy of the collected title -> links mapping."""
+        return self.links.copy()
+
+    
+def parse_html(html):
+    """Parse ``html`` with a fresh PostParser and return the links it found.
+
+    The result is the mapping produced by ``PostParser.get_links()``.
+    """
+    parser = PostParser()
+    parser.feed(html)
+    # feed() may hold back trailing text; close() forces it to be processed.
+    parser.close()
+    return parser.get_links()
+
+def remove_first(d):
+    """Remove the first entry of ``d`` (in iteration order) and return it.
+
+    Returns a ``(key, value)`` tuple. Raises ``StopIteration`` when ``d`` is
+    empty.
+    """
+    first_key = next(iter(d))
+    first_value = d.pop(first_key)
+    return first_key, first_value
+
+def remove_first_n(d, n):
+    """Delete up to ``n`` leading entries (in iteration order) from ``d``.
+
+    When ``d`` holds fewer than ``n`` entries it is simply emptied instead of
+    leaking a ``StopIteration``. A non-positive ``n`` removes nothing.
+    """
+    # Snapshot the keys first: a dict must not change size while iterated.
+    # max() keeps a negative n from turning into a "from the end" slice.
+    for key in list(d)[:max(n, 0)]:
+        del d[key]
+
+def parse_entry(entry): # entry = day
+    """Turn one feed entry into a ``(date, links)`` pair.
+
+    The date is the first parenthesised match of ``REGEX_DATE`` in
+    ``entry.title``, parsed as ``%d.%m.%Y``. The links come from
+    ``entry.turbo_content`` with the first ``N_LINKS_TO_REMOVE`` groups
+    dropped. Raises ``IndexError`` when the title contains no match.
+    """
+    raw_date = REGEX_DATE.findall(entry.title)[0]
+    day_links = parse_html(entry.turbo_content)
+    remove_first_n(day_links, N_LINKS_TO_REMOVE)
+    parsed_day = datetime.strptime(raw_date, "%d.%m.%Y")
+    return parsed_day, day_links
+
+def get_links(rss_url):
+    """Parse the feed at ``rss_url`` and return one result per entry.
+
+    Each element of the returned list is what ``parse_entry`` produces for
+    the corresponding feed entry, in feed order.
+    """
+    parsed_entries = []
+    for entry in feedparser.parse(rss_url).entries:
+        parsed_entries.append(parse_entry(entry))
+    return parsed_entries
+
+def get_sole():
+    """Return the "Il Sole 24 Ore" links found in the feed's second entry.
+
+    The result maps every title starting with "Il Sole 24 Ore" to its list
+    of links.
+    """
+    entries = get_links(OVERPOST_URL)
+    # NOTE(review): index 1 is the second feed entry, not the first --
+    # confirm this is the intended day.
+    _, day_links = entries[1]
+    selected = {}
+    for title, urls in day_links.items():
+        if title.startswith("Il Sole 24 Ore"):
+            selected[title] = urls
+    return selected
+
+if __name__ == "__main__":
+    # Manual runs read a local copy of the feed. The override sits inside the
+    # guard so that merely importing this module keeps the OVERPOST_URL
+    # defined near the top of the file.
+    OVERPOST_URL = r"/home/marco/Documenti/overpost/rss.xml"
+    print(get_sole())