diff options
author | Andronaco Marco <marco.andronaco@olivetti.com> | 2023-07-12 13:02:46 +0200 |
---|---|---|
committer | Andronaco Marco <marco.andronaco@olivetti.com> | 2023-07-12 13:02:46 +0200 |
commit | 585855a8728f87cc5383329bd227f6d6ba840aff (patch) | |
tree | 634aa7868a614e45eeab6772a2a5ddf367e74a91 /Overpost.py | |
parent | 733a0a23988fb074c93e6c398d8b9142ee180b29 (diff) | |
download | sunstroke-585855a8728f87cc5383329bd227f6d6ba840aff.tar.gz |
working version
Diffstat (limited to 'Overpost.py')
-rw-r--r-- | Overpost.py | 80 |
1 files changed, 80 insertions, 0 deletions
"""Parse the Overpost RSS feed and extract download links per newspaper.

Each feed entry is one day; its HTML body is a list of <a> links whose
text is "<name> - <something>". Links are grouped by newspaper name.
"""
from html.parser import HTMLParser
from datetime import datetime
import os
import re

import feedparser
from dotenv import load_dotenv

load_dotenv()

# Feed location: env override, else a local rss.xml next to the script.
RSS_URL = os.getenv("RSS_URL") or os.path.join(".", "rss.xml")
# How many leading links of each entry to discard (convert once, here,
# instead of at every parse_entry call; env vars are strings).
N_LINKS_TO_REMOVE = int(os.getenv("N_LINKS_TO_REMOVE") or 2)
# A dotted date inside parentheses in the entry title, e.g. "(12.07.2023)".
# Raw string: "\(" / "\d" in a plain literal are invalid escape sequences
# (SyntaxWarning on modern Python).
REGEX_DATE = re.compile(r"\(([\d.]*)\)")


def add_or_update(dictionary, key, value):
    """Append *value* to the list stored at *key*, creating it if absent."""
    dictionary.setdefault(key, []).append(value)


class PostParser(HTMLParser):
    """Collect hrefs of <a> tags, keyed by the link text's leading part.

    The key is the anchor text with "_" replaced by spaces, truncated at
    the first " - " separator; the value is a list of hrefs seen for it.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        self.links = {}          # key -> [href, ...]
        self.prev_tag = None     # tag seen before the current one
        self.current_tag = None  # tag we are currently inside
        self.current_link = None # href of the most recent <a>

    def handle_starttag(self, tag, attrs):
        # <br> is a void element: it never produces an endtag, so tracking
        # it would desynchronize prev_tag/current_tag.
        if tag == "br":
            return
        self.prev_tag = self.current_tag
        self.current_tag = tag
        if tag == "a":
            for at in attrs:
                if at[0] == "href":
                    self.current_link = at[1]

    def handle_endtag(self, tag):
        # Restore the enclosing tag (one-level "stack" — enough for the
        # flat markup this feed emits).
        self.current_tag = self.prev_tag

    def handle_data(self, data):
        if self.current_tag == "a":
            key = data.replace("_", " ").split(" - ")[0]
            value = self.current_link
            add_or_update(self.links, key, value)

    def get_links(self):
        """Return a shallow copy of the collected links."""
        return self.links.copy()


def parse_html(html):
    """Parse *html* and return its {name: [href, ...]} link mapping."""
    parser = PostParser()
    parser.feed(html)
    return parser.get_links()


def dict_pop(d):
    """Remove and return the first (key, value) pair of *d*."""
    return (k := next(iter(d)), d.pop(k))


def dict_pop_first_n(d, n):
    """Remove and return up to the first *n* (key, value) pairs of *d*.

    Bounded by len(d): popping from an exhausted dict would raise
    StopIteration, which a comprehension converts to RuntimeError.
    """
    return [dict_pop(d) for _ in range(min(n, len(d)))]


def parse_entry(entry):
    """Turn one feed entry (= one day) into (datetime, links dict).

    Raises IndexError if the title carries no "(dd.mm.yyyy)" date.
    """
    date = REGEX_DATE.findall(entry.title)[0]
    links = parse_html(entry.turbo_content)
    # Drop the first links of every post (non-newspaper boilerplate).
    dict_pop_first_n(links, N_LINKS_TO_REMOVE)
    return (datetime.strptime(date, "%d.%m.%Y"), links)


def get_links(rss_url):
    """Fetch *rss_url* and return a list of (date, links) per entry."""
    feed = feedparser.parse(rss_url)
    return [parse_entry(entry) for entry in feed.entries]


def get_newspaper(prefix="", index=0):
    """Return the links of entry *index* whose names start with *prefix*.

    Returns {} when the feed has fewer than index+1 entries.
    """
    links = get_links(RSS_URL)
    try:
        daily = links[index][1]
    except IndexError:
        return {}
    return {k: v for k, v in daily.items() if k.startswith(prefix)}


if __name__ == "__main__":
    print(get_newspaper("Il Sole"))