summary refs log tree commit diff stats
diff options
context:
space:
mode:
authorAndinus <andinus@nand.sh>2020-11-24 15:27:57 +0530
committerAndinus <andinus@nand.sh>2020-11-24 15:27:57 +0530
commitfbbc7de1089859c02d163b20adf9279c1dc6db5a (patch)
tree1965fcb54f5ec9b616f0048f30e11ad6bd4b8995
parent144369a429d51959e0952e0a26633379c8826138 (diff)
downloaddraco-fbbc7de1089859c02d163b20adf9279c1dc6db5a.tar.gz
Handle all kinds of comments in iterate_over_comments
iterate_over_comments() is a sub to handle all comments.
print_comment_chain() can only parse first kind of comments now, i.e.
top-level comments & their replies.

Comments hidden under "load more comments" & "continue this thread"
require other operations before print_comment_chain() can print them.
All those operations are handled by iterate_over_comments().

Currently "continue this thread" is skipped but "load more comments"
is parsed. This doesn't add anything to previous releases but just
changes the structure.

I'm just preparing to add code to parse comments hidden under
"continue this thread". After that Draco will be able to archive the
whole thread. Previously handling this would've required copying the
same code over 3 times, now it'll be handled at one place.

3 times would've been:

    1. when we start iterating over top-level comments.
    2. when comments hidden under "load more comments" are parsed &
    their child is hidden under "continue this thread".
    3. when replies of normal comments are hidden under "continue this
    thread".

Case 1 & 2 is fixed by switching to iterate_over_comments(). For
third, the replies are handled by iterate_over_comments() so the 3rd
case is also fixed.
-rwxr-xr-xdraco.pl208
1 files changed, 128 insertions, 80 deletions
diff --git a/draco.pl b/draco.pl
index 7d3c75c..4a85384 100755
--- a/draco.pl
+++ b/draco.pl
@@ -2,12 +2,11 @@
 
 use strict;
 use warnings;
+use Time::Piece;
 
 use HTTP::Tiny;
 use JSON::MaybeXS;
 
-use POSIX qw(strftime);
-
 # For wrapping comment blocks.
 use Unicode::LineBreak;
 my $lb = Unicode::LineBreak->new(ColMax => 76); # Default is 76.
@@ -47,6 +46,10 @@ if (exists $dispatch{$ARGV[0]}) {
     $dispatch{shift @ARGV}->();
 }
 
+# This is the start time.
+my $start_time = time;
+my $last_log = $start_time;
+
 # $url contains the reddit post. Raise the limit to 500 comments which
 # is the maximum reddit allows.
 my $url = shift @ARGV;
@@ -55,6 +58,7 @@ my $json_url = "${url}.json?limit=500&sort=top";
 my $http = HTTP::Tiny->new( verify_SSL => 1 );
 
 # Fetch the post.
+print_time() if $DEBUG;
 print STDERR "fetching `$json_url'.\n" if $DEBUG;
 my $response = get_response($json_url);
 
@@ -69,16 +73,12 @@ my $post = $json_data->[0]->{data}->{children}->[0]->{data};
 # author, body, created_utc & permalink.
 my $comments = $json_data->[1]->{data}->{children};
 
-# Print total top-level comments.
-print STDERR "total top-level comments: ",
-    scalar($comments->@*), "\n" if $DEBUG;
-
 # Start the Org document.
 print "#+", "STARTUP:content\n";
 
 # Print the date.
-my $date = strftime '%+', localtime();
-print "#+", "DATE: $date\n";
+my $current_date = Time::Piece->new->strftime('%+');
+print "#+", "DATE: $current_date\n";
 print "\n";
 
 # Print the post title & it's link.
@@ -86,14 +86,19 @@ print "* ", "[[$post->{url}][$post->{title}]]\n";
 
 # Add various details to :PROPERTIES:.
 print ":PROPERTIES:\n";
+# Include the created date, archive date & total top-level comments in
+# properties.
+print ":CREATED_UTC: ",
+    Time::Piece->strptime($post->{created_utc}, '%s')
+    ->strftime('%+'), "\n";
+
+print ":ARCHIVE_DATE: $current_date\n";
+print ":TOTAL_TOP_LEVEL_COMMENTS: ", scalar($comments->@*), "\n";
 foreach my $detail (qw( subreddit created_utc author permalink
                         upvote_ratio ups downs score )) {
     print ":${detail}: =$post->{$detail}=\n"
         if scalar $post->{$detail};
 }
-# Include the archive date & total top-level comments in properties.
-print ":ARCHIVE_DATE: $date\n";
-print ":TOTAL_TOP_LEVEL_COMMENTS: ", scalar($comments->@*), "\n";
 print ":END:\n";
 
 # Add selftext if present.
@@ -106,29 +111,40 @@ print "\n#+BEGIN_SRC markdown\n",
 my (@http_calls, @shell_comments, %counter);
 $counter{skipped_due_to_more} = 0;
 $counter{print_comment_chain_call} = 0;
+$counter{iterate_over_comments_call} = 0;
 
-# Iterate over top-level comments.
-foreach my $comment ($comments->@*) {
-    if ($comment->{kind} eq "more"
-        and $comment->{data}->{id} eq "_") {
-        $counter{skipped_due_to_more}++;
-        next;
-    }
-    print_comment_chain($comment->{data}, 0);
-}
+print_time() if $DEBUG;
+print STDERR "iterating over top-level comments.\n" if $DEBUG;
+# Iterate over top-level comments. The second argument is level
+# (depth), it should be 0 for top-level comments.
+iterate_over_comments($comments, 0);
+print_time() if $DEBUG;
 
+# Print important stats.
+print STDERR "\n" if $DEBUG;
 print STDERR "total http calls: ",
     scalar @http_calls, "\n" if $DEBUG;
+print STDERR "total top-level comments: ",
+    scalar($comments->@*), "\n" if $DEBUG;
 print STDERR "total shell comments: ",
     scalar @shell_comments, "\n" if $DEBUG and scalar @shell_comments;
 print STDERR "total print_comment_chain calls: ",
     $counter{print_comment_chain_call}, "\n" if $DEBUG;
+print STDERR "total iterate_over_comments calls: ",
+    $counter{iterate_over_comments_call}, "\n" if $DEBUG;
 
 # This is equivalent to "continue this thread ->" we see on
 # old.reddit.com threads.
 print STDERR "total comments skipped due to more: ",
     $counter{skipped_due_to_more}, "\n" if $DEBUG;
 
+sub print_time {
+    print STDERR "    ";
+    print STDERR "time since [start, last log]: [", time - $start_time,
+        ", ", time - $last_log, "] seconds\n";
+    $last_log = time;
+}
+
 sub get_response {
     my $url = shift @_;
     my $response = $http->get($url);
@@ -138,14 +154,93 @@ sub get_response {
     return $response;
 }
 
-# There are 3 kind of comments.
-#
-# 1. normal comments (includes top-level comments).
-# 2. comments hidden under "load more comments".
-# 3. comments hidden under "continue this thread".
+# First argument requires $comments & second is the level (depth).
+sub iterate_over_comments {
+    my $comments = shift @_;
+    my $level = shift @_;
+
+    $counter{iterate_over_comments_call}++;
+
+    foreach my $comment ($comments->@*) {
+        my $comment_data = $comment->{data};
+
+        # There are 3 kind of comments.
+        #
+        # 1. normal comments (includes top-level comments & replies).
+        # 2. comments hidden under "load more comments".
+        # 3. comments hidden under "continue this thread".
+
+        # We will be dealing with them in reverse order, i.e. 3rd ->
+        # 2nd -> 1st.
+
+        # This comment we are skipping is the third kind of comment,
+        # i.e. comment hidden under "continue this thread". We can't
+        # parse it yet.
+        if ($comment->{kind} eq "more"
+            and $comment_data->{id} eq "_") {
+            $counter{skipped_due_to_more}++;
+            next;
+        }
+
+        # These are second kind of comments, i.e. comments hidden
+        # under "load more comments". We can get it by making another
+        # HTTP call. This is skipped by default & user has to pass
+        # `FETCH_ALL' to enable it.
+        unless ($comment_data->{author}) {
+            push @shell_comments, $comment_data->{id};
+
+            # Don't proceed unless user has set `FETCH_ALL'.
+            next unless $ENV{FETCH_ALL};
+
+            unless ( eval {
+                # Reddit doesn't like this kind of url:
+                #     http://<reddit>/<post_id>//<comment_id>.json
+                #
+                # It wants this kind of url:
+                #     http://<reddit>/<post_id>/<comment_id>.json
+                #
+                # Notice the extra '/' in first url.
+                my $json_url = $url;
+                $json_url .= "/" unless substr $url, -1 eq "/";
+                $json_url .= "$comment_data->{id}.json?limit=500&sort=top";
+
+                # Fetch the comment.
+                my $response = get_response($json_url);
+
+                # Decode json.
+                my $json_data = decode_json($response->{content});
+
+                # $comments contains comment data. We are interested in: replies,
+                # author, body, created_utc & permalink.
+                my $comments = $json_data->[1]->{data}->{children};
+
+                # Now this is like a normal comment chain, i.e. first
+                # kind of comment. We just have to iterate over it &
+                # pass to print_comment_chain, iterate_over_comments
+                # will handle it.
+                iterate_over_comments($comments, $level);
+                return 1;
+            } ) {
+                my $err = $@;
+                print STDERR "parsing `$comment_data->{id}' failed: $err\n";
+            }
+
+            # This comment thread has been parsed, move on to the text
+            # one.
+            next;
+        }
+
+        # This is first kind of comment, we can pass it directly to
+        # print_comment_chain.
+        print_comment_chain($comment_data, $level);
+    }
+}
+
 
 # print_comment_chain will print the whole chain of comment while
-# accounting for level.
+# accounting for level. It can only parse the first kind of comment,
+# i.e. top-level comments & their replies. To learn about kinds of
+# comments, check iterate_over_comments() subroutine.
 sub print_comment_chain {
     # This was earlier called $comment & was changed to $comment_data
     # to prevent confusion because it is $comment->{data}.
@@ -154,59 +249,18 @@ sub print_comment_chain {
 
     $counter{print_comment_chain_call}++;
 
-    # $comment_data->{author} not being present means that it's a
-    # comment hidden under "load more comments". We can get it by
-    # making another HTTP call.
-    unless ($comment_data->{author}) {
-        push @shell_comments, $comment_data->{id};
-        return unless $ENV{FETCH_ALL};
-        unless ( eval {
-            # It'll fail if we fetch "${url}/$comment_data->{id}.json"
-            # & ${url} already has "/" at the end. So, we check if "/"
-            # is present, if not then we add it.
-            my $json_url = $url;
-            $json_url .= "/" unless substr $url, -1 eq "/";
-            $json_url .= "$comment_data->{id}.json?limit=500&sort=top";
-
-            # Fetch the comment.
-            my $response = get_response($json_url);
-
-            # Decode json.
-            my $json_data = decode_json($response->{content});
-
-            # $comments contains comment data. We are interested in: replies,
-            # author, body, created_utc & permalink.
-            my $comments = $json_data->[1]->{data}->{children};
-
-            foreach my $comment ($comments->@*) {
-                if ($comment->{kind} eq "more"
-                    and $comment->{data}->{id} eq "_") {
-                    $counter{skipped_due_to_more}++;
-                    next;
-                }
-                print_comment_chain($comment->{data}, $level);
-            }
-
-            return 1;
-        } ) {
-            my $err = $@;
-            print STDERR "parsing `$comment_data->{id}' failed: $err\n";
-        }
-
-        # This comment thread has been parsed, move on to the text
-        # one.
-        return;
-    }
-
     print "*" x ($level + 2), " ", "$comment_data->{author}";
     print " [S]" if $comment_data->{is_submitter};
     print "\n";
 
     # Print comment details.
     print ":PROPERTIES:\n";
+    print ":CREATED_UTC: ",
+        Time::Piece->strptime($comment_data->{created_utc}, '%s')
+          ->strftime('%+'), "\n";
     foreach my $detail (qw( created_utc author permalink upvote_ratio
-                            ups downs score edited is_submitter
-                            stickied controversiality )) {
+                            ups downs score edited stickied
+                            controversiality )) {
         print ":${detail}: =$comment_data->{$detail}=\n"
             if scalar $comment_data->{$detail};
     }
@@ -220,13 +274,7 @@ sub print_comment_chain {
 
     # If the comment has replies then iterate over those too.
     if (scalar $comment_data->{replies}) {
-        foreach my $reply ($comment_data->{replies}->{data}->{children}->@*) {
-            if ($reply->{kind} eq "more"
-                and $reply->{data}->{id} eq "_") {
-                $counter{skipped_due_to_more}++;
-                next;
-            }
-            print_comment_chain($reply->{data}, $level + 1);
-        }
+        iterate_over_comments($comment_data->{replies}->{data}->{children},
+                              $level + 1);
     }
 }