summary refs log tree commit diff stats
path: root/draco.pl
diff options
context:
space:
mode:
authorAndinus <andinus@nand.sh>2020-11-23 20:38:44 +0530
committerAndinus <andinus@nand.sh>2020-11-23 20:38:44 +0530
commit48668b05251a69110944df51800e88d01f6e2942 (patch)
tree9fe2debac792c448287c04a3706d355d1f86ec7d /draco.pl
parentd37383dc4c808fa116791cd9becd7a1df41f50ad (diff)
downloaddraco-48668b05251a69110944df51800e88d01f6e2942.tar.gz
Release v0.2.0, load comments hidden under "load more comments"
This version makes the script lot more complex.

This version will load all those comments hidden behind "load more
comments". But not those hidden behind "continue this thread".

Starting this release we have a News section.
Diffstat (limited to 'draco.pl')
-rwxr-xr-xdraco.pl124
1 files changed, 111 insertions, 13 deletions
diff --git a/draco.pl b/draco.pl
index 08779a1..b33fb62 100755
--- a/draco.pl
+++ b/draco.pl
@@ -15,39 +15,64 @@ my $lb = Unicode::LineBreak->new(ColMax => 76); # Default is 76.
 # Printing UTF-8 to STDOUT.
 binmode(STDOUT, "encoding(UTF-8)");
 
-die "usage: draco [-dv] <url>\n" unless scalar @ARGV;
+die "usage: draco [-dhv] <url>\n" unless scalar @ARGV;
 
 my $DEBUG;
-my $VERSION = "v0.1.3";
+my $VERSION = "v0.2.0";
 # Dispatch table to be parsed before url.
 my %dispatch = (
     '-v'  => sub { print "Draco $VERSION\n"; exit; },
     '-d'  => sub { $DEBUG = 1; print STDERR "draco: debug on.\n"; },
+    '-h'  => sub { print qq{Draco $VERSION
+
+Options:
+    -d
+        Turn on debug messages. Debug messages will be printed to
+        STDERR.
+    -h
+        Print this help.
+    -v
+        Print version.
+
+Environment Variables:
+    FETCH_ALL
+        Fetch all comments. This will make multiple HTTP calls to
+        reddit. This doesn't fetch *all* the comments.
+};
+                   exit;
+               },
 );
 if (exists $dispatch{$ARGV[0]}) {
     # shift @ARGV to get $url in next shift.
     $dispatch{shift @ARGV}->();
 }
 
-# $url contains the reddit post.
+# $url contains the reddit post. Raise the limit to 500 comments which
+# is the maximum reddit allows.
 my $url = shift @ARGV;
-my $json_url = "${url}.json";
+my $json_url = "${url}.json?limit=500&sort=top";
 
 my $http = HTTP::Tiny->new( verify_SSL => 1 );
 
 # Fetch the post.
-print STDERR "draco: fetching `$json_url'.\n" if $DEBUG;
-my $response = $http->get($json_url);
-die "Unexpected response - $response->{status}: $response->{reason}"
-    unless $response->{success};
+print STDERR "fetching `$json_url'.\n" if $DEBUG;
+my $response = get_response($json_url);
 
 # Decode json.
-print STDERR "draco: decoding json response.\n" if $DEBUG;
+print STDERR "decoding json response.\n" if $DEBUG;
 my $json_data = decode_json($response->{content});
 
 # $post contains post data
 my $post = $json_data->[0]->{data}->{children}->[0]->{data};
 
+# $comments contains comment data. We are interested in: replies,
+# author, body, created_utc & permalink.
+my $comments = $json_data->[1]->{data}->{children};
+
+# Print total top-level comments.
+print STDERR "total top-level comments: ",
+    scalar($comments->@*), "\n" if $DEBUG;
+
 # Start the Org document.
 print "#+", "STARTUP:content\n";
 
@@ -66,8 +91,9 @@ foreach my $detail (qw( subreddit created_utc author permalink
     print ":${detail}: =$post->{$detail}=\n"
         if scalar $post->{$detail};
 }
-# Include the archive date in properties.
+# Include the archive date & total top-level comments in properties.
 print ":ARCHIVE_DATE: $date\n";
+print ":TOTAL_TOP_LEVEL_COMMENTS: ", scalar($comments->@*), "\n";
 print ":END:\n";
 
 # Add selftext if present.
@@ -77,20 +103,87 @@ print "\n#+BEGIN_SRC markdown\n",
     "#+END_SRC\n"
     if scalar $post->{selftext};
 
-# $comments contains comment data. We are interested in: replies,
-# author, body, created_utc & permalink.
-my $comments = $json_data->[1]->{data}->{children};
+my (@http_calls, @shell_comments, %counter);
+$counter{skipped_due_to_more} = 0;
+$counter{print_comment_chain_call} = 0;
+
 # Iterate over top-level comments.
 foreach my $comment ($comments->@*) {
+    if ($comment->{kind} eq "more"
+        and $comment->{data}->{id} eq "_") {
+        $counter{skipped_due_to_more}++;
+        next;
+    }
     print_comment_chain($comment->{data}, 0);
 }
 
+print STDERR "total http calls: ",
+    scalar @http_calls, "\n" if $DEBUG;
+print STDERR "total shell comments: ",
+    scalar @shell_comments, "\n" if $DEBUG and scalar @shell_comments;
+print STDERR "total print_comment_chain calls: ",
+    $counter{print_comment_chain_call}, "\n" if $DEBUG;
+
+# This is equivalent to "continue this thread ->" we see on
+# old.reddit.com threads.
+print STDERR "total comments skipped due to more: ",
+    $counter{skipped_due_to_more}, "\n" if $DEBUG;
+
+sub get_response {
+    my $url = shift @_;
+    my $response = $http->get($url);
+    push @http_calls, $url;
+    die "Unexpected response - $response->{status}: $response->{reason} : $url"
+        unless $response->{success};
+    return $response;
+}
+
 # print_comment_chain will print the whole chain of comment while
 # accounting for level.
 sub print_comment_chain {
     my $comment = shift @_;
     my $level = shift @_;
 
+    $counter{print_comment_chain_call}++;
+
+    # $comment->{author} & $comment->{body} not being present means
+    # that it's a shell comment. We can get it by making another HTTP
+    # call.
+    unless ($comment->{author}) {
+        push @shell_comments, $comment->{id};
+        return unless $ENV{FETCH_ALL};
+        unless ( eval {
+            my $json_url = "${url}/$comment->{id}.json?limit=500&sort=top";
+
+            # Fetch the comment.
+            my $response = get_response($json_url);
+
+            # Decode json.
+            my $json_data = decode_json($response->{content});
+
+            # $comments contains comment data. We are interested in: replies,
+            # author, body, created_utc & permalink.
+            my $comments = $json_data->[1]->{data}->{children};
+
+            foreach my $comment ($comments->@*) {
+                if ($comment->{kind} eq "more"
+                    and $comment->{data}->{id} eq "_") {
+                    $counter{skipped_due_to_more}++;
+                    next;
+                }
+                print_comment_chain($comment->{data}, $level);
+            }
+
+            return 1;
+        } ) {
+            print STDERR "parsing shell comment: $comment->{id} : failed\n";
+        }
+
+        # This comment thread has been parsed, move on to the text
+        # one.
+        return;
+    }
+
     print "*" x ($level + 2), " ", "$comment->{author}";
     print " [S]" if $comment->{is_submitter};
     print "\n";
@@ -114,6 +207,11 @@ sub print_comment_chain {
     # If the comment has replies then iterate over those too.
     if (scalar $comment->{replies}) {
         foreach my $reply ($comment->{replies}->{data}->{children}->@*) {
+            if ($reply->{kind} eq "more"
+                and $reply->{data}->{id} eq "_") {
+                $counter{skipped_due_to_more}++;
+                next;
+            }
             print_comment_chain($reply->{data}, $level + 1);
         }
     }