summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r--README.org60
-rw-r--r--draco.111
-rwxr-xr-xdraco.pl124
3 files changed, 161 insertions, 34 deletions
diff --git a/README.org b/README.org
index 3e1f470..34ad4c4 100644
--- a/README.org
+++ b/README.org
@@ -3,18 +3,13 @@
 #+TITLE: Draco
 
 Draco is a script to convert reddit thread to Org document. It accepts a
-url & prints the Org document to STDOUT.
-
-It'll also print comments along with their replies. It's limited by
-the reddit API.
+url & prints the Org document to STDOUT. It'll also print comments along
+with their replies.
 
 | Project Home    | [[https://andinus.nand.sh/draco/][Draco]]           |
 | Source Code     | [[https://git.tilde.institute/andinus/draco/][Andinus / Draco]] |
 | GitHub (Mirror) | [[https://github.com/andinus/draco/][Draco - GitHub]]  |
 
-*Tested on*:
-- OpenBSD 6.8 (Perl v5.30.3)
-
 * Why?
 I reference things from the web in my Journal & don't want those links
 to break so I save them locally. Previously I used to manually archive
@@ -29,11 +24,13 @@ This was recorded with =asciinema(1)=.
 + Draco 2020-11-19: https://asciinema.org/a/373851
   - alt-link: https://andinus.nand.sh/static/draco/2020-11-19.cast
 * Installation
-#+BEGIN_SRC sh
-# Clone the project.
-git clone https://git.tilde.institute/andinus/draco && \
-    cd draco
+Follow these instructions to get draco & then install the dependencies,
+they're listed below. All dependencies are in Debian & Fedora
+repositories.
 
+Check the /News/ section before updating or downloading the latest release.
+** Dependencies
+#+BEGIN_SRC sh
 # Install dependencies. (OpenBSD)
 doas pkg_add p5-Unicode-LineBreak p5-JSON-MaybeXS
 cpan install HTTP::Tiny
@@ -44,17 +41,38 @@ sudo apt install libunicode-linebreak-perl libjson-maybexs-perl \
 
 # Install dependencies. (Fedora - dnf based)
 sudo dnf install perl-JSON-MaybeXS perl-HTTP-Tiny perl-Unicode-LineBreak
+#+END_SRC
+** v0.1.3
+#+BEGIN_SRC sh
+# Get the release tar.
+curl -O \
+     https://git.tilde.institute/andinus/draco/snapshot/draco-0.1.3.tar.gz
+
+# Extract the files.
+tar xzf draco-0.1.3.tar.gz
+
+# `cd' into the directory.
+cd draco-0.1.3/
 
 # Install. (Use sudo if doas is not present)
 doas make install
 #+END_SRC
-* Dependencies
-** Unicode::LineBreak
-This is used to wrap the text at 76 columns. Draco can work without this
-module.
-** JSON::MaybeXS
-This module is required for Draco to work, it can be replaced with any
-other module that parses JSON & produces same data structure.
-** =HTTP::Tiny=
-Draco can work without this module if you can get the file some other
-way.
+* News
+** v0.2.0  2020-11-23
+This version makes the script a lot more complex. If you download only
+small threads then this update is not required.
+
+Previous version (v0.1.3) might throw some errors on threads that have
+comments hidden behind "load more comments" but the rest of thread will
+be saved.
+
+This version will load all those comments hidden behind "load more
+comments". But not those hidden behind "continue this thread". This is a
+known bug.
+
++ Add "[S]" after submitter's comments.
++ Print comments hidden under "load more comments".
++ Document environment variables in manual.
++ Add "limit=500" & "sort=top" to all posts/comments.
++ Print more information when debug is on.
++ Add help option.
diff --git a/draco.1 b/draco.1
index 797252b..a9d2856 100644
--- a/draco.1
+++ b/draco.1
@@ -22,6 +22,14 @@ The options are as follows:
 Turn on debug messages. Debug messages will be printed to STDERR.
 .It Fl v
 Print version.
+.It Fl h
+Print this help.
+.Pp
+.Sh ENVIRONMENT VARIABLES
+.Bl -tag -width Ds
+.It FETCH_ALL
+Fetch comments hidden behind `load more comments'. This makes multiple
+HTTP calls to reddit; `continue this thread' comments are still skipped.
 .Sh NOTES
 Draco will add 2 spaces before every new line. Comments/Posts may
 contain `*' at the start of line & that confuses Org. Org might
@@ -32,6 +40,9 @@ every new line the maximum number of columns becomes 78.
 .Pp
 The date of archive is saved under :PROPERTIES: of the post as
 :ARCHIVE_DATE:.
+.Pp
+Total number of top level comments is saved under :PROPERTIES: of the
+post as :TOTAL_TOP_LEVEL_COMMENTS:.
 .Sh WHY?
 I reference things from the web in my Journal & don't want those links
 to break so I save them locally. Previously I used to manually archive
diff --git a/draco.pl b/draco.pl
index 08779a1..b33fb62 100755
--- a/draco.pl
+++ b/draco.pl
@@ -15,39 +15,64 @@ my $lb = Unicode::LineBreak->new(ColMax => 76); # Default is 76.
 # Printing UTF-8 to STDOUT.
 binmode(STDOUT, "encoding(UTF-8)");
 
-die "usage: draco [-dv] <url>\n" unless scalar @ARGV;
+die "usage: draco [-dhv] <url>\n" unless scalar @ARGV;
 
 my $DEBUG;
-my $VERSION = "v0.1.3";
+my $VERSION = "v0.2.0";
 # Dispatch table to be parsed before url.
 my %dispatch = (
     '-v'  => sub { print "Draco $VERSION\n"; exit; },
     '-d'  => sub { $DEBUG = 1; print STDERR "draco: debug on.\n"; },
+    '-h'  => sub { print qq{Draco $VERSION
+
+Options:
+    -d
+        Turn on debug messages. Debug messages will be printed to
+        STDERR.
+    -h
+        Print this help.
+    -v
+        Print version.
+
+Environment Variables:
+    FETCH_ALL
+        Fetch all comments. This will make multiple HTTP calls to
+        reddit. This doesn't fetch *all* the comments.
+};
+                   exit;
+               },
 );
 if (exists $dispatch{$ARGV[0]}) {
     # shift @ARGV to get $url in next shift.
     $dispatch{shift @ARGV}->();
 }
 
-# $url contains the reddit post.
+# $url contains the reddit post. Raise the limit to 500 comments which
+# is the maximum reddit allows.
 my $url = shift @ARGV;
-my $json_url = "${url}.json";
+my $json_url = "${url}.json?limit=500&sort=top";
 
 my $http = HTTP::Tiny->new( verify_SSL => 1 );
 
 # Fetch the post.
-print STDERR "draco: fetching `$json_url'.\n" if $DEBUG;
-my $response = $http->get($json_url);
-die "Unexpected response - $response->{status}: $response->{reason}"
-    unless $response->{success};
+print STDERR "fetching `$json_url'.\n" if $DEBUG;
+my $response = get_response($json_url);
 
 # Decode json.
-print STDERR "draco: decoding json response.\n" if $DEBUG;
+print STDERR "decoding json response.\n" if $DEBUG;
 my $json_data = decode_json($response->{content});
 
 # $post contains post data
 my $post = $json_data->[0]->{data}->{children}->[0]->{data};
 
+# $comments contains comment data. We are interested in: replies,
+# author, body, created_utc & permalink.
+my $comments = $json_data->[1]->{data}->{children};
+
+# Print total top-level comments.
+print STDERR "total top-level comments: ",
+    scalar($comments->@*), "\n" if $DEBUG;
+
 # Start the Org document.
 print "#+", "STARTUP:content\n";
 
@@ -66,8 +91,9 @@ foreach my $detail (qw( subreddit created_utc author permalink
     print ":${detail}: =$post->{$detail}=\n"
         if scalar $post->{$detail};
 }
-# Include the archive date in properties.
+# Include the archive date & total top-level comments in properties.
 print ":ARCHIVE_DATE: $date\n";
+print ":TOTAL_TOP_LEVEL_COMMENTS: ", scalar($comments->@*), "\n";
 print ":END:\n";
 
 # Add selftext if present.
@@ -77,20 +103,87 @@ print "\n#+BEGIN_SRC markdown\n",
     "#+END_SRC\n"
     if scalar $post->{selftext};
 
-# $comments contains comment data. We are interested in: replies,
-# author, body, created_utc & permalink.
-my $comments = $json_data->[1]->{data}->{children};
+my (@http_calls, @shell_comments, %counter);
+$counter{skipped_due_to_more} = 0;
+$counter{print_comment_chain_call} = 0;
+
 # Iterate over top-level comments.
 foreach my $comment ($comments->@*) {
+    if ($comment->{kind} eq "more"
+        and $comment->{data}->{id} eq "_") {
+        $counter{skipped_due_to_more}++;
+        next;
+    }
     print_comment_chain($comment->{data}, 0);
 }
 
+print STDERR "total http calls: ",
+    scalar @http_calls, "\n" if $DEBUG;
+print STDERR "total shell comments: ",
+    scalar @shell_comments, "\n" if $DEBUG and scalar @shell_comments;
+print STDERR "total print_comment_chain calls: ",
+    $counter{print_comment_chain_call}, "\n" if $DEBUG;
+
+# This is equivalent to "continue this thread ->" we see on
+# old.reddit.com threads.
+print STDERR "total comments skipped due to more: ",
+    $counter{skipped_due_to_more}, "\n" if $DEBUG;
+
+sub get_response {
+    my $url = shift @_;
+    my $response = $http->get($url);
+    push @http_calls, $url;
+    die "Unexpected response - $response->{status}: $response->{reason} : $url"
+        unless $response->{success};
+    return $response;
+}
+
 # print_comment_chain will print the whole chain of comment while
 # accounting for level.
 sub print_comment_chain {
     my $comment = shift @_;
     my $level = shift @_;
 
+    $counter{print_comment_chain_call}++;
+
+    # $comment->{author} & $comment->{body} not being present means
+    # that it's a shell comment. We can get it by making another HTTP
+    # call.
+    unless ($comment->{author}) {
+        push @shell_comments, $comment->{id};
+        return unless $ENV{FETCH_ALL};
+        unless ( eval {
+            my $json_url = "${url}/$comment->{id}.json?limit=500&sort=top";
+
+            # Fetch the comment.
+            my $response = get_response($json_url);
+
+            # Decode json.
+            my $json_data = decode_json($response->{content});
+
+            # $comments contains comment data. We are interested in: replies,
+            # author, body, created_utc & permalink.
+            my $comments = $json_data->[1]->{data}->{children};
+
+            foreach my $comment ($comments->@*) {
+                if ($comment->{kind} eq "more"
+                    and $comment->{data}->{id} eq "_") {
+                    $counter{skipped_due_to_more}++;
+                    next;
+                }
+                print_comment_chain($comment->{data}, $level);
+            }
+
+            return 1;
+        } ) {
+            print STDERR "parsing shell comment: $comment->{id} : failed\n";
+        }
+
+        # This comment thread has been parsed, move on to the next
+        # one.
+        return;
+    }
+
     print "*" x ($level + 2), " ", "$comment->{author}";
     print " [S]" if $comment->{is_submitter};
     print "\n";
@@ -114,6 +207,11 @@ sub print_comment_chain {
     # If the comment has replies then iterate over those too.
     if (scalar $comment->{replies}) {
         foreach my $reply ($comment->{replies}->{data}->{children}->@*) {
+            if ($reply->{kind} eq "more"
+                and $reply->{data}->{id} eq "_") {
+                $counter{skipped_due_to_more}++;
+                next;
+            }
             print_comment_chain($reply->{data}, $level + 1);
         }
     }