-rw-r--r--  README.org |  60
-rw-r--r--  draco.1    |  11
-rwxr-xr-x  draco.pl   | 124
3 files changed, 161 insertions(+), 34 deletions(-)
diff --git a/README.org b/README.org
index 3e1f470..34ad4c4 100644
--- a/README.org
+++ b/README.org
@@ -3,18 +3,13 @@
 #+TITLE: Draco
 
 Draco is a script to convert reddit thread to Org document. It accepts a
-url & prints the Org document to STDOUT.
-
-It'll also print comments along with their replies. It's limited by
-the reddit API.
+url & prints the Org document to STDOUT. It'll also print comments along
+with their replies.
 
 | Project Home    | [[https://andinus.nand.sh/draco/][Draco]]                  |
 | Source Code     | [[https://git.tilde.institute/andinus/draco/][Andinus / Draco]] |
 | GitHub (Mirror) | [[https://github.com/andinus/draco/][Draco - GitHub]]   |
 
-*Tested on*:
-- OpenBSD 6.8 (Perl v5.30.3)
-
 * Why?
 I reference things from the web in my Journal & don't want those links
 to break so I save them locally. Previously I used to manually archive
@@ -29,11 +24,13 @@
 This was recorded with =asciinema(1)=.
 + Draco 2020-11-19: https://asciinema.org/a/373851
 - alt-link: https://andinus.nand.sh/static/draco/2020-11-19.cast
 * Installation
-#+BEGIN_SRC sh
-# Clone the project.
-git clone https://git.tilde.institute/andinus/draco && \
-    cd draco
+Follow these instructions to get draco & then install the
+dependencies listed below. All dependencies are in Debian & Fedora
+repositories.
+Check the /News/ section before updating or downloading the latest release.
+** Dependencies
+#+BEGIN_SRC sh
 # Install dependencies. (OpenBSD)
 doas pkg_add p5-Unicode-LineBreak p5-JSON-MaybeXS
 cpan install HTTP::Tiny
@@ -44,17 +41,38 @@ sudo apt install libunicode-linebreak-perl libjson-maybexs-perl \
     libhttp-tiny-perl
 
 # Install dependencies. (Fedora - dnf based)
 sudo dnf install perl-JSON-MaybeXS perl-HTTP-Tiny perl-Unicode-LineBreak
+#+END_SRC
+** v0.1.3
+#+BEGIN_SRC sh
+# Get the release tar.
+curl -O \
+    https://git.tilde.institute/andinus/draco/snapshot/draco-0.1.3.tar.gz
+
+# Extract the files.
+tar xzf draco-0.1.3.tar.gz
+
+# `cd' into the directory.
+cd draco-0.1.3/
 
 # Install. (Use sudo if doas is not present)
 doas make install
 #+END_SRC
-* Dependencies
-** Unicode::LineBreak
-This is used to wrap the text at 76 columns. Draco can work without this
-module.
-** JSON::MaybeXS
-This module is required for Draco to work, it can be replaced with any
-other module that parses JSON & produces same data structure.
-** =HTTP::Tiny=
-Draco can work without this module if you can get the file some other
-way.
+* News
+** v0.2.0 2020-11-23
+This version makes the script a lot more complex. If you only
+download small threads, this update is not required.
+
+The previous version (v0.1.3) might throw some errors on threads that
+have comments hidden behind "load more comments", but the rest of the
+thread will be saved.
+
+This version will load the comments hidden behind "load more
+comments", but not those hidden behind "continue this thread". This is a
+known bug.
+
++ Add "[S]" after submitter's comments.
++ Print comments hidden under "load more comments".
++ Document environment variables in manual.
++ Add "limit=500" & "sort=top" to all posts/comments.
++ Print more information when debug is on.
++ Add help option.
diff --git a/draco.1 b/draco.1
index 797252b..a9d2856 100644
--- a/draco.1
+++ b/draco.1
@@ -22,6 +22,14 @@ The options are as follows:
 Turn on debug messages. Debug messages will be printed to STDERR.
 .It Fl v
 Print version.
+.It Fl h
+Print this help.
+.Pp
+.Sh ENVIRONMENT VARIABLES
+.Bl -tag -width Ds
+.It FETCH_ALL
+Fetch all comments. This will make multiple HTTP calls to reddit. This
+doesn't fetch *all* the comments.
 .Sh NOTES
 Draco will add 2 spaces before every new line. Comments/Posts may
 contain `*' at the start of line & that confuses Org. Org might
@@ -32,6 +40,9 @@ every new line the maximum number of columns becomes 78.
 .Pp
 The date of archive is saved under :PROPERTIES: of the post as
 :ARCHIVE_DATE:.
+.Pp
+The total number of top-level comments is saved under :PROPERTIES: of
+the post as :TOTAL_TOP_LEVEL_COMMENTS:.
 .Sh WHY?
 I reference things from the web in my Journal & don't want those links
 to break so I save them locally. Previously I used to manually archive
diff --git a/draco.pl b/draco.pl
index 08779a1..b33fb62 100755
--- a/draco.pl
+++ b/draco.pl
@@ -15,39 +15,64 @@ my $lb = Unicode::LineBreak->new(ColMax => 76); # Default is 76.
 # Printing UTF-8 to STDOUT.
 binmode(STDOUT, "encoding(UTF-8)");
 
-die "usage: draco [-dv] <url>\n" unless scalar @ARGV;
+die "usage: draco [-dhv] <url>\n" unless scalar @ARGV;
 
 my $DEBUG;
-my $VERSION = "v0.1.3";
+my $VERSION = "v0.2.0";
 # Dispatch table to be parsed before url.
 my %dispatch = (
     '-v' => sub { print "Draco $VERSION\n"; exit; },
     '-d' => sub { $DEBUG = 1; print STDERR "draco: debug on.\n"; },
+    '-h' => sub { print qq{Draco $VERSION
+
+Options:
+    -d
+        Turn on debug messages. Debug messages will be printed to
+        STDERR.
+    -h
+        Print this help.
+    -v
+        Print version.
+
+Environment Variables:
+    FETCH_ALL
+        Fetch all comments. This will make multiple HTTP calls to
+        reddit. This doesn't fetch *all* the comments.
+};
+                  exit;
+    },
 );
 
 if (exists $dispatch{$ARGV[0]}) {
     # shift @ARGV to get $url in next shift.
     $dispatch{shift @ARGV}->();
 }
 
-# $url contains the reddit post.
+# $url contains the reddit post. Raise the limit to 500 comments which
+# is the maximum reddit allows.
 my $url = shift @ARGV;
-my $json_url = "${url}.json";
+my $json_url = "${url}.json?limit=500&sort=top";
 
 my $http = HTTP::Tiny->new( verify_SSL => 1 );
 
 # Fetch the post.
-print STDERR "draco: fetching `$json_url'.\n" if $DEBUG;
-my $response = $http->get($json_url);
-die "Unexpected response - $response->{status}: $response->{reason}"
-    unless $response->{success};
+print STDERR "fetching `$json_url'.\n" if $DEBUG;
+my $response = get_response($json_url);
 
 # Decode json.
-print STDERR "draco: decoding json response.\n" if $DEBUG;
+print STDERR "decoding json response.\n" if $DEBUG;
 my $json_data = decode_json($response->{content});
 
 # $post contains post data
 my $post = $json_data->[0]->{data}->{children}->[0]->{data};
 
+# $comments contains comment data. We are interested in: replies,
+# author, body, created_utc & permalink.
+my $comments = $json_data->[1]->{data}->{children};
+
+# Print total top-level comments.
+print STDERR "total top-level comments: ",
+    scalar($comments->@*), "\n" if $DEBUG;
+
 # Start the Org document.
 print "#+", "STARTUP:content\n";
@@ -66,8 +91,9 @@ foreach my $detail (qw( subreddit created_utc author permalink
     print ":${detail}: =$post->{$detail}=\n" if scalar $post->{$detail};
 }
 
-# Include the archive date in properties.
+# Include the archive date & total top-level comments in properties.
 print ":ARCHIVE_DATE: $date\n";
+print ":TOTAL_TOP_LEVEL_COMMENTS: ", scalar($comments->@*), "\n";
 print ":END:\n";
 
 # Add selftext if present.
@@ -77,20 +103,87 @@ print "\n#+BEGIN_SRC markdown\n",
     "#+END_SRC\n"
     if scalar $post->{selftext};
 
-# $comments contains comment data. We are interested in: replies,
-# author, body, created_utc & permalink.
-my $comments = $json_data->[1]->{data}->{children};
+my (@http_calls, @shell_comments, %counter);
+$counter{skipped_due_to_more} = 0;
+$counter{print_comment_chain_call} = 0;
+
 # Iterate over top-level comments.
 foreach my $comment ($comments->@*) {
+    if ($comment->{kind} eq "more"
+        and $comment->{data}->{id} eq "_") {
+        $counter{skipped_due_to_more}++;
+        next;
+    }
     print_comment_chain($comment->{data}, 0);
 }
 
+print STDERR "total http calls: ",
+    scalar @http_calls, "\n" if $DEBUG;
+print STDERR "total shell comments: ",
+    scalar @shell_comments, "\n" if $DEBUG and scalar @shell_comments;
+print STDERR "total print_comment_chain calls: ",
+    $counter{print_comment_chain_call}, "\n" if $DEBUG;
+
+# This is equivalent to "continue this thread ->" we see on
+# old.reddit.com threads.
+print STDERR "total comments skipped due to more: ",
+    $counter{skipped_due_to_more}, "\n" if $DEBUG;
+
+sub get_response {
+    my $url = shift @_;
+    my $response = $http->get($url);
+    push @http_calls, $url;
+    die "Unexpected response - $response->{status}: $response->{reason} : $url"
+        unless $response->{success};
+    return $response;
+}
+
 # print_comment_chain will print the whole chain of comment while
 # accounting for level.
 sub print_comment_chain {
     my $comment = shift @_;
     my $level = shift @_;
 
+    $counter{print_comment_chain_call}++;
+
+    # $comment->{author} & $comment->{body} not being present means
+    # that it's a shell comment. We can get it by making another HTTP
+    # call.
+    unless ($comment->{author}) {
+        push @shell_comments, $comment->{id};
+        return unless $ENV{FETCH_ALL};
+        unless ( eval {
+            my $json_url = "${url}/$comment->{id}.json?limit=500&sort=top";
+
+            # Fetch the comment.
+            my $response = get_response($json_url);
+
+            # Decode json.
+            my $json_data = decode_json($response->{content});
+
+            # $comments contains comment data. We are interested in: replies,
+            # author, body, created_utc & permalink.
+            my $comments = $json_data->[1]->{data}->{children};
+
+            foreach my $comment ($comments->@*) {
+                if ($comment->{kind} eq "more"
+                    and $comment->{data}->{id} eq "_") {
+                    $counter{skipped_due_to_more}++;
+                    next;
+                }
+                print_comment_chain($comment->{data}, $level);
+            }
+
+            return 1;
+        } ) {
+            print STDERR "parsing shell comment: $comment->{id} : failed\n";
+        }
+
+        # This comment thread has been parsed, move on to the next
+        # one.
+        return;
+    }
+
     print "*" x ($level + 2), " ", "$comment->{author}";
     print " [S]" if $comment->{is_submitter};
     print "\n";
@@ -114,6 +207,11 @@ sub print_comment_chain {
     # If the comment has replies then iterate over those too.
     if (scalar $comment->{replies}) {
         foreach my $reply ($comment->{replies}->{data}->{children}->@*) {
+            if ($reply->{kind} eq "more"
+                and $reply->{data}->{id} eq "_") {
+                $counter{skipped_due_to_more}++;
+                next;
+            }
             print_comment_chain($reply->{data}, $level + 1);
         }
     }
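Usage note: a minimal sketch of how the new -d/-h flags and the FETCH_ALL environment variable from this commit combine on the command line. The thread URL is a placeholder, not a real post; any reddit post URL works.

#+BEGIN_SRC sh
# Archive a thread; the Org document goes to STDOUT.
draco https://www.reddit.com/r/<subreddit>/comments/<id>/<slug>/ > thread.org

# Also fetch comments hidden behind "load more comments" (this makes
# extra HTTP calls to reddit; "continue this thread" stubs are still
# skipped) & print debug counters to STDERR.
FETCH_ALL=1 draco -d \
    https://www.reddit.com/r/<subreddit>/comments/<id>/<slug>/ > thread.org
#+END_SRC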