#!/usr/bin/perl
use strict;
use warnings;
use HTTP::Tiny;
use JSON::MaybeXS;
use POSIX qw(strftime);
# For wrapping comment blocks.
use Unicode::LineBreak;
my $lb = Unicode::LineBreak->new(ColMax => 76); # Default is 76.
# Printing UTF-8 to STDOUT.
binmode(STDOUT, "encoding(UTF-8)");
die "usage: draco [-dhv] <url>\n" unless scalar @ARGV;
my $DEBUG;
my $VERSION = "v0.2.0";
# Dispatch table to be parsed before url.
my %dispatch = (
'-v' => sub { print "Draco $VERSION\n"; exit; },
'-d' => sub { $DEBUG = 1; print STDERR "draco: debug on.\n"; },
'-h' => sub { print qq{Draco $VERSION
Options:
-d
Turn on debug messages. Debug messages will be printed to
STDERR.
-h
Print this help.
-v
Print version.
Environment Variables:
FETCH_ALL
Fetch all comments. This will make multiple HTTP calls to
reddit. This doesn't fetch *all* the comments.
};
exit;
},
);
if (exists $dispatch{$ARGV[0]}) {
# shift @ARGV to get $url in next shift.
$dispatch{shift @ARGV}->();
}
# $url contains the reddit post. Raise the limit to 500 comments which
# is the maximum reddit allows.
my $url = shift @ARGV;
my $json_url = "${url}.json?limit=500&sort=top";
my $http = HTTP::Tiny->new( verify_SSL => 1 );
# Fetch the post.
print STDERR "fetching `$json_url'.\n" if $DEBUG;
my $response = get_response($json_url);
# Decode json.
print STDERR "decoding json response.\n" if $DEBUG;
my $json_data = decode_json($response->{content});
# $post contains post data
my $post = $json_data->[0]->{data}->{children}->[0]->{data};
# $comments contains comment data. We are interested in: replies,
# author, body, created_utc & permalink.
my $comments = $json_data->[1]->{data}->{children};
# Print total top-level comments.
print STDERR "total top-level comments: ",
scalar($comments->@*), "\n" if $DEBUG;
# Start the Org document.
print "#+", "STARTUP:content\n";
# Print the date.
my $date = strftime '%+', localtime();
print "#+", "DATE: $date\n";
print "\n";
# Print the post title & it's link.
print "* ", "[[$post->{url}][$post->{title}]]\n";
# Add various details to :PROPERTIES:.
print ":PROPERTIES:\n";
foreach my $detail (qw( subreddit created_utc author permalink
upvote_ratio ups downs score )) {
print ":${detail}: =$post->{$detail}=\n"
if scalar $post->{$detail};
}
# Include the archive date & total top-level comments in properties.
print ":ARCHIVE_DATE: $date\n";
print ":TOTAL_TOP_LEVEL_COMMENTS: ", scalar($comments->@*), "\n";
print ":END:\n";
# Add selftext if present.
print "\n#+BEGIN_SRC markdown\n",
# Break the text at 76 column & add 2 space before every new line.
" ", $lb->break($post->{selftext}) =~ s/\n/\n\ \ /gr, "\n",
"#+END_SRC\n"
if scalar $post->{selftext};
my (@http_calls, @shell_comments, %counter);
$counter{skipped_due_to_more} = 0;
$counter{print_comment_chain_call} = 0;
# Iterate over top-level comments.
foreach my $comment ($comments->@*) {
if ($comment->{kind} eq "more"
and $comment->{data}->{id} eq "_") {
$counter{skipped_due_to_more}++;
next;
}
print_comment_chain($comment->{data}, 0);
}
print STDERR "total http calls: ",
scalar @http_calls, "\n" if $DEBUG;
print STDERR "total shell comments: ",
scalar @shell_comments, "\n" if $DEBUG and scalar @shell_comments;
print STDERR "total print_comment_chain calls: ",
$counter{print_comment_chain_call}, "\n" if $DEBUG;
# This is equivalent to "continue this thread ->" we see on
# old.reddit.com threads.
print STDERR "total comments skipped due to more: ",
$counter{skipped_due_to_more}, "\n" if $DEBUG;
sub get_response {
my $url = shift @_;
my $response = $http->get($url);
push @http_calls, $url;
die "Unexpected response - $response->{status}: $response->{reason} : $url"
unless $response->{success};
return $response;
}
# print_comment_chain will print the whole chain of comment while
# accounting for level.
sub print_comment_chain {
my $comment = shift @_;
my $level = shift @_;
$counter{print_comment_chain_call}++;
# $comment->{author} & $comment->{body} not being present means
# that it's a shell comment. We can get it by making another HTTP
# call.
unless ($comment->{author}) {
push @shell_comments, $comment->{id};
return unless $ENV{FETCH_ALL};
unless ( eval {
my $json_url = "${url}/$comment->{id}.json?limit=500&sort=top";
# Fetch the comment.
my $response = get_response($json_url);
# Decode json.
my $json_data = decode_json($response->{content});
# $comments contains comment data. We are interested in: replies,
# author, body, created_utc & permalink.
my $comments = $json_data->[1]->{data}->{children};
foreach my $comment ($comments->@*) {
if ($comment->{kind} eq "more"
and $comment->{data}->{id} eq "_") {
$counter{skipped_due_to_more}++;
next;
}
print_comment_chain($comment->{data}, $level);
}
return 1;
} ) {
print STDERR "parsing shell comment: $comment->{id} : failed\n";
}
# This comment thread has been parsed, move on to the text
# one.
return;
}
print "*" x ($level + 2), " ", "$comment->{author}";
print " [S]" if $comment->{is_submitter};
print "\n";
# Print comment details.
print ":PROPERTIES:\n";
foreach my $detail (qw( created_utc author permalink upvote_ratio
ups downs score edited is_submitter
stickied controversiality )) {
print ":${detail}: =$comment->{$detail}=\n"
if scalar $comment->{$detail};
}
print ":END:\n";
print "\n#+BEGIN_SRC markdown\n",
# Break the text at 76 column & add 2 space before every new
# line.
" ", $lb->break($comment->{body}) =~ s/\n/\n\ \ /gr, "\n",
"#+END_SRC\n";
# If the comment has replies then iterate over those too.
if (scalar $comment->{replies}) {
foreach my $reply ($comment->{replies}->{data}->{children}->@*) {
if ($reply->{kind} eq "more"
and $reply->{data}->{id} eq "_") {
$counter{skipped_due_to_more}++;
next;
}
print_comment_chain($reply->{data}, $level + 1);
}
}
}