#!/usr/bin/perl use strict; use warnings; use HTTP::Tiny; use JSON::MaybeXS; use POSIX qw(strftime); # For wrapping comment blocks. use Unicode::LineBreak; my $lb = Unicode::LineBreak->new(ColMax => 76); # Default is 76. # Printing UTF-8 to STDOUT. binmode(STDOUT, "encoding(UTF-8)"); die "usage: draco [-dhv] \n" unless scalar @ARGV; my $DEBUG; my $VERSION = "v0.2.0"; # Dispatch table to be parsed before url. my %dispatch = ( '-v' => sub { print "Draco $VERSION\n"; exit; }, '-d' => sub { $DEBUG = 1; print STDERR "draco: debug on.\n"; }, '-h' => sub { print qq{Draco $VERSION Options: -d Turn on debug messages. Debug messages will be printed to STDERR. -h Print this help. -v Print version. Environment Variables: FETCH_ALL Fetch all comments. This will make multiple HTTP calls to reddit. This doesn't fetch *all* the comments. }; exit; }, ); if (exists $dispatch{$ARGV[0]}) { # shift @ARGV to get $url in next shift. $dispatch{shift @ARGV}->(); } # $url contains the reddit post. Raise the limit to 500 comments which # is the maximum reddit allows. my $url = shift @ARGV; my $json_url = "${url}.json?limit=500&sort=top"; my $http = HTTP::Tiny->new( verify_SSL => 1 ); # Fetch the post. print STDERR "fetching `$json_url'.\n" if $DEBUG; my $response = get_response($json_url); # Decode json. print STDERR "decoding json response.\n" if $DEBUG; my $json_data = decode_json($response->{content}); # $post contains post data my $post = $json_data->[0]->{data}->{children}->[0]->{data}; # $comments contains comment data. We are interested in: replies, # author, body, created_utc & permalink. my $comments = $json_data->[1]->{data}->{children}; # Print total top-level comments. print STDERR "total top-level comments: ", scalar($comments->@*), "\n" if $DEBUG; # Start the Org document. print "#+", "STARTUP:content\n"; # Print the date. my $date = strftime '%+', localtime(); print "#+", "DATE: $date\n"; print "\n"; # Print the post title & it's link. print "* ", "[[$post->{url}][$post->{title}]]\n"; # Add various details to :PROPERTIES:. print ":PROPERTIES:\n"; foreach my $detail (qw( subreddit created_utc author permalink upvote_ratio ups downs score )) { print ":${detail}: =$post->{$detail}=\n" if scalar $post->{$detail}; } # Include the archive date & total top-level comments in properties. print ":ARCHIVE_DATE: $date\n"; print ":TOTAL_TOP_LEVEL_COMMENTS: ", scalar($comments->@*), "\n"; print ":END:\n"; # Add selftext if present. print "\n#+BEGIN_SRC markdown\n", # Break the text at 76 column & add 2 space before every new line. " ", $lb->break($post->{selftext}) =~ s/\n/\n\ \ /gr, "\n", "#+END_SRC\n" if scalar $post->{selftext}; my (@http_calls, @shell_comments, %counter); $counter{skipped_due_to_more} = 0; $counter{print_comment_chain_call} = 0; # Iterate over top-level comments. foreach my $comment ($comments->@*) { if ($comment->{kind} eq "more" and $comment->{data}->{id} eq "_") { $counter{skipped_due_to_more}++; next; } print_comment_chain($comment->{data}, 0); } print STDERR "total http calls: ", scalar @http_calls, "\n" if $DEBUG; print STDERR "total shell comments: ", scalar @shell_comments, "\n" if $DEBUG and scalar @shell_comments; print STDERR "total print_comment_chain calls: ", $counter{print_comment_chain_call}, "\n" if $DEBUG; # This is equivalent to "continue this thread ->" we see on # old.reddit.com threads. print STDERR "total comments skipped due to more: ", $counter{skipped_due_to_more}, "\n" if $DEBUG; sub get_response { my $url = shift @_; my $response = $http->get($url); push @http_calls, $url; die "Unexpected response - $response->{status}: $response->{reason} : $url" unless $response->{success}; return $response; } # print_comment_chain will print the whole chain of comment while # accounting for level. sub print_comment_chain { my $comment = shift @_; my $level = shift @_; $counter{print_comment_chain_call}++; # $comment->{author} & $comment->{body} not being present means # that it's a shell comment. We can get it by making another HTTP # call. unless ($comment->{author}) { push @shell_comments, $comment->{id}; return unless $ENV{FETCH_ALL}; unless ( eval { my $json_url = "${url}/$comment->{id}.json?limit=500&sort=top"; # Fetch the comment. my $response = get_response($json_url); # Decode json. my $json_data = decode_json($response->{content}); # $comments contains comment data. We are interested in: replies, # author, body, created_utc & permalink. my $comments = $json_data->[1]->{data}->{children}; foreach my $comment ($comments->@*) { if ($comment->{kind} eq "more" and $comment->{data}->{id} eq "_") { $counter{skipped_due_to_more}++; next; } print_comment_chain($comment->{data}, $level); } return 1; } ) { print STDERR "parsing shell comment: $comment->{id} : failed\n"; } # This comment thread has been parsed, move on to the text # one. return; } print "*" x ($level + 2), " ", "$comment->{author}"; print " [S]" if $comment->{is_submitter}; print "\n"; # Print comment details. print ":PROPERTIES:\n"; foreach my $detail (qw( created_utc author permalink upvote_ratio ups downs score edited is_submitter stickied controversiality )) { print ":${detail}: =$comment->{$detail}=\n" if scalar $comment->{$detail}; } print ":END:\n"; print "\n#+BEGIN_SRC markdown\n", # Break the text at 76 column & add 2 space before every new # line. " ", $lb->break($comment->{body}) =~ s/\n/\n\ \ /gr, "\n", "#+END_SRC\n"; # If the comment has replies then iterate over those too. if (scalar $comment->{replies}) { foreach my $reply ($comment->{replies}->{data}->{children}->@*) { if ($reply->{kind} eq "more" and $reply->{data}->{id} eq "_") { $counter{skipped_due_to_more}++; next; } print_comment_chain($reply->{data}, $level + 1); } } }