path: root/draco.pl



#!/usr/bin/perl

use strict;
use warnings;
use Time::Piece;

use HTTP::Tiny;
use JSON::MaybeXS;

# For wrapping comment blocks.
use Unicode::LineBreak;
my $lb = Unicode::LineBreak->new(ColMax => 76); # Default is 76.

# Printing UTF-8 to STDOUT.
binmode(STDOUT, ":encoding(UTF-8)");

die "usage: draco [-dhv] <url>\n" unless scalar @ARGV;

my $DEBUG;
my $VERSION = "v0.2.2";
# Dispatch table to be parsed before url. Only the first argument is
# looked up here; `-v' and `-h' exit, `-d' falls through to the url.
my %dispatch = (
    '-v'  => sub { print "Draco $VERSION\n"; exit; },
    '-d'  => sub { $DEBUG = 1; print STDERR "draco: debug on.\n"; },
    '-h'  => sub { print qq{Draco $VERSION

Options:
    -d
        Turn on debug messages. Debug messages will be printed to
        STDERR.
    -h
        Print this help.
    -v
        Print version.

Environment Variables:
    FETCH_ALL
        Fetch all comments. This will make multiple HTTP calls to
        reddit. This doesn't fetch *all* the comments.
};
                   exit;
               },
);
if (exists $dispatch{$ARGV[0]}) {
    # shift @ARGV to get $url in next shift.
    $dispatch{shift @ARGV}->();
}

# This is the start time.
my $start_time = time;
my $last_log = $start_time;

# $url contains the reddit post. Raise the limit to 500 comments which
# is the maximum reddit allows.
my $url = shift @ARGV;

# An option such as `-d' can be the only argument, in which case
# there is no url left to shift. Bail out with the usage message
# instead of requesting ".json?limit=500&sort=top".
die "usage: draco [-dhv] <url>\n"
    unless defined $url and length $url;

my $json_url = "${url}.json?limit=500&sort=top";

my $http = HTTP::Tiny->new( verify_SSL => 1 );

# Fetch the post.
print_time() if $DEBUG;
print STDERR "fetching `$json_url'.\n" if $DEBUG;
my $response = get_response($json_url);

# Decode json.
print STDERR "decoding json response.\n" if $DEBUG;
my $json_data = decode_json($response->{content});

# $post contains post data
my $post = $json_data->[0]->{data}->{children}->[0]->{data};

# $comments contains comment data. We are interested in: replies,
# author, body, created_utc & permalink.
my $comments = $json_data->[1]->{data}->{children};

# Start the Org document.
print "#+", "STARTUP:content\n";

# Print the date.
my $current_date = Time::Piece->new->strftime('%+');
print "#+", "DATE: $current_date\n";
print "\n";

# Print the post title & its link.
print "* ", "[[$post->{url}][$post->{title}]]\n";

# Add various details to :PROPERTIES:.
print ":PROPERTIES:\n";
# Include the created date, archive date & total top-level comments in
# properties.
print ":CREATED_UTC: ",
    Time::Piece->strptime($post->{created_utc}, '%s')
    ->strftime('%+'), "\n";

print ":ARCHIVE_DATE: $current_date\n";
print ":TOTAL_TOP_LEVEL_COMMENTS: ", scalar($comments->@*), "\n";
# Only details with a true value are printed, so a field that is 0 or
# empty is left out of the drawer.
foreach my $detail (qw( subreddit created_utc author permalink
                        upvote_ratio ups downs score )) {
    print ":${detail}: =$post->{$detail}=\n"
        if scalar $post->{$detail};
}
print ":END:\n";

# Add selftext if present.
print "\n#+BEGIN_SRC markdown\n",
    # Break the text at 76 column & add 2 space before every new line.
    "  ", $lb->break($post->{selftext}) =~ s/\n/\n\ \ /gr, "\n",
    "#+END_SRC\n"
    if scalar $post->{selftext};

# Bookkeeping shared with the subroutines below: every url requested,
# ids of comments that arrived without an author, and call counters.
my (@http_calls, @shell_comments, %counter);
$counter{skipped_due_to_more} = 0;
$counter{print_comment_chain_call} = 0;
$counter{iterate_over_comments_call} = 0;

print_time() if $DEBUG;
print STDERR "iterating over top-level comments.\n" if $DEBUG;
# Iterate over top-level comments. The second argument is level
# (depth), it should be 0 for top-level comments.
iterate_over_comments($comments, 0);
print_time() if $DEBUG;

# Print important stats.
print STDERR "\n" if $DEBUG;
print STDERR "total http calls: ",
    scalar @http_calls, "\n" if $DEBUG;
print STDERR "total top-level comments: ",
    scalar($comments->@*), "\n" if $DEBUG;
print STDERR "total shell comments: ",
    scalar @shell_comments, "\n" if $DEBUG and scalar @shell_comments;
print STDERR "total print_comment_chain calls: ",
    $counter{print_comment_chain_call}, "\n" if $DEBUG;
print STDERR "total iterate_over_comments calls: ",
    $counter{iterate_over_comments_call}, "\n" if $DEBUG;

# This is equivalent to "continue this thread ->" we see on
# old.reddit.com threads.
print STDERR "total comments skipped due to more: ",
    $counter{skipped_due_to_more}, "\n" if $DEBUG;

# print_time writes an indented debug line to STDERR with the seconds
# elapsed since the script started and since the previous call, then
# records the current time as the new "last log" mark.
sub print_time {
    my $since_start = time - $start_time;
    my $since_last  = time - $last_log;

    print STDERR "    time since [start, last log]: ",
        "[$since_start, $since_last] seconds\n";

    $last_log = time;
}

# get_response fetches $url with the shared HTTP::Tiny object and
# returns the response hash reference. Every url requested is recorded
# in @http_calls, whether or not the request succeeds.
#
# Dies when the response is not a success. HTTP::Tiny reports
# failures that happen while executing the request as status 599 and
# puts the error text in the content field, so for that status the
# content is appended to the message; otherwise the real cause would
# be hidden behind a generic reason.
sub get_response {
    my $url = shift @_;
    my $response = $http->get($url);
    push @http_calls, $url;
    return $response if $response->{success};

    my $message
        = "Unexpected response - $response->{status}: $response->{reason} : $url";
    if ($response->{status} == 599 and defined $response->{content}) {
        # Trim trailing whitespace so die can still add the file and
        # line number to the message.
        (my $detail = $response->{content}) =~ s/\s+\z//;
        $message .= " ($detail)" if length $detail;
    }
    die $message;
}

# iterate_over_comments walks a list of comments and prints each one
# it can handle.
#
# First argument is $comments, an array reference of comment entries
# (each with `kind' & `data'), second is the level (depth) which is 0
# for top-level comments. Failures while fetching hidden comments are
# reported on STDERR & do not stop the iteration.
sub iterate_over_comments {
    my $comments = shift @_;
    my $level = shift @_;

    $counter{iterate_over_comments_call}++;

    foreach my $comment ($comments->@*) {
        my $comment_data = $comment->{data};

        # There are 3 kind of comments.
        #
        # 1. normal comments (includes top-level comments & replies).
        # 2. comments hidden under "load more comments".
        # 3. comments hidden under "continue this thread".

        # We will be dealing with them in reverse order, i.e. 3rd ->
        # 2nd -> 1st.

        # This comment we are skipping is the third kind of comment,
        # i.e. comment hidden under "continue this thread". We can't
        # parse it yet.
        if ($comment->{kind} eq "more"
            and $comment_data->{id} eq "_") {
            $counter{skipped_due_to_more}++;
            next;
        }

        # These are second kind of comments, i.e. comments hidden
        # under "load more comments". We can get it by making another
        # HTTP call. This is skipped by default & user has to pass
        # `FETCH_ALL' to enable it.
        unless ($comment_data->{author}) {
            push @shell_comments, $comment_data->{id};

            # Don't proceed unless user has set `FETCH_ALL'.
            next unless $ENV{FETCH_ALL};

            # The eval block returns 1 on success, so a false result
            # means something inside it died & $@ holds the reason.
            unless ( eval {
                # Reddit doesn't like this kind of url:
                #     http://<reddit>/<post_id>//<comment_id>.json
                #
                # It wants this kind of url:
                #     http://<reddit>/<post_id>/<comment_id>.json
                #
                # Notice the extra '/' in first url.
                #
                # The parentheses around substr's arguments matter:
                # without them `eq' binds to the -1 & the check never
                # looks at the last character of $url.
                my $json_url = $url;
                $json_url .= "/" unless substr($url, -1) eq "/";
                $json_url .= "$comment_data->{id}.json?limit=500&sort=top";

                # Fetch the comment.
                my $response = get_response($json_url);

                # Decode json.
                my $json_data = decode_json($response->{content});

                # $comments contains comment data. We are interested in: replies,
                # author, body, created_utc & permalink.
                my $comments = $json_data->[1]->{data}->{children};

                # Now this is like a normal comment chain, i.e. first
                # kind of comment. We just have to iterate over it &
                # pass to print_comment_chain, iterate_over_comments
                # will handle it.
                iterate_over_comments($comments, $level);
                return 1;
            } ) {
                my $err = $@;
                print STDERR "parsing `$comment_data->{id}' failed: $err\n";
            }

            # This comment thread has been parsed, move on to the next
            # one.
            next;
        }

        # This is first kind of comment, we can pass it directly to
        # print_comment_chain.
        print_comment_chain($comment_data, $level);
    }
}


# print_comment_chain prints one comment as an Org heading, with a
# property drawer & the wrapped body, then hands its replies to
# iterate_over_comments() one level deeper. It only understands the
# first kind of comment described in iterate_over_comments(), i.e.
# top-level comments & their replies.
#
# Arguments are the comment's data ($comment->{data}) & the level
# (depth).
sub print_comment_chain {
    my ($comment_data, $level) = @_;

    $counter{print_comment_chain_call} += 1;

    # Heading: stars for the depth, the author & a marker when the
    # author is also the submitter of the post.
    my $stars = "*" x ($level + 2);
    my $submitter_tag = $comment_data->{is_submitter} ? " [S]" : "";
    print $stars, " ", "$comment_data->{author}", $submitter_tag, "\n";

    # Property drawer with the comment details.
    print ":PROPERTIES:\n";
    my $created = Time::Piece->strptime($comment_data->{created_utc}, '%s');
    print ":CREATED_UTC: ", $created->strftime('%+'), "\n";

    # Details without a true value are left out of the drawer.
    my @details = qw( created_utc author permalink upvote_ratio
                      ups downs score edited stickied
                      controversiality );
    for my $detail (grep { $comment_data->{$_} } @details) {
        print ":${detail}: =$comment_data->{$detail}=\n";
    }
    print ":END:\n";

    # Wrap the body with $lb & indent every line by 2 spaces.
    my $body = $lb->break($comment_data->{body}) =~ s/\n/\n\ \ /gr;
    print "\n#+BEGIN_SRC markdown\n", "  ", $body, "\n", "#+END_SRC\n";

    # Nothing more to do when the comment has no replies.
    my $replies = $comment_data->{replies};
    return unless $replies;

    iterate_over_comments($replies->{data}->{children}, $level + 1);
}
#!/usr/bin/perl

use strict;
use warnings;
use Time::Piece;

use HTTP::Tiny;
use JSON::MaybeXS;

# For wrapping comment blocks.
use Unicode::LineBreak;
my $lb = Unicode::LineBreak->new(ColMax => 76); # Default is 76.

# Printing UTF-8 to STDOUT.
binmode(STDOUT, ":encoding(UTF-8)");

die "usage: draco [-dhv] <url>\n" unless scalar @ARGV;

my $DEBUG;
my $VERSION = "v0.2.2";
# Dispatch table to be parsed before url. Only the first argument is
# looked up here; `-v' and `-h' exit, `-d' falls through to the url.
my %dispatch = (
    '-v'  => sub { print "Draco $VERSION\n"; exit; },
    '-d'  => sub { $DEBUG = 1; print STDERR "draco: debug on.\n"; },
    '-h'  => sub { print qq{Draco $VERSION

Options:
    -d
        Turn on debug messages. Debug messages will be printed to
        STDERR.
    -h
        Print this help.
    -v
        Print version.

Environment Variables:
    FETCH_ALL
        Fetch all comments. This will make multiple HTTP calls to
        reddit. This doesn't fetch *all* the comments.
};
                   exit;
               },
);
if (exists $dispatch{$ARGV[0]}) {
    # shift @ARGV to get $url in next shift.
    $dispatch{shift @ARGV}->();
}

# This is the start time.
my $start_time = time;
my $last_log = $start_time;

# $url contains the reddit post. Raise the limit to 500 comments which
# is the maximum reddit allows.
my $url = shift @ARGV;

# An option such as `-d' can be the only argument, in which case
# there is no url left to shift. Bail out with the usage message
# instead of requesting ".json?limit=500&sort=top".
die "usage: draco [-dhv] <url>\n"
    unless defined $url and length $url;

my $json_url = "${url}.json?limit=500&sort=top";

my $http = HTTP::Tiny->new( verify_SSL => 1 );

# Fetch the post.
print_time() if $DEBUG;
print STDERR "fetching `$json_url'.\n" if $DEBUG;
my $response = get_response($json_url);

# Decode json.
print STDERR "decoding json response.\n" if $DEBUG;
my $json_data = decode_json($response->{content});

# $post contains post data
my $post = $json_data->[0]->{data}->{children}->[0]->{data};

# $comments contains comment data. We are interested in: replies,
# author, body, created_utc & permalink.
my $comments = $json_data->[1]->{data}->{children};

# Start the Org document.
print "#+", "STARTUP:content\n";

# Print the date.
my $current_date = Time::Piece->new->strftime('%+');
print "#+", "DATE: $current_date\n";
print "\n";

# Print the post title & its link.
print "* ", "[[$post->{url}][$post->{title}]]\n";

# Add various details to :PROPERTIES:.
print ":PROPERTIES:\n";
# Include the created date, archive date & total top-level comments in
# properties.
print ":CREATED_UTC: ",
    Time::Piece->strptime($post->{created_utc}, '%s')
    ->strftime('%+'), "\n";

print ":ARCHIVE_DATE: $current_date\n";
print ":TOTAL_TOP_LEVEL_COMMENTS: ", scalar($comments->@*), "\n";
# Only details with a true value are printed, so a field that is 0 or
# empty is left out of the drawer.
foreach my $detail (qw( subreddit created_utc author permalink
                        upvote_ratio ups downs score )) {
    print ":${detail}: =$post->{$detail}=\n"
        if scalar $post->{$detail};
}
print ":END:\n";

# Add selftext if present.
print "\n#+BEGIN_SRC markdown\n",
    # Break the text at 76 column & add 2 space before every new line.
    "  ", $lb->break($post->{selftext}) =~ s/\n/\n\ \ /gr, "\n",
    "#+END_SRC\n"
    if scalar $post->{selftext};

# Bookkeeping shared with the subroutines below: every url requested,
# ids of comments that arrived without an author, and call counters.
my (@http_calls, @shell_comments, %counter);
$counter{skipped_due_to_more} = 0;
$counter{print_comment_chain_call} = 0;
$counter{iterate_over_comments_call} = 0;

print_time() if $DEBUG;
print STDERR "iterating over top-level comments.\n" if $DEBUG;
# Iterate over top-level comments. The second argument is level
# (depth), it should be 0 for top-level comments.
iterate_over_comments($comments, 0);
print_time() if $DEBUG;

# Print important stats.
print STDERR "\n" if $DEBUG;
print STDERR "total http calls: ",
    scalar @http_calls, "\n" if $DEBUG;
print STDERR "total top-level comments: ",
    scalar($comments->@*), "\n" if $DEBUG;
print STDERR "total shell comments: ",
    scalar @shell_comments, "\n" if $DEBUG and scalar @shell_comments;
print STDERR "total print_comment_chain calls: ",
    $counter{print_comment_chain_call}, "\n" if $DEBUG;
print STDERR "total iterate_over_comments calls: ",
    $counter{iterate_over_comments_call}, "\n" if $DEBUG;

# This is equivalent to "continue this thread ->" we see on
# old.reddit.com threads.
print STDERR "total comments skipped due to more: ",
    $counter{skipped_due_to_more}, "\n" if $DEBUG;

# print_time writes an indented debug line to STDERR with the seconds
# elapsed since the script started and since the previous call, then
# records the current time as the new "last log" mark.
sub print_time {
    my $since_start = time - $start_time;
    my $since_last  = time - $last_log;

    print STDERR "    time since [start, last log]: ",
        "[$since_start, $since_last] seconds\n";

    $last_log = time;
}

# get_response fetches $url with the shared HTTP::Tiny object and
# returns the response hash reference. Every url requested is recorded
# in @http_calls, whether or not the request succeeds.
#
# Dies when the response is not a success. HTTP::Tiny reports
# failures that happen while executing the request as status 599 and
# puts the error text in the content field, so for that status the
# content is appended to the message; otherwise the real cause would
# be hidden behind a generic reason.
sub get_response {
    my $url = shift @_;
    my $response = $http->get($url);
    push @http_calls, $url;
    return $response if $response->{success};

    my $message
        = "Unexpected response - $response->{status}: $response->{reason} : $url";
    if ($response->{status} == 599 and defined $response->{content}) {
        # Trim trailing whitespace so die can still add the file and
        # line number to the message.
        (my $detail = $response->{content}) =~ s/\s+\z//;
        $message .= " ($detail)" if length $detail;
    }
    die $message;
}

# iterate_over_comments walks a list of comments and prints each one
# it can handle.
#
# First argument is $comments, an array reference of comment entries
# (each with `kind' & `data'), second is the level (depth) which is 0
# for top-level comments. Failures while fetching hidden comments are
# reported on STDERR & do not stop the iteration.
sub iterate_over_comments {
    my $comments = shift @_;
    my $level = shift @_;

    $counter{iterate_over_comments_call}++;

    foreach my $comment ($comments->@*) {
        my $comment_data = $comment->{data};

        # There are 3 kind of comments.
        #
        # 1. normal comments (includes top-level comments & replies).
        # 2. comments hidden under "load more comments".
        # 3. comments hidden under "continue this thread".

        # We will be dealing with them in reverse order, i.e. 3rd ->
        # 2nd -> 1st.

        # This comment we are skipping is the third kind of comment,
        # i.e. comment hidden under "continue this thread". We can't
        # parse it yet.
        if ($comment->{kind} eq "more"
            and $comment_data->{id} eq "_") {
            $counter{skipped_due_to_more}++;
            next;
        }

        # These are second kind of comments, i.e. comments hidden
        # under "load more comments". We can get it by making another
        # HTTP call. This is skipped by default & user has to pass
        # `FETCH_ALL' to enable it.
        unless ($comment_data->{author}) {
            push @shell_comments, $comment_data->{id};

            # Don't proceed unless user has set `FETCH_ALL'.
            next unless $ENV{FETCH_ALL};

            # The eval block returns 1 on success, so a false result
            # means something inside it died & $@ holds the reason.
            unless ( eval {
                # Reddit doesn't like this kind of url:
                #     http://<reddit>/<post_id>//<comment_id>.json
                #
                # It wants this kind of url:
                #     http://<reddit>/<post_id>/<comment_id>.json
                #
                # Notice the extra '/' in first url.
                #
                # The parentheses around substr's arguments matter:
                # without them `eq' binds to the -1 & the check never
                # looks at the last character of $url.
                my $json_url = $url;
                $json_url .= "/" unless substr($url, -1) eq "/";
                $json_url .= "$comment_data->{id}.json?limit=500&sort=top";

                # Fetch the comment.
                my $response = get_response($json_url);

                # Decode json.
                my $json_data = decode_json($response->{content});

                # $comments contains comment data. We are interested in: replies,
                # author, body, created_utc & permalink.
                my $comments = $json_data->[1]->{data}->{children};

                # Now this is like a normal comment chain, i.e. first
                # kind of comment. We just have to iterate over it &
                # pass to print_comment_chain, iterate_over_comments
                # will handle it.
                iterate_over_comments($comments, $level);
                return 1;
            } ) {
                my $err = $@;
                print STDERR "parsing `$comment_data->{id}' failed: $err\n";
            }

            # This comment thread has been parsed, move on to the next
            # one.
            next;
        }

        # This is first kind of comment, we can pass it directly to
        # print_comment_chain.
        print_comment_chain($comment_data, $level);
    }
}


# print_comment_chain prints one comment as an Org heading, with a
# property drawer & the wrapped body, then hands its replies to
# iterate_over_comments() one level deeper. It only understands the
# first kind of comment described in iterate_over_comments(), i.e.
# top-level comments & their replies.
#
# Arguments are the comment's data ($comment->{data}) & the level
# (depth).
sub print_comment_chain {
    my ($comment_data, $level) = @_;

    $counter{print_comment_chain_call} += 1;

    # Heading: stars for the depth, the author & a marker when the
    # author is also the submitter of the post.
    my $stars = "*" x ($level + 2);
    my $submitter_tag = $comment_data->{is_submitter} ? " [S]" : "";
    print $stars, " ", "$comment_data->{author}", $submitter_tag, "\n";

    # Property drawer with the comment details.
    print ":PROPERTIES:\n";
    my $created = Time::Piece->strptime($comment_data->{created_utc}, '%s');
    print ":CREATED_UTC: ", $created->strftime('%+'), "\n";

    # Details without a true value are left out of the drawer.
    my @details = qw( created_utc author permalink upvote_ratio
                      ups downs score edited stickied
                      controversiality );
    for my $detail (grep { $comment_data->{$_} } @details) {
        print ":${detail}: =$comment_data->{$detail}=\n";
    }
    print ":END:\n";

    # Wrap the body with $lb & indent every line by 2 spaces.
    my $body = $lb->break($comment_data->{body}) =~ s/\n/\n\ \ /gr;
    print "\n#+BEGIN_SRC markdown\n", "  ", $body, "\n", "#+END_SRC\n";

    # Nothing more to do when the comment has no replies.
    my $replies = $comment_data->{replies};
    return unless $replies;

    iterate_over_comments($replies->{data}->{children}, $level + 1);
}