diff options
-rwxr-xr-x | draco.pl | 208 |
1 files changed, 128 insertions, 80 deletions
diff --git a/draco.pl b/draco.pl index 7d3c75c..4a85384 100755 --- a/draco.pl +++ b/draco.pl @@ -2,12 +2,11 @@ use strict; use warnings; +use Time::Piece; use HTTP::Tiny; use JSON::MaybeXS; -use POSIX qw(strftime); - # For wrapping comment blocks. use Unicode::LineBreak; my $lb = Unicode::LineBreak->new(ColMax => 76); # Default is 76. @@ -47,6 +46,10 @@ if (exists $dispatch{$ARGV[0]}) { $dispatch{shift @ARGV}->(); } +# This is the start time. +my $start_time = time; +my $last_log = $start_time; + # $url contains the reddit post. Raise the limit to 500 comments which # is the maximum reddit allows. my $url = shift @ARGV; @@ -55,6 +58,7 @@ my $json_url = "${url}.json?limit=500&sort=top"; my $http = HTTP::Tiny->new( verify_SSL => 1 ); # Fetch the post. +print_time() if $DEBUG; print STDERR "fetching `$json_url'.\n" if $DEBUG; my $response = get_response($json_url); @@ -69,16 +73,12 @@ my $post = $json_data->[0]->{data}->{children}->[0]->{data}; # author, body, created_utc & permalink. my $comments = $json_data->[1]->{data}->{children}; -# Print total top-level comments. -print STDERR "total top-level comments: ", - scalar($comments->@*), "\n" if $DEBUG; - # Start the Org document. print "#+", "STARTUP:content\n"; # Print the date. -my $date = strftime '%+', localtime(); -print "#+", "DATE: $date\n"; +my $current_date = Time::Piece->new->strftime('%+'); +print "#+", "DATE: $current_date\n"; print "\n"; # Print the post title & it's link. @@ -86,14 +86,19 @@ print "* ", "[[$post->{url}][$post->{title}]]\n"; # Add various details to :PROPERTIES:. print ":PROPERTIES:\n"; +# Include the created date, archive date & total top-level comments in +# properties. +print ":CREATED_UTC: ", + Time::Piece->strptime($post->{created_utc}, '%s') + ->strftime('%+'), "\n"; + +print ":ARCHIVE_DATE: $current_date\n"; +print ":TOTAL_TOP_LEVEL_COMMENTS: ", scalar($comments->@*), "\n"; foreach my $detail (qw( subreddit created_utc author permalink upvote_ratio ups downs score )) { print ":${detail}: =$post->{$detail}=\n" if scalar $post->{$detail}; } -# Include the archive date & total top-level comments in properties. -print ":ARCHIVE_DATE: $date\n"; -print ":TOTAL_TOP_LEVEL_COMMENTS: ", scalar($comments->@*), "\n"; print ":END:\n"; # Add selftext if present. @@ -106,29 +111,40 @@ print "\n#+BEGIN_SRC markdown\n", my (@http_calls, @shell_comments, %counter); $counter{skipped_due_to_more} = 0; $counter{print_comment_chain_call} = 0; +$counter{iterate_over_comments_call} = 0; -# Iterate over top-level comments. -foreach my $comment ($comments->@*) { - if ($comment->{kind} eq "more" - and $comment->{data}->{id} eq "_") { - $counter{skipped_due_to_more}++; - next; - } - print_comment_chain($comment->{data}, 0); -} +print_time() if $DEBUG; +print STDERR "iterating over top-level comments.\n" if $DEBUG; +# Iterate over top-level comments. The second argument is level +# (depth), it should be 0 for top-level comments. +iterate_over_comments($comments, 0); +print_time() if $DEBUG; +# Print important stats. +print STDERR "\n" if $DEBUG; print STDERR "total http calls: ", scalar @http_calls, "\n" if $DEBUG; +print STDERR "total top-level comments: ", + scalar($comments->@*), "\n" if $DEBUG; print STDERR "total shell comments: ", scalar @shell_comments, "\n" if $DEBUG and scalar @shell_comments; print STDERR "total print_comment_chain calls: ", $counter{print_comment_chain_call}, "\n" if $DEBUG; +print STDERR "total iterate_over_comments calls: ", + $counter{iterate_over_comments_call}, "\n" if $DEBUG; # This is equivalent to "continue this thread ->" we see on # old.reddit.com threads. print STDERR "total comments skipped due to more: ", $counter{skipped_due_to_more}, "\n" if $DEBUG; +sub print_time { + print STDERR " "; + print STDERR "time since [start, last log]: [", time - $start_time, + ", ", time - $last_log, "] seconds\n"; + $last_log = time; +} + sub get_response { my $url = shift @_; my $response = $http->get($url); @@ -138,14 +154,93 @@ sub get_response { return $response; } -# There are 3 kind of comments. -# -# 1. normal comments (includes top-level comments). -# 2. comments hidden under "load more comments". -# 3. comments hidden under "continue this thread". +# First argument requires $comments & second is the level (depth). +sub iterate_over_comments { + my $comments = shift @_; + my $level = shift @_; + + $counter{iterate_over_comments_call}++; + + foreach my $comment ($comments->@*) { + my $comment_data = $comment->{data}; + + # There are 3 kind of comments. + # + # 1. normal comments (includes top-level comments & replies). + # 2. comments hidden under "load more comments". + # 3. comments hidden under "continue this thread". + + # We will be dealing with them in reverse order, i.e. 3rd -> + # 2nd -> 1st. + + # This comment we are skipping is the third kind of comment, + # i.e. comment hidden under "continue this thread". We can't + # parse it yet. + if ($comment->{kind} eq "more" + and $comment_data->{id} eq "_") { + $counter{skipped_due_to_more}++; + next; + } + + # These are second kind of comments, i.e. comments hidden + # under "load more comments". We can get it by making another + # HTTP call. This is skipped by default & user has to pass + # `FETCH_ALL' to enable it. + unless ($comment_data->{author}) { + push @shell_comments, $comment_data->{id}; + + # Don't proceed unless user has set `FETCH_ALL'. + next unless $ENV{FETCH_ALL}; + + unless ( eval { + # Reddit doesn't like this kind of url: + # http://<reddit>/<post_id>//<comment_id>.json + # + # It wants this kind of url: + # http://<reddit>/<post_id>/<comment_id>.json + # + # Notice the extra '/' in first url. + my $json_url = $url; + $json_url .= "/" unless substr $url, -1 eq "/"; + $json_url .= "$comment_data->{id}.json?limit=500&sort=top"; + + # Fetch the comment. + my $response = get_response($json_url); + + # Decode json. + my $json_data = decode_json($response->{content}); + + # $comments contains comment data. We are interested in: replies, + # author, body, created_utc & permalink. + my $comments = $json_data->[1]->{data}->{children}; + + # Now this is like a normal comment chain, i.e. first + # kind of comment. We just have to iterate over it & + # pass to print_comment_chain, iterate_over_comments + # will handle it. + iterate_over_comments($comments, $level); + return 1; + } ) { + my $err = $@; + print STDERR "parsing `$comment_data->{id}' failed: $err\n"; + } + + # This comment thread has been parsed, move on to the text + # one. + next; + } + + # This is first kind of comment, we can pass it directly to + # print_comment_chain. + print_comment_chain($comment_data, $level); + } +} + # print_comment_chain will print the whole chain of comment while -# accounting for level. +# accounting for level. It can only parse the first kind of comment, +# i.e. top-level comments & their replies. To learn about kinds of +# comments, check iterate_over_comments() subroutine. sub print_comment_chain { # This was earlier called $comment & was changed to $comment_data # to prevent confusion because it is $comment->{data}. @@ -154,59 +249,18 @@ sub print_comment_chain { $counter{print_comment_chain_call}++; - # $comment_data->{author} not being present means that it's a - # comment hidden under "load more comments". We can get it by - # making another HTTP call. - unless ($comment_data->{author}) { - push @shell_comments, $comment_data->{id}; - return unless $ENV{FETCH_ALL}; - unless ( eval { - # It'll fail if we fetch "${url}/$comment_data->{id}.json" - # & ${url} already has "/" at the end. So, we check if "/" - # is present, if not then we add it. - my $json_url = $url; - $json_url .= "/" unless substr $url, -1 eq "/"; - $json_url .= "$comment_data->{id}.json?limit=500&sort=top"; - - # Fetch the comment. - my $response = get_response($json_url); - - # Decode json. - my $json_data = decode_json($response->{content}); - - # $comments contains comment data. We are interested in: replies, - # author, body, created_utc & permalink. - my $comments = $json_data->[1]->{data}->{children}; - - foreach my $comment ($comments->@*) { - if ($comment->{kind} eq "more" - and $comment->{data}->{id} eq "_") { - $counter{skipped_due_to_more}++; - next; - } - print_comment_chain($comment->{data}, $level); - } - - return 1; - } ) { - my $err = $@; - print STDERR "parsing `$comment_data->{id}' failed: $err\n"; - } - - # This comment thread has been parsed, move on to the text - # one. - return; - } - print "*" x ($level + 2), " ", "$comment_data->{author}"; print " [S]" if $comment_data->{is_submitter}; print "\n"; # Print comment details. print ":PROPERTIES:\n"; + print ":CREATED_UTC: ", + Time::Piece->strptime($comment_data->{created_utc}, '%s') + ->strftime('%+'), "\n"; foreach my $detail (qw( created_utc author permalink upvote_ratio - ups downs score edited is_submitter - stickied controversiality )) { + ups downs score edited stickied + controversiality )) { print ":${detail}: =$comment_data->{$detail}=\n" if scalar $comment_data->{$detail}; } @@ -220,13 +274,7 @@ sub print_comment_chain { # If the comment has replies then iterate over those too. if (scalar $comment_data->{replies}) { - foreach my $reply ($comment_data->{replies}->{data}->{children}->@*) { - if ($reply->{kind} eq "more" - and $reply->{data}->{id} eq "_") { - $counter{skipped_due_to_more}++; - next; - } - print_comment_chain($reply->{data}, $level + 1); - } + iterate_over_comments($comment_data->{replies}->{data}->{children}, + $level + 1); } } |