summary refs log tree commit diff stats
path: root/draco.pl
blob: ee21f064abe83d496bcc5e0506a18b353d00de1b (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
#!/usr/bin/perl

use strict;
use warnings;
use Time::Piece;

use HTTP::Tiny;
use JSON::MaybeXS;

# For wrapping comment blocks.
use Unicode::LineBreak;
my $lb = Unicode::LineBreak->new(ColMax => 76); # Default is 76.

# Printing UTF-8 to STDOUT.
binmode(STDOUT, "encoding(UTF-8)");

die "usage: draco [-dhv] <url>\n" unless scalar @ARGV;

my $DEBUG;
my $VERSION = "v0.3.1";
# Dispatch table to be parsed before url.
my %dispatch = (
    '-v'  => sub { print "Draco $VERSION\n"; exit; },
    '-d'  => sub { $DEBUG = 1; print STDERR "draco: debug on.\n"; },
    '-h'  => sub { print qq{Draco $VERSION

Options:
    -d
        Turn on debug messages. Debug messages will be printed to
        STDERR.
    -h
        Print this help.
    -v
        Print version.

Environment Variables:
    FETCH_ALL
        Fetch all comments. This will make multiple HTTP calls to
        reddit.
};
                   exit;
               },
);
if (exists $dispatch{$ARGV[0]}) {
    # shift @ARGV to get $url in next shift.
    $dispatch{shift @ARGV}->();
}

# This is the start time.
my $start_time = time;
my $last_log = $start_time;

# $url contains the reddit post. Raise the limit to 500 comments which
# is the maximum reddit allows.
my $url = shift @ARGV;
my $json_url = "${url}.json?limit=500&sort=top";

my $http = HTTP::Tiny->new( verify_SSL => 1 );

# Fetch the post.
print_time() if $DEBUG;
print STDERR "fetching `$json_url'.\n" if $DEBUG;
my $response = get_response($json_url);

# Decode json.
print STDERR "decoding json response.\n" if $DEBUG;
my $json_data = decode_json($response->{content});

# $post contains post data
my $post = $json_data->[0]->{data}->{children}->[0]->{data};

# $comments contains comment data. We are interested in: replies,
# author, body, created_utc & permalink.
my $comments = $json_data->[1]->{data}->{children};

# Start the Org document.
print "#+", "STARTUP:content\n";

# Print the date.
my $current_date = Time::Piece->new->strftime('%+');
print "#+", "DATE: $current_date\n";
print "\n";

# Print the post title & it's link.
print "* ", "[[$post->{url}][$post->{title}]]\n";

# Add various details to :PROPERTIES:.
print ":PROPERTIES:\n";
# Include the created date & archive date in properties.
print ":CREATED_UTC: ",
    Time::Piece->strptime($post->{created_utc}, '%s')
    ->strftime('%+'), "\n";

print ":ARCHIVE_DATE: $current_date\n";
foreach my $detail (qw( subreddit created_utc author permalink
                        upvote_ratio ups downs score )) {
    print ":${detail}: =$post->{$detail}=\n"
        if scalar $post->{$detail};
}
print ":END:\n";

# Add selftext if present.
print "\n#+BEGIN_SRC markdown\n",
    # Break the text at 76 column & add 2 space before every new line.
    "  ", $lb->break($post->{selftext}) =~ s/\n/\n\ \ /gr, "\n",
    "#+END_SRC\n"
    if scalar $post->{selftext};

my (@http_calls, %counter);
$counter{print_comment_chain_call} = 0;
$counter{iterate_over_comments_call} = 0;

print_time() if $DEBUG;
print STDERR "iterating over top-level comments.\n" if $DEBUG;

# We are going to put a dot after each HTTP call. Dot on first HTTP
# call is skipped so we print this message only if user has set
# `FETCH_ALL' because that'll be the reason Draco makes multiple HTTP
# calls.
print STDERR "each dot is a HTTP call.\n"
    if $DEBUG and $ENV{FETCH_ALL};

# Iterate over top-level comments. The second argument is level
# (depth), it should be 0 for top-level comments.
iterate_over_comments($comments, 0);

# Seperate the dots with the rest by a "\n".
print STDERR "\n"
    if $DEBUG and $ENV{FETCH_ALL};

print_time() if $DEBUG;

# Print important stats.
print STDERR "\n" if $DEBUG;
print STDERR "total http calls: ",
    scalar @http_calls, "\n" if $DEBUG;
print STDERR "total print_comment_chain calls: ",
    $counter{print_comment_chain_call}, "\n" if $DEBUG;
print STDERR "total iterate_over_comments calls: ",
    $counter{iterate_over_comments_call}, "\n" if $DEBUG;

sub print_time {
    print STDERR "    ";
    print STDERR "time since [start, last log]: [", time - $start_time,
        ", ", time - $last_log, "] seconds\n";
    $last_log = time;
}

sub get_response {
    my $url = shift @_;
    # Skip a dot on first HTTP call.
    print STDERR "." if $DEBUG and scalar @http_calls > 0;
    my $response = $http->get($url);
    push @http_calls, $url;
    die "Unexpected response - $response->{status}: $response->{reason} : $url"
        unless $response->{success};
    return $response;
}

# Pass <comment id> as argument and it'll return you the json url to
# that comment thread.
sub get_comment_thread_from_id {
    my $comment_id = shift @_;

    # Reddit doesn't like this kind of url:
    #     http://<reddit>/<post_id>//<comment_id>.json
    #
    # It wants this kind of url:
    #     http://<reddit>/<post_id>/<comment_id>.json
    #
    # Notice the extra '/' in first url.
    my $json_url = $url;
    $json_url .= "/" unless substr $url, -1 eq "/";
    $json_url .= "${comment_id}.json?limit=500&sort=top";
    return $json_url;
}

# This was being used multiple times so I moved it to a subroutine.
# It'll take $comment_id & return $comments.
sub get_all_comments_from_id {
    my $comment_id = shift @_;
    my $json_url = get_comment_thread_from_id($comment_id);

    # Fetch the comment.
    my $response = get_response($json_url);

    # Decode json.
    my $json_data = decode_json($response->{content});

    # $comments contains comment data.
    my $comments = $json_data->[1]->{data}->{children};

    return $comments;
}

# First argument requires $comments & second is the level (depth).
sub iterate_over_comments {
    my $comments = shift @_;
    my $level = shift @_;

    $counter{iterate_over_comments_call}++;

    foreach my $comment ($comments->@*) {
        my $comment_data = $comment->{data};

        # There are 3 kind of comments.
        #
        # 1. normal comments (includes top-level comments & replies).
        # 2. comments hidden under "load more comments".
        # 3. comments hidden under "continue this thread".

        # We will be dealing with them in reverse order, i.e. 3rd ->
        # 2nd -> 1st.

        # This comment we are skipping is the third kind of comment,
        # i.e. comment hidden under "continue this thread".
        if ($comment->{kind} eq "more"
            and $comment_data->{id} eq "_") {
            # $comment_data->{parent_id} starts with "t1_" so we get
            # the {id} from there.
            my $comment_id = substr $comment_data->{parent_id}, 3;

            # Don't proceed unless user has set `FETCH_ALL'.
            next unless $ENV{FETCH_ALL};

            unless ( eval {
                # get $comments.
                my $comments = get_all_comments_from_id($comment_id);

                # 0th index will contain the comment we are looking for.
                my $comment_data = $comments->[0]->{data};

                # We already have printed this comment so we will just
                # look it has replies, if it does then we'll print
                # those.
                if (scalar $comment_data->{replies}) {
                    iterate_over_comments(
                        $comment_data->{replies}->{data}->{children},
                        $level + 1);
                }
                return 1;
            } ) {
                my $err = $@;
                # Keep the "\n" at the start.
                warn "\nparsing `$comment_id' failed: $err\n";
            }

            # This comment thread has been parsed, move on to the text
            # one.
            next;
        }

        # These are second kind of comments, i.e. comments hidden
        # under "load more comments". Their kind is "more" & they have
        # an id. This part is a bit complex so read the comments.
        #
        # We can get it by making another HTTP call. This is skipped
        # by default & user has to pass `FETCH_ALL' to enable it.
        if ($comment->{kind} eq "more"
                and $comment_data->{id}) {
            # Don't proceed unless user has set `FETCH_ALL'.
            next unless $ENV{FETCH_ALL};

            # The problem here is that if the thread is too large then
            # at the end reddit will include similar block in which
            # it'll put all other top-level comments in children of
            # 2nd kind.
            #
            # "kind": "more",
            # "data": {
            # "id": "gde31fk",
            # ...
            # "children": [
            # "gde31fk",
            # "gdbrnyd",
            # ...
            #
            # If the "load more comments" only hides a single thread
            # then it's the only one included in "children". Note how
            # the "id" & first element of "children" is same.
            #
            # So, reddit wants us to check for the length of this list
            # "children", if it is greater than 1 then we need to pull
            # those comments independently. If not then we just have
            # to pull the "id".

            if ($comment_data->{children}
                and scalar $comment_data->{children} < 2) {
                unless ( eval {
                    # get $comments.
                    my $comments = get_all_comments_from_id(
                        $comment_data->{id}
                    );

                    # Now this is like a normal comment chain, i.e.
                    # first kind of comment. We just have to iterate
                    # over it & pass to print_comment_chain,
                    # iterate_over_comments will handle it.
                    iterate_over_comments($comments, $level);
                    return 1;
                } ) {
                    my $err = $@;
                    # Keep the "\n" at the start.
                    warn "\nparsing `$comment_data->{id}' failed: $err\n";
                }
            } else {
                # If we reach this block then it means that multiple
                # comments are hiding under "load more comments", we
                # will make one call for each comment, this can mean a
                # lot of HTTP calls.
                foreach my $comment_id ($comment_data->{children}->@*) {
                    unless ( eval {
                        # get $comments.
                        my $comments = get_all_comments_from_id($comment_id);

                        # Now this is like a normal comment chain, i.e.
                        # first kind of comment. We just have to iterate
                        # over it & pass to print_comment_chain,
                        # iterate_over_comments will handle it.
                        iterate_over_comments($comments, $level);
                        return 1;
                    } ) {
                        my $err = $@;
                        # Keep the "\n" at the start.
                        warn "\nparsing `$comment_data->{id}' failed: $err\n";
                    }
                }
            }
            # This comment thread has been parsed, move on to the next
            # one.
            next;
        }
        # This is first kind of comment, we can pass it directly to
        # print_comment_chain.
        print_comment_chain($comment_data, $level);
    }
}

# print_comment_chain will print the whole chain of comment while
# accounting for level. It can only parse the first kind of comment,
# i.e. top-level comments & their replies. To learn about kinds of
# comments, check iterate_over_comments() subroutine.
sub print_comment_chain {
    # This was earlier called $comment & was changed to $comment_data
    # to prevent confusion because it is $comment->{data}.
    my $comment_data = shift @_;
    my $level = shift @_;

    $counter{print_comment_chain_call}++;

    print "*" x ($level + 2), " ";
    # If the author name starts & ends with "_" then Org will
    # underline it, so we put those names in a code block.
    if (substr($comment_data->{author}, 0, 1) eq "_"
        and substr($comment_data->{author}, -1) eq "_") {
        print "=$comment_data->{author}=";
    } else {
        print "$comment_data->{author}";
    }
    print " [S]" if $comment_data->{is_submitter};
    print "\n";

    # Print comment details.
    print ":PROPERTIES:\n";
    print ":CREATED_UTC: ",
        Time::Piece->strptime($comment_data->{created_utc}, '%s')
          ->strftime('%+'), "\n";
    foreach my $detail (qw( created_utc author permalink upvote_ratio
                            ups downs score edited stickied
                            controversiality )) {
        print ":${detail}: =$comment_data->{$detail}=\n"
            if scalar $comment_data->{$detail};
    }
    print ":END:\n";

    print "\n#+BEGIN_SRC markdown\n",
        # Break the text at 76 column & add 2 space before every new
        # line.
        "  ", $lb->break($comment_data->{body}) =~ s/\n/\n\ \ /gr, "\n",
        "#+END_SRC\n";

    # If the comment has replies then iterate over those too.
    if (scalar $comment_data->{replies}) {
        iterate_over_comments($comment_data->{replies}->{data}->{children},
                              $level + 1);
    }
}