about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Kartik K. Agaram <vc@akkartik.com> 2021-08-10 05:09:19 -0700
committer Kartik K. Agaram <vc@akkartik.com> 2021-08-10 05:14:59 -0700
commit 74dad4c484df86c03037deaaf683fd6d86e05119 (patch)
tree 3ea67f94e29d210c606dad296f6de7f11922bd82
parent 49352e16164e34a329193cc49d5b121fbbd7b529 (diff)
download mu-74dad4c484df86c03037deaaf683fd6d86e05119.tar.gz
slack: emit comment parent indices in converter
They're easier to process when loading the data disk.

In the process we lose a few more items because they're comments to
items we were dropping earlier.
-rw-r--r-- browse_slack/convert_slack.py 23
1 file changed, 15 insertions, 8 deletions
diff --git a/browse_slack/convert_slack.py b/browse_slack/convert_slack.py
index a19717cc..976c223d 100644
--- a/browse_slack/convert_slack.py
+++ b/browse_slack/convert_slack.py
@@ -20,8 +20,8 @@
 #   cd ../..  # go back to parent of images/
 #   dd if=/dev/zero of=data.img count=201600  # 100MB
 #   python path/to/convert_slack.py |dd of=data.img conv=notrunc
-# Currently this process yields errors for ~70 items on the Future of Software
-# group. We fail to load those.
+# Currently this process yields errors for ~300 items (~70 items and their comments)
+# on the Future of Software group (https://futureofcoding.org/community). We fail to load those.
 #
 # Notes on input format:
 #   Redundant 'type' field that's always 'message'. Probably an "enterprise" feature.
@@ -50,16 +50,23 @@ with open('users.json') as f:
 def by(item):
     return user_idx[item['user']]
 
+item_idx = {}
+def parent(item):
+    if 'thread_ts' in item and item['thread_ts'] != item['ts']:
+        # comment
+        return item_idx[item['thread_ts']]
+    else:
+        return -1
+
+idx = 0
 for channel in json.load(open('channels.json')):
     for filename in sorted(listdir(channel['name'])):
         with open(join(channel['name'], filename)) as f:
             for item in json.load(f):
                 try:
-                    if 'thread_ts' in item:
-                        # comment
-                        print(f"({json.dumps(item['ts'])} {json.dumps(item['thread_ts'])} {json.dumps(channel['name'])} {by(item)} {json.dumps(item['text'])})")
-                    else:
-                        # top-level post
-                        print(f"({json.dumps(item['ts'])} {json.dumps(               '')} {json.dumps(channel['name'])} {by(item)} {json.dumps(item['text'])})")
+#?                     stderr.write(repr(item)+'\n')
+                    print(f"({json.dumps(item['ts'])} {parent(item)} {json.dumps(channel['name'])} {by(item)} {json.dumps(item['text'])})")
+                    item_idx[item['ts']] = idx
+                    idx += 1
                 except KeyError:
                     stderr.write(repr(item)+'\n')