From 0717d9b3e6904a4dcd5d2dc9e680cc5b21c50e51 Mon Sep 17 00:00:00 2001
From: Eugen Rochko <eugen@zeonfederated.com>
Date: Sun, 8 Oct 2017 17:34:34 +0200
Subject: Set snowflake IDs for backdated statuses (#5260)

- Rename Mastodon::TimestampIds to Mastodon::Snowflake for clarity
- Skip statuses coming from the inbox, i.e. delivered in real time
- Skip statuses that claim to be from the future
---
 lib/mastodon/snowflake.rb     | 162 ++++++++++++++++++++++++++++++++++++++++++
 lib/mastodon/timestamp_ids.rb | 131 ----------------------------------
 lib/tasks/db.rake             |   6 +-
 3 files changed, 165 insertions(+), 134 deletions(-)
 create mode 100644 lib/mastodon/snowflake.rb
 delete mode 100644 lib/mastodon/timestamp_ids.rb

(limited to 'lib')

diff --git a/lib/mastodon/snowflake.rb b/lib/mastodon/snowflake.rb
new file mode 100644
index 000000000..219e323d4
--- /dev/null
+++ b/lib/mastodon/snowflake.rb
@@ -0,0 +1,162 @@
+# frozen_string_literal: true
+
+module Mastodon::Snowflake
+  DEFAULT_REGEX = /timestamp_id\('(?<seq_prefix>\w+)'/
+
+  class Callbacks
+    def self.around_create(record)
+      now = Time.now.utc
+
+      if record.created_at.nil? || record.created_at >= now || record.created_at == record.updated_at
+        yield
+      else
+        record.id = Mastodon::Snowflake.id_at(record.created_at)
+        tries     = 0
+
+        begin
+          yield
+        rescue ActiveRecord::RecordNotUnique
+          raise if tries > 100
+
+          tries     += 1
+          record.id += rand(100)
+
+          retry
+        end
+      end
+    end
+  end
+
+  class << self
+    # Defines the timestamp_id(table_name) SQL function unless a function
+    # with that name already exists.
+    #
+    # IDs produced by the function are composed of the following:
+    # 6 bytes (48 bits) of millisecond-level timestamp
+    # 2 bytes (16 bits) of sequence data
+    #
+    # The 'sequence data' is intended to be unique within a
+    # given millisecond, yet obscure the 'serial number' of
+    # this row.
+    #
+    # To do this, we hash the following data:
+    # * Table name (required; it also names the ID sequence)
+    # * Secret salt (random, generated when the function is defined)
+    # * Timestamp (again, millisecond-level granularity)
+    #
+    # We then take the first two bytes of that value, and add
+    # the table ID sequence number (`table_name`_id_seq), keeping
+    # only the lowest two bytes of the sum. This means that even
+    # if we insert two rows at the same millisecond, they will
+    # have distinct 'sequence data' portions.
+    #
+    # If this happens, and an attacker can see both such IDs,
+    # they can determine which of the two entries was inserted
+    # first, but not the total number of entries in the table
+    # (even mod 2**16).
+    #
+    # The table name is hashed so that different tables derive
+    # separate sequence bases, and rows inserted in the same
+    # millisecond in different tables do not reveal each other's
+    # sequence numbers. The salt is hashed so that external users
+    # cannot derive the sequence base from the timestamp and table
+    # name, which would let them compute the sequence number.
+    def define_timestamp_id
+      return if already_defined?
+
+      connection.execute(<<~SQL)
+        CREATE OR REPLACE FUNCTION timestamp_id(table_name text)
+        RETURNS bigint AS
+        $$
+          DECLARE
+            time_part bigint;
+            sequence_base bigint;
+            tail bigint;
+          BEGIN
+            time_part := (
+              -- Get the time in milliseconds
+              ((date_part('epoch', now()) * 1000))::bigint
+              -- And shift it over two bytes
+              << 16);
+
+            sequence_base := (
+              'x' ||
+              -- Take the first two bytes (four hex characters)
+              substr(
+                -- Of the MD5 hash of the data we documented
+                md5(table_name ||
+                  '#{SecureRandom.hex(16)}' ||
+                  time_part::text
+                ),
+                1, 4
+              )
+            -- And turn it into a bigint
+            )::bit(16)::bigint;
+
+            -- Finally, add our sequence number to our base, and chop
+            -- it to the last two bytes
+            tail := (
+              (sequence_base + nextval(table_name || '_id_seq'))
+              & 65535);
+
+            -- Return the time part and the sequence part. OR appears
+            -- faster here than addition, but they're equivalent:
+            -- time_part has no trailing two bytes, and tail is only
+            -- the last two bytes.
+            RETURN time_part | tail;
+          END
+        $$ LANGUAGE plpgsql VOLATILE;
+      SQL
+    end
+
+    def ensure_id_sequences_exist
+      # Find tables using timestamp IDs.
+      connection.tables.each do |table|
+        # We're only concerned with "id" columns.
+        next unless (id_col = connection.columns(table).find { |col| col.name == 'id' })
+
+        # And only those that are using timestamp_id.
+        next unless (data = DEFAULT_REGEX.match(id_col.default_function))
+
+        seq_name = data[:seq_prefix] + '_id_seq'
+
+        # If we were on Postgres 9.5+, we could do CREATE SEQUENCE IF
+        # NOT EXISTS, but we can't depend on that. Instead, catch the
+        # possible exception and ignore it.
+        # Note that seq_name isn't a column name, but it's a
+        # relation, like a column, and follows the same quoting rules
+        # in Postgres.
+        connection.execute(<<~SQL)
+          DO $$
+            BEGIN
+              CREATE SEQUENCE #{connection.quote_column_name(seq_name)};
+            EXCEPTION WHEN duplicate_table THEN
+              -- Do nothing, we have the sequence already.
+            END
+          $$ LANGUAGE plpgsql;
+        SQL
+      end
+    end
+
+    def id_at(timestamp)
+      id  = timestamp.to_i * 1000 + rand(1000)
+      id  = id << 16
+      id += rand(2**16)
+      id
+    end
+
+    private
+
+    def already_defined?
+      connection.execute(<<~SQL).values.first.first
+        SELECT EXISTS(
+          SELECT * FROM pg_proc WHERE proname = 'timestamp_id'
+        );
+      SQL
+    end
+
+    def connection
+      ActiveRecord::Base.connection
+    end
+  end
+end
diff --git a/lib/mastodon/timestamp_ids.rb b/lib/mastodon/timestamp_ids.rb
deleted file mode 100644
index 3b048a50c..000000000
--- a/lib/mastodon/timestamp_ids.rb
+++ /dev/null
@@ -1,131 +0,0 @@
-# frozen_string_literal: true
-
-module Mastodon::TimestampIds
-  DEFAULT_REGEX = /timestamp_id\('(?<seq_prefix>\w+)'/
-
-  class << self
-    # Our ID will be composed of the following:
-    # 6 bytes (48 bits) of millisecond-level timestamp
-    # 2 bytes (16 bits) of sequence data
-    #
-    # The 'sequence data' is intended to be unique within a
-    # given millisecond, yet obscure the 'serial number' of
-    # this row.
-    #
-    # To do this, we hash the following data:
-    # * Table name (if provided, skipped if not)
-    # * Secret salt (should not be guessable)
-    # * Timestamp (again, millisecond-level granularity)
-    #
-    # We then take the first two bytes of that value, and add
-    # the lowest two bytes of the table ID sequence number
-    # (`table_name`_id_seq). This means that even if we insert
-    # two rows at the same millisecond, they will have
-    # distinct 'sequence data' portions.
-    #
-    # If this happens, and an attacker can see both such IDs,
-    # they can determine which of the two entries was inserted
-    # first, but not the total number of entries in the table
-    # (even mod 2**16).
-    #
-    # The table name is included in the hash to ensure that
-    # different tables derive separate sequence bases so rows
-    # inserted in the same millisecond in different tables do
-    # not reveal the table ID sequence number for one another.
-    #
-    # The secret salt is included in the hash to ensure that
-    # external users cannot derive the sequence base given the
-    # timestamp and table name, which would allow them to
-    # compute the table ID sequence number.
-    def define_timestamp_id
-      return if already_defined?
-
-      connection.execute(<<~SQL)
-        CREATE OR REPLACE FUNCTION timestamp_id(table_name text)
-        RETURNS bigint AS
-        $$
-          DECLARE
-            time_part bigint;
-            sequence_base bigint;
-            tail bigint;
-          BEGIN
-            time_part := (
-              -- Get the time in milliseconds
-              ((date_part('epoch', now()) * 1000))::bigint
-              -- And shift it over two bytes
-              << 16);
-
-            sequence_base := (
-              'x' ||
-              -- Take the first two bytes (four hex characters)
-              substr(
-                -- Of the MD5 hash of the data we documented
-                md5(table_name ||
-                  '#{SecureRandom.hex(16)}' ||
-                  time_part::text
-                ),
-                1, 4
-              )
-            -- And turn it into a bigint
-            )::bit(16)::bigint;
-
-            -- Finally, add our sequence number to our base, and chop
-            -- it to the last two bytes
-            tail := (
-              (sequence_base + nextval(table_name || '_id_seq'))
-              & 65535);
-
-            -- Return the time part and the sequence part. OR appears
-            -- faster here than addition, but they're equivalent:
-            -- time_part has no trailing two bytes, and tail is only
-            -- the last two bytes.
-            RETURN time_part | tail;
-          END
-        $$ LANGUAGE plpgsql VOLATILE;
-      SQL
-    end
-
-    def ensure_id_sequences_exist
-      # Find tables using timestamp IDs.
-      connection.tables.each do |table|
-        # We're only concerned with "id" columns.
-        next unless (id_col = connection.columns(table).find { |col| col.name == 'id' })
-
-        # And only those that are using timestamp_id.
-        next unless (data = DEFAULT_REGEX.match(id_col.default_function))
-
-        seq_name = data[:seq_prefix] + '_id_seq'
-
-        # If we were on Postgres 9.5+, we could do CREATE SEQUENCE IF
-        # NOT EXISTS, but we can't depend on that. Instead, catch the
-        # possible exception and ignore it.
-        # Note that seq_name isn't a column name, but it's a
-        # relation, like a column, and follows the same quoting rules
-        # in Postgres.
-        connection.execute(<<~SQL)
-          DO $$
-            BEGIN
-              CREATE SEQUENCE #{connection.quote_column_name(seq_name)};
-            EXCEPTION WHEN duplicate_table THEN
-              -- Do nothing, we have the sequence already.
-            END
-          $$ LANGUAGE plpgsql;
-        SQL
-      end
-    end
-
-    private
-
-    def already_defined?
-      connection.execute(<<~SQL).values.first.first
-        SELECT EXISTS(
-          SELECT * FROM pg_proc WHERE proname = 'timestamp_id'
-        );
-      SQL
-    end
-
-    def connection
-      ActiveRecord::Base.connection
-    end
-  end
-end
diff --git a/lib/tasks/db.rake b/lib/tasks/db.rake
index 6af6bb6fb..32039c31d 100644
--- a/lib/tasks/db.rake
+++ b/lib/tasks/db.rake
@@ -1,6 +1,6 @@
 # frozen_string_literal: true
 
-require Rails.root.join('lib', 'mastodon', 'timestamp_ids')
+require_relative '../mastodon/snowflake'
 
 def each_schema_load_environment
   # If we're in development, also run this for the test environment.
@@ -63,13 +63,13 @@ namespace :db do
 
   task :define_timestamp_id do
     each_schema_load_environment do
-      Mastodon::TimestampIds.define_timestamp_id
+      Mastodon::Snowflake.define_timestamp_id
     end
   end
 
   task :ensure_id_sequences_exist do
     each_schema_load_environment do
-      Mastodon::TimestampIds.ensure_id_sequences_exist
+      Mastodon::Snowflake.ensure_id_sequences_exist
     end
   end
 end
-- 
cgit