2 files changed, 182 insertions, 0 deletions
diff --git a/lib/mastodon/timestamp_ids.rb b/lib/mastodon/timestamp_ids.rb
new file mode 100644
index 000000000..d49b5c1b5
--- /dev/null
+++ b/lib/mastodon/timestamp_ids.rb
@@ -0,0 +1,126 @@
+# frozen_string_literal: true
+
+module Mastodon
+  module TimestampIds
+    def self.define_timestamp_id
+      conn = ActiveRecord::Base.connection
+
+      # Make sure we don't already have a `timestamp_id` function.
+      unless conn.execute(<<~SQL).values.first.first
+        SELECT EXISTS(
+          SELECT * FROM pg_proc WHERE proname = 'timestamp_id'
+        );
+      SQL
+        # The function doesn't exist, so we'll define it.
+        conn.execute(<<~SQL)
+          CREATE OR REPLACE FUNCTION timestamp_id(table_name text)
+          RETURNS bigint AS
+          $$
+            DECLARE
+              time_part bigint;
+              sequence_base bigint;
+              tail bigint;
+            BEGIN
+              -- Our ID will be composed of the following:
+              -- 6 bytes (48 bits) of millisecond-level timestamp
+              -- 2 bytes (16 bits) of sequence data
+
+              -- The 'sequence data' is intended to be unique within a
+              -- given millisecond, yet obscure the 'serial number' of
+              -- this row.
+
+              -- To do this, we hash the following data:
+              -- * Table name (if provided, skipped if not)
+              -- * Secret salt (should not be guessable)
+              -- * Timestamp (again, millisecond-level granularity)
+
+              -- We then take the first two bytes of that value, and add
+              -- the lowest two bytes of the table ID sequence number
+              -- (`table_name`_id_seq). This means that even if we insert
+              -- two rows at the same millisecond, they will have
+              -- distinct 'sequence data' portions.
+
+              -- If this happens, and an attacker can see both such IDs,
+              -- they can determine which of the two entries was inserted
+              -- first, but not the total number of entries in the table
+              -- (even mod 2**16).
+
+              -- The table name is included in the hash to ensure that
+              -- different tables derive separate sequence bases so rows
+              -- inserted in the same millisecond in different tables do
+              -- not reveal the table ID sequence number for one another.
+
+              -- The secret salt is included in the hash to ensure that
+              -- external users cannot derive the sequence base given the
+              -- timestamp and table name, which would allow them to
+              -- compute the table ID sequence number.
+
+              time_part := (
+                -- Get the time in milliseconds
+                ((date_part('epoch', now()) * 1000))::bigint
+                -- And shift it over two bytes
+                << 16);
+
+              sequence_base := (
+                'x' ||
+                -- Take the first two bytes (four hex characters)
+                substr(
+                  -- Of the MD5 hash of the data we documented
+                  md5(table_name ||
+                    '#{SecureRandom.hex(16)}' ||
+                    time_part::text
+                  ),
+                  1, 4
+                )
+              -- And turn it into a bigint
+              )::bit(16)::bigint;
+
+              -- Finally, add our sequence number to our base, and chop
+              -- it to the last two bytes
+              tail := (
+                (sequence_base + nextval(table_name || '_id_seq'))
+                & 65535);
+
+              -- Return the time part and the sequence part. OR appears
+              -- faster here than addition, but they're equivalent:
+              -- time_part has no trailing two bytes, and tail is only
+              -- the last two bytes.
+              RETURN time_part | tail;
+            END
+          $$ LANGUAGE plpgsql VOLATILE;
+        SQL
+      end
+    end
+
+    def self.ensure_id_sequences_exist
+      conn = ActiveRecord::Base.connection
+
+      # Find tables using timestamp IDs.
+      default_regex = /timestamp_id\('(?<seq_prefix>\w+)'/
+      conn.tables.each do |table|
+        # We're only concerned with "id" columns.
+        next unless (id_col = conn.columns(table).find { |col| col.name == 'id' })
+
+        # And only those that are using timestamp_id.
+        next unless (data = default_regex.match(id_col.default_function))
+
+        seq_name = data[:seq_prefix] + '_id_seq'
+        # If we were on Postgres 9.5+, we could do CREATE SEQUENCE IF
+        # NOT EXISTS, but we can't depend on that. Instead, catch the
+        # possible exception and ignore it.
+        # Note that seq_name isn't a column name, but it's a
+        # relation, like a column, and follows the same quoting rules
+        # in Postgres.
+        conn.execute(<<~SQL)
+          DO $$
+            BEGIN
+              CREATE SEQUENCE #{conn.quote_column_name(seq_name)};
+            EXCEPTION WHEN duplicate_table THEN
+              -- Do nothing, we have the sequence already.
+            END
+          $$ LANGUAGE plpgsql;
+        SQL
+      end
+    end
+  end
+end
diff --git a/lib/tasks/db.rake b/lib/tasks/db.rake
index 7a055bf25..66468d999 100644
--- a/lib/tasks/db.rake
+++ b/lib/tasks/db.rake
@@ -1,5 +1,36 @@
 # frozen_string_literal: true
 
+require Rails.root.join('lib', 'mastodon', 'timestamp_ids')
+
+def each_schema_load_environment
+  # If we're in development, also run this for the test environment.
+  # This is a somewhat hacky way to do this, so here's why:
+  # 1. We have to define this before we load the schema, or we won't
+  #    have a timestamp_id function when we get to it in the schema.
+  # 2. db:setup calls db:schema:load_if_ruby, which calls
+  #    db:schema:load, which we define above as having a prerequisite
+  #    of this task.
+  # 3. db:schema:load ends up running
+  #    ActiveRecord::Tasks::DatabaseTasks.load_schema_current, which
+  #    calls a private method `each_current_configuration`, which
+  #    explicitly also does the loading for the `test` environment
+  #    if the current environment is `development`, so we end up
+  #    needing to do the same, and we can't even use the same method
+  #    to do it.
+
+  if Rails.env == 'development'
+    test_conf = ActiveRecord::Base.configurations['test']
+    if test_conf['database']&.present?
+      ActiveRecord::Base.establish_connection(:test)
+      yield
+
+      ActiveRecord::Base.establish_connection(Rails.env.to_sym)
+    end
+  end
+
+  yield
+end
+
 namespace :db do
   namespace :migrate do
     desc 'Setup the db or migrate depending on state of db'
@@ -16,4 +47,29 @@ namespace :db do
       end
     end
   end
+
+  # Before we load the schema, define the timestamp_id function.
+  # Idiomatically, we might do this in a migration, but then it
+  # wouldn't end up in schema.rb, so we'd need to figure out a way to
+  # get it in before doing db:setup as well. This is simpler, and
+  # ensures it's always in place.
+  Rake::Task['db:schema:load'].enhance ['db:define_timestamp_id']
+
+  # After we load the schema, make sure we have sequences for each
+  # table using timestamp IDs.
+  Rake::Task['db:schema:load'].enhance do
+    Rake::Task['db:ensure_id_sequences_exist'].invoke
+  end
+
+  task :define_timestamp_id do
+    each_schema_load_environment do
+      Mastodon::TimestampIds.define_timestamp_id
+    end
+  end
+
+  task :ensure_id_sequences_exist do
+    each_schema_load_environment do
+      Mastodon::TimestampIds.ensure_id_sequences_exist
+    end
+  end
 end