From 0717d9b3e6904a4dcd5d2dc9e680cc5b21c50e51 Mon Sep 17 00:00:00 2001 From: Eugen Rochko Date: Sun, 8 Oct 2017 17:34:34 +0200 Subject: Set snowflake IDs for backdated statuses (#5260) - Rename Mastodon::TimestampIds into Mastodon::Snowflake for clarity - Skip for statuses coming from inbox, aka delivered in real-time - Skip for statuses that claim to be from the future --- lib/mastodon/snowflake.rb | 162 ++++++++++++++++++++++++++++++++++++++++++ lib/mastodon/timestamp_ids.rb | 131 ---------------------------------- lib/tasks/db.rake | 6 +- 3 files changed, 165 insertions(+), 134 deletions(-) create mode 100644 lib/mastodon/snowflake.rb delete mode 100644 lib/mastodon/timestamp_ids.rb (limited to 'lib') diff --git a/lib/mastodon/snowflake.rb b/lib/mastodon/snowflake.rb new file mode 100644 index 000000000..219e323d4 --- /dev/null +++ b/lib/mastodon/snowflake.rb @@ -0,0 +1,162 @@ +# frozen_string_literal: true + +module Mastodon::Snowflake + DEFAULT_REGEX = /timestamp_id\('(?\w+)'/ + + class Callbacks + def self.around_create(record) + now = Time.now.utc + + if record.created_at.nil? || record.created_at >= now || record.created_at == record.updated_at + yield + else + record.id = Mastodon::Snowflake.id_at(record.created_at) + tries = 0 + + begin + yield + rescue ActiveRecord::RecordNotUnique + raise if tries > 100 + + tries += 1 + record.id += rand(100) + + retry + end + end + end + end + + class << self + # Our ID will be composed of the following: + # 6 bytes (48 bits) of millisecond-level timestamp + # 2 bytes (16 bits) of sequence data + # + # The 'sequence data' is intended to be unique within a + # given millisecond, yet obscure the 'serial number' of + # this row. + # + # To do this, we hash the following data: + # * Table name (if provided, skipped if not) + # * Secret salt (should not be guessable) + # * Timestamp (again, millisecond-level granularity) + # + # We then take the first two bytes of that value, and add + # the lowest two bytes of the table ID sequence number + # (`table_name`_id_seq). This means that even if we insert + # two rows at the same millisecond, they will have + # distinct 'sequence data' portions. + # + # If this happens, and an attacker can see both such IDs, + # they can determine which of the two entries was inserted + # first, but not the total number of entries in the table + # (even mod 2**16). + # + # The table name is included in the hash to ensure that + # different tables derive separate sequence bases so rows + # inserted in the same millisecond in different tables do + # not reveal the table ID sequence number for one another. + # + # The secret salt is included in the hash to ensure that + # external users cannot derive the sequence base given the + # timestamp and table name, which would allow them to + # compute the table ID sequence number. + def define_timestamp_id + return if already_defined? + + connection.execute(<<~SQL) + CREATE OR REPLACE FUNCTION timestamp_id(table_name text) + RETURNS bigint AS + $$ + DECLARE + time_part bigint; + sequence_base bigint; + tail bigint; + BEGIN + time_part := ( + -- Get the time in milliseconds + ((date_part('epoch', now()) * 1000))::bigint + -- And shift it over two bytes + << 16); + + sequence_base := ( + 'x' || + -- Take the first two bytes (four hex characters) + substr( + -- Of the MD5 hash of the data we documented + md5(table_name || + '#{SecureRandom.hex(16)}' || + time_part::text + ), + 1, 4 + ) + -- And turn it into a bigint + )::bit(16)::bigint; + + -- Finally, add our sequence number to our base, and chop + -- it to the last two bytes + tail := ( + (sequence_base + nextval(table_name || '_id_seq')) + & 65535); + + -- Return the time part and the sequence part. OR appears + -- faster here than addition, but they're equivalent: + -- time_part has no trailing two bytes, and tail is only + -- the last two bytes. + RETURN time_part | tail; + END + $$ LANGUAGE plpgsql VOLATILE; + SQL + end + + def ensure_id_sequences_exist + # Find tables using timestamp IDs. + connection.tables.each do |table| + # We're only concerned with "id" columns. + next unless (id_col = connection.columns(table).find { |col| col.name == 'id' }) + + # And only those that are using timestamp_id. + next unless (data = DEFAULT_REGEX.match(id_col.default_function)) + + seq_name = data[:seq_prefix] + '_id_seq' + + # If we were on Postgres 9.5+, we could do CREATE SEQUENCE IF + # NOT EXISTS, but we can't depend on that. Instead, catch the + # possible exception and ignore it. + # Note that seq_name isn't a column name, but it's a + # relation, like a column, and follows the same quoting rules + # in Postgres. + connection.execute(<<~SQL) + DO $$ + BEGIN + CREATE SEQUENCE #{connection.quote_column_name(seq_name)}; + EXCEPTION WHEN duplicate_table THEN + -- Do nothing, we have the sequence already. + END + $$ LANGUAGE plpgsql; + SQL + end + end + + def id_at(timestamp) + id = timestamp.to_i * 1000 + rand(1000) + id = id << 16 + id += rand(2**16) + id + end + + private + + def already_defined? + connection.execute(<<~SQL).values.first.first + SELECT EXISTS( + SELECT * FROM pg_proc WHERE proname = 'timestamp_id' + ); + SQL + end + + def connection + ActiveRecord::Base.connection + end + end +end diff --git a/lib/mastodon/timestamp_ids.rb b/lib/mastodon/timestamp_ids.rb deleted file mode 100644 index 3b048a50c..000000000 --- a/lib/mastodon/timestamp_ids.rb +++ /dev/null @@ -1,131 +0,0 @@ -# frozen_string_literal: true - -module Mastodon::TimestampIds - DEFAULT_REGEX = /timestamp_id\('(?\w+)'/ - - class << self - # Our ID will be composed of the following: - # 6 bytes (48 bits) of millisecond-level timestamp - # 2 bytes (16 bits) of sequence data - # - # The 'sequence data' is intended to be unique within a - # given millisecond, yet obscure the 'serial number' of - # this row. - # - # To do this, we hash the following data: - # * Table name (if provided, skipped if not) - # * Secret salt (should not be guessable) - # * Timestamp (again, millisecond-level granularity) - # - # We then take the first two bytes of that value, and add - # the lowest two bytes of the table ID sequence number - # (`table_name`_id_seq). This means that even if we insert - # two rows at the same millisecond, they will have - # distinct 'sequence data' portions. - # - # If this happens, and an attacker can see both such IDs, - # they can determine which of the two entries was inserted - # first, but not the total number of entries in the table - # (even mod 2**16). - # - # The table name is included in the hash to ensure that - # different tables derive separate sequence bases so rows - # inserted in the same millisecond in different tables do - # not reveal the table ID sequence number for one another. - # - # The secret salt is included in the hash to ensure that - # external users cannot derive the sequence base given the - # timestamp and table name, which would allow them to - # compute the table ID sequence number. - def define_timestamp_id - return if already_defined? - - connection.execute(<<~SQL) - CREATE OR REPLACE FUNCTION timestamp_id(table_name text) - RETURNS bigint AS - $$ - DECLARE - time_part bigint; - sequence_base bigint; - tail bigint; - BEGIN - time_part := ( - -- Get the time in milliseconds - ((date_part('epoch', now()) * 1000))::bigint - -- And shift it over two bytes - << 16); - - sequence_base := ( - 'x' || - -- Take the first two bytes (four hex characters) - substr( - -- Of the MD5 hash of the data we documented - md5(table_name || - '#{SecureRandom.hex(16)}' || - time_part::text - ), - 1, 4 - ) - -- And turn it into a bigint - )::bit(16)::bigint; - - -- Finally, add our sequence number to our base, and chop - -- it to the last two bytes - tail := ( - (sequence_base + nextval(table_name || '_id_seq')) - & 65535); - - -- Return the time part and the sequence part. OR appears - -- faster here than addition, but they're equivalent: - -- time_part has no trailing two bytes, and tail is only - -- the last two bytes. - RETURN time_part | tail; - END - $$ LANGUAGE plpgsql VOLATILE; - SQL - end - - def ensure_id_sequences_exist - # Find tables using timestamp IDs. - connection.tables.each do |table| - # We're only concerned with "id" columns. - next unless (id_col = connection.columns(table).find { |col| col.name == 'id' }) - - # And only those that are using timestamp_id. - next unless (data = DEFAULT_REGEX.match(id_col.default_function)) - - seq_name = data[:seq_prefix] + '_id_seq' - - # If we were on Postgres 9.5+, we could do CREATE SEQUENCE IF - # NOT EXISTS, but we can't depend on that. Instead, catch the - # possible exception and ignore it. - # Note that seq_name isn't a column name, but it's a - # relation, like a column, and follows the same quoting rules - # in Postgres. - connection.execute(<<~SQL) - DO $$ - BEGIN - CREATE SEQUENCE #{connection.quote_column_name(seq_name)}; - EXCEPTION WHEN duplicate_table THEN - -- Do nothing, we have the sequence already. - END - $$ LANGUAGE plpgsql; - SQL - end - end - - private - - def already_defined? - connection.execute(<<~SQL).values.first.first - SELECT EXISTS( - SELECT * FROM pg_proc WHERE proname = 'timestamp_id' - ); - SQL - end - - def connection - ActiveRecord::Base.connection - end - end -end diff --git a/lib/tasks/db.rake b/lib/tasks/db.rake index 6af6bb6fb..32039c31d 100644 --- a/lib/tasks/db.rake +++ b/lib/tasks/db.rake @@ -1,6 +1,6 @@ # frozen_string_literal: true -require Rails.root.join('lib', 'mastodon', 'timestamp_ids') +require_relative '../mastodon/snowflake' def each_schema_load_environment # If we're in development, also run this for the test environment. @@ -63,13 +63,13 @@ namespace :db do task :define_timestamp_id do each_schema_load_environment do - Mastodon::TimestampIds.define_timestamp_id + Mastodon::Snowflake.define_timestamp_id end end task :ensure_id_sequences_exist do each_schema_load_environment do - Mastodon::TimestampIds.ensure_id_sequences_exist + Mastodon::Snowflake.ensure_id_sequences_exist end end end -- cgit