author Claire <claire.github-309c@sitedethib.com> 2021-11-26 22:53:55 +0100
committer Claire <claire.github-309c@sitedethib.com> 2021-11-26 22:53:55 +0100
commit 9b861d56a9646e75caf1f7d60fa9eade566d3740 (patch)
tree a45059ea258e035c6f63e966973629b9cd0766b8 /lib
parent 97151840b02499a0cec1360907a6b86a1df02b3b (diff)
parent 1c826471e7d964f0fdb2dc2b89dcd5a19c017538 (diff)
Merge branch 'main' into glitch-soc/merge-upstream
- `.env.production.sample`:
  Copied upstream changes.
- `app/controllers/settings/identity_proofs_controller.rb`:
  Minor conflict due to glitch-soc's extra “enable_keybase” setting.
  Upstream removed keybase support altogether, so did the same.
- `app/controllers/well_known/keybase_proof_config_controller.rb`:
  Minor conflict due to glitch-soc's extra “enable_keybase” setting.
  Upstream removed keybase support altogether, so did the same.
- `lib/mastodon/statuses_cli.rb`:
  Minor conflict due to an optimization that wasn't shared between
  the two versions. Copied upstream's version.
2 files changed, 76 insertions, 30 deletions
diff --git a/lib/mastodon/search_cli.rb b/lib/mastodon/search_cli.rb
index 2d1ca1c05..6ad9d7b6a 100644
--- a/lib/mastodon/search_cli.rb
+++ b/lib/mastodon/search_cli.rb
@@ -17,10 +17,11 @@ module Mastodon
     option :concurrency, type: :numeric, default: 2, aliases: [:c], desc: 'Workload will be split between this number of threads'
+    option :batch_size, type: :numeric, default: 1_000, aliases: [:b], desc: 'Number of records in each batch'
     option :only, type: :array, enum: %w(accounts tags statuses), desc: 'Only process these indices'
-    desc 'deploy', 'Create or upgrade ElasticSearch indices and populate them'
+    desc 'deploy', 'Create or upgrade Elasticsearch indices and populate them'
     long_desc <<~LONG_DESC
-      If ElasticSearch is empty, this command will create the necessary indices
+      If Elasticsearch is empty, this command will create the necessary indices
       and then import data from the database into those indices.
       This command will also upgrade indices if the underlying schema has been
@@ -35,6 +36,11 @@ module Mastodon
+      if options[:batch_size] < 1
+        say('Cannot run with this batch_size setting, must be at least 1', :red)
+        exit(1)
+      end
       indices = begin
         if options[:only]
           options[:only].map { |str| "#{str.camelize}Index".constantize }
@@ -73,7 +79,7 @@ module Mastodon
       # is uneconomical. So we only ever add.
       indices.each do |index|
         progress.title = "Importing #{index} "
-        batch_size     = 1_000
+        batch_size     = options[:batch_size]
         slice_size     = (batch_size / options[:concurrency]).ceil
         index.adapter.default_scope.reorder(nil).find_in_batches(batch_size: batch_size) do |batch|
diff --git a/lib/mastodon/statuses_cli.rb b/lib/mastodon/statuses_cli.rb
index 8a18a3b2f..f841529e0 100644
--- a/lib/mastodon/statuses_cli.rb
+++ b/lib/mastodon/statuses_cli.rb
@@ -6,6 +6,7 @@ require_relative 'cli_helper'
 module Mastodon
   class StatusesCLI < Thor
+    include CLIHelper
     include ActionView::Helpers::NumberHelper
     def self.exit_on_failure?
@@ -15,6 +16,8 @@ module Mastodon
     option :days, type: :numeric, default: 90
     option :clean_followed, type: :boolean
     option :skip_media_remove, type: :boolean
+    option :vacuum, type: :boolean, default: false, desc: 'Reduce the file size and update the statistics. This option locks the table for a long time, so run it offline'
+    option :batch_size, type: :numeric, default: 1_000, aliases: [:b], desc: 'Number of records in each batch'
     desc 'remove', 'Remove unreferenced statuses'
     long_desc <<~LONG_DESC
       Remove statuses that are not referenced by local user activity, such as
@@ -25,52 +28,89 @@ module Mastodon
       indices before commencing, and removes them afterward.
     def remove
+      if options[:batch_size] < 1
+        say('Cannot run with this batch_size setting, must be at least 1', :red)
+        exit(1)
+      end
       say('Creating temporary database indices...')
-      ActiveRecord::Base.connection.add_index(:accounts, :id, name: :index_accounts_local, where: 'domain is null', algorithm: :concurrently) unless ActiveRecord::Base.connection.index_name_exists?(:accounts, :index_accounts_local)
-      ActiveRecord::Base.connection.add_index(:status_pins, :status_id, name: :index_status_pins_status_id, algorithm: :concurrently) unless ActiveRecord::Base.connection.index_name_exists?(:status_pins, :index_status_pins_status_id)
-      ActiveRecord::Base.connection.add_index(:media_attachments, :remote_url, name: :index_media_attachments_remote_url, where: 'remote_url is not null', algorithm: :concurrently) unless ActiveRecord::Base.connection.index_name_exists?(:media_attachments, :index_media_attachments_remote_url)
+      ActiveRecord::Base.connection.add_index(:accounts, :id, name: :index_accounts_local, where: 'domain is null', algorithm: :concurrently, if_not_exists: true)
+      ActiveRecord::Base.connection.add_index(:status_pins, :status_id, name: :index_status_pins_status_id, algorithm: :concurrently, if_not_exists: true)
+      ActiveRecord::Base.connection.add_index(:media_attachments, :remote_url, name: :index_media_attachments_remote_url, where: 'remote_url is not null', algorithm: :concurrently, if_not_exists: true)
       max_id   = Mastodon::Snowflake.id_at(options[:days].days.ago)
       start_at = Time.now.to_f
+      say('Extract the deletion target... This might take a while...')
+      ActiveRecord::Base.connection.create_table('statuses_to_be_deleted', temporary: true)
+      # Skip accounts followed by local accounts
+      clean_followed_sql = 'AND NOT EXISTS (SELECT 1 FROM follows WHERE statuses.account_id = follows.target_account_id)' unless options[:clean_followed]
+      ActiveRecord::Base.connection.exec_insert(<<-SQL.squish, 'SQL', [[nil, max_id]])
+        INSERT INTO statuses_to_be_deleted (id)
+        SELECT statuses.id FROM statuses WHERE deleted_at IS NULL AND NOT local AND uri IS NOT NULL AND (id < $1)
+        AND NOT EXISTS (SELECT 1 FROM statuses AS statuses1 WHERE statuses.id = statuses1.in_reply_to_id)
+        AND NOT EXISTS (SELECT 1 FROM statuses AS statuses1 WHERE statuses1.id = statuses.reblog_of_id AND (statuses1.uri IS NULL OR statuses1.local))
+        AND NOT EXISTS (SELECT 1 FROM statuses AS statuses1 WHERE statuses.id = statuses1.reblog_of_id AND (statuses1.uri IS NULL OR statuses1.local OR statuses1.id >= $1))
+        AND NOT EXISTS (SELECT 1 FROM status_pins WHERE statuses.id = status_id)
+        AND NOT EXISTS (SELECT 1 FROM mentions WHERE statuses.id = mentions.status_id AND mentions.account_id IN (SELECT accounts.id FROM accounts WHERE domain IS NULL))
+        AND NOT EXISTS (SELECT 1 FROM favourites WHERE statuses.id = favourites.status_id AND favourites.account_id IN (SELECT accounts.id FROM accounts WHERE domain IS NULL))
+        AND NOT EXISTS (SELECT 1 FROM bookmarks WHERE statuses.id = bookmarks.status_id AND bookmarks.account_id IN (SELECT accounts.id FROM accounts WHERE domain IS NULL))
+        #{clean_followed_sql}
+      SQL
+      say('Removing temporary database indices to restore write performance...')
+      ActiveRecord::Base.connection.remove_index(:accounts, name: :index_accounts_local, if_exists: true)
+      ActiveRecord::Base.connection.remove_index(:status_pins, name: :index_status_pins_status_id, if_exists: true)
       say('Beginning removal... This might take a while...')
-      scope = Status.remote.where('id < ?', max_id)
-      # Skip reblogs of local statuses
-      scope = scope.where('reblog_of_id NOT IN (SELECT statuses1.id FROM statuses AS statuses1 WHERE statuses1.id = statuses.reblog_of_id AND (statuses1.uri IS NULL OR statuses1.local))')
-      # Skip statuses that are pinned on profiles
-      scope = scope.where('id NOT IN (SELECT status_pins.status_id FROM status_pins WHERE statuses.id = status_id)')
-      # Skip statuses that mention local accounts
-      scope = scope.where('id NOT IN (SELECT mentions.status_id FROM mentions WHERE statuses.id = mentions.status_id AND mentions.account_id IN (SELECT accounts.id FROM accounts WHERE domain IS NULL))')
-      # Skip statuses which have replies
-      scope = scope.where('id NOT IN (SELECT statuses1.in_reply_to_id FROM statuses AS statuses1 WHERE statuses.id = statuses1.in_reply_to_id)')
-      # Skip statuses reblogged by local accounts or with recent boosts
-      scope = scope.where('id NOT IN (SELECT statuses1.reblog_of_id FROM statuses AS statuses1 WHERE statuses.id = statuses1.reblog_of_id AND (statuses1.uri IS NULL OR statuses1.local OR statuses1.id >= ?))', max_id)
-      # Skip statuses favourited by local users
-      scope = scope.where('id NOT IN (SELECT favourites.status_id FROM favourites WHERE statuses.id = favourites.status_id AND favourites.account_id IN (SELECT accounts.id FROM accounts WHERE domain IS NULL))')
-      # Skip statuses bookmarked by local users
-      scope = scope.where('id NOT IN (SELECT bookmarks.status_id FROM bookmarks WHERE statuses.id = bookmarks.status_id)')
-      unless options[:clean_followed]
-        # Skip accounts followed by local accounts
-        scope = scope.where('account_id NOT IN (SELECT follows.target_account_id FROM follows WHERE statuses.account_id = follows.target_account_id)')
+      klass = Class.new(ApplicationRecord) do |c|
+        c.table_name = 'statuses_to_be_deleted'
-      scope.in_batches.delete_all
+      Object.const_set('StatusToBeDeleted', klass)
+      scope     = StatusToBeDeleted
+      processed = 0
+      removed   = 0
+      progress  = create_progress_bar(scope.count.fdiv(options[:batch_size]).ceil)
+      scope.reorder(nil).in_batches(of: options[:batch_size]) do |relation|
+        ids        = relation.pluck(:id)
+        processed += ids.count
+        removed   += Status.unscoped.where(id: ids).delete_all
+        progress.increment
+      end
+      progress.stop
+      if options[:vacuum]
+        say('Run VACUUM and ANALYZE to statuses...')
+        ActiveRecord::Base.connection.execute('VACUUM FULL ANALYZE statuses')
+      else
+        say('Run ANALYZE to statuses...')
+        ActiveRecord::Base.connection.execute('ANALYZE statuses')
+      end
       unless options[:skip_media_remove]
         say('Beginning removal of now-orphaned media attachments to free up disk space...')
-      say("Done after #{Time.now.to_f - start_at}s", :green)
+      say("Done after #{Time.now.to_f - start_at}s, removed #{removed} out of #{processed} statuses.", :green)
       say('Removing temporary database indices to restore write performance...')
-      ActiveRecord::Base.connection.remove_index(:accounts, name: :index_accounts_local) if ActiveRecord::Base.connection.index_name_exists?(:accounts, :index_accounts_local)
-      ActiveRecord::Base.connection.remove_index(:status_pins, name: :index_status_pins_status_id) if ActiveRecord::Base.connection.index_name_exists?(:status_pins, :index_status_pins_status_id)
-      ActiveRecord::Base.connection.remove_index(:media_attachments, name: :index_media_attachments_remote_url) if ActiveRecord::Base.connection.index_name_exists?(:media_attachments, :index_media_attachments_remote_url)
+      ActiveRecord::Base.connection.remove_index(:accounts, name: :index_accounts_local, if_exists: true)
+      ActiveRecord::Base.connection.remove_index(:status_pins, name: :index_status_pins_status_id, if_exists: true)
+      ActiveRecord::Base.connection.remove_index(:media_attachments, name: :index_media_attachments_remote_url, if_exists: true)