about summary refs log tree commit diff
path: root/lib
diff options
context:
space:
mode:
author Takeshi Umeda <noel.yoshiba@gmail.com> 2021-11-27 06:08:47 +0900
committer GitHub <noreply@github.com> 2021-11-26 22:08:47 +0100
commit 0ac7efdc50200020f42b2d94151ecd80144e3d8c (patch)
tree 19ac94631381e00f76e0eb348b5d0e5c41b4e065 /lib
parent ddcb9da74fb4ee1233c4c5ba1ab3d9c8edc9aba2 (diff)
Fix performance of tootctl statuses remove (#17052)
* Fix performance of tootctl statuses remove

* Fix model class
Diffstat (limited to 'lib')
-rw-r--r-- lib/mastodon/statuses_cli.rb 94
1 files changed, 67 insertions, 27 deletions
diff --git a/lib/mastodon/statuses_cli.rb b/lib/mastodon/statuses_cli.rb
index b9dccdd8a..f841529e0 100644
--- a/lib/mastodon/statuses_cli.rb
+++ b/lib/mastodon/statuses_cli.rb
@@ -6,6 +6,7 @@ require_relative 'cli_helper'
 
 module Mastodon
   class StatusesCLI < Thor
+    include CLIHelper
     include ActionView::Helpers::NumberHelper
 
     def self.exit_on_failure?
@@ -15,6 +16,8 @@ module Mastodon
     option :days, type: :numeric, default: 90
     option :clean_followed, type: :boolean
     option :skip_media_remove, type: :boolean
+    option :vacuum, type: :boolean, default: false, desc: 'Reduce the file size and update the statistics. This option locks the table for a long time, so run it offline'
+    option :batch_size, type: :numeric, default: 1_000, aliases: [:b], desc: 'Number of records in each batch'
     desc 'remove', 'Remove unreferenced statuses'
     long_desc <<~LONG_DESC
       Remove statuses that are not referenced by local user activity, such as
@@ -25,52 +28,89 @@ module Mastodon
       indices before commencing, and removes them afterward.
     LONG_DESC
     def remove
+      if options[:batch_size] < 1
+        say('Cannot run with this batch_size setting, must be at least 1', :red)
+        exit(1)
+      end
+
       say('Creating temporary database indices...')
 
-      ActiveRecord::Base.connection.add_index(:accounts, :id, name: :index_accounts_local, where: 'domain is null', algorithm: :concurrently) unless ActiveRecord::Base.connection.index_name_exists?(:accounts, :index_accounts_local)
-      ActiveRecord::Base.connection.add_index(:status_pins, :status_id, name: :index_status_pins_status_id, algorithm: :concurrently) unless ActiveRecord::Base.connection.index_name_exists?(:status_pins, :index_status_pins_status_id)
-      ActiveRecord::Base.connection.add_index(:media_attachments, :remote_url, name: :index_media_attachments_remote_url, where: 'remote_url is not null', algorithm: :concurrently) unless ActiveRecord::Base.connection.index_name_exists?(:media_attachments, :index_media_attachments_remote_url)
+      ActiveRecord::Base.connection.add_index(:accounts, :id, name: :index_accounts_local, where: 'domain is null', algorithm: :concurrently, if_not_exists: true)
+      ActiveRecord::Base.connection.add_index(:status_pins, :status_id, name: :index_status_pins_status_id, algorithm: :concurrently, if_not_exists: true)
+      ActiveRecord::Base.connection.add_index(:media_attachments, :remote_url, name: :index_media_attachments_remote_url, where: 'remote_url is not null', algorithm: :concurrently, if_not_exists: true)
 
       max_id   = Mastodon::Snowflake.id_at(options[:days].days.ago)
       start_at = Time.now.to_f
 
+      say('Extract the deletion target... This might take a while...')
+
+      ActiveRecord::Base.connection.create_table('statuses_to_be_deleted', temporary: true)
+
+      # Skip accounts followed by local accounts
+      clean_followed_sql = 'AND NOT EXISTS (SELECT 1 FROM follows WHERE statuses.account_id = follows.target_account_id)' unless options[:clean_followed]
+
+      ActiveRecord::Base.connection.exec_insert(<<-SQL.squish, 'SQL', [[nil, max_id]])
+        INSERT INTO statuses_to_be_deleted (id)
+        SELECT statuses.id FROM statuses WHERE deleted_at IS NULL AND NOT local AND uri IS NOT NULL AND (id < $1)
+        AND NOT EXISTS (SELECT 1 FROM statuses AS statuses1 WHERE statuses.id = statuses1.in_reply_to_id)
+        AND NOT EXISTS (SELECT 1 FROM statuses AS statuses1 WHERE statuses1.id = statuses.reblog_of_id AND (statuses1.uri IS NULL OR statuses1.local))
+        AND NOT EXISTS (SELECT 1 FROM statuses AS statuses1 WHERE statuses.id = statuses1.reblog_of_id AND (statuses1.uri IS NULL OR statuses1.local OR statuses1.id >= $1))
+        AND NOT EXISTS (SELECT 1 FROM status_pins WHERE statuses.id = status_id)
+        AND NOT EXISTS (SELECT 1 FROM mentions WHERE statuses.id = mentions.status_id AND mentions.account_id IN (SELECT accounts.id FROM accounts WHERE domain IS NULL))
+        AND NOT EXISTS (SELECT 1 FROM favourites WHERE statuses.id = favourites.status_id AND favourites.account_id IN (SELECT accounts.id FROM accounts WHERE domain IS NULL))
+        AND NOT EXISTS (SELECT 1 FROM bookmarks WHERE statuses.id = bookmarks.status_id AND bookmarks.account_id IN (SELECT accounts.id FROM accounts WHERE domain IS NULL))
+        #{clean_followed_sql}
+      SQL
+
+      say('Removing temporary database indices to restore write performance...')
+
+      ActiveRecord::Base.connection.remove_index(:accounts, name: :index_accounts_local, if_exists: true)
+      ActiveRecord::Base.connection.remove_index(:status_pins, name: :index_status_pins_status_id, if_exists: true)
+
       say('Beginning removal... This might take a while...')
 
-      scope = Status.remote.where('id < ?', max_id)
-      # Skip reblogs of local statuses
-      scope = scope.where('reblog_of_id NOT IN (SELECT statuses1.id FROM statuses AS statuses1 WHERE statuses1.id = statuses.reblog_of_id AND (statuses1.uri IS NULL OR statuses1.local))')
-      # Skip statuses that are pinned on profiles
-      scope = scope.where('id NOT IN (SELECT status_pins.status_id FROM status_pins WHERE statuses.id = status_id)')
-      # Skip statuses that mention local accounts
-      scope = scope.where('id NOT IN (SELECT mentions.status_id FROM mentions WHERE statuses.id = mentions.status_id AND mentions.account_id IN (SELECT accounts.id FROM accounts WHERE domain IS NULL))')
-      # Skip statuses which have replies
-      scope = scope.where('id NOT IN (SELECT statuses1.in_reply_to_id FROM statuses AS statuses1 WHERE statuses.id = statuses1.in_reply_to_id)')
-      # Skip statuses reblogged by local accounts or with recent boosts
-      scope = scope.where('id NOT IN (SELECT statuses1.reblog_of_id FROM statuses AS statuses1 WHERE statuses.id = statuses1.reblog_of_id AND (statuses1.uri IS NULL OR statuses1.local OR statuses1.id >= ?))', max_id)
-      # Skip statuses favourited by local users
-      scope = scope.where('id NOT IN (SELECT favourites.status_id FROM favourites WHERE statuses.id = favourites.status_id AND favourites.account_id IN (SELECT accounts.id FROM accounts WHERE domain IS NULL))')
-      # Skip statuses bookmarked by local users
-      scope = scope.where('id NOT IN (SELECT bookmarks.status_id FROM bookmarks WHERE statuses.id = bookmarks.status_id AND bookmarks.account_id IN (SELECT accounts.id FROM accounts WHERE domain IS NULL))')
-
-      unless options[:clean_followed]
-        # Skip accounts followed by local accounts
-        scope = scope.where('account_id NOT IN (SELECT follows.target_account_id FROM follows WHERE statuses.account_id = follows.target_account_id)')
+      klass = Class.new(ApplicationRecord) do |c|
+        c.table_name = 'statuses_to_be_deleted'
       end
 
-      scope.in_batches.delete_all
+      Object.const_set('StatusToBeDeleted', klass)
+
+      scope     = StatusToBeDeleted
+      processed = 0
+      removed   = 0
+      progress  = create_progress_bar(scope.count.fdiv(options[:batch_size]).ceil)
+
+      scope.reorder(nil).in_batches(of: options[:batch_size]) do |relation|
+        ids        = relation.pluck(:id)
+        processed += ids.count
+        removed   += Status.unscoped.where(id: ids).delete_all
+        progress.increment
+      end
+
+      progress.stop
+
+      if options[:vacuum]
+        say('Run VACUUM and ANALYZE to statuses...')
+
+        ActiveRecord::Base.connection.execute('VACUUM FULL ANALYZE statuses')
+      else
+        say('Run ANALYZE to statuses...')
+
+        ActiveRecord::Base.connection.execute('ANALYZE statuses')
+      end
 
       unless options[:skip_media_remove]
         say('Beginning removal of now-orphaned media attachments to free up disk space...')
         Scheduler::MediaCleanupScheduler.new.perform
       end
 
-      say("Done after #{Time.now.to_f - start_at}s", :green)
+      say("Done after #{Time.now.to_f - start_at}s, removed #{removed} out of #{processed} statuses.", :green)
     ensure
       say('Removing temporary database indices to restore write performance...')
 
-      ActiveRecord::Base.connection.remove_index(:accounts, name: :index_accounts_local) if ActiveRecord::Base.connection.index_name_exists?(:accounts, :index_accounts_local)
-      ActiveRecord::Base.connection.remove_index(:status_pins, name: :index_status_pins_status_id) if ActiveRecord::Base.connection.index_name_exists?(:status_pins, :index_status_pins_status_id)
-      ActiveRecord::Base.connection.remove_index(:media_attachments, name: :index_media_attachments_remote_url) if ActiveRecord::Base.connection.index_name_exists?(:media_attachments, :index_media_attachments_remote_url)
+      ActiveRecord::Base.connection.remove_index(:accounts, name: :index_accounts_local, if_exists: true)
+      ActiveRecord::Base.connection.remove_index(:status_pins, name: :index_status_pins_status_id, if_exists: true)
+      ActiveRecord::Base.connection.remove_index(:media_attachments, name: :index_media_attachments_remote_url, if_exists: true)
     end
   end
 end