From 53aca8aecf33ca47825da8b2d10ecff089df5c64 Mon Sep 17 00:00:00 2001
From: OSAMU SATO
Date: Fri, 26 Nov 2021 16:29:53 +0900
Subject: Add batch_size option to bin/tootctl search deploy (#17049)

---
 lib/mastodon/search_cli.rb | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'lib')

diff --git a/lib/mastodon/search_cli.rb b/lib/mastodon/search_cli.rb
index 2d1ca1c05..e36af24f4 100644
--- a/lib/mastodon/search_cli.rb
+++ b/lib/mastodon/search_cli.rb
@@ -17,6 +17,7 @@ module Mastodon
     ].freeze

     option :concurrency, type: :numeric, default: 2, aliases: [:c], desc: 'Workload will be split between this number of threads'
+    option :batch_size, type: :numeric, default: 1_000, aliases: [:b], desc: 'Number of records in each batch'
     option :only, type: :array, enum: %w(accounts tags statuses), desc: 'Only process these indices'
     desc 'deploy', 'Create or upgrade ElasticSearch indices and populate them'
     long_desc <<~LONG_DESC
       If ElasticSearch is empty, this command will create the necessary indices
       and then import data from the database into those indices.

       This command will also upgrade indices if the underlying schema has been
@@ -35,6 +36,11 @@ module Mastodon
         exit(1)
       end

+      if options[:batch_size] < 1
+        say('Cannot run with this batch_size setting, must be at least 1', :red)
+        exit(1)
+      end
+
       indices = begin
         if options[:only]
           options[:only].map { |str| "#{str.camelize}Index".constantize }
@@ -73,7 +79,7 @@ module Mastodon
       # is uneconomical. So we only ever add.
       indices.each do |index|
         progress.title = "Importing #{index} "
-        batch_size = 1_000
+        batch_size = options[:batch_size]
         slice_size = (batch_size / options[:concurrency]).ceil

         index.adapter.default_scope.reorder(nil).find_in_batches(batch_size: batch_size) do |batch|
-- cgit
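Note: the patch above makes the import batch size configurable instead of hard-coding 1,000 records. Each batch read with find_in_batches is divided among the worker threads in slices of roughly batch_size / concurrency records, so the two options together tune database round-trips and per-thread workload; with Thor's usual dash-for-underscore switch naming this surfaces as, for example, tootctl search deploy --batch-size 5000 --concurrency 4. The standalone Ruby sketch below is illustrative only, not code from the patch (the float conversion is added here so odd combinations do not truncate):

    # Illustrative sketch, not part of the patch: how a single batch is
    # divided across worker threads for indexing.
    batch_size  = 1_000   # default of the new :batch_size option
    concurrency = 2       # default of the :concurrency option

    slice_size = (batch_size.to_f / concurrency).ceil
    batch      = (1..batch_size).to_a            # stand-in for a batch of records

    slices = batch.each_slice(slice_size).to_a
    puts "#{slices.size} slices of up to #{slice_size} records each"
    # => 2 slices of up to 500 records each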
From 06631fdc53d6c464aa9db4e31d35f1fe354bcc95 Mon Sep 17 00:00:00 2001
From: Takeshi Umeda
Date: Fri, 26 Nov 2021 16:30:02 +0900
Subject: Fix ElasticSearch to Elasticsearch (#17050)

---
 .env.nanobox                 | 2 +-
 .env.production.sample       | 2 +-
 config/initializers/chewy.rb | 6 +++---
 lib/mastodon/search_cli.rb   | 4 ++--
 4 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'lib')

diff --git a/.env.nanobox b/.env.nanobox
index d61673836..ad941c947 100644
--- a/.env.nanobox
+++ b/.env.nanobox
@@ -13,7 +13,7 @@ DB_PORT=5432

 # DATABASE_URL=postgresql://$DATA_DB_USER:$DATA_DB_PASS@$DATA_DB_HOST/gonano

-# Optional ElasticSearch configuration
+# Optional Elasticsearch configuration
 ES_ENABLED=true
 ES_HOST=$DATA_ELASTIC_HOST
 ES_PORT=9200
diff --git a/.env.production.sample b/.env.production.sample
index c24c31c9b..8eeff3794 100644
--- a/.env.production.sample
+++ b/.env.production.sample
@@ -29,7 +29,7 @@ DB_NAME=mastodon_production
 DB_PASS=
 DB_PORT=5432

-# ElasticSearch (optional)
+# Elasticsearch (optional)
 # ------------------------
 ES_ENABLED=true
 ES_HOST=localhost
diff --git a/config/initializers/chewy.rb b/config/initializers/chewy.rb
index fbbcbbcde..f303fc54d 100644
--- a/config/initializers/chewy.rb
+++ b/config/initializers/chewy.rb
@@ -17,7 +17,7 @@ Chewy.settings = {
 }

 # We use our own async strategy even outside the request-response
-# cycle, which takes care of checking if ElasticSearch is enabled
+# cycle, which takes care of checking if Elasticsearch is enabled
 # or not. However, mind that for the Rails console, the :urgent
 # strategy is set automatically with no way to override it.
 Chewy.root_strategy = :custom_sidekiq
@@ -32,8 +32,8 @@ module Chewy
   end
 end

-# ElasticSearch uses Faraday internally. Faraday interprets the
+# Elasticsearch uses Faraday internally. Faraday interprets the
 # http_proxy env variable by default which leads to issues when
 # Mastodon is run with hidden services enabled, because
-# ElasticSearch is *not* supposed to be accessed through a proxy
+# Elasticsearch is *not* supposed to be accessed through a proxy
 Faraday.ignore_env_proxy = true
diff --git a/lib/mastodon/search_cli.rb b/lib/mastodon/search_cli.rb
index e36af24f4..6ad9d7b6a 100644
--- a/lib/mastodon/search_cli.rb
+++ b/lib/mastodon/search_cli.rb
@@ -19,9 +19,9 @@ module Mastodon
     option :concurrency, type: :numeric, default: 2, aliases: [:c], desc: 'Workload will be split between this number of threads'
     option :batch_size, type: :numeric, default: 1_000, aliases: [:b], desc: 'Number of records in each batch'
     option :only, type: :array, enum: %w(accounts tags statuses), desc: 'Only process these indices'
-    desc 'deploy', 'Create or upgrade ElasticSearch indices and populate them'
+    desc 'deploy', 'Create or upgrade Elasticsearch indices and populate them'
     long_desc <<~LONG_DESC
-      If ElasticSearch is empty, this command will create the necessary indices
+      If Elasticsearch is empty, this command will create the necessary indices
       and then import data from the database into those indices.

       This command will also upgrade indices if the underlying schema has been
-- cgit
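Note: the rename above standardises on the project's official spelling "Elasticsearch", but the chewy.rb comment it touches documents a real constraint: Faraday, the HTTP client used for Elasticsearch, honours the http_proxy environment variable by default, so an instance running with hidden services enabled would otherwise push search traffic through the proxy. A minimal sketch of what the initializer guards against follows; the proxy address and a local Elasticsearch on ES_HOST=localhost / ES_PORT=9200 are assumptions for illustration, not values taken from the patches:

    # Illustrative sketch, not part of the patch: without ignore_env_proxy,
    # Faraday would pick up http_proxy and route Elasticsearch requests
    # through it as well.
    require 'faraday'

    ENV['http_proxy'] = 'http://127.0.0.1:3128'       # hypothetical proxy for outgoing traffic

    Faraday.ignore_env_proxy = true                   # what config/initializers/chewy.rb sets

    conn = Faraday.new(url: 'http://localhost:9200')  # assumed local Elasticsearch
    puts conn.get('/_cluster/health').status          # goes directly to Elasticsearch, not via the proxy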
From 0ac7efdc50200020f42b2d94151ecd80144e3d8c Mon Sep 17 00:00:00 2001
From: Takeshi Umeda
Date: Sat, 27 Nov 2021 06:08:47 +0900
Subject: Fix performance of tootctl statuses remove (#17052)

* Fix performance of tootctl statuses remove

* Fix model class
---
 lib/mastodon/statuses_cli.rb | 94 +++++++++++++++++++++++++++++++-------------
 1 file changed, 67 insertions(+), 27 deletions(-)

(limited to 'lib')

diff --git a/lib/mastodon/statuses_cli.rb b/lib/mastodon/statuses_cli.rb
index b9dccdd8a..f841529e0 100644
--- a/lib/mastodon/statuses_cli.rb
+++ b/lib/mastodon/statuses_cli.rb
@@ -6,6 +6,7 @@ require_relative 'cli_helper'

 module Mastodon
   class StatusesCLI < Thor
+    include CLIHelper
     include ActionView::Helpers::NumberHelper

     def self.exit_on_failure?
@@ -15,6 +16,8 @@ module Mastodon
     option :days, type: :numeric, default: 90
     option :clean_followed, type: :boolean
     option :skip_media_remove, type: :boolean
+    option :vacuum, type: :boolean, default: false, desc: 'Reduce the file size and update the statistics. This option locks the table for a long time, so run it offline'
+    option :batch_size, type: :numeric, default: 1_000, aliases: [:b], desc: 'Number of records in each batch'
     desc 'remove', 'Remove unreferenced statuses'
     long_desc <<~LONG_DESC
       Remove statuses that are not referenced by local user activity, such as
@@ -25,52 +28,89 @@ module Mastodon
       indices before commencing, and removes them afterward.
     LONG_DESC
     def remove
+      if options[:batch_size] < 1
+        say('Cannot run with this batch_size setting, must be at least 1', :red)
+        exit(1)
+      end
+
       say('Creating temporary database indices...')

-      ActiveRecord::Base.connection.add_index(:accounts, :id, name: :index_accounts_local, where: 'domain is null', algorithm: :concurrently) unless ActiveRecord::Base.connection.index_name_exists?(:accounts, :index_accounts_local)
-      ActiveRecord::Base.connection.add_index(:status_pins, :status_id, name: :index_status_pins_status_id, algorithm: :concurrently) unless ActiveRecord::Base.connection.index_name_exists?(:status_pins, :index_status_pins_status_id)
-      ActiveRecord::Base.connection.add_index(:media_attachments, :remote_url, name: :index_media_attachments_remote_url, where: 'remote_url is not null', algorithm: :concurrently) unless ActiveRecord::Base.connection.index_name_exists?(:media_attachments, :index_media_attachments_remote_url)
+      ActiveRecord::Base.connection.add_index(:accounts, :id, name: :index_accounts_local, where: 'domain is null', algorithm: :concurrently, if_not_exists: true)
+      ActiveRecord::Base.connection.add_index(:status_pins, :status_id, name: :index_status_pins_status_id, algorithm: :concurrently, if_not_exists: true)
+      ActiveRecord::Base.connection.add_index(:media_attachments, :remote_url, name: :index_media_attachments_remote_url, where: 'remote_url is not null', algorithm: :concurrently, if_not_exists: true)

       max_id = Mastodon::Snowflake.id_at(options[:days].days.ago)
       start_at = Time.now.to_f

+      say('Extract the deletion target... This might take a while...')
+
+      ActiveRecord::Base.connection.create_table('statuses_to_be_deleted', temporary: true)
+
+      # Skip accounts followed by local accounts
+      clean_followed_sql = 'AND NOT EXISTS (SELECT 1 FROM follows WHERE statuses.account_id = follows.target_account_id)' unless options[:clean_followed]
+
+      ActiveRecord::Base.connection.exec_insert(<<-SQL.squish, 'SQL', [[nil, max_id]])
+        INSERT INTO statuses_to_be_deleted (id)
+        SELECT statuses.id FROM statuses WHERE deleted_at IS NULL AND NOT local AND uri IS NOT NULL AND (id < $1)
+        AND NOT EXISTS (SELECT 1 FROM statuses AS statuses1 WHERE statuses.id = statuses1.in_reply_to_id)
+        AND NOT EXISTS (SELECT 1 FROM statuses AS statuses1 WHERE statuses1.id = statuses.reblog_of_id AND (statuses1.uri IS NULL OR statuses1.local))
+        AND NOT EXISTS (SELECT 1 FROM statuses AS statuses1 WHERE statuses.id = statuses1.reblog_of_id AND (statuses1.uri IS NULL OR statuses1.local OR statuses1.id >= $1))
+        AND NOT EXISTS (SELECT 1 FROM status_pins WHERE statuses.id = status_id)
+        AND NOT EXISTS (SELECT 1 FROM mentions WHERE statuses.id = mentions.status_id AND mentions.account_id IN (SELECT accounts.id FROM accounts WHERE domain IS NULL))
+        AND NOT EXISTS (SELECT 1 FROM favourites WHERE statuses.id = favourites.status_id AND favourites.account_id IN (SELECT accounts.id FROM accounts WHERE domain IS NULL))
+        AND NOT EXISTS (SELECT 1 FROM bookmarks WHERE statuses.id = bookmarks.status_id AND bookmarks.account_id IN (SELECT accounts.id FROM accounts WHERE domain IS NULL))
+        #{clean_followed_sql}
+      SQL
+
+      say('Removing temporary database indices to restore write performance...')
+
+      ActiveRecord::Base.connection.remove_index(:accounts, name: :index_accounts_local, if_exists: true)
+      ActiveRecord::Base.connection.remove_index(:status_pins, name: :index_status_pins_status_id, if_exists: true)
+
       say('Beginning removal... This might take a while...')

-      scope = Status.remote.where('id < ?', max_id)
-      # Skip reblogs of local statuses
-      scope = scope.where('reblog_of_id NOT IN (SELECT statuses1.id FROM statuses AS statuses1 WHERE statuses1.id = statuses.reblog_of_id AND (statuses1.uri IS NULL OR statuses1.local))')
-      # Skip statuses that are pinned on profiles
-      scope = scope.where('id NOT IN (SELECT status_pins.status_id FROM status_pins WHERE statuses.id = status_id)')
-      # Skip statuses that mention local accounts
-      scope = scope.where('id NOT IN (SELECT mentions.status_id FROM mentions WHERE statuses.id = mentions.status_id AND mentions.account_id IN (SELECT accounts.id FROM accounts WHERE domain IS NULL))')
-      # Skip statuses which have replies
-      scope = scope.where('id NOT IN (SELECT statuses1.in_reply_to_id FROM statuses AS statuses1 WHERE statuses.id = statuses1.in_reply_to_id)')
-      # Skip statuses reblogged by local accounts or with recent boosts
-      scope = scope.where('id NOT IN (SELECT statuses1.reblog_of_id FROM statuses AS statuses1 WHERE statuses.id = statuses1.reblog_of_id AND (statuses1.uri IS NULL OR statuses1.local OR statuses1.id >= ?))', max_id)
-      # Skip statuses favourited by local users
-      scope = scope.where('id NOT IN (SELECT favourites.status_id FROM favourites WHERE statuses.id = favourites.status_id AND favourites.account_id IN (SELECT accounts.id FROM accounts WHERE domain IS NULL))')
-      # Skip statuses bookmarked by local users
-      scope = scope.where('id NOT IN (SELECT bookmarks.status_id FROM bookmarks WHERE statuses.id = bookmarks.status_id AND bookmarks.account_id IN (SELECT accounts.id FROM accounts WHERE domain IS NULL))')
-
-      unless options[:clean_followed]
-        # Skip accounts followed by local accounts
-        scope = scope.where('account_id NOT IN (SELECT follows.target_account_id FROM follows WHERE statuses.account_id = follows.target_account_id)')
+      klass = Class.new(ApplicationRecord) do |c|
+        c.table_name = 'statuses_to_be_deleted'
       end

-      scope.in_batches.delete_all
+      Object.const_set('StatusToBeDeleted', klass)
+
+      scope = StatusToBeDeleted
+      processed = 0
+      removed = 0
+      progress = create_progress_bar(scope.count.fdiv(options[:batch_size]).ceil)
+
+      scope.reorder(nil).in_batches(of: options[:batch_size]) do |relation|
+        ids = relation.pluck(:id)
+        processed += ids.count
+        removed += Status.unscoped.where(id: ids).delete_all
+        progress.increment
+      end
+
+      progress.stop
+
+      if options[:vacuum]
+        say('Run VACUUM and ANALYZE to statuses...')
+
+        ActiveRecord::Base.connection.execute('VACUUM FULL ANALYZE statuses')
+      else
+        say('Run ANALYZE to statuses...')
+
+        ActiveRecord::Base.connection.execute('ANALYZE statuses')
+      end

       unless options[:skip_media_remove]
         say('Beginning removal of now-orphaned media attachments to free up disk space...')
         Scheduler::MediaCleanupScheduler.new.perform
       end

-      say("Done after #{Time.now.to_f - start_at}s", :green)
+      say("Done after #{Time.now.to_f - start_at}s, removed #{removed} out of #{processed} statuses.", :green)
     ensure
       say('Removing temporary database indices to restore write performance...')

-      ActiveRecord::Base.connection.remove_index(:accounts, name: :index_accounts_local) if ActiveRecord::Base.connection.index_name_exists?(:accounts, :index_accounts_local)
-      ActiveRecord::Base.connection.remove_index(:status_pins, name: :index_status_pins_status_id) if ActiveRecord::Base.connection.index_name_exists?(:status_pins, :index_status_pins_status_id)
-      ActiveRecord::Base.connection.remove_index(:media_attachments, name: :index_media_attachments_remote_url) if ActiveRecord::Base.connection.index_name_exists?(:media_attachments, :index_media_attachments_remote_url)
+      ActiveRecord::Base.connection.remove_index(:accounts, name: :index_accounts_local, if_exists: true)
+      ActiveRecord::Base.connection.remove_index(:status_pins, name: :index_status_pins_status_id, if_exists: true)
+      ActiveRecord::Base.connection.remove_index(:media_attachments, name: :index_media_attachments_remote_url, if_exists: true)
     end
   end
 end
-- cgit
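Note: the patch above restructures tootctl statuses remove. Instead of deleting through a relation built from chained NOT IN sub-selects, it first materialises the ids to delete into a temporary statuses_to_be_deleted table with a single INSERT ... SELECT guarded by NOT EXISTS filters, then deletes them in batch_size chunks through an anonymous ActiveRecord model bound to that table, and finally runs ANALYZE (or VACUUM FULL ANALYZE with the new --vacuum option). The standalone sketch below reproduces only that pattern; it is not the patch itself, and it substitutes an in-memory SQLite database and a trivial "local" flag for the real NOT EXISTS filters:

    # Standalone sketch of the "materialise ids, then delete in batches" pattern.
    # Assumes the activerecord and sqlite3 gems are installed.
    require 'active_record'

    ActiveRecord::Base.establish_connection(adapter: 'sqlite3', database: ':memory:')
    ActiveRecord::Base.connection.create_table(:statuses) { |t| t.boolean :local, default: false, null: false }
    ActiveRecord::Base.connection.create_table(:statuses_to_be_deleted)

    class Status < ActiveRecord::Base; end

    # Same trick as the patch: bind an anonymous model to the scratch table.
    klass = Class.new(ActiveRecord::Base) do |c|
      c.table_name = 'statuses_to_be_deleted'
    end
    Object.const_set('StatusToBeDeleted', klass)

    25.times { |i| Status.create!(local: i.even?) }

    # One pass to extract the deletion targets (the real query applies many NOT EXISTS filters).
    ActiveRecord::Base.connection.execute(
      'INSERT INTO statuses_to_be_deleted (id) SELECT id FROM statuses WHERE NOT local'
    )

    removed = 0
    StatusToBeDeleted.in_batches(of: 10) do |relation|
      removed += Status.where(id: relation.pluck(:id)).delete_all
    end
    puts "removed #{removed} statuses"   # => removed 12 statuses

The VACUUM FULL ANALYZE / ANALYZE step in the patch is PostgreSQL-specific and is deliberately left out of this sketch.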